Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use Jsoup to parse HTML #327

Closed
wants to merge 6 commits into from
Closed
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@
*
* @author Patrick Wright
*/
@Deprecated
public class FSCatalog {
/**
* Default constructor
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@
*
* @author Patrick Wright
*/
@Deprecated
public class FSEntityResolver implements EntityResolver2 {
private static final Logger log = LoggerFactory.getLogger(FSEntityResolver.class);

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
/*
* {{{ header & license
* Flying Saucer - Copyright (c) 2024
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copyrights should be owned by individual contributors.

*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public License
* as published by the Free Software Foundation; either version 2.1
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
* }}}
*/
package org.xhtmlrenderer.resource;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.parser.Parser;
import org.xhtmlrenderer.util.XRLog;
import org.xhtmlrenderer.util.XRRuntimeException;
import org.xml.sax.SAXException;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.net.URL;

import javax.annotation.ParametersAreNonnullByDefault;
import javax.xml.transform.TransformerException;

/**
* HTMLResource reads (X)HTML content with jsoup's XML parser and exposes the result as a W3C DOM document.
*/
@ParametersAreNonnullByDefault
public class HTMLResource extends AbstractResource {
private org.w3c.dom.Document document;

private HTMLResource(InputStream stream) {
super(stream);
try {
Document jsoupDoc = Jsoup.parse(stream, StandardCharsets.UTF_8.name(), "", Parser.xmlParser());
this.document = convertJsoupToW3CDocument(jsoupDoc);
} catch (IOException | ParserConfigurationException | TransformerException | SAXException e) {
XRLog.load(java.util.logging.Level.SEVERE, "Failed to parse and convert HTML document.", e);
throw new XRRuntimeException("Failed to parse and convert HTML document.", e);
}
}

/**
 * Opens the given URL and parses the content it serves.
 * The stream opened here is closed again before this method returns.
 *
 * @param source location of the document to load
 * @return the parsed resource
 * @throws XRRuntimeException if the URL cannot be opened or its stream cannot be closed
 */
public static HTMLResource load(URL source) {
    final String failure = "Failed to load HTML from URL.";
    try (InputStream in = source.openStream()) {
        HTMLResource resource = new HTMLResource(in);
        return resource;
    } catch (IOException e) {
        XRLog.load(java.util.logging.Level.SEVERE, failure, e);
        throw new XRRuntimeException(failure, e);
    }
}

/**
 * Parses the content of the given stream.
 *
 * @param stream content to parse
 * @return the parsed resource
 */
public static HTMLResource load(InputStream stream) {
    HTMLResource resource = new HTMLResource(stream);
    return resource;
}

/**
 * Parses the content supplied by the given reader.
 * The reader is first converted to an in-memory input stream via
 * {@code convertReaderToInputStream} and then parsed like any other stream.
 *
 * @param reader character source of the document
 * @return the parsed resource
 * @throws XRRuntimeException if reading from the reader fails
 */
public static HTMLResource load(Reader reader) {
try {
InputStream stream = convertReaderToInputStream(reader);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can jsoup read from a Reader directly? It would be nice to avoid the conversion.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

return new HTMLResource(stream);
} catch (IOException e) {
XRLog.load(java.util.logging.Level.SEVERE, "Failed to load HTML from Reader.", e);
throw new XRRuntimeException("Failed to load HTML from Reader.", e);
}
}

/**
 * Parses a document held entirely in a string.
 *
 * @param xml markup to parse
 * @return the parsed resource
 */
public static HTMLResource load(String xml) {
    Reader markup = new StringReader(xml);
    return load(markup);
}

/**
 * Returns the W3C DOM document held by this resource.
 *
 * @return the parsed document
 */
public org.w3c.dom.Document getDocument() {
    return this.document;
}

/**
 * Converts a jsoup document into a W3C DOM document by serializing it and
 * re-parsing the markup with a JAXP {@link DocumentBuilder}.
 *
 * <p>The markup is handed to the parser as a character stream rather than as
 * bytes. With a character stream the parser does not consult the encoding
 * named in an XML declaration, so a declaration carried over from the source
 * document (for example {@code encoding="ISO-8859-1"}) cannot cause the text
 * to be decoded with the wrong charset.
 *
 * <p>External entities and external DTDs are not resolved. This avoids XXE
 * attacks on untrusted input and network fetches for DOCTYPE declarations.
 *
 * @param jsoupDoc the jsoup document to convert
 * @return the equivalent W3C DOM document
 * @throws ParserConfigurationException if the parser cannot be configured or created
 * @throws IOException if reading the serialized markup fails
 * @throws SAXException if the serialized markup is not well-formed XML
 * @throws TransformerException declared for compatibility with existing callers
 */
private static org.w3c.dom.Document convertJsoupToW3CDocument(Document jsoupDoc) throws ParserConfigurationException, IOException, TransformerException, SAXException {
    DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
    // DOCTYPE declarations stay legal (XHTML documents commonly have one),
    // but nothing outside the document itself is ever fetched or expanded.
    factory.setFeature("http://xml.org/sax/features/external-general-entities", false);
    factory.setFeature("http://xml.org/sax/features/external-parameter-entities", false);
    factory.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
    factory.setXIncludeAware(false);

    DocumentBuilder builder = factory.newDocumentBuilder();
    String markup = jsoupDoc.outerHtml();
    // Character stream: no byte re-encoding, and no charset sniffing by the parser.
    return builder.parse(new org.xml.sax.InputSource(new StringReader(markup)));
}

/**
 * Drains the reader into memory and exposes the collected text as a stream
 * of UTF-8 encoded bytes.
 *
 * @param reader character source to drain; it is not closed here
 * @return an in-memory stream over the UTF-8 encoding of everything read
 * @throws IOException if reading from the reader fails
 */
private static InputStream convertReaderToInputStream(Reader reader) throws IOException {
    StringWriter text = new StringWriter();
    reader.transferTo(text);
    byte[] utf8 = text.toString().getBytes(StandardCharsets.UTF_8);
    return new ByteArrayInputStream(utf8);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -59,8 +59,11 @@


/**
* @deprecated Use {@link HTMLResource} instead.
*
* @author Patrick Wright
*/
@Deprecated
@ParametersAreNonnullByDefault
public class XMLResource extends AbstractResource {
private Document document;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
import org.xhtmlrenderer.render.Box;
import org.xhtmlrenderer.render.PageBox;
import org.xhtmlrenderer.render.RenderingContext;
import org.xhtmlrenderer.resource.HTMLResource;
import org.xhtmlrenderer.resource.XMLResource;
import org.xhtmlrenderer.simple.NoNamespaceHandler;
import org.xhtmlrenderer.simple.extend.FormSubmissionListener;
Expand Down Expand Up @@ -340,8 +341,7 @@ public void setDocument(InputStream stream, String url, NamespaceHandler nsh) {
}

public void setDocumentFromString(String content, @Nullable String url, NamespaceHandler nsh) {
InputSource is = new InputSource(new StringReader(content));
Document dom = XMLResource.load(is).getDocument();
Document dom = HTMLResource.load(content).getDocument();

setDocument(dom, url, nsh);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
import org.xhtmlrenderer.pdf.ITextRenderer;
import org.xhtmlrenderer.pdf.PDFCreationListener;
import org.xhtmlrenderer.pdf.util.XHtmlMetaToPdfInfoAdapter;
import org.xhtmlrenderer.resource.HTMLResource;
import org.xhtmlrenderer.resource.XMLResource;
import org.xhtmlrenderer.simple.FSScrollPane;
import org.xhtmlrenderer.swing.ImageResourceLoader;
Expand Down Expand Up @@ -316,7 +317,7 @@ public void exportToPdf(String path) throws IOException, ParserConfigurationExce
}

private void handlePageLoadFailed(String url_text, XRRuntimeException ex) {
final XMLResource xr;
final HTMLResource xr;
final String rootCause = getRootCause(ex);
final String msg = GeneralUtil.escapeHTML(addLineBreaks(rootCause));
String notFound =
Expand All @@ -333,7 +334,7 @@ private void handlePageLoadFailed(String url_text, XRRuntimeException ex) {
"</body>\n" +
"</html>";

xr = XMLResource.load(new StringReader(notFound));
xr = HTMLResource.load(notFound);
SwingUtilities.invokeLater(() -> root.panel.view.setDocument(xr.getDocument(), null));
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,9 @@
import org.xhtmlrenderer.render.PageBox;
import org.xhtmlrenderer.render.RenderingContext;
import org.xhtmlrenderer.render.ViewportBox;
import org.xhtmlrenderer.resource.XMLResource;
import org.xhtmlrenderer.resource.HTMLResource;
import org.xhtmlrenderer.simple.extend.XhtmlNamespaceHandler;
import org.xhtmlrenderer.util.Configuration;
import org.xml.sax.InputSource;

import javax.annotation.Nullable;
import javax.annotation.ParametersAreNonnullByDefault;
Expand Down Expand Up @@ -215,7 +214,7 @@ public final void setDocumentFromString(String content, @Nullable String baseUrl

private Document parse(String content) {
try (var is = new StringReader(content)) {
return XMLResource.load(new InputSource(is)).getDocument();
return HTMLResource.load(is).getDocument();
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xhtmlrenderer.pdf.ITextRenderer;
import org.xhtmlrenderer.resource.HTMLResource;
import org.xhtmlrenderer.resource.XMLResource;

import java.io.File;
Expand Down Expand Up @@ -36,7 +37,7 @@ void tableWithFloatedCell() throws IOException {
</html>""";

ITextRenderer renderer = new ITextRenderer();
byte[] result = renderer.createPDF(XMLResource.load(page).getDocument());
byte[] result = renderer.createPDF(HTMLResource.load(page).getDocument());
printFile(result, "table-with-floated-cell.pdf");
PDF pdf = new PDF(result);
assertThat(pdf).containsText("first cell", "second cell");
Expand All @@ -48,7 +49,7 @@ void tableCell() throws IOException {
<table><tr><td style="float:left;">first cell {float:left;}</td></tr></table>""";

ITextRenderer renderer = new ITextRenderer();
byte[] result = renderer.createPDF(XMLResource.load(page).getDocument());
byte[] result = renderer.createPDF(HTMLResource.load(page).getDocument());
printFile(result, "table-cell.pdf");

PDF pdf = new PDF(result);
Expand Down
5 changes: 5 additions & 0 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,11 @@
<version>${slf4j.version}</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.17.2</version>
</dependency>
</dependencies>

<modules>
Expand Down