diff --git a/README.md b/README.md index 6011c505d..aa88953e0 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,10 @@ Flying Saucer is a pure-Java library for rendering arbitrary well-formed XML (or XHTML) using CSS 2.1 for layout and formatting, output to Swing panels, -PDF, and images. +PDF, and images. + +The new [Jsoup](https://jsoup.org/)-based HTML parser makes it possible to support HTML5 syntax +and features in Flying Saucer, with the goal of implementing full HTML5 support over time. Comprehensive documentation available in [The Flying Saucer User's Guide](https://flyingsaucerproject.github.io/flyingsaucer/r8/guide/users-guide-R8.html). diff --git a/flying-saucer-core/src/main/java/org/xhtmlrenderer/resource/FSCatalog.java b/flying-saucer-core/src/main/java/org/xhtmlrenderer/resource/FSCatalog.java index 3776740f3..5b401e091 100644 --- a/flying-saucer-core/src/main/java/org/xhtmlrenderer/resource/FSCatalog.java +++ b/flying-saucer-core/src/main/java/org/xhtmlrenderer/resource/FSCatalog.java @@ -63,6 +63,7 @@ * * @author Patrick Wright */ +@Deprecated public class FSCatalog { /** * Default constructor diff --git a/flying-saucer-core/src/main/java/org/xhtmlrenderer/resource/FSEntityResolver.java b/flying-saucer-core/src/main/java/org/xhtmlrenderer/resource/FSEntityResolver.java index 1007542bd..cc6a24a76 100644 --- a/flying-saucer-core/src/main/java/org/xhtmlrenderer/resource/FSEntityResolver.java +++ b/flying-saucer-core/src/main/java/org/xhtmlrenderer/resource/FSEntityResolver.java @@ -54,6 +54,7 @@ * * @author Patrick Wright */ +@Deprecated public class FSEntityResolver implements EntityResolver2 { private static final Logger log = LoggerFactory.getLogger(FSEntityResolver.class); diff --git a/flying-saucer-core/src/main/java/org/xhtmlrenderer/resource/HTMLResource.java b/flying-saucer-core/src/main/java/org/xhtmlrenderer/resource/HTMLResource.java new file mode 100644 index 000000000..4b31c0e52 --- /dev/null +++ 
b/flying-saucer-core/src/main/java/org/xhtmlrenderer/resource/HTMLResource.java
@@ -0,0 +1,142 @@
+/*
+ * {{{ header & license
+ * Flying Saucer - Copyright (c) 2024
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ * }}}
+ */
+package org.xhtmlrenderer.resource;
+
+import org.jsoup.Jsoup;
+import org.jsoup.helper.W3CDom;
+import org.jsoup.nodes.Document;
+import org.jsoup.parser.Parser;
+import org.xhtmlrenderer.util.XRLog;
+import org.xhtmlrenderer.util.XRRuntimeException;
+
+import javax.annotation.Nullable;
+import javax.annotation.ParametersAreNonnullByDefault;
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Reader;
+import java.io.StringReader;
+import java.io.StringWriter;
+import java.net.URL;
+import java.nio.charset.StandardCharsets;
+import java.util.logging.Level;
+
+/**
+ * Parses HTML with <a href="https://jsoup.org/">Jsoup</a> and exposes the result as a W3C DOM document.
+ *
+ * <p>Parsing happens eagerly when the resource is created; any I/O failure is logged and rethrown
+ * as an {@link XRRuntimeException}.</p>
+ */
+@ParametersAreNonnullByDefault
+public class HTMLResource extends AbstractResource {
+    private final org.w3c.dom.Document document;
+
+    /**
+     * Parses the given byte stream with Jsoup's HTML parser and converts the result to a W3C DOM.
+     *
+     * @param stream      the HTML bytes
+     * @param charsetName charset used to decode {@code stream}, or {@code null} to let Jsoup
+     *                    detect it from the document and fall back to UTF-8
+     * @throws XRRuntimeException if reading the stream fails
+     */
+    private HTMLResource(InputStream stream, @Nullable String charsetName) {
+        super(stream);
+        try {
+            Document jsoupDoc = Jsoup.parse(stream, charsetName, "", Parser.htmlParser());
+            this.document = new W3CDom().fromJsoup(jsoupDoc);
+        } catch (IOException e) {
+            XRLog.load(Level.SEVERE, "Failed to parse and convert HTML document.", e);
+            throw new XRRuntimeException("Failed to parse and convert HTML document.", e);
+        }
+    }
+
+    /**
+     * Loads and parses the HTML document at the given URL.
+     *
+     * @param source location of the HTML document
+     * @return the parsed resource
+     * @throws XRRuntimeException if the URL cannot be opened or read
+     */
+    public static HTMLResource load(URL source) {
+        try (InputStream stream = source.openStream()) {
+            // The bytes come from an external source, so honour the charset the document declares.
+            return new HTMLResource(stream, null);
+        } catch (IOException e) {
+            XRLog.load(Level.SEVERE, "Failed to load HTML from URL.", e);
+            throw new XRRuntimeException("Failed to load HTML from URL.", e);
+        }
+    }
+
+    /**
+     * Parses HTML from a byte stream, honouring the charset the document declares.
+     *
+     * @param stream the HTML bytes
+     * @return the parsed resource
+     * @throws XRRuntimeException if the stream cannot be read
+     */
+    public static HTMLResource load(InputStream stream) {
+        return new HTMLResource(stream, null);
+    }
+
+    /**
+     * Parses HTML from a character stream. The reader is drained but not closed by this method.
+     *
+     * @param reader the HTML characters
+     * @return the parsed resource
+     * @throws XRRuntimeException if the reader cannot be read
+     */
+    public static HTMLResource load(Reader reader) {
+        try {
+            InputStream stream = convertReaderToInputStream(reader);
+            // The characters were re-encoded as UTF-8 above, so decoding is pinned to UTF-8
+            // regardless of any charset the markup itself declares.
+            return new HTMLResource(stream, StandardCharsets.UTF_8.name());
+        } catch (IOException e) {
+            XRLog.load(Level.SEVERE, "Failed to load HTML from Reader.", e);
+            throw new XRRuntimeException("Failed to load HTML from Reader.", e);
+        }
+    }
+
+    /**
+     * Parses HTML held in a string.
+     *
+     * @param xml the HTML (or XHTML) markup to parse
+     * @return the parsed resource
+     */
+    public static HTMLResource load(String xml) {
+        return load(new StringReader(xml));
+    }
+
+    /**
+     * @return the W3C DOM built from the parsed HTML
+     */
+    public org.w3c.dom.Document getDocument() {
+        return document;
+    }
+
+    /**
+     * Drains {@code reader} and returns its content as a UTF-8 encoded in-memory stream.
+     */
+    private static InputStream convertReaderToInputStream(Reader reader) throws IOException {
+        StringWriter markup = new StringWriter();
+        reader.transferTo(markup);
+        return new ByteArrayInputStream(markup.toString().getBytes(StandardCharsets.UTF_8));
+    }
+}
diff --git 
a/flying-saucer-core/src/main/java/org/xhtmlrenderer/resource/XMLResource.java b/flying-saucer-core/src/main/java/org/xhtmlrenderer/resource/XMLResource.java index 739a70dce..2ef5fc132 100644 --- a/flying-saucer-core/src/main/java/org/xhtmlrenderer/resource/XMLResource.java +++ b/flying-saucer-core/src/main/java/org/xhtmlrenderer/resource/XMLResource.java @@ -59,8 +59,11 @@ /** + * @deprecated Use {@link HTMLResource} instead. + * * @author Patrick Wright */ +@Deprecated @ParametersAreNonnullByDefault public class XMLResource extends AbstractResource { private Document document; diff --git a/flying-saucer-core/src/main/java/org/xhtmlrenderer/swing/BasicPanel.java b/flying-saucer-core/src/main/java/org/xhtmlrenderer/swing/BasicPanel.java index 178b9d4f4..566117486 100644 --- a/flying-saucer-core/src/main/java/org/xhtmlrenderer/swing/BasicPanel.java +++ b/flying-saucer-core/src/main/java/org/xhtmlrenderer/swing/BasicPanel.java @@ -29,6 +29,7 @@ import org.xhtmlrenderer.render.Box; import org.xhtmlrenderer.render.PageBox; import org.xhtmlrenderer.render.RenderingContext; +import org.xhtmlrenderer.resource.HTMLResource; import org.xhtmlrenderer.resource.XMLResource; import org.xhtmlrenderer.simple.NoNamespaceHandler; import org.xhtmlrenderer.simple.extend.FormSubmissionListener; @@ -340,8 +341,7 @@ public void setDocument(InputStream stream, String url, NamespaceHandler nsh) { } public void setDocumentFromString(String content, @Nullable String url, NamespaceHandler nsh) { - InputSource is = new InputSource(new StringReader(content)); - Document dom = XMLResource.load(is).getDocument(); + Document dom = HTMLResource.load(content).getDocument(); setDocument(dom, url, nsh); } diff --git a/flying-saucer-examples/src/main/java/org/xhtmlrenderer/demo/browser/BrowserPanel.java b/flying-saucer-examples/src/main/java/org/xhtmlrenderer/demo/browser/BrowserPanel.java index 6bf4b1c72..e260c791f 100755 --- 
a/flying-saucer-examples/src/main/java/org/xhtmlrenderer/demo/browser/BrowserPanel.java +++ b/flying-saucer-examples/src/main/java/org/xhtmlrenderer/demo/browser/BrowserPanel.java @@ -26,6 +26,7 @@ import org.xhtmlrenderer.pdf.ITextRenderer; import org.xhtmlrenderer.pdf.PDFCreationListener; import org.xhtmlrenderer.pdf.util.XHtmlMetaToPdfInfoAdapter; +import org.xhtmlrenderer.resource.HTMLResource; import org.xhtmlrenderer.resource.XMLResource; import org.xhtmlrenderer.simple.FSScrollPane; import org.xhtmlrenderer.swing.ImageResourceLoader; @@ -316,7 +317,7 @@ public void exportToPdf(String path) throws IOException, ParserConfigurationExce } private void handlePageLoadFailed(String url_text, XRRuntimeException ex) { - final XMLResource xr; + final HTMLResource xr; final String rootCause = getRootCause(ex); final String msg = GeneralUtil.escapeHTML(addLineBreaks(rootCause)); String notFound = @@ -333,7 +334,7 @@ private void handlePageLoadFailed(String url_text, XRRuntimeException ex) { "\n" + ""; - xr = XMLResource.load(new StringReader(notFound)); + xr = HTMLResource.load(notFound); SwingUtilities.invokeLater(() -> root.panel.view.setDocument(xr.getDocument(), null)); } diff --git a/flying-saucer-pdf/src/main/java/org/xhtmlrenderer/pdf/ITextRenderer.java b/flying-saucer-pdf/src/main/java/org/xhtmlrenderer/pdf/ITextRenderer.java index 5340518dd..dd7dbfbbe 100644 --- a/flying-saucer-pdf/src/main/java/org/xhtmlrenderer/pdf/ITextRenderer.java +++ b/flying-saucer-pdf/src/main/java/org/xhtmlrenderer/pdf/ITextRenderer.java @@ -39,10 +39,9 @@ import org.xhtmlrenderer.render.PageBox; import org.xhtmlrenderer.render.RenderingContext; import org.xhtmlrenderer.render.ViewportBox; -import org.xhtmlrenderer.resource.XMLResource; +import org.xhtmlrenderer.resource.HTMLResource; import org.xhtmlrenderer.simple.extend.XhtmlNamespaceHandler; import org.xhtmlrenderer.util.Configuration; -import org.xml.sax.InputSource; import javax.annotation.Nullable; import 
javax.annotation.ParametersAreNonnullByDefault; @@ -215,7 +214,7 @@ public final void setDocumentFromString(String content, @Nullable String baseUrl private Document parse(String content) { try (var is = new StringReader(content)) { - return XMLResource.load(new InputSource(is)).getDocument(); + return HTMLResource.load(is).getDocument(); } } diff --git a/flying-saucer-pdf/src/test/java/org/xhtmlrenderer/pdf/bug/FloatedTableCellTest.java b/flying-saucer-pdf/src/test/java/org/xhtmlrenderer/pdf/bug/FloatedTableCellTest.java index ed5add4db..415295316 100644 --- a/flying-saucer-pdf/src/test/java/org/xhtmlrenderer/pdf/bug/FloatedTableCellTest.java +++ b/flying-saucer-pdf/src/test/java/org/xhtmlrenderer/pdf/bug/FloatedTableCellTest.java @@ -5,6 +5,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.xhtmlrenderer.pdf.ITextRenderer; +import org.xhtmlrenderer.resource.HTMLResource; import org.xhtmlrenderer.resource.XMLResource; import java.io.File; @@ -36,7 +37,7 @@ void tableWithFloatedCell() throws IOException { """; ITextRenderer renderer = new ITextRenderer(); - byte[] result = renderer.createPDF(XMLResource.load(page).getDocument()); + byte[] result = renderer.createPDF(HTMLResource.load(page).getDocument()); printFile(result, "table-with-floated-cell.pdf"); PDF pdf = new PDF(result); assertThat(pdf).containsText("first cell", "second cell"); @@ -48,7 +49,7 @@ void tableCell() throws IOException {
first cell {float:left;}
"""; ITextRenderer renderer = new ITextRenderer(); - byte[] result = renderer.createPDF(XMLResource.load(page).getDocument()); + byte[] result = renderer.createPDF(HTMLResource.load(page).getDocument()); printFile(result, "table-cell.pdf"); PDF pdf = new PDF(result); diff --git a/pom.xml b/pom.xml index d913b06a2..9a71c401b 100644 --- a/pom.xml +++ b/pom.xml @@ -74,6 +74,11 @@ ${slf4j.version} test + + org.jsoup + jsoup + 1.17.2 +