flyingsaucerproject · andreasrosdal · May 26, 2024 · May 26, 2024 · May 27, 2024 · May 27, 2024
diff --git a/README.md b/README.md
@@ -5,7 +5,10 @@
 
 Flying Saucer is a pure-Java library for rendering arbitrary well-formed XML 
 (or XHTML) using CSS 2.1 for layout and formatting, output to Swing panels, 
-PDF, and images.
+PDF, and images. 
+
+The new [jsoup](https://jsoup.org/)-based HTML parser makes it possible to support HTML5 syntax
+and features in Flying Saucer, with the goal of implementing full HTML5 support over time.
 
 Comprehensive documentation available in [The Flying Saucer User's Guide](https://flyingsaucerproject.github.io/flyingsaucer/r8/guide/users-guide-R8.html).
 

diff --git a/flying-saucer-core/src/main/java/org/xhtmlrenderer/resource/FSCatalog.java b/flying-saucer-core/src/main/java/org/xhtmlrenderer/resource/FSCatalog.java
@@ -63,6 +63,7 @@
  *
  * @author Patrick Wright
  */
+@Deprecated
 public class FSCatalog {
     /**
      * Default constructor

diff --git a/flying-saucer-core/src/main/java/org/xhtmlrenderer/resource/FSEntityResolver.java b/flying-saucer-core/src/main/java/org/xhtmlrenderer/resource/FSEntityResolver.java
@@ -54,6 +54,7 @@
  *
  * @author Patrick Wright
  */
+@Deprecated
 public class FSEntityResolver implements EntityResolver2 {
     private static final Logger log = LoggerFactory.getLogger(FSEntityResolver.class);
 

diff --git a/flying-saucer-core/src/main/java/org/xhtmlrenderer/resource/HTMLResource.java b/flying-saucer-core/src/main/java/org/xhtmlrenderer/resource/HTMLResource.java
@@ -0,0 +1,132 @@
+/*
+ * {{{ header & license
+ * Flying Saucer - Copyright (c) 2024
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ * }}}
+ */
+package org.xhtmlrenderer.resource;
+
+import org.jsoup.Jsoup;
+import org.jsoup.helper.W3CDom;
+import org.jsoup.nodes.Document;
+import org.jsoup.parser.Parser;
+import org.xhtmlrenderer.util.XRLog;
+import org.xhtmlrenderer.util.XRRuntimeException;
+import java.io.*;
+import java.nio.charset.StandardCharsets;
+import java.net.URL;
+
+import javax.annotation.ParametersAreNonnullByDefault;
+
+/**
+ * Resource that parses HTML with jsoup and exposes the result as a W3C DOM document.
+ *
+ * <p>Every {@code load} variant ends up in the private constructor, which runs jsoup's
+ * HTML parser with UTF-8 as the charset and an empty base URI, then converts the jsoup
+ * tree with {@link W3CDom}. I/O failures are logged and rethrown as
+ * {@link XRRuntimeException}.
+ */
+@ParametersAreNonnullByDefault
+public class HTMLResource extends AbstractResource {
+    private final org.w3c.dom.Document document;
+
+    /**
+     * Parses {@code stream} as HTML and stores the converted W3C DOM document.
+     *
+     * @param stream HTML bytes, handed to jsoup with UTF-8 as the charset
+     * @throws XRRuntimeException if jsoup reports an {@link IOException}
+     */
+    private HTMLResource(InputStream stream) {
+        super(stream);
+        try {
+            // The empty string is the base URI: no load variant receives one.
+            Document jsoupDoc = Jsoup.parse(stream, StandardCharsets.UTF_8.name(), "", Parser.htmlParser());
+            this.document = new W3CDom().fromJsoup(jsoupDoc);
+        } catch (IOException e) {
+            throw loadFailure("Failed to parse and convert HTML document.", e);
+        }
+    }
+
+    /**
+     * Opens {@code source}, parses its content as HTML and closes the stream again.
+     *
+     * @param source location of the HTML document
+     * @return the parsed resource
+     * @throws XRRuntimeException if the URL cannot be opened or read
+     */
+    public static HTMLResource load(URL source) {
+        try (InputStream stream = source.openStream()) {
+            return new HTMLResource(stream);
+        } catch (IOException e) {
+            throw loadFailure("Failed to load HTML from URL.", e);
+        }
+    }
+
+    /**
+     * Parses the HTML read from {@code stream}.
+     *
+     * @param stream HTML bytes, handed to jsoup with UTF-8 as the charset
+     * @return the parsed resource
+     * @throws XRRuntimeException if reading the stream fails
+     */
+    public static HTMLResource load(InputStream stream) {
+        return new HTMLResource(stream);
+    }
+
+    /**
+     * Reads {@code reader} to the end and parses the collected text as HTML.
+     *
+     * @param reader source of the HTML text; this method does not close it
+     * @return the parsed resource
+     * @throws XRRuntimeException if reading from {@code reader} fails
+     */
+    public static HTMLResource load(Reader reader) {
+        StringWriter content = new StringWriter();
+        try {
+            reader.transferTo(content);
+        } catch (IOException e) {
+            throw loadFailure("Failed to load HTML from Reader.", e);
+        }
+        return load(content.toString());
+    }
+
+    /**
+     * Parses HTML markup held in a string.
+     *
+     * @param xml the markup to parse (HTML, despite the parameter name)
+     * @return the parsed resource
+     */
+    public static HTMLResource load(String xml) {
+        // Encode once as UTF-8, the charset the constructor passes to jsoup.
+        return new HTMLResource(new ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)));
+    }
+
+    /**
+     * @return the W3C DOM document built from the parsed HTML
+     */
+    public org.w3c.dom.Document getDocument() {
+        return document;
+    }
+
+    /**
+     * Logs a load failure at SEVERE level and builds the exception for the caller to throw,
+     * so that the log entry and the exception always carry the same message.
+     */
+    private static XRRuntimeException loadFailure(String message, IOException cause) {
+        XRLog.load(java.util.logging.Level.SEVERE, message, cause);
+        return new XRRuntimeException(message, cause);
+    }
+}
diff --git a/flying-saucer-core/src/main/java/org/xhtmlrenderer/resource/XMLResource.java b/flying-saucer-core/src/main/java/org/xhtmlrenderer/resource/XMLResource.java
@@ -59,8 +59,11 @@
 
 
 /**
+ * @deprecated Use {@link HTMLResource} instead.
+ *
  * @author Patrick Wright
  */
+@Deprecated
 @ParametersAreNonnullByDefault
 public class XMLResource extends AbstractResource {
     private Document document;

diff --git a/flying-saucer-core/src/main/java/org/xhtmlrenderer/swing/BasicPanel.java b/flying-saucer-core/src/main/java/org/xhtmlrenderer/swing/BasicPanel.java
@@ -29,6 +29,7 @@
 import org.xhtmlrenderer.render.Box;
 import org.xhtmlrenderer.render.PageBox;
 import org.xhtmlrenderer.render.RenderingContext;
+import org.xhtmlrenderer.resource.HTMLResource;
 import org.xhtmlrenderer.resource.XMLResource;
 import org.xhtmlrenderer.simple.NoNamespaceHandler;
 import org.xhtmlrenderer.simple.extend.FormSubmissionListener;
@@ -340,8 +341,7 @@ public void setDocument(InputStream stream, String url, NamespaceHandler nsh) {
     }
 
     public void setDocumentFromString(String content, @Nullable String url, NamespaceHandler nsh) {
-        InputSource is = new InputSource(new StringReader(content));
-        Document dom = XMLResource.load(is).getDocument();
+        Document dom = HTMLResource.load(content).getDocument();
 
         setDocument(dom, url, nsh);
     }

diff --git a/flying-saucer-examples/src/main/java/org/xhtmlrenderer/demo/browser/BrowserPanel.java b/flying-saucer-examples/src/main/java/org/xhtmlrenderer/demo/browser/BrowserPanel.java
@@ -26,6 +26,7 @@
 import org.xhtmlrenderer.pdf.ITextRenderer;
 import org.xhtmlrenderer.pdf.PDFCreationListener;
 import org.xhtmlrenderer.pdf.util.XHtmlMetaToPdfInfoAdapter;
+import org.xhtmlrenderer.resource.HTMLResource;
 import org.xhtmlrenderer.resource.XMLResource;
 import org.xhtmlrenderer.simple.FSScrollPane;
 import org.xhtmlrenderer.swing.ImageResourceLoader;
@@ -316,7 +317,7 @@ public void exportToPdf(String path) throws IOException, ParserConfigurationExce
     }
 
     private void handlePageLoadFailed(String url_text, XRRuntimeException ex) {
-        final XMLResource xr;
+        final HTMLResource xr;
         final String rootCause = getRootCause(ex);
         final String msg = GeneralUtil.escapeHTML(addLineBreaks(rootCause));
         String notFound =
@@ -333,7 +334,7 @@ private void handlePageLoadFailed(String url_text, XRRuntimeException ex) {
                         "</body>\n" +
                         "</html>";
 
-        xr = XMLResource.load(new StringReader(notFound));
+        xr = HTMLResource.load(notFound);
         SwingUtilities.invokeLater(() -> root.panel.view.setDocument(xr.getDocument(), null));
    }
 

diff --git a/flying-saucer-pdf/src/main/java/org/xhtmlrenderer/pdf/ITextRenderer.java b/flying-saucer-pdf/src/main/java/org/xhtmlrenderer/pdf/ITextRenderer.java
@@ -39,10 +39,9 @@
 import org.xhtmlrenderer.render.PageBox;
 import org.xhtmlrenderer.render.RenderingContext;
 import org.xhtmlrenderer.render.ViewportBox;
-import org.xhtmlrenderer.resource.XMLResource;
+import org.xhtmlrenderer.resource.HTMLResource;
 import org.xhtmlrenderer.simple.extend.XhtmlNamespaceHandler;
 import org.xhtmlrenderer.util.Configuration;
-import org.xml.sax.InputSource;
 
 import javax.annotation.Nullable;
 import javax.annotation.ParametersAreNonnullByDefault;
@@ -215,7 +214,7 @@ public final void setDocumentFromString(String content, @Nullable String baseUrl
 
     private Document parse(String content) {
         try (var is = new StringReader(content)) {
-            return XMLResource.load(new InputSource(is)).getDocument();
+            return HTMLResource.load(is).getDocument();
         }
     }
 

diff --git a/flying-saucer-pdf/src/test/java/org/xhtmlrenderer/pdf/bug/FloatedTableCellTest.java b/flying-saucer-pdf/src/test/java/org/xhtmlrenderer/pdf/bug/FloatedTableCellTest.java
@@ -5,6 +5,7 @@
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.xhtmlrenderer.pdf.ITextRenderer;
+import org.xhtmlrenderer.resource.HTMLResource;
 import org.xhtmlrenderer.resource.XMLResource;
 
 import java.io.File;
@@ -36,7 +37,7 @@ void tableWithFloatedCell() throws IOException {
 			</html>""";
 
         ITextRenderer renderer = new ITextRenderer();
-		byte[] result = renderer.createPDF(XMLResource.load(page).getDocument());
+		byte[] result = renderer.createPDF(HTMLResource.load(page).getDocument());
 		printFile(result, "table-with-floated-cell.pdf");
 		PDF pdf = new PDF(result);
 		assertThat(pdf).containsText("first cell", "second cell");
@@ -48,7 +49,7 @@ void tableCell() throws IOException {
 			<table><tr><td style="float:left;">first cell {float:left;}</td></tr></table>""";
 
         ITextRenderer renderer = new ITextRenderer();
-		byte[] result = renderer.createPDF(XMLResource.load(page).getDocument());
+		byte[] result = renderer.createPDF(HTMLResource.load(page).getDocument());
 		printFile(result, "table-cell.pdf");
 
 		PDF pdf = new PDF(result);		

diff --git a/pom.xml b/pom.xml
@@ -74,6 +74,11 @@
       <version>${slf4j.version}</version>
       <scope>test</scope>
     </dependency>
+    <dependency>
+      <groupId>org.jsoup</groupId>
+      <artifactId>jsoup</artifactId>
+      <version>1.17.2</version>
+    </dependency>
   </dependencies>
 
   <modules>