From 27bf4c5534aea824eb72b036847127345cdb7643 Mon Sep 17 00:00:00 2001 From: robyngit Date: Thu, 19 Sep 2024 21:47:02 -0400 Subject: [PATCH] Allow iFrames from trusted sources in MarkdownView - Add a new Showdown extension to allow iFrames from trusted sources - Add a property in the AppModel to store the list of trusted sources - Allow iFrames through the xss filter with limited attributes so that they can be processed by the new Showdown extension - Add the new Showdown extension to the MarkdownView Issue #1383 --- .../showdown/extensions/showdown-iframes.js | 159 ++++++++++++++++++ .../showdown-xss-filter.js | 1 + src/js/app.js | 2 + src/js/models/AppModel.js | 21 +++ src/js/views/MarkdownView.js | 12 ++ 5 files changed, 195 insertions(+) create mode 100644 src/components/showdown/extensions/showdown-iframes.js diff --git a/src/components/showdown/extensions/showdown-iframes.js b/src/components/showdown/extensions/showdown-iframes.js new file mode 100644 index 000000000..4afadff09 --- /dev/null +++ b/src/components/showdown/extensions/showdown-iframes.js @@ -0,0 +1,159 @@
+/**
+ * SHOWDOWN IFRAMES
+ *
+ * This extension filters out iframes with src attributes that
+ * are not from a trusted source
+ */
+
+/** List of trusted URL patterns, read once when this module is evaluated */
+const TRUSTED_SOURCES = MetacatUI.appModel.get("trustedContentSources") || [];
+
+/**
+ * The sandbox to add to iframes from trusted sources. This allows the iframe
+ * some capabilities, such as running scripts and accessing its own origin.
+ * NOTE(review): with both of these flags, content served from this app's own
+ * origin could remove its sandbox - confirm no trusted source is same-origin.
+ */
+const SANDBOX = `sandbox="allow-scripts allow-same-origin"`;
+
+/**
+ * Regular expression that finds all iframes in the converted markdown. Each
+ * match starts at the opening tag. The groups capture the src attribute and,
+ * when a closing tag follows, the inner content and the closing tag. Letter
+ * case is ignored, as it is in HTML tag and attribute names.
+ * NOTE(review): only a double-quoted src is recognised - presumably the xss
+ * filter has already normalised the attributes; confirm.
+ * @type {RegExp}
+ */
+const IFRAME_REGEX =
+  /<iframe\b[^>]*?\bsrc="([^"]*)"[^>]*?>(?:([\s\S]*?)(<\/iframe\s*>))?/gi;
+
+/**
+ * Regular expression that isolates the opening tag at the start of an
+ * IFRAME_REGEX match. It repeats the first part of IFRAME_REGEX.
+ * @type {RegExp}
+ */
+const OPENING_TAG_REGEX = /^<iframe\b[^>]*?\bsrc="[^"]*"[^>]*?>/i;
+
+/**
+ * Function to convert URL patterns with wildcards to regex patterns. Each '*'
+ * matches any run of characters and every other character is taken literally.
+ * The regex has to match the whole URL and ignores letter case.
+ * @param {string} wildcardPattern - The URL pattern with wildcards
+ * @returns {RegExp} - The regex pattern
+ */
+function patternToRegex(wildcardPattern) {
+  // Escape special regex characters except for '*', then replace each '*'
+  // with '.*'. A leading "http(s)://" needs no separate handling: it holds
+  // no '*' and is escaped in the same way as the rest of the pattern.
+  const source = wildcardPattern
+    .replace(/[-/\\^$+?.()|[\]{}]/g, "\\$&")
+    .replace(/\*/g, ".*");
+
+  return new RegExp(`^${source}$`, "i"); // Case-insensitive matching
+}
+
+/**
+ * Check if a URL is valid according to the trusted sources. Trusted sources may
+ * use wildcards (*) to match multiple URLs. For example, the trusted source
+ * "https://*dataone.org/*" will match any URL that starts with "https://",
+ * contains "dataone.org", and ends with a path. The trusted source
+ * "*arcticdata.io*" will match any URL that contains "arcticdata.io". It could
+ * also include wildcards at any position, such as
+ * "*arcticdata.io/*\/something".
+ * Note that a wildcard also matches "/", so it can span the host and path.
+ * @param {string} url - The URL to check
+ * @returns {boolean} - True if the URL is trusted, false otherwise
+ */
+function isTrustedUrl(url) {
+  if (!Array.isArray(TRUSTED_SOURCES) || !TRUSTED_SOURCES.length) return false;
+
+  // Only absolute http(s) URLs can be trusted. new URL() throws on anything
+  // that is not an absolute URL, which here simply means "not trusted".
+  let protocol;
+  try {
+    ({ protocol } = new URL(url));
+  } catch (e) {
+    return false;
+  }
+  if (protocol !== "http:" && protocol !== "https:") return false;
+
+  // Check if the URL matches any of the trusted sources
+  return TRUSTED_SOURCES.some((pattern) => patternToRegex(pattern).test(url));
+}
+
+/**
+ * Build the replacement for an iframe whose source is not trusted: a link to
+ * the source when it is an http(s) URL, otherwise just the source as text.
+ * @param {string} src - The src attribute of the untrusted iframe
+ * @returns {string} - The HTML to show in place of the iframe
+ */
+function externalContentLink(src) {
+  // The src is author-supplied text that is written back into the HTML, so
+  // make sure it cannot open or close a tag. It holds no '"' (IFRAME_REGEX
+  // stops the src at the first one), so it cannot end the href early either.
+  const safeSrc = src.replace(/</g, "&lt;").replace(/>/g, "&gt;");
+  const label = `External Content: ${safeSrc}`;
+  const isWebUrl = /^https?:\/\//i.test(src);
+  return isWebUrl ? `<a href="${safeSrc}">${label}</a>` : label;
+}
+
+/**
+ * Replace iFrames that are NOT from trusted sources with a link to the source
+ * URL. Make the iFrames from trusted sources secure by adding the 'sandbox'
+ * attribute, which restricts the iframe's capabilities. Remove any inner
+ * content from the iframe.
+ * @param {string} iframe - The full iframe match, starting at the opening tag
+ * @param {string} src - The src attribute of the iframe
+ * @param {string} _innerContent - The inner content of the iframe tag
+ * @param {string} [closingTag] - The closing iframe tag, if one was matched
+ * @param {number} _index - The index of the match
+ * @param {string} _markdown - The full markdown content
+ * @returns {string} - The sandboxed iframe, or the link that replaces it
+ */
+const secureIFrame = (
+  iframe,
+  src,
+  _innerContent,
+  closingTag,
+  _index,
+  _markdown,
+) => {
+  // Return as a link instead of an iframe if the source is not trusted
+  if (!isTrustedUrl(src)) return externalContentLink(src);
+
+  // Isolate the opening tag. It ends at the first '>' after the src attribute;
+  // the first '>' of the whole match could lie inside the src value.
+  const openingTagMatch = OPENING_TAG_REGEX.exec(iframe);
+
+  // If the opening tag cannot be isolated, show a link rather than embedding
+  if (!openingTagMatch) return externalContentLink(src);
+
+  // Put the sandbox attribute first. When an attribute is repeated the HTML
+  // parser keeps only the first one, so a sandbox attribute written by the
+  // author, in whatever form, cannot replace or weaken this one.
+  const openingTag = openingTagMatch[0].replace(
+    /^<iframe\b/i,
+    (tagName) => `${tagName} ${SANDBOX}`,
+  );
+
+  // Drop the inner content and always write a closing tag. iframe is not a
+  // void element, so a self-closed '<iframe />' is read as an opening tag
+  // and the content that follows it would be swallowed.
+  return `${openingTag}${closingTag || "</iframe>"}`;
+};
+
+/**
+ * The Showdown extension. As an "output" extension it is applied to the HTML
+ * produced from the markdown: every IFRAME_REGEX match goes to secureIFrame.
+ */
+const extension = {
+  type: "output",
+  regex: IFRAME_REGEX,
+  replace: secureIFrame,
+};
+
+// Register the extension with Showdown under the name "showdown-iframes"
+define(["showdown"], (showdown) => {
+  showdown.extension("showdown-iframes", () => [extension]);
+});
diff --git a/src/components/showdown/extensions/showdown-xss-filter/showdown-xss-filter.js b/src/components/showdown/extensions/showdown-xss-filter/showdown-xss-filter.js index 548c2153f..e0bd30aec 100644 --- a/src/components/showdown/extensions/showdown-xss-filter/showdown-xss-filter.js +++ b/src/components/showdown/extensions/showdown-xss-filter/showdown-xss-filter.js @@ -12,6 +12,7 @@ define(['showdown', 'xss'], function (showdown, xss) { var options = { css: false, allowList: { + iframe: ["src", "width", "height", "frameborder", "allowfullscreen"], a: ["target", "href", "title", "class", "target"], abbr: ["title"], address: [], diff --git a/src/js/app.js b/src/js/app.js index e460fda2a..b4f75146c 100644 --- a/src/js/app.js +++ b/src/js/app.js @@ -80,6 +80,8 @@ require.config({ "/components/showdown/extensions/showdown-xss-filter/xss.min", showdownHtags: MetacatUI.root + "/components/showdown/extensions/showdown-htags", + showdownIframes: + MetacatUI.root + "/components/showdown/extensions/showdown-iframes", // woofmark - markdown editor woofmark: MetacatUI.root + "/components/woofmark.min", // drop zone creates drag and drop areas diff --git a/src/js/models/AppModel.js
b/src/js/models/AppModel.js index d5af3c6c3..261d4d96c 100644 --- a/src/js/models/AppModel.js +++ b/src/js/models/AppModel.js @@ -1731,6 +1731,27 @@ define(["jquery", "underscore", "backbone"], function ($, _, Backbone) { */ feverUrl: "", + /** + * A list of trusted content sources from which MetacatUI can safely + * embed external content. This property is used to define URLs or URL + * patterns that are considered secure for embedding content in + * iframes, especially when rendering user-generated Markdown content. + * + * Each source in the list can include wildcards (`*`) to match any + * subdomain or path. For example, `"https://*.dataone.org/*"` matches + * any subdomain of `dataone.org` over HTTPS, and `"*arcticdata.io*"` + * matches any URL containing `arcticdata.io`. + * + * Set to an empty array or a falsy value to disable all embedded content. + * + * @type {string[]} + * @since 0.0.0 + */ + trustedContentSources: [ + "https://www.youtube.com/embed/*", + "https://player.vimeo.com/video/*", + ], + /** If true, then archived content is available in the search index. * Set to false if this MetacatUI is using a Metacat version before 2.10.0 * @type {boolean} diff --git a/src/js/views/MarkdownView.js b/src/js/views/MarkdownView.js index 46860bb2b..8e24b5863 100644 --- a/src/js/views/MarkdownView.js +++ b/src/js/views/MarkdownView.js @@ -179,6 +179,7 @@ define([ "footnotes", "showdown-citation", "showdown-images", + "showdown-iframes", ]; var numTestsTodo = SDextensions.length; @@ -219,6 +220,8 @@ define([ regexCitation = /\[@.+\]/; // test for any tags (regexHtags = new RegExp("#\\s")), (regexImages = /!\[.*\]\(\S+\)/); + // test for anything that looks like an iframe. Keep it very general. 
+ const regexIframes = /<\/iframe>/g; // ================================================================ // Test for and load each as required each showdown extension @@ -342,6 +345,15 @@ define([ } else { updateExtensionList("showdown-images", (required = false)); } + + // --- Test for iframes --- // + if (regexIframes.test(markdown)) { + require(["showdownIframes"], function (showdownIframes) { + updateExtensionList("showdown-iframes", (required = true)); + }); + } else { + updateExtensionList("showdown-iframes", (required = false)); + } }, /**