Skip to content

Commit

Permalink
feat(front): use webworker for markdown parsing (#1733)
Browse files — browse the repository at this point in the history
* feat(front): move markdown parsing to web worker

* feat: use webworker for markdown parsing

* feat(markdown-worker): implement message buffering and processing

* fix(markdown): import KaTeX CSS locally instead of via CDN

* fix(markdown): make sure messages are serializable

* feat(markdown): make sure links have target blank

* refactor(markdown): improve HTML escaping function formatting
  • Loading branch information
nsarrazin authored Feb 26, 2025
1 parent a378823 commit 6faa829
Show file tree
Hide file tree
Showing 4 changed files with 313 additions and 177 deletions.
214 changes: 43 additions & 171 deletions src/lib/components/chat/MarkdownRenderer.svelte
Original file line number Diff line number Diff line change
@@ -1,199 +1,71 @@
<script lang="ts">
import type { WebSearchSource } from "$lib/types/WebSearch";
import katex from "katex";
import "katex/dist/contrib/mhchem.mjs";
import DOMPurify from "isomorphic-dompurify";
import { Marked } from "marked";
import type { Tokens, TokenizerExtension, RendererExtension } from "marked";
import { processTokens, processTokensSync, type Token } from "$lib/utils/marked";
import MarkdownWorker from "$lib/workers/markdownWorker?worker";
import CodeBlock from "../CodeBlock.svelte";
import type { IncomingMessage, OutgoingMessage } from "$lib/workers/markdownWorker";
import { browser } from "$app/environment";
import DOMPurify from "isomorphic-dompurify";
/** Props accepted by this markdown renderer component. */
interface Props {
// Markdown source text to tokenize and render.
content: string;
// Optional web search sources, handed to token processing together with `content`.
sources?: WebSearchSource[];
}
// Read the props through the `$props()` rune; `sources` falls back to an empty array when omitted.
let { content, sources = [] }: Props = $props();
/** Token emitted by the `katexBlock` tokenizer for display-mode math. */
interface katexBlockToken extends Tokens.Generic {
type: "katexBlock";
// Full matched source, delimiters included.
raw: string;
// Math source found between the delimiters, trimmed.
text: string;
// Always `true` for block tokens; forwarded to KaTeX as its `displayMode` option.
displayMode: true;
}
/** Token emitted by the `katexInline` tokenizer for inline math. */
interface katexInlineToken extends Tokens.Generic {
type: "katexInline";
// Full matched source, delimiters included.
raw: string;
// Math source found between the delimiters, trimmed.
text: string;
// Always `false` for inline tokens; forwarded to KaTeX as its `displayMode` option.
displayMode: false;
}
/**
 * Marked block-level extension for display math.
 *
 * Recognises `$$ ... $$` and `\[ ... \]` at the start of the remaining source
 * and renders the enclosed expression with KaTeX.
 */
export const katexBlockExtension: TokenizerExtension & RendererExtension = {
	name: "katexBlock",
	level: "block",
	// Position of the first `$$` or `\[` in `src`; -1 when neither is present.
	start(src: string): number | undefined {
		return src.search(/(\${2}|\\\[)/);
	},
	tokenizer(src: string): katexBlockToken | undefined {
		// Delimiter pairs are tried in order: `$$ ... $$` first, then `\[ ... \]`.
		const delimiterRules = [/^\${2}([\s\S]+?)\${2}/, /^\\\[([\s\S]+?)\\\]/];
		for (const rule of delimiterRules) {
			const found = rule.exec(src);
			if (found) {
				return {
					type: "katexBlock",
					raw: found[0],
					text: found[1].trim(),
					displayMode: true,
				};
			}
		}
		return undefined;
	},
	renderer(token) {
		// Tokens of any other type are left for other renderers.
		if (token.type !== "katexBlock") {
			return undefined;
		}
		// `throwOnError: false` is passed so KaTeX does not throw on malformed input.
		return katex.renderToString(token.text, {
			throwOnError: false,
			displayMode: token.displayMode,
		});
	},
};
// Create the markdown worker only when running in the browser and `window.Worker` exists; otherwise `worker` is null.
const worker = browser && window.Worker ? new MarkdownWorker() : null;
/**
 * Marked inline-level extension for inline math.
 *
 * Recognises `$...$` and `\(...\)` at the start of the remaining source and
 * renders the enclosed expression with KaTeX.
 */
const katexInlineExtension: TokenizerExtension & RendererExtension = {
	name: "katexInline",
	level: "inline",
	// Position of the first `$` or `\(` in `src`; -1 when neither is present.
	start(src: string): number | undefined {
		return src.search(/(\$|\\\()/);
	},
	tokenizer(src: string): katexInlineToken | undefined {
		// Delimiter pairs are tried in order: `$...$` first, then `\(...\)`.
		const delimiterRules = [/^\$([^$]+?)\$/, /^\\\(([\s\S]+?)\\\)/];
		for (const rule of delimiterRules) {
			const found = rule.exec(src);
			if (found) {
				return {
					type: "katexInline",
					raw: found[0],
					text: found[1].trim(),
					displayMode: false,
				};
			}
		}
		return undefined;
	},
	renderer(token) {
		// Tokens of any other type are left for other renderers.
		if (token.type !== "katexInline") {
			return undefined;
		}
		// `throwOnError: false` is passed so KaTeX does not throw on malformed input.
		return katex.renderToString(token.text, {
			throwOnError: false,
			displayMode: token.displayMode,
		});
	},
};
/**
 * Escapes the five HTML-significant characters (`<`, `>`, `&`, `"`, `'`)
 * so the text can be embedded in markup as literal content.
 *
 * @param content - Text that may contain HTML-significant characters.
 * @returns The text with each such character replaced by its entity.
 */
function escapeHTML(content: string) {
	const toEntity = (ch: string): string => {
		switch (ch) {
			case "<":
				return "&lt;";
			case ">":
				return "&gt;";
			case "&":
				return "&amp;";
			case "'":
				return "&#39;";
			case '"':
				return "&quot;";
			default:
				// Not reachable for the character class below; kept as a pass-through.
				return ch;
		}
	};
	return content.replace(/[<>&"']/g, (ch) => toEntity(ch));
}
let { content, sources = [] }: Props = $props();
function addInlineCitations(md: string, webSearchSources: WebSearchSource[] = []): string {
const linkStyle =
"color: rgb(59, 130, 246); text-decoration: none; hover:text-decoration: underline;";
let tokens: Token[] = $state(processTokensSync(content, sources));
return md.replace(/\[(\d+)\]/g, (match: string) => {
const indices: number[] = (match.match(/\d+/g) || []).map(Number);
const links: string = indices
.map((index: number) => {
if (index === 0) return false;
const source = webSearchSources[index - 1];
if (source) {
return `<a href="${source.link}" target="_blank" rel="noreferrer" style="${linkStyle}">${index}</a>`;
async function processContent(content: string, sources: WebSearchSource[]): Promise<Token[]> {
if (worker) {
return new Promise((resolve) => {
worker.onmessage = (event: MessageEvent<OutgoingMessage>) => {
if (event.data.type !== "processed") {
throw new Error("Invalid message type");
}
return "";
})
.filter(Boolean)
.join(", ");
return links ? ` <sup>${links}</sup>` : match;
});
resolve(event.data.tokens);
};
worker.postMessage(
JSON.parse(JSON.stringify({ content, sources, type: "process" })) as IncomingMessage
);
});
} else {
return processTokens(content, sources);
}
}
const marked = new Marked({
hooks: {
postprocess: (html) => DOMPurify.sanitize(addInlineCitations(html, sources)),
},
extensions: [katexBlockExtension, katexInlineExtension],
renderer: {
link: (href, title, text) =>
`<a href="${href?.replace(/>$/, "")}" target="_blank" rel="noreferrer">${text}</a>`,
html: (html) => escapeHTML(html),
},
gfm: true,
breaks: true,
/**
 * Keeps `tokens` in step with the `content` and `sources` props.
 *
 * Outside the browser the tokens are computed synchronously. In the browser
 * the work goes through `processContent` and `tokens` is assigned once its
 * promise resolves.
 */
$effect(() => {
	if (!browser) {
		tokens = processTokensSync(content, sources);
		return;
	}
	// `content` and `sources` are read synchronously here, before any async boundary.
	processContent(content, sources)
		.then((processed) => {
			tokens = processed;
		})
		.catch((error: unknown) => {
			// Report failures instead of leaving an unhandled promise rejection;
			// the previously rendered tokens stay in place.
			console.error("Failed to process markdown content", error);
		});
});
// Make every anchor that survives sanitisation open in a new tab with `rel="noreferrer"`.
// NOTE(review): `addHook` registers on the shared DOMPurify instance; if this script runs
// once per component instance the hook is added repeatedly — confirm, and consider
// registering it once at module scope.
DOMPurify.addHook("afterSanitizeAttributes", (node) => {
	if (node.tagName === "A") {
		node.setAttribute("target", "_blank");
		node.setAttribute("rel", "noreferrer");
	}
});
</script>

{#each marked.lexer(content) as token}
{#if token.type === "code"}
<CodeBlock lang={token.lang} code={token.text} />
{:else}
{#await marked.parse(token.raw) then parsed}
{#each tokens as token}
{#if token.type === "text"}
{#await token.html then html}
<!-- eslint-disable-next-line svelte/no-at-html-tags -->
{@html parsed}
{@html DOMPurify.sanitize(html)}
{/await}
{:else if token.type === "code"}
<CodeBlock lang={token.lang} code={token.code} />
{/if}
{/each}
Loading

0 comments on commit 6faa829

Please sign in to comment.