Skip to content

Commit

Permalink
[TOOL-3562] Portal: Add llms.txt and llms-full.txt generation script
Browse files Browse the repository at this point in the history
  • Loading branch information
MananTank committed Feb 28, 2025
1 parent 56301bc commit f8417b2
Show file tree
Hide file tree
Showing 14 changed files with 382 additions and 260 deletions.
2 changes: 2 additions & 0 deletions apps/portal/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@ next-env.d.ts

# generated files
searchIndex.json
public/llms.txt
public/llms-full.txt

.env
public/sitemap*.xml
Expand Down
3 changes: 3 additions & 0 deletions apps/portal/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,11 @@
"date-fns": "4.1.0",
"flexsearch": "^0.7.43",
"github-slugger": "^2.0.0",
"he": "^1.2.0",
"lucide-react": "0.476.0",
"next": "15.2.0",
"nextjs-toploader": "^1.6.12",
"node-html-markdown": "^1.3.0",
"node-html-parser": "^6.1.13",
"posthog-js": "1.67.1",
"prettier": "3.3.3",
Expand All @@ -55,6 +57,7 @@
"devDependencies": {
"@next/eslint-plugin-next": "15.2.0",
"@types/flexsearch": "^0.7.6",
"@types/he": "^1.2.3",
"@types/mdx": "^2.0.13",
"@types/node": "22.13.5",
"@types/react": "19.0.10",
Expand Down
9 changes: 6 additions & 3 deletions apps/portal/scripts/extractSearchData.ts
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
import { writeFileSync } from "node:fs";
import { extractSearchData } from "../src/app/api/search/extraction";
import { extractContent } from "../src/app/api/search/extraction";

async function main() {
const rootDir = process.cwd();
const websiteData = await extractSearchData(rootDir);
writeFileSync("./searchIndex.json", JSON.stringify(websiteData, null, 2));
const { searchData, llmContent, llmFullContent } =
await extractContent(rootDir);
writeFileSync("./searchIndex.json", JSON.stringify(searchData, null, 2));
writeFileSync("./public/llms.txt", llmContent);
writeFileSync("./public/llms-full.txt", llmFullContent);
}

main();
2 changes: 1 addition & 1 deletion apps/portal/src/app/account/layout.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ import { sidebar } from "./sidebar";

export default async function Layout(props: { children: React.ReactNode }) {
return (
<DocLayout sideBar={sidebar} editPageButton={true}>
<DocLayout sideBar={sidebar} editPageButton={true} noLLM={true}>
{props.children}
</DocLayout>
);
Expand Down
202 changes: 186 additions & 16 deletions apps/portal/src/app/api/search/extraction/index.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
import { readFile } from "node:fs/promises";
import he from "he";
import { NodeHtmlMarkdown } from "node-html-markdown";
import {
CommentNode as X_CommentNode,
HTMLElement as X_HTMLElement,
Expand All @@ -11,11 +13,35 @@ import { getFilesRecursive } from "./getFilesRecursive";
import { ignoreHeadings } from "./settings";
import { trimExtraSpace } from "./trimExtraSpace";

export async function extractSearchData(rootDir: string): Promise<PageData[]> {
type ExtractedContent = {
searchData: PageData[];
llmContent: string;
llmFullContent: string;
};

const llmsContentHeader = `\
# thirdweb
> Frontend, Backend, and Onchain tools to build complete web3 apps — on every EVM chain.
## Docs
`;

const llmsFullContentHeader = `\
# thirdweb
> Frontend, Backend, and Onchain tools to build complete web3 apps — on every EVM chain.
`;

export async function extractContent(
rootDir: string,
): Promise<ExtractedContent> {
const nextOutputDir = `${rootDir}/.next/server/app`;
const htmlFiles = getFilesRecursive(nextOutputDir, "html");

const pages: PageData[] = [];
let llmContent = "";
let llmFullContent = "";

const noMainFound: string[] = [];
const noH1Found: string[] = [];
Expand All @@ -25,8 +51,9 @@ export async function extractSearchData(rootDir: string): Promise<PageData[]> {
const htmlContent = await readFile(filePath, "utf-8");
const mainEl = parse(htmlContent, {
comment: false,
// fixNestedATags: true,
blockTextElements: {
pre: false, // parse text inside <pre> elements instead of treating it as text
pre: true,
},
}).querySelector("main");

Expand All @@ -38,24 +65,38 @@ export async function extractSearchData(rootDir: string): Promise<PageData[]> {
}

const noIndex = mainEl.getAttribute("data-noindex");

if (noIndex) {
if (noIndex === "true") {
return;
}

const pageTitle = mainEl.querySelector("h1")?.text;

if (!pageTitle) {
noH1Found.push(
filePath.split(".next/server/app")[1]?.replace(".html", "") || "",
);
}

pages.push({
href: filePath.replace(nextOutputDir, "").replace(".html", ""),
title: pageTitle ? trimExtraSpace(pageTitle) : "",
sections: getPageSections(mainEl),
});
// Important: do the search index collection first - we will modify the main element in the next step
// Extract search data
const pageData = extractPageSearchData(
mainEl,
filePath,
nextOutputDir,
pageTitle,
);
if (pageData) {
pages.push(pageData);
}

// Extract LLM content
const { links, full } = extractPageLLMContent(
mainEl,
pageTitle,
filePath,
nextOutputDir,
);
llmContent += links ? `${links}\n` : "";
llmFullContent += full ? `${full}\n` : "";
}),
);

Expand All @@ -77,13 +118,144 @@ export async function extractSearchData(rootDir: string): Promise<PageData[]> {
console.warn("\n");
}

return pages;
return {
searchData: pages,
llmContent: `${llmsContentHeader}\n${llmContent}`,
llmFullContent: `${llmsFullContentHeader}\n${llmFullContent}`,
};
}

/**
 * Produces the search-index record for a single rendered page.
 *
 * @param main - The page's `<main>` element.
 * @param filePath - Path of the built HTML file.
 * @param nextOutputDir - Prefix removed from `filePath` to obtain the page href.
 * @param pageTitle - Text of the page's `<h1>`, when one was found.
 * @returns The page record, or `null` when `<main>` carries `data-noindex="true"`.
 */
function extractPageSearchData(
  main: X_HTMLElement,
  filePath: string,
  nextOutputDir: string,
  pageTitle: string | undefined,
): PageData | null {
  const excludedFromIndex = main.getAttribute("data-noindex") === "true";
  if (excludedFromIndex) {
    return null;
  }

  // href is the file path relative to the build output, without the extension
  const href = filePath.replace(nextOutputDir, "").replace(".html", "");
  // pages without an <h1> are indexed with an empty title
  const title = pageTitle ? trimExtraSpace(pageTitle) : "";
  const sections = getPageSectionsForSearchIndex(main);

  return { href, title, sections };
}

/**
 * Builds the per-page contributions to `llms.txt` (a single markdown link
 * line) and `llms-full.txt` (the page body converted to markdown).
 *
 * NOTE: this function mutates `main` in place - it removes excluded elements,
 * shifts heading levels, rewrites relative links and replaces `<pre>` blocks.
 * Anything that needs the untouched DOM must run before this is called.
 *
 * @param main - The page's `<main>` element.
 * @param pageTitle - Text of the page's `<h1>`, when one was found.
 * @param filePath - Path of the built HTML file.
 * @param nextOutputDir - Prefix removed from `filePath` to obtain the page URL.
 * @returns `links` is the markdown list item for the page and `full` is the
 * page converted to markdown. Both are empty strings when `<main>` has
 * `data-noindex="true"` or `data-no-llm="true"`.
 */
function extractPageLLMContent(
  main: X_HTMLElement,
  pageTitle: string | undefined,
  filePath: string,
  nextOutputDir: string,
): { links: string; full: string } {
  if (
    main.getAttribute("data-noindex") === "true" ||
    main.getAttribute("data-no-llm") === "true"
  ) {
    return { links: "", full: "" };
  }

  const htmlToMarkdown = new NodeHtmlMarkdown({
    keepDataImages: false,
  });

  const pageUrl = filePath.replace(nextOutputDir, "").replace(".html", "");

  // Use the first non-empty paragraph as the description.
  // A paragraph is skipped when it opts out through EITHER attribute.
  let description = "";
  for (const p of main.querySelectorAll("p")) {
    const isExcluded =
      p.getAttribute("data-noindex") === "true" ||
      p.getAttribute("data-no-llm") === "true";
    if (isExcluded) {
      continue;
    }

    description = trimExtraSpace(htmlToMarkdown.translate(p.toString()));
    if (description) {
      break;
    }
  }

  // Pages without an <h1> fall back to their URL so the label is never "undefined"
  const linkLabel = pageTitle ? trimExtraSpace(pageTitle) : pageUrl;
  const linksContent = description
    ? `* [${linkLabel}](${pageUrl}): ${description}`
    : `* [${linkLabel}](${pageUrl})`;

  // Remove noindex and no-llm elements
  for (const element of main.querySelectorAll("*")) {
    if (
      element.getAttribute("data-noindex") === "true" ||
      element.getAttribute("data-no-llm") === "true"
    ) {
      element.remove();
    }
  }

  // Shift all heading elements 1 step down (h1 > h2, h2 > h3, etc.), capped at h6
  for (const heading of main.querySelectorAll("h1, h2, h3, h4, h5, h6")) {
    const headingLevel = Number.parseInt(heading.tagName.replace("H", ""), 10);
    const newLevel = Math.min(headingLevel + 1, 6);
    heading.tagName = `H${newLevel}`;
  }

  // Prefix all the relative links with `https://portal.thirdweb.com`
  for (const link of main.querySelectorAll("a")) {
    const [path, hash] = link.getAttribute("href")?.split("#") || [];
    if (path?.startsWith("/")) {
      link.setAttribute(
        "href",
        `https://portal.thirdweb.com${path}${hash ? `#${hash}` : ""}`,
      );
    }
  }

  // For code blocks inside pre tags -> make them direct descendants of the pre tag
  // so they are parsed as blocks by node-html-markdown + add language class
  for (const preTag of main.querySelectorAll("pre")) {
    const codeBlock = parse(preTag.innerHTML.toString(), {
      comment: false,
      blockTextElements: {
        pre: true,
      },
    }).querySelector("code");

    if (!codeBlock) {
      continue;
    }

    // Each matched div's text becomes one line of the code sample
    // NOTE(review): the selector presumably mirrors the code-highlighter markup - confirm
    const code = codeBlock
      .querySelectorAll("div > div > div > div")
      .map((x) => x.textContent)
      .join("\n")
      .trim();

    const lang = codeBlock.getAttribute("lang");

    // Only emit a class attribute when a language is known, and always quote/escape it
    const classAttribute = lang
      ? ` class="language-${he.escape(lang)}"`
      : "";

    const newCodePreBlock = parse(
      `<pre><code${classAttribute}>${he.encode(code)}</code></pre>`,
    );

    preTag.replaceWith(newCodePreBlock);
  }

  // Convert the cleaned HTML to markdown
  const fullContent = `${htmlToMarkdown.translate(main.toString())}`;

  return {
    links: linksContent,
    full: fullContent,
  };
}

function getPageSections(main: X_HTMLElement): PageSectionData[] {
function getPageSectionsForSearchIndex(main: X_HTMLElement): PageSectionData[] {
const sectionData: PageSectionData[] = [];

const ignoreTags = new Set(["code", "nav"].map((t) => t.toUpperCase()));
const ignoreTags = new Set(
["code", "nav", "pre"].map((t) => t.toUpperCase()),
);

function collector(node: X_Node) {
if (node instanceof X_CommentNode) {
Expand All @@ -94,9 +266,7 @@ function getPageSections(main: X_HTMLElement): PageSectionData[] {
return;
}

const noIndexAttribute = node.getAttribute("data-noindex");

if (noIndexAttribute === "true") {
if (node.getAttribute("data-noindex") === "true") {
return;
}

Expand Down
2 changes: 1 addition & 1 deletion apps/portal/src/app/cli/layout.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ import { sidebar } from "./sidebar";

export default async function Layout(props: { children: React.ReactNode }) {
return (
<DocLayout sideBar={sidebar} editPageButton={true}>
<DocLayout sideBar={sidebar} editPageButton={true} noLLM={true}>
{props.children}
</DocLayout>
);
Expand Down
2 changes: 1 addition & 1 deletion apps/portal/src/app/page.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ import DocsHeroLight from "./_images/docs-hero-light.png";

export default function Page() {
return (
<main className="container max-w-[900px] grow pb-20">
<main className="container max-w-[900px] grow pb-20" data-noindex>
<Hero />
<div className="grid grid-cols-1 gap-8">
<FrontendSection />
Expand Down
2 changes: 1 addition & 1 deletion apps/portal/src/app/react-native/v5/layout.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ export default async function Layout(props: { children: React.ReactNode }) {
</div>
}
>
<div data-noindex>{props.children}</div>
<div>{props.children}</div>
</DocLayout>
);
}
Expand Down
2 changes: 1 addition & 1 deletion apps/portal/src/app/react/v5/layout.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ export default async function Layout(props: { children: React.ReactNode }) {
</div>
}
>
<div data-noindex>{props.children}</div>
<div>{props.children}</div>
</DocLayout>
);
}
Expand Down
2 changes: 1 addition & 1 deletion apps/portal/src/app/typescript/v5/layout.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ export default async function Layout(props: { children: React.ReactNode }) {
</div>
}
>
<div data-noindex>{props.children}</div>
<div>{props.children}</div>
</DocLayout>
);
}
Expand Down
2 changes: 2 additions & 0 deletions apps/portal/src/components/Document/Cards/ArticleCard.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ export function ArticleCard(props: {
const isExternal = props.href.startsWith("http");
return (
<Link
data-noindex
href={props.href}
className="flex cursor-default bg-card"
target={isExternal ? "_blank" : undefined}
Expand Down Expand Up @@ -38,6 +39,7 @@ export function ArticleIconCard(props: {
const isExternal = props.href.startsWith("http");
return (
<Link
data-noindex
href={props.href}
className={cn(
"flex items-center gap-4 rounded-lg border bg-card p-4 transition-colors hover:border-active-border",
Expand Down
Loading

0 comments on commit f8417b2

Please sign in to comment.