From e91f8cc6348b581c5d0ca95539335d5a5faf4935 Mon Sep 17 00:00:00 2001 From: Jeremy Howard Date: Sun, 1 Sep 2024 17:31:29 +1000 Subject: [PATCH] init commit --- .github/workflows/deploy.yaml | 14 + .github/workflows/test.yaml.off | 7 + .gitignore | 119 ++++---- MANIFEST.in | 5 + README.md | 106 ++++++- llms_txt/__init__.py | 1 + llms_txt/_modidx.py | 17 ++ llms_txt/core.py | 86 ++++++ nbs/00_intro.ipynb | 125 ++++++++ nbs/01_core.ipynb | 509 ++++++++++++++++++++++++++++++++ nbs/_quarto.yml | 23 ++ nbs/index.md | 66 +++++ nbs/llms-sample.txt | 20 ++ nbs/nbdev.yml | 9 + nbs/styles.css | 37 +++ pyproject.toml | 3 + settings.ini | 47 +++ setup.py | 64 ++++ 18 files changed, 1193 insertions(+), 65 deletions(-) create mode 100644 .github/workflows/deploy.yaml create mode 100644 .github/workflows/test.yaml.off create mode 100644 MANIFEST.in create mode 100644 llms_txt/__init__.py create mode 100644 llms_txt/_modidx.py create mode 100644 llms_txt/core.py create mode 100644 nbs/00_intro.ipynb create mode 100644 nbs/01_core.ipynb create mode 100644 nbs/_quarto.yml create mode 100644 nbs/index.md create mode 100644 nbs/llms-sample.txt create mode 100644 nbs/nbdev.yml create mode 100644 nbs/styles.css create mode 100644 pyproject.toml create mode 100644 settings.ini create mode 100644 setup.py diff --git a/.github/workflows/deploy.yaml b/.github/workflows/deploy.yaml new file mode 100644 index 0000000..29bfc57 --- /dev/null +++ b/.github/workflows/deploy.yaml @@ -0,0 +1,14 @@ +name: Deploy to GitHub Pages + +permissions: + contents: write + pages: write + +on: + push: + branches: [ "main", "master" ] + workflow_dispatch: +jobs: + deploy: + runs-on: ubuntu-latest + steps: [uses: fastai/workflows/quarto-ghp@master] diff --git a/.github/workflows/test.yaml.off b/.github/workflows/test.yaml.off new file mode 100644 index 0000000..5608592 --- /dev/null +++ b/.github/workflows/test.yaml.off @@ -0,0 +1,7 @@ +name: CI +on: [workflow_dispatch, pull_request, push] + +jobs: + test: + 
runs-on: ubuntu-latest + steps: [uses: fastai/workflows/nbdev-ci@master] diff --git a/.gitignore b/.gitignore index 82f9275..4d71a5f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,26 @@ +fasthtml.md +posts/ +.quarto +.sesskey +*.db-* +*.db +.gitattributes +_proc/ +sidebar.yml +Gemfile.lock +token +_docs/ +conda/ +.last_checked +.gitconfig +*.bak +*.log +*~ +~* +_tmp* +tmp* +tags + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] @@ -8,6 +31,7 @@ __pycache__/ # Distribution / packaging .Python +env/ build/ develop-eggs/ dist/ @@ -20,11 +44,9 @@ parts/ sdist/ var/ wheels/ -share/python-wheels/ *.egg-info/ .installed.cfg *.egg -MANIFEST # PyInstaller # Usually these files are written by a python script from a template @@ -39,17 +61,13 @@ pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ -.nox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover -*.py,cover .hypothesis/ -.pytest_cache/ -cover/ # Translations *.mo @@ -58,8 +76,6 @@ cover/ # Django stuff: *.log local_settings.py -db.sqlite3 -db.sqlite3-journal # Flask stuff: instance/ @@ -72,63 +88,27 @@ instance/ docs/_build/ # PyBuilder -.pybuilder/ target/ # Jupyter Notebook .ipynb_checkpoints -# IPython -profile_default/ -ipython_config.py - # pyenv -# For a library or package, you might want to ignore these files since the code is -# intended to run in multiple environments; otherwise, check them in: -# .python-version - -# pipenv -# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. -# However, in case of collaboration, if having platform-specific dependencies or dependencies -# having no cross-platform support, pipenv may install dependencies that don't work, or not -# install all needed dependencies. -#Pipfile.lock - -# poetry -# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 
-# This is especially recommended for binary packages to ensure reproducibility, and is more -# commonly ignored for libraries. -# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control -#poetry.lock - -# pdm -# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. -#pdm.lock -# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it -# in version control. -# https://pdm.fming.dev/latest/usage/project/#working-with-version-control -.pdm.toml -.pdm-python -.pdm-build/ - -# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm -__pypackages__/ - -# Celery stuff +.python-version + +# celery beat schedule file celerybeat-schedule -celerybeat.pid # SageMath parsed files *.sage.py -# Environments +# dotenv .env + +# virtualenv .venv -env/ venv/ ENV/ -env.bak/ -venv.bak/ # Spyder project settings .spyderproject @@ -142,21 +122,32 @@ venv.bak/ # mypy .mypy_cache/ -.dmypy.json -dmypy.json -# Pyre type checker -.pyre/ +.vscode +*.swp + +# osx generated files +.DS_Store +.DS_Store? +.Trashes +ehthumbs.db +Thumbs.db +.idea + +# pytest +.pytest_cache + +# tools/trust-doc-nbs +docs_src/.last_checked + +# symlinks to fastai +docs_src/fastai +tools/fastai -# pytype static type analyzer -.pytype/ +# link checker +checklink/cookies.txt -# Cython debug symbols -cython_debug/ +# .gitconfig is now autogenerated +.gitconfig -# PyCharm -# JetBrains specific template is maintained in a separate JetBrains.gitignore that can -# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore -# and can be added to the global gitignore or merged into this file. For a more nuclear -# option (not recommended) you can uncomment the following to ignore the entire idea folder. 
-#.idea/ +_docs diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..5c0e7ce --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,5 @@ +include settings.ini +include LICENSE +include CONTRIBUTING.md +include README.md +recursive-exclude * __pycache__ diff --git a/README.md b/README.md index 8b75bb7..4e12dfb 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,106 @@ # llms-txt -The /llms.txt file, helping language models use your website + + + + +This file will become your README and also the index of your +documentation. + +## Developer Guide + +If you are new to using `nbdev` here are some useful pointers to get you +started. + +### Setup + +It can be helpful to have a dedicated environment for development. Here +we are assuming that you have an conda environment file called `env.yml` +named after `llms_txt` i.e.: + +``` yaml +# env.yml +name: llms_txt + +channels: + - fastai + +dependencies: + - fastai::nbdev>=2.3.12 +# - python>=3.11 # specify python version if required +# - dependency 1 +# - dependency 2 +# - pip +# pip: +# - pip dependency 1 +# - pip dependency 2 +``` + +You can then use `conda` or `mamba` (faster at resolving) to create and +update your environment file should your needs change as you work on +`llms_txt` + +``` sh +# create a conda environment for working on llms-txt +$ mamba env create -f env.yml + +# update conda environment +$ mamba env update -n llms_txt --file env.yml +``` + +### Install llms_txt in Development mode + +``` sh +# activate conda environment +$ conda activate llms_txt + +# make sure llms_txt package is installed in development mode +$ pip install -e . + +# make changes under nbs/ directory +# ... 
+ +# compile to have changes apply to llms_txt +$ nbdev_prepare +``` + +## Usage + +### Installation + +Install latest from the GitHub +[repository](https://github.com/AnswerDotAI/llms-txt): + +``` sh +$ pip install git+https://github.com/AnswerDotAI/llms-txt.git +``` + +or from [conda](https://anaconda.org/AnswerDotAI/llms-txt) + +``` sh +$ conda install -c AnswerDotAI llms_txt +``` + +or from [pypi](https://pypi.org/project/llms-txt/) + +``` sh +$ pip install llms_txt +``` + +### Documentation + +Documentation can be found hosted on this GitHub +[repository](https://github.com/AnswerDotAI/llms-txt)’s +[pages](https://AnswerDotAI.github.io/llms-txt/). Additionally you can +find package manager specific guidelines on +[conda](https://anaconda.org/AnswerDotAI/llms-txt) and +[pypi](https://pypi.org/project/llms-txt/) respectively. + +## How to use + +Fill me in please! Don’t forget code examples: + +``` python +1+1 +``` + + 2 diff --git a/llms_txt/__init__.py b/llms_txt/__init__.py new file mode 100644 index 0000000..f102a9c --- /dev/null +++ b/llms_txt/__init__.py @@ -0,0 +1 @@ +__version__ = "0.0.1" diff --git a/llms_txt/_modidx.py b/llms_txt/_modidx.py new file mode 100644 index 0000000..54f2826 --- /dev/null +++ b/llms_txt/_modidx.py @@ -0,0 +1,17 @@ +# Autogenerated by nbdev + +d = { 'settings': { 'branch': 'main', + 'doc_baseurl': '/llms-txt', + 'doc_host': 'https://AnswerDotAI.github.io', + 'git_url': 'https://github.com/AnswerDotAI/llms-txt', + 'lib_path': 'llms_txt'}, + 'syms': { 'llms_txt.core': { 'llms_txt.core.Doc': ('core.html#doc', 'llms_txt/core.py'), + 'llms_txt.core.Section': ('core.html#section', 'llms_txt/core.py'), + 'llms_txt.core._opt_re': ('core.html#_opt_re', 'llms_txt/core.py'), + 'llms_txt.core._parse_llms_txt': ('core.html#_parse_llms_txt', 'llms_txt/core.py'), + 'llms_txt.core._parse_section': ('core.html#_parse_section', 'llms_txt/core.py'), + 'llms_txt.core._split_on_h2': ('core.html#_split_on_h2', 'llms_txt/core.py'), + 
'llms_txt.core.get_sizes': ('core.html#get_sizes', 'llms_txt/core.py'), + 'llms_txt.core.llms_txt2ctx': ('core.html#llms_txt2ctx', 'llms_txt/core.py'), + 'llms_txt.core.mk_ctx': ('core.html#mk_ctx', 'llms_txt/core.py'), + 'llms_txt.core.parse_llms_file': ('core.html#parse_llms_file', 'llms_txt/core.py')}}} diff --git a/llms_txt/core.py b/llms_txt/core.py new file mode 100644 index 0000000..ffcf2cb --- /dev/null +++ b/llms_txt/core.py @@ -0,0 +1,86 @@ +"""Helpers to create and use llms.txt files""" + +# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/00_core.ipynb. + +# %% auto 0 +__all__ = ['Sections', 'Project', 'parse_llms_file', 'Doc', 'Section', 'mk_ctx', 'get_sizes', 'llms_txt2ctx'] + +# %% ../nbs/00_core.ipynb +import re + +# %% ../nbs/00_core.ipynb +from fastcore.utils import * +from fastcore.xml import * +from fastcore.script import * +import httpx + +# %% ../nbs/00_core.ipynb +def _opt_re(s): return f'(?:{s})?' + +def _parse_llms_txt(txt): + pat = r"^#\s*(?P<title>[^\n]+)\n+" + pat += _opt_re(r"^>\s*(?P<summary>.+?)\n+") + pat += r"(?P<rest>.*)" + match = re.search(pat, txt, flags=(re.DOTALL | re.MULTILINE)) + return match.groupdict() if match else None + +# %% ../nbs/00_core.ipynb +def _split_on_h2(text): + parts = re.split(r'\n?## ', text) + details = parts[0].strip() if parts[0].strip() else None + sections = [f"## {p.strip()}" for p in parts[1:] if p.strip()] + return details, sections + +# %% ../nbs/00_core.ipynb +def _parse_section(section): + title = section.split('\n', 1)[0].strip('# ') + links = re.findall(r'\[(.+?)\]\((.+?)\)(?:: (.+?))?(?=\n|$)', section) + return title, [(t, u, d.strip() if d else None) for t, u, d in links] + +# %% ../nbs/00_core.ipynb +def parse_llms_file(txt): + parsed = _parse_llms_txt(txt) + if not parsed: return None + parsed['details'], sections = _split_on_h2(parsed['rest']) + parsed['sections'] = dict(_parse_section(s) for s in sections) + del parsed['rest'] + return dict2obj(parsed) + +# %% ../nbs/00_core.ipynb +Sections =
partial(ft, 'sections') +Project = partial(ft, 'project') + +# %% ../nbs/00_core.ipynb +def Doc(url, **kw): + "Create a `Doc` FT object with the text retrieved from `url` as the child, and `kw` as attrs." + re_comment = re.compile('^<!--.*-->$', flags=re.MULTILINE) + txt = [o for o in httpx.get(url).text.splitlines() if not re_comment.search(o)] + return ft('doc', '\n'.join(txt), **kw) + +# %% ../nbs/00_core.ipynb +def Section(nm, items): + "Create a `Section` FT object containing a `Doc` object for each child." + return ft(nm, *[Doc(title=title, url=url, detl=detl) for title,url,detl in items]) + +# %% ../nbs/00_core.ipynb +def mk_ctx(d, optional=True): + "Create a `Project` with a `Section` for each H2 part in `d`, optionally skipping the 'optional' section." + skip = '' if optional else 'Optional' + sections = [Section(k, v) for k,v in d.sections.items() if k!=skip] + return Project(title=d.title, summary=d.summary, details=d.details)(*sections) + +# %% ../nbs/00_core.ipynb +def get_sizes(ctx): + "Get the size of each section of the LLM context" + return {o.tag:{p.title:len(p.children[0]) for p in o.children} for o in ctx.children} + +# %% ../nbs/00_core.ipynb +@call_parse +def llms_txt2ctx( + fname:str, # File name to read + optional:bool_arg=True # Skip 'optional' section? +): + "Print a `Project` with a `Section` for each H2 part in file read from `fname`, optionally skipping the 'optional' section." 
+ d = parse_llms_file(Path(fname).read_text()) + ctx = mk_ctx(d, optional=optional) + print(to_xml(ctx, do_escape=False)) diff --git a/nbs/00_intro.ipynb b/nbs/00_intro.ipynb new file mode 100644 index 0000000..5009a6c --- /dev/null +++ b/nbs/00_intro.ipynb @@ -0,0 +1,125 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Python module & CLI\n", + "\n", + "> Read llms.txt files and create XML context documents for LLMs" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Given an `llms.txt` file, this provides a CLI and Python API to parse the file and create an XML context file from it. The input file should follow this format:\n", + "\n", + "```\n", + "# FastHTML\n", + "\n", + "> FastHTML is a python library which...\n", + "\n", + "## Docs\n", + "\n", + "- [Surreal](https://host/README.md): Tiny jQuery alternative with Locality of Behavior\n", + "- [FastHTML quick start](https://host/quickstart.html.md): An overview of FastHTML features\n", + "\n", + "## Examples\n", + "\n", + "- [Todo app](https://host/adv_app.py)\n", + "\n", + "## Optional\n", + "\n", + "- [Starlette docs](https://host/starlette-sml.md): A subset of the Starlette docs\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Install" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "```sh\n", + "pip install llms-txt\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## How to use" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To get help for the CLI:\n", + "\n", + "```sh\n", + "llms_txt2ctx -h\n", + "```\n", + "\n", + "To convert an `llms.txt` file to XML context and save to `llms.md`:\n", + "\n", + "```sh\n", + "llms_txt2ctx llms.txt > llms.md\n", + "```\n", + "\n", + "Pass `--optional False` to skip the 'optional' section of the input file." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.8" + }, + "toc": { + "base_numbering": 1, + "nav_menu": { + "height": "411.818px", + "width": "301.818px" + }, + "number_sections": true, + "sideBar": true, + "skip_h1_title": false, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": {}, + "toc_section_display": true, + "toc_window_display": false + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/nbs/01_core.ipynb b/nbs/01_core.ipynb new file mode 100644 index 0000000..f6ccc6e --- /dev/null +++ b/nbs/01_core.ipynb @@ -0,0 +1,509 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Python source\n", + "\n", + "> Helpers to create and use llms.txt files" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "#| default_exp core" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "#| export\n", + "import re" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "#| hide\n", + "from nbdev.showdoc import *\n", + "import nbdev; nbdev.nbdev_export()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "#| export\n", + "from fastcore.utils import *\n", + "from fastcore.xml import *\n", + "from fastcore.script import *\n", + "import httpx" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": 
[ + "## Introduction" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The llms.txt file spec is for files located in the path `llms.txt` of a website (or, optionally, in a subpath). `llms-sample.txt` is a simple example. A file following the spec contains the following sections as markdown, in the specific order:\n", + "\n", + "- An H1 with the name of the project or site. This is the only required section\n", + "- A blockquote with a short summary of the project, containing key information necessary for understanding the rest of the file\n", + "- Zero or more markdown sections (e.g. paragraphs, lists, etc) of any type, except headings, containing more detailed information about the project and how to interpret the provided files\n", + "- Zero or more markdown sections delimited by H2 headers, containing \"file lists\" of URLs where further detail is available\n", + " - Each \"file list\" is a markdown list, containing a required markdown hyperlink `[name](url)`, then optionally a `:` and notes about the file.\n", + "\n", + "Here's the start of a sample llms.txt file we'll use for testing:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "# FastHTML\n", + "\n", + "> FastHTML is a python library which brings together Starlette, Uvicorn, HTMX, and fastcore's `FT` \"FastTags\" into a library for creating server-rendered hypermedia applications.\n", + "\n", + "FastHTML is written by Answer.AI, an organization which follows the fast.ai style guide instead of PEP 8, so most examples follow fast.ai style.\n", + "\n", + "## Docs\n", + "\n", + "- [FastHTML quick start](https://docs.fastht.ml/tutorials/quickstart_for_web_devs.html.md): A brief overview of many FastHTML feature\n" + ] + } + ], + "source": [ + "samp = Path('llms-sample.txt').read_text()\n", + "print(samp[:480])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + 
"source": [ + "## Reading" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We'll implement `parse_llms_file` to pull out the sections of llms.txt into a simple data structure." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "%ai reset" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "#| export\n", + "def _opt_re(s): return f'(?:{s})?'\n", + "\n", + "def _parse_llms_txt(txt):\n", + " pat = r\"^#\\s*(?P<title>[^\\n]+)\\n+\"\n", + " pat += _opt_re(r\"^>\\s*(?P<summary>.+?)\\n+\")\n", + " pat += r\"(?P<rest>.*)\"\n", + " match = re.search(pat, txt, flags=(re.DOTALL | re.MULTILINE))\n", + " return match.groupdict() if match else None" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'FastHTML is a python library which brings together Starlette, Uvicorn, HTMX, and fastcore\\'s `FT` \"FastTags\" into a library for creating server-rendered hypermedia applications.'" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "match = _parse_llms_txt(samp)\n", + "match['summary']" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "#| export\n", + "def _split_on_h2(text):\n", + " parts = re.split(r'\\n?## ', text)\n", + " details = parts[0].strip() if parts[0].strip() else None\n", + " sections = [f\"## {p.strip()}\" for p in parts[1:] if p.strip()]\n", + " return details, sections" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'FastHTML is written by Answer.AI, an organization which follows the fast.ai style guide instead of PEP 8, so most examples follow fast.ai style.'" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": 
"execute_result" + } + ], + "source": [ + "rest = match['rest']\n", + "details,sections = _split_on_h2(rest)\n", + "details" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "#| export\n", + "def _parse_section(section):\n", + " title = section.split('\\n', 1)[0].strip('# ')\n", + " links = re.findall(r'\\[(.+?)\\]\\((.+?)\\)(?:: (.+?))?(?=\\n|$)', section)\n", + " return title, [(t, u, d.strip() if d else None) for t, u, d in links]" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[('Todo list application',\n", + " 'https://raw.githubusercontent.com/AnswerDotAI/fasthtml/main/examples/adv_app.py',\n", + " 'Detailed walk-thru of a complete CRUD app in FastHTML showing idiomatic use of FastHTML and HTMX patterns.')]" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sections_dict = dict(_parse_section(s) for s in sections)\n", + "sections_dict['Examples']" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "#| export\n", + "def parse_llms_file(txt):\n", + " parsed = _parse_llms_txt(txt)\n", + " if not parsed: return None\n", + " parsed['details'], sections = _split_on_h2(parsed['rest'])\n", + " parsed['sections'] = dict(_parse_section(s) for s in sections)\n", + " del parsed['rest']\n", + " return dict2obj(parsed)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'FastHTML is a python library which brings together Starlette, Uvicorn, HTMX, and fastcore\\'s `FT` \"FastTags\" into a library for creating server-rendered hypermedia applications.'" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "llmsd = parse_llms_file(samp)\n", + "llmsd.summary" + ] + }, + { + 
"cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(#1) [('Todo list application', 'https://raw.githubusercontent.com/AnswerDotAI/fasthtml/main/examples/adv_app.py', 'Detailed walk-thru of a complete CRUD app in FastHTML showing idiomatic use of FastHTML and HTMX patterns.')]" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "llmsd.sections.Examples" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## XML conversion" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For some LLMs such as Claude, XML format is preferred, so we'll provide a function to create that format." + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "#| export\n", + "Sections = partial(ft, 'sections')\n", + "Project = partial(ft, 'project')" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "#| export\n", + "def Doc(url, **kw):\n", + " \"Create a `Doc` FT object with the text retrieved from `url` as the child, and `kw` as attrs.\"\n", + " re_comment = re.compile('^<!--.*-->$', flags=re.MULTILINE)\n", + " txt = [o for o in httpx.get(url).text.splitlines() if not re_comment.search(o)]\n", + " return ft('doc', '\\n'.join(txt), **kw)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "#| export\n", + "def Section(nm, items):\n", + " \"Create a `Section` FT object containing a `Doc` object for each child.\"\n", + " return ft(nm, *[Doc(title=title, url=url, detl=detl) for title,url,detl in items])" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "#| export\n", + "def mk_ctx(d, optional=True):\n", + " \"Create a `Project` with a `Section` for each H2 part in `d`, optionally 
skipping the 'optional' section.\"\n", + " skip = '' if optional else 'Optional'\n", + " sections = [Section(k, v) for k,v in d.sections.items() if k!=skip]\n", + " return Project(title=d.title, summary=d.summary, details=d.details)(*sections)" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "<project title=\"FastHTML\" summary='FastHTML is a python library which brings together Starlette, Uvicorn, HTMX, and fastcore's `FT` \"FastTags\" into a library for creating server-rendered hypermedia applications.' details=\"FastHTML is written by Answer.AI, an organization which follows the fast.ai style guide instead of PEP 8, so most examples follow fast.ai style.\">\n", + " <docs>\n", + " <doc title=\"FastHTML quick start\" detl=\"A brief overview of many FastHTML features\"># Web Devs Quickstar\n" + ] + } + ], + "source": [ + "ctx = mk_ctx(llmsd)\n", + "print(to_xml(ctx, do_escape=False)[:490])" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "#| export\n", + "def get_sizes(ctx):\n", + " \"Get the size of each section of the LLM context\"\n", + " return {o.tag:{p.title:len(p.children[0]) for p in o.children} for o in ctx.children}" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'docs': {'FastHTML quick start': 21997,\n", + " 'HTMX reference': 26427,\n", + " 'Starlette quick guide': 7936},\n", + " 'examples': {'Todo list application': 18588},\n", + " 'optional': {'Starlette full documentation': 48331}}" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "get_sizes(ctx)" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "124335" + ] + }, + "execution_count": 31, + "metadata": {}, 
+ "output_type": "execute_result" + } + ], + "source": [ + "Path('fasthtml.md').write_text(to_xml(ctx, do_escape=False))" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [], + "source": [ + "#| export\n", + "@call_parse\n", + "def llms_txt2ctx(\n", + " fname:str, # File name to read\n", + " optional:bool_arg=True # Skip 'optional' section?\n", + "):\n", + " \"Print a `Project` with a `Section` for each H2 part in file read from `fname`, optionally skipping the 'optional' section.\"\n", + " d = parse_llms_file(Path(fname).read_text())\n", + " ctx = mk_ctx(d, optional=optional)\n", + " print(to_xml(ctx, do_escape=False))" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [], + "source": [ + "!llms_txt2ctx llms-sample.txt > fasthtml.md" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Export -" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [], + "source": [ + "#|hide\n", + "#|eval: false\n", + "from nbdev import nbdev_export\n", + "nbdev_export()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.8" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/nbs/_quarto.yml b/nbs/_quarto.yml new file mode 100644 index 0000000..6afb1ea --- /dev/null +++ b/nbs/_quarto.yml @@ -0,0 +1,23 @@ +project: + type: website + +format: + html: + code-overflow: wrap + theme: cosmo + css: styles.css + toc: true + keep-md: true + commonmark: default + +website: + twitter-card: true + 
open-graph: true + repo-actions: [issue] + navbar: + background: primary + search: true + sidebar: + style: floating + +metadata-files: [nbdev.yml, sidebar.yml] diff --git a/nbs/index.md b/nbs/index.md new file mode 100644 index 0000000..5156104 --- /dev/null +++ b/nbs/index.md @@ -0,0 +1,66 @@ +# The /llms.txt file + +## Background + +Today websites are not just used to provide information to people, but they are also used to provide information to large language models. This can be used, for instance, in order to provide information necessary for coders to use a library, or as part of research to learn about a person or organization and so forth. Providing information for language models is a little different to providing information for humans, although there is plenty of overlap. Language models generally like to have information in a more concise form which can be more similar to what a human expert would want to read rather than a complete beginner to a topic. Language models can ingest a lot of information quickly. So it can be helpful to have a single place where all of the key information can be collated. + +The llms.txt format can be used in various scenarios. For software libraries, it can provide a structured overview of documentation, making it easier for LLMs to locate specific features or usage examples. In corporate websites, it can outline organizational structure and key information sources. Information about new legislation and necessary background and context could be curated in an llms.txt file to help stakeholders understand it. + +llms.txt can be adapted for various domains. Personal portfolio or CV websites could use it to help answer questions about an individual. In e-commerce, it could outline product categories and policies. Educational institutions might use it to summarize course offerings and resources. + +At the moment the most widely and easily understood format for language models is Markdown. 
Simply showing where key Markdown files can be found is a great first step. Providing some basic structure helps a language model to find where the information it needs can come from. + +## Format + +The llms.txt file is unusual in that it uses Markdown to structure the information rather than a classic structured format such as XML. The reason for this is that we expect many of these files to be read by language models. Having said that, the information in llms.txt follows a specific format and can be read using standard programmatic-based tools. + +The llms.txt file spec is for files located in the root path `/llms.txt` of a website (or, optionally, in a subpath). A file following the spec contains the following sections as markdown, in the specific order: + +- An H1 with the name of the project or site. This is the only required section +- A blockquote with a short summary of the project, containing key information necessary for understanding the rest of the file +- Zero or more markdown sections (e.g. paragraphs, lists, etc) of any type, except headings, containing more detailed information about the project and how to interpret the provided files +- Zero or more markdown sections delimited by H2 headers, containing "file lists" of URLs where further detail is available + - Each "file list" is a markdown list, containing a required markdown hyperlink `[name](url)`, then optionally a `:` and notes about the file. + +## Existing standards + +llms.txt is designed to coexist with current web standards. While sitemaps list all pages for search engines, llms.txt offers a curated overview for LLMs. It can complement robots.txt by providing context for allowed content. The file can also reference structured data markup used on the site, helping LLMs understand how to interpret this information in context. + +The approach of standardising on a path for the file follows the approach of `/robots.txt` and `/sitemap.xml`. 
robots.txt and llms.txt have different purposes — llms.txt information would generally be explicitly requested by a human for a particular task, to have a language model help them use the information on a website. On the other hand, robots.txt is generally used to let automated tools know what access to a site is considered acceptable. + +sitemap.xml is a list of all the indexable human-readable information available on a site. This isn’t a substitute for llms.txt since it: + +- Often won’t have the LLM-readable versions of pages listed +- Doesn’t include URLs to external sites, even though they might be helpful to understand the information +- Will generally cover documents that in aggregate will be too large to fit in an LLM context window, and will include a lot of information that isn’t necessary to understand the site. + +## Example + +Here’s an example of llms.txt, in this case a cut down version of the file used for the FastHTML project: + +```markdown +# FastHTML + +> FastHTML is a python library which brings together Starlette, Uvicorn, HTMX, and fastcore's `FT` "FastTags" into a library for creating server-rendered hypermedia applications. + +FastHTML is written by Answer.AI, an organization which follows the fast.ai style guide instead of PEP 8, so most examples follow fast.ai style. + +## Docs + +- [FastHTML quick start](https://docs.fastht.ml/tutorials/quickstart_for_web_devs.html.md): A brief overview of many FastHTML features +- [HTMX reference](https://raw.githubusercontent.com/bigskysoftware/reference.md): Brief description of all HTMX attributes, CSS classes, headers, events, extensions, js lib methods, and config options + +## Examples + +- [Todo list application](https://raw.githubusercontent.com/AnswerDotAI/fasthtml/adv_app.py): Detailed walk-thru of a complete CRUD app in FastHTML showing idiomatic use of FastHTML and HTMX patterns. 
+ +## Optional + +- [Starlette full documentation](https://gist.githubusercontent.com/jph00/starlette-sml.md): A subset of the Starlette documentation useful for FastHTML development. +``` + +To create effective llms.txt files, consider these guidelines: Use concise, clear language. When linking to resources, include brief, informative descriptions. Avoid ambiguous terms or unexplained jargon. Run a tool that expands your llms.txt file into an LLM context file and test a number of language models to see if they can answer questions about your content. + +## Next steps + +The llms.txt specification is open for community input. A GitHub repository hosts this informal overview, allowing for version control and public discussion. A community discord channel is available for sharing implementation experiences and discussing best practices. diff --git a/nbs/llms-sample.txt b/nbs/llms-sample.txt new file mode 100644 index 0000000..349a7dc --- /dev/null +++ b/nbs/llms-sample.txt @@ -0,0 +1,20 @@ +# FastHTML + +> FastHTML is a python library which brings together Starlette, Uvicorn, HTMX, and fastcore's `FT` "FastTags" into a library for creating server-rendered hypermedia applications. + +FastHTML is written by Answer.AI, an organization which follows the fast.ai style guide instead of PEP 8, so most examples follow fast.ai style. 
+ +## Docs + +- [FastHTML quick start](https://docs.fastht.ml/tutorials/quickstart_for_web_devs.html.md): A brief overview of many FastHTML features +- [HTMX reference](https://raw.githubusercontent.com/bigskysoftware/htmx/master/www/content/reference.md): Brief description of all HTMX attributes, CSS classes, headers, events, extensions, js lib methods, and config options +- [Starlette quick guide](https://gist.githubusercontent.com/jph00/e91192e9bdc1640f5421ce3c904f2efb/raw/61a2774912414029edaf1a55b506f0e283b93c46/starlette-quick.md) + +## Examples + +- [Todo list application](https://raw.githubusercontent.com/AnswerDotAI/fasthtml/main/examples/adv_app.py): Detailed walk-thru of a complete CRUD app in FastHTML showing idiomatic use of FastHTML and HTMX patterns. + +## Optional + +- [Starlette full documentation](https://gist.githubusercontent.com/jph00/809e4a4808d4510be0e3dc9565e9cbd3/raw/9b717589ca44cedc8aaf00b2b8cacef922964c0f/starlette-sml.md): A subset of the Starlette documentation useful for FastHTML development. 
+ diff --git a/nbs/nbdev.yml b/nbs/nbdev.yml new file mode 100644 index 0000000..ea4120d --- /dev/null +++ b/nbs/nbdev.yml @@ -0,0 +1,9 @@ +project: + output-dir: _docs + +website: + title: "llms-txt" + site-url: "https://AnswerDotAI.github.io/llms-txt" + description: "The /llms.txt file, helping language models use your website" + repo-branch: main + repo-url: "https://github.com/AnswerDotAI/llms-txt" diff --git a/nbs/styles.css b/nbs/styles.css new file mode 100644 index 0000000..66ccc49 --- /dev/null +++ b/nbs/styles.css @@ -0,0 +1,37 @@ +.cell { + margin-bottom: 1rem; +} + +.cell > .sourceCode { + margin-bottom: 0; +} + +.cell-output > pre { + margin-bottom: 0; +} + +.cell-output > pre, .cell-output > .sourceCode > pre, .cell-output-stdout > pre { + margin-left: 0.8rem; + margin-top: 0; + background: none; + border-left: 2px solid lightsalmon; + border-top-left-radius: 0; + border-top-right-radius: 0; +} + +.cell-output > .sourceCode { + border: none; +} + +.cell-output > .sourceCode { + background: none; + margin-top: 0; +} + +div.description { + padding-left: 2px; + padding-top: 5px; + font-style: italic; + font-size: 135%; + opacity: 70%; +} diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..f2c07bf --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,3 @@ +[build-system] +requires = ["setuptools>=64.0"] +build-backend = "setuptools.build_meta" diff --git a/settings.ini b/settings.ini new file mode 100644 index 0000000..9e46a50 --- /dev/null +++ b/settings.ini @@ -0,0 +1,47 @@ +[DEFAULT] +# All sections below are required unless otherwise specified. +# See https://github.com/fastai/nbdev/blob/master/settings.ini for examples. 
"""Packaging script: builds the `setuptools.setup()` call from the `[DEFAULT]` section of settings.ini."""
from pkg_resources import parse_version
from configparser import ConfigParser
import setuptools, shlex
assert parse_version(setuptools.__version__)>=parse_version('36.2')

# note: all settings are in settings.ini; edit there, not here
config = ConfigParser(delimiters=['='])
config.read('settings.ini', encoding='utf-8')
cfg = config['DEFAULT']

# `cfg_keys` are forwarded verbatim to `setuptools.setup` via `setup_cfg`;
# `expected` additionally lists the keys this script reads directly below.
cfg_keys = 'version description keywords author author_email'.split()
expected = cfg_keys + "lib_name user branch license status min_python audience language".split()
for o in expected: assert o in cfg, f"missing expected setting: {o}"
setup_cfg = {o:cfg[o] for o in cfg_keys}

# Maps the `license` setting to (license name, trove classifier suffix).
licenses = {
    'apache2': ('Apache Software License 2.0','OSI Approved :: Apache Software License'),
    'mit': ('MIT License', 'OSI Approved :: MIT License'),
    'gpl2': ('GNU General Public License v2', 'OSI Approved :: GNU General Public License v2 (GPLv2)'),
    'gpl3': ('GNU General Public License v3', 'OSI Approved :: GNU General Public License v3 (GPLv3)'),
    'bsd3': ('BSD License', 'OSI Approved :: BSD License'),
}
# NOTE(review): this list is indexed with int(cfg['status']) (0-based) while the labels are
# 1-based, so `status = 3` yields '4 - Beta' and `status = 7` raises IndexError -- confirm
# whether that mapping is intended before changing it.
statuses = [ '1 - Planning', '2 - Pre-Alpha', '3 - Alpha',
    '4 - Beta', '5 - Production/Stable', '6 - Mature', '7 - Inactive' ]
# `min_python` must be one of these entries; `.index` below raises ValueError otherwise.
py_versions = '3.6 3.7 3.8 3.9 3.10 3.11 3.12'.split()

requirements = shlex.split(cfg.get('requirements', ''))
if cfg.get('pip_requirements'): requirements += shlex.split(cfg.get('pip_requirements', ''))
min_python = cfg['min_python']
# Unknown license keys fall back to the raw setting with no license classifier.
lic = licenses.get(cfg['license'].lower(), (cfg['license'], None))
dev_requirements = (cfg.get('dev_requirements') or '').split()

package_data = dict()
pkg_data = cfg.get('package_data', None)
if pkg_data:
    package_data[cfg['lib_name']] = pkg_data.split() # split as multiple files might be listed
# Add package data to setup_cfg for setuptools.setup(..., **setup_cfg)
setup_cfg['package_data'] = package_data

# Read the README through a context manager so the file handle is closed deterministically.
with open('README.md', encoding='utf-8') as f: long_description = f.read()

setuptools.setup(
    name = cfg['lib_name'],
    license = lic[0],
    classifiers = [
        'Development Status :: ' + statuses[int(cfg['status'])],
        'Intended Audience :: ' + cfg['audience'].title(),
        'Natural Language :: ' + cfg['language'].title(),
    ] + ['Programming Language :: Python :: '+o for o in py_versions[py_versions.index(min_python):]] + (['License :: ' + lic[1] ] if lic[1] else []),
    url = cfg['git_url'],
    packages = setuptools.find_packages(),
    include_package_data = True,
    install_requires = requirements,
    extras_require={ 'dev': dev_requirements },
    dependency_links = cfg.get('dep_links','').split(),
    python_requires = '>=' + cfg['min_python'],
    long_description = long_description,
    long_description_content_type = 'text/markdown',
    zip_safe = False,
    entry_points = {
        'console_scripts': cfg.get('console_scripts','').split(),
        'nbdev': [f'{cfg.get("lib_path")}={cfg.get("lib_path")}._modidx:d']
    },
    **setup_cfg)