feat(Helm): Add JSON schema for values.yaml validation #3588

Draft · wants to merge 45 commits into base: main

Commits (45):
- a8e6eeb Add first schema (fhennig, Jan 29, 2025)
- 5889b5d Add stubs for Schema docs generation (fhennig, Jan 29, 2025)
- 6d548e6 un-split schema (fhennig, Feb 5, 2025)
- 3f21fcc Add github linting job (fhennig, Feb 5, 2025)
- ed44d4f Make version minimum 1 (fhennig, Feb 5, 2025)
- 661e22d Add grouping and reading of groups in the schema docs generation (fhennig, Feb 5, 2025)
- 7b16d85 Add table (fhennig, Feb 5, 2025)
- bb61f4c ... (fhennig, Feb 5, 2025)
- 26cec1e Make recursive looking up of props possible (fhennig, Feb 5, 2025)
- 7517049 general settings complete (fhennig, Feb 7, 2025)
- 2956f95 Do the website config section (fhennig, Feb 5, 2025)
- bc998eb converted auth settings (fhennig, Feb 5, 2025)
- 45928a3 allow lists of types (fhennig, Feb 5, 2025)
- 67b9048 ... (fhennig, Feb 5, 2025)
- 3822aa3 services (fhennig, Feb 5, 2025)
- f29cb6e lineage system defs (fhennig, Feb 5, 2025)
- 7cd373b Add some docs (fhennig, Feb 5, 2025)
- 47175fb no prefix (fhennig, Feb 5, 2025)
- f1eacce progress on the schema type (fhennig, Feb 5, 2025)
- 07d1c73 Add more schema stuff (fhennig, Feb 5, 2025)
- e7e0ca6 added more (fhennig, Feb 5, 2025)
- 4f6bd7b metadata docs (fhennig, Feb 6, 2025)
- f136c8c Progress (fhennig, Feb 6, 2025)
- da1b82c progress (fhennig, Feb 6, 2025)
- ee8fd96 migrated all tables to the config (fhennig, Feb 6, 2025)
- 2802555 improve enum handling (fhennig, Feb 6, 2025)
- 155dcba Add markdown rendering support (fhennig, Feb 6, 2025)
- 32d4d53 Fix some links (fhennig, Feb 6, 2025)
- b4c8350 Add docs (fhennig, Feb 6, 2025)
- c34da61 fix (fhennig, Feb 6, 2025)
- 5da8f9a additionalProperties: false - for some objects already (fhennig, Feb 6, 2025)
- aedf2f7 Also read additional definitions (fhennig, Feb 6, 2025)
- 8f4d4f7 Add undocumented props (fhennig, Feb 7, 2025)
- 509062c some minor refinements (fhennig, Feb 7, 2025)
- da7282d Add resources schema (fhennig, Feb 7, 2025)
- c15eba0 more progress (fhennig, Feb 7, 2025)
- 708391d more progress (fhennig, Feb 7, 2025)
- 2ebeff4 symlink schema (fhennig, Feb 7, 2025)
- f04f25a change how schema file is handled in the docs (fhennig, Feb 7, 2025)
- 1573ae6 Update schema (fhennig, Feb 7, 2025)
- 65c01e6 add 'branch' and 'sha' (fhennig, Feb 7, 2025)
- e234cfb Some cleanup and documenting (fhennig, Feb 7, 2025)
- 214642d remove outdated comment (fhennig, Feb 7, 2025)
- 3a44ab3 progress (fhennig, Feb 7, 2025)
- 83dc34b ArgoCD fix attempt (fhennig, Feb 7, 2025)
4 changes: 4 additions & 0 deletions .github/workflows/docs-deploy.yml
@@ -22,6 +22,10 @@ jobs:
- name: Navigate to docs directory
run: cd docs

- name: Copy Kubernetes schema to docs
run: npm run copy-schema
working-directory: ./docs

- name: Install Dependencies
run: npm install
working-directory: ./docs
5 changes: 4 additions & 1 deletion .github/workflows/docs-image.yml
@@ -42,7 +42,7 @@ jobs:
- name: Generate files hash
id: files-hash
run: |
-DIR_HASH=$(echo -n ${{ hashFiles('docs/**', '.github/workflows/docs-image.yml') }})
+DIR_HASH=$(echo -n ${{ hashFiles('docs/**', '.github/workflows/docs-image.yml', 'kubernetes/loculus/values.schema.json') }})
echo "DIR_HASH=$DIR_HASH${{ env.BUILD_ARM == 'true' && '-arm' || '' }}" >> $GITHUB_ENV
- name: Setup Docker metadata
id: dockerMetadata
@@ -73,6 +73,9 @@ jobs:
run: |
NODE_VERSION=$(cat docs/.nvmrc | tr -cd [:digit:].)
echo "NODE_VERSION=$NODE_VERSION" >> $GITHUB_ENV
- name: Copy Kubernetes schema to docs
run: npm run copy-schema
working-directory: ./docs
- name: Build and push image
if: env.CACHE_HIT == 'false'
uses: docker/build-push-action@v6
3 changes: 3 additions & 0 deletions .github/workflows/docs-test.yml
@@ -22,6 +22,9 @@ jobs:
uses: actions/setup-node@v4
with:
node-version-file: ./docs/.nvmrc
- name: Copy Kubernetes schema to docs
run: npm run copy-schema
working-directory: ./docs
- name: Install Dependencies
run: npm install
working-directory: ./docs
29 changes: 29 additions & 0 deletions .github/workflows/helm-schema-lint.yaml
@@ -0,0 +1,29 @@
name: helm-schema-lint

on:
pull_request:
paths:
- "kubernetes/**"
- ".github/workflows/helm-schema-lint.yaml"
push:
branches:
- main
workflow_dispatch:

jobs:
lint:
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@v4

- name: Install Helm
uses: azure/setup-helm@v4
with:
version: latest

- name: Run Helm lint on values files
run: |
helm lint kubernetes/loculus -f kubernetes/loculus/values.yaml
helm lint kubernetes/loculus -f kubernetes/loculus/values_e2e_and_dev.yaml
helm lint kubernetes/loculus -f kubernetes/loculus/values_preview_server.yaml
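When a `values.schema.json` file is present at the chart root, Helm validates the merged values against it during `helm lint`, `helm install`, and `helm upgrade`, so this job fails a PR whose values files violate the schema. As a minimal sketch of what such a schema looks like (illustrative property names, not the chart's actual schema):

```json
{
  "$schema": "http://json-schema.org/draft-07/schema#",
  "$comment": "Illustrative sketch, not the chart's actual schema.",
  "type": "object",
  "properties": {
    "environment": {
      "type": "string",
      "enum": ["local", "server"],
      "description": "Deployment environment."
    },
    "replicas": {
      "type": "integer",
      "minimum": 1,
      "description": "Number of website replicas."
    }
  }
}
```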
2 changes: 2 additions & 0 deletions docs/.gitignore
@@ -12,6 +12,8 @@ yarn-debug.log*
yarn-error.log*
pnpm-debug.log*

# this file is only copied over from /kubernetes/loculus
src/values.schema.json

# environment variables
.env
16 changes: 16 additions & 0 deletions docs/package-lock.json

Some generated files are not rendered by default.

8 changes: 6 additions & 2 deletions docs/package.json
@@ -3,8 +3,9 @@
"type": "module",
"version": "0.0.1",
"scripts": {
"dev": "astro dev",
"start": "astro dev",
"copy-schema": "cp ../kubernetes/loculus/values.schema.json src/",
"dev": "npm run copy-schema && astro dev",
"start": "npm run copy-schema && astro dev",
"build": "astro check && astro build",
"preview": "astro preview",
"astro": "astro",
@@ -22,5 +23,8 @@
"sharp": "^0.33.5",
"tailwindcss": "^3.4.16",
"typescript": "^5.7.2"
},
"devDependencies": {
"@types/json-schema": "^7.0.15"
}
}
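With these scripts, the schema copy runs automatically before local development, while CI (as in the workflows above) invokes `copy-schema` explicitly before building, since the `build` script itself does not copy. A typical local flow might look like this, starting from the repository root:

```bash
cd docs
npm install
npm run dev   # runs copy-schema first, then starts the astro dev server
```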
18 changes: 18 additions & 0 deletions docs/src/components/MarkdownRenderer.astro
@@ -0,0 +1,18 @@
---
/**
* Pass in a Markdown formatted string as 'content'.
*/
import { unified } from 'unified';
import remarkParse from 'remark-parse';
import remarkRehype from 'remark-rehype';
import rehypeStringify from 'rehype-stringify';

const { content } = Astro.props;

const html = String(await unified()
.use(remarkParse)
.use(remarkRehype)
.use(rehypeStringify)
.process(content));
---
<div set:html={html}></div>
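For illustration, a hypothetical caller that passes a Markdown string (the `description` value here is made up):

```astro
---
import MarkdownRenderer from '../components/MarkdownRenderer.astro';

// Illustrative only: any Markdown string works as 'content'.
const description = 'See the **docs** at [loculus.org](https://loculus.org).';
---
<MarkdownRenderer content={description} />
```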
142 changes: 142 additions & 0 deletions docs/src/components/SchemaDocs.astro
@@ -0,0 +1,142 @@
---
/**
* Render parts of the JSON schema into a table, given a `group` to render.
* Properties in the schema can be assigned to groups by adding a new "groups": ["group1", "group2"] key to them.
* This way, you can split schema definition into semantic groups.
*/
import { type JSONSchema7, type JSONSchema7Definition, type JSONSchema7TypeName, type JSONSchema7Type } from 'json-schema';
import MarkdownRenderer from './MarkdownRenderer.astro';
import rawSchema from '../values.schema.json';

// The 'as any' isn't pretty, but it is needed because our schema doesn't fully conform to the type.
// It's still nice to use the type to get at least some hinting during development.
let schema: JSONSchema7 = rawSchema as any;

const { group, fieldColumnClass } = Astro.props;

/** Example: 'boolean' -> 'Boolean'. */
function capitalizeFirst(str: string) {
return str.charAt(0).toUpperCase() + str.slice(1);
}

/**
* A useful representation of a schema type.
* Plain types are uppercased. Lists of types are joined.
* If type is 'string' but there are also enum vals, the enum vals are returned instead.
*/
function typeToString(
type: JSONSchema7TypeName | JSONSchema7TypeName[] | undefined,
enumvals: JSONSchema7Type[] | undefined
) {
if (type === undefined) return "";
if (Array.isArray(type)) {
return type.map(t => capitalizeFirst(String(t))).join(", ")
}
if (type === "string" && enumvals !== undefined) {
return enumvals?.map(enumval => String(enumval)).join(", ")
}
return capitalizeFirst(String(type));
}

/** A row in the table. */
interface Row {
key: string,
type?: string,
default?: string,
description?: string
}

const rows: Row[] = [];

/**
* Recursive function to traverse the schema properties and extract rows for the table.
* @param prefix Accumulates the tree path during recursion; start off with "".
* @param key The key of the property currently observed.
* @param definition The definition of the property currently observed.
*/
function addSelfAndChildren(prefix: string, key: string, definition: JSONSchema7Definition) {
if (
typeof definition === 'object' &&
definition !== null
) {
if ('placeholder' in definition) {
key = `<${definition.placeholder}>`;
}
if ('docsIncludePrefix' in definition && definition.docsIncludePrefix === false) {
prefix = "";
}
if (
'groups' in definition &&
Array.isArray(definition.groups) &&
definition.groups.includes(group)
) {
let def = definition.default !== undefined ? String(definition.default) : "";
if (definition.type === "string" && def !== "") {
def = `"${def}"`
}
rows.push({
key: `${prefix}${key}`,
type: typeToString(definition.type, definition.enum),
default: def,
description: definition.description
})
}
if ('properties' in definition && definition.properties) {
Object.entries(definition.properties).forEach(([k, d]) => addSelfAndChildren(`${prefix}${key}.`, k, d));
}
if ('patternProperties' in definition && definition.patternProperties) {
Object.entries(definition.patternProperties).forEach(([k, d]) => addSelfAndChildren(`${prefix}${key}.`, k, d));
}
if ('items' in definition && definition.items !== undefined && typeof definition.items === 'object') {
const items = definition.items;
if ('length' in items) { // 'items' may be an array of schemas; skip that case
return;
}
addSelfAndChildren(`${prefix}${key}.`, "[]", items)
}
}
}

if (schema.definitions) {
Object.entries(schema.definitions).forEach(([_, definition]) => {
if (typeof definition === 'object' && definition.properties) {
Object.entries(definition.properties).forEach(([key, definition]) => {
addSelfAndChildren("", key, definition);
});
}
});
}

// Start off recursing with the top-level properties in the schema.
if (schema.properties) {
Object.entries(schema.properties).forEach(([key, definition]) => {
addSelfAndChildren("", key, definition);
});
}

---

<div class='overflow-x-scroll'>
<table class='min-w-[700px]'>
<thead>
<tr>
<th class={fieldColumnClass}>Field</th>
<th>Type</th>
<th>Default</th>
<th>Description</th>
</tr>
</thead>
<tbody>
{
rows.map(row => (
<tr>
<td><code>{row.key}</code></td>
<td>{row.type}</td>
<td>{row.default}</td>
<td><MarkdownRenderer content={row.description} /></td>
</tr>
))
}
</tbody>
</table>
</div>
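To make the grouping mechanism concrete, here is a hypothetical `values.schema.json` fragment using the custom `groups` annotation, together with a docs page rendering that group (both illustrative, not taken from the chart's actual schema):

```json
{
  "$comment": "Illustrative fragment only.",
  "type": "object",
  "properties": {
    "name": {
      "type": "string",
      "default": "Loculus",
      "description": "Name of the instance, shown in the website header.",
      "groups": ["general"]
    }
  }
}
```

```astro
---
import SchemaDocs from '../components/SchemaDocs.astro';
---
<SchemaDocs group="general" />
```

The component then emits one table row per property tagged with the requested group, using `typeToString` for the Type column and the Markdown renderer for the Description column.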
@@ -28,12 +28,12 @@ Additionally the pipeline performs checks on the metadata fields. The checks are

In the default configuration the pipeline performs:

- **type checks**: Checks that the type of each metadata field matches the expected `type` value given in the config (default: string).
- **required value checks**: Checks that if a field is required (i.e. the `required` field in the config is true), the field is not None.
- **INSDC-accepted country checks**: Using the `process_options` preprocessing function, checks that the `geoLocCountry` field is set to an [INSDC-accepted country](https://www.ebi.ac.uk/ena/browser/api/xml/ERC000011) option.

The pipeline also formats metadata fields:

- **parse timestamp**: Takes an ISO timestamp, e.g. `2022-11-01T00:00:00Z`, and returns that field in `%Y-%m-%d` format.

The code is available on [GitHub](https://github.com/loculus-project/loculus/tree/main/preprocessing/nextclade) under the [AGPL-3.0 license](https://github.com/loculus-project/loculus/blob/main/LICENSE).
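As a sketch of what the timestamp reformatting amounts to (illustrative only; the pipeline itself is a separate codebase):

```ts
// Illustrative only: reduce an ISO timestamp to %Y-%m-%d.
function toDateOnly(isoTimestamp: string): string {
  return new Date(isoTimestamp).toISOString().slice(0, 10);
}

toDateOnly('2022-11-01T00:00:00Z'); // => '2022-11-01'
```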
14 changes: 7 additions & 7 deletions docs/src/content/docs/for-administrators/getting-started.md
@@ -33,12 +33,12 @@ We do not have a guide to deploy Loculus with Docker Compose at the moment but y

You can compile and run Loculus from source code if you do not want to use Docker. We do not have a dedicated guide for this at the moment and recommend reading the [Docker Compose example](#with-docker-compose) to understand how the sub-services should be connected and the (developer) documentation of the individual services for getting them running:

- [Loculus backend](https://github.com/loculus-project/loculus/tree/main/backend)
- [Loculus website](https://github.com/loculus-project/loculus/tree/main/website)
- [PostgreSQL](https://www.postgresql.org/docs/)
- [Keycloak](https://www.keycloak.org/guides)
- [SILO](https://github.com/GenSpectrum/LAPIS-SILO)
- [LAPIS](https://github.com/GenSpectrum/LAPIS)
- Use the [Nextclade preprocessing pipeline](https://github.com/loculus-project/loculus/tree/main/preprocessing/nextclade) or follow the [preprocessing pipeline specifications](https://github.com/loculus-project/loculus/blob/main/preprocessing/specification.md) to build your own custom pipeline

Please let us know if you are interested in using Loculus without Docker or Kubernetes! Your feedback will motivate us to create a guide. You are of course also very welcome to contribute to the documentation if you have successfully deployed a Loculus instance and have written down the steps.
2 changes: 0 additions & 2 deletions docs/src/content/docs/for-administrators/my-first-loculus.md
@@ -163,7 +163,6 @@ createTestAccounts: true
```
<!-- prettier-ignore-end -->


Because we have enabled the `createTestAccounts` option, we need to delete the existing keycloak database to ensure that the test users are added.

First we need to run `kubectl get pods` to get the name of the keycloak pod, which will be something like `loculus-keycloak-database-665b964c6b-gm9t5` (but with the random string at the end being different).
@@ -182,7 +181,6 @@ Now we can upgrade the Loculus installation again:
helm upgrade loculus ./kubernetes/loculus --set environment=local --set branch=latest --set disableIngest=true --set disableEnaSubmission=true -f custom_values.yaml
```


### Testing it out with some data

While that's getting ready, let's create some data to submit.
20 changes: 10 additions & 10 deletions docs/src/content/docs/for-administrators/schema-designs.md
@@ -5,10 +5,10 @@ description: Different ways to design the schema of a Loculus instance

Loculus is very flexible in its data model and there are different ways to design the [schema](../../introduction/glossary#schema). Technically, a Loculus instance can have one or multiple organisms and each organism has

- a set of metadata fields
- a set of unaligned nucleotide sequences
- a set of aligned nucleotide sequences
- a set of aligned amino acid sequences

The different nucleotide sequences are called segments and the different amino acid sequences are called genes but they do not need to be biological segments and genes. If there is only one nucleotide sequence, it may but does not need to have a name. If there are multiple nucleotide sequences, they must be named. The amino acid sequences must always be named.

@@ -24,19 +24,19 @@ This is the typical model for Loculus. The Loculus instance contains one or more

This is a good model if:

- Each sample (taken from the host) only has one (possibly multi-segmented) sequence.
- For each organism, it is clear which reference genome to use.
- Users are expected to analyze the organisms independently (e.g., users don’t desire a table containing sequences from different organisms).

### One organism for everything

On the opposite end of the spectrum, it is possible to only have one “technical organism” in Loculus to store all the data. There are multiple “technical segments”, with each segment storing the sequence of a different “actual organism”. Users submit a multi-segment file (e.g., a FASTA containing `>sample1_covid`, `>sample1_rsv-a`, ...).

This is a good model if:

- Samples are sequenced with a multi-pathogen panel and may contain sequences from one or multiple pathogens (i.e., co-infections).
- Sequences of different organisms share the same (sampling and host) metadata.
- Users want to see co-infection data (e.g., a sequence details page listing all sequences from the sample).
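
For illustration, a submission FASTA in this model might look like this (hypothetical sample ID and segment names, following the example above):

```
>sample1_covid
ACGTACGT...
>sample1_rsv-a
ACGTACGT...
```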

### Multiple references for an organism
