diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..98ec895 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,2 @@ +**/renv +**/.Rprofile diff --git a/.env.sample b/.env.sample new file mode 100644 index 0000000..3caeb91 --- /dev/null +++ b/.env.sample @@ -0,0 +1,12 @@ +ENV=test +DATA_VOLUME_PATH=./data/test_data + +# For preprocessing +PREPROCESS_DB_NAME= # name of the source database +PREPROCESS_HOST= # host address for the source database +PREPROCESS_PORT= # port on which to connect to the source database +PREPROCESS_DB_USERNAME= # username for the source database +PREPROCESS_DB_PASSWORD= # password for the source database + +# For testing +TEST_DB_PATH=./data-raw/test_db/eunomia diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index 139ccee..dd8a38b 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -6,7 +6,7 @@ on: pull_request: branches: [main, master] -name: R-CMD-check.yaml +name: R-CMD-check permissions: read-all @@ -18,9 +18,14 @@ concurrency: jobs: R-CMD-check: runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + package_dir: [app, preprocessing] env: GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} R_KEEP_PKG_SOURCE: yes + steps: - uses: actions/checkout@v4 @@ -29,12 +34,16 @@ jobs: use-public-rspm: true - uses: r-lib/actions/setup-renv@v2 + with: + working-directory: ${{ matrix.package_dir }} - name: Install rcmdcheck run: install.packages("rcmdcheck") shell: Rscript {0} + working-directory: ${{ matrix.package_dir }} - uses: r-lib/actions/check-r-package@v2 with: upload-snapshots: true build_args: 'c("--no-manual","--compact-vignettes=gs+qpdf")' + working-directory: ${{ matrix.package_dir }} diff --git a/.github/workflows/docker-build.yml b/.github/workflows/docker-build.yml index 871de19..932b5f5 100644 --- a/.github/workflows/docker-build.yml +++ b/.github/workflows/docker-build.yml @@ -3,20 +3,28 @@ name: Build Docker Image on: push: branches: ["main"] - paths: ["deploy/**"] + paths: ["docker-compose.yml", "app/Dockerfile", "preprocessing/Dockerfile"] pull_request: branches: ["main"] - paths: ["deploy/**"] + paths: ["docker-compose.yml", "app/Dockerfile", "preprocessing/Dockerfile"] workflow_dispatch: +# Only run actions on the most recent push to a branch +concurrency: + group: "${{ github.workflow }}-${{ github.head_ref }}" + cancel-in-progress: true + jobs: build: runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + image: [omopcat, preprocess] steps: - uses: actions/checkout@v4 - - name: Add dummy .Renviron file - run: touch deploy/.Renviron - name: Build the Docker image - run: docker compose build - working-directory: deploy/ + run: | + cp .env.sample .env + docker compose build ${{ matrix.image }} diff --git a/.github/workflows/lint.yaml b/.github/workflows/lint.yaml index 7d9ce47..6a54b1a 100644 --- a/.github/workflows/lint.yaml +++ b/.github/workflows/lint.yaml @@ -6,7 +6,7 @@ on: pull_request: branches: [main, master] -name: lint.yaml +name: lint permissions: read-all @@ -18,6 +18,10 @@ concurrency: jobs: lint: runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + package_dir: [app, preprocessing] env: GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} steps: @@ -28,15 +32,19 @@ jobs: use-public-rspm: true - uses: r-lib/actions/setup-renv@v2 + with: + working-directory: ${{ matrix.package_dir }} - name: Install lintr run: install.packages("lintr") shell: Rscript {0} + working-directory: ${{ matrix.package_dir }} - name: Lint run: | 
pkgload::load_all() lintr::lint_package() shell: Rscript {0} + working-directory: ${{ matrix.package_dir }} env: LINTR_ERROR_ON_LINT: true diff --git a/.github/workflows/pr-commands.yaml b/.github/workflows/pr-commands.yaml deleted file mode 100644 index 1f5b8bb..0000000 --- a/.github/workflows/pr-commands.yaml +++ /dev/null @@ -1,91 +0,0 @@ -# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples -# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help -on: - issue_comment: - types: [created] - -name: pr-commands.yaml - -permissions: read-all - -# Only run actions on the most recent push to a branch -concurrency: - group: "${{ github.workflow }}-${{ github.head_ref }}" - cancel-in-progress: true - -jobs: - document: - if: ${{ github.event.issue.pull_request && (github.event.comment.author_association == 'MEMBER' || github.event.comment.author_association == 'OWNER') && startsWith(github.event.comment.body, '/document') }} - name: document - runs-on: ubuntu-latest - env: - GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} - permissions: - contents: write - steps: - - uses: actions/checkout@v4 - - - uses: r-lib/actions/pr-fetch@v2 - with: - repo-token: ${{ secrets.GITHUB_TOKEN }} - - - uses: r-lib/actions/setup-r@v2 - with: - use-public-rspm: true - - - uses: r-lib/actions/setup-renv@v2 - - - name: Install roxygen2 - run: install.packages("roxygen2") - shell: Rscript {0} - - - name: Document - run: roxygen2::roxygenise() - shell: Rscript {0} - - - name: commit - run: | - git config --local user.name "$GITHUB_ACTOR" - git config --local user.email "$GITHUB_ACTOR@users.noreply.github.com" - git add man/\* NAMESPACE - git commit -m 'Document' - - - uses: r-lib/actions/pr-push@v2 - with: - repo-token: ${{ secrets.GITHUB_TOKEN }} - - style: - if: ${{ github.event.issue.pull_request && (github.event.comment.author_association == 'MEMBER' || github.event.comment.author_association == 'OWNER') && startsWith(github.event.comment.body, '/style') }} - name: style - runs-on: ubuntu-latest - env: - GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} - permissions: - contents: write - steps: - - uses: actions/checkout@v4 - - - uses: r-lib/actions/pr-fetch@v2 - with: - repo-token: ${{ secrets.GITHUB_TOKEN }} - - - uses: r-lib/actions/setup-r@v2 - - - name: Install dependencies - run: install.packages("styler") - shell: Rscript {0} - - - name: Style - run: styler::style_pkg() - shell: Rscript {0} - - - name: commit - run: | - git config --local user.name "$GITHUB_ACTOR" - git config --local user.email "$GITHUB_ACTOR@users.noreply.github.com" - git add \*.R - git commit -m 'Style' - - - uses: r-lib/actions/pr-push@v2 - with: - repo-token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/test-coverage.yaml b/.github/workflows/test-coverage.yaml index 7f7abaa..2fda2dd 100644 --- a/.github/workflows/test-coverage.yaml +++ b/.github/workflows/test-coverage.yaml @@ -6,13 +6,17 @@ on: pull_request: branches: [main, master] -name: test-coverage.yaml +name: test-coverage permissions: read-all jobs: test-coverage: runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + package_dir: [app, preprocessing] env: GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} @@ -24,10 +28,13 @@ jobs: use-public-rspm: true - uses: r-lib/actions/setup-renv@v2 + with: + working-directory: ${{ matrix.package_dir }} - name: Install dependencies run: install.packages("covr", "xml2") shell: Rscript {0} + working-directory: ${{ matrix.package_dir }} - name: Test coverage run: | @@ -38,6 +45,7 
@@ jobs: ) covr::to_cobertura(cov) shell: Rscript {0} + working-directory: ${{ matrix.package_dir }} - uses: codecov/codecov-action@v4 with: @@ -46,6 +54,7 @@ jobs: plugin: noop disable_search: true token: ${{ secrets.CODECOV_TOKEN }} + directory: ${{ matrix.package_dir }} - name: Show testthat output if: always() @@ -53,6 +62,7 @@ jobs: ## -------------------------------------------------------------------- find '${{ runner.temp }}/package' -name 'testthat.Rout*' -exec cat '{}' \; || true shell: bash + working-directory: ${{ matrix.package_dir }} - name: Upload test results if: failure() diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 6e03a97..24f2501 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -7,10 +7,12 @@ repos: - id: style-files args: [--style_pkg=styler, --style_fun=tidyverse_style] - id: roxygenize + args: [--root=./app] additional_dependencies: - bslib - config - DT + - fs - ggplot2 - golem - shiny @@ -35,6 +37,7 @@ repos: # codemeta must be above use-tidy-description when both are used # - id: codemeta-description-updated - id: use-tidy-description + args: [--root=./app] # - id: lintr # disabling lintr hook as it doesn't seem to correctly load the package # see also https://github.com/lorenzwalthert/precommit/issues/440 - id: parsable-R @@ -42,7 +45,8 @@ repos: - id: no-print-statement - id: no-debug-statement - id: deps-in-desc - exclude: ^scripts/ + args: [--root=./app] + exclude: "^scripts/|^app/dev/" - repo: https://github.com/pre-commit/pre-commit-hooks rev: v4.6.0 hooks: @@ -52,11 +56,6 @@ repos: files: '^\.Rbuildignore$' - id: end-of-file-fixer exclude: '\.Rd' - - repo: https://github.com/pre-commit-ci/pre-commit-ci-config - rev: v1.6.1 - hooks: - # Only required when https://pre-commit.ci is used for config validation - - id: check-pre-commit-ci-config - repo: local hooks: - id: forbid-to-commit diff --git a/R/utils_preprocessing_db.R b/R/utils_preprocessing_db.R deleted file mode 100644 index f725d25..0000000 --- a/R/utils_preprocessing_db.R +++ /dev/null @@ -1,152 +0,0 @@ -# nocov start - -#' Connec to a database -#' -#' General helper to connect to a databae through [`DBI::dbConnect()`], while ensuring -#' that the connection is closed when the connection object goes out of scope. -#' -#' @param ... arguments passed on to [`DBI::dbConnect()`] -#' @param .envir passed on to [`withr::defer()`] -#' -#' @return A [`DBI::DBIConnection-class`] object -#' @export -connect_to_db <- function(..., .envir = parent.frame()) { - con <- DBI::dbConnect(...) - withr::defer(DBI::dbDisconnect(con), envir = .envir) - con -} - -#' Connect to duckdb database -#' -#' @param db_path path to the duckdb database file -#' @param ... 
unused -#' @param .envir passed on to [`withr::defer()`] -#' -#' @return A [`DBI::DBIConnection-class`] object -#' @export -connect_to_test_duckdb <- function(db_path, ..., .envir = parent.frame()) { - if (!file.exists(db_path)) { - cli::cli_abort("Database file {.file {db_path}} not found") - } - - # Connect to the duckdb test database - rlang::check_installed("duckdb", reason = "to set up test database connection") - connect_to_db(duckdb::duckdb(dbdir = db_path), .envir = .envir) -} - - -#' Write data to a table in the database -#' -#' @param data data.frame, data to be written to the table -#' @param con A [`DBI::DBIConnection-class`] object -#' @param table character, name of the table to write to -#' @param schema character, name of the schema to be used -#' -#' @return `TRUE`, invisibly, if the operation was successful -#' @export -write_table <- function(data, con, table, schema) { - DBI::dbWriteTable( - conn = con, - name = DBI::Id(schema = schema, table = table), - value = data, - overwrite = TRUE - ) -} - - -#' Read a parquet table and sort the results -#' -#' @param path path to the parquet file to be read -#' @inheritParams nanoparquet::read_parquet -#' -#' @return A `data.frame` with the results sorted by all columns -#' @export -#' @importFrom dplyr arrange across everything -read_parquet_sorted <- function(path, options = nanoparquet::parquet_options()) { - if (!file.exists(path)) { - cli::cli_abort("File {.file {path}} not found") - } - - nanoparquet::read_parquet(path, options) |> - arrange(across(everything())) -} - -#' Function to produce the 'omopcat_concepts' table from a list of concept ids -#' -#' @param cdm A [`CDMConnector`] object, e.g. from [`CDMConnector::cdm_from_con()`] -#' @param concepts A vector of concept IDs -#' -#' @return A `data.frame` with the concept table -#' @export -query_concepts_table <- function(cdm, concepts) { - # Extract columns from concept table - cdm$concept |> - filter(.data$concept_id %in% concepts) |> - select( - "concept_id", - "concept_name", - "vocabulary_id", - "domain_id", - "concept_class_id", - "standard_concept", - "concept_code" - ) |> - collect() -} - -#' Generate the 'omopcat_monthly_counts' table -#' -#' @param cdm A [`CDMConnector`] object, e.g. from [`CDMConnector::cdm_from_con()`] -#' -#' @return A `data.frame` with the monthly counts -#' @export -process_monthly_counts <- function(cdm) { - # Combine results for all tables - out <- bind_rows( # nolint start - cdm$condition_occurrence |> calculate_monthly_counts(condition_concept_id, condition_start_date), - cdm$drug_exposure |> calculate_monthly_counts(drug_concept_id, drug_exposure_start_date), - cdm$procedure_occurrence |> calculate_monthly_counts(procedure_concept_id, procedure_date), - cdm$device_exposure |> calculate_monthly_counts(device_concept_id, device_exposure_start_date), - cdm$measurement |> calculate_monthly_counts(measurement_concept_id, measurement_date), - cdm$observation |> calculate_monthly_counts(observation_concept_id, observation_date), - cdm$specimen |> calculate_monthly_counts(specimen_concept_id, specimen_date) - ) # nolint end - - # Map concept names to the concept IDs - concept_names <- select(cdm$concept, .data$concept_id, .data$concept_name) |> - filter(.data$concept_id %in% out$concept_id) |> - collect() - out |> - dplyr::left_join(concept_names, by = c("concept_id" = "concept_id")) |> - select("concept_id", "concept_name", everything()) -} - -#' Generate the 'omopcat_summary_stats' table - -#' @param cdm A [`CDMConnector`] object, e.g. 
from [`CDMConnector::cdm_from_con()`] -#' -#' @return A `data.frame` with the summary statistics -#' @export -process_summary_stats <- function(cdm) { - table_names <- c("measurement", "observation") - concept_names <- c("measurement_concept_id", "observation_concept_id") - - # Combine results for all tables - stats <- purrr::map2(table_names, concept_names, ~ calculate_summary_stats(cdm[[.x]], .y)) - stats <- bind_rows(stats) - - # Map concept names to the concept_ids - concept_names <- select(cdm$concept, "concept_id", "concept_name") |> - filter(.data$concept_id %in% c(stats$concept_id, stats$value_as_concept_id)) |> - collect() - stats |> - # Order is important here, first we get the names for the value_as_concept_ids - # from the categorical data summaries and record it as `value_as_string` - dplyr::left_join(concept_names, by = c("value_as_concept_id" = "concept_id")) |> - rename(value_as_string = "concept_name") |> - # Then we get the names for the main concept_ids - dplyr::left_join(concept_names, by = c("concept_id" = "concept_id")) |> - select("concept_id", "concept_name", !"value_as_concept_id") -} - -# nocov end diff --git a/R/utils_preprocessing_summarise.R b/R/utils_preprocessing_summarise.R deleted file mode 100644 index aa1402e..0000000 --- a/R/utils_preprocessing_summarise.R +++ /dev/null @@ -1,121 +0,0 @@ -#' Calculate monthly statistics for an OMOP concept -#' -#' @param omop_table A table from the OMOP CDM -#' @param concept The name of the concept column to calculate statistics for -#' @param date The name of the date column to calculate statistics for -#' -#' @return A `data.frame` with the following columns: -#' - `concept_id`: The concept ID -#' - `concept_name`: The concept name -#' - `date_year`: The year of the date -#' - `date_month`: The month of the date -#' - `person_count`: The number of unique patients per concept for each month -#' - `records_per_person`: The average number of records per person per concept for each month -#' @export -#' @importFrom dplyr mutate group_by summarise select n n_distinct collect -calculate_monthly_counts <- function(omop_table, concept, date) { - # Extract year and month from date column - omop_table <- mutate(omop_table, - concept_id = {{ concept }}, - date_year = as.integer(lubridate::year({{ date }})), - date_month = as.integer(lubridate::month({{ date }})) - ) - - omop_table |> - group_by(.data$date_year, .data$date_month, .data$concept_id) |> - summarise( - record_count = n(), - person_count = n_distinct(.data$person_id), - ) |> - # NOTE: Explicitly cast types to avoid unexpected SQL behaviour, - # otherwise the records_per_person might end up as an int - # and the *_count vars as int64, which can give problems later - mutate( - record_count = as.integer(.data$record_count), - person_count = as.integer(.data$person_count), - records_per_person = as.double(.data$record_count) / as.double(.data$person_count) - ) |> - select( - "concept_id", - "date_year", - "date_month", - "record_count", - "person_count", - "records_per_person" - ) |> - ## Collect in case we're dealing with a database-stored table - collect() -} - -#' Calculate summary statistics for an OMOP table -#' -#' Calculates the mean and standard deviation for numeric concepts and the -#' frequency for categorical concepts. 
-#' -#' @param omop_table A table from the OMOP CDM -#' @param concept_name The name of the concept ID column -#' -#' @return A `data.frame` with the following columns: -#' - `concept_id`: The concept ID -#' - `summary_attribute`: The summary attribute (e.g. `"mean"`, `"sd"`, `"frequency"`) -#' - `value_as_number`: The value of the summary attribute -#' - `value_as_concept_id`: In case of a categorical concept, the concept ID for each category -#' @export -#' @importFrom dplyr all_of rename filter collect bind_rows -calculate_summary_stats <- function(omop_table, concept_name) { - stopifnot(is.character(concept_name)) - - omop_table <- rename(omop_table, concept_id = all_of(concept_name)) - - ## Avoid "no visible binding" notes - value_as_number <- value_as_concept_id <- NULL - - numeric_concepts <- filter(omop_table, !is.na(value_as_number)) - # beware CDM docs: NULL=no categorical result, 0=categorical result but no mapping - categorical_concepts <- filter(omop_table, !is.null(value_as_concept_id) & value_as_concept_id != 0) - - numeric_stats <- .summarise_numeric_concepts(numeric_concepts) |> collect() - categorical_stats <- .summarise_categorical_concepts(categorical_concepts) |> - # Convert value_as_number to double to make it compatible with numeric stats - mutate(value_as_number = as.double(.data$value_as_number)) |> - collect() - bind_rows(numeric_stats, categorical_stats) -} - -#' @importFrom dplyr group_by summarise -#' @importFrom stats sd -.summarise_numeric_concepts <- function(omop_table) { - value_as_number <- concept_id <- NULL - - # Calculate mean and sd - stats <- omop_table |> - group_by(concept_id) |> - summarise(mean = mean(value_as_number, na.rm = TRUE), sd = sd(value_as_number, na.rm = TRUE)) - - # Wrangle output to expected format - stats |> - tidyr::pivot_longer( - cols = c(mean, sd), - names_to = "summary_attribute", - values_to = "value_as_number" - ) -} - -#' @importFrom dplyr count mutate select -.summarise_categorical_concepts <- function(omop_table) { - concept_id <- value_as_concept_id <- summary_attribute <- NULL - - # Calculate frequencies - frequencies <- omop_table |> - count(concept_id, value_as_concept_id) - - # Wrangle output into the expected format - frequencies |> - mutate(summary_attribute = "frequency") |> - select( - concept_id, - summary_attribute, - value_as_number = n, - value_as_concept_id - ) -} diff --git a/README.md b/README.md index 7ae0369..11692ae 100644 --- a/README.md +++ b/README.md @@ -19,13 +19,22 @@ and subsequently export a selection of concepts of interest. ## Installation -You can install the development version of omopcat from within R like so: +The data catalogue consists of 2 separate R packages: + +- [`omopcat`](./app/) defines the [Shiny](https://shiny.posit.co/) app to display the catalogue +- [`omopcat.preprocessing`](./preprocessing/) contains the functionality to summarise an OMOP data source to use as input for the app + +The source for these is contained in the `app/` and `preprocessing/` directories, respectively. 
+ +You can install the development version of these packages from within R like so: ```r install.packages("remotes") usethis::create_github_token() credentials::set_github_pat() -remotes::install_github("SAFEHR-data/omopcat") + +remotes::install_github("SAFEHR-data/omopcat/app") # omopcat +remotes::install_github("SAFEHR-data/omopcat/preprocessing") # omopcat.preprocessing ``` You will need to copy the PAT from the web page that `usethis::create_github_token` @@ -62,18 +71,31 @@ To run a truly productionised version, we provide a [containerised deployment](# We provide a [Docker](https://www.docker.com/) container and [`docker-compose`](https://docs.docker.com/compose/) configuration to run the app in a production environment. -A test version can be run with +### Configure environment variables + +Copy the `.env.sample` file to `.env` and fill out the required values ```sh -docker compose -f deploy/docker-compose.test.yml up -d +cp .env.sample .env ``` -which will use the [test data](./data/test_data). +### Pre-processing + +The pre-processing pipeline can be run with the following command + +```sh +docker compose run preprocess +``` + +This will generate the files needed to run the `omopcat` app and store them +at the path defined by the `DATA_VOLUME_PATH` environment variable. + +### Running the app -To deploy a production version, using the data from `data/prod_data` (needs to be populated manually), run +To deploy a production version, using the data from `data/prod_data` (generated by the `preprocess` service), run ```sh -docker compose -f deploy/docker-compose.yml up -d +docker compose up -d omopcat ``` By default, the app will be hosted at `http://localhost:3838`. diff --git a/.Rbuildignore b/app/.Rbuildignore similarity index 75% rename from .Rbuildignore rename to app/.Rbuildignore index 9f7c10c..2580063 100644 --- a/.Rbuildignore +++ b/app/.Rbuildignore @@ -3,16 +3,21 @@ $run_dev.* ^.here$ ^LICENSE\.md$ ^\.Rproj\.user$ +^\.env.*$ ^\.github$ ^\.lintr$ ^\.pre-commit-config\.yaml$ ^\.renvignore$ +^app$ ^codecov\.yml$ ^data$ ^data-raw$ ^deploy$ ^dev$ +^docker-compose.*\.yml$ +^preprocessing$ ^renv$ ^renv\.lock$ ^scripts$ dev_history.R +^Dockerfile$ diff --git a/.Rprofile b/app/.Rprofile similarity index 100% rename from .Rprofile rename to app/.Rprofile diff --git a/.renvignore b/app/.renvignore similarity index 100% rename from .renvignore rename to app/.renvignore diff --git a/DESCRIPTION b/app/DESCRIPTION similarity index 93% rename from DESCRIPTION rename to app/DESCRIPTION index 2dd377e..c54900e 100644 --- a/DESCRIPTION +++ b/app/DESCRIPTION @@ -19,10 +19,8 @@ Imports: ggplot2, golem (>= 0.4.1), shiny (>= 1.9.1), - DBI, glue, tidyr, - withr, forcats, readr, lubridate, @@ -35,21 +33,18 @@ Imports: purrr, htmltools, markdown, - bsicons, - CDMConnector, - RPostgres + bsicons Suggests: devtools, usethis, + fs, styler, testthat (>= 3.0.0), spelling, here, lintr, - dbplyr, - RSQLite, precommit, - duckdb + withr Remotes: SAFEHR-data/omop-bundles Encoding: UTF-8 LazyData: true diff --git a/app/Dockerfile b/app/Dockerfile new file mode 100644 index 0000000..b87de04 --- /dev/null +++ b/app/Dockerfile @@ -0,0 +1,19 @@ +FROM rocker/shiny-verse:4.4.1 + +WORKDIR /app +ADD app . 
+COPY app/renv.lock ./renv.lock + +# Install renv and restore environment +# omopbundles is installed separately as renv is giving problems +# with GitHub packages +RUN install2.r --error --skipinstalled renv remotes && \ + R -e 'renv::restore(exclude = "omopbundles")' && \ + rm -rf /tmp/downloaded_packages +RUN R -e 'remotes::install_github("SAFEHR-data/omop-bundles")' + +# Install the app +RUN R -e 'remotes::install_local(".", dependencies = TRUE)' + +EXPOSE 3838 +CMD ["R", "-e", "options('shiny.port'=3838, shiny.host='0.0.0.0'); omopcat::run_app()" ] diff --git a/NAMESPACE b/app/NAMESPACE similarity index 54% rename from NAMESPACE rename to app/NAMESPACE index f66f92c..2bfcba2 100644 --- a/NAMESPACE +++ b/app/NAMESPACE @@ -1,15 +1,6 @@ # Generated by roxygen2: do not edit by hand -export(calculate_monthly_counts) -export(calculate_summary_stats) -export(connect_to_db) -export(connect_to_test_duckdb) -export(process_monthly_counts) -export(process_summary_stats) -export(query_concepts_table) -export(read_parquet_sorted) export(run_app) -export(write_table) import(bslib) import(ggplot2) import(shiny) @@ -17,20 +8,8 @@ importFrom(bslib,card) importFrom(bslib,card_header) importFrom(bslib,layout_columns) importFrom(bslib,page_fluid) -importFrom(dplyr,across) -importFrom(dplyr,all_of) -importFrom(dplyr,arrange) -importFrom(dplyr,bind_rows) -importFrom(dplyr,collect) -importFrom(dplyr,count) -importFrom(dplyr,everything) -importFrom(dplyr,filter) importFrom(dplyr,group_by) -importFrom(dplyr,mutate) -importFrom(dplyr,n) importFrom(dplyr,n_distinct) -importFrom(dplyr,rename) -importFrom(dplyr,select) importFrom(dplyr,summarise) importFrom(golem,activate_js) importFrom(golem,add_resource_path) @@ -38,6 +17,7 @@ importFrom(golem,bundle_resources) importFrom(golem,favicon) importFrom(golem,with_golem_options) importFrom(htmltools,includeMarkdown) +importFrom(markdown,mark) importFrom(rlang,.data) importFrom(shiny,NS) importFrom(shiny,shinyApp) @@ -45,4 +25,3 @@ importFrom(shiny,span) importFrom(shiny,tableOutput) importFrom(shiny,tagList) importFrom(shiny,textOutput) -importFrom(stats,sd) diff --git a/NEWS.md b/app/NEWS.md similarity index 100% rename from NEWS.md rename to app/NEWS.md diff --git a/R/app_config.R b/app/R/app_config.R similarity index 100% rename from R/app_config.R rename to app/R/app_config.R diff --git a/R/app_server.R b/app/R/app_server.R similarity index 100% rename from R/app_server.R rename to app/R/app_server.R diff --git a/R/app_ui.R b/app/R/app_ui.R similarity index 100% rename from R/app_ui.R rename to app/R/app_ui.R diff --git a/R/fct_monthly_count_plot.R b/app/R/fct_monthly_count_plot.R similarity index 100% rename from R/fct_monthly_count_plot.R rename to app/R/fct_monthly_count_plot.R diff --git a/R/fct_summary_stat_plot.R b/app/R/fct_summary_stat_plot.R similarity index 100% rename from R/fct_summary_stat_plot.R rename to app/R/fct_summary_stat_plot.R diff --git a/R/mod_bundles_summary.R b/app/R/mod_bundles_summary.R similarity index 100% rename from R/mod_bundles_summary.R rename to app/R/mod_bundles_summary.R diff --git a/R/mod_datatable.R b/app/R/mod_datatable.R similarity index 98% rename from R/mod_datatable.R rename to app/R/mod_datatable.R index 93f4ebf..a43f335 100644 --- a/R/mod_datatable.R +++ b/app/R/mod_datatable.R @@ -54,7 +54,6 @@ mod_datatable_ui <- function(id) { #' @return The selected row as a reactive object #' #' @noRd -#' @importFrom dplyr group_by summarise mod_datatable_server <- function(id, selected_dates, bundle_concepts) { 
stopifnot(is.reactive(selected_dates)) stopifnot(is.reactive(bundle_concepts)) @@ -66,7 +65,7 @@ mod_datatable_server <- function(id, selected_dates, bundle_concepts) { concepts_with_counts <- reactive({ join_counts_to_concepts(all_concepts, monthly_counts, selected_dates()) |> # Handle low frequencies - mutate( + dplyr::mutate( total_records = replace_low_frequencies(.data$total_records), mean_persons = replace_low_frequencies(.data$mean_persons), mean_records_per_person = replace_low_frequencies(.data$mean_records_per_person) diff --git a/R/mod_daterange.R b/app/R/mod_daterange.R similarity index 100% rename from R/mod_daterange.R rename to app/R/mod_daterange.R diff --git a/R/mod_export.R b/app/R/mod_export.R similarity index 100% rename from R/mod_export.R rename to app/R/mod_export.R diff --git a/R/mod_export_tab.R b/app/R/mod_export_tab.R similarity index 100% rename from R/mod_export_tab.R rename to app/R/mod_export_tab.R diff --git a/R/mod_manual.R b/app/R/mod_manual.R similarity index 94% rename from R/mod_manual.R rename to app/R/mod_manual.R index 62a4740..3c8dffd 100644 --- a/R/mod_manual.R +++ b/app/R/mod_manual.R @@ -9,6 +9,7 @@ #' @importFrom shiny NS tagList #' @importFrom bslib card card_header #' @importFrom htmltools includeMarkdown +#' @importFrom markdown mark mod_manual_ui <- function(id) { tagList( fluidRow( diff --git a/R/mod_plots.R b/app/R/mod_plots.R similarity index 100% rename from R/mod_plots.R rename to app/R/mod_plots.R diff --git a/R/mod_select_bundle.R b/app/R/mod_select_bundle.R similarity index 100% rename from R/mod_select_bundle.R rename to app/R/mod_select_bundle.R diff --git a/R/mod_select_for_export.R b/app/R/mod_select_for_export.R similarity index 100% rename from R/mod_select_for_export.R rename to app/R/mod_select_for_export.R diff --git a/R/omopcat-package.R b/app/R/omopcat-package.R similarity index 100% rename from R/omopcat-package.R rename to app/R/omopcat-package.R diff --git a/R/run_app.R b/app/R/run_app.R similarity index 100% rename from R/run_app.R rename to app/R/run_app.R diff --git a/R/utils_bundles.R b/app/R/utils_bundles.R similarity index 100% rename from R/utils_bundles.R rename to app/R/utils_bundles.R diff --git a/R/utils_get_data.R b/app/R/utils_get_data.R similarity index 100% rename from R/utils_get_data.R rename to app/R/utils_get_data.R diff --git a/R/utils_replace_low_frequencies.R b/app/R/utils_replace_low_frequencies.R similarity index 100% rename from R/utils_replace_low_frequencies.R rename to app/R/utils_replace_low_frequencies.R diff --git a/dev/01_start.R b/app/dev/01_start.R similarity index 100% rename from dev/01_start.R rename to app/dev/01_start.R diff --git a/dev/02_dev.R b/app/dev/02_dev.R similarity index 100% rename from dev/02_dev.R rename to app/dev/02_dev.R diff --git a/dev/03_deploy.R b/app/dev/03_deploy.R similarity index 99% rename from dev/03_deploy.R rename to app/dev/03_deploy.R index 0220968..a51cd94 100644 --- a/dev/03_deploy.R +++ b/app/dev/03_deploy.R @@ -28,4 +28,3 @@ devtools::build() ## Docker ---- ## If you want to deploy via a generic Dockerfile golem::add_dockerfile_with_renv() - diff --git a/dev/design/omop-data-catalogue-design.excalidraw b/app/dev/design/omop-data-catalogue-design.excalidraw similarity index 99% rename from dev/design/omop-data-catalogue-design.excalidraw rename to app/dev/design/omop-data-catalogue-design.excalidraw index 9302b04..4f61808 100644 --- a/dev/design/omop-data-catalogue-design.excalidraw +++ b/app/dev/design/omop-data-catalogue-design.excalidraw @@ 
-2829,4 +2829,4 @@ }, "prevTextMode": "parsed", "files": {} -} \ No newline at end of file +} diff --git a/dev/design/omop-data-catalogue-design.png b/app/dev/design/omop-data-catalogue-design.png similarity index 100% rename from dev/design/omop-data-catalogue-design.png rename to app/dev/design/omop-data-catalogue-design.png diff --git a/dev/run_dev.R b/app/dev/run_dev.R similarity index 100% rename from dev/run_dev.R rename to app/dev/run_dev.R diff --git a/app/inst/WORDLIST b/app/inst/WORDLIST new file mode 100644 index 0000000..dac9b27 --- /dev/null +++ b/app/inst/WORDLIST @@ -0,0 +1,22 @@ +Briot +CMD +Catalogue +Changelog +GAE +Lifecycle +OMOP +Pre +Ribeyre +UI +catalogue +codecov +containerised +dev +dropdown +funder +ggplot +golem +pre +productionised +repo +visualising diff --git a/inst/app/help_tab.md b/app/inst/app/help_tab.md similarity index 100% rename from inst/app/help_tab.md rename to app/inst/app/help_tab.md diff --git a/inst/app/www/favicon.ico b/app/inst/app/www/favicon.ico similarity index 100% rename from inst/app/www/favicon.ico rename to app/inst/app/www/favicon.ico diff --git a/inst/dev_data/omopcat_concepts.csv b/app/inst/dev_data/omopcat_concepts.csv similarity index 100% rename from inst/dev_data/omopcat_concepts.csv rename to app/inst/dev_data/omopcat_concepts.csv diff --git a/inst/dev_data/omopcat_monthly_counts.csv b/app/inst/dev_data/omopcat_monthly_counts.csv similarity index 100% rename from inst/dev_data/omopcat_monthly_counts.csv rename to app/inst/dev_data/omopcat_monthly_counts.csv diff --git a/inst/dev_data/omopcat_summary_stats.csv b/app/inst/dev_data/omopcat_summary_stats.csv similarity index 100% rename from inst/dev_data/omopcat_summary_stats.csv rename to app/inst/dev_data/omopcat_summary_stats.csv diff --git a/inst/golem-config.yml b/app/inst/golem-config.yml similarity index 100% rename from inst/golem-config.yml rename to app/inst/golem-config.yml diff --git a/man/all_bundles.Rd b/app/man/all_bundles.Rd similarity index 100% rename from man/all_bundles.Rd rename to app/man/all_bundles.Rd diff --git a/man/get_bundle_concepts.Rd b/app/man/get_bundle_concepts.Rd similarity index 100% rename from man/get_bundle_concepts.Rd rename to app/man/get_bundle_concepts.Rd diff --git a/man/omopcat-package.Rd b/app/man/omopcat-package.Rd similarity index 100% rename from man/omopcat-package.Rd rename to app/man/omopcat-package.Rd diff --git a/man/run_app.Rd b/app/man/run_app.Rd similarity index 100% rename from man/run_app.Rd rename to app/man/run_app.Rd diff --git a/omopcat.Rproj b/app/omopcat.Rproj similarity index 100% rename from omopcat.Rproj rename to app/omopcat.Rproj diff --git a/renv.lock b/app/renv.lock similarity index 81% rename from renv.lock rename to app/renv.lock index 386388f..5e9d181 100644 --- a/renv.lock +++ b/app/renv.lock @@ -9,48 +9,6 @@ ] }, "Packages": { - "CDMConnector": { - "Package": "CDMConnector", - "Version": "1.5.0", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "DBI", - "R", - "checkmate", - "cli", - "dbplyr", - "dplyr", - "fs", - "generics", - "glue", - "jsonlite", - "lifecycle", - "methods", - "omopgenerics", - "purrr", - "readr", - "rlang", - "stringi", - "stringr", - "tidyr", - "tidyselect", - "waldo", - "withr" - ], - "Hash": "2bae94aed83da2007c938f3d2fa1a48f" - }, - "DBI": { - "Package": "DBI", - "Version": "1.2.3", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R", - "methods" - ], - "Hash": "065ae649b05f1ff66bb0c793107508f5" - }, "DT": { "Package": "DT", "Version": 
"0.33", @@ -174,44 +132,6 @@ ], "Hash": "45f0398006e83a5b10b72a90663d8d8c" }, - "RPostgres": { - "Package": "RPostgres", - "Version": "1.4.7", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "DBI", - "R", - "bit64", - "blob", - "cpp11", - "hms", - "lubridate", - "methods", - "plogr", - "withr" - ], - "Hash": "beb7e18bf3f9e096f716a52a77ec793c" - }, - "RSQLite": { - "Package": "RSQLite", - "Version": "2.3.7", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "DBI", - "R", - "bit64", - "blob", - "cpp11", - "memoise", - "methods", - "pkgconfig", - "plogr", - "rlang" - ], - "Hash": "46b45a4dd7bb0e0f4e3fc22245817240" - }, "Rcpp": { "Package": "Rcpp", "Version": "1.0.13-1", @@ -287,18 +207,6 @@ ], "Hash": "e84984bf5f12a18628d9a02322128dfd" }, - "blob": { - "Package": "blob", - "Version": "1.2.4", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "methods", - "rlang", - "vctrs" - ], - "Hash": "40415719b5a479b87949f3aa0aee737c" - }, "brew": { "Package": "brew", "Version": "1.0-10", @@ -316,26 +224,6 @@ ], "Hash": "c1ee497a6d999947c2c224ae46799b1a" }, - "broom": { - "Package": "broom", - "Version": "1.0.7", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R", - "backports", - "dplyr", - "generics", - "glue", - "lifecycle", - "purrr", - "rlang", - "stringr", - "tibble", - "tidyr" - ], - "Hash": "8fcc818f3b9887aebaf206f141437cc9" - }, "bsicons": { "Package": "bsicons", "Version": "0.1.2", @@ -396,30 +284,6 @@ ], "Hash": "d7e13f49c19103ece9e58ad2d83a7354" }, - "cellranger": { - "Package": "cellranger", - "Version": "1.1.0", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R", - "rematch", - "tibble" - ], - "Hash": "f61dbaec772ccd2e17705c1e872e9e7c" - }, - "checkmate": { - "Package": "checkmate", - "Version": "2.3.2", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R", - "backports", - "utils" - ], - "Hash": "0e14e01ce07e7c88fd25de6d4260d26b" - }, "cli": { "Package": "cli", "Version": "3.6.3", @@ -482,19 +346,6 @@ ], "Hash": "8b7222e9d9eb5178eea545c0c4d33fc2" }, - "conflicted": { - "Package": "conflicted", - "Version": "1.2.0", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R", - "cli", - "memoise", - "rlang" - ], - "Hash": "bb097fccb22d156624fd07cd2894ddb6" - }, "cpp11": { "Package": "cpp11", "Version": "0.5.0", @@ -579,34 +430,6 @@ ], "Hash": "2e00b378fc3be69c865120d9f313039a" }, - "dbplyr": { - "Package": "dbplyr", - "Version": "2.5.0", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "DBI", - "R", - "R6", - "blob", - "cli", - "dplyr", - "glue", - "lifecycle", - "magrittr", - "methods", - "pillar", - "purrr", - "rlang", - "tibble", - "tidyr", - "tidyselect", - "utils", - "vctrs", - "withr" - ], - "Hash": "39b2e002522bfd258039ee4e889e0fd1" - }, "desc": { "Package": "desc", "Version": "1.4.3", @@ -723,38 +546,6 @@ ], "Hash": "fedd9d00c2944ff00a0e2696ccf048ec" }, - "dtplyr": { - "Package": "dtplyr", - "Version": "1.3.1", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R", - "cli", - "data.table", - "dplyr", - "glue", - "lifecycle", - "rlang", - "tibble", - "tidyselect", - "vctrs" - ], - "Hash": "54ed3ea01b11e81a86544faaecfef8e2" - }, - "duckdb": { - "Package": "duckdb", - "Version": "1.1.2", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "DBI", - "R", - "methods", - "utils" - ], - "Hash": "83a09ee9c8380fecfcea1daeaa99e3b2" - }, "ellipsis": { "Package": "ellipsis", 
"Version": "0.3.2", @@ -841,28 +632,6 @@ ], "Hash": "7f48af39fa27711ea5fbd183b399920d" }, - "gargle": { - "Package": "gargle", - "Version": "1.5.2", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R", - "cli", - "fs", - "glue", - "httr", - "jsonlite", - "lifecycle", - "openssl", - "rappdirs", - "rlang", - "stats", - "utils", - "withr" - ], - "Hash": "fc0b272e5847c58cd5da9b20eedbd026" - }, "generics": { "Package": "generics", "Version": "0.1.3", @@ -971,59 +740,6 @@ ], "Hash": "11be24963e2f220c403eafd01e2259d5" }, - "googledrive": { - "Package": "googledrive", - "Version": "2.1.1", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R", - "cli", - "gargle", - "glue", - "httr", - "jsonlite", - "lifecycle", - "magrittr", - "pillar", - "purrr", - "rlang", - "tibble", - "utils", - "uuid", - "vctrs", - "withr" - ], - "Hash": "e99641edef03e2a5e87f0a0b1fcc97f4" - }, - "googlesheets4": { - "Package": "googlesheets4", - "Version": "1.1.1", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R", - "cellranger", - "cli", - "curl", - "gargle", - "glue", - "googledrive", - "httr", - "ids", - "lifecycle", - "magrittr", - "methods", - "purrr", - "rematch2", - "rlang", - "tibble", - "utils", - "vctrs", - "withr" - ], - "Hash": "d6db1667059d027da730decdc214b959" - }, "gtable": { "Package": "gtable", "Version": "0.3.6", @@ -1040,27 +756,6 @@ ], "Hash": "de949855009e2d4d0e52a844e30617ae" }, - "haven": { - "Package": "haven", - "Version": "2.5.4", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R", - "cli", - "cpp11", - "forcats", - "hms", - "lifecycle", - "methods", - "readr", - "rlang", - "tibble", - "tidyselect", - "vctrs" - ], - "Hash": "9171f898db9d9c4c1b2c745adc2c1ef1" - }, "here": { "Package": "here", "Version": "1.0.1", @@ -1190,17 +885,6 @@ ], "Hash": "e58f80d4c5b4f0bab1456956d6ca6aad" }, - "ids": { - "Package": "ids", - "Version": "1.0.1", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "openssl", - "uuid" - ], - "Hash": "99df65cfef20e525ed38c3d2577f7190" - }, "ini": { "Package": "ini", "Version": "0.3.1", @@ -1422,24 +1106,6 @@ ], "Hash": "fec5f52652d60615fdb3957b3d74324a" }, - "modelr": { - "Package": "modelr", - "Version": "0.1.11", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R", - "broom", - "magrittr", - "purrr", - "rlang", - "tibble", - "tidyr", - "tidyselect", - "vctrs" - ], - "Hash": "4f50122dc256b1b6996a4703fecea821" - }, "munsell": { "Package": "munsell", "Version": "0.5.1", @@ -1495,28 +1161,6 @@ ], "Hash": "caecf4e78df185f39703c28a73658659" }, - "omopgenerics": { - "Package": "omopgenerics", - "Version": "0.3.1", - "Source": "Repository", - "Repository": "https://packagemanager.posit.co/cran/2024-09-30", - "Requirements": [ - "R", - "cli", - "dbplyr", - "dplyr", - "glue", - "lifecycle", - "methods", - "purrr", - "rlang", - "snakecase", - "stringr", - "tidyr", - "vctrs" - ], - "Hash": "c17db6329c631b9010951ad112341ba3" - }, "openssl": { "Package": "openssl", "Version": "2.2.2", @@ -1621,13 +1265,6 @@ ], "Hash": "2ec30ffbeec83da57655b850cf2d3e0e" }, - "plogr": { - "Package": "plogr", - "Version": "0.2.0", - "Source": "Repository", - "Repository": "CRAN", - "Hash": "09eb987710984fc2905c7129c7d85e65" - }, "plotly": { "Package": "plotly", "Version": "4.10.4", @@ -1844,38 +1481,6 @@ ], "Hash": "9de96463d2117f6ac49980577939dfb3" }, - "readxl": { - "Package": "readxl", - "Version": "1.4.3", - "Source": "Repository", - "Repository": "CRAN", - 
"Requirements": [ - "R", - "cellranger", - "cpp11", - "progress", - "tibble", - "utils" - ], - "Hash": "8cf9c239b96df1bbb133b74aef77ad0a" - }, - "rematch": { - "Package": "rematch", - "Version": "2.0.0", - "Source": "Repository", - "Repository": "CRAN", - "Hash": "cbff1b666c6fa6d21202f07e2318d4f1" - }, - "rematch2": { - "Package": "rematch2", - "Version": "2.1.2", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "tibble" - ], - "Hash": "76c9e04c712a05848ae7a23d2f170a40" - }, "remotes": { "Package": "remotes", "Version": "2.5.0", @@ -1900,28 +1505,6 @@ ], "Hash": "47623f66b4e80b3b0587bc5d7b309888" }, - "reprex": { - "Package": "reprex", - "Version": "2.1.1", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R", - "callr", - "cli", - "clipr", - "fs", - "glue", - "knitr", - "lifecycle", - "rlang", - "rmarkdown", - "rstudioapi", - "utils", - "withr" - ], - "Hash": "97b1d5361a24d9fb588db7afe3e5bcbf" - }, "rex": { "Package": "rex", "Version": "1.2.1", @@ -2021,25 +1604,6 @@ ], "Hash": "a9881dfed103e83f9de151dc17002cd1" }, - "rvest": { - "Package": "rvest", - "Version": "1.0.4", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R", - "cli", - "glue", - "httr", - "lifecycle", - "magrittr", - "rlang", - "selectr", - "tibble", - "xml2" - ], - "Hash": "0bcf0c6f274e90ea314b812a6d19a519" - }, "sass": { "Package": "sass", "Version": "0.4.9", @@ -2074,19 +1638,6 @@ ], "Hash": "c19df082ba346b0ffa6f833e92de34d1" }, - "selectr": { - "Package": "selectr", - "Version": "0.4-2", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R", - "R6", - "methods", - "stringr" - ], - "Hash": "3838071b66e0c566d55cc26bd6e27bf4" - }, "sessioninfo": { "Package": "sessioninfo", "Version": "1.2.2", @@ -2133,18 +1684,6 @@ ], "Hash": "6a293995a66e12c48d13aa1f957d09c7" }, - "snakecase": { - "Package": "snakecase", - "Version": "0.11.1", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R", - "stringi", - "stringr" - ], - "Hash": "58767e44739b76965332e8a4fe3f91f1" - }, "sourcetools": { "Package": "sourcetools", "Version": "0.1.7-1", @@ -2336,46 +1875,6 @@ ], "Hash": "829f27b9c4919c16b593794a6344d6c0" }, - "tidyverse": { - "Package": "tidyverse", - "Version": "2.0.0", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R", - "broom", - "cli", - "conflicted", - "dbplyr", - "dplyr", - "dtplyr", - "forcats", - "ggplot2", - "googledrive", - "googlesheets4", - "haven", - "hms", - "httr", - "jsonlite", - "lubridate", - "magrittr", - "modelr", - "pillar", - "purrr", - "ragg", - "readr", - "readxl", - "reprex", - "rlang", - "rstudioapi", - "rvest", - "stringr", - "tibble", - "tidyr", - "xml2" - ], - "Hash": "c328568cd14ea89a83bd4ca7f54ae07e" - }, "timechange": { "Package": "timechange", "Version": "0.3.0", @@ -2463,16 +1962,6 @@ ], "Hash": "62b65c52671e6665f803ff02954446e9" }, - "uuid": { - "Package": "uuid", - "Version": "1.2-1", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R" - ], - "Hash": "34e965e62a41fcafb1ca60e9b142085b" - }, "vctrs": { "Package": "vctrs", "Version": "0.6.5", @@ -2525,7 +2014,7 @@ }, "waldo": { "Package": "waldo", - "Version": "0.6.0", + "Version": "0.6.1", "Source": "Repository", "Repository": "CRAN", "Requirements": [ @@ -2536,7 +2025,7 @@ "methods", "rlang" ], - "Hash": "53ec6571dc7a321797cde0abe007ff53" + "Hash": "52f574062a7b66e56926988c3fbdb3b7" }, "whisker": { "Package": "whisker", diff --git a/renv/.gitignore b/app/renv/.gitignore similarity index 
100% rename from renv/.gitignore rename to app/renv/.gitignore diff --git a/renv/activate.R b/app/renv/activate.R similarity index 100% rename from renv/activate.R rename to app/renv/activate.R diff --git a/renv/settings.json b/app/renv/settings.json similarity index 100% rename from renv/settings.json rename to app/renv/settings.json diff --git a/tests/spelling.R b/app/tests/spelling.R similarity index 100% rename from tests/spelling.R rename to app/tests/spelling.R diff --git a/tests/testthat.R b/app/tests/testthat.R similarity index 100% rename from tests/testthat.R rename to app/tests/testthat.R diff --git a/tests/testthat/_snaps/utils_get_data/concepts_table.csv b/app/tests/testthat/_snaps/utils_get_data/concepts_table.csv similarity index 100% rename from tests/testthat/_snaps/utils_get_data/concepts_table.csv rename to app/tests/testthat/_snaps/utils_get_data/concepts_table.csv diff --git a/tests/testthat/_snaps/utils_get_data/monthly_counts.csv b/app/tests/testthat/_snaps/utils_get_data/monthly_counts.csv similarity index 100% rename from tests/testthat/_snaps/utils_get_data/monthly_counts.csv rename to app/tests/testthat/_snaps/utils_get_data/monthly_counts.csv diff --git a/tests/testthat/_snaps/utils_get_data/summary_stats.csv b/app/tests/testthat/_snaps/utils_get_data/summary_stats.csv similarity index 100% rename from tests/testthat/_snaps/utils_get_data/summary_stats.csv rename to app/tests/testthat/_snaps/utils_get_data/summary_stats.csv diff --git a/tests/testthat/helper-mock_data.R b/app/tests/testthat/helper-mock_data.R similarity index 100% rename from tests/testthat/helper-mock_data.R rename to app/tests/testthat/helper-mock_data.R diff --git a/tests/testthat/setup.R b/app/tests/testthat/setup.R similarity index 100% rename from tests/testthat/setup.R rename to app/tests/testthat/setup.R diff --git a/tests/testthat/test-fct_monthly_count_plot.R b/app/tests/testthat/test-fct_monthly_count_plot.R similarity index 100% rename from tests/testthat/test-fct_monthly_count_plot.R rename to app/tests/testthat/test-fct_monthly_count_plot.R diff --git a/tests/testthat/test-fct_summary_stat_plot.R b/app/tests/testthat/test-fct_summary_stat_plot.R similarity index 100% rename from tests/testthat/test-fct_summary_stat_plot.R rename to app/tests/testthat/test-fct_summary_stat_plot.R diff --git a/tests/testthat/test-mod_bundles_summary.R b/app/tests/testthat/test-mod_bundles_summary.R similarity index 100% rename from tests/testthat/test-mod_bundles_summary.R rename to app/tests/testthat/test-mod_bundles_summary.R diff --git a/tests/testthat/test-mod_datatable.R b/app/tests/testthat/test-mod_datatable.R similarity index 100% rename from tests/testthat/test-mod_datatable.R rename to app/tests/testthat/test-mod_datatable.R diff --git a/tests/testthat/test-mod_daterange.R b/app/tests/testthat/test-mod_daterange.R similarity index 100% rename from tests/testthat/test-mod_daterange.R rename to app/tests/testthat/test-mod_daterange.R diff --git a/tests/testthat/test-mod_export_tab.R b/app/tests/testthat/test-mod_export_tab.R similarity index 100% rename from tests/testthat/test-mod_export_tab.R rename to app/tests/testthat/test-mod_export_tab.R diff --git a/tests/testthat/test-mod_exportsummary.R b/app/tests/testthat/test-mod_exportsummary.R similarity index 100% rename from tests/testthat/test-mod_exportsummary.R rename to app/tests/testthat/test-mod_exportsummary.R diff --git a/tests/testthat/test-mod_plots.R b/app/tests/testthat/test-mod_plots.R similarity index 100% rename from 
tests/testthat/test-mod_plots.R rename to app/tests/testthat/test-mod_plots.R diff --git a/tests/testthat/test-mod_select_bundle.R b/app/tests/testthat/test-mod_select_bundle.R similarity index 100% rename from tests/testthat/test-mod_select_bundle.R rename to app/tests/testthat/test-mod_select_bundle.R diff --git a/tests/testthat/test-mod_select_for_export.R b/app/tests/testthat/test-mod_select_for_export.R similarity index 100% rename from tests/testthat/test-mod_select_for_export.R rename to app/tests/testthat/test-mod_select_for_export.R diff --git a/tests/testthat/test-run_app.R b/app/tests/testthat/test-run_app.R similarity index 100% rename from tests/testthat/test-run_app.R rename to app/tests/testthat/test-run_app.R diff --git a/tests/testthat/test-utils_bundles.R b/app/tests/testthat/test-utils_bundles.R similarity index 100% rename from tests/testthat/test-utils_bundles.R rename to app/tests/testthat/test-utils_bundles.R diff --git a/tests/testthat/test-utils_get_data.R b/app/tests/testthat/test-utils_get_data.R similarity index 100% rename from tests/testthat/test-utils_get_data.R rename to app/tests/testthat/test-utils_get_data.R diff --git a/codecov.yml b/codecov.yml index 9aba9bd..aefd73f 100644 --- a/codecov.yml +++ b/codecov.yml @@ -10,3 +10,14 @@ coverage: target: auto threshold: 1% informational: true + +component_management: + individual_components: + - component_id: app + name: app + paths: + - app/ + - component_id: preprocessing + name: preprocessing + paths: + - preprocessing/ diff --git a/data-raw/test_db/create_test_db.R b/data-raw/test_db/create_test_db.R new file mode 100644 index 0000000..8b500f2 --- /dev/null +++ b/data-raw/test_db/create_test_db.R @@ -0,0 +1,113 @@ +# Creates a test database from the synthea-allergies-10k example dataset +# with added dummy data for observations and measurements + +# PRODUCED FOR A SPECIFIC DATASET: +# synthea-allergies-10k +# (but could work for others) + + +# Setup --------------------------------------------------------------------------------------- + +library(readr) + +here::i_am("data-raw/test_db/create_test_db.R") + +dir <- here::here("data-raw/test_db/eunomia") +name <- "synthea-allergies-10k" +version <- "5.3" +schema <- "main" + +# Download the example data if it doesn't exist yet and create the duckdb database +withr::with_envvar( + new = c(EUNOMIA_DATA_FOLDER = dir), + { + invisible(CDMConnector::eunomia_dir(dataset_name = name, cdm_version = version)) + } +) + +con <- DBI::dbConnect(duckdb::duckdb(glue::glue("{dir}/{name}_{version}_1.1.duckdb"))) +withr::defer(DBI::dbDisconnect(con)) + + +# Insert dummy tables ------------------------------------------------------------------------- + +#' Write data to a table in the database +#' +#' @param data data.frame, data to be written to the table +#' @param con A [`DBI::DBIConnection-class`] object +#' @param table character, name of the table to write to +#' @param schema character, name of the schema to be used +#' +#' @return `TRUE`, invisibly, if the operation was successful +write_table <- function(data, con, table, schema) { + DBI::dbWriteTable( + conn = con, + name = DBI::Id(schema = schema, table = table), + value = data, + overwrite = TRUE + ) +} + + +## Load dummy data and write tables to database +## We explicitly set the column types for columns that are needed later down the pipeline +dummy_measurements <- read_csv( + here::here("data-raw/test_db/dummy/measurement.csv"), + col_types = cols( + measurement_id = col_integer(), + person_id = col_integer(), + 
measurement_concept_id = col_integer(), + measurement_date = col_date(), + value_as_number = col_double(), + value_as_concept_id = col_integer(), + measurement_time = col_character(), + measurement_type_concept_id = col_integer(), + operator_concept_id = col_integer(), + unit_concept_id = col_integer(), + range_low = col_number(), + range_high = col_number(), + provider_id = col_integer(), + visit_occurrence_id = col_integer(), + visit_detail_id = col_integer(), + measurement_source_value = col_character(), + measurement_source_concept_id = col_integer(), + unit_source_value = col_character(), + value_source_value = col_character() + ) +) +write_table(dummy_measurements, con, "measurement", schema = schema) + +dummy_observations <- read_csv( + here::here("data-raw/test_db/dummy/observation.csv"), + col_types = cols( + observation_id = col_integer(), + person_id = col_integer(), + observation_concept_id = col_integer(), + observation_date = col_date(), + value_as_number = col_double(), + value_as_string = col_character(), + value_as_concept_id = col_integer(), + observation_type_concept_id = col_integer(), + qualifier_concept_id = col_integer(), + unit_concept_id = col_integer(), + provider_id = col_integer(), + visit_occurrence_id = col_integer(), + visit_detail_id = col_integer(), + observation_source_value = col_character(), + observation_source_concept_id = col_integer(), + unit_source_value = col_character(), + qualifier_source_value = col_character() + ) +) +write_table(dummy_observations, con, "observation", schema = schema) + +## Verify integrity, turn warnings into error +tryCatch( + cdm <- CDMConnector::cdm_from_con(con, cdm_schema = schema, write_schema = schema), + warning = function(cnd) { + msg <- sprintf("CDM integrity check produced warnings: %s", conditionMessage(cnd)) + rlang::abort(msg, call = NULL) + } +) + +cli::cli_alert_success("Test database set up successfully") diff --git a/data/test_data/omopcat_concepts.parquet b/data/test_data/omopcat_concepts.parquet index ee69534..fb5f088 100644 Binary files a/data/test_data/omopcat_concepts.parquet and b/data/test_data/omopcat_concepts.parquet differ diff --git a/data/test_data/omopcat_monthly_counts.parquet b/data/test_data/omopcat_monthly_counts.parquet index ff7d3b3..6cfaf3d 100644 Binary files a/data/test_data/omopcat_monthly_counts.parquet and b/data/test_data/omopcat_monthly_counts.parquet differ diff --git a/data/test_data/omopcat_summary_stats.parquet b/data/test_data/omopcat_summary_stats.parquet index 14929c3..b42faae 100644 Binary files a/data/test_data/omopcat_summary_stats.parquet and b/data/test_data/omopcat_summary_stats.parquet differ diff --git a/deploy/.Renviron.sample b/deploy/.Renviron.sample deleted file mode 100644 index 34b4422..0000000 --- a/deploy/.Renviron.sample +++ /dev/null @@ -1,7 +0,0 @@ -OMOPCAT_DATA_PATH= -DB_NAME= -HOST= -PORT= -DB_USERNAME= -DB_PASSWORD= -DB_CDM_SCHEMA= diff --git a/deploy/Dockerfile b/deploy/Dockerfile deleted file mode 100644 index 6b09a8a..0000000 --- a/deploy/Dockerfile +++ /dev/null @@ -1,27 +0,0 @@ -FROM rocker/shiny-verse:4.4.1 - -WORKDIR /app -COPY deploy/renv.lock.prod renv.lock - -# Install renv and restore environment -# omopbundles is installed separately as renv is giving problems -# with GitHub packages -RUN R -e 'remotes::install_github("SAFEHR-data/omop-bundles")' -RUN install2.r --error --skipinstalled renv && \ - R -e 'renv::restore(exclude = "omopbundles")' - -COPY deploy/omopcat_*.tar.gz /app.tar.gz -RUN R -e 'remotes::install_local("/app.tar.gz", 
upgrade="never", dependencies = FALSE)' && \ - rm /app.tar.gz - -ARG OMOPCAT_DATA_PATH - -ADD scripts ./scripts -COPY deploy/.Renviron .Renviron - -EXPOSE 3838 -CMD ["R", "-e", \ - "options('shiny.port'=3838,shiny.host='0.0.0.0'); \ - source('scripts/create_prod_data.R'); \ - omopcat::run_app()" \ - ] diff --git a/deploy/README.md b/deploy/README.md deleted file mode 100644 index 7d8f4bf..0000000 --- a/deploy/README.md +++ /dev/null @@ -1,77 +0,0 @@ -# Deploy - -## Build the package - -From the package root directory, run from `R`: - -```r -pkgbuild::build(path = ".", dest_path = "deploy") -``` - -This will create a `omopcat_*.tar.gz` file in the `deploy/` directory with a built version of the -package, which will be used to install the package in the Docker container. The reasoning here -is that we have a **production** version of the package that is separate from the **development** -version. The production version would ideally be pinned to a release version, e.g. `0.1.0` and only -be updated when a new release is made, e.g. `0.1.1` or `0.2.0`. This should allow us to continue -developing the app without affecting the version that is running in production. - -## Create `renv.lock.prod` - -From the package root directory, run from `R`: - -```r -renv::snapshot(project = ".", lockfile = "./deploy/renv.lock.prod", type = "explicit") -``` - -This `renv.lock.prod` file will be a subset of the `renv.lock` that is in the package root. The -latter also includes development dependencies, which are not necessary to run the app in production. - -## Populate the `data/prod_data` directory - -Running the production version of the app requires to populate the -[`data/prod_data`](../data/prod_data/) directory with -the necessary `parquet` files (see [`data/test_data`](../data/test_data/) for an example). - -We provide the [`scripts/create_prod_data.R`](../scripts/create_prod_data.R) -script to facilitate this. This script will be run automatically when running the Docker container -if the mounted data directory is found to be empty. - -A few environment variables are required to run this script: - -* `DB_NAME`: the name of the database to connect to -* `HOST`: the host of the database -* `PORT`: the port on which to connect to the database -* `DB_USERNAME`: the username to connect to the database -* `DB_PASSWORD`: the password to connect to the database -* `DB_CDM_SCHEMA`: the schema of the CDM database, note that this needs to have both read and write - permissions for the user to be able to use the - [`CDMConnector`](https://darwin-eu.github.io/CDMConnector/index.html) package - -These should be defined in a local `.Renviron` file (not git-tracked) in the `deploy/` directory. -See the `.Renviron.sample` file for a template. - -## Build Docker images and run the app - -To launch the test version of the app, run: - -```shell -docker compose -f docker-compose.test.yml up -d --build -``` - -This will run a **test** version of the production app using the synthetic data in -[`data/test_data`](../data/test_data/). - -To launch the production version of the up, run: - -```shell -docker compose up -d --build -``` - -This will build the container and install the necessary dependencies to run the app. -The `-d` flag runs the `docker compose` command in "detached" mode, meaning the app will be run -in the background and you can safely quit your terminal session. - -By default, the app will be hosted at `https://localhost:3838`. 
- -Running the app on GAE05 will make it available at `http://uclvlddpragae05:3838` within the UCLH -network. diff --git a/deploy/docker-compose.test.yml b/deploy/docker-compose.test.yml deleted file mode 100644 index ee733be..0000000 --- a/deploy/docker-compose.test.yml +++ /dev/null @@ -1,7 +0,0 @@ -services: - calyspo_test: - extends: - file: docker-compose.yml - service: omopcat - volumes: - - ../data/test_data:/etc/omopcat/data diff --git a/deploy/docker-compose.yml b/deploy/docker-compose.yml deleted file mode 100644 index 97a1820..0000000 --- a/deploy/docker-compose.yml +++ /dev/null @@ -1,24 +0,0 @@ -services: - omopcat: - build: - # Use repo root as context so we can copy scripts directory to container - context: .. - dockerfile: deploy/Dockerfile - args: - HTTP_PROXY: ${HTTP_PROXY} - HTTPS_PROXY: ${HTTPS_PROXY} - OMOPCAT_DATA_PATH: /etc/omopcat/data - image: omopcat:latest - platform: linux/amd64 - restart: unless-stopped - environment: - HTTP_PROXY: ${HTTP_PROXY} - HTTPS_PROXY: ${HTTPS_PROXY} - GOLEM_CONFIG_ACTIVE: production - OMOPCAT_DATA_PATH: /etc/omopcat/data - LOW_FREQUENCY_THRESHOLD: 5 - LOW_FREQUENCY_REPLACEMENT: 2.5 - volumes: - - ../data/prod_data:/etc/omopcat/data - ports: - - 3838:3838 diff --git a/deploy/omopcat_0.2.1.tar.gz b/deploy/omopcat_0.2.1.tar.gz deleted file mode 100644 index ca512fa..0000000 Binary files a/deploy/omopcat_0.2.1.tar.gz and /dev/null differ diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..48366be --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,50 @@ +services: + preprocess: + build: + context: . + dockerfile: preprocessing/Dockerfile + args: + # Required for running on GAE + HTTP_PROXY: ${HTTP_PROXY} + HTTPS_PROXY: ${HTTPS_PROXY} + image: omopcat_preprocessing:latest + platform: linux/amd64 + environment: + ENV: ${ENV} + PREPROCESS_OUT_PATH: /mnt/preprocessing/data + DB_NAME: ${PREPROCESS_DB_NAME} + HOST: ${PREPROCESS_HOST} + PORT: ${PREPROCESS_PORT} + DB_USERNAME: ${PREPROCESS_DB_USERNAME} + DB_PASSWORD: ${PREPROCESS_DB_PASSWORD} + DB_CDM_SCHEMA: ${PREPROCESS_DB_CDM_SCHEMA} + EUNOMIA_DATA_FOLDER: /mnt/preprocessing/data-raw/test_db + command: ["R", "-e", "omopcat.preprocessing::preprocess()"] + volumes: + - ${DATA_VOLUME_PATH}:/mnt/preprocessing/data + - ${TEST_DB_PATH}:/mnt/preprocessing/data-raw/test_db + + omopcat: + build: + # Use repo root as context so we can copy scripts directory to container + context: . 
+ dockerfile: app/Dockerfile + args: + # Required for running on GAE + HTTP_PROXY: ${HTTP_PROXY} + HTTPS_PROXY: ${HTTPS_PROXY} + image: omopcat:latest + platform: linux/amd64 + restart: unless-stopped + environment: + ENV: ${ENV} + HTTP_PROXY: ${HTTP_PROXY} + HTTPS_PROXY: ${HTTPS_PROXY} + GOLEM_CONFIG_ACTIVE: production + OMOPCAT_DATA_PATH: /etc/omopcat/data + LOW_FREQUENCY_THRESHOLD: 5 + LOW_FREQUENCY_REPLACEMENT: 2.5 + volumes: + - ${DATA_VOLUME_PATH}:/etc/omopcat/data + ports: + - 3838:3838 diff --git a/inst/WORDLIST b/inst/WORDLIST deleted file mode 100644 index 9b571ff..0000000 --- a/inst/WORDLIST +++ /dev/null @@ -1,12 +0,0 @@ -CDM -CMD -Catalogue -Lifecycle -Nanoparquet -OMOP -catalogue -codecov -containerised -duckdb -golem -productionised diff --git a/man/connect_to_test_duckdb.Rd b/man/connect_to_test_duckdb.Rd deleted file mode 100644 index 380616b..0000000 --- a/man/connect_to_test_duckdb.Rd +++ /dev/null @@ -1,21 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils_preprocessing_db.R -\name{connect_to_test_duckdb} -\alias{connect_to_test_duckdb} -\title{Connect to duckdb database} -\usage{ -connect_to_test_duckdb(db_path, ..., .envir = parent.frame()) -} -\arguments{ -\item{db_path}{path to the duckdb database file} - -\item{...}{unused} - -\item{.envir}{passed on to \code{\link[withr:defer]{withr::defer()}}} -} -\value{ -A \code{\link[DBI:DBIConnection-class]{DBI::DBIConnection}} object -} -\description{ -Connect to duckdb database -} diff --git a/man/query_concepts_table.Rd b/man/query_concepts_table.Rd deleted file mode 100644 index 1feb5e6..0000000 --- a/man/query_concepts_table.Rd +++ /dev/null @@ -1,19 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils_preprocessing_db.R -\name{query_concepts_table} -\alias{query_concepts_table} -\title{Function to produce the 'omopcat_concepts' table from a list of concept ids} -\usage{ -query_concepts_table(cdm, concepts) -} -\arguments{ -\item{cdm}{A \code{\link{CDMConnector}} object, e.g. 
from \code{\link[CDMConnector:cdm_from_con]{CDMConnector::cdm_from_con()}}} - -\item{concepts}{A vector of concept IDs} -} -\value{ -A \code{data.frame} with the concept table -} -\description{ -Function to produce the 'omopcat_concepts' table from a list of concept ids -} diff --git a/man/read_parquet_sorted.Rd b/man/read_parquet_sorted.Rd deleted file mode 100644 index 25d14ed..0000000 --- a/man/read_parquet_sorted.Rd +++ /dev/null @@ -1,19 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils_preprocessing_db.R -\name{read_parquet_sorted} -\alias{read_parquet_sorted} -\title{Read a parquet table and sort the results} -\usage{ -read_parquet_sorted(path, options = nanoparquet::parquet_options()) -} -\arguments{ -\item{path}{path to the parquet file to be read} - -\item{options}{Nanoparquet options, see \code{\link[nanoparquet:parquet_options]{parquet_options()}}.} -} -\value{ -A \code{data.frame} with the results sorted by all columns -} -\description{ -Read a parquet table and sort the results -} diff --git a/man/write_table.Rd b/man/write_table.Rd deleted file mode 100644 index 6ed033e..0000000 --- a/man/write_table.Rd +++ /dev/null @@ -1,23 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils_preprocessing_db.R -\name{write_table} -\alias{write_table} -\title{Write data to a table in the database} -\usage{ -write_table(data, con, table, schema) -} -\arguments{ -\item{data}{data.frame, data to be written to the table} - -\item{con}{A \code{\link[DBI:DBIConnection-class]{DBI::DBIConnection}} object} - -\item{table}{character, name of the table to write to} - -\item{schema}{character, name of the schema to be used} -} -\value{ -\code{TRUE}, invisibly, if the operation was successful -} -\description{ -Write data to a table in the database -} diff --git a/preprocessing/.Rbuildignore b/preprocessing/.Rbuildignore new file mode 100644 index 0000000..fd6568a --- /dev/null +++ b/preprocessing/.Rbuildignore @@ -0,0 +1,7 @@ +^renv$ +^renv\.lock$ +^preprocessing\.Rproj$ +^\.Rproj\.user$ +^data-raw$ +^Dockerfile$ +^omopcat\.preprocessing\.Rproj$ diff --git a/preprocessing/.Rprofile b/preprocessing/.Rprofile new file mode 100644 index 0000000..c81932b --- /dev/null +++ b/preprocessing/.Rprofile @@ -0,0 +1,14 @@ +if (interactive()) { + suppressMessages(require("devtools")) + suppressMessages(require("golem")) + + # warn about partial matching + options( + warnPartialMatchDollar = TRUE, + warnPartialMatchAttr = TRUE, + warnPartialMatchArgs = TRUE + ) + options(styler.cache_root = "styler_perm") +} + +source("renv/activate.R") diff --git a/preprocessing/.gitignore b/preprocessing/.gitignore new file mode 100644 index 0000000..cd67eac --- /dev/null +++ b/preprocessing/.gitignore @@ -0,0 +1 @@ +.Rproj.user diff --git a/preprocessing/DESCRIPTION b/preprocessing/DESCRIPTION new file mode 100644 index 0000000..1a355a3 --- /dev/null +++ b/preprocessing/DESCRIPTION @@ -0,0 +1,34 @@ +Package: omopcat.preprocessing +Title: Preprocessing for the omopcat App +Version: 0.0.0.9000 +Authors@R: c( + person('Milan', 'Malfait', email = 'm.malfait@ucl.ac.uk', role = c('cre', 'aut')), + person('Stefan', 'Piatek', role = 'aut'), + person('Baptiste', 'Briot Ribeyre', role = 'aut'), + person('Andy', 'South', role = 'aut'), + person("SAFEHR", role = c("cph", "fnd")) + ) +Description: Preprocess data for the omopcat app. 
+License: Apache License (>= 2) +Encoding: UTF-8 +Roxygen: list(markdown = TRUE) +RoxygenNote: 7.3.2 +Imports: + CDMConnector, + cli, + DBI, + dplyr, + duckdb, + fs, + lubridate, + nanoparquet, + purrr, + rlang, + RPostgres, + tidyr, + withr +Suggests: + dbplyr, + RSQLite, + testthat (>= 3.0.0) +Config/testthat/edition: 3 diff --git a/preprocessing/Dockerfile b/preprocessing/Dockerfile new file mode 100644 index 0000000..e277e02 --- /dev/null +++ b/preprocessing/Dockerfile @@ -0,0 +1,13 @@ +FROM rocker/tidyverse:4.4.1 + +WORKDIR /pkg +ADD preprocessing . +COPY preprocessing/renv.lock ./renv.lock + +RUN install2.r --error --skipinstalled renv remotes && \ + R -e 'renv::restore()' && \ + rm -rf /tmp/downloaded_packages + +RUN R -e 'remotes::install_local(path = ".", dependencies = TRUE)' + +CMD ["R", "-e", "omopcat.preprocessing::preprocess()"] diff --git a/preprocessing/NAMESPACE b/preprocessing/NAMESPACE new file mode 100644 index 0000000..8048ce8 --- /dev/null +++ b/preprocessing/NAMESPACE @@ -0,0 +1,5 @@ +# Generated by roxygen2: do not edit by hand + +export(preprocess) +importFrom(rlang,.data) +importFrom(stats,sd) diff --git a/preprocessing/R/concepts.R b/preprocessing/R/concepts.R new file mode 100644 index 0000000..4d5e8d5 --- /dev/null +++ b/preprocessing/R/concepts.R @@ -0,0 +1,22 @@ +#' Generate the `omopcat_concepts` table +#' +#' @param cdm A [`CDMConnector`] object, e.g. from [`CDMConnector::cdm_from_con()`] +#' @param concept_ids A vector of concept IDs +#' +#' @return A `data.frame` with the monthly counts +#' @keywords internal +generate_concepts <- function(cdm, concept_ids) { + # Extract columns from concept table + cdm$concept |> + dplyr::filter(.data$concept_id %in% concept_ids) |> + dplyr::select( + "concept_id", + "concept_name", + "vocabulary_id", + "domain_id", + "concept_class_id", + "standard_concept", + "concept_code" + ) |> + dplyr::collect() +} diff --git a/preprocessing/R/monthly_counts.R b/preprocessing/R/monthly_counts.R new file mode 100644 index 0000000..52ff104 --- /dev/null +++ b/preprocessing/R/monthly_counts.R @@ -0,0 +1,83 @@ +#' Generate the 'omopcat_monthly_counts' table +#' +#' @param cdm A [`CDMConnector`] object, e.g. 
from [`CDMConnector::cdm_from_con()`] +#' +#' @return A `data.frame` with the monthly counts +#' @keywords internal +generate_monthly_counts <- function(cdm) { + # Combine results for all tables + out <- dplyr::bind_rows( + cdm$condition_occurrence |> calculate_monthly_counts( + .data$condition_concept_id, .data$condition_start_date + ), + cdm$drug_exposure |> + calculate_monthly_counts(.data$drug_concept_id, .data$drug_exposure_start_date), + cdm$procedure_occurrence |> + calculate_monthly_counts(.data$procedure_concept_id, .data$procedure_date), + cdm$device_exposure |> + calculate_monthly_counts(.data$device_concept_id, .data$device_exposure_start_date), + cdm$measurement |> + calculate_monthly_counts(.data$measurement_concept_id, .data$measurement_date), + cdm$observation |> + calculate_monthly_counts(.data$observation_concept_id, .data$observation_date), + cdm$specimen |> + calculate_monthly_counts(.data$specimen_concept_id, .data$specimen_date) + ) + + # Map concept names to the concept IDs + concept_names <- dplyr::select(cdm$concept, "concept_id", "concept_name") |> + dplyr::filter(.data$concept_id %in% out$concept_id) |> + dplyr::collect() + out |> + dplyr::left_join(concept_names, by = c("concept_id" = "concept_id")) |> + dplyr::select("concept_id", "concept_name", dplyr::everything()) +} + + +#' Calculate monthly statistics for an OMOP concept +#' +#' @param omop_table A table from the OMOP CDM +#' @param concept The name of the concept column to calculate statistics for +#' @param date The name of the date column to calculate statistics for +#' +#' @return A `data.frame` with the following columns: +#' - `concept_id`: The concept ID +#' - `concept_name`: The concept name +#' - `date_year`: The year of the date +#' - `date_month`: The month of the date +#' - `person_count`: The number of unique patients per concept for each month +#' - `records_per_person`: The average number of records per person per concept for each month +#' @keywords internal +calculate_monthly_counts <- function(omop_table, concept, date) { + # Extract year and month from date column + omop_table <- dplyr::mutate(omop_table, + concept_id = {{ concept }}, + date_year = as.integer(lubridate::year({{ date }})), + date_month = as.integer(lubridate::month({{ date }})) + ) + + omop_table |> + dplyr::group_by(.data$date_year, .data$date_month, .data$concept_id) |> + dplyr::summarise( + record_count = dplyr::n(), + person_count = dplyr::n_distinct(.data$person_id), + ) |> + # NOTE: Explicitly cast types to avoid unexpected SQL behaviour, + # otherwise the records_per_person might end up as an int + # and the *_count vars as int64, which can give problems later + dplyr::mutate( + record_count = as.integer(.data$record_count), + person_count = as.integer(.data$person_count), + records_per_person = as.double(.data$record_count) / as.double(.data$person_count) + ) |> + dplyr::select( + "concept_id", + "date_year", + "date_month", + "record_count", + "person_count", + "records_per_person" + ) |> + ## Collect in case we're dealing with a database-stored table + dplyr::collect() +} diff --git a/preprocessing/R/omopcat.preprocessing-package.R b/preprocessing/R/omopcat.preprocessing-package.R new file mode 100644 index 0000000..52c0c07 --- /dev/null +++ b/preprocessing/R/omopcat.preprocessing-package.R @@ -0,0 +1,7 @@ +#' @keywords internal +"_PACKAGE" + +## usethis namespace: start +#' @importFrom rlang .data +## usethis namespace: end +NULL diff --git a/preprocessing/R/preprocess.R b/preprocessing/R/preprocess.R new file 
mode 100644 index 0000000..9f242ed --- /dev/null +++ b/preprocessing/R/preprocess.R @@ -0,0 +1,127 @@ +#' Run the pre-processing pipeline +#' +#' The pre-processing pipeline generates the following files: +#' +#' * `out_path`/omopcat_concepts.parquet +#' * `out_path`/omopcat_monthly_counts.parquet +#' * `out_path`/omopcat_summary_stats.parquet +#' +#' If all these files already exist, the pipeline will not be run. +#' +#' @param out_path The directory where the pre-processed data will be written to. +#' Defaults to the `PREPROCESS_OUT_PATH` environment variable. +#' +#' @return Whether the pre-processing was run: `TRUE` or `FALSE`, invisibly. +#' @export +preprocess <- function(out_path = Sys.getenv("PREPROCESS_OUT_PATH")) { + if (out_path == "") { + cli::cli_abort(c( + "x" = "{.var out_path} should not be empty.", + "i" = "Have you set the {.envvar PREPROCESS_OUT_PATH} environment variable?" + )) + } + fs::dir_create(out_path) + + out_files <- c( + concepts = file.path(out_path, "omopcat_concepts.parquet"), + monthly_counts = file.path(out_path, "omopcat_monthly_counts.parquet"), + summary_stats = file.path(out_path, "omopcat_summary_stats.parquet") + ) + + # Only run pre-processing if the expected files don't exist + exists <- fs::file_exists(out_files) + + if (all(exists)) { + cli::cli_alert_info("All expected files already exist. Skipping pre-processing.") + cli::cli_alert_info("To force re-processing, delete the following files:") + cli::cli_ul(out_files[exists]) + + return(invisible(FALSE)) + } + + if (.running_in_production()) { + cli::cli_alert_info("Running in production mode") + .check_prod_env() + } + + cdm <- .setup_cdm_object() + + cli::cli_progress_message("Generating monthly_counts table") + monthly_counts <- generate_monthly_counts(cdm) + + cli::cli_progress_message("Generating summary_stats table") + summary_stats <- generate_summary_stats(cdm) + + cli::cli_progress_message("Generating concepts table") + concept_ids_with_data <- unique(c(monthly_counts$concept_id, summary_stats$concept_id)) + concepts_table <- generate_concepts(cdm, concept_ids = concept_ids_with_data) + + nanoparquet::write_parquet(concepts_table, out_files["concepts"]) + nanoparquet::write_parquet(monthly_counts, out_files["monthly_counts"]) + nanoparquet::write_parquet(summary_stats, out_files["summary_stats"]) + cli::cli_alert_success("Tables written to {.path {out_files}}") + + return(invisible(TRUE)) +} + +.running_in_production <- function() { + return(Sys.getenv("ENV") == "prod") +} + +.check_prod_env <- function() { + required_envvars <- c( + "DB_NAME", + "HOST", + "PORT", + "DB_USERNAME", + "DB_PASSWORD" + ) + + missing <- Sys.getenv(required_envvars) == "" + if (any(missing)) { + cli::cli_abort( + "Environment variable{?s} {.envvar {required_envvars[missing]}} not set", + call = rlang::caller_env() + ) + } +} + +#' Set up a CDM object from a database connection +#' +#' When running in production, sets up a CDM object from the database settings +#' configured through the relevant environment variables. +#' +#' When not in production, creates a CDM from one of the CDMConnector example +#' datasets (`"GiBleed"` by default), using [`CDMConnector::eunomia_dir()`]. +#' This is intended for use in testing and development. +#' +#' @param .envir Passed on to [`connect_to_db()`], controls the scope in which the database +#' connection should live. When it goes out of scope, the database connection is closed. 
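A minimal usage sketch for the `preprocess()` entry point defined above (assuming the package is installed; the output directory below is illustrative):

```r
# Development run: with ENV unset (or anything other than "prod"),
# .setup_cdm_object() falls back to a CDMConnector example dataset
# (downloaded on first use), so no database variables are required.
Sys.setenv(PREPROCESS_OUT_PATH = file.path(tempdir(), "omopcat-preprocessing"))

ran <- omopcat.preprocessing::preprocess()

# While the three parquet files exist, a second call is skipped
ran_again <- omopcat.preprocessing::preprocess()
stopifnot(isTRUE(ran), isFALSE(ran_again))
```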
+#' @noRd +.setup_cdm_object <- function(.envir = parent.frame()) { + if (.running_in_production()) { + name <- Sys.getenv("DB_NAME") + con <- connect_to_db( + RPostgres::Postgres(), + dbname = Sys.getenv("DB_NAME"), + host = Sys.getenv("HOST"), + port = Sys.getenv("PORT"), + user = Sys.getenv("DB_USERNAME"), + password = Sys.getenv("DB_PASSWORD"), + .envir = parent.frame() + ) + } else { + name <- Sys.getenv("DB_NAME", unset = "GiBleed") + duckdb_path <- CDMConnector::eunomia_dir(dataset_name = name) + rlang::check_installed("duckdb") + con <- connect_to_db(duckdb::duckdb(duckdb_path), .envir = .envir) + } + + # Load the data in a CDMConnector object + CDMConnector::cdm_from_con( + con = con, + cdm_schema = Sys.getenv("DB_CDM_SCHEMA", unset = "main"), + write_schema = Sys.getenv("DB_CDM_SCHEMA", unset = "main"), + cdm_name = name + ) +} diff --git a/preprocessing/R/summary_stats.R b/preprocessing/R/summary_stats.R new file mode 100644 index 0000000..9dd258f --- /dev/null +++ b/preprocessing/R/summary_stats.R @@ -0,0 +1,97 @@ +#' Generate the `omopcat_summary_stats` table +#' +#' @param cdm A [`CDMConnector`] object, e.g. from [`CDMConnector::cdm_from_con()`] +#' +#' @return A `data.frame` with the summary statistics +#' @keywords internal +generate_summary_stats <- function(cdm) { + omop_tables <- cdm[c("measurement", "observation")] + concept_cols <- c("measurement_concept_id", "observation_concept_id") + + # Combine results for all tables + stats <- purrr::map2(omop_tables, concept_cols, calculate_summary_stats) + stats <- dplyr::bind_rows(stats) + + # Map concept names to the concept_ids + concept_names <- dplyr::select(cdm$concept, "concept_id", "concept_name") |> + dplyr::filter(.data$concept_id %in% c(stats$concept_id, stats$value_as_concept_id)) |> + dplyr::collect() + stats |> + # Order is important here, first we get the names for the value_as_concept_ids + # from the categorical data summaries and record it as `value_as_string` + dplyr::left_join(concept_names, by = c("value_as_concept_id" = "concept_id")) |> + dplyr::rename(value_as_string = "concept_name") |> + # Then we get the names for the main concept_ids + dplyr::left_join(concept_names, by = c("concept_id" = "concept_id")) |> + dplyr::select("concept_id", "concept_name", !"value_as_concept_id") +} + +#' Calculate summary statistics for an OMOP table +#' +#' Calculates the mean and standard deviation for numeric concepts and the +#' frequency for categorical concepts. +#' +#' @param omop_table A table from the OMOP CDM +#' @param concept_name The name of the concept ID column +#' +#' @return A `data.frame` with the following columns: +#' - `concept_id`: The concept ID +#' - `summary_attribute`: The summary attribute (e.g. 
`"mean"`, `"sd"`, `"frequency"`) +#' - `value_as_number`: The value of the summary attribute +#' - `value_as_concept_id`: In case of a categorical concept, the concept ID for each category +#' @keywords internal +calculate_summary_stats <- function(omop_table, concept_name) { + stopifnot(is.character(concept_name)) + stopifnot(concept_name %in% colnames(omop_table)) + + omop_table <- dplyr::rename(omop_table, concept_id = dplyr::all_of(concept_name)) + + numeric_concepts <- dplyr::filter(omop_table, !is.na(.data$value_as_number)) + # beware CDM docs: NULL=no categorical result, 0=categorical result but no mapping + categorical_concepts <- dplyr::filter( + omop_table, + !is.null(.data$value_as_concept_id) & .data$value_as_concept_id != 0 + ) + + numeric_stats <- .summarise_numeric_concepts(numeric_concepts) |> dplyr::collect() + categorical_stats <- .summarise_categorical_concepts(categorical_concepts) |> + # Convert value_as_number to double to make it compatible with numeric stats + dplyr::mutate(value_as_number = as.double(.data$value_as_number)) |> + dplyr::collect() + dplyr::bind_rows(numeric_stats, categorical_stats) +} + +#' @importFrom stats sd +.summarise_numeric_concepts <- function(omop_table) { + # Calculate mean and sd + stats <- omop_table |> + dplyr::group_by(.data$concept_id) |> + dplyr::summarise( + mean = mean(.data$value_as_number, na.rm = TRUE), + sd = sd(.data$value_as_number, na.rm = TRUE) + ) + + # Wrangle output to expected format + stats |> + tidyr::pivot_longer( + cols = c("mean", "sd"), + names_to = "summary_attribute", + values_to = "value_as_number" + ) +} + +.summarise_categorical_concepts <- function(omop_table) { + # Calculate frequencies + frequencies <- omop_table |> + dplyr::count(.data$concept_id, .data$value_as_concept_id) + + # Wrangle output into the expected format + frequencies |> + dplyr::mutate(summary_attribute = "frequency") |> + dplyr::select( + "concept_id", + "summary_attribute", + value_as_number = "n", + "value_as_concept_id" + ) +} diff --git a/preprocessing/R/utils-db.R b/preprocessing/R/utils-db.R new file mode 100644 index 0000000..a634593 --- /dev/null +++ b/preprocessing/R/utils-db.R @@ -0,0 +1,14 @@ +#' Connect to a database +#' +#' General helper to connect to a database through [`DBI::dbConnect()`], while ensuring +#' that the connection is closed when the connection object goes out of scope. +#' +#' @param ... arguments passed on to [`DBI::dbConnect()`] +#' @param .envir passed on to [`withr::defer()`] +#' +#' @return A [`DBI::DBIConnection-class`] object +connect_to_db <- function(..., .envir = parent.frame()) { + con <- DBI::dbConnect(...) 
+ withr::defer(DBI::dbDisconnect(con), envir = .envir) + con +} diff --git a/preprocessing/data-raw/test_db/.gitignore b/preprocessing/data-raw/test_db/.gitignore new file mode 100644 index 0000000..9f0a7c3 --- /dev/null +++ b/preprocessing/data-raw/test_db/.gitignore @@ -0,0 +1,9 @@ +# Eunomia data +*.zip + +# duckdb databases +*.duckdb + +# duckdb temp files +# (in case of failure) +*.duckdb.wal diff --git a/man/calculate_monthly_counts.Rd b/preprocessing/man/calculate_monthly_counts.Rd similarity index 93% rename from man/calculate_monthly_counts.Rd rename to preprocessing/man/calculate_monthly_counts.Rd index bf7ac48..4f38b54 100644 --- a/man/calculate_monthly_counts.Rd +++ b/preprocessing/man/calculate_monthly_counts.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils_preprocessing_summarise.R +% Please edit documentation in R/monthly_counts.R \name{calculate_monthly_counts} \alias{calculate_monthly_counts} \title{Calculate monthly statistics for an OMOP concept} @@ -27,3 +27,4 @@ A \code{data.frame} with the following columns: \description{ Calculate monthly statistics for an OMOP concept } +\keyword{internal} diff --git a/man/calculate_summary_stats.Rd b/preprocessing/man/calculate_summary_stats.Rd similarity index 92% rename from man/calculate_summary_stats.Rd rename to preprocessing/man/calculate_summary_stats.Rd index ee02e37..a4e2bf3 100644 --- a/man/calculate_summary_stats.Rd +++ b/preprocessing/man/calculate_summary_stats.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils_preprocessing_summarise.R +% Please edit documentation in R/summary_stats.R \name{calculate_summary_stats} \alias{calculate_summary_stats} \title{Calculate summary statistics for an OMOP table} @@ -24,3 +24,4 @@ A \code{data.frame} with the following columns: Calculates the mean and standard deviation for numeric concepts and the frequency for categorical concepts. } +\keyword{internal} diff --git a/man/connect_to_db.Rd b/preprocessing/man/connect_to_db.Rd similarity index 70% rename from man/connect_to_db.Rd rename to preprocessing/man/connect_to_db.Rd index c254ab7..ff09c7c 100644 --- a/man/connect_to_db.Rd +++ b/preprocessing/man/connect_to_db.Rd @@ -1,8 +1,8 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils_preprocessing_db.R +% Please edit documentation in R/utils-db.R \name{connect_to_db} \alias{connect_to_db} -\title{Connec to a database} +\title{Connect to a database} \usage{ connect_to_db(..., .envir = parent.frame()) } @@ -15,6 +15,6 @@ connect_to_db(..., .envir = parent.frame()) A \code{\link[DBI:DBIConnection-class]{DBI::DBIConnection}} object } \description{ -General helper to connect to a databae through \code{\link[DBI:dbConnect]{DBI::dbConnect()}}, while ensuring +General helper to connect to a database through \code{\link[DBI:dbConnect]{DBI::dbConnect()}}, while ensuring that the connection is closed when the connection object goes out of scope. 
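A small sketch (not package code) of the scoping behaviour described here: because the disconnect is deferred to the caller's frame, the connection opened by `connect_to_db()` is closed when the calling function returns, not when the helper itself returns. `RSQLite` (listed in Suggests) provides an in-memory database; the helper is internal, hence the `:::`.

```r
library(DBI)

with_temporary_connection <- function() {
  # .envir defaults to parent.frame(), i.e. this function's frame
  con <- omopcat.preprocessing:::connect_to_db(RSQLite::SQLite(), ":memory:")
  dbWriteTable(con, "mtcars", mtcars)
  dbGetQuery(con, "SELECT COUNT(*) AS n FROM mtcars")
  # withr::defer() fires on exit: dbDisconnect(con) runs as this function returns
}

with_temporary_connection()
```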
} diff --git a/preprocessing/man/generate_concepts.Rd b/preprocessing/man/generate_concepts.Rd new file mode 100644 index 0000000..dce5168 --- /dev/null +++ b/preprocessing/man/generate_concepts.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/concepts.R +\name{generate_concepts} +\alias{generate_concepts} +\title{Generate the \code{omopcat_concepts} table} +\usage{ +generate_concepts(cdm, concept_ids) +} +\arguments{ +\item{cdm}{A \code{\link{CDMConnector}} object, e.g. from \code{\link[CDMConnector:cdm_from_con]{CDMConnector::cdm_from_con()}}} + +\item{concept_ids}{A vector of concept IDs} +} +\value{ +A \code{data.frame} with the monthly counts +} +\description{ +Generate the \code{omopcat_concepts} table +} +\keyword{internal} diff --git a/man/process_monthly_counts.Rd b/preprocessing/man/generate_monthly_counts.Rd similarity index 69% rename from man/process_monthly_counts.Rd rename to preprocessing/man/generate_monthly_counts.Rd index 5b085d9..75a454c 100644 --- a/man/process_monthly_counts.Rd +++ b/preprocessing/man/generate_monthly_counts.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils_preprocessing_db.R -\name{process_monthly_counts} -\alias{process_monthly_counts} +% Please edit documentation in R/monthly_counts.R +\name{generate_monthly_counts} +\alias{generate_monthly_counts} \title{Generate the 'omopcat_monthly_counts' table} \usage{ -process_monthly_counts(cdm) +generate_monthly_counts(cdm) } \arguments{ \item{cdm}{A \code{\link{CDMConnector}} object, e.g. from \code{\link[CDMConnector:cdm_from_con]{CDMConnector::cdm_from_con()}}} @@ -15,3 +15,4 @@ A \code{data.frame} with the monthly counts \description{ Generate the 'omopcat_monthly_counts' table } +\keyword{internal} diff --git a/man/process_summary_stats.Rd b/preprocessing/man/generate_summary_stats.Rd similarity index 51% rename from man/process_summary_stats.Rd rename to preprocessing/man/generate_summary_stats.Rd index 194de3e..197eb0a 100644 --- a/man/process_summary_stats.Rd +++ b/preprocessing/man/generate_summary_stats.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils_preprocessing_db.R -\name{process_summary_stats} -\alias{process_summary_stats} -\title{Generate the 'omopcat_summary_stats' table} +% Please edit documentation in R/summary_stats.R +\name{generate_summary_stats} +\alias{generate_summary_stats} +\title{Generate the \code{omopcat_summary_stats} table} \usage{ -process_summary_stats(cdm) +generate_summary_stats(cdm) } \arguments{ \item{cdm}{A \code{\link{CDMConnector}} object, e.g. 
from \code{\link[CDMConnector:cdm_from_con]{CDMConnector::cdm_from_con()}}} @@ -13,5 +13,6 @@ process_summary_stats(cdm) A \code{data.frame} with the summary statistics } \description{ -Generate the 'omopcat_summary_stats' table +Generate the \code{omopcat_summary_stats} table } +\keyword{internal} diff --git a/preprocessing/man/omopcat.preprocessing-package.Rd b/preprocessing/man/omopcat.preprocessing-package.Rd new file mode 100644 index 0000000..c07ce26 --- /dev/null +++ b/preprocessing/man/omopcat.preprocessing-package.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/omopcat.preprocessing-package.R +\docType{package} +\name{omopcat.preprocessing-package} +\alias{omopcat.preprocessing} +\alias{omopcat.preprocessing-package} +\title{omopcat.preprocessing: Preprocessing for the omopcat App} +\description{ +Preprocess data for the omopcat app. +} +\author{ +\strong{Maintainer}: Milan Malfait \email{m.malfait@ucl.ac.uk} + +Authors: +\itemize{ + \item Stefan Piatek + \item Baptiste Briot Ribeyre + \item Andy South +} + +Other contributors: +\itemize{ + \item SAFEHR [copyright holder, funder] +} + +} +\keyword{internal} diff --git a/preprocessing/man/preprocess.Rd b/preprocessing/man/preprocess.Rd new file mode 100644 index 0000000..9b89138 --- /dev/null +++ b/preprocessing/man/preprocess.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/preprocess.R +\name{preprocess} +\alias{preprocess} +\title{Run the pre-processing pipeline} +\usage{ +preprocess(out_path = Sys.getenv("PREPROCESS_OUT_PATH")) +} +\arguments{ +\item{out_path}{The directory where the pre-processed data will be written to. +Defaults to the \code{PREPROCESS_OUT_PATH} environment variable.} +} +\value{ +Whether the pre-processing was run: \code{TRUE} or \code{FALSE}, invisibly. +} +\description{ +The pre-processing pipeline generates the following files: +} +\details{ +\itemize{ +\item \code{out_path}/omopcat_concepts.parquet +\item \code{out_path}/omopcat_monthly_counts.parquet +\item \code{out_path}/omopcat_summary_stats.parquet +} + +If all these files already exist, the pipeline will not be run. 
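The three parquet files documented above are what the app consumes; a quick sketch (not package code) of inspecting them after a run, assuming `PREPROCESS_OUT_PATH` still points at the output directory:

```r
out_path <- Sys.getenv("PREPROCESS_OUT_PATH")

files <- file.path(out_path, c(
  "omopcat_concepts.parquet",
  "omopcat_monthly_counts.parquet",
  "omopcat_summary_stats.parquet"
))
stopifnot(all(file.exists(files)))

# nanoparquet is the same reader/writer the pipeline itself uses
tables <- lapply(files, nanoparquet::read_parquet)
names(tables) <- c("concepts", "monthly_counts", "summary_stats")

# e.g. how many concepts ended up with monthly counts
length(unique(tables$monthly_counts$concept_id))
```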
+} diff --git a/preprocessing/omopcat.preprocessing.Rproj b/preprocessing/omopcat.preprocessing.Rproj new file mode 100644 index 0000000..7f1b52b --- /dev/null +++ b/preprocessing/omopcat.preprocessing.Rproj @@ -0,0 +1,22 @@ +Version: 1.0 + +RestoreWorkspace: No +SaveWorkspace: No +AlwaysSaveHistory: Default + +EnableCodeIndexing: Yes +UseSpacesForTab: Yes +NumSpacesForTab: 2 +Encoding: UTF-8 + +RnwWeave: Sweave +LaTeX: XeLaTeX + +AutoAppendNewline: Yes +StripTrailingWhitespace: Yes +LineEndingConversion: Posix + +BuildType: Package +PackageUseDevtools: Yes +PackageInstallArgs: --no-multiarch --with-keep.source +PackageRoxygenize: rd,collate,namespace diff --git a/deploy/renv.lock.prod b/preprocessing/renv.lock similarity index 76% rename from deploy/renv.lock.prod rename to preprocessing/renv.lock index 3b0d940..033049a 100644 --- a/deploy/renv.lock.prod +++ b/preprocessing/renv.lock @@ -51,55 +51,6 @@ ], "Hash": "065ae649b05f1ff66bb0c793107508f5" }, - "DT": { - "Package": "DT", - "Version": "0.33", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "crosstalk", - "htmltools", - "htmlwidgets", - "httpuv", - "jquerylib", - "jsonlite", - "magrittr", - "promises" - ], - "Hash": "64ff3427f559ce3f2597a4fe13255cb6" - }, - "MASS": { - "Package": "MASS", - "Version": "7.3-61", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R", - "grDevices", - "graphics", - "methods", - "stats", - "utils" - ], - "Hash": "0cafd6f0500e5deba33be22c46bf6055" - }, - "Matrix": { - "Package": "Matrix", - "Version": "1.7-1", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R", - "grDevices", - "graphics", - "grid", - "lattice", - "methods", - "stats", - "utils" - ], - "Hash": "5122bb14d8736372411f955e1b16bc8a" - }, "R6": { "Package": "R6", "Version": "2.5.1", @@ -110,16 +61,6 @@ ], "Hash": "470851b6d5d0ac559e9d01bb352b4021" }, - "RColorBrewer": { - "Package": "RColorBrewer", - "Version": "1.1-3", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R" - ], - "Hash": "45f0398006e83a5b10b72a90663d8d8c" - }, "RPostgres": { "Package": "RPostgres", "Version": "1.4.7", @@ -139,6 +80,25 @@ ], "Hash": "beb7e18bf3f9e096f716a52a77ec793c" }, + "RSQLite": { + "Package": "RSQLite", + "Version": "2.3.7", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "DBI", + "R", + "bit64", + "blob", + "cpp11", + "memoise", + "methods", + "pkgconfig", + "plogr", + "rlang" + ], + "Hash": "46b45a4dd7bb0e0f4e3fc22245817240" + }, "Rcpp": { "Package": "Rcpp", "Version": "1.0.13-1", @@ -226,19 +186,22 @@ ], "Hash": "40415719b5a479b87949f3aa0aee737c" }, - "bsicons": { - "Package": "bsicons", - "Version": "0.1.2", + "brew": { + "Package": "brew", + "Version": "1.0-10", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "8f4a384e19dccd8c65356dc096847b76" + }, + "brio": { + "Package": "brio", + "Version": "1.1.5", "Source": "Repository", "Repository": "CRAN", "Requirements": [ - "R", - "cli", - "htmltools", - "rlang", - "utils" + "R" ], - "Hash": "d8f892fbd94d0b9b1f6d688b05b8633c" + "Hash": "c1ee497a6d999947c2c224ae46799b1a" }, "bslib": { "Package": "bslib", @@ -273,6 +236,19 @@ ], "Hash": "cd9a672193789068eb5a2aad65a0dedf" }, + "callr": { + "Package": "callr", + "Version": "3.7.6", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "R6", + "processx", + "utils" + ], + "Hash": "d7e13f49c19103ece9e58ad2d83a7354" + }, "checkmate": { "Package": "checkmate", "Version": "2.3.2", @@ -306,20 +282,6 @@ ], "Hash": 
"3f038e5ac7f41d4ac41ce658c85e3042" }, - "colorspace": { - "Package": "colorspace", - "Version": "2.1-1", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R", - "grDevices", - "graphics", - "methods", - "stats" - ], - "Hash": "d954cb1c57e8d8b756165d7ba18aa55a" - }, "commonmark": { "Package": "commonmark", "Version": "1.9.2", @@ -341,7 +303,7 @@ "Package": "cpp11", "Version": "0.5.0", "Source": "Repository", - "Repository": "CRAN", + "Repository": "RSPM", "Requirements": [ "R" ], @@ -359,39 +321,29 @@ ], "Hash": "859d96e65ef198fd43e82b9628d593ef" }, - "crosstalk": { - "Package": "crosstalk", - "Version": "1.2.1", + "credentials": { + "Package": "credentials", + "Version": "2.0.2", "Source": "Repository", "Repository": "CRAN", "Requirements": [ - "R6", - "htmltools", + "askpass", + "curl", "jsonlite", - "lazyeval" + "openssl", + "sys" ], - "Hash": "ab12c7b080a57475248a30f4db6298c0" + "Hash": "09fd631e607a236f8cc7f9604db32cb8" }, "curl": { "Package": "curl", - "Version": "5.2.3", + "Version": "6.0.0", "Source": "Repository", "Repository": "CRAN", "Requirements": [ "R" ], - "Hash": "d91263322a58af798f6cf3b13fd56dde" - }, - "data.table": { - "Package": "data.table", - "Version": "1.16.2", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R", - "methods" - ], - "Hash": "2e00b378fc3be69c865120d9f313039a" + "Hash": "ff51697d9205fe715f29e7171e874c6e" }, "dbplyr": { "Package": "dbplyr", @@ -421,6 +373,53 @@ ], "Hash": "39b2e002522bfd258039ee4e889e0fd1" }, + "desc": { + "Package": "desc", + "Version": "1.4.3", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "R6", + "cli", + "utils" + ], + "Hash": "99b79fcbd6c4d1ce087f5c5c758b384f" + }, + "devtools": { + "Package": "devtools", + "Version": "2.4.5", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "cli", + "desc", + "ellipsis", + "fs", + "lifecycle", + "memoise", + "miniUI", + "pkgbuild", + "pkgdown", + "pkgload", + "profvis", + "rcmdcheck", + "remotes", + "rlang", + "roxygen2", + "rversions", + "sessioninfo", + "stats", + "testthat", + "tools", + "urlchecker", + "usethis", + "utils", + "withr" + ], + "Hash": "ea5bc8b4a6a01e4f12d98b58329930bb" + }, "diffobj": { "Package": "diffobj", "Version": "0.3.5", @@ -440,13 +439,33 @@ "Package": "digest", "Version": "0.6.37", "Source": "Repository", - "Repository": "CRAN", + "Repository": "RSPM", "Requirements": [ "R", "utils" ], "Hash": "33698c4b3127fc9f506654607fb73676" }, + "downlit": { + "Package": "downlit", + "Version": "0.4.4", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "brio", + "desc", + "digest", + "evaluate", + "fansi", + "memoise", + "rlang", + "vctrs", + "withr", + "yaml" + ], + "Hash": "45a6a596bf0108ee1ff16a040a2df897" + }, "dplyr": { "Package": "dplyr", "Version": "1.1.4", @@ -470,6 +489,30 @@ ], "Hash": "fedd9d00c2944ff00a0e2696ccf048ec" }, + "duckdb": { + "Package": "duckdb", + "Version": "1.1.2", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "DBI", + "R", + "methods", + "utils" + ], + "Hash": "83a09ee9c8380fecfcea1daeaa99e3b2" + }, + "ellipsis": { + "Package": "ellipsis", + "Version": "0.3.2", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "rlang" + ], + "Hash": "bb0eec2fe32e88d9e2836c2f73ea2077" + }, "evaluate": { "Package": "evaluate", "Version": "1.0.1", @@ -492,13 +535,6 @@ ], "Hash": "962174cf2aeb5b9eea581522286a911f" }, - "farver": { - "Package": "farver", - "Version": "2.1.2", - "Source": 
"Repository", - "Repository": "CRAN", - "Hash": "680887028577f3fa2a81e410ed0d6e42" - }, "fastmap": { "Package": "fastmap", "Version": "1.2.0", @@ -518,22 +554,6 @@ ], "Hash": "c2efdd5f0bcd1ea861c2d4e2a883a67d" }, - "forcats": { - "Package": "forcats", - "Version": "1.0.0", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R", - "cli", - "glue", - "lifecycle", - "magrittr", - "rlang", - "tibble" - ], - "Hash": "1a0a9a3d5083d0d573c4214576f1e690" - }, "fs": { "Package": "fs", "Version": "1.6.5", @@ -556,30 +576,48 @@ ], "Hash": "15e9634c0fcd294799e9b2e929ed1b86" }, - "ggplot2": { - "Package": "ggplot2", - "Version": "3.5.1", + "gert": { + "Package": "gert", + "Version": "2.1.4", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "askpass", + "credentials", + "openssl", + "rstudioapi", + "sys", + "zip" + ], + "Hash": "ae855ad6d7be20dd7b05d43d25700398" + }, + "gh": { + "Package": "gh", + "Version": "1.4.1", "Source": "Repository", "Repository": "CRAN", "Requirements": [ - "MASS", "R", "cli", + "gitcreds", "glue", - "grDevices", - "grid", - "gtable", - "isoband", + "httr2", + "ini", + "jsonlite", "lifecycle", - "mgcv", - "rlang", - "scales", - "stats", - "tibble", - "vctrs", - "withr" + "rlang" + ], + "Hash": "fbbbc48eba7a6626a08bb365e44b563b" + }, + "gitcreds": { + "Package": "gitcreds", + "Version": "0.1.2", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R" ], - "Hash": "44c6a2f8202d5b7e878ea274b1092426" + "Hash": "ab08ac61f3e1be454ae21911eb8bc2fe" }, "glue": { "Package": "glue", @@ -610,22 +648,6 @@ ], "Hash": "11be24963e2f220c403eafd01e2259d5" }, - "gtable": { - "Package": "gtable", - "Version": "0.3.6", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R", - "cli", - "glue", - "grid", - "lifecycle", - "rlang", - "stats" - ], - "Hash": "de949855009e2d4d0e52a844e30617ae" - }, "here": { "Package": "here", "Version": "1.0.1", @@ -707,31 +729,33 @@ ], "Hash": "d55aa087c47a63ead0f6fc10f8fa1ee0" }, - "httr": { - "Package": "httr", - "Version": "1.4.7", + "httr2": { + "Package": "httr2", + "Version": "1.0.6", "Source": "Repository", "Repository": "CRAN", "Requirements": [ "R", "R6", + "cli", "curl", - "jsonlite", - "mime", - "openssl" + "glue", + "lifecycle", + "magrittr", + "openssl", + "rappdirs", + "rlang", + "vctrs", + "withr" ], - "Hash": "ac107251d9d9fd72f0ca8049988f1d7f" + "Hash": "3ef5d07ec78803475a94367d71b40c41" }, - "isoband": { - "Package": "isoband", - "Version": "0.2.7", + "ini": { + "Package": "ini", + "Version": "0.3.1", "Source": "Repository", "Repository": "CRAN", - "Requirements": [ - "grid", - "utils" - ], - "Hash": "0080607b4a1a7b28979aecef976d8bc2" + "Hash": "6154ec2223172bce8162d4153cda21f7" }, "jquerylib": { "Package": "jquerylib", @@ -769,17 +793,6 @@ ], "Hash": "acf380f300c721da9fde7df115a5f86f" }, - "labeling": { - "Package": "labeling", - "Version": "0.4.3", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "graphics", - "stats" - ], - "Hash": "b64ec208ac5bc1852b285f665d6368b3" - }, "later": { "Package": "later", "Version": "1.3.2", @@ -791,31 +804,6 @@ ], "Hash": "a3e051d405326b8b0012377434c62b37" }, - "lattice": { - "Package": "lattice", - "Version": "0.22-6", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R", - "grDevices", - "graphics", - "grid", - "stats", - "utils" - ], - "Hash": "cc5ac1ba4c238c7ca9fa6a87ca11a7e2" - }, - "lazyeval": { - "Package": "lazyeval", - "Version": "0.2.2", - "Source": "Repository", - "Repository": 
"CRAN", - "Requirements": [ - "R" - ], - "Hash": "d908914ae53b04d4c0c0fd72ecc35370" - }, "lifecycle": { "Package": "lifecycle", "Version": "1.0.4", @@ -852,19 +840,6 @@ ], "Hash": "7ce2733a9826b3aeb1775d56fd305472" }, - "markdown": { - "Package": "markdown", - "Version": "1.13", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R", - "commonmark", - "utils", - "xfun" - ], - "Hash": "074efab766a9d6360865ad39512f2a7e" - }, "memoise": { "Package": "memoise", "Version": "2.0.1", @@ -876,23 +851,6 @@ ], "Hash": "e2817ccf4a065c5d9d7f2cfbe7c1d78c" }, - "mgcv": { - "Package": "mgcv", - "Version": "1.9-1", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "Matrix", - "R", - "graphics", - "methods", - "nlme", - "splines", - "stats", - "utils" - ], - "Hash": "110ee9d83b496279960e162ac97764ce" - }, "mime": { "Package": "mime", "Version": "0.12", @@ -903,16 +861,17 @@ ], "Hash": "18e9c28c1d3ca1560ce30658b22ce104" }, - "munsell": { - "Package": "munsell", - "Version": "0.5.1", + "miniUI": { + "Package": "miniUI", + "Version": "0.1.1.1", "Source": "Repository", "Repository": "CRAN", "Requirements": [ - "colorspace", - "methods" + "htmltools", + "shiny", + "utils" ], - "Hash": "4fd8900853b746af55b81fda99da7695" + "Hash": "fec5f52652d60615fdb3957b3d74324a" }, "nanoparquet": { "Package": "nanoparquet", @@ -924,40 +883,6 @@ ], "Hash": "a2a0a0dc4cd6bd52d501c619c77cc321" }, - "nlme": { - "Package": "nlme", - "Version": "3.1-166", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R", - "graphics", - "lattice", - "stats", - "utils" - ], - "Hash": "ccbb8846be320b627e6aa2b4616a2ded" - }, - "omopbundles": { - "Package": "omopbundles", - "Version": "0.1.1", - "Source": "GitHub", - "RemoteType": "github", - "RemoteHost": "api.github.com", - "RemoteUsername": "SAFEHR-data", - "RemoteRepo": "omop-bundles", - "RemoteRef": "main", - "RemoteSha": "0d24c38f1bf4d032eab700741a9576f9a8255991", - "Requirements": [ - "dplyr", - "glue", - "purrr", - "readr", - "rlang", - "vroom" - ], - "Hash": "caecf4e78df185f39703c28a73658659" - }, "omopgenerics": { "Package": "omopgenerics", "Version": "0.3.1", @@ -1007,6 +932,21 @@ ], "Hash": "15da5a8412f317beeee6175fbc76f4bb" }, + "pkgbuild": { + "Package": "pkgbuild", + "Version": "1.4.5", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "R6", + "callr", + "cli", + "desc", + "processx" + ], + "Hash": "30eaaab94db72652e72e3475c1b55278" + }, "pkgconfig": { "Package": "pkgconfig", "Version": "2.0.3", @@ -1017,44 +957,71 @@ ], "Hash": "01f28d4278f15c76cddbea05899c5d6f" }, - "plogr": { - "Package": "plogr", - "Version": "0.2.0", - "Source": "Repository", - "Repository": "CRAN", - "Hash": "09eb987710984fc2905c7129c7d85e65" - }, - "plotly": { - "Package": "plotly", - "Version": "4.10.4", + "pkgdown": { + "Package": "pkgdown", + "Version": "2.1.1", "Source": "Repository", "Repository": "CRAN", "Requirements": [ "R", - "RColorBrewer", - "base64enc", - "crosstalk", - "data.table", + "bslib", + "callr", + "cli", + "desc", "digest", - "dplyr", - "ggplot2", - "htmltools", - "htmlwidgets", - "httr", + "downlit", + "fontawesome", + "fs", + "httr2", "jsonlite", - "lazyeval", - "magrittr", - "promises", + "openssl", "purrr", + "ragg", "rlang", - "scales", + "rmarkdown", "tibble", - "tidyr", - "tools", - "vctrs", - "viridisLite" + "whisker", + "withr", + "xml2", + "yaml" ], - "Hash": "a1ac5c03ad5ad12b9d1597e00e23c3dd" + "Hash": "df2912d5873422b55a13002510f02c9f" + }, + "pkgload": { + "Package": "pkgload", + 
"Version": "1.4.0", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "cli", + "desc", + "fs", + "glue", + "lifecycle", + "methods", + "pkgbuild", + "processx", + "rlang", + "rprojroot", + "utils", + "withr" + ], + "Hash": "2ec30ffbeec83da57655b850cf2d3e0e" + }, + "plogr": { + "Package": "plogr", + "Version": "0.2.0", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "09eb987710984fc2905c7129c7d85e65" + }, + "praise": { + "Package": "praise", + "Version": "1.0.0", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "a555924add98c99d2f411e37e7d25e9f" }, "prettyunits": { "Package": "prettyunits", @@ -1066,6 +1033,32 @@ ], "Hash": "6b01fc98b1e86c4f705ce9dcfd2f57c7" }, + "processx": { + "Package": "processx", + "Version": "3.8.4", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "R6", + "ps", + "utils" + ], + "Hash": "0c90a7d71988856bad2a2a45dd871bb9" + }, + "profvis": { + "Package": "profvis", + "Version": "0.4.0", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "htmlwidgets", + "rlang", + "vctrs" + ], + "Hash": "bffa126bf92987e677c12cfb5651fc1d" + }, "progress": { "Package": "progress", "Version": "1.2.3", @@ -1096,6 +1089,17 @@ ], "Hash": "434cd5388a3979e74be5c219bcd6e77d" }, + "ps": { + "Package": "ps", + "Version": "1.8.1", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "utils" + ], + "Hash": "b4404b1de13758dea1c0484ad0d48563" + }, "purrr": { "Package": "purrr", "Version": "1.0.2", @@ -1111,6 +1115,17 @@ ], "Hash": "1cba04a4e9414bdefc9dcaa99649a8dc" }, + "ragg": { + "Package": "ragg", + "Version": "1.3.3", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "systemfonts", + "textshaping" + ], + "Hash": "0595fe5e47357111f29ad19101c7d271" + }, "rappdirs": { "Package": "rappdirs", "Version": "0.3.3", @@ -1121,6 +1136,28 @@ ], "Hash": "5e3c5dc0b071b21fa128676560dbe94d" }, + "rcmdcheck": { + "Package": "rcmdcheck", + "Version": "1.4.0", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R6", + "callr", + "cli", + "curl", + "desc", + "digest", + "pkgbuild", + "prettyunits", + "rprojroot", + "sessioninfo", + "utils", + "withr", + "xopen" + ], + "Hash": "8f25ebe2ec38b1f2aef3b0d2ef76f6c4" + }, "readr": { "Package": "readr", "Version": "2.1.5", @@ -1144,6 +1181,20 @@ ], "Hash": "9de96463d2117f6ac49980577939dfb3" }, + "remotes": { + "Package": "remotes", + "Version": "2.5.0", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "methods", + "stats", + "tools", + "utils" + ], + "Hash": "3ee025083e66f18db6cf27b56e23e141" + }, "renv": { "Package": "renv", "Version": "1.0.11", @@ -1188,6 +1239,32 @@ ], "Hash": "df99277f63d01c34e95e3d2f06a79736" }, + "roxygen2": { + "Package": "roxygen2", + "Version": "7.3.2", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "R6", + "brew", + "cli", + "commonmark", + "cpp11", + "desc", + "knitr", + "methods", + "pkgload", + "purrr", + "rlang", + "stringi", + "stringr", + "utils", + "withr", + "xml2" + ], + "Hash": "6ee25f9054a70f44d615300ed531ba8d" + }, "rprojroot": { "Package": "rprojroot", "Version": "2.0.4", @@ -1198,6 +1275,25 @@ ], "Hash": "4c8415e0ec1e29f3f4f6fc108bef0144" }, + "rstudioapi": { + "Package": "rstudioapi", + "Version": "0.17.1", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "5f90cd73946d706cfe26024294236113" + }, + "rversions": { + "Package": "rversions", + "Version": "2.1.2", + "Source": "Repository", + 
"Repository": "CRAN", + "Requirements": [ + "curl", + "utils", + "xml2" + ], + "Hash": "a9881dfed103e83f9de151dc17002cd1" + }, "sass": { "Package": "sass", "Version": "0.4.9", @@ -1212,25 +1308,18 @@ ], "Hash": "d53dbfddf695303ea4ad66f86e99b95d" }, - "scales": { - "Package": "scales", - "Version": "1.3.0", + "sessioninfo": { + "Package": "sessioninfo", + "Version": "1.2.2", "Source": "Repository", "Repository": "CRAN", "Requirements": [ "R", - "R6", - "RColorBrewer", "cli", - "farver", - "glue", - "labeling", - "lifecycle", - "munsell", - "rlang", - "viridisLite" + "tools", + "utils" ], - "Hash": "c19df082ba346b0ffa6f833e92de34d1" + "Hash": "3f9796a8d0a0e8c6eb49a4b029359d1f" }, "shiny": { "Package": "shiny", @@ -1324,6 +1413,60 @@ "Repository": "CRAN", "Hash": "de342ebfebdbf40477d0758d05426646" }, + "systemfonts": { + "Package": "systemfonts", + "Version": "1.1.0", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "cpp11", + "lifecycle" + ], + "Hash": "213b6b8ed5afbf934843e6c3b090d418" + }, + "testthat": { + "Package": "testthat", + "Version": "3.2.1.1", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "R6", + "brio", + "callr", + "cli", + "desc", + "digest", + "evaluate", + "jsonlite", + "lifecycle", + "magrittr", + "methods", + "pkgload", + "praise", + "processx", + "ps", + "rlang", + "utils", + "waldo", + "withr" + ], + "Hash": "3f6e7e5e2220856ff865e4834766bf2b" + }, + "textshaping": { + "Package": "textshaping", + "Version": "0.4.0", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "cpp11", + "lifecycle", + "systemfonts" + ], + "Hash": "5142f8bc78ed3d819d26461b641627ce" + }, "tibble": { "Package": "tibble", "Version": "3.2.1", @@ -1414,6 +1557,51 @@ ], "Hash": "f561504ec2897f4d46f0c7657e488ae1" }, + "urlchecker": { + "Package": "urlchecker", + "Version": "1.0.1", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "cli", + "curl", + "tools", + "xml2" + ], + "Hash": "409328b8e1253c8d729a7836fe7f7a16" + }, + "usethis": { + "Package": "usethis", + "Version": "3.0.0", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "cli", + "clipr", + "crayon", + "curl", + "desc", + "fs", + "gert", + "gh", + "glue", + "jsonlite", + "lifecycle", + "purrr", + "rappdirs", + "rlang", + "rprojroot", + "rstudioapi", + "stats", + "utils", + "whisker", + "withr", + "yaml" + ], + "Hash": "b2fbf93c2127bedd2cbe9b799530d5d2" + }, "utf8": { "Package": "utf8", "Version": "1.2.4", @@ -1438,16 +1626,6 @@ ], "Hash": "c03fa420630029418f7e6da3667aac4a" }, - "viridisLite": { - "Package": "viridisLite", - "Version": "0.4.2", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R" - ], - "Hash": "c826c7c4241b6fc89ff55aaea3fa7491" - }, "vroom": { "Package": "vroom", "Version": "1.6.5", @@ -1476,7 +1654,7 @@ }, "waldo": { "Package": "waldo", - "Version": "0.6.0", + "Version": "0.6.1", "Source": "Repository", "Repository": "CRAN", "Requirements": [ @@ -1487,7 +1665,14 @@ "methods", "rlang" ], - "Hash": "53ec6571dc7a321797cde0abe007ff53" + "Hash": "52f574062a7b66e56926988c3fbdb3b7" + }, + "whisker": { + "Package": "whisker", + "Version": "0.4.1", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "c6abfa47a46d281a7d5159d0a8891e88" }, "withr": { "Package": "withr", @@ -1514,6 +1699,30 @@ ], "Hash": "8687398773806cfff9401a2feca96298" }, + "xml2": { + "Package": "xml2", + "Version": "1.3.6", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + 
"R", + "cli", + "methods", + "rlang" + ], + "Hash": "1d0336142f4cd25d8d23cd3ba7a8fb61" + }, + "xopen": { + "Package": "xopen", + "Version": "1.0.1", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "processx" + ], + "Hash": "423df1e86d5533fcb73c6b02b4923b49" + }, "xtable": { "Package": "xtable", "Version": "1.8-4", @@ -1532,6 +1741,13 @@ "Source": "Repository", "Repository": "CRAN", "Hash": "51dab85c6c98e50a18d7551e9d49f76c" + }, + "zip": { + "Package": "zip", + "Version": "2.3.1", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "fcc4bd8e6da2d2011eb64a5e5cc685ab" } } } diff --git a/preprocessing/renv/.gitignore b/preprocessing/renv/.gitignore new file mode 100644 index 0000000..0ec0cbb --- /dev/null +++ b/preprocessing/renv/.gitignore @@ -0,0 +1,7 @@ +library/ +local/ +cellar/ +lock/ +python/ +sandbox/ +staging/ diff --git a/preprocessing/renv/activate.R b/preprocessing/renv/activate.R new file mode 100644 index 0000000..0eb5108 --- /dev/null +++ b/preprocessing/renv/activate.R @@ -0,0 +1,1305 @@ + +local({ + + # the requested version of renv + version <- "1.0.11" + attr(version, "sha") <- NULL + + # the project directory + project <- Sys.getenv("RENV_PROJECT") + if (!nzchar(project)) + project <- getwd() + + # use start-up diagnostics if enabled + diagnostics <- Sys.getenv("RENV_STARTUP_DIAGNOSTICS", unset = "FALSE") + if (diagnostics) { + start <- Sys.time() + profile <- tempfile("renv-startup-", fileext = ".Rprof") + utils::Rprof(profile) + on.exit({ + utils::Rprof(NULL) + elapsed <- signif(difftime(Sys.time(), start, units = "auto"), digits = 2L) + writeLines(sprintf("- renv took %s to run the autoloader.", format(elapsed))) + writeLines(sprintf("- Profile: %s", profile)) + print(utils::summaryRprof(profile)) + }, add = TRUE) + } + + # figure out whether the autoloader is enabled + enabled <- local({ + + # first, check config option + override <- getOption("renv.config.autoloader.enabled") + if (!is.null(override)) + return(override) + + # if we're being run in a context where R_LIBS is already set, + # don't load -- presumably we're being run as a sub-process and + # the parent process has already set up library paths for us + rcmd <- Sys.getenv("R_CMD", unset = NA) + rlibs <- Sys.getenv("R_LIBS", unset = NA) + if (!is.na(rlibs) && !is.na(rcmd)) + return(FALSE) + + # next, check environment variables + # TODO: prefer using the configuration one in the future + envvars <- c( + "RENV_CONFIG_AUTOLOADER_ENABLED", + "RENV_AUTOLOADER_ENABLED", + "RENV_ACTIVATE_PROJECT" + ) + + for (envvar in envvars) { + envval <- Sys.getenv(envvar, unset = NA) + if (!is.na(envval)) + return(tolower(envval) %in% c("true", "t", "1")) + } + + # enable by default + TRUE + + }) + + # bail if we're not enabled + if (!enabled) { + + # if we're not enabled, we might still need to manually load + # the user profile here + profile <- Sys.getenv("R_PROFILE_USER", unset = "~/.Rprofile") + if (file.exists(profile)) { + cfg <- Sys.getenv("RENV_CONFIG_USER_PROFILE", unset = "TRUE") + if (tolower(cfg) %in% c("true", "t", "1")) + sys.source(profile, envir = globalenv()) + } + + return(FALSE) + + } + + # avoid recursion + if (identical(getOption("renv.autoloader.running"), TRUE)) { + warning("ignoring recursive attempt to run renv autoloader") + return(invisible(TRUE)) + } + + # signal that we're loading renv during R startup + options(renv.autoloader.running = TRUE) + on.exit(options(renv.autoloader.running = NULL), add = TRUE) + + # signal that we've consented to use renv + 
options(renv.consent = TRUE) + + # load the 'utils' package eagerly -- this ensures that renv shims, which + # mask 'utils' packages, will come first on the search path + library(utils, lib.loc = .Library) + + # unload renv if it's already been loaded + if ("renv" %in% loadedNamespaces()) + unloadNamespace("renv") + + # load bootstrap tools + ansify <- function(text) { + if (renv_ansify_enabled()) + renv_ansify_enhanced(text) + else + renv_ansify_default(text) + } + + renv_ansify_enabled <- function() { + + override <- Sys.getenv("RENV_ANSIFY_ENABLED", unset = NA) + if (!is.na(override)) + return(as.logical(override)) + + pane <- Sys.getenv("RSTUDIO_CHILD_PROCESS_PANE", unset = NA) + if (identical(pane, "build")) + return(FALSE) + + testthat <- Sys.getenv("TESTTHAT", unset = "false") + if (tolower(testthat) %in% "true") + return(FALSE) + + iderun <- Sys.getenv("R_CLI_HAS_HYPERLINK_IDE_RUN", unset = "false") + if (tolower(iderun) %in% "false") + return(FALSE) + + TRUE + + } + + renv_ansify_default <- function(text) { + text + } + + renv_ansify_enhanced <- function(text) { + + # R help links + pattern <- "`\\?(renv::(?:[^`])+)`" + replacement <- "`\033]8;;ide:help:\\1\a?\\1\033]8;;\a`" + text <- gsub(pattern, replacement, text, perl = TRUE) + + # runnable code + pattern <- "`(renv::(?:[^`])+)`" + replacement <- "`\033]8;;ide:run:\\1\a\\1\033]8;;\a`" + text <- gsub(pattern, replacement, text, perl = TRUE) + + # return ansified text + text + + } + + renv_ansify_init <- function() { + + envir <- renv_envir_self() + if (renv_ansify_enabled()) + assign("ansify", renv_ansify_enhanced, envir = envir) + else + assign("ansify", renv_ansify_default, envir = envir) + + } + + `%||%` <- function(x, y) { + if (is.null(x)) y else x + } + + catf <- function(fmt, ..., appendLF = TRUE) { + + quiet <- getOption("renv.bootstrap.quiet", default = FALSE) + if (quiet) + return(invisible()) + + msg <- sprintf(fmt, ...) + cat(msg, file = stdout(), sep = if (appendLF) "\n" else "") + + invisible(msg) + + } + + header <- function(label, + ..., + prefix = "#", + suffix = "-", + n = min(getOption("width"), 78)) + { + label <- sprintf(label, ...) + n <- max(n - nchar(label) - nchar(prefix) - 2L, 8L) + if (n <= 0) + return(paste(prefix, label)) + + tail <- paste(rep.int(suffix, n), collapse = "") + paste0(prefix, " ", label, " ", tail) + + } + + heredoc <- function(text, leave = 0) { + + # remove leading, trailing whitespace + trimmed <- gsub("^\\s*\\n|\\n\\s*$", "", text) + + # split into lines + lines <- strsplit(trimmed, "\n", fixed = TRUE)[[1L]] + + # compute common indent + indent <- regexpr("[^[:space:]]", lines) + common <- min(setdiff(indent, -1L)) - leave + text <- paste(substring(lines, common), collapse = "\n") + + # substitute in ANSI links for executable renv code + ansify(text) + + } + + startswith <- function(string, prefix) { + substring(string, 1, nchar(prefix)) == prefix + } + + bootstrap <- function(version, library) { + + friendly <- renv_bootstrap_version_friendly(version) + section <- header(sprintf("Bootstrapping renv %s", friendly)) + catf(section) + + # attempt to download renv + catf("- Downloading renv ... ", appendLF = FALSE) + withCallingHandlers( + tarball <- renv_bootstrap_download(version), + error = function(err) { + catf("FAILED") + stop("failed to download:\n", conditionMessage(err)) + } + ) + catf("OK") + on.exit(unlink(tarball), add = TRUE) + + # now attempt to install + catf("- Installing renv ... 
", appendLF = FALSE) + withCallingHandlers( + status <- renv_bootstrap_install(version, tarball, library), + error = function(err) { + catf("FAILED") + stop("failed to install:\n", conditionMessage(err)) + } + ) + catf("OK") + + # add empty line to break up bootstrapping from normal output + catf("") + + return(invisible()) + } + + renv_bootstrap_tests_running <- function() { + getOption("renv.tests.running", default = FALSE) + } + + renv_bootstrap_repos <- function() { + + # get CRAN repository + cran <- getOption("renv.repos.cran", "https://cloud.r-project.org") + + # check for repos override + repos <- Sys.getenv("RENV_CONFIG_REPOS_OVERRIDE", unset = NA) + if (!is.na(repos)) { + + # check for RSPM; if set, use a fallback repository for renv + rspm <- Sys.getenv("RSPM", unset = NA) + if (identical(rspm, repos)) + repos <- c(RSPM = rspm, CRAN = cran) + + return(repos) + + } + + # check for lockfile repositories + repos <- tryCatch(renv_bootstrap_repos_lockfile(), error = identity) + if (!inherits(repos, "error") && length(repos)) + return(repos) + + # retrieve current repos + repos <- getOption("repos") + + # ensure @CRAN@ entries are resolved + repos[repos == "@CRAN@"] <- cran + + # add in renv.bootstrap.repos if set + default <- c(FALLBACK = "https://cloud.r-project.org") + extra <- getOption("renv.bootstrap.repos", default = default) + repos <- c(repos, extra) + + # remove duplicates that might've snuck in + dupes <- duplicated(repos) | duplicated(names(repos)) + repos[!dupes] + + } + + renv_bootstrap_repos_lockfile <- function() { + + lockpath <- Sys.getenv("RENV_PATHS_LOCKFILE", unset = "renv.lock") + if (!file.exists(lockpath)) + return(NULL) + + lockfile <- tryCatch(renv_json_read(lockpath), error = identity) + if (inherits(lockfile, "error")) { + warning(lockfile) + return(NULL) + } + + repos <- lockfile$R$Repositories + if (length(repos) == 0) + return(NULL) + + keys <- vapply(repos, `[[`, "Name", FUN.VALUE = character(1)) + vals <- vapply(repos, `[[`, "URL", FUN.VALUE = character(1)) + names(vals) <- keys + + return(vals) + + } + + renv_bootstrap_download <- function(version) { + + sha <- attr(version, "sha", exact = TRUE) + + methods <- if (!is.null(sha)) { + + # attempting to bootstrap a development version of renv + c( + function() renv_bootstrap_download_tarball(sha), + function() renv_bootstrap_download_github(sha) + ) + + } else { + + # attempting to bootstrap a release version of renv + c( + function() renv_bootstrap_download_tarball(version), + function() renv_bootstrap_download_cran_latest(version), + function() renv_bootstrap_download_cran_archive(version) + ) + + } + + for (method in methods) { + path <- tryCatch(method(), error = identity) + if (is.character(path) && file.exists(path)) + return(path) + } + + stop("All download methods failed") + + } + + renv_bootstrap_download_impl <- function(url, destfile) { + + mode <- "wb" + + # https://bugs.r-project.org/bugzilla/show_bug.cgi?id=17715 + fixup <- + Sys.info()[["sysname"]] == "Windows" && + substring(url, 1L, 5L) == "file:" + + if (fixup) + mode <- "w+b" + + args <- list( + url = url, + destfile = destfile, + mode = mode, + quiet = TRUE + ) + + if ("headers" %in% names(formals(utils::download.file))) { + headers <- renv_bootstrap_download_custom_headers(url) + if (length(headers) && is.character(headers)) + args$headers <- headers + } + + do.call(utils::download.file, args) + + } + + renv_bootstrap_download_custom_headers <- function(url) { + + headers <- getOption("renv.download.headers") + if (is.null(headers)) 
+ return(character()) + + if (!is.function(headers)) + stopf("'renv.download.headers' is not a function") + + headers <- headers(url) + if (length(headers) == 0L) + return(character()) + + if (is.list(headers)) + headers <- unlist(headers, recursive = FALSE, use.names = TRUE) + + ok <- + is.character(headers) && + is.character(names(headers)) && + all(nzchar(names(headers))) + + if (!ok) + stop("invocation of 'renv.download.headers' did not return a named character vector") + + headers + + } + + renv_bootstrap_download_cran_latest <- function(version) { + + spec <- renv_bootstrap_download_cran_latest_find(version) + type <- spec$type + repos <- spec$repos + + baseurl <- utils::contrib.url(repos = repos, type = type) + ext <- if (identical(type, "source")) + ".tar.gz" + else if (Sys.info()[["sysname"]] == "Windows") + ".zip" + else + ".tgz" + name <- sprintf("renv_%s%s", version, ext) + url <- paste(baseurl, name, sep = "/") + + destfile <- file.path(tempdir(), name) + status <- tryCatch( + renv_bootstrap_download_impl(url, destfile), + condition = identity + ) + + if (inherits(status, "condition")) + return(FALSE) + + # report success and return + destfile + + } + + renv_bootstrap_download_cran_latest_find <- function(version) { + + # check whether binaries are supported on this system + binary <- + getOption("renv.bootstrap.binary", default = TRUE) && + !identical(.Platform$pkgType, "source") && + !identical(getOption("pkgType"), "source") && + Sys.info()[["sysname"]] %in% c("Darwin", "Windows") + + types <- c(if (binary) "binary", "source") + + # iterate over types + repositories + for (type in types) { + for (repos in renv_bootstrap_repos()) { + + # build arguments for utils::available.packages() call + args <- list(type = type, repos = repos) + + # add custom headers if available -- note that + # utils::available.packages() will pass this to download.file() + if ("headers" %in% names(formals(utils::download.file))) { + headers <- renv_bootstrap_download_custom_headers(repos) + if (length(headers) && is.character(headers)) + args$headers <- headers + } + + # retrieve package database + db <- tryCatch( + as.data.frame( + do.call(utils::available.packages, args), + stringsAsFactors = FALSE + ), + error = identity + ) + + if (inherits(db, "error")) + next + + # check for compatible entry + entry <- db[db$Package %in% "renv" & db$Version %in% version, ] + if (nrow(entry) == 0) + next + + # found it; return spec to caller + spec <- list(entry = entry, type = type, repos = repos) + return(spec) + + } + } + + # if we got here, we failed to find renv + fmt <- "renv %s is not available from your declared package repositories" + stop(sprintf(fmt, version)) + + } + + renv_bootstrap_download_cran_archive <- function(version) { + + name <- sprintf("renv_%s.tar.gz", version) + repos <- renv_bootstrap_repos() + urls <- file.path(repos, "src/contrib/Archive/renv", name) + destfile <- file.path(tempdir(), name) + + for (url in urls) { + + status <- tryCatch( + renv_bootstrap_download_impl(url, destfile), + condition = identity + ) + + if (identical(status, 0L)) + return(destfile) + + } + + return(FALSE) + + } + + renv_bootstrap_download_tarball <- function(version) { + + # if the user has provided the path to a tarball via + # an environment variable, then use it + tarball <- Sys.getenv("RENV_BOOTSTRAP_TARBALL", unset = NA) + if (is.na(tarball)) + return() + + # allow directories + if (dir.exists(tarball)) { + name <- sprintf("renv_%s.tar.gz", version) + tarball <- file.path(tarball, name) + } + + # 
bail if it doesn't exist + if (!file.exists(tarball)) { + + # let the user know we weren't able to honour their request + fmt <- "- RENV_BOOTSTRAP_TARBALL is set (%s) but does not exist." + msg <- sprintf(fmt, tarball) + warning(msg) + + # bail + return() + + } + + catf("- Using local tarball '%s'.", tarball) + tarball + + } + + renv_bootstrap_github_token <- function() { + for (envvar in c("GITHUB_TOKEN", "GITHUB_PAT", "GH_TOKEN")) { + envval <- Sys.getenv(envvar, unset = NA) + if (!is.na(envval)) + return(envval) + } + } + + renv_bootstrap_download_github <- function(version) { + + enabled <- Sys.getenv("RENV_BOOTSTRAP_FROM_GITHUB", unset = "TRUE") + if (!identical(enabled, "TRUE")) + return(FALSE) + + # prepare download options + token <- renv_bootstrap_github_token() + if (nzchar(Sys.which("curl")) && nzchar(token)) { + fmt <- "--location --fail --header \"Authorization: token %s\"" + extra <- sprintf(fmt, token) + saved <- options("download.file.method", "download.file.extra") + options(download.file.method = "curl", download.file.extra = extra) + on.exit(do.call(base::options, saved), add = TRUE) + } else if (nzchar(Sys.which("wget")) && nzchar(token)) { + fmt <- "--header=\"Authorization: token %s\"" + extra <- sprintf(fmt, token) + saved <- options("download.file.method", "download.file.extra") + options(download.file.method = "wget", download.file.extra = extra) + on.exit(do.call(base::options, saved), add = TRUE) + } + + url <- file.path("https://api.github.com/repos/rstudio/renv/tarball", version) + name <- sprintf("renv_%s.tar.gz", version) + destfile <- file.path(tempdir(), name) + + status <- tryCatch( + renv_bootstrap_download_impl(url, destfile), + condition = identity + ) + + if (!identical(status, 0L)) + return(FALSE) + + renv_bootstrap_download_augment(destfile) + + return(destfile) + + } + + # Add Sha to DESCRIPTION. This is stop gap until #890, after which we + # can use renv::install() to fully capture metadata. + renv_bootstrap_download_augment <- function(destfile) { + sha <- renv_bootstrap_git_extract_sha1_tar(destfile) + if (is.null(sha)) { + return() + } + + # Untar + tempdir <- tempfile("renv-github-") + on.exit(unlink(tempdir, recursive = TRUE), add = TRUE) + untar(destfile, exdir = tempdir) + pkgdir <- dir(tempdir, full.names = TRUE)[[1]] + + # Modify description + desc_path <- file.path(pkgdir, "DESCRIPTION") + desc_lines <- readLines(desc_path) + remotes_fields <- c( + "RemoteType: github", + "RemoteHost: api.github.com", + "RemoteRepo: renv", + "RemoteUsername: rstudio", + "RemotePkgRef: rstudio/renv", + paste("RemoteRef: ", sha), + paste("RemoteSha: ", sha) + ) + writeLines(c(desc_lines[desc_lines != ""], remotes_fields), con = desc_path) + + # Re-tar + local({ + old <- setwd(tempdir) + on.exit(setwd(old), add = TRUE) + + tar(destfile, compression = "gzip") + }) + invisible() + } + + # Extract the commit hash from a git archive. Git archives include the SHA1 + # hash as the comment field of the tarball pax extended header + # (see https://www.kernel.org/pub/software/scm/git/docs/git-archive.html) + # For GitHub archives this should be the first header after the default one + # (512 byte) header. 
+ renv_bootstrap_git_extract_sha1_tar <- function(bundle) { + + # open the bundle for reading + # We use gzcon for everything because (from ?gzcon) + # > Reading from a connection which does not supply a 'gzip' magic + # > header is equivalent to reading from the original connection + conn <- gzcon(file(bundle, open = "rb", raw = TRUE)) + on.exit(close(conn)) + + # The default pax header is 512 bytes long and the first pax extended header + # with the comment should be 51 bytes long + # `52 comment=` (11 chars) + 40 byte SHA1 hash + len <- 0x200 + 0x33 + res <- rawToChar(readBin(conn, "raw", n = len)[0x201:len]) + + if (grepl("^52 comment=", res)) { + sub("52 comment=", "", res) + } else { + NULL + } + } + + renv_bootstrap_install <- function(version, tarball, library) { + + # attempt to install it into project library + dir.create(library, showWarnings = FALSE, recursive = TRUE) + output <- renv_bootstrap_install_impl(library, tarball) + + # check for successful install + status <- attr(output, "status") + if (is.null(status) || identical(status, 0L)) + return(status) + + # an error occurred; report it + header <- "installation of renv failed" + lines <- paste(rep.int("=", nchar(header)), collapse = "") + text <- paste(c(header, lines, output), collapse = "\n") + stop(text) + + } + + renv_bootstrap_install_impl <- function(library, tarball) { + + # invoke using system2 so we can capture and report output + bin <- R.home("bin") + exe <- if (Sys.info()[["sysname"]] == "Windows") "R.exe" else "R" + R <- file.path(bin, exe) + + args <- c( + "--vanilla", "CMD", "INSTALL", "--no-multiarch", + "-l", shQuote(path.expand(library)), + shQuote(path.expand(tarball)) + ) + + system2(R, args, stdout = TRUE, stderr = TRUE) + + } + + renv_bootstrap_platform_prefix <- function() { + + # construct version prefix + version <- paste(R.version$major, R.version$minor, sep = ".") + prefix <- paste("R", numeric_version(version)[1, 1:2], sep = "-") + + # include SVN revision for development versions of R + # (to avoid sharing platform-specific artefacts with released versions of R) + devel <- + identical(R.version[["status"]], "Under development (unstable)") || + identical(R.version[["nickname"]], "Unsuffered Consequences") + + if (devel) + prefix <- paste(prefix, R.version[["svn rev"]], sep = "-r") + + # build list of path components + components <- c(prefix, R.version$platform) + + # include prefix if provided by user + prefix <- renv_bootstrap_platform_prefix_impl() + if (!is.na(prefix) && nzchar(prefix)) + components <- c(prefix, components) + + # build prefix + paste(components, collapse = "/") + + } + + renv_bootstrap_platform_prefix_impl <- function() { + + # if an explicit prefix has been supplied, use it + prefix <- Sys.getenv("RENV_PATHS_PREFIX", unset = NA) + if (!is.na(prefix)) + return(prefix) + + # if the user has requested an automatic prefix, generate it + auto <- Sys.getenv("RENV_PATHS_PREFIX_AUTO", unset = NA) + if (is.na(auto) && getRversion() >= "4.4.0") + auto <- "TRUE" + + if (auto %in% c("TRUE", "True", "true", "1")) + return(renv_bootstrap_platform_prefix_auto()) + + # empty string on failure + "" + + } + + renv_bootstrap_platform_prefix_auto <- function() { + + prefix <- tryCatch(renv_bootstrap_platform_os(), error = identity) + if (inherits(prefix, "error") || prefix %in% "unknown") { + + msg <- paste( + "failed to infer current operating system", + "please file a bug report at https://github.com/rstudio/renv/issues", + sep = "; " + ) + + warning(msg) + + } + + prefix + + } + + 
renv_bootstrap_platform_os <- function() { + + sysinfo <- Sys.info() + sysname <- sysinfo[["sysname"]] + + # handle Windows + macOS up front + if (sysname == "Windows") + return("windows") + else if (sysname == "Darwin") + return("macos") + + # check for os-release files + for (file in c("/etc/os-release", "/usr/lib/os-release")) + if (file.exists(file)) + return(renv_bootstrap_platform_os_via_os_release(file, sysinfo)) + + # check for redhat-release files + if (file.exists("/etc/redhat-release")) + return(renv_bootstrap_platform_os_via_redhat_release()) + + "unknown" + + } + + renv_bootstrap_platform_os_via_os_release <- function(file, sysinfo) { + + # read /etc/os-release + release <- utils::read.table( + file = file, + sep = "=", + quote = c("\"", "'"), + col.names = c("Key", "Value"), + comment.char = "#", + stringsAsFactors = FALSE + ) + + vars <- as.list(release$Value) + names(vars) <- release$Key + + # get os name + os <- tolower(sysinfo[["sysname"]]) + + # read id + id <- "unknown" + for (field in c("ID", "ID_LIKE")) { + if (field %in% names(vars) && nzchar(vars[[field]])) { + id <- vars[[field]] + break + } + } + + # read version + version <- "unknown" + for (field in c("UBUNTU_CODENAME", "VERSION_CODENAME", "VERSION_ID", "BUILD_ID")) { + if (field %in% names(vars) && nzchar(vars[[field]])) { + version <- vars[[field]] + break + } + } + + # join together + paste(c(os, id, version), collapse = "-") + + } + + renv_bootstrap_platform_os_via_redhat_release <- function() { + + # read /etc/redhat-release + contents <- readLines("/etc/redhat-release", warn = FALSE) + + # infer id + id <- if (grepl("centos", contents, ignore.case = TRUE)) + "centos" + else if (grepl("redhat", contents, ignore.case = TRUE)) + "redhat" + else + "unknown" + + # try to find a version component (very hacky) + version <- "unknown" + + parts <- strsplit(contents, "[[:space:]]")[[1L]] + for (part in parts) { + + nv <- tryCatch(numeric_version(part), error = identity) + if (inherits(nv, "error")) + next + + version <- nv[1, 1] + break + + } + + paste(c("linux", id, version), collapse = "-") + + } + + renv_bootstrap_library_root_name <- function(project) { + + # use project name as-is if requested + asis <- Sys.getenv("RENV_PATHS_LIBRARY_ROOT_ASIS", unset = "FALSE") + if (asis) + return(basename(project)) + + # otherwise, disambiguate based on project's path + id <- substring(renv_bootstrap_hash_text(project), 1L, 8L) + paste(basename(project), id, sep = "-") + + } + + renv_bootstrap_library_root <- function(project) { + + prefix <- renv_bootstrap_profile_prefix() + + path <- Sys.getenv("RENV_PATHS_LIBRARY", unset = NA) + if (!is.na(path)) + return(paste(c(path, prefix), collapse = "/")) + + path <- renv_bootstrap_library_root_impl(project) + if (!is.null(path)) { + name <- renv_bootstrap_library_root_name(project) + return(paste(c(path, prefix, name), collapse = "/")) + } + + renv_bootstrap_paths_renv("library", project = project) + + } + + renv_bootstrap_library_root_impl <- function(project) { + + root <- Sys.getenv("RENV_PATHS_LIBRARY_ROOT", unset = NA) + if (!is.na(root)) + return(root) + + type <- renv_bootstrap_project_type(project) + if (identical(type, "package")) { + userdir <- renv_bootstrap_user_dir() + return(file.path(userdir, "library")) + } + + } + + renv_bootstrap_validate_version <- function(version, description = NULL) { + + # resolve description file + # + # avoid passing lib.loc to `packageDescription()` below, since R will + # use the loaded version of the package by default anyhow. 
note that + # this function should only be called after 'renv' is loaded + # https://github.com/rstudio/renv/issues/1625 + description <- description %||% packageDescription("renv") + + # check whether requested version 'version' matches loaded version of renv + sha <- attr(version, "sha", exact = TRUE) + valid <- if (!is.null(sha)) + renv_bootstrap_validate_version_dev(sha, description) + else + renv_bootstrap_validate_version_release(version, description) + + if (valid) + return(TRUE) + + # the loaded version of renv doesn't match the requested version; + # give the user instructions on how to proceed + dev <- identical(description[["RemoteType"]], "github") + remote <- if (dev) + paste("rstudio/renv", description[["RemoteSha"]], sep = "@") + else + paste("renv", description[["Version"]], sep = "@") + + # display both loaded version + sha if available + friendly <- renv_bootstrap_version_friendly( + version = description[["Version"]], + sha = if (dev) description[["RemoteSha"]] + ) + + fmt <- heredoc(" + renv %1$s was loaded from project library, but this project is configured to use renv %2$s. + - Use `renv::record(\"%3$s\")` to record renv %1$s in the lockfile. + - Use `renv::restore(packages = \"renv\")` to install renv %2$s into the project library. + ") + catf(fmt, friendly, renv_bootstrap_version_friendly(version), remote) + + FALSE + + } + + renv_bootstrap_validate_version_dev <- function(version, description) { + expected <- description[["RemoteSha"]] + is.character(expected) && startswith(expected, version) + } + + renv_bootstrap_validate_version_release <- function(version, description) { + expected <- description[["Version"]] + is.character(expected) && identical(expected, version) + } + + renv_bootstrap_hash_text <- function(text) { + + hashfile <- tempfile("renv-hash-") + on.exit(unlink(hashfile), add = TRUE) + + writeLines(text, con = hashfile) + tools::md5sum(hashfile) + + } + + renv_bootstrap_load <- function(project, libpath, version) { + + # try to load renv from the project library + if (!requireNamespace("renv", lib.loc = libpath, quietly = TRUE)) + return(FALSE) + + # warn if the version of renv loaded does not match + renv_bootstrap_validate_version(version) + + # execute renv load hooks, if any + hooks <- getHook("renv::autoload") + for (hook in hooks) + if (is.function(hook)) + tryCatch(hook(), error = warnify) + + # load the project + renv::load(project) + + TRUE + + } + + renv_bootstrap_profile_load <- function(project) { + + # if RENV_PROFILE is already set, just use that + profile <- Sys.getenv("RENV_PROFILE", unset = NA) + if (!is.na(profile) && nzchar(profile)) + return(profile) + + # check for a profile file (nothing to do if it doesn't exist) + path <- renv_bootstrap_paths_renv("profile", profile = FALSE, project = project) + if (!file.exists(path)) + return(NULL) + + # read the profile, and set it if it exists + contents <- readLines(path, warn = FALSE) + if (length(contents) == 0L) + return(NULL) + + # set RENV_PROFILE + profile <- contents[[1L]] + if (!profile %in% c("", "default")) + Sys.setenv(RENV_PROFILE = profile) + + profile + + } + + renv_bootstrap_profile_prefix <- function() { + profile <- renv_bootstrap_profile_get() + if (!is.null(profile)) + return(file.path("profiles", profile, "renv")) + } + + renv_bootstrap_profile_get <- function() { + profile <- Sys.getenv("RENV_PROFILE", unset = "") + renv_bootstrap_profile_normalize(profile) + } + + renv_bootstrap_profile_set <- function(profile) { + profile <- 
renv_bootstrap_profile_normalize(profile) + if (is.null(profile)) + Sys.unsetenv("RENV_PROFILE") + else + Sys.setenv(RENV_PROFILE = profile) + } + + renv_bootstrap_profile_normalize <- function(profile) { + + if (is.null(profile) || profile %in% c("", "default")) + return(NULL) + + profile + + } + + renv_bootstrap_path_absolute <- function(path) { + + substr(path, 1L, 1L) %in% c("~", "/", "\\") || ( + substr(path, 1L, 1L) %in% c(letters, LETTERS) && + substr(path, 2L, 3L) %in% c(":/", ":\\") + ) + + } + + renv_bootstrap_paths_renv <- function(..., profile = TRUE, project = NULL) { + renv <- Sys.getenv("RENV_PATHS_RENV", unset = "renv") + root <- if (renv_bootstrap_path_absolute(renv)) NULL else project + prefix <- if (profile) renv_bootstrap_profile_prefix() + components <- c(root, renv, prefix, ...) + paste(components, collapse = "/") + } + + renv_bootstrap_project_type <- function(path) { + + descpath <- file.path(path, "DESCRIPTION") + if (!file.exists(descpath)) + return("unknown") + + desc <- tryCatch( + read.dcf(descpath, all = TRUE), + error = identity + ) + + if (inherits(desc, "error")) + return("unknown") + + type <- desc$Type + if (!is.null(type)) + return(tolower(type)) + + package <- desc$Package + if (!is.null(package)) + return("package") + + "unknown" + + } + + renv_bootstrap_user_dir <- function() { + dir <- renv_bootstrap_user_dir_impl() + path.expand(chartr("\\", "/", dir)) + } + + renv_bootstrap_user_dir_impl <- function() { + + # use local override if set + override <- getOption("renv.userdir.override") + if (!is.null(override)) + return(override) + + # use R_user_dir if available + tools <- asNamespace("tools") + if (is.function(tools$R_user_dir)) + return(tools$R_user_dir("renv", "cache")) + + # try using our own backfill for older versions of R + envvars <- c("R_USER_CACHE_DIR", "XDG_CACHE_HOME") + for (envvar in envvars) { + root <- Sys.getenv(envvar, unset = NA) + if (!is.na(root)) + return(file.path(root, "R/renv")) + } + + # use platform-specific default fallbacks + if (Sys.info()[["sysname"]] == "Windows") + file.path(Sys.getenv("LOCALAPPDATA"), "R/cache/R/renv") + else if (Sys.info()[["sysname"]] == "Darwin") + "~/Library/Caches/org.R-project.R/R/renv" + else + "~/.cache/R/renv" + + } + + renv_bootstrap_version_friendly <- function(version, shafmt = NULL, sha = NULL) { + sha <- sha %||% attr(version, "sha", exact = TRUE) + parts <- c(version, sprintf(shafmt %||% " [sha: %s]", substring(sha, 1L, 7L))) + paste(parts, collapse = "") + } + + renv_bootstrap_exec <- function(project, libpath, version) { + if (!renv_bootstrap_load(project, libpath, version)) + renv_bootstrap_run(version, libpath) + } + + renv_bootstrap_run <- function(version, libpath) { + + # perform bootstrap + bootstrap(version, libpath) + + # exit early if we're just testing bootstrap + if (!is.na(Sys.getenv("RENV_BOOTSTRAP_INSTALL_ONLY", unset = NA))) + return(TRUE) + + # try again to load + if (requireNamespace("renv", lib.loc = libpath, quietly = TRUE)) { + return(renv::load(project = getwd())) + } + + # failed to download or load renv; warn the user + msg <- c( + "Failed to find an renv installation: the project will not be loaded.", + "Use `renv::activate()` to re-initialize the project." + ) + + warning(paste(msg, collapse = "\n"), call. 
= FALSE) + + } + + renv_json_read <- function(file = NULL, text = NULL) { + + jlerr <- NULL + + # if jsonlite is loaded, use that instead + if ("jsonlite" %in% loadedNamespaces()) { + + json <- tryCatch(renv_json_read_jsonlite(file, text), error = identity) + if (!inherits(json, "error")) + return(json) + + jlerr <- json + + } + + # otherwise, fall back to the default JSON reader + json <- tryCatch(renv_json_read_default(file, text), error = identity) + if (!inherits(json, "error")) + return(json) + + # report an error + if (!is.null(jlerr)) + stop(jlerr) + else + stop(json) + + } + + renv_json_read_jsonlite <- function(file = NULL, text = NULL) { + text <- paste(text %||% readLines(file, warn = FALSE), collapse = "\n") + jsonlite::fromJSON(txt = text, simplifyVector = FALSE) + } + + renv_json_read_default <- function(file = NULL, text = NULL) { + + # find strings in the JSON + text <- paste(text %||% readLines(file, warn = FALSE), collapse = "\n") + pattern <- '["](?:(?:\\\\.)|(?:[^"\\\\]))*?["]' + locs <- gregexpr(pattern, text, perl = TRUE)[[1]] + + # if any are found, replace them with placeholders + replaced <- text + strings <- character() + replacements <- character() + + if (!identical(c(locs), -1L)) { + + # get the string values + starts <- locs + ends <- locs + attr(locs, "match.length") - 1L + strings <- substring(text, starts, ends) + + # only keep those requiring escaping + strings <- grep("[[\\]{}:]", strings, perl = TRUE, value = TRUE) + + # compute replacements + replacements <- sprintf('"\032%i\032"', seq_along(strings)) + + # replace the strings + mapply(function(string, replacement) { + replaced <<- sub(string, replacement, replaced, fixed = TRUE) + }, strings, replacements) + + } + + # transform the JSON into something the R parser understands + transformed <- replaced + transformed <- gsub("{}", "`names<-`(list(), character())", transformed, fixed = TRUE) + transformed <- gsub("[[{]", "list(", transformed, perl = TRUE) + transformed <- gsub("[]}]", ")", transformed, perl = TRUE) + transformed <- gsub(":", "=", transformed, fixed = TRUE) + text <- paste(transformed, collapse = "\n") + + # parse it + json <- parse(text = text, keep.source = FALSE, srcfile = NULL)[[1L]] + + # construct map between source strings, replaced strings + map <- as.character(parse(text = strings)) + names(map) <- as.character(parse(text = replacements)) + + # convert to list + map <- as.list(map) + + # remap strings in object + remapped <- renv_json_read_remap(json, map) + + # evaluate + eval(remapped, envir = baseenv()) + + } + + renv_json_read_remap <- function(json, map) { + + # fix names + if (!is.null(names(json))) { + lhs <- match(names(json), names(map), nomatch = 0L) + rhs <- match(names(map), names(json), nomatch = 0L) + names(json)[rhs] <- map[lhs] + } + + # fix values + if (is.character(json)) + return(map[[json]] %||% json) + + # handle true, false, null + if (is.name(json)) { + text <- as.character(json) + if (text == "true") + return(TRUE) + else if (text == "false") + return(FALSE) + else if (text == "null") + return(NULL) + } + + # recurse + if (is.recursive(json)) { + for (i in seq_along(json)) { + json[i] <- list(renv_json_read_remap(json[[i]], map)) + } + } + + json + + } + + # load the renv profile, if any + renv_bootstrap_profile_load(project) + + # construct path to library root + root <- renv_bootstrap_library_root(project) + + # construct library prefix for platform + prefix <- renv_bootstrap_platform_prefix() + + # construct full libpath + libpath <- file.path(root, 
prefix) + + # run bootstrap code + renv_bootstrap_exec(project, libpath, version) + + invisible() + +}) diff --git a/preprocessing/renv/settings.json b/preprocessing/renv/settings.json new file mode 100644 index 0000000..74369b9 --- /dev/null +++ b/preprocessing/renv/settings.json @@ -0,0 +1,21 @@ +{ + "bioconductor.version": null, + "external.libraries": [], + "ignored.packages": [ + "omopcat" + ], + "package.dependency.fields": [ + "Imports", + "Depends", + "LinkingTo" + ], + "ppm.enabled": null, + "ppm.ignored.urls": [], + "r.version": null, + "snapshot.type": "implicit", + "use.cache": true, + "vcs.ignore.cellar": true, + "vcs.ignore.library": true, + "vcs.ignore.local": true, + "vcs.manage.ignores": true +} diff --git a/preprocessing/tests/testthat.R b/preprocessing/tests/testthat.R new file mode 100644 index 0000000..b7d71c5 --- /dev/null +++ b/preprocessing/tests/testthat.R @@ -0,0 +1,12 @@ +# This file is part of the standard setup for testthat. +# It is recommended that you do not modify it. +# +# Where should you do additional test configuration? +# Learn more about the roles of various files in: +# * https://r-pkgs.org/testing-design.html#sec-tests-files-overview +# * https://testthat.r-lib.org/articles/special-files.html + +library(testthat) +library(omopcat.preprocessing) + +test_check("omopcat.preprocessing") diff --git a/preprocessing/tests/testthat/setup.R b/preprocessing/tests/testthat/setup.R new file mode 100644 index 0000000..b519a2b --- /dev/null +++ b/preprocessing/tests/testthat/setup.R @@ -0,0 +1,8 @@ +withr::local_envvar( + EUNOMIA_DATA_FOLDER = testthat::test_path("../../data-raw/test_db"), + TEST_DB_NAME = "GiBleed", + DB_CDM_SCHEMA = "main", + .local_envir = testthat::teardown_env() +) + +mock_cdm <- .setup_cdm_object(.envir = testthat::teardown_env()) diff --git a/preprocessing/tests/testthat/test-concepts.R b/preprocessing/tests/testthat/test-concepts.R new file mode 100644 index 0000000..740d30e --- /dev/null +++ b/preprocessing/tests/testthat/test-concepts.R @@ -0,0 +1,10 @@ +test_that("generate_concepts works on a CDM object", { + concepts <- generate_concepts(mock_cdm, concept_ids = c(1118088, 1569708, 3020630)) + expect_s3_class(concepts, "data.frame") + expect_equal(nrow(concepts), 3) + expect_true( + all(c("concept_id", "concept_name", "vocabulary_id", "domain_id", "concept_class_id") %in% names(concepts)) + ) + expect_type(concepts$concept_id, "integer") + expect_type(concepts$concept_name, "character") +}) diff --git a/preprocessing/tests/testthat/test-monthly_counts.R b/preprocessing/tests/testthat/test-monthly_counts.R new file mode 100644 index 0000000..223bc49 --- /dev/null +++ b/preprocessing/tests/testthat/test-monthly_counts.R @@ -0,0 +1,44 @@ +test_that("generate_monthly_counts works on a CDM object", { + monthly_counts <- generate_monthly_counts(mock_cdm) + expect_s3_class(monthly_counts, "data.frame") + expect_true(nrow(monthly_counts) > 0) + expect_named(monthly_counts, c( + "concept_id", "concept_name", "date_year", "date_month", "record_count", + "person_count", "records_per_person" + )) +}) + +## Set up a mock measurement OMOP table +## Measurements for 3 different patients on the same day, with 1 patient having 2 measurements +measurement <- data.frame( + measurement_id = 1:4, + person_id = c(1, 1, 2, 3), + measurement_type_concept_id = 12345, + measurement_concept_id = 1, + measurement_date = "2020-01-01", + value_as_number = c(2, 1, 2, 1), + value_as_concept_id = 0 +) + +test_that("calculate_monthly_counts produces the expected 
results", { + res <- calculate_monthly_counts(measurement, measurement_concept_id, measurement_date) + expect_s3_class(res, "data.frame") + expect_named(res, c("concept_id", "date_year", "date_month", "record_count", "person_count", "records_per_person")) + expect_equal(nrow(res), 1) + expect_equal(res$person_count, 3) + expect_equal(res$records_per_person, 4 / 3) +}) + +db <- dbplyr::src_memdb() +db_measurement <- dplyr::copy_to(db, measurement, name = "measurement", overwrite = TRUE) +test_that("calculate_monthly_counts works on Database-stored tables", { + ref <- calculate_monthly_counts(measurement, measurement_concept_id, measurement_date) + db_res <- calculate_monthly_counts(db_measurement, measurement_concept_id, measurement_date) + + expect_s3_class(db_res, "data.frame") + expect_named(db_res, c("concept_id", "date_year", "date_month", "record_count", "person_count", "records_per_person")) + expect_type(db_res$record_count, "integer") + expect_type(db_res$person_count, "integer") + expect_type(db_res$records_per_person, "double") + expect_identical(db_res, ref) +}) diff --git a/preprocessing/tests/testthat/test-preprocessing.R b/preprocessing/tests/testthat/test-preprocessing.R new file mode 100644 index 0000000..378d92b --- /dev/null +++ b/preprocessing/tests/testthat/test-preprocessing.R @@ -0,0 +1,56 @@ +# Disable cli messages +withr::local_options(usethis.quiet = TRUE, cli.default_handler = function(...) {}) + +test_that("preprocessing produces the expected files", { + testthat::skip_on_ci() # avoid re-downloading example database on GHA runners + out_path <- tempdir() + expected_files <- c( + file.path(out_path, "omopcat_concepts.parquet"), + file.path(out_path, "omopcat_monthly_counts.parquet"), + file.path(out_path, "omopcat_summary_stats.parquet") + ) + withr::defer(fs::file_delete(expected_files)) + + expect_no_error({ + success <- preprocess(out_path = out_path) + }) + expect_true(success) + expect_true(all(file.exists(expected_files))) +}) + +test_that("preprocessing fails without valid out_path", { + expect_error(preprocess(out_path = ""), "should not be empty") +}) + +test_that("preprocessing is skipped if files already exist", { + out_path <- tempdir() + out_files <- c( + file.path(out_path, "omopcat_concepts.parquet"), + file.path(out_path, "omopcat_monthly_counts.parquet"), + file.path(out_path, "omopcat_summary_stats.parquet") + ) + fs::file_create(out_files) + withr::defer(fs::file_delete(out_files)) + + expect_false(preprocess(out_path = out_path)) +}) + +test_that("preprocessing fails if envvars are missing", { + withr::local_envvar( + ENV = "prod", + DB_NAME = NULL, + HOST = NULL, + PORT = NULL, + DB_USERNAME = NULL, + DB_PASSWORD = NULL, + DB_CDM_SCHEMA = NULL + ) + expect_error(preprocess(out_path = tempfile()), "not set") +}) + +test_that("Setting up CDM object works for non-prod data", { + skip_if_not_installed("duckdb") + withr::local_envvar(ENV = "test") + cdm <- .setup_cdm_object() + expect_equal(CDMConnector::cdmVersion(cdm), "5.3") +}) diff --git a/preprocessing/tests/testthat/test-summary_stats.R b/preprocessing/tests/testthat/test-summary_stats.R new file mode 100644 index 0000000..7757858 --- /dev/null +++ b/preprocessing/tests/testthat/test-summary_stats.R @@ -0,0 +1,46 @@ +test_that("generate_summary_stats works on a CDM object", { + summary_stats <- generate_summary_stats(mock_cdm) + expect_s3_class(summary_stats, "data.frame") + expect_named(summary_stats, c( + "concept_id", "concept_name", "summary_attribute", "value_as_number", 
"value_as_string" + )) +}) + +## Mock measurments for 2 concepts +mock_measurements <- data.frame( + measurement_id = 1:8, + person_id = 1, + measurement_type_concept_id = rep(c(12345, 23456), each = 4), + measurement_concept_id = rep(c(1, 2), each = 4), + measurement_date = "2020-01-01", + value_as_number = c(2, 1, 2, 1, rep(NA, 4)), + value_as_concept_id = c(rep(0, 4), c(1, 1, 2, 3)) +) + +test_that("calculate_summary_stats produces the expected results", { + res <- calculate_summary_stats(mock_measurements, "measurement_concept_id") + expect_s3_class(res, "data.frame") + expect_named(res, c("concept_id", "summary_attribute", "value_as_number", "value_as_concept_id")) + expect_equal(nrow(res), 5) + mean <- res[res$summary_attribute == "mean", ][["value_as_number"]] + sd <- res[res$summary_attribute == "sd", ][["value_as_number"]] + expect_equal(mean, 1.5) + expect_equal(sd^2, 1 / 3) + + frequencies <- res[res$summary_attribute == "frequency", ] + expect_equal(nrow(frequencies), 3) + expect_equal(frequencies$value_as_number, c(2, 1, 1)) + expect_equal(frequencies$value_as_concept_id, c(1, 2, 3)) +}) + +db <- dbplyr::src_memdb() +db_measurement <- dplyr::copy_to(db, mock_measurements, name = "measurement", overwrite = TRUE) +test_that("calculate_summary_stats works with a database-stored table", { + ref <- calculate_summary_stats(mock_measurements, "measurement_concept_id") + db_res <- calculate_summary_stats(db_measurement, "measurement_concept_id") + + expect_s3_class(db_res, "data.frame") + expect_named(db_res, c("concept_id", "summary_attribute", "value_as_number", "value_as_concept_id")) + expect_identical(db_res, ref) + expect_type(db_res$value_as_number, "double") +}) diff --git a/scripts/01_setup_test_db.R b/scripts/01_setup_test_db.R deleted file mode 100644 index be5bb2f..0000000 --- a/scripts/01_setup_test_db.R +++ /dev/null @@ -1,22 +0,0 @@ -cli::cli_h1("Setting up test database") - -library(omopcat) - -# Create an duckdb database from Eunomia datasets -db_path <- CDMConnector::eunomia_dir( - dataset_name = Sys.getenv("TEST_DB_NAME"), - cdm_version = Sys.getenv("TEST_DB_OMOP_VERSION"), - database_file = tempfile(fileext = ".duckdb") -) - -con <- connect_to_test_duckdb(db_path) - -# Use 'cdm_from_con' to load the dataset and verify integrity -CDMConnector::cdm_from_con( - con = con, - cdm_schema = Sys.getenv("TEST_DB_CDM_SCHEMA"), - write_schema = Sys.getenv("TEST_DB_RESULTS_SCHEMA"), - cdm_name = Sys.getenv("TEST_DB_NAME") -) - -cli::cli_alert_success("Test database setup successfully") diff --git a/scripts/02_insert_dummy_tables.R b/scripts/02_insert_dummy_tables.R deleted file mode 100644 index e150d3c..0000000 --- a/scripts/02_insert_dummy_tables.R +++ /dev/null @@ -1,70 +0,0 @@ -# PRODUCED FOR A SPECIFIC DATASET: -# synthea-allergies-10k -# (but could work for others) - -cli::cli_h1("Inserting dummy tables") - - -# Setup --------------------------------------------------------------------------------------- - -library(readr) -library(omopcat) - -dir <- Sys.getenv("EUNOMIA_DATA_FOLDER") -name <- Sys.getenv("TEST_DB_NAME") -version <- Sys.getenv("TEST_DB_OMOP_VERSION") - -db_path <- glue::glue("{dir}/{name}_{version}_1.0.duckdb") - -con <- connect_to_test_duckdb(db_path) - - -# Insert dummy tables ------------------------------------------------------------------------- - -## Load dummy data and write tables to database -## We explicitly set the column types for columns that are needed later down the pipeline -dummy_measurements <- read_csv( - 
here::here("data-raw/test_db/dummy/measurement.csv"), - col_types = cols( - measurement_id = col_integer(), - person_id = col_integer(), - measurement_concept_id = col_integer(), - measurement_date = col_date(), - value_as_number = col_double(), - value_as_concept_id = col_integer(), - ) -) -write_table(dummy_measurements, con, "measurement", schema = Sys.getenv("TEST_DB_CDM_SCHEMA")) - -dummy_observations <- read_csv( - here::here( - "data-raw/test_db/dummy/observation.csv" - ), - col_types = cols( - observation_id = col_integer(), - person_id = col_integer(), - observation_concept_id = col_integer(), - observation_date = col_date(), - value_as_number = col_double(), - value_as_string = col_logical(), - value_as_concept_id = col_integer(), - ) -) -write_table(dummy_observations, con, "observation", schema = Sys.getenv("TEST_DB_CDM_SCHEMA")) - -# Sanity check: read the data back and make sure its consistent -db_measurements <- DBI::dbReadTable(con, "measurement") -stopifnot(all.equal(db_measurements, as.data.frame(dummy_measurements))) - -db_observations <- DBI::dbReadTable(con, "observation") -stopifnot(all.equal(db_observations, as.data.frame(dummy_observations))) - -# Load the CMD object to verify integrity of the schema after insertions -cdm <- CDMConnector::cdm_from_con( - con = con, - cdm_schema = Sys.getenv("TEST_DB_CDM_SCHEMA"), - write_schema = Sys.getenv("TEST_DB_RESULTS_SCHEMA"), - cdm_name = name -) - -cli::cli_alert_success("Dummy tables inserted successfully") diff --git a/scripts/03_analyse_omop_cdm.R b/scripts/03_analyse_omop_cdm.R deleted file mode 100644 index 999a883..0000000 --- a/scripts/03_analyse_omop_cdm.R +++ /dev/null @@ -1,63 +0,0 @@ -cli::cli_h1("Generating summarys statistics") - - -# Setup --------------------------------------------------------------------------------------- - -library(omopcat) - -if (Sys.getenv("ENV") == "prod") { - fs::dir_create({ - out_path <- file.path(Sys.getenv("OMOPCAT_DATA_PATH")) - }) - cli::cli_alert_info("out_path set to {out_path}") - - name <- Sys.getenv("DB_NAME") - con <- connect_to_db( - RPostgres::Postgres(), - dbname = Sys.getenv("DB_NAME"), - host = Sys.getenv("HOST"), - port = Sys.getenv("PORT"), - user = Sys.getenv("DB_USERNAME"), - password = Sys.getenv("DB_PASSWORD") - ) -} else { - dir <- Sys.getenv("EUNOMIA_DATA_FOLDER") - name <- Sys.getenv("TEST_DB_NAME") - version <- Sys.getenv("TEST_DB_OMOP_VERSION") - - db_path <- glue::glue("{dir}/{name}_{version}_1.0.duckdb") - con <- connect_to_test_duckdb(db_path) - - fs::dir_create( - out_path <- here::here("data/test_data") - ) -} - -# Calculate summary stats --------------------------------------------------------------------- - -# Load the data in a CDMConnector object -cdm <- CDMConnector::cdm_from_con( - con = con, - cdm_schema = Sys.getenv("DB_CDM_SCHEMA"), - write_schema = Sys.getenv("DB_CDM_SCHEMA"), - cdm_name = name -) - -cli::cli_progress_step("Calculating monthly counts...") -monthly_counts <- process_monthly_counts(cdm) -cli::cli_progress_step("Calculating summary statistics...") -summary_stats <- process_summary_stats(cdm) -ids <- unique(c(monthly_counts$concept_id, summary_stats$concept_id)) -concepts_table <- query_concepts_table(cdm, ids) - -all_tables <- list( - concepts = concepts_table, - monthly_counts = monthly_counts, - summary_stats = summary_stats -) -paths <- purrr::map_chr(names(all_tables), ~ file.path(out_path, glue::glue("omopcat_{.x}.parquet"))) - -# Write the tables to disk as parquet -purrr::walk2(all_tables, paths, ~ 
nanoparquet::write_parquet(.x, .y)) - -cli::cli_alert_success("Summary statistics generated successfully and written to {.file {paths}}") diff --git a/scripts/04_produce_dev_data.R b/scripts/04_produce_dev_data.R deleted file mode 100644 index 459e119..0000000 --- a/scripts/04_produce_dev_data.R +++ /dev/null @@ -1,43 +0,0 @@ -cli::cli_h1("Producing test data") - - -# Setup --------------------------------------------------------------------------------------- - -suppressPackageStartupMessages({ - library(dplyr) - library(omopcat) -}) - -data_path <- here::here("data/test_data") -stopifnot(dir.exists(data_path)) -out_path <- here::here("inst/dev_data") -stopifnot(dir.exists(out_path)) - -# Produce test data --------------------------------------------------------------------------- - -# Get the relevant tables and filter -table_names <- c("concepts", "monthly_counts", "summary_stats") -paths <- glue::glue("{data_path}/omopcat_{table_names}.parquet") -tables <- purrr::map(paths, read_parquet_sorted) -names(tables) <- table_names - -# Keep only concepts for which we have summary statistics -keep_concepts <- tables$summary_stats$concept_id -tables <- purrr::map(tables, ~ .x[.x$concept_id %in% keep_concepts, ]) - -# Keep only data from 2019 onwards -monthly_counts <- tables$monthly_counts -filtered_monthly <- monthly_counts[monthly_counts$date_year >= 2019, ] -tables$monthly_counts <- filtered_monthly - -# Filter the other tables to match the concepts left over after year filtering -tables <- purrr::map(tables, ~ .x[.x$concept_id %in% filtered_monthly$concept_id, ]) - -# Write all results to the test data folder -purrr::iwalk(tables, function(tbl, name) { - path <- glue::glue("{out_path}/omopcat_{name}.csv") - cli::cli_alert_info("Writing {name} to {path}") - readr::write_csv(tbl, file = path) -}) - -cli::cli_alert_success("Test data produced") diff --git a/scripts/create_dev_data.R b/scripts/create_dev_data.R index ee6cd8b..96d6306 100644 --- a/scripts/create_dev_data.R +++ b/scripts/create_dev_data.R @@ -1,20 +1,55 @@ -# Master script to set up the test data -# Generates the dummy data in `inst/test_data` for running the app in dev mode by calling -# the relevant scripts in the correct order. 
- -here::i_am("scripts/create_dev_data.R") - -Sys.setenv("ENV" = "dev") -# Path to download Eunomia datasets -Sys.setenv(EUNOMIA_DATA_FOLDER = file.path("data-raw/test_db/eunomia")) -# Name of the synthetic dataset to use -Sys.setenv(TEST_DB_NAME = "synthea-allergies-10k") -# OMOP CDM version -Sys.setenv(TEST_DB_OMOP_VERSION = "5.3") -# Schema name for data and results -Sys.setenv(DB_CDM_SCHEMA = "main") - -source(here::here("scripts/01_setup_test_db.R")) -source(here::here("scripts/02_insert_dummy_tables.R")) -source(here::here("scripts/03_analyse_omop_cdm.R")) -source(here::here("scripts/04_produce_dev_data.R")) +# Setup --------------------------------------------------------------------------------------- + +suppressPackageStartupMessages({ + library(dplyr) +}) + +data_path <- here::here("data/test_data") +stopifnot(dir.exists(data_path)) +out_path <- here::here("inst/dev_data") +stopifnot(dir.exists(out_path)) + +# Produce test data --------------------------------------------------------------------------- + +#' Read a parquet table and sort the results +#' +#' @param path path to the parquet file to be read +#' @inheritParams nanoparquet::read_parquet +#' +#' @return A `data.frame` with the results sorted by all columns +#' @importFrom dplyr arrange across everything +read_parquet_sorted <- function(path, options = nanoparquet::parquet_options()) { + if (!file.exists(path)) { + cli::cli_abort("File {.file {path}} not found") + } + + nanoparquet::read_parquet(path, options) |> + arrange(across(everything())) +} + +# Get the relevant tables and filter +table_names <- c("concepts", "monthly_counts", "summary_stats") +paths <- glue::glue("{data_path}/omopcat_{table_names}.parquet") +tables <- purrr::map(paths, read_parquet_sorted) +names(tables) <- table_names + +# Keep only concepts for which we have summary statistics +keep_concepts <- tables$summary_stats$concept_id +tables <- purrr::map(tables, ~ .x[.x$concept_id %in% keep_concepts, ]) + +# Keep only data from 2019 onwards +monthly_counts <- tables$monthly_counts +filtered_monthly <- monthly_counts[monthly_counts$date_year >= 2019, ] +tables$monthly_counts <- filtered_monthly + +# Filter the other tables to match the concepts left over after year filtering +tables <- purrr::map(tables, ~ .x[.x$concept_id %in% filtered_monthly$concept_id, ]) + +# Write all results to the test data folder +purrr::iwalk(tables, function(tbl, name) { + path <- glue::glue("{out_path}/omopcat_{name}.csv") + cli::cli_alert_info("Writing {name} to {path}") + readr::write_csv(tbl, file = path) +}) + +cli::cli_alert_success("Test data produced") diff --git a/scripts/create_prod_data.R b/scripts/create_prod_data.R deleted file mode 100644 index b3c68a8..0000000 --- a/scripts/create_prod_data.R +++ /dev/null @@ -1,50 +0,0 @@ -# Master script to create the prod data -# Generates the `data/prod_data/*.parquet` files - -here::i_am("scripts/create_prod_data.R") - -Sys.setenv("ENV" = "prod") - -required_envvars <- c( - "DB_NAME", - "HOST", - "PORT", - "DB_USERNAME", - "DB_PASSWORD", - "OMOPCAT_DATA_PATH" -) - -out_path <- Sys.getenv("OMOPCAT_DATA_PATH") - -check_envvars <- function(x) { - missing <- Sys.getenv(x) == "" - if (any(missing)) { - cli::cli_abort(c( - "x" = "Environment variable{?s} {.envvar {x[missing]}} not set", - "i" = "Make sure to define the environment variables in a local {.file .Renviron} file" - ), call = rlang::caller_env()) - } -} -check_envvars(required_envvars) - -expected_files <- c( - file.path(out_path, "omopcat_concepts.parquet"), - 
file.path(out_path, "omopcat_monthly_counts.parquet"), - file.path(out_path, "omopcat_summary_stats.parquet") -) - -# Only run pre-processing if the expected files don't exist -exists <- file.exists(expected_files) -if (!all(exists)) { - source(here::here("scripts/03_analyse_omop_cdm.R")) - # Sanity check: make sure the expected files were created - purrr::walk(expected_files, function(path) { - if (!file.exists(path)) { - cli::cli_abort("Expected file not found: {.file {path}}") - } - }) -} else { - cli::cli_alert_info("All expected files already exist. Skipping pre-processing.") - cli::cli_alert_info("To force re-processing, delete the following files:") - cli::cli_ul(expected_files[exists]) -} diff --git a/scripts/create_test_data.R b/scripts/create_test_data.R new file mode 100644 index 0000000..72374a6 --- /dev/null +++ b/scripts/create_test_data.R @@ -0,0 +1,13 @@ +# Generates the parquet files in data/test_data/ by running the preprocessing pipeline +# on the test database located at data-raw/test_db/eunomia +withr::local_envvar( + ENV = "test", + EUNOMIA_DATA_FOLDER = here::here("data-raw/test_db/eunomia"), + DB_NAME = "synthea-allergies-10k", + DB_CDM_SCHEMA = "main" +) + +out_path <- here::here("data/test_data") +omopcat.preprocessing::preprocess(out_path) + +cli::cli_alert_success("Test data written to {out_path}") diff --git a/tests/testthat/test-utils_preprocessing_summarise.R b/tests/testthat/test-utils_preprocessing_summarise.R deleted file mode 100644 index 2e5713c..0000000 --- a/tests/testthat/test-utils_preprocessing_summarise.R +++ /dev/null @@ -1,72 +0,0 @@ -## Set up a mock measurement OMOP table -## Measurements for 3 different patients on the same day, with 1 patient having 2 measurements -measurement <- data.frame( - measurement_id = 1:4, - person_id = c(1, 1, 2, 3), - measurement_type_concept_id = 12345, - measurement_concept_id = 1, - measurement_date = "2020-01-01", - value_as_number = c(2, 1, 2, 1), - value_as_concept_id = 0 -) - -test_that("calculate_monthly_counts produces the expected results", { - res <- calculate_monthly_counts(measurement, measurement_concept_id, measurement_date) - expect_s3_class(res, "data.frame") - expect_named(res, c("concept_id", "date_year", "date_month", "record_count", "person_count", "records_per_person")) - expect_equal(nrow(res), 1) - expect_equal(res$person_count, 3) - expect_equal(res$records_per_person, 4 / 3) -}) - -db <- dbplyr::src_memdb() -db_measurement <- dplyr::copy_to(db, measurement, name = "measurement", overwrite = TRUE) -test_that("calculate_monthly_counts works on Database-stored tables", { - ref <- calculate_monthly_counts(measurement, measurement_concept_id, measurement_date) - db_res <- calculate_monthly_counts(db_measurement, measurement_concept_id, measurement_date) - - expect_s3_class(db_res, "data.frame") - expect_named(db_res, c("concept_id", "date_year", "date_month", "record_count", "person_count", "records_per_person")) - expect_type(db_res$record_count, "integer") - expect_type(db_res$person_count, "integer") - expect_type(db_res$records_per_person, "double") - expect_identical(db_res, ref) -}) - -test_that("calculate_summary_stats produces the expected results", { - res <- calculate_summary_stats(measurement, "measurement_concept_id") - expect_s3_class(res, "data.frame") - expect_named(res, c("concept_id", "summary_attribute", "value_as_number", "value_as_concept_id")) - expect_equal(nrow(res), 2) - mean <- res[res$summary_attribute == "mean", ][["value_as_number"]] - sd <- res[res$summary_attribute 
== "sd", ][["value_as_number"]] - expect_equal(mean, 1.5) - expect_equal(sd^2, 1 / 3) -}) - -## Add a categorical concept -categorical_measurement <- data.frame( - measurement_id = 1:4, - person_id = c(1, 1, 2, 3), - measurement_type_concept_id = 12345, - measurement_concept_id = 2, - measurement_date = as.Date("2020-01-01"), - value_as_number = NA, - value_as_concept_id = c(1, 1, 2, 3) -) -measurement <- rbind(measurement, categorical_measurement) -test_that("calculate_summary_stats can handle categorical concepts", { - res <- calculate_summary_stats(measurement, "measurement_concept_id") - frequencies <- res[res$summary_attribute == "frequency", ] - expect_equal(nrow(frequencies), 3) - expect_equal(frequencies$value_as_number, c(2, 1, 1)) - expect_equal(frequencies$value_as_concept_id, c(1, 2, 3)) -}) - -db <- dbplyr::src_memdb() -db_measurement <- dplyr::copy_to(db, measurement, name = "measurement", overwrite = TRUE) -test_that("calculate_summary_stats works with a database-stored table", { - res <- calculate_summary_stats(db_measurement, "measurement_concept_id") - expect_s3_class(res, "data.frame") - expect_named(res, c("concept_id", "summary_attribute", "value_as_number", "value_as_concept_id")) -})
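A minimal sketch, not part of the diff itself, of how the relocated preprocessing tests could be run locally under the new per-package layout shown above. It assumes the working directory is the repository root; the test environment variables (EUNOMIA_DATA_FOLDER, TEST_DB_NAME, DB_CDM_SCHEMA) are already set by preprocessing/tests/testthat/setup.R, so nothing beyond these two calls should be needed:

# Hypothetical local workflow, assuming the layout introduced in this diff.
renv::restore(project = "preprocessing")  # install the packages pinned in preprocessing/renv.lock
testthat::test_local("preprocessing")     # run the suite under preprocessing/tests/testthat/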