SAFEHR-data · milanmlft · Sep 4, 2024 · Aug 30, 2024 · Aug 30, 2024 · Aug 30, 2024
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -2,13 +2,15 @@
 ^renv\.lock$
 ^.*\.Rproj$
 ^\.Rproj\.user$
-^data-raw$
 dev_history.R
-^dev$
 $run_dev.*
 ^.here$
 ^LICENSE\.md$
 ^\.github$
 ^\.lintr$
 ^\.renvignore$
+^data$
+^data-raw$
 ^deploy$
+^dev$
+^scripts$
diff --git a/.Rprofile b/.Rprofile
@@ -12,14 +12,3 @@ if (interactive()) {
 }
 
 source("renv/activate.R")
-
-# Path to download Eunomia datasets
-Sys.setenv(EUNOMIA_DATA_FOLDER = file.path("dev/test_db/eunomia"))
-# Name of the synthetic dataset to use
-Sys.setenv(TEST_DB_NAME = "synthea-allergies-10k")
-# OMOP CDM version
-Sys.setenv(TEST_DB_OMOP_VERSION = "5.3")
-# Schema name for data
-Sys.setenv(TEST_DB_CDM_SCHEMA = "main")
-# Schema name for results
-Sys.setenv(TEST_DB_RESULTS_SCHEMA = "main")
diff --git a/.lintr b/.lintr
@@ -1,5 +1,6 @@
 linters: linters_with_defaults(
     line_length_linter(120),
-    object_name_linter(styles = c("snake_case", "symbols", "camelCase"))
+    object_name_linter(styles = c("snake_case", "symbols", "camelCase")),
+    object_length_linter(NULL)
   )
 encoding: "UTF-8"
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -20,7 +20,8 @@ Imports:
     readr,
     lubridate,
     dplyr,
-    cli
+    cli,
+    nanoparquet
 Suggests:
     devtools,
     usethis,
@@ -29,9 +30,12 @@ Suggests:
     spelling,
     here,
     CDMConnector,
-    lintr
+    lintr,
+    dbplyr,
+    RSQLite
 Encoding: UTF-8
 LazyData: true
 RoxygenNote: 7.3.2
 Config/testthat/edition: 3
 Language: en-US
+Roxygen: list(markdown = TRUE)
diff --git a/NAMESPACE b/NAMESPACE
@@ -1,8 +1,28 @@
 # Generated by roxygen2: do not edit by hand
 
+export(calculate_monthly_counts)
+export(calculate_summary_stats)
+export(connect_to_db)
+export(read_parquet_sorted)
 export(run_app)
+export(write_table)
 import(bslib)
 import(shiny)
+importFrom(dplyr,across)
+importFrom(dplyr,all_of)
+importFrom(dplyr,arrange)
+importFrom(dplyr,bind_rows)
+importFrom(dplyr,collect)
+importFrom(dplyr,count)
+importFrom(dplyr,everything)
+importFrom(dplyr,filter)
+importFrom(dplyr,group_by)
+importFrom(dplyr,mutate)
+importFrom(dplyr,n)
+importFrom(dplyr,n_distinct)
+importFrom(dplyr,rename)
+importFrom(dplyr,select)
+importFrom(dplyr,summarise)
 importFrom(ggplot2,aes)
 importFrom(ggplot2,geom_bar)
 importFrom(ggplot2,geom_boxplot)
@@ -20,3 +40,4 @@ importFrom(golem,with_golem_options)
 importFrom(shiny,NS)
 importFrom(shiny,shinyApp)
 importFrom(shiny,tagList)
+importFrom(stats,sd)
diff --git a/R/run_app.R b/R/run_app.R
@@ -33,7 +33,7 @@ run_app <- function(
 }
 
 .check_env <- function() {
-  required <- c("CALYPSO_DATA_PATH", "CALYPSO_DB_NAME", "CALYPSO_DB_OMOP_VERSION")
+  required <- "CALYPSO_DATA_PATH"
   missing <- required[!required %in% names(Sys.getenv())]
   if (length(missing) > 0) {
     cli::cli_abort("The following environment variables are missing: {.envvar {missing}}")

diff --git a/R/utils-preprocessing-db.R b/R/utils-preprocessing-db.R
@@ -0,0 +1,57 @@
+#' Open a connection to a duckdb database file
+#'
+#' @param db_path Path to an existing duckdb database file; aborts if it is missing
+#' @param ... Not used
+#' @param .envir Environment whose exit triggers the disconnect, see [`withr::defer()`]
+#'
+#' @return A [`DBI::DBIConnection-class`] object
+#' @export
+connect_to_db <- function(db_path, ..., .envir = parent.frame()) {
+  db_found <- file.exists(db_path)
+  if (!db_found) {
+    cli::cli_abort("Database file {.file {db_path}} not found")
+  }
+
+  driver <- duckdb::duckdb(dbdir = db_path)
+  connection <- DBI::dbConnect(driver)
+  # Close the connection automatically once `.envir` exits
+  withr::defer(DBI::dbDisconnect(connection), envir = .envir)
+  connection
+}
+
+
+#' Store a data frame as a database table, replacing any existing table
+#'
+#' @param data data.frame holding the rows to store
+#' @param con A [`DBI::DBIConnection-class`] object
+#' @param table character, name of the target table
+#' @param schema character, name of the schema containing the table
+#'
+#' @return The value of [`DBI::dbWriteTable()`]: `TRUE`, invisibly, on success
+#' @export
+write_table <- function(data, con, table, schema) {
+  # Schema-qualified identifier of the target table
+  target <- DBI::Id(schema = schema, table = table)
+
+  # `overwrite = TRUE` replaces an existing table of the same name;
+  # the value of `dbWriteTable()` is returned as is
+  DBI::dbWriteTable(conn = con, name = target, value = data, overwrite = TRUE)
+}
+
+
+#' Read a parquet file and sort the rows by all columns
+#'
+#' @param path path to the parquet file to be read; aborts if the file does not exist
+#' @inheritParams nanoparquet::read_parquet
+#'
+#' @return A `data.frame` with the results sorted by all columns
+#' @export
+#' @importFrom dplyr arrange across everything
+read_parquet_sorted <- function(path, options = nanoparquet::parquet_options()) {
+  if (!file.exists(path)) {
+    cli::cli_abort("File {.file {path}} not found")
+  }
+  # Pass `options` by name so it can never be matched positionally to another argument of `read_parquet()`
+  nanoparquet::read_parquet(path, options = options) |>
+    arrange(across(everything()))
+}
diff --git a/R/utils-preprocessing-summarise.R b/R/utils-preprocessing-summarise.R
@@ -0,0 +1,110 @@
+#' Calculate monthly statistics for an OMOP concept
+#'
+#' @param omop_table A table from the OMOP CDM; must contain a `person_id` column
+#' @param concept The concept column to calculate statistics for, as an unquoted column name
+#' @param date The date column to derive year and month from, as an unquoted column name
+#'
+#' @return An ungrouped `data.frame` with the following columns:
+#'   - `concept_id`: The concept ID
+#'   - `date_year`: The year of the date
+#'   - `date_month`: The month of the date
+#'   - `person_count`: The number of unique patients per concept for each month
+#'   - `records_per_person`: The average number of records per person per concept for each month
+#' @export
+#' @importFrom dplyr mutate group_by summarise select n n_distinct collect
+calculate_monthly_counts <- function(omop_table, concept, date) {
+  # Extract year and month from date column
+  omop_table <- mutate(omop_table,
+    concept_id = {{ concept }},
+    date_year = lubridate::year({{ date }}),
+    date_month = lubridate::month({{ date }})
+  )
+  ## Avoid "no visible binding" notes
+  date_year <- date_month <- concept_id <- person_id <- person_count <- records_per_person <- NULL
+  omop_table |>
+    group_by(date_year, date_month, concept_id) |>
+    summarise(
+      person_count = n_distinct(person_id),
+      records_per_person = n() / n_distinct(person_id),
+      .groups = "drop" # return an ungrouped result and avoid the regrouping message
+    ) |>
+    select(
+      concept_id,
+      date_year,
+      date_month,
+      person_count,
+      records_per_person
+    ) |>
+    ## Collect in case we're dealing with a database-stored table
+    collect()
+}
+
+#' Calculate summary statistics for an OMOP table
+#'
+#' Calculates the mean and standard deviation for numeric concepts and the
+#' frequency for categorical concepts.
+#'
+#' @param omop_table A table from the OMOP CDM with `value_as_number` and `value_as_concept_id` columns
+#' @param concept_name character, the name of the concept ID column
+#'
+#' @return A `data.frame` with the following columns:
+#'  - `concept_id`: The concept ID
+#'  - `summary_attribute`: The summary attribute (e.g. "mean", "sd", "frequency")
+#'  - `value_as_number`: The value of the summary attribute
+#'  - `value_as_concept_id`: In case of a categorical concept, the concept ID for each category
+#' @export
+#' @importFrom dplyr all_of rename filter collect bind_rows
+calculate_summary_stats <- function(omop_table, concept_name) {
+  stopifnot(is.character(concept_name))
+  omop_table <- rename(omop_table, concept_id = all_of(concept_name))
+
+  ## Avoid "no visible binding" notes
+  value_as_number <- value_as_concept_id <- NULL
+
+  numeric_concepts <- filter(omop_table, !is.na(value_as_number))
+  # beware CDM docs: NULL=no categorical result, 0=categorical result but no mapping
+  # `is.na()` detects missing values in data frames and is translated to `IS NULL` for database tables
+  categorical_concepts <- filter(omop_table, !is.na(value_as_concept_id) & value_as_concept_id != 0)
+
+  numeric_stats <- .summarise_numeric_concepts(numeric_concepts) |> collect()
+  categorical_stats <- .summarise_categorical_concepts(categorical_concepts) |> collect()
+  bind_rows(numeric_stats, categorical_stats)
+}
+
+#' @importFrom dplyr group_by summarise
+#' @importFrom stats sd
+.summarise_numeric_concepts <- function(omop_table) {
+  ## Avoid "no visible binding" notes
+  value_as_number <- concept_id <- NULL
+
+  # Mean and standard deviation per concept, reshaped to one row per statistic:
+  # the statistic's name goes to `summary_attribute`, its value to `value_as_number`
+  omop_table |>
+    group_by(concept_id) |>
+    summarise(
+      mean = mean(value_as_number, na.rm = TRUE),
+      sd = sd(value_as_number, na.rm = TRUE)
+    ) |>
+    tidyr::pivot_longer(
+      cols = c(mean, sd), names_to = "summary_attribute", values_to = "value_as_number"
+    )
+}
+
+#' @importFrom dplyr count mutate select
+.summarise_categorical_concepts <- function(omop_table) {
+  ## Avoid "no visible binding" notes
+  concept_id <- value_as_concept_id <- summary_attribute <- NULL
+
+  # Number of records for every combination of concept and category
+  category_counts <- count(omop_table, concept_id, value_as_concept_id)
+
+  # Tag each count as a "frequency" and arrange the columns in the output order
+  labelled_counts <- mutate(category_counts, summary_attribute = "frequency")
+  select(
+    labelled_counts,
+    concept_id,
+    summary_attribute,
+    value_as_number = n,
+    value_as_concept_id
+  )
+}
diff --git a/R/utils_get_data.R b/R/utils_get_data.R
@@ -6,46 +6,39 @@
 get_concepts_table <- function() {
   if (golem::app_dev()) {
     return(
-      readr::read_csv(app_sys("test_data", "calypso_concepts.csv"), show_col_types = FALSE)
+      readr::read_csv(app_sys("dev_data", "calypso_concepts.csv"), show_col_types = FALSE)
     )
   }
-  .read_db_table("calypso_concepts")
+  .read_parquet_table("calypso_concepts")
 }
 
 get_monthly_counts <- function() {
   if (golem::app_dev()) {
     return(
-      readr::read_csv(app_sys("test_data", "calypso_monthly_counts.csv"), show_col_types = FALSE)
+      readr::read_csv(app_sys("dev_data", "calypso_monthly_counts.csv"), show_col_types = FALSE)
     )
   }
-  .read_db_table("calypso_monthly_counts")
+  .read_parquet_table("calypso_monthly_counts")
 }
 
 get_summary_stats <- function() {
   if (golem::app_dev()) {
     return(
-      readr::read_csv(app_sys("test_data", "calypso_summary_stats.csv"), show_col_types = FALSE)
+      readr::read_csv(app_sys("dev_data", "calypso_summary_stats.csv"), show_col_types = FALSE)
     )
   }
-  .read_db_table("calypso_summary_stats")
+  .read_parquet_table("calypso_summary_stats")
 }
 
-.connect_to_db <- function() {
-  dir <- Sys.getenv("CALYPSO_DATA_PATH")
-  name <- Sys.getenv("CALYPSO_DB_NAME")
-  version <- Sys.getenv("CALYPSO_DB_OMOP_VERSION")
 
-  db_file <- glue::glue("{dir}/{name}_{version}_1.0.duckdb")
-  if (!file.exists(db_file)) {
-    cli::cli_abort("Database file {.file {db_file}} does not exist.")
+.read_parquet_table <- function(table_name) {
+  data_dir <- Sys.getenv("CALYPSO_DATA_PATH")
+  if (data_dir == "") {
+    cli::cli_abort("Environment variable {.envvar CALYPSO_DATA_PATH} not set")
+  }
+  if (!dir.exists(data_dir)) {
+    cli::cli_abort("Data directory {.file {data_dir}} not found")
   }
 
-  # Connect to the duckdb database
-  DBI::dbConnect(duckdb::duckdb(dbdir = db_file))
-}
-
-.read_db_table <- function(table_name) {
-  con <- .connect_to_db()
-  withr::defer(DBI::dbDisconnect(con))
-  DBI::dbReadTable(con, table_name)
+  nanoparquet::read_parquet(glue::glue("{data_dir}/{table_name}.parquet"))
 }
diff --git a/README.md b/README.md
@@ -45,11 +45,10 @@ as it has good support for R package development and Shiny.
     install.packages("renv")
     renv::restore()
     ```
-3. Create the [duckdb](https://github.com/duckdb/duckdb) test database and run the analyses by running from an R console in the project directory (test dataset properties can be updated in the [`.Rprofile`](https://github.com/SAFEHR-data/omop-data-catalogue/blob/main/.Rprofile) file):
+3. Create the [duckdb](https://github.com/duckdb/duckdb) test database and run the analyses by sourcing the following script from an R console in the project directory:
 
     ```r
-    source(here::here("dev/test_db/setup_test_db.R"))
-    source(here::here("dev/omop_analyses/analyse_omop_cdm.R"))
+    source(here::here("scripts/create_dev_data.R"))
     ```
 
 4. To preview the app locally, run the following from an R console within the project directory:
@@ -60,16 +59,29 @@ as it has good support for R package development and Shiny.
 
 The `dev/02_dev.R` script contains a few helper functions to get you started.
 
-Calypso test data can be found in [`inst/test_data`](https://github.com/SAFEHR-data/omop-data-catalogue/tree/main/inst/data). These data have been generated by using the synthetic dataset '[synthea-allergies-10k](https://darwin-eu.github.io/CDMConnector/reference/eunomiaDir.html)', and adding some [dummy data](https://github.com/SAFEHR-data/omop-data-catalogue/tree/main/dev/test_db/dummy) for the MEASUREMENT and OBSERVATION tables (to have some records in the 'calypso-summary-stats' table).
+The development data can be found in [`inst/dev_data`](https://github.com/SAFEHR-data/omop-data-catalogue/tree/main/inst/dev_data). These data have been generated by using the synthetic dataset '[synthea-allergies-10k](https://darwin-eu.github.io/CDMConnector/reference/eunomiaDir.html)', and adding some [dummy data](https://github.com/SAFEHR-data/omop-data-catalogue/tree/main/data-raw/test_db/dummy) for the MEASUREMENT and OBSERVATION tables (to have some records in the 'calypso-summary-stats' table).
 
-If you want to recreate a test dataset, you can run the following R scripts:
 
-```r
-source(here::here("dev/test_db/setup_test_db.R"))
-source(here::here("dev/test_db/insert_dummy_tables.R"))
-source(here::here("dev/omop_analyses/analyse_omop_cdm.R"))
-source(here::here("dev/test_db/produce_test_data.R"))
-```
+### File structure
+
+This repo is organised as an R package with a few additional directories used for deployment of the
+Shiny app:
+
+- `R/`: contains the R source code for the package
+- `inst/`: configuration files and dummy data for the app
+    - `dev_data/`: dummy data for the app to use during development
+    - `app/www`: static files (e.g. CSS, JavaScript) for the app
+- `man/`: documentation files for the package, generated by `{roxygen2}`
+- `tests/`: unit tests for the package, written with `{testthat}`
+
+The directories _not_ included in the package (i.e. listed in `.Rbuildignore`) but used for deployment and data pre-processing:
+
+- `data-raw/test_db`: the source data for generating the test data
+- `data/test_data`: test data parquet files mimicking what real data would look like to run the app in production
+- `dev/`: contains scripts and helper functions for development
+- `deploy/`: contains Docker files and scripts for deployment
+- `renv/`: contains the `renv` library, managed by `{renv}`
+- `scripts/`: contains scripts for data pre-processing and generating the test and dev data
 
 ### Updating the `renv` lockfile
 

diff --git a/dev/test_db/dummy/measurement.csv → data-raw/test_db/dummy/measurement.csv b/dev/test_db/dummy/measurement.csv → data-raw/test_db/dummy/measurement.csv
diff --git a/dev/test_db/dummy/observation.csv → data-raw/test_db/dummy/observation.csv b/dev/test_db/dummy/observation.csv → data-raw/test_db/dummy/observation.csv
diff --git a/dev/test_db/eunomia/.gitignore → data-raw/test_db/eunomia/.gitignore b/dev/test_db/eunomia/.gitignore → data-raw/test_db/eunomia/.gitignore
diff --git a/data/prod_data/.gitignore b/data/prod_data/.gitignore
@@ -0,0 +1 @@
+*.parquet
diff --git a/data/test_data/calypso_concepts.parquet b/data/test_data/calypso_concepts.parquet
diff --git a/data/test_data/calypso_monthly_counts.parquet b/data/test_data/calypso_monthly_counts.parquet
diff --git a/data/test_data/calypso_summary_stats.parquet b/data/test_data/calypso_summary_stats.parquet
diff --git a/deploy/.env.sample b/deploy/.env.sample
@@ -1,4 +1 @@
-GOLEM_CONFIG_ACTIVE=production  # production or dev
-CALYPSO_DATA_PATH=dev/test_db/eunomia
-CALYPSO_DB_NAME=synthea-allergies-10k
-CALYPSO_DB_OMOP_VERSION=5.3
+CALYPSO_DATA_PATH=data/test_data
diff --git a/deploy/calypso_0.0.0.9000.tar.gz b/deploy/calypso_0.0.0.9000.tar.gz