Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Summary statistics #22

Merged
merged 11 commits into from
Aug 15, 2024
11 changes: 11 additions & 0 deletions .Rprofile
BaptisteBR marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,14 @@ if (interactive()) {
}

source("renv/activate.R")

# Path to download Eunomia datasets
Sys.setenv(EUNOMIA_DATA_FOLDER = here::here("dev/test_db/eunomia"))
BaptisteBR marked this conversation as resolved.
Show resolved Hide resolved
# Name of the synthetic dataset to use
Sys.setenv(TEST_DB_NAME = "GiBleed")
# OMOP CDM version
Sys.setenv(TEST_DB_OMOP_VERSION = "5.3")
# Schema name for data
Sys.setenv(TEST_DB_CDM_SCHEMA = "main")
# Schema name for results
Sys.setenv(TEST_DB_RESULTS_SCHEMA = "main")
10 changes: 8 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,19 @@ Imports:
DT,
ggplot2,
golem (>= 0.4.1),
shiny (>= 1.9.1)
shiny (>= 1.9.1),
DBI,
duckdb,
glue
Suggests:
devtools,
usethis,
styler,
testthat (>= 3.0.0),
spelling
spelling,
here,
CDMConnector,
lubridate
Encoding: UTF-8
LazyData: true
RoxygenNote: 7.3.2
Expand Down
19 changes: 19 additions & 0 deletions R/fct_connect_to_test_db.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#' connect_to_test_db
#'
#' @description Connect to the test database
#' (using environment variables from .Rprofile)
BaptisteBR marked this conversation as resolved.
Show resolved Hide resolved
#'
#' @return A DBI connection to the test database
#'
#' @importFrom DBI dbConnect
#' @importFrom duckdb duckdb
#' @importFrom glue glue
#'
#' @noRd
connect_to_test_db <- function() {
  # Database location, dataset name and OMOP version are configured
  # via environment variables set in .Rprofile
  data_dir <- Sys.getenv("EUNOMIA_DATA_FOLDER")
  db_name <- Sys.getenv("TEST_DB_NAME")
  omop_version <- Sys.getenv("TEST_DB_OMOP_VERSION")
  # Assemble the path to the duckdb file and open a DBI connection to it
  db_path <- glue("{data_dir}/{db_name}_{omop_version}_1.0.duckdb")
  dbConnect(duckdb(dbdir = db_path))
}
8 changes: 7 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,14 @@ as it has good support for R package development and Shiny.
install.packages("renv")
renv::restore()
```
3. Create the [duckdb](https://github.com/duckdb/duckdb) test database and run the analyses by running from an R console in the project directory (test dataset properties can be updated in the [`.Rprofile`](https://github.com/UCLH-Foundry/omop-data-catalogue/blob/main/.Rprofile) file):

3. To preview the app locally, run the following from an R console within the project directory:
```r
source(here::here("dev/test_db/setup_test_db.R"))
source(here::here("dev/omop_analyses/analyse_omop_cdm.R"))
```

4. To preview the app locally, run the following from an R console within the project directory:

```r
golem::run_dev()
Expand Down
198 changes: 198 additions & 0 deletions dev/omop_analyses/analyse_omop_cdm.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,198 @@
# Analyse the OMOP CDM test database and write summary ("calypso_*")
# tables into the results schema. Expects the environment variables read
# below to be set (see .Rprofile) and the test database to already exist
# (see dev/test_db/setup_test_db.R).

library(tidyverse)

# Test database folder, dataset name and OMOP CDM version (from .Rprofile)
dir <- Sys.getenv("EUNOMIA_DATA_FOLDER")
name <- Sys.getenv("TEST_DB_NAME")
version <- Sys.getenv("TEST_DB_OMOP_VERSION")

# Connect to the duckdb test database
con <- DBI::dbConnect(duckdb::duckdb(
  dbdir = glue::glue("{dir}/{name}_{version}_1.0.duckdb")))

# Function to execute one or more SQL queries and clear results
# Run one or more SQL statements against a connection, discarding any
# result. Used to execute the DDL that (re)creates the Calypso tables.
create_results_tables <- function(con, sql) {
  res <- DBI::dbSendStatement(con, sql)
  # Release the result object so the connection is left clean
  DBI::dbClearResult(res)
}

# Build the 'calypso_concepts' table: one row per requested concept id,
# with descriptive properties taken from the OMOP 'concept' table.
analyse_concepts <- function(cdm, concepts) {
  # Restrict the concept table to the requested ids
  wanted <- cdm$concept |>
    filter(concept_id %in% concepts)
  # Keep only the columns expected by the results schema,
  # then materialise the lazy query into a local data frame
  wanted |>
    select(
      concept_id,
      concept_name,
      vocabulary_id,
      domain_id,
      concept_class_id,
      standard_concept,
      concept_code
    ) |>
    collect()
}

# Function to produce the 'calypso_monthly_counts' table: for every
# clinical table in the CDM, the number of patients and the records per
# patient for each (concept, year, month) combination.
analyse_monthly_counts <- function(cdm) {
  # Analyse one CDM table for each month. `concept` and `date` are
  # unquoted column names injected with {{ }} (tidy evaluation).
  analyse_table <- function(table, concept, date) {
    # Extract year and month from date column; also copy the
    # concept column under the common name 'concept_id'
    table <- table |>
      mutate(
        concept_id = {{ concept }},
        date_year = lubridate::year({{ date }}),
        date_month = lubridate::month({{ date }})
      )
    # Total number of records per (year, month, concept).
    # NOTE(review): collect() pulls rows into R before grouping; presumably
    # fine for the small test dataset — confirm before reusing on real data.
    total_count <- table |>
      select(concept_id, date_year, date_month) |>
      collect() |>
      group_by(date_year, date_month, concept_id) |>
      count(name = "total_count")
    # Number of unique patients per (year, month, concept)
    person_count <- table |>
      select(concept_id, date_year, date_month, person_id) |>
      collect() |>
      group_by(date_year, date_month, concept_id) |>
      reframe(person_count = n_distinct(person_id))
    # One row per (year, month, concept): join the two counts and
    # derive the average number of records per person
    table |>
      select(concept_id, date_year, date_month) |>
      distinct() |>
      collect() |>
      inner_join(total_count, join_by(date_year, date_month, concept_id)) |>
      inner_join(person_count, join_by(date_year, date_month, concept_id)) |>
      mutate(records_per_person = (total_count / person_count)) |>
      select(
        concept_id,
        date_year,
        date_month,
        person_count,
        records_per_person
      )
  }
  # Combine results for all clinical tables, each with its own
  # concept and event-date columns
  bind_rows(
    cdm$condition_occurrence |> analyse_table(condition_concept_id, condition_start_date),
    cdm$drug_exposure |> analyse_table(drug_concept_id, drug_exposure_start_date),
    cdm$procedure_occurrence |> analyse_table(procedure_concept_id, procedure_date),
    cdm$device_exposure |> analyse_table(device_concept_id, device_exposure_start_date),
    cdm$measurement |> analyse_table(measurement_concept_id, measurement_date),
    cdm$observation |> analyse_table(observation_concept_id, observation_date),
    cdm$specimen |> analyse_table(specimen_concept_id, specimen_date)
  )
}

# Build the 'calypso_summary_stats' table: per-concept mean and standard
# deviation of the numeric values in measurement and observation.
analyse_summary_stats <- function(cdm) {
  # Summarise one numeric column: for each concept id, emit a "mean" row
  # and an "sd" row in the long format expected by the results table.
  # `concept` and `value` are unquoted column names injected with {{ }}.
  analyse_numeric_column <- function(table, concept, value) {
    # Standardise the column names and drop missing values,
    # materialising the lazy query into a local data frame
    values <- table |>
      select(concept_id = {{ concept }}, value = {{ value }}) |>
      filter(!is.na(value)) |>
      collect()
    # Per-concept mean
    means <- values |>
      group_by(concept_id) |>
      reframe(
        summary_attribute = "mean",
        value_as_number = mean(value)
      )
    # Per-concept standard deviation
    sds <- values |>
      group_by(concept_id) |>
      reframe(
        summary_attribute = "sd",
        value_as_number = sd(value)
      )
    # All means first, then all standard deviations
    bind_rows(means, sds)
  }
  # Combine results for both numeric value columns
  bind_rows(
    cdm$measurement |> analyse_numeric_column(measurement_concept_id, value_as_number),
    cdm$observation |> analyse_numeric_column(observation_concept_id, value_as_number)
  )
}

# Append a data frame to a table in the results schema
# (schema name comes from the TEST_DB_RESULTS_SCHEMA environment variable).
write_results <- function(data, con, table) {
  # Fully qualify the destination table with its schema
  target <- DBI::Id(
    schema = Sys.getenv("TEST_DB_RESULTS_SCHEMA"),
    table = table
  )
  # Append into the existing (pre-created) table rather than replacing it
  DBI::dbWriteTable(
    conn = con,
    name = target,
    value = data,
    append = TRUE,
    overwrite = FALSE
  )
}

# Retrieve SQL query to create Calypso tables: the .sql file uses
# '@resultsDatabaseSchema' as a placeholder, substituted here with the
# configured results schema name
sql <- gsub(
  "@resultsDatabaseSchema",
  Sys.getenv("TEST_DB_RESULTS_SCHEMA"),
  readr::read_file(here::here("dev/omop_analyses/calypso_tables.sql"))
)

# (Re)create the empty Calypso tables in the results schema
create_results_tables(con, sql)

# Load the data in a CDMConnector object
cdm <- CDMConnector::cdm_from_con(
  con = con,
  cdm_schema = Sys.getenv("TEST_DB_CDM_SCHEMA"),
  write_schema = Sys.getenv("TEST_DB_RESULTS_SCHEMA"),
  cdm_name = name
)

# Generate monthly counts and write them to the DB
monthly_counts <- analyse_monthly_counts(cdm)
monthly_counts |>
  write_results(con, "calypso_monthly_counts")

# Generate summary stats and write them to the DB
summary_stats <- analyse_summary_stats(cdm)
summary_stats |>
  write_results(con, "calypso_summary_stats")

# Get the distinct concept ids seen in either result set
ids <- bind_rows(
  { monthly_counts |> select(concept_id) },
  { summary_stats |> select(concept_id) }
) |> distinct()
ids <- ids$concept_id

# Retrieve concept properties from the list of ids and write them to the DB
analyse_concepts(cdm, ids) |>
  write_results(con, "calypso_concepts")

# Clean up: close the connection and remove every object this script
# created, so sourcing it leaves the caller's environment unchanged
DBI::dbDisconnect(con)
rm(create_results_tables)
rm(analyse_concepts)
rm(analyse_monthly_counts)
rm(analyse_summary_stats)
rm(write_results)
rm(monthly_counts)
rm(summary_stats)
rm(ids)
rm(cdm)
rm(con)
rm(sql)
rm(dir)
rm(name)
rm(version)
gc()
BaptisteBR marked this conversation as resolved.
Show resolved Hide resolved

detach("package:tidyverse", unload = TRUE)
28 changes: 28 additions & 0 deletions dev/omop_analyses/calypso_tables.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
-- DDL for the Calypso results tables. '@resultsDatabaseSchema' is a
-- placeholder substituted with the results schema name by the calling
-- R script (dev/omop_analyses/analyse_omop_cdm.R).

-- Drop any previous versions so re-running the analyses starts clean
DROP TABLE IF EXISTS @resultsDatabaseSchema.calypso_concepts;
DROP TABLE IF EXISTS @resultsDatabaseSchema.calypso_monthly_counts;
DROP TABLE IF EXISTS @resultsDatabaseSchema.calypso_summary_stats;

-- Descriptive properties for every concept referenced by the other tables
CREATE TABLE @resultsDatabaseSchema.calypso_concepts (
  concept_id BIGINT,
  concept_name VARCHAR,
  domain_id VARCHAR,
  vocabulary_id VARCHAR,
  concept_class_id VARCHAR,
  standard_concept VARCHAR,
  concept_code VARCHAR
);

-- Per-concept patient counts and records-per-person, by year and month
CREATE TABLE @resultsDatabaseSchema.calypso_monthly_counts (
  concept_id BIGINT,
  date_year INTEGER,
  date_month INTEGER,
  person_count BIGINT,
  records_per_person DOUBLE
);

-- Per-concept summary statistics (one row per concept and attribute,
-- e.g. "mean" or "sd")
CREATE TABLE @resultsDatabaseSchema.calypso_summary_stats (
  concept_id BIGINT,
  summary_attribute VARCHAR,
  value_as_string VARCHAR,
  value_as_number DOUBLE
);
5 changes: 5 additions & 0 deletions dev/test_db/eunomia/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Eunomia data
*.zip

# duckdb databases
*.duckdb
24 changes: 24 additions & 0 deletions dev/test_db/setup_test_db.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Set up the duckdb test database used by the project. Dataset name,
# OMOP version and schema names come from environment variables set in
# .Rprofile (the Eunomia download folder from EUNOMIA_DATA_FOLDER).

# Create a duckdb database from the Eunomia datasets
con <- DBI::dbConnect(
  duckdb::duckdb(
    dbdir = CDMConnector::eunomia_dir(
      dataset_name = Sys.getenv("TEST_DB_NAME"),
      cdm_version = Sys.getenv("TEST_DB_OMOP_VERSION"),
      database_file = tempfile(fileext = ".duckdb")
    )
  )
)

# Use 'cdm_from_con' to load the dataset and verify integrity
CDMConnector::cdm_from_con(
  con = con,
  cdm_schema = Sys.getenv("TEST_DB_CDM_SCHEMA"),
  write_schema = Sys.getenv("TEST_DB_RESULTS_SCHEMA"),
  cdm_name = Sys.getenv("TEST_DB_NAME")
)

# Clean up: close the connection and remove it from the environment
DBI::dbDisconnect(con)
rm(con)
gc()
BaptisteBR marked this conversation as resolved.
Show resolved Hide resolved