Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Summary statistics #22

Merged
merged 11 commits into from
Aug 15, 2024
11 changes: 11 additions & 0 deletions .Rprofile
BaptisteBR marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,14 @@ if (interactive()) {
}

source("renv/activate.R")

# Path to download Eunomia datasets
Sys.setenv(EUNOMIA_DATA_FOLDER = here::here("dev/test_db/eunomia"))
BaptisteBR marked this conversation as resolved.
Show resolved Hide resolved
# Name of the synthetic dataset to use
Sys.setenv(TEST_DB_NAME = "GiBleed")
# OMOP CDM version
Sys.setenv(TEST_DB_OMOP_VERSION = "5.3")
# Schema name for data
Sys.setenv(TEST_DB_CDM_SCHEMA = "main")
# Schema name for results
Sys.setenv(TEST_DB_RESULTS_SCHEMA = "main")
10 changes: 8 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,19 @@ Imports:
DT,
ggplot2,
golem (>= 0.4.1),
shiny (>= 1.9.1)
shiny (>= 1.9.1),
DBI,
duckdb,
glue
Suggests:
devtools,
usethis,
styler,
testthat (>= 3.0.0),
spelling
spelling,
here,
CDMConnector,
lubridate
Encoding: UTF-8
LazyData: true
RoxygenNote: 7.3.2
Expand Down
19 changes: 19 additions & 0 deletions R/fct_connect_to_test_db.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#' connect_to_test_db
#'
#' @description Connect to the test database
#' (using environment variables from .Rprofile)
BaptisteBR marked this conversation as resolved.
Show resolved Hide resolved
#'
#' @return A DBI connection to the test database
#'
#' @importFrom DBI dbConnect
#' @importFrom duckdb duckdb
#' @importFrom glue glue
#'
#' @noRd
connect_to_test_db <- function() {
  # Database location, dataset name and OMOP version are configured
  # via environment variables set in .Rprofile
  data_dir <- Sys.getenv("EUNOMIA_DATA_FOLDER")
  db_name <- Sys.getenv("TEST_DB_NAME")
  omop_version <- Sys.getenv("TEST_DB_OMOP_VERSION")
  # Assemble the path to the duckdb file and open a DBI connection to it
  db_path <- glue("{data_dir}/{db_name}_{omop_version}_1.0.duckdb")
  dbConnect(duckdb(dbdir = db_path))
}
8 changes: 7 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,14 @@ as it has good support for R package development and Shiny.
install.packages("renv")
renv::restore()
```
3. Create the [duckdb](https://github.com/duckdb/duckdb) test database and run the analyses by running from an R console in the project directory (test dataset properties can be updated in the [`.Rprofile`](https://github.com/UCLH-Foundry/omop-data-catalogue/blob/main/.Rprofile) file):

3. To preview the app locally, run the following from an R console within the project directory:
```r
source(here::here("dev/test_db/setup_test_db.R"))
source(here::here("dev/omop_analyses/analyse_omop_cdm.R"))
```

4. To preview the app locally, run the following from an R console within the project directory:

```r
golem::run_dev()
Expand Down
198 changes: 198 additions & 0 deletions dev/omop_analyses/analyse_omop_cdm.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,198 @@
# Analyse the OMOP CDM test database and write summary ("calypso_*")
# tables into the results schema. Expects the environment variables read
# below to be set (see .Rprofile) and the test database to already exist
# (see dev/test_db/setup_test_db.R).

library(tidyverse)

# Test database folder, dataset name and OMOP CDM version (from .Rprofile)
dir <- Sys.getenv("EUNOMIA_DATA_FOLDER")
name <- Sys.getenv("TEST_DB_NAME")
version <- Sys.getenv("TEST_DB_OMOP_VERSION")

# Connect to the duckdb test database
con <- DBI::dbConnect(duckdb::duckdb(
  dbdir = glue::glue("{dir}/{name}_{version}_1.0.duckdb")))

# Function to execute one or more SQL queries and clear results
# Run one or more SQL statements against a connection, discarding any
# result. Used to execute the DDL that (re)creates the Calypso tables.
create_results_tables <- function(con, sql) {
  res <- DBI::dbSendStatement(con, sql)
  # Release the result object so the connection is left clean
  DBI::dbClearResult(res)
}

# Build the 'calypso_concepts' table: one row per requested concept id,
# with descriptive properties taken from the OMOP 'concept' table.
analyse_concepts <- function(cdm, concepts) {
  # Restrict the concept table to the requested ids
  wanted <- cdm$concept |>
    filter(concept_id %in% concepts)
  # Keep only the columns expected by the results schema,
  # then materialise the lazy query into a local data frame
  wanted |>
    select(
      concept_id,
      concept_name,
      vocabulary_id,
      domain_id,
      concept_class_id,
      standard_concept,
      concept_code
    ) |>
    collect()
}

# Function to produce the 'calypso_monthly_counts' table: for every
# clinical table in the CDM, the number of patients and the records per
# patient for each (concept, year, month) combination.
analyse_monthly_counts <- function(cdm) {
  # Analyse one CDM table for each month. `concept` and `date` are
  # unquoted column names injected with {{ }} (tidy evaluation).
  analyse_table <- function(table, concept, date) {
    # Extract year and month from date column; also copy the
    # concept column under the common name 'concept_id'
    table <- table |>
      mutate(
        concept_id = {{ concept }},
        date_year = lubridate::year({{ date }}),
        date_month = lubridate::month({{ date }})
      )
    # Total number of records per (year, month, concept).
    # NOTE(review): collect() pulls rows into R before grouping; presumably
    # fine for the small test dataset — confirm before reusing on real data.
    total_count <- table |>
      select(concept_id, date_year, date_month) |>
      collect() |>
      group_by(date_year, date_month, concept_id) |>
      count(name = "total_count")
    # Number of unique patients per (year, month, concept)
    person_count <- table |>
      select(concept_id, date_year, date_month, person_id) |>
      collect() |>
      group_by(date_year, date_month, concept_id) |>
      reframe(person_count = n_distinct(person_id))
    # One row per (year, month, concept): join the two counts and
    # derive the average number of records per person
    table |>
      select(concept_id, date_year, date_month) |>
      distinct() |>
      collect() |>
      inner_join(total_count, join_by(date_year, date_month, concept_id)) |>
      inner_join(person_count, join_by(date_year, date_month, concept_id)) |>
      mutate(records_per_person = (total_count / person_count)) |>
      select(
        concept_id,
        date_year,
        date_month,
        person_count,
        records_per_person
      )
  }
  # Combine results for all clinical tables, each with its own
  # concept and event-date columns
  bind_rows(
    cdm$condition_occurrence |> analyse_table(condition_concept_id, condition_start_date),
    cdm$drug_exposure |> analyse_table(drug_concept_id, drug_exposure_start_date),
    cdm$procedure_occurrence |> analyse_table(procedure_concept_id, procedure_date),
    cdm$device_exposure |> analyse_table(device_concept_id, device_exposure_start_date),
    cdm$measurement |> analyse_table(measurement_concept_id, measurement_date),
    cdm$observation |> analyse_table(observation_concept_id, observation_date),
    cdm$specimen |> analyse_table(specimen_concept_id, specimen_date)
  )
}

# Build the 'calypso_summary_stats' table: per-concept mean and standard
# deviation of the numeric values in measurement and observation.
analyse_summary_stats <- function(cdm) {
  # Summarise one numeric column: for each concept id, emit a "mean" row
  # and an "sd" row in the long format expected by the results table.
  # `concept` and `value` are unquoted column names injected with {{ }}.
  analyse_numeric_column <- function(table, concept, value) {
    # Standardise the column names and drop missing values,
    # materialising the lazy query into a local data frame
    values <- table |>
      select(concept_id = {{ concept }}, value = {{ value }}) |>
      filter(!is.na(value)) |>
      collect()
    # Per-concept mean
    means <- values |>
      group_by(concept_id) |>
      reframe(
        summary_attribute = "mean",
        value_as_number = mean(value)
      )
    # Per-concept standard deviation
    sds <- values |>
      group_by(concept_id) |>
      reframe(
        summary_attribute = "sd",
        value_as_number = sd(value)
      )
    # All means first, then all standard deviations
    bind_rows(means, sds)
  }
  # Combine results for both numeric value columns
  bind_rows(
    cdm$measurement |> analyse_numeric_column(measurement_concept_id, value_as_number),
    cdm$observation |> analyse_numeric_column(observation_concept_id, value_as_number)
  )
}

# Append a data frame to a table in the results schema
# (schema name comes from the TEST_DB_RESULTS_SCHEMA environment variable).
write_results <- function(data, con, table) {
  # Fully qualify the destination table with its schema
  target <- DBI::Id(
    schema = Sys.getenv("TEST_DB_RESULTS_SCHEMA"),
    table = table
  )
  # Append into the existing (pre-created) table rather than replacing it
  DBI::dbWriteTable(
    conn = con,
    name = target,
    value = data,
    append = TRUE,
    overwrite = FALSE
  )
}

# Retrieve SQL query to create Calypso tables: the .sql file uses
# '@resultsDatabaseSchema' as a placeholder, substituted here with the
# configured results schema name
sql <- gsub(
  "@resultsDatabaseSchema",
  Sys.getenv("TEST_DB_RESULTS_SCHEMA"),
  readr::read_file(here::here("dev/omop_analyses/calypso_tables.sql"))
)

# (Re)create the empty Calypso tables in the results schema
create_results_tables(con, sql)

# Load the data in a CDMConnector object
cdm <- CDMConnector::cdm_from_con(
  con = con,
  cdm_schema = Sys.getenv("TEST_DB_CDM_SCHEMA"),
  write_schema = Sys.getenv("TEST_DB_RESULTS_SCHEMA"),
  cdm_name = name
)

# Generate monthly counts and write them to the DB
monthly_counts <- analyse_monthly_counts(cdm)
monthly_counts |>
  write_results(con, "calypso_monthly_counts")

# Generate summary stats and write them to the DB
summary_stats <- analyse_summary_stats(cdm)
summary_stats |>
  write_results(con, "calypso_summary_stats")

# Get the distinct concept ids seen in either result set
ids <- bind_rows(
  { monthly_counts |> select(concept_id) },
  { summary_stats |> select(concept_id) }
) |> distinct()
ids <- ids$concept_id

# Retrieve concept properties from the list of ids and write them to the DB
analyse_concepts(cdm, ids) |>
  write_results(con, "calypso_concepts")

# Clean up: close the connection and remove every object this script
# created, so sourcing it leaves the caller's environment unchanged
DBI::dbDisconnect(con)
rm(create_results_tables)
rm(analyse_concepts)
rm(analyse_monthly_counts)
rm(analyse_summary_stats)
rm(write_results)
rm(monthly_counts)
rm(summary_stats)
rm(ids)
rm(cdm)
rm(con)
rm(sql)
rm(dir)
rm(name)
rm(version)
gc()
BaptisteBR marked this conversation as resolved.
Show resolved Hide resolved

detach("package:tidyverse", unload = TRUE)
28 changes: 28 additions & 0 deletions dev/omop_analyses/calypso_tables.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
-- DDL for the Calypso results tables. '@resultsDatabaseSchema' is a
-- placeholder substituted with the results schema name by the calling
-- R script (dev/omop_analyses/analyse_omop_cdm.R).

-- Drop any previous versions so re-running the analyses starts clean
DROP TABLE IF EXISTS @resultsDatabaseSchema.calypso_concepts;
DROP TABLE IF EXISTS @resultsDatabaseSchema.calypso_monthly_counts;
DROP TABLE IF EXISTS @resultsDatabaseSchema.calypso_summary_stats;

-- Descriptive properties for every concept referenced by the other tables
CREATE TABLE @resultsDatabaseSchema.calypso_concepts (
  concept_id BIGINT,
  concept_name VARCHAR,
  domain_id VARCHAR,
  vocabulary_id VARCHAR,
  concept_class_id VARCHAR,
  standard_concept VARCHAR,
  concept_code VARCHAR
);

-- Per-concept patient counts and records-per-person, by year and month
CREATE TABLE @resultsDatabaseSchema.calypso_monthly_counts (
  concept_id BIGINT,
  date_year INTEGER,
  date_month INTEGER,
  person_count BIGINT,
  records_per_person DOUBLE
);

-- Per-concept summary statistics (one row per concept and attribute,
-- e.g. "mean" or "sd")
CREATE TABLE @resultsDatabaseSchema.calypso_summary_stats (
  concept_id BIGINT,
  summary_attribute VARCHAR,
  value_as_string VARCHAR,
  value_as_number DOUBLE
);
5 changes: 5 additions & 0 deletions dev/test_db/eunomia/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Eunomia data
*.zip

# duckdb databases
*.duckdb
24 changes: 24 additions & 0 deletions dev/test_db/setup_test_db.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Set up the duckdb test database used by the project. Dataset name,
# OMOP version and schema names come from environment variables set in
# .Rprofile (the Eunomia download folder from EUNOMIA_DATA_FOLDER).

# Create a duckdb database from the Eunomia datasets
con <- DBI::dbConnect(
  duckdb::duckdb(
    dbdir = CDMConnector::eunomia_dir(
      dataset_name = Sys.getenv("TEST_DB_NAME"),
      cdm_version = Sys.getenv("TEST_DB_OMOP_VERSION"),
      database_file = tempfile(fileext = ".duckdb")
    )
  )
)

# Use 'cdm_from_con' to load the dataset and verify integrity
CDMConnector::cdm_from_con(
  con = con,
  cdm_schema = Sys.getenv("TEST_DB_CDM_SCHEMA"),
  write_schema = Sys.getenv("TEST_DB_RESULTS_SCHEMA"),
  cdm_name = Sys.getenv("TEST_DB_NAME")
)

# Clean up: close the connection and remove it from the environment
DBI::dbDisconnect(con)
rm(con)
gc()
BaptisteBR marked this conversation as resolved.
Show resolved Hide resolved