Commit

Updates to generate production data on GAE (#87)
milanmlft authored Oct 18, 2024
1 parent 77aa674 commit ed05e2c
Showing 27 changed files with 575 additions and 162 deletions.
3 changes: 3 additions & 0 deletions .github/workflows/docker-build.yml
@@ -7,13 +7,16 @@ on:
pull_request:
branches: ["main"]
paths: ["deploy/**"]
workflow_dispatch:

jobs:
build:
runs-on: ubuntu-latest

steps:
- uses: actions/checkout@v4
- name: Add dummy .Renviron file
run: touch deploy/.Renviron
- name: Build the Docker image
run: docker compose build
working-directory: deploy/
3 changes: 3 additions & 0 deletions .pre-commit-config.yaml
@@ -28,6 +28,8 @@ repos:
- rlang
- SAFEHR-data/omop-bundles
- plotly
- markdown
- CDMConnector
# codemeta must be above use-tidy-description when both are used
# - id: codemeta-description-updated
- id: use-tidy-description
@@ -38,6 +40,7 @@ repos:
- id: no-print-statement
- id: no-debug-statement
- id: deps-in-desc
exclude: ^scripts/
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.6.0
hooks:
9 changes: 5 additions & 4 deletions DESCRIPTION
@@ -20,7 +20,6 @@ Imports:
golem (>= 0.4.1),
shiny (>= 1.9.1),
DBI,
duckdb,
glue,
tidyr,
withr,
@@ -35,19 +34,21 @@ Imports:
plotly (>= 4.10.4),
purrr,
htmltools,
markdown
markdown,
CDMConnector,
RPostgres
Suggests:
devtools,
usethis,
styler,
testthat (>= 3.0.0),
spelling,
here,
CDMConnector,
lintr,
dbplyr,
RSQLite,
precommit
precommit,
duckdb
Remotes: SAFEHR-data/omop-bundles
Encoding: UTF-8
LazyData: true
4 changes: 4 additions & 0 deletions NAMESPACE
@@ -3,6 +3,10 @@
export(calculate_monthly_counts)
export(calculate_summary_stats)
export(connect_to_db)
export(connect_to_test_duckdb)
export(process_monthly_counts)
export(process_summary_stats)
export(query_concepts_table)
export(read_parquet_sorted)
export(run_app)
export(write_table)
14 changes: 8 additions & 6 deletions R/run_app.R
@@ -16,7 +16,7 @@ run_app <- function(
# Synchronise environment variable settings and golem options for running in prod
if (get_golem_config("app_prod")) {
options("golem.app.prod" = TRUE)
.check_env()
.check_envvars("OMOPCAT_DATA_PATH")
}

with_golem_options(
@@ -32,10 +32,12 @@
)
}

.check_env <- function() {
required <- "OMOPCAT_DATA_PATH"
missing <- required[!required %in% names(Sys.getenv())]
if (length(missing) > 0) {
cli::cli_abort("The following environment variables are missing: {.envvar {missing}}")
.check_envvars <- function(required) {
missing <- Sys.getenv(required) == ""
if (any(missing)) {
cli::cli_abort(c(
"x" = "Environment variable{?s} {.envvar {required[missing]}} not set",
"i" = "Make sure to define the environment variables (e.g. in a local {.file .Renviron} file)"
), call = rlang::caller_env())
}
}
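
For illustration only (not part of this commit), the reworked helper accepts a character vector, so several variables can be checked in one call; the variable names below are arbitrary examples:

.check_envvars(c("DB_NAME", "DB_PASSWORD"))  # aborts, listing whichever of these are unset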
107 changes: 101 additions & 6 deletions R/utils_preprocessing_db.R
@@ -1,3 +1,21 @@
# nocov start

#' Connect to a database
#'
#' General helper to connect to a database through [`DBI::dbConnect()`], while ensuring
#' that the connection is closed when the connection object goes out of scope.
#'
#' @param ... arguments passed on to [`DBI::dbConnect()`]
#' @param .envir passed on to [`withr::defer()`]
#'
#' @return A [`DBI::DBIConnection-class`] object
#' @export
connect_to_db <- function(..., .envir = parent.frame()) {
con <- DBI::dbConnect(...)
withr::defer(DBI::dbDisconnect(con), envir = .envir)
con
}
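
A minimal usage sketch for this helper (illustrative, not from this commit): the wrapping function and query are hypothetical, and the deferred `dbDisconnect()` fires when `load_person_count()` returns.

load_person_count <- function(db_path) {
  # The connection is registered for cleanup in this function's frame,
  # so it is closed automatically when the function exits.
  con <- connect_to_db(duckdb::duckdb(dbdir = db_path))
  DBI::dbGetQuery(con, "SELECT COUNT(*) AS n FROM person")
}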

#' Connect to duckdb database
#'
#' @param db_path path to the duckdb database file
@@ -6,17 +24,14 @@
#'
#' @return A [`DBI::DBIConnection-class`] object
#' @export
connect_to_db <- function(db_path, ..., .envir = parent.frame()) {
connect_to_test_duckdb <- function(db_path, ..., .envir = parent.frame()) {
if (!file.exists(db_path)) {
cli::cli_abort("Database file {.file {db_path}} not found")
}

# Connect to the duckdb test database
con <- DBI::dbConnect(
duckdb::duckdb(dbdir = db_path)
)
withr::defer(DBI::dbDisconnect(con), envir = .envir)
con
rlang::check_installed("duckdb", reason = "to set up test database connection")
connect_to_db(duckdb::duckdb(dbdir = db_path), .envir = .envir)
}


@@ -55,3 +70,83 @@ read_parquet_sorted <- function(path, options = nanoparquet::parquet_options())
nanoparquet::read_parquet(path, options) |>
arrange(across(everything()))
}

#' Function to produce the 'omopcat_concepts' table from a list of concept ids
#'
#' @param cdm A [`CDMConnector`] object, e.g. from [`CDMConnector::cdm_from_con()`]
#' @param concepts A vector of concept IDs
#'
#' @return A `data.frame` with the concept table
#' @export
query_concepts_table <- function(cdm, concepts) {
# Extract columns from concept table
cdm$concept |>
filter(.data$concept_id %in% concepts) |>
select(
"concept_id",
"concept_name",
"vocabulary_id",
"domain_id",
"concept_class_id",
"standard_concept",
"concept_code"
) |>
collect()
}

#' Generate the 'omopcat_monthly_counts' table
#'
#' @param cdm A [`CDMConnector`] object, e.g. from [`CDMConnector::cdm_from_con()`]
#'
#' @return A `data.frame` with the monthly counts
#' @export
process_monthly_counts <- function(cdm) {
# Combine results for all tables
out <- bind_rows( # nolint start
cdm$condition_occurrence |> calculate_monthly_counts(condition_concept_id, condition_start_date),
cdm$drug_exposure |> calculate_monthly_counts(drug_concept_id, drug_exposure_start_date),
cdm$procedure_occurrence |> calculate_monthly_counts(procedure_concept_id, procedure_date),
cdm$device_exposure |> calculate_monthly_counts(device_concept_id, device_exposure_start_date),
cdm$measurement |> calculate_monthly_counts(measurement_concept_id, measurement_date),
cdm$observation |> calculate_monthly_counts(observation_concept_id, observation_date),
cdm$specimen |> calculate_monthly_counts(specimen_concept_id, specimen_date)
) # nolint end

# Map concept names to the concept IDs
concept_names <- select(cdm$concept, .data$concept_id, .data$concept_name) |>
filter(.data$concept_id %in% out$concept_id) |>
collect()
out |>
dplyr::left_join(concept_names, by = c("concept_id" = "concept_id")) |>
select("concept_id", "concept_name", everything())
}

#' Generate the 'omopcat_summary_stats' table
#'
#' @param cdm A [`CDMConnector`] object, e.g. from [`CDMConnector::cdm_from_con()`]
#'
#' @return A `data.frame` with the summary statistics
#' @export
process_summary_stats <- function(cdm) {
table_names <- c("measurement", "observation")
concept_names <- c("measurement_concept_id", "observation_concept_id")

# Combine results for all tables
stats <- purrr::map2(table_names, concept_names, ~ calculate_summary_stats(cdm[[.x]], .y))
stats <- bind_rows(stats)

# Map concept names to the concept_ids
concept_names <- select(cdm$concept, "concept_id", "concept_name") |>
filter(.data$concept_id %in% c(stats$concept_id, stats$value_as_concept_id)) |>
collect()
stats |>
# Order is important here, first we get the names for the value_as_concept_ids
# from the categorical data summaries and record it as `value_as_string`
dplyr::left_join(concept_names, by = c("value_as_concept_id" = "concept_id")) |>
rename(value_as_string = "concept_name") |>
# Then we get the names for the main concept_ids
dplyr::left_join(concept_names, by = c("concept_id" = "concept_id")) |>
select("concept_id", "concept_name", !"value_as_concept_id")
}

# nocov end
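
Taken together, a hedged sketch of how these helpers could be exercised end to end against a local duckdb test database (not code from this commit; the file path and schema names are assumptions):

# Connect to a bundled test database and build a CDM reference
con <- connect_to_test_duckdb("data/test_db/omopcat_test.duckdb")
cdm <- CDMConnector::cdm_from_con(con, cdm_schema = "main", write_schema = "main")
# Run the new processing helpers
monthly <- process_monthly_counts(cdm)
stats <- process_summary_stats(cdm)
concepts <- query_concepts_table(cdm, unique(monthly$concept_id))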
5 changes: 4 additions & 1 deletion R/utils_preprocessing_summarise.R
@@ -68,7 +68,10 @@ calculate_summary_stats <- function(omop_table, concept_name) {
categorical_concepts <- filter(omop_table, !is.null(value_as_concept_id) & value_as_concept_id != 0)

numeric_stats <- .summarise_numeric_concepts(numeric_concepts) |> collect()
categorical_stats <- .summarise_categorical_concepts(categorical_concepts) |> collect()
categorical_stats <- .summarise_categorical_concepts(categorical_concepts) |>
# Convert value_as_number to double to make it compatible with numeric stats
mutate(value_as_number = as.double(.data$value_as_number)) |>
collect()
bind_rows(numeric_stats, categorical_stats)
}

7 changes: 7 additions & 0 deletions deploy/.Renviron.sample
@@ -0,0 +1,7 @@
OMOPCAT_DATA_PATH=
DB_NAME=
HOST=
PORT=
DB_USERNAME=
DB_PASSWORD=
DB_CDM_SCHEMA=
1 change: 0 additions & 1 deletion deploy/.env.sample

This file was deleted.

25 changes: 17 additions & 8 deletions deploy/Dockerfile
@@ -1,18 +1,27 @@
FROM rocker/shiny-verse:4.4.1

WORKDIR /app
COPY renv.lock.prod renv.lock
COPY deploy/renv.lock.prod renv.lock

# Install renv and restore environment
# omopbundles is installed separately as renv is giving problems
# with GitHub packages
RUN R -e 'remotes::install_github("SAFEHR-data/omop-bundles")'
RUN install2.r --error --skipinstalled renv && \
R -e 'renv::restore()'
R -e 'renv::restore(exclude = "omopbundles")'

COPY omopcat_*.tar.gz /app.tar.gz
RUN R -e 'remotes::install_local("/app.tar.gz", upgrade="never")' && \
COPY deploy/omopcat_*.tar.gz /app.tar.gz
RUN R -e 'remotes::install_local("/app.tar.gz", upgrade="never", dependencies = FALSE)' && \
rm /app.tar.gz

EXPOSE 3838
CMD ["R", "-e", "options('shiny.port'=3838,shiny.host='0.0.0.0');library(omopcat);omopcat::run_app()"]
ARG OMOPCAT_DATA_PATH

ADD scripts ./scripts
COPY deploy/.Renviron .Renviron

# to build: docker build -t omopcat .
# to run: docker run -p 3838:3838 omopcat
EXPOSE 3838
CMD ["R", "-e", \
"options('shiny.port'=3838,shiny.host='0.0.0.0'); \
source('scripts/create_prod_data.R'); \
omopcat::run_app()" \
]
27 changes: 24 additions & 3 deletions deploy/README.md
@@ -26,6 +26,30 @@ renv::snapshot(project = ".", lockfile = "./deploy/renv.lock.prod", type = "expl
This `renv.lock.prod` file will be a subset of the `renv.lock` that is in the package root. The
latter also includes development dependencies, which are not necessary to run the app in production.

## Populate the `data/prod_data` directory

Running the production version of the app requires populating the
[`data/prod_data`](../data/prod_data/) directory with
the necessary `parquet` files (see [`data/test_data`](../data/test_data/) for an example).

We provide the [`scripts/create_prod_data.R`](../scripts/create_prod_data.R)
script to facilitate this. The script is run automatically when the Docker container starts
if the mounted data directory is empty.

A few environment variables are required to run this script:

* `DB_NAME`: the name of the database to connect to
* `HOST`: the host of the database
* `PORT`: the port on which to connect to the database
* `DB_USERNAME`: the username to connect to the database
* `DB_PASSWORD`: the password to connect to the database
* `DB_CDM_SCHEMA`: the schema of the CDM database; note that the user needs both read and write
  permissions on this schema in order to use the
  [`CDMConnector`](https://darwin-eu.github.io/CDMConnector/index.html) package

These should be defined in a local `.Renviron` file (not git-tracked) in the `deploy/` directory.
See the `.Renviron.sample` file for a template.
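
As a rough sketch (assumptions only; the actual `create_prod_data.R` script is not reproduced here), these variables would feed a database connection along the following lines, assuming a PostgreSQL backend via `RPostgres`:

con <- omopcat::connect_to_db(
  RPostgres::Postgres(),
  dbname = Sys.getenv("DB_NAME"),
  host = Sys.getenv("HOST"),
  port = as.integer(Sys.getenv("PORT")),
  user = Sys.getenv("DB_USERNAME"),
  password = Sys.getenv("DB_PASSWORD")
)
cdm <- CDMConnector::cdm_from_con(
  con,
  cdm_schema = Sys.getenv("DB_CDM_SCHEMA"),
  write_schema = Sys.getenv("DB_CDM_SCHEMA")
)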

## Build Docker images and run the app

To launch the test version of the app, run:
@@ -43,9 +67,6 @@ To launch the production version of the app, run:
docker compose up -d --build
```

Note that this will require to populate the [`data/prod_data`](../data/prod_data/) directory with
the necessary `parquet` files (see [`data/test_data`](../data/test_data/ for an example).

This will build the container and install the necessary dependencies to run the app.
The `-d` flag runs the `docker compose` command in "detached" mode, meaning the app will be run
in the background and you can safely quit your terminal session.
5 changes: 4 additions & 1 deletion deploy/docker-compose.yml
@@ -1,10 +1,13 @@
services:
omopcat:
build:
context: .
# Use repo root as context so we can copy scripts directory to container
context: ..
dockerfile: deploy/Dockerfile
args:
HTTP_PROXY: ${HTTP_PROXY}
HTTPS_PROXY: ${HTTPS_PROXY}
OMOPCAT_DATA_PATH: /etc/omopcat/data
image: omopcat:latest
platform: linux/amd64
restart: unless-stopped
Binary file added deploy/omopcat_0.2.0.9000.tar.gz
Binary file removed deploy/omopcat_0.2.0.tar.gz