diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..98ec895 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,2 @@ +**/renv +**/.Rprofile diff --git a/.env.sample b/.env.sample new file mode 100644 index 0000000..3caeb91 --- /dev/null +++ b/.env.sample @@ -0,0 +1,12 @@ +ENV=test +DATA_VOLUME_PATH=./data/test_data + +# For preprocessing +PREPROCESS_DB_NAME= # name of the source database +PREPROCESS_HOST= # host address for the source database +PREPROCESS_PORT= # port on which to connect to the source database +PREPROCESS_DB_USERNAME= # username for the source database +PREPROCESS_DB_PASSWORD= # password for the source database + +# For testing +TEST_DB_PATH=./data-raw/test_db/eunomia diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index 139ccee..dd8a38b 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -6,7 +6,7 @@ on: pull_request: branches: [main, master] -name: R-CMD-check.yaml +name: R-CMD-check permissions: read-all @@ -18,9 +18,14 @@ concurrency: jobs: R-CMD-check: runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + package_dir: [app, preprocessing] env: GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} R_KEEP_PKG_SOURCE: yes + steps: - uses: actions/checkout@v4 @@ -29,12 +34,16 @@ jobs: use-public-rspm: true - uses: r-lib/actions/setup-renv@v2 + with: + working-directory: ${{ matrix.package_dir }} - name: Install rcmdcheck run: install.packages("rcmdcheck") shell: Rscript {0} + working-directory: ${{ matrix.package_dir }} - uses: r-lib/actions/check-r-package@v2 with: upload-snapshots: true build_args: 'c("--no-manual","--compact-vignettes=gs+qpdf")' + working-directory: ${{ matrix.package_dir }} diff --git a/.github/workflows/docker-build.yml b/.github/workflows/docker-build.yml index 871de19..932b5f5 100644 --- a/.github/workflows/docker-build.yml +++ b/.github/workflows/docker-build.yml @@ -3,20 +3,28 @@ name: Build Docker Image on: push: branches: ["main"] - paths: ["deploy/**"] + paths: ["docker-compose.yml", "app/Dockerfile", "preprocessing/Dockerfile"] pull_request: branches: ["main"] - paths: ["deploy/**"] + paths: ["docker-compose.yml", "app/Dockerfile", "preprocessing/Dockerfile"] workflow_dispatch: +# Only run actions on the most recent push to a branch +concurrency: + group: "${{ github.workflow }}-${{ github.head_ref }}" + cancel-in-progress: true + jobs: build: runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + image: [omopcat, preprocess] steps: - uses: actions/checkout@v4 - - name: Add dummy .Renviron file - run: touch deploy/.Renviron - name: Build the Docker image - run: docker compose build - working-directory: deploy/ + run: | + cp .env.sample .env + docker compose build ${{ matrix.image }} diff --git a/.github/workflows/lint.yaml b/.github/workflows/lint.yaml index 7d9ce47..6a54b1a 100644 --- a/.github/workflows/lint.yaml +++ b/.github/workflows/lint.yaml @@ -6,7 +6,7 @@ on: pull_request: branches: [main, master] -name: lint.yaml +name: lint permissions: read-all @@ -18,6 +18,10 @@ concurrency: jobs: lint: runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + package_dir: [app, preprocessing] env: GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} steps: @@ -28,15 +32,19 @@ jobs: use-public-rspm: true - uses: r-lib/actions/setup-renv@v2 + with: + working-directory: ${{ matrix.package_dir }} - name: Install lintr run: install.packages("lintr") shell: Rscript {0} + working-directory: ${{ matrix.package_dir }} - name: Lint run: | 
pkgload::load_all() lintr::lint_package() shell: Rscript {0} + working-directory: ${{ matrix.package_dir }} env: LINTR_ERROR_ON_LINT: true diff --git a/.github/workflows/pr-commands.yaml b/.github/workflows/pr-commands.yaml deleted file mode 100644 index 1f5b8bb..0000000 --- a/.github/workflows/pr-commands.yaml +++ /dev/null @@ -1,91 +0,0 @@ -# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples -# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help -on: - issue_comment: - types: [created] - -name: pr-commands.yaml - -permissions: read-all - -# Only run actions on the most recent push to a branch -concurrency: - group: "${{ github.workflow }}-${{ github.head_ref }}" - cancel-in-progress: true - -jobs: - document: - if: ${{ github.event.issue.pull_request && (github.event.comment.author_association == 'MEMBER' || github.event.comment.author_association == 'OWNER') && startsWith(github.event.comment.body, '/document') }} - name: document - runs-on: ubuntu-latest - env: - GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} - permissions: - contents: write - steps: - - uses: actions/checkout@v4 - - - uses: r-lib/actions/pr-fetch@v2 - with: - repo-token: ${{ secrets.GITHUB_TOKEN }} - - - uses: r-lib/actions/setup-r@v2 - with: - use-public-rspm: true - - - uses: r-lib/actions/setup-renv@v2 - - - name: Install roxygen2 - run: install.packages("roxygen2") - shell: Rscript {0} - - - name: Document - run: roxygen2::roxygenise() - shell: Rscript {0} - - - name: commit - run: | - git config --local user.name "$GITHUB_ACTOR" - git config --local user.email "$GITHUB_ACTOR@users.noreply.github.com" - git add man/\* NAMESPACE - git commit -m 'Document' - - - uses: r-lib/actions/pr-push@v2 - with: - repo-token: ${{ secrets.GITHUB_TOKEN }} - - style: - if: ${{ github.event.issue.pull_request && (github.event.comment.author_association == 'MEMBER' || github.event.comment.author_association == 'OWNER') && startsWith(github.event.comment.body, '/style') }} - name: style - runs-on: ubuntu-latest - env: - GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} - permissions: - contents: write - steps: - - uses: actions/checkout@v4 - - - uses: r-lib/actions/pr-fetch@v2 - with: - repo-token: ${{ secrets.GITHUB_TOKEN }} - - - uses: r-lib/actions/setup-r@v2 - - - name: Install dependencies - run: install.packages("styler") - shell: Rscript {0} - - - name: Style - run: styler::style_pkg() - shell: Rscript {0} - - - name: commit - run: | - git config --local user.name "$GITHUB_ACTOR" - git config --local user.email "$GITHUB_ACTOR@users.noreply.github.com" - git add \*.R - git commit -m 'Style' - - - uses: r-lib/actions/pr-push@v2 - with: - repo-token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/test-coverage.yaml b/.github/workflows/test-coverage.yaml index 7f7abaa..2fda2dd 100644 --- a/.github/workflows/test-coverage.yaml +++ b/.github/workflows/test-coverage.yaml @@ -6,13 +6,17 @@ on: pull_request: branches: [main, master] -name: test-coverage.yaml +name: test-coverage permissions: read-all jobs: test-coverage: runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + package_dir: [app, preprocessing] env: GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} @@ -24,10 +28,13 @@ jobs: use-public-rspm: true - uses: r-lib/actions/setup-renv@v2 + with: + working-directory: ${{ matrix.package_dir }} - name: Install dependencies run: install.packages("covr", "xml2") shell: Rscript {0} + working-directory: ${{ matrix.package_dir }} - name: Test coverage run: | @@ -38,6 +45,7 
@@ jobs: ) covr::to_cobertura(cov) shell: Rscript {0} + working-directory: ${{ matrix.package_dir }} - uses: codecov/codecov-action@v4 with: @@ -46,6 +54,7 @@ jobs: plugin: noop disable_search: true token: ${{ secrets.CODECOV_TOKEN }} + directory: ${{ matrix.package_dir }} - name: Show testthat output if: always() @@ -53,6 +62,7 @@ jobs: ## -------------------------------------------------------------------- find '${{ runner.temp }}/package' -name 'testthat.Rout*' -exec cat '{}' \; || true shell: bash + working-directory: ${{ matrix.package_dir }} - name: Upload test results if: failure() diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 6e03a97..24f2501 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -7,10 +7,12 @@ repos: - id: style-files args: [--style_pkg=styler, --style_fun=tidyverse_style] - id: roxygenize + args: [--root=./app] additional_dependencies: - bslib - config - DT + - fs - ggplot2 - golem - shiny @@ -35,6 +37,7 @@ repos: # codemeta must be above use-tidy-description when both are used # - id: codemeta-description-updated - id: use-tidy-description + args: [--root=./app] # - id: lintr # disabling lintr hook as it doesn't seem to correctly load the package # see also https://github.com/lorenzwalthert/precommit/issues/440 - id: parsable-R @@ -42,7 +45,8 @@ repos: - id: no-print-statement - id: no-debug-statement - id: deps-in-desc - exclude: ^scripts/ + args: [--root=./app] + exclude: "^scripts/|^app/dev/" - repo: https://github.com/pre-commit/pre-commit-hooks rev: v4.6.0 hooks: @@ -52,11 +56,6 @@ repos: files: '^\.Rbuildignore$' - id: end-of-file-fixer exclude: '\.Rd' - - repo: https://github.com/pre-commit-ci/pre-commit-ci-config - rev: v1.6.1 - hooks: - # Only required when https://pre-commit.ci is used for config validation - - id: check-pre-commit-ci-config - repo: local hooks: - id: forbid-to-commit diff --git a/R/utils_preprocessing_db.R b/R/utils_preprocessing_db.R deleted file mode 100644 index f725d25..0000000 --- a/R/utils_preprocessing_db.R +++ /dev/null @@ -1,152 +0,0 @@ -# nocov start - -#' Connec to a database -#' -#' General helper to connect to a databae through [`DBI::dbConnect()`], while ensuring -#' that the connection is closed when the connection object goes out of scope. -#' -#' @param ... arguments passed on to [`DBI::dbConnect()`] -#' @param .envir passed on to [`withr::defer()`] -#' -#' @return A [`DBI::DBIConnection-class`] object -#' @export -connect_to_db <- function(..., .envir = parent.frame()) { - con <- DBI::dbConnect(...) - withr::defer(DBI::dbDisconnect(con), envir = .envir) - con -} - -#' Connect to duckdb database -#' -#' @param db_path path to the duckdb database file -#' @param ... 
unused -#' @param .envir passed on to [`withr::defer()`] -#' -#' @return A [`DBI::DBIConnection-class`] object -#' @export -connect_to_test_duckdb <- function(db_path, ..., .envir = parent.frame()) { - if (!file.exists(db_path)) { - cli::cli_abort("Database file {.file {db_path}} not found") - } - - # Connect to the duckdb test database - rlang::check_installed("duckdb", reason = "to set up test database connection") - connect_to_db(duckdb::duckdb(dbdir = db_path), .envir = .envir) -} - - -#' Write data to a table in the database -#' -#' @param data data.frame, data to be written to the table -#' @param con A [`DBI::DBIConnection-class`] object -#' @param table character, name of the table to write to -#' @param schema character, name of the schema to be used -#' -#' @return `TRUE`, invisibly, if the operation was successful -#' @export -write_table <- function(data, con, table, schema) { - DBI::dbWriteTable( - conn = con, - name = DBI::Id(schema = schema, table = table), - value = data, - overwrite = TRUE - ) -} - - -#' Read a parquet table and sort the results -#' -#' @param path path to the parquet file to be read -#' @inheritParams nanoparquet::read_parquet -#' -#' @return A `data.frame` with the results sorted by all columns -#' @export -#' @importFrom dplyr arrange across everything -read_parquet_sorted <- function(path, options = nanoparquet::parquet_options()) { - if (!file.exists(path)) { - cli::cli_abort("File {.file {path}} not found") - } - - nanoparquet::read_parquet(path, options) |> - arrange(across(everything())) -} - -#' Function to produce the 'omopcat_concepts' table from a list of concept ids -#' -#' @param cdm A [`CDMConnector`] object, e.g. from [`CDMConnector::cdm_from_con()`] -#' @param concepts A vector of concept IDs -#' -#' @return A `data.frame` with the concept table -#' @export -query_concepts_table <- function(cdm, concepts) { - # Extract columns from concept table - cdm$concept |> - filter(.data$concept_id %in% concepts) |> - select( - "concept_id", - "concept_name", - "vocabulary_id", - "domain_id", - "concept_class_id", - "standard_concept", - "concept_code" - ) |> - collect() -} - -#' Generate the 'omopcat_monthly_counts' table -#' -#' @param cdm A [`CDMConnector`] object, e.g. from [`CDMConnector::cdm_from_con()`] -#' -#' @return A `data.frame` with the monthly counts -#' @export -process_monthly_counts <- function(cdm) { - # Combine results for all tables - out <- bind_rows( # nolint start - cdm$condition_occurrence |> calculate_monthly_counts(condition_concept_id, condition_start_date), - cdm$drug_exposure |> calculate_monthly_counts(drug_concept_id, drug_exposure_start_date), - cdm$procedure_occurrence |> calculate_monthly_counts(procedure_concept_id, procedure_date), - cdm$device_exposure |> calculate_monthly_counts(device_concept_id, device_exposure_start_date), - cdm$measurement |> calculate_monthly_counts(measurement_concept_id, measurement_date), - cdm$observation |> calculate_monthly_counts(observation_concept_id, observation_date), - cdm$specimen |> calculate_monthly_counts(specimen_concept_id, specimen_date) - ) # nolint end - - # Map concept names to the concept IDs - concept_names <- select(cdm$concept, .data$concept_id, .data$concept_name) |> - filter(.data$concept_id %in% out$concept_id) |> - collect() - out |> - dplyr::left_join(concept_names, by = c("concept_id" = "concept_id")) |> - select("concept_id", "concept_name", everything()) -} - -#' Generate the 'omopcat_summary_stats' table - -#' @param cdm A [`CDMConnector`] object, e.g. 
from [`CDMConnector::cdm_from_con()`] -#' -#' @return A `data.frame` with the summary statistics -#' @export -process_summary_stats <- function(cdm) { - table_names <- c("measurement", "observation") - concept_names <- c("measurement_concept_id", "observation_concept_id") - - # Combine results for all tables - stats <- purrr::map2(table_names, concept_names, ~ calculate_summary_stats(cdm[[.x]], .y)) - stats <- bind_rows(stats) - - # Map concept names to the concept_ids - concept_names <- select(cdm$concept, "concept_id", "concept_name") |> - filter(.data$concept_id %in% c(stats$concept_id, stats$value_as_concept_id)) |> - collect() - stats |> - # Order is important here, first we get the names for the value_as_concept_ids - # from the categorical data summaries and record it as `value_as_string` - dplyr::left_join(concept_names, by = c("value_as_concept_id" = "concept_id")) |> - rename(value_as_string = "concept_name") |> - # Then we get the names for the main concept_ids - dplyr::left_join(concept_names, by = c("concept_id" = "concept_id")) |> - select("concept_id", "concept_name", !"value_as_concept_id") -} - -# nocov end diff --git a/R/utils_preprocessing_summarise.R b/R/utils_preprocessing_summarise.R deleted file mode 100644 index aa1402e..0000000 --- a/R/utils_preprocessing_summarise.R +++ /dev/null @@ -1,121 +0,0 @@ -#' Calculate monthly statistics for an OMOP concept -#' -#' @param omop_table A table from the OMOP CDM -#' @param concept The name of the concept column to calculate statistics for -#' @param date The name of the date column to calculate statistics for -#' -#' @return A `data.frame` with the following columns: -#' - `concept_id`: The concept ID -#' - `concept_name`: The concept name -#' - `date_year`: The year of the date -#' - `date_month`: The month of the date -#' - `person_count`: The number of unique patients per concept for each month -#' - `records_per_person`: The average number of records per person per concept for each month -#' @export -#' @importFrom dplyr mutate group_by summarise select n n_distinct collect -calculate_monthly_counts <- function(omop_table, concept, date) { - # Extract year and month from date column - omop_table <- mutate(omop_table, - concept_id = {{ concept }}, - date_year = as.integer(lubridate::year({{ date }})), - date_month = as.integer(lubridate::month({{ date }})) - ) - - omop_table |> - group_by(.data$date_year, .data$date_month, .data$concept_id) |> - summarise( - record_count = n(), - person_count = n_distinct(.data$person_id), - ) |> - # NOTE: Explicitly cast types to avoid unexpected SQL behaviour, - # otherwise the records_per_person might end up as an int - # and the *_count vars as int64, which can give problems later - mutate( - record_count = as.integer(.data$record_count), - person_count = as.integer(.data$person_count), - records_per_person = as.double(.data$record_count) / as.double(.data$person_count) - ) |> - select( - "concept_id", - "date_year", - "date_month", - "record_count", - "person_count", - "records_per_person" - ) |> - ## Collect in case we're dealing with a database-stored table - collect() -} - -#' Calculate summary statistics for an OMOP table -#' -#' Calculates the mean and standard deviation for numeric concepts and the -#' frequency for categorical concepts. 
-#' -#' @param omop_table A table from the OMOP CDM -#' @param concept_name The name of the concept ID column -#' -#' @return A `data.frame` with the following columns: -#' - `concept_id`: The concept ID -#' - `summary_attribute`: The summary attribute (e.g. `"mean"`, `"sd"`, `"frequency"`) -#' - `value_as_number`: The value of the summary attribute -#' - `value_as_concept_id`: In case of a categorical concept, the concept ID for each category -#' @export -#' @importFrom dplyr all_of rename filter collect bind_rows -calculate_summary_stats <- function(omop_table, concept_name) { - stopifnot(is.character(concept_name)) - - omop_table <- rename(omop_table, concept_id = all_of(concept_name)) - - ## Avoid "no visible binding" notes - value_as_number <- value_as_concept_id <- NULL - - numeric_concepts <- filter(omop_table, !is.na(value_as_number)) - # beware CDM docs: NULL=no categorical result, 0=categorical result but no mapping - categorical_concepts <- filter(omop_table, !is.null(value_as_concept_id) & value_as_concept_id != 0) - - numeric_stats <- .summarise_numeric_concepts(numeric_concepts) |> collect() - categorical_stats <- .summarise_categorical_concepts(categorical_concepts) |> - # Convert value_as_number to double to make it compatible with numeric stats - mutate(value_as_number = as.double(.data$value_as_number)) |> - collect() - bind_rows(numeric_stats, categorical_stats) -} - -#' @importFrom dplyr group_by summarise -#' @importFrom stats sd -.summarise_numeric_concepts <- function(omop_table) { - value_as_number <- concept_id <- NULL - - # Calculate mean and sd - stats <- omop_table |> - group_by(concept_id) |> - summarise(mean = mean(value_as_number, na.rm = TRUE), sd = sd(value_as_number, na.rm = TRUE)) - - # Wrangle output to expected format - stats |> - tidyr::pivot_longer( - cols = c(mean, sd), - names_to = "summary_attribute", - values_to = "value_as_number" - ) -} - -#' @importFrom dplyr count mutate select -.summarise_categorical_concepts <- function(omop_table) { - concept_id <- value_as_concept_id <- summary_attribute <- NULL - - # Calculate frequencies - frequencies <- omop_table |> - count(concept_id, value_as_concept_id) - - # Wrangle output into the expected format - frequencies |> - mutate(summary_attribute = "frequency") |> - select( - concept_id, - summary_attribute, - value_as_number = n, - value_as_concept_id - ) -} diff --git a/README.md b/README.md index 7ae0369..11692ae 100644 --- a/README.md +++ b/README.md @@ -19,13 +19,22 @@ and subsequently export a selection of concepts of interest. ## Installation -You can install the development version of omopcat from within R like so: +The data catalogue consists of 2 separate R packages: + +- [`omopcat`](./app/) defines the [Shiny](https://shiny.posit.co/) app to display the catalogue +- [`omopcat.preprocessing`](./preprocessing/) contains the functionality to summarise an OMOP data source to use as input for the app + +The source for these is contained in the `app/` and `preprocessing/` directories, respectively. 
+ +You can install the development version of these packages from within R like so: ```r install.packages("remotes") usethis::create_github_token() credentials::set_github_pat() -remotes::install_github("SAFEHR-data/omopcat") + +remotes::install_github("SAFEHR-data/omopcat/app") # omopcat +remotes::install_github("SAFEHR-data/omopcat/preprocessing") # omopcat.preprocessing ``` You will need to copy the PAT from the web page that `usethis::create_github_token` @@ -62,18 +71,31 @@ To run a truly productionised version, we provide a [containerised deployment](# We provide a [Docker](https://www.docker.com/) container and [`docker-compose`](https://docs.docker.com/compose/) configuration to run the app in a production environment. -A test version can be run with +### Configure environment variables + +Copy the `.env.sample` file to `.env` and fill out the required values ```sh -docker compose -f deploy/docker-compose.test.yml up -d +cp .env.sample .env ``` -which will use the [test data](./data/test_data). +### Pre-processing + +The pre-processing pipeline can be run with the following command + +```sh +docker compose run preprocess +``` + +This will generate the files needed to run the `omopcat` app and store them +at the path defined by the `DATA_VOLUME_PATH` environment variable. + +### Running the app -To deploy a production version, using the data from `data/prod_data` (needs to be populated manually), run +To deploy a production version, using the data from `data/prod_data` (generated by the `preprocess` service), run ```sh -docker compose -f deploy/docker-compose.yml up -d +docker compose up -d omopcat ``` By default, the app will be hosted at `http://localhost:3838`. diff --git a/.Rbuildignore b/app/.Rbuildignore similarity index 75% rename from .Rbuildignore rename to app/.Rbuildignore index 9f7c10c..2580063 100644 --- a/.Rbuildignore +++ b/app/.Rbuildignore @@ -3,16 +3,21 @@ $run_dev.* ^.here$ ^LICENSE\.md$ ^\.Rproj\.user$ +^\.env.*$ ^\.github$ ^\.lintr$ ^\.pre-commit-config\.yaml$ ^\.renvignore$ +^app$ ^codecov\.yml$ ^data$ ^data-raw$ ^deploy$ ^dev$ +^docker-compose.*\.yml$ +^preprocessing$ ^renv$ ^renv\.lock$ ^scripts$ dev_history.R +^Dockerfile$ diff --git a/.Rprofile b/app/.Rprofile similarity index 100% rename from .Rprofile rename to app/.Rprofile diff --git a/.renvignore b/app/.renvignore similarity index 100% rename from .renvignore rename to app/.renvignore diff --git a/DESCRIPTION b/app/DESCRIPTION similarity index 93% rename from DESCRIPTION rename to app/DESCRIPTION index 2dd377e..c54900e 100644 --- a/DESCRIPTION +++ b/app/DESCRIPTION @@ -19,10 +19,8 @@ Imports: ggplot2, golem (>= 0.4.1), shiny (>= 1.9.1), - DBI, glue, tidyr, - withr, forcats, readr, lubridate, @@ -35,21 +33,18 @@ Imports: purrr, htmltools, markdown, - bsicons, - CDMConnector, - RPostgres + bsicons Suggests: devtools, usethis, + fs, styler, testthat (>= 3.0.0), spelling, here, lintr, - dbplyr, - RSQLite, precommit, - duckdb + withr Remotes: SAFEHR-data/omop-bundles Encoding: UTF-8 LazyData: true diff --git a/app/Dockerfile b/app/Dockerfile new file mode 100644 index 0000000..b87de04 --- /dev/null +++ b/app/Dockerfile @@ -0,0 +1,19 @@ +FROM rocker/shiny-verse:4.4.1 + +WORKDIR /app +ADD app . 
+COPY app/renv.lock ./renv.lock + +# Install renv and restore environment +# omopbundles is installed separately as renv is giving problems +# with GitHub packages +RUN install2.r --error --skipinstalled renv remotes && \ + R -e 'renv::restore(exclude = "omopbundles")' && \ + rm -rf /tmp/downloaded_packages +RUN R -e 'remotes::install_github("SAFEHR-data/omop-bundles")' + +# Install the app +RUN R -e 'remotes::install_local(".", dependencies = TRUE)' + +EXPOSE 3838 +CMD ["R", "-e", "options('shiny.port'=3838, shiny.host='0.0.0.0'); omopcat::run_app()" ] diff --git a/NAMESPACE b/app/NAMESPACE similarity index 54% rename from NAMESPACE rename to app/NAMESPACE index f66f92c..2bfcba2 100644 --- a/NAMESPACE +++ b/app/NAMESPACE @@ -1,15 +1,6 @@ # Generated by roxygen2: do not edit by hand -export(calculate_monthly_counts) -export(calculate_summary_stats) -export(connect_to_db) -export(connect_to_test_duckdb) -export(process_monthly_counts) -export(process_summary_stats) -export(query_concepts_table) -export(read_parquet_sorted) export(run_app) -export(write_table) import(bslib) import(ggplot2) import(shiny) @@ -17,20 +8,8 @@ importFrom(bslib,card) importFrom(bslib,card_header) importFrom(bslib,layout_columns) importFrom(bslib,page_fluid) -importFrom(dplyr,across) -importFrom(dplyr,all_of) -importFrom(dplyr,arrange) -importFrom(dplyr,bind_rows) -importFrom(dplyr,collect) -importFrom(dplyr,count) -importFrom(dplyr,everything) -importFrom(dplyr,filter) importFrom(dplyr,group_by) -importFrom(dplyr,mutate) -importFrom(dplyr,n) importFrom(dplyr,n_distinct) -importFrom(dplyr,rename) -importFrom(dplyr,select) importFrom(dplyr,summarise) importFrom(golem,activate_js) importFrom(golem,add_resource_path) @@ -38,6 +17,7 @@ importFrom(golem,bundle_resources) importFrom(golem,favicon) importFrom(golem,with_golem_options) importFrom(htmltools,includeMarkdown) +importFrom(markdown,mark) importFrom(rlang,.data) importFrom(shiny,NS) importFrom(shiny,shinyApp) @@ -45,4 +25,3 @@ importFrom(shiny,span) importFrom(shiny,tableOutput) importFrom(shiny,tagList) importFrom(shiny,textOutput) -importFrom(stats,sd) diff --git a/NEWS.md b/app/NEWS.md similarity index 100% rename from NEWS.md rename to app/NEWS.md diff --git a/R/app_config.R b/app/R/app_config.R similarity index 100% rename from R/app_config.R rename to app/R/app_config.R diff --git a/R/app_server.R b/app/R/app_server.R similarity index 100% rename from R/app_server.R rename to app/R/app_server.R diff --git a/R/app_ui.R b/app/R/app_ui.R similarity index 100% rename from R/app_ui.R rename to app/R/app_ui.R diff --git a/R/fct_monthly_count_plot.R b/app/R/fct_monthly_count_plot.R similarity index 100% rename from R/fct_monthly_count_plot.R rename to app/R/fct_monthly_count_plot.R diff --git a/R/fct_summary_stat_plot.R b/app/R/fct_summary_stat_plot.R similarity index 100% rename from R/fct_summary_stat_plot.R rename to app/R/fct_summary_stat_plot.R diff --git a/R/mod_bundles_summary.R b/app/R/mod_bundles_summary.R similarity index 100% rename from R/mod_bundles_summary.R rename to app/R/mod_bundles_summary.R diff --git a/R/mod_datatable.R b/app/R/mod_datatable.R similarity index 98% rename from R/mod_datatable.R rename to app/R/mod_datatable.R index 93f4ebf..a43f335 100644 --- a/R/mod_datatable.R +++ b/app/R/mod_datatable.R @@ -54,7 +54,6 @@ mod_datatable_ui <- function(id) { #' @return The selected row as a reactive object #' #' @noRd -#' @importFrom dplyr group_by summarise mod_datatable_server <- function(id, selected_dates, bundle_concepts) { 
stopifnot(is.reactive(selected_dates)) stopifnot(is.reactive(bundle_concepts)) @@ -66,7 +65,7 @@ mod_datatable_server <- function(id, selected_dates, bundle_concepts) { concepts_with_counts <- reactive({ join_counts_to_concepts(all_concepts, monthly_counts, selected_dates()) |> # Handle low frequencies - mutate( + dplyr::mutate( total_records = replace_low_frequencies(.data$total_records), mean_persons = replace_low_frequencies(.data$mean_persons), mean_records_per_person = replace_low_frequencies(.data$mean_records_per_person) diff --git a/R/mod_daterange.R b/app/R/mod_daterange.R similarity index 100% rename from R/mod_daterange.R rename to app/R/mod_daterange.R diff --git a/R/mod_export.R b/app/R/mod_export.R similarity index 100% rename from R/mod_export.R rename to app/R/mod_export.R diff --git a/R/mod_export_tab.R b/app/R/mod_export_tab.R similarity index 100% rename from R/mod_export_tab.R rename to app/R/mod_export_tab.R diff --git a/R/mod_manual.R b/app/R/mod_manual.R similarity index 94% rename from R/mod_manual.R rename to app/R/mod_manual.R index 62a4740..3c8dffd 100644 --- a/R/mod_manual.R +++ b/app/R/mod_manual.R @@ -9,6 +9,7 @@ #' @importFrom shiny NS tagList #' @importFrom bslib card card_header #' @importFrom htmltools includeMarkdown +#' @importFrom markdown mark mod_manual_ui <- function(id) { tagList( fluidRow( diff --git a/R/mod_plots.R b/app/R/mod_plots.R similarity index 100% rename from R/mod_plots.R rename to app/R/mod_plots.R diff --git a/R/mod_select_bundle.R b/app/R/mod_select_bundle.R similarity index 100% rename from R/mod_select_bundle.R rename to app/R/mod_select_bundle.R diff --git a/R/mod_select_for_export.R b/app/R/mod_select_for_export.R similarity index 100% rename from R/mod_select_for_export.R rename to app/R/mod_select_for_export.R diff --git a/R/omopcat-package.R b/app/R/omopcat-package.R similarity index 100% rename from R/omopcat-package.R rename to app/R/omopcat-package.R diff --git a/R/run_app.R b/app/R/run_app.R similarity index 100% rename from R/run_app.R rename to app/R/run_app.R diff --git a/R/utils_bundles.R b/app/R/utils_bundles.R similarity index 100% rename from R/utils_bundles.R rename to app/R/utils_bundles.R diff --git a/R/utils_get_data.R b/app/R/utils_get_data.R similarity index 100% rename from R/utils_get_data.R rename to app/R/utils_get_data.R diff --git a/R/utils_replace_low_frequencies.R b/app/R/utils_replace_low_frequencies.R similarity index 100% rename from R/utils_replace_low_frequencies.R rename to app/R/utils_replace_low_frequencies.R diff --git a/dev/01_start.R b/app/dev/01_start.R similarity index 100% rename from dev/01_start.R rename to app/dev/01_start.R diff --git a/dev/02_dev.R b/app/dev/02_dev.R similarity index 100% rename from dev/02_dev.R rename to app/dev/02_dev.R diff --git a/dev/03_deploy.R b/app/dev/03_deploy.R similarity index 99% rename from dev/03_deploy.R rename to app/dev/03_deploy.R index 0220968..a51cd94 100644 --- a/dev/03_deploy.R +++ b/app/dev/03_deploy.R @@ -28,4 +28,3 @@ devtools::build() ## Docker ---- ## If you want to deploy via a generic Dockerfile golem::add_dockerfile_with_renv() - diff --git a/dev/design/omop-data-catalogue-design.excalidraw b/app/dev/design/omop-data-catalogue-design.excalidraw similarity index 99% rename from dev/design/omop-data-catalogue-design.excalidraw rename to app/dev/design/omop-data-catalogue-design.excalidraw index 9302b04..4f61808 100644 --- a/dev/design/omop-data-catalogue-design.excalidraw +++ b/app/dev/design/omop-data-catalogue-design.excalidraw @@ 
-2829,4 +2829,4 @@ }, "prevTextMode": "parsed", "files": {} -} \ No newline at end of file +} diff --git a/dev/design/omop-data-catalogue-design.png b/app/dev/design/omop-data-catalogue-design.png similarity index 100% rename from dev/design/omop-data-catalogue-design.png rename to app/dev/design/omop-data-catalogue-design.png diff --git a/dev/run_dev.R b/app/dev/run_dev.R similarity index 100% rename from dev/run_dev.R rename to app/dev/run_dev.R diff --git a/app/inst/WORDLIST b/app/inst/WORDLIST new file mode 100644 index 0000000..dac9b27 --- /dev/null +++ b/app/inst/WORDLIST @@ -0,0 +1,22 @@ +Briot +CMD +Catalogue +Changelog +GAE +Lifecycle +OMOP +Pre +Ribeyre +UI +catalogue +codecov +containerised +dev +dropdown +funder +ggplot +golem +pre +productionised +repo +visualising diff --git a/inst/app/help_tab.md b/app/inst/app/help_tab.md similarity index 100% rename from inst/app/help_tab.md rename to app/inst/app/help_tab.md diff --git a/inst/app/www/favicon.ico b/app/inst/app/www/favicon.ico similarity index 100% rename from inst/app/www/favicon.ico rename to app/inst/app/www/favicon.ico diff --git a/inst/dev_data/omopcat_concepts.csv b/app/inst/dev_data/omopcat_concepts.csv similarity index 100% rename from inst/dev_data/omopcat_concepts.csv rename to app/inst/dev_data/omopcat_concepts.csv diff --git a/inst/dev_data/omopcat_monthly_counts.csv b/app/inst/dev_data/omopcat_monthly_counts.csv similarity index 100% rename from inst/dev_data/omopcat_monthly_counts.csv rename to app/inst/dev_data/omopcat_monthly_counts.csv diff --git a/inst/dev_data/omopcat_summary_stats.csv b/app/inst/dev_data/omopcat_summary_stats.csv similarity index 100% rename from inst/dev_data/omopcat_summary_stats.csv rename to app/inst/dev_data/omopcat_summary_stats.csv diff --git a/inst/golem-config.yml b/app/inst/golem-config.yml similarity index 100% rename from inst/golem-config.yml rename to app/inst/golem-config.yml diff --git a/man/all_bundles.Rd b/app/man/all_bundles.Rd similarity index 100% rename from man/all_bundles.Rd rename to app/man/all_bundles.Rd diff --git a/man/get_bundle_concepts.Rd b/app/man/get_bundle_concepts.Rd similarity index 100% rename from man/get_bundle_concepts.Rd rename to app/man/get_bundle_concepts.Rd diff --git a/man/omopcat-package.Rd b/app/man/omopcat-package.Rd similarity index 100% rename from man/omopcat-package.Rd rename to app/man/omopcat-package.Rd diff --git a/man/run_app.Rd b/app/man/run_app.Rd similarity index 100% rename from man/run_app.Rd rename to app/man/run_app.Rd diff --git a/omopcat.Rproj b/app/omopcat.Rproj similarity index 100% rename from omopcat.Rproj rename to app/omopcat.Rproj diff --git a/renv.lock b/app/renv.lock similarity index 81% rename from renv.lock rename to app/renv.lock index 386388f..5e9d181 100644 --- a/renv.lock +++ b/app/renv.lock @@ -9,48 +9,6 @@ ] }, "Packages": { - "CDMConnector": { - "Package": "CDMConnector", - "Version": "1.5.0", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "DBI", - "R", - "checkmate", - "cli", - "dbplyr", - "dplyr", - "fs", - "generics", - "glue", - "jsonlite", - "lifecycle", - "methods", - "omopgenerics", - "purrr", - "readr", - "rlang", - "stringi", - "stringr", - "tidyr", - "tidyselect", - "waldo", - "withr" - ], - "Hash": "2bae94aed83da2007c938f3d2fa1a48f" - }, - "DBI": { - "Package": "DBI", - "Version": "1.2.3", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R", - "methods" - ], - "Hash": "065ae649b05f1ff66bb0c793107508f5" - }, "DT": { "Package": "DT", "Version": 
"0.33", @@ -174,44 +132,6 @@ ], "Hash": "45f0398006e83a5b10b72a90663d8d8c" }, - "RPostgres": { - "Package": "RPostgres", - "Version": "1.4.7", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "DBI", - "R", - "bit64", - "blob", - "cpp11", - "hms", - "lubridate", - "methods", - "plogr", - "withr" - ], - "Hash": "beb7e18bf3f9e096f716a52a77ec793c" - }, - "RSQLite": { - "Package": "RSQLite", - "Version": "2.3.7", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "DBI", - "R", - "bit64", - "blob", - "cpp11", - "memoise", - "methods", - "pkgconfig", - "plogr", - "rlang" - ], - "Hash": "46b45a4dd7bb0e0f4e3fc22245817240" - }, "Rcpp": { "Package": "Rcpp", "Version": "1.0.13-1", @@ -287,18 +207,6 @@ ], "Hash": "e84984bf5f12a18628d9a02322128dfd" }, - "blob": { - "Package": "blob", - "Version": "1.2.4", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "methods", - "rlang", - "vctrs" - ], - "Hash": "40415719b5a479b87949f3aa0aee737c" - }, "brew": { "Package": "brew", "Version": "1.0-10", @@ -316,26 +224,6 @@ ], "Hash": "c1ee497a6d999947c2c224ae46799b1a" }, - "broom": { - "Package": "broom", - "Version": "1.0.7", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R", - "backports", - "dplyr", - "generics", - "glue", - "lifecycle", - "purrr", - "rlang", - "stringr", - "tibble", - "tidyr" - ], - "Hash": "8fcc818f3b9887aebaf206f141437cc9" - }, "bsicons": { "Package": "bsicons", "Version": "0.1.2", @@ -396,30 +284,6 @@ ], "Hash": "d7e13f49c19103ece9e58ad2d83a7354" }, - "cellranger": { - "Package": "cellranger", - "Version": "1.1.0", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R", - "rematch", - "tibble" - ], - "Hash": "f61dbaec772ccd2e17705c1e872e9e7c" - }, - "checkmate": { - "Package": "checkmate", - "Version": "2.3.2", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R", - "backports", - "utils" - ], - "Hash": "0e14e01ce07e7c88fd25de6d4260d26b" - }, "cli": { "Package": "cli", "Version": "3.6.3", @@ -482,19 +346,6 @@ ], "Hash": "8b7222e9d9eb5178eea545c0c4d33fc2" }, - "conflicted": { - "Package": "conflicted", - "Version": "1.2.0", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R", - "cli", - "memoise", - "rlang" - ], - "Hash": "bb097fccb22d156624fd07cd2894ddb6" - }, "cpp11": { "Package": "cpp11", "Version": "0.5.0", @@ -579,34 +430,6 @@ ], "Hash": "2e00b378fc3be69c865120d9f313039a" }, - "dbplyr": { - "Package": "dbplyr", - "Version": "2.5.0", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "DBI", - "R", - "R6", - "blob", - "cli", - "dplyr", - "glue", - "lifecycle", - "magrittr", - "methods", - "pillar", - "purrr", - "rlang", - "tibble", - "tidyr", - "tidyselect", - "utils", - "vctrs", - "withr" - ], - "Hash": "39b2e002522bfd258039ee4e889e0fd1" - }, "desc": { "Package": "desc", "Version": "1.4.3", @@ -723,38 +546,6 @@ ], "Hash": "fedd9d00c2944ff00a0e2696ccf048ec" }, - "dtplyr": { - "Package": "dtplyr", - "Version": "1.3.1", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R", - "cli", - "data.table", - "dplyr", - "glue", - "lifecycle", - "rlang", - "tibble", - "tidyselect", - "vctrs" - ], - "Hash": "54ed3ea01b11e81a86544faaecfef8e2" - }, - "duckdb": { - "Package": "duckdb", - "Version": "1.1.2", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "DBI", - "R", - "methods", - "utils" - ], - "Hash": "83a09ee9c8380fecfcea1daeaa99e3b2" - }, "ellipsis": { "Package": "ellipsis", 
"Version": "0.3.2", @@ -841,28 +632,6 @@ ], "Hash": "7f48af39fa27711ea5fbd183b399920d" }, - "gargle": { - "Package": "gargle", - "Version": "1.5.2", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R", - "cli", - "fs", - "glue", - "httr", - "jsonlite", - "lifecycle", - "openssl", - "rappdirs", - "rlang", - "stats", - "utils", - "withr" - ], - "Hash": "fc0b272e5847c58cd5da9b20eedbd026" - }, "generics": { "Package": "generics", "Version": "0.1.3", @@ -971,59 +740,6 @@ ], "Hash": "11be24963e2f220c403eafd01e2259d5" }, - "googledrive": { - "Package": "googledrive", - "Version": "2.1.1", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R", - "cli", - "gargle", - "glue", - "httr", - "jsonlite", - "lifecycle", - "magrittr", - "pillar", - "purrr", - "rlang", - "tibble", - "utils", - "uuid", - "vctrs", - "withr" - ], - "Hash": "e99641edef03e2a5e87f0a0b1fcc97f4" - }, - "googlesheets4": { - "Package": "googlesheets4", - "Version": "1.1.1", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R", - "cellranger", - "cli", - "curl", - "gargle", - "glue", - "googledrive", - "httr", - "ids", - "lifecycle", - "magrittr", - "methods", - "purrr", - "rematch2", - "rlang", - "tibble", - "utils", - "vctrs", - "withr" - ], - "Hash": "d6db1667059d027da730decdc214b959" - }, "gtable": { "Package": "gtable", "Version": "0.3.6", @@ -1040,27 +756,6 @@ ], "Hash": "de949855009e2d4d0e52a844e30617ae" }, - "haven": { - "Package": "haven", - "Version": "2.5.4", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R", - "cli", - "cpp11", - "forcats", - "hms", - "lifecycle", - "methods", - "readr", - "rlang", - "tibble", - "tidyselect", - "vctrs" - ], - "Hash": "9171f898db9d9c4c1b2c745adc2c1ef1" - }, "here": { "Package": "here", "Version": "1.0.1", @@ -1190,17 +885,6 @@ ], "Hash": "e58f80d4c5b4f0bab1456956d6ca6aad" }, - "ids": { - "Package": "ids", - "Version": "1.0.1", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "openssl", - "uuid" - ], - "Hash": "99df65cfef20e525ed38c3d2577f7190" - }, "ini": { "Package": "ini", "Version": "0.3.1", @@ -1422,24 +1106,6 @@ ], "Hash": "fec5f52652d60615fdb3957b3d74324a" }, - "modelr": { - "Package": "modelr", - "Version": "0.1.11", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R", - "broom", - "magrittr", - "purrr", - "rlang", - "tibble", - "tidyr", - "tidyselect", - "vctrs" - ], - "Hash": "4f50122dc256b1b6996a4703fecea821" - }, "munsell": { "Package": "munsell", "Version": "0.5.1", @@ -1495,28 +1161,6 @@ ], "Hash": "caecf4e78df185f39703c28a73658659" }, - "omopgenerics": { - "Package": "omopgenerics", - "Version": "0.3.1", - "Source": "Repository", - "Repository": "https://packagemanager.posit.co/cran/2024-09-30", - "Requirements": [ - "R", - "cli", - "dbplyr", - "dplyr", - "glue", - "lifecycle", - "methods", - "purrr", - "rlang", - "snakecase", - "stringr", - "tidyr", - "vctrs" - ], - "Hash": "c17db6329c631b9010951ad112341ba3" - }, "openssl": { "Package": "openssl", "Version": "2.2.2", @@ -1621,13 +1265,6 @@ ], "Hash": "2ec30ffbeec83da57655b850cf2d3e0e" }, - "plogr": { - "Package": "plogr", - "Version": "0.2.0", - "Source": "Repository", - "Repository": "CRAN", - "Hash": "09eb987710984fc2905c7129c7d85e65" - }, "plotly": { "Package": "plotly", "Version": "4.10.4", @@ -1844,38 +1481,6 @@ ], "Hash": "9de96463d2117f6ac49980577939dfb3" }, - "readxl": { - "Package": "readxl", - "Version": "1.4.3", - "Source": "Repository", - "Repository": "CRAN", - 
"Requirements": [ - "R", - "cellranger", - "cpp11", - "progress", - "tibble", - "utils" - ], - "Hash": "8cf9c239b96df1bbb133b74aef77ad0a" - }, - "rematch": { - "Package": "rematch", - "Version": "2.0.0", - "Source": "Repository", - "Repository": "CRAN", - "Hash": "cbff1b666c6fa6d21202f07e2318d4f1" - }, - "rematch2": { - "Package": "rematch2", - "Version": "2.1.2", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "tibble" - ], - "Hash": "76c9e04c712a05848ae7a23d2f170a40" - }, "remotes": { "Package": "remotes", "Version": "2.5.0", @@ -1900,28 +1505,6 @@ ], "Hash": "47623f66b4e80b3b0587bc5d7b309888" }, - "reprex": { - "Package": "reprex", - "Version": "2.1.1", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R", - "callr", - "cli", - "clipr", - "fs", - "glue", - "knitr", - "lifecycle", - "rlang", - "rmarkdown", - "rstudioapi", - "utils", - "withr" - ], - "Hash": "97b1d5361a24d9fb588db7afe3e5bcbf" - }, "rex": { "Package": "rex", "Version": "1.2.1", @@ -2021,25 +1604,6 @@ ], "Hash": "a9881dfed103e83f9de151dc17002cd1" }, - "rvest": { - "Package": "rvest", - "Version": "1.0.4", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R", - "cli", - "glue", - "httr", - "lifecycle", - "magrittr", - "rlang", - "selectr", - "tibble", - "xml2" - ], - "Hash": "0bcf0c6f274e90ea314b812a6d19a519" - }, "sass": { "Package": "sass", "Version": "0.4.9", @@ -2074,19 +1638,6 @@ ], "Hash": "c19df082ba346b0ffa6f833e92de34d1" }, - "selectr": { - "Package": "selectr", - "Version": "0.4-2", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R", - "R6", - "methods", - "stringr" - ], - "Hash": "3838071b66e0c566d55cc26bd6e27bf4" - }, "sessioninfo": { "Package": "sessioninfo", "Version": "1.2.2", @@ -2133,18 +1684,6 @@ ], "Hash": "6a293995a66e12c48d13aa1f957d09c7" }, - "snakecase": { - "Package": "snakecase", - "Version": "0.11.1", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R", - "stringi", - "stringr" - ], - "Hash": "58767e44739b76965332e8a4fe3f91f1" - }, "sourcetools": { "Package": "sourcetools", "Version": "0.1.7-1", @@ -2336,46 +1875,6 @@ ], "Hash": "829f27b9c4919c16b593794a6344d6c0" }, - "tidyverse": { - "Package": "tidyverse", - "Version": "2.0.0", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R", - "broom", - "cli", - "conflicted", - "dbplyr", - "dplyr", - "dtplyr", - "forcats", - "ggplot2", - "googledrive", - "googlesheets4", - "haven", - "hms", - "httr", - "jsonlite", - "lubridate", - "magrittr", - "modelr", - "pillar", - "purrr", - "ragg", - "readr", - "readxl", - "reprex", - "rlang", - "rstudioapi", - "rvest", - "stringr", - "tibble", - "tidyr", - "xml2" - ], - "Hash": "c328568cd14ea89a83bd4ca7f54ae07e" - }, "timechange": { "Package": "timechange", "Version": "0.3.0", @@ -2463,16 +1962,6 @@ ], "Hash": "62b65c52671e6665f803ff02954446e9" }, - "uuid": { - "Package": "uuid", - "Version": "1.2-1", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R" - ], - "Hash": "34e965e62a41fcafb1ca60e9b142085b" - }, "vctrs": { "Package": "vctrs", "Version": "0.6.5", @@ -2525,7 +2014,7 @@ }, "waldo": { "Package": "waldo", - "Version": "0.6.0", + "Version": "0.6.1", "Source": "Repository", "Repository": "CRAN", "Requirements": [ @@ -2536,7 +2025,7 @@ "methods", "rlang" ], - "Hash": "53ec6571dc7a321797cde0abe007ff53" + "Hash": "52f574062a7b66e56926988c3fbdb3b7" }, "whisker": { "Package": "whisker", diff --git a/renv/.gitignore b/app/renv/.gitignore similarity index 
100% rename from renv/.gitignore rename to app/renv/.gitignore diff --git a/renv/activate.R b/app/renv/activate.R similarity index 100% rename from renv/activate.R rename to app/renv/activate.R diff --git a/renv/settings.json b/app/renv/settings.json similarity index 100% rename from renv/settings.json rename to app/renv/settings.json diff --git a/tests/spelling.R b/app/tests/spelling.R similarity index 100% rename from tests/spelling.R rename to app/tests/spelling.R diff --git a/tests/testthat.R b/app/tests/testthat.R similarity index 100% rename from tests/testthat.R rename to app/tests/testthat.R diff --git a/tests/testthat/_snaps/utils_get_data/concepts_table.csv b/app/tests/testthat/_snaps/utils_get_data/concepts_table.csv similarity index 100% rename from tests/testthat/_snaps/utils_get_data/concepts_table.csv rename to app/tests/testthat/_snaps/utils_get_data/concepts_table.csv diff --git a/tests/testthat/_snaps/utils_get_data/monthly_counts.csv b/app/tests/testthat/_snaps/utils_get_data/monthly_counts.csv similarity index 100% rename from tests/testthat/_snaps/utils_get_data/monthly_counts.csv rename to app/tests/testthat/_snaps/utils_get_data/monthly_counts.csv diff --git a/tests/testthat/_snaps/utils_get_data/summary_stats.csv b/app/tests/testthat/_snaps/utils_get_data/summary_stats.csv similarity index 100% rename from tests/testthat/_snaps/utils_get_data/summary_stats.csv rename to app/tests/testthat/_snaps/utils_get_data/summary_stats.csv diff --git a/tests/testthat/helper-mock_data.R b/app/tests/testthat/helper-mock_data.R similarity index 100% rename from tests/testthat/helper-mock_data.R rename to app/tests/testthat/helper-mock_data.R diff --git a/tests/testthat/setup.R b/app/tests/testthat/setup.R similarity index 100% rename from tests/testthat/setup.R rename to app/tests/testthat/setup.R diff --git a/tests/testthat/test-fct_monthly_count_plot.R b/app/tests/testthat/test-fct_monthly_count_plot.R similarity index 100% rename from tests/testthat/test-fct_monthly_count_plot.R rename to app/tests/testthat/test-fct_monthly_count_plot.R diff --git a/tests/testthat/test-fct_summary_stat_plot.R b/app/tests/testthat/test-fct_summary_stat_plot.R similarity index 100% rename from tests/testthat/test-fct_summary_stat_plot.R rename to app/tests/testthat/test-fct_summary_stat_plot.R diff --git a/tests/testthat/test-mod_bundles_summary.R b/app/tests/testthat/test-mod_bundles_summary.R similarity index 100% rename from tests/testthat/test-mod_bundles_summary.R rename to app/tests/testthat/test-mod_bundles_summary.R diff --git a/tests/testthat/test-mod_datatable.R b/app/tests/testthat/test-mod_datatable.R similarity index 100% rename from tests/testthat/test-mod_datatable.R rename to app/tests/testthat/test-mod_datatable.R diff --git a/tests/testthat/test-mod_daterange.R b/app/tests/testthat/test-mod_daterange.R similarity index 100% rename from tests/testthat/test-mod_daterange.R rename to app/tests/testthat/test-mod_daterange.R diff --git a/tests/testthat/test-mod_export_tab.R b/app/tests/testthat/test-mod_export_tab.R similarity index 100% rename from tests/testthat/test-mod_export_tab.R rename to app/tests/testthat/test-mod_export_tab.R diff --git a/tests/testthat/test-mod_exportsummary.R b/app/tests/testthat/test-mod_exportsummary.R similarity index 100% rename from tests/testthat/test-mod_exportsummary.R rename to app/tests/testthat/test-mod_exportsummary.R diff --git a/tests/testthat/test-mod_plots.R b/app/tests/testthat/test-mod_plots.R similarity index 100% rename from 
tests/testthat/test-mod_plots.R rename to app/tests/testthat/test-mod_plots.R diff --git a/tests/testthat/test-mod_select_bundle.R b/app/tests/testthat/test-mod_select_bundle.R similarity index 100% rename from tests/testthat/test-mod_select_bundle.R rename to app/tests/testthat/test-mod_select_bundle.R diff --git a/tests/testthat/test-mod_select_for_export.R b/app/tests/testthat/test-mod_select_for_export.R similarity index 100% rename from tests/testthat/test-mod_select_for_export.R rename to app/tests/testthat/test-mod_select_for_export.R diff --git a/tests/testthat/test-run_app.R b/app/tests/testthat/test-run_app.R similarity index 100% rename from tests/testthat/test-run_app.R rename to app/tests/testthat/test-run_app.R diff --git a/tests/testthat/test-utils_bundles.R b/app/tests/testthat/test-utils_bundles.R similarity index 100% rename from tests/testthat/test-utils_bundles.R rename to app/tests/testthat/test-utils_bundles.R diff --git a/tests/testthat/test-utils_get_data.R b/app/tests/testthat/test-utils_get_data.R similarity index 100% rename from tests/testthat/test-utils_get_data.R rename to app/tests/testthat/test-utils_get_data.R diff --git a/codecov.yml b/codecov.yml index 9aba9bd..aefd73f 100644 --- a/codecov.yml +++ b/codecov.yml @@ -10,3 +10,14 @@ coverage: target: auto threshold: 1% informational: true + +component_management: + individual_components: + - component_id: app + name: app + paths: + - app/ + - component_id: preprocessing + name: preprocessing + paths: + - preprocessing/ diff --git a/data-raw/test_db/create_test_db.R b/data-raw/test_db/create_test_db.R new file mode 100644 index 0000000..8b500f2 --- /dev/null +++ b/data-raw/test_db/create_test_db.R @@ -0,0 +1,113 @@ +# Creates a test database from the synthea-allergies-10k example dataset +# with added dummy data for observations and measurements + +# PRODUCED FOR A SPECIFIC DATASET: +# synthea-allergies-10k +# (but could work for others) + + +# Setup --------------------------------------------------------------------------------------- + +library(readr) + +here::i_am("data-raw/test_db/create_test_db.R") + +dir <- here::here("data-raw/test_db/eunomia") +name <- "synthea-allergies-10k" +version <- "5.3" +schema <- "main" + +# Download the example data if it doesn't exist yet and create the duckdb database +withr::with_envvar( + new = c(EUNOMIA_DATA_FOLDER = dir), + { + invisible(CDMConnector::eunomia_dir(dataset_name = name, cdm_version = version)) + } +) + +con <- DBI::dbConnect(duckdb::duckdb(glue::glue("{dir}/{name}_{version}_1.1.duckdb"))) +withr::defer(DBI::dbDisconnect(con)) + + +# Insert dummy tables ------------------------------------------------------------------------- + +#' Write data to a table in the database +#' +#' @param data data.frame, data to be written to the table +#' @param con A [`DBI::DBIConnection-class`] object +#' @param table character, name of the table to write to +#' @param schema character, name of the schema to be used +#' +#' @return `TRUE`, invisibly, if the operation was successful +write_table <- function(data, con, table, schema) { + DBI::dbWriteTable( + conn = con, + name = DBI::Id(schema = schema, table = table), + value = data, + overwrite = TRUE + ) +} + + +## Load dummy data and write tables to database +## We explicitly set the column types for columns that are needed later down the pipeline +dummy_measurements <- read_csv( + here::here("data-raw/test_db/dummy/measurement.csv"), + col_types = cols( + measurement_id = col_integer(), + person_id = col_integer(), + 
measurement_concept_id = col_integer(), + measurement_date = col_date(), + value_as_number = col_double(), + value_as_concept_id = col_integer(), + measurement_time = col_character(), + measurement_type_concept_id = col_integer(), + operator_concept_id = col_integer(), + unit_concept_id = col_integer(), + range_low = col_number(), + range_high = col_number(), + provider_id = col_integer(), + visit_occurrence_id = col_integer(), + visit_detail_id = col_integer(), + measurement_source_value = col_character(), + measurement_source_concept_id = col_integer(), + unit_source_value = col_character(), + value_source_value = col_character() + ) +) +write_table(dummy_measurements, con, "measurement", schema = schema) + +dummy_observations <- read_csv( + here::here("data-raw/test_db/dummy/observation.csv"), + col_types = cols( + observation_id = col_integer(), + person_id = col_integer(), + observation_concept_id = col_integer(), + observation_date = col_date(), + value_as_number = col_double(), + value_as_string = col_character(), + value_as_concept_id = col_integer(), + observation_type_concept_id = col_integer(), + qualifier_concept_id = col_integer(), + unit_concept_id = col_integer(), + provider_id = col_integer(), + visit_occurrence_id = col_integer(), + visit_detail_id = col_integer(), + observation_source_value = col_character(), + observation_source_concept_id = col_integer(), + unit_source_value = col_character(), + qualifier_source_value = col_character() + ) +) +write_table(dummy_observations, con, "observation", schema = schema) + +## Verify integrity, turn warnings into error +tryCatch( + cdm <- CDMConnector::cdm_from_con(con, cdm_schema = schema, write_schema = schema), + warning = function(cnd) { + msg <- sprintf("CDM integrity check produced warnings: %s", conditionMessage(cnd)) + rlang::abort(msg, call = NULL) + } +) + +cli::cli_alert_success("Test database set up successfully") diff --git a/data/test_data/omopcat_concepts.parquet b/data/test_data/omopcat_concepts.parquet index ee69534..fb5f088 100644 Binary files a/data/test_data/omopcat_concepts.parquet and b/data/test_data/omopcat_concepts.parquet differ diff --git a/data/test_data/omopcat_monthly_counts.parquet b/data/test_data/omopcat_monthly_counts.parquet index ff7d3b3..6cfaf3d 100644 Binary files a/data/test_data/omopcat_monthly_counts.parquet and b/data/test_data/omopcat_monthly_counts.parquet differ diff --git a/data/test_data/omopcat_summary_stats.parquet b/data/test_data/omopcat_summary_stats.parquet index 14929c3..b42faae 100644 Binary files a/data/test_data/omopcat_summary_stats.parquet and b/data/test_data/omopcat_summary_stats.parquet differ diff --git a/deploy/.Renviron.sample b/deploy/.Renviron.sample deleted file mode 100644 index 34b4422..0000000 --- a/deploy/.Renviron.sample +++ /dev/null @@ -1,7 +0,0 @@ -OMOPCAT_DATA_PATH= -DB_NAME= -HOST= -PORT= -DB_USERNAME= -DB_PASSWORD= -DB_CDM_SCHEMA= diff --git a/deploy/Dockerfile b/deploy/Dockerfile deleted file mode 100644 index 6b09a8a..0000000 --- a/deploy/Dockerfile +++ /dev/null @@ -1,27 +0,0 @@ -FROM rocker/shiny-verse:4.4.1 - -WORKDIR /app -COPY deploy/renv.lock.prod renv.lock - -# Install renv and restore environment -# omopbundles is installed separately as renv is giving problems -# with GitHub packages -RUN R -e 'remotes::install_github("SAFEHR-data/omop-bundles")' -RUN install2.r --error --skipinstalled renv && \ - R -e 'renv::restore(exclude = "omopbundles")' - -COPY deploy/omopcat_*.tar.gz /app.tar.gz -RUN R -e 'remotes::install_local("/app.tar.gz", 
upgrade="never", dependencies = FALSE)' && \ - rm /app.tar.gz - -ARG OMOPCAT_DATA_PATH - -ADD scripts ./scripts -COPY deploy/.Renviron .Renviron - -EXPOSE 3838 -CMD ["R", "-e", \ - "options('shiny.port'=3838,shiny.host='0.0.0.0'); \ - source('scripts/create_prod_data.R'); \ - omopcat::run_app()" \ - ] diff --git a/deploy/README.md b/deploy/README.md deleted file mode 100644 index 7d8f4bf..0000000 --- a/deploy/README.md +++ /dev/null @@ -1,77 +0,0 @@ -# Deploy - -## Build the package - -From the package root directory, run from `R`: - -```r -pkgbuild::build(path = ".", dest_path = "deploy") -``` - -This will create a `omopcat_*.tar.gz` file in the `deploy/` directory with a built version of the -package, which will be used to install the package in the Docker container. The reasoning here -is that we have a **production** version of the package that is separate from the **development** -version. The production version would ideally be pinned to a release version, e.g. `0.1.0` and only -be updated when a new release is made, e.g. `0.1.1` or `0.2.0`. This should allow us to continue -developing the app without affecting the version that is running in production. - -## Create `renv.lock.prod` - -From the package root directory, run from `R`: - -```r -renv::snapshot(project = ".", lockfile = "./deploy/renv.lock.prod", type = "explicit") -``` - -This `renv.lock.prod` file will be a subset of the `renv.lock` that is in the package root. The -latter also includes development dependencies, which are not necessary to run the app in production. - -## Populate the `data/prod_data` directory - -Running the production version of the app requires to populate the -[`data/prod_data`](../data/prod_data/) directory with -the necessary `parquet` files (see [`data/test_data`](../data/test_data/) for an example). - -We provide the [`scripts/create_prod_data.R`](../scripts/create_prod_data.R) -script to facilitate this. This script will be run automatically when running the Docker container -if the mounted data directory is found to be empty. - -A few environment variables are required to run this script: - -* `DB_NAME`: the name of the database to connect to -* `HOST`: the host of the database -* `PORT`: the port on which to connect to the database -* `DB_USERNAME`: the username to connect to the database -* `DB_PASSWORD`: the password to connect to the database -* `DB_CDM_SCHEMA`: the schema of the CDM database, note that this needs to have both read and write - permissions for the user to be able to use the - [`CDMConnector`](https://darwin-eu.github.io/CDMConnector/index.html) package - -These should be defined in a local `.Renviron` file (not git-tracked) in the `deploy/` directory. -See the `.Renviron.sample` file for a template. - -## Build Docker images and run the app - -To launch the test version of the app, run: - -```shell -docker compose -f docker-compose.test.yml up -d --build -``` - -This will run a **test** version of the production app using the synthetic data in -[`data/test_data`](../data/test_data/). - -To launch the production version of the up, run: - -```shell -docker compose up -d --build -``` - -This will build the container and install the necessary dependencies to run the app. -The `-d` flag runs the `docker compose` command in "detached" mode, meaning the app will be run -in the background and you can safely quit your terminal session. - -By default, the app will be hosted at `https://localhost:3838`. 
- -Running the app on GAE05 will make it available at `http://uclvlddpragae05:3838` within the UCLH -network. diff --git a/deploy/docker-compose.test.yml b/deploy/docker-compose.test.yml deleted file mode 100644 index ee733be..0000000 --- a/deploy/docker-compose.test.yml +++ /dev/null @@ -1,7 +0,0 @@ -services: - calyspo_test: - extends: - file: docker-compose.yml - service: omopcat - volumes: - - ../data/test_data:/etc/omopcat/data diff --git a/deploy/docker-compose.yml b/deploy/docker-compose.yml deleted file mode 100644 index 97a1820..0000000 --- a/deploy/docker-compose.yml +++ /dev/null @@ -1,24 +0,0 @@ -services: - omopcat: - build: - # Use repo root as context so we can copy scripts directory to container - context: .. - dockerfile: deploy/Dockerfile - args: - HTTP_PROXY: ${HTTP_PROXY} - HTTPS_PROXY: ${HTTPS_PROXY} - OMOPCAT_DATA_PATH: /etc/omopcat/data - image: omopcat:latest - platform: linux/amd64 - restart: unless-stopped - environment: - HTTP_PROXY: ${HTTP_PROXY} - HTTPS_PROXY: ${HTTPS_PROXY} - GOLEM_CONFIG_ACTIVE: production - OMOPCAT_DATA_PATH: /etc/omopcat/data - LOW_FREQUENCY_THRESHOLD: 5 - LOW_FREQUENCY_REPLACEMENT: 2.5 - volumes: - - ../data/prod_data:/etc/omopcat/data - ports: - - 3838:3838 diff --git a/deploy/omopcat_0.2.1.tar.gz b/deploy/omopcat_0.2.1.tar.gz deleted file mode 100644 index ca512fa..0000000 Binary files a/deploy/omopcat_0.2.1.tar.gz and /dev/null differ diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..48366be --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,50 @@ +services: + preprocess: + build: + context: . + dockerfile: preprocessing/Dockerfile + args: + # Required for running on GAE + HTTP_PROXY: ${HTTP_PROXY} + HTTPS_PROXY: ${HTTPS_PROXY} + image: omopcat_preprocessing:latest + platform: linux/amd64 + environment: + ENV: ${ENV} + PREPROCESS_OUT_PATH: /mnt/preprocessing/data + DB_NAME: ${PREPROCESS_DB_NAME} + HOST: ${PREPROCESS_HOST} + PORT: ${PREPROCESS_PORT} + DB_USERNAME: ${PREPROCESS_DB_USERNAME} + DB_PASSWORD: ${PREPROCESS_DB_PASSWORD} + DB_CDM_SCHEMA: ${PREPROCESS_DB_CDM_SCHEMA} + EUNOMIA_DATA_FOLDER: /mnt/preprocessing/data-raw/test_db + command: ["R", "-e", "omopcat.preprocessing::preprocess()"] + volumes: + - ${DATA_VOLUME_PATH}:/mnt/preprocessing/data + - ${TEST_DB_PATH}:/mnt/preprocessing/data-raw/test_db + + omopcat: + build: + # Use repo root as context so we can copy scripts directory to container + context: . 
+ dockerfile: app/Dockerfile + args: + # Required for running on GAE + HTTP_PROXY: ${HTTP_PROXY} + HTTPS_PROXY: ${HTTPS_PROXY} + image: omopcat:latest + platform: linux/amd64 + restart: unless-stopped + environment: + ENV: ${ENV} + HTTP_PROXY: ${HTTP_PROXY} + HTTPS_PROXY: ${HTTPS_PROXY} + GOLEM_CONFIG_ACTIVE: production + OMOPCAT_DATA_PATH: /etc/omopcat/data + LOW_FREQUENCY_THRESHOLD: 5 + LOW_FREQUENCY_REPLACEMENT: 2.5 + volumes: + - ${DATA_VOLUME_PATH}:/etc/omopcat/data + ports: + - 3838:3838 diff --git a/inst/WORDLIST b/inst/WORDLIST deleted file mode 100644 index 9b571ff..0000000 --- a/inst/WORDLIST +++ /dev/null @@ -1,12 +0,0 @@ -CDM -CMD -Catalogue -Lifecycle -Nanoparquet -OMOP -catalogue -codecov -containerised -duckdb -golem -productionised diff --git a/man/connect_to_test_duckdb.Rd b/man/connect_to_test_duckdb.Rd deleted file mode 100644 index 380616b..0000000 --- a/man/connect_to_test_duckdb.Rd +++ /dev/null @@ -1,21 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils_preprocessing_db.R -\name{connect_to_test_duckdb} -\alias{connect_to_test_duckdb} -\title{Connect to duckdb database} -\usage{ -connect_to_test_duckdb(db_path, ..., .envir = parent.frame()) -} -\arguments{ -\item{db_path}{path to the duckdb database file} - -\item{...}{unused} - -\item{.envir}{passed on to \code{\link[withr:defer]{withr::defer()}}} -} -\value{ -A \code{\link[DBI:DBIConnection-class]{DBI::DBIConnection}} object -} -\description{ -Connect to duckdb database -} diff --git a/man/query_concepts_table.Rd b/man/query_concepts_table.Rd deleted file mode 100644 index 1feb5e6..0000000 --- a/man/query_concepts_table.Rd +++ /dev/null @@ -1,19 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils_preprocessing_db.R -\name{query_concepts_table} -\alias{query_concepts_table} -\title{Function to produce the 'omopcat_concepts' table from a list of concept ids} -\usage{ -query_concepts_table(cdm, concepts) -} -\arguments{ -\item{cdm}{A \code{\link{CDMConnector}} object, e.g. 
from \code{\link[CDMConnector:cdm_from_con]{CDMConnector::cdm_from_con()}}} - -\item{concepts}{A vector of concept IDs} -} -\value{ -A \code{data.frame} with the concept table -} -\description{ -Function to produce the 'omopcat_concepts' table from a list of concept ids -} diff --git a/man/read_parquet_sorted.Rd b/man/read_parquet_sorted.Rd deleted file mode 100644 index 25d14ed..0000000 --- a/man/read_parquet_sorted.Rd +++ /dev/null @@ -1,19 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils_preprocessing_db.R -\name{read_parquet_sorted} -\alias{read_parquet_sorted} -\title{Read a parquet table and sort the results} -\usage{ -read_parquet_sorted(path, options = nanoparquet::parquet_options()) -} -\arguments{ -\item{path}{path to the parquet file to be read} - -\item{options}{Nanoparquet options, see \code{\link[nanoparquet:parquet_options]{parquet_options()}}.} -} -\value{ -A \code{data.frame} with the results sorted by all columns -} -\description{ -Read a parquet table and sort the results -} diff --git a/man/write_table.Rd b/man/write_table.Rd deleted file mode 100644 index 6ed033e..0000000 --- a/man/write_table.Rd +++ /dev/null @@ -1,23 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils_preprocessing_db.R -\name{write_table} -\alias{write_table} -\title{Write data to a table in the database} -\usage{ -write_table(data, con, table, schema) -} -\arguments{ -\item{data}{data.frame, data to be written to the table} - -\item{con}{A \code{\link[DBI:DBIConnection-class]{DBI::DBIConnection}} object} - -\item{table}{character, name of the table to write to} - -\item{schema}{character, name of the schema to be used} -} -\value{ -\code{TRUE}, invisibly, if the operation was successful -} -\description{ -Write data to a table in the database -} diff --git a/preprocessing/.Rbuildignore b/preprocessing/.Rbuildignore new file mode 100644 index 0000000..fd6568a --- /dev/null +++ b/preprocessing/.Rbuildignore @@ -0,0 +1,7 @@ +^renv$ +^renv\.lock$ +^preprocessing\.Rproj$ +^\.Rproj\.user$ +^data-raw$ +^Dockerfile$ +^omopcat\.preprocessing\.Rproj$ diff --git a/preprocessing/.Rprofile b/preprocessing/.Rprofile new file mode 100644 index 0000000..c81932b --- /dev/null +++ b/preprocessing/.Rprofile @@ -0,0 +1,14 @@ +if (interactive()) { + suppressMessages(require("devtools")) + suppressMessages(require("golem")) + + # warn about partial matching + options( + warnPartialMatchDollar = TRUE, + warnPartialMatchAttr = TRUE, + warnPartialMatchArgs = TRUE + ) + options(styler.cache_root = "styler_perm") +} + +source("renv/activate.R") diff --git a/preprocessing/.gitignore b/preprocessing/.gitignore new file mode 100644 index 0000000..cd67eac --- /dev/null +++ b/preprocessing/.gitignore @@ -0,0 +1 @@ +.Rproj.user diff --git a/preprocessing/DESCRIPTION b/preprocessing/DESCRIPTION new file mode 100644 index 0000000..1a355a3 --- /dev/null +++ b/preprocessing/DESCRIPTION @@ -0,0 +1,34 @@ +Package: omopcat.preprocessing +Title: Preprocessing for the omopcat App +Version: 0.0.0.9000 +Authors@R: c( + person('Milan', 'Malfait', email = 'm.malfait@ucl.ac.uk', role = c('cre', 'aut')), + person('Stefan', 'Piatek', role = 'aut'), + person('Baptiste', 'Briot Ribeyre', role = 'aut'), + person('Andy', 'South', role = 'aut'), + person("SAFEHR", role = c("cph", "fnd")) + ) +Description: Preprocess data for the omopcat app. 
+License: Apache License (>= 2) +Encoding: UTF-8 +Roxygen: list(markdown = TRUE) +RoxygenNote: 7.3.2 +Imports: + CDMConnector, + cli, + DBI, + dplyr, + duckdb, + fs, + lubridate, + nanoparquet, + purrr, + rlang, + RPostgres, + tidyr, + withr +Suggests: + dbplyr, + RSQLite, + testthat (>= 3.0.0) +Config/testthat/edition: 3 diff --git a/preprocessing/Dockerfile b/preprocessing/Dockerfile new file mode 100644 index 0000000..e277e02 --- /dev/null +++ b/preprocessing/Dockerfile @@ -0,0 +1,13 @@ +FROM rocker/tidyverse:4.4.1 + +WORKDIR /pkg +ADD preprocessing . +COPY preprocessing/renv.lock ./renv.lock + +RUN install2.r --error --skipinstalled renv remotes && \ + R -e 'renv::restore()' && \ + rm -rf /tmp/downloaded_packages + +RUN R -e 'remotes::install_local(path = ".", dependencies = TRUE)' + +CMD ["R", "-e", "omopcat.preprocessing::preprocess()"] diff --git a/preprocessing/NAMESPACE b/preprocessing/NAMESPACE new file mode 100644 index 0000000..8048ce8 --- /dev/null +++ b/preprocessing/NAMESPACE @@ -0,0 +1,5 @@ +# Generated by roxygen2: do not edit by hand + +export(preprocess) +importFrom(rlang,.data) +importFrom(stats,sd) diff --git a/preprocessing/R/concepts.R b/preprocessing/R/concepts.R new file mode 100644 index 0000000..4d5e8d5 --- /dev/null +++ b/preprocessing/R/concepts.R @@ -0,0 +1,22 @@ +#' Generate the `omopcat_concepts` table +#' +#' @param cdm A [`CDMConnector`] object, e.g. from [`CDMConnector::cdm_from_con()`] +#' @param concept_ids A vector of concept IDs +#' +#' @return A `data.frame` with the monthly counts +#' @keywords internal +generate_concepts <- function(cdm, concept_ids) { + # Extract columns from concept table + cdm$concept |> + dplyr::filter(.data$concept_id %in% concept_ids) |> + dplyr::select( + "concept_id", + "concept_name", + "vocabulary_id", + "domain_id", + "concept_class_id", + "standard_concept", + "concept_code" + ) |> + dplyr::collect() +} diff --git a/preprocessing/R/monthly_counts.R b/preprocessing/R/monthly_counts.R new file mode 100644 index 0000000..52ff104 --- /dev/null +++ b/preprocessing/R/monthly_counts.R @@ -0,0 +1,83 @@ +#' Generate the 'omopcat_monthly_counts' table +#' +#' @param cdm A [`CDMConnector`] object, e.g. 
from [`CDMConnector::cdm_from_con()`] +#' +#' @return A `data.frame` with the monthly counts +#' @keywords internal +generate_monthly_counts <- function(cdm) { + # Combine results for all tables + out <- dplyr::bind_rows( + cdm$condition_occurrence |> calculate_monthly_counts( + .data$condition_concept_id, .data$condition_start_date + ), + cdm$drug_exposure |> + calculate_monthly_counts(.data$drug_concept_id, .data$drug_exposure_start_date), + cdm$procedure_occurrence |> + calculate_monthly_counts(.data$procedure_concept_id, .data$procedure_date), + cdm$device_exposure |> + calculate_monthly_counts(.data$device_concept_id, .data$device_exposure_start_date), + cdm$measurement |> + calculate_monthly_counts(.data$measurement_concept_id, .data$measurement_date), + cdm$observation |> + calculate_monthly_counts(.data$observation_concept_id, .data$observation_date), + cdm$specimen |> + calculate_monthly_counts(.data$specimen_concept_id, .data$specimen_date) + ) + + # Map concept names to the concept IDs + concept_names <- dplyr::select(cdm$concept, "concept_id", "concept_name") |> + dplyr::filter(.data$concept_id %in% out$concept_id) |> + dplyr::collect() + out |> + dplyr::left_join(concept_names, by = c("concept_id" = "concept_id")) |> + dplyr::select("concept_id", "concept_name", dplyr::everything()) +} + + +#' Calculate monthly statistics for an OMOP concept +#' +#' @param omop_table A table from the OMOP CDM +#' @param concept The name of the concept column to calculate statistics for +#' @param date The name of the date column to calculate statistics for +#' +#' @return A `data.frame` with the following columns: +#' - `concept_id`: The concept ID +#' - `concept_name`: The concept name +#' - `date_year`: The year of the date +#' - `date_month`: The month of the date +#' - `person_count`: The number of unique patients per concept for each month +#' - `records_per_person`: The average number of records per person per concept for each month +#' @keywords internal +calculate_monthly_counts <- function(omop_table, concept, date) { + # Extract year and month from date column + omop_table <- dplyr::mutate(omop_table, + concept_id = {{ concept }}, + date_year = as.integer(lubridate::year({{ date }})), + date_month = as.integer(lubridate::month({{ date }})) + ) + + omop_table |> + dplyr::group_by(.data$date_year, .data$date_month, .data$concept_id) |> + dplyr::summarise( + record_count = dplyr::n(), + person_count = dplyr::n_distinct(.data$person_id), + ) |> + # NOTE: Explicitly cast types to avoid unexpected SQL behaviour, + # otherwise the records_per_person might end up as an int + # and the *_count vars as int64, which can give problems later + dplyr::mutate( + record_count = as.integer(.data$record_count), + person_count = as.integer(.data$person_count), + records_per_person = as.double(.data$record_count) / as.double(.data$person_count) + ) |> + dplyr::select( + "concept_id", + "date_year", + "date_month", + "record_count", + "person_count", + "records_per_person" + ) |> + ## Collect in case we're dealing with a database-stored table + dplyr::collect() +} diff --git a/preprocessing/R/omopcat.preprocessing-package.R b/preprocessing/R/omopcat.preprocessing-package.R new file mode 100644 index 0000000..52c0c07 --- /dev/null +++ b/preprocessing/R/omopcat.preprocessing-package.R @@ -0,0 +1,7 @@ +#' @keywords internal +"_PACKAGE" + +## usethis namespace: start +#' @importFrom rlang .data +## usethis namespace: end +NULL diff --git a/preprocessing/R/preprocess.R b/preprocessing/R/preprocess.R new file 
mode 100644 index 0000000..9f242ed --- /dev/null +++ b/preprocessing/R/preprocess.R @@ -0,0 +1,127 @@ +#' Run the pre-processing pipeline +#' +#' The pre-processing pipeline generates the following files: +#' +#' * `out_path`/omopcat_concepts.parquet +#' * `out_path`/omopcat_monthly_counts.parquet +#' * `out_path`/omopcat_summary_stats.parquet +#' +#' If all these files already exist, the pipeline will not be run. +#' +#' @param out_path The directory where the pre-processed data will be written to. +#' Defaults to the `PREPROCESS_OUT_PATH` environment variable. +#' +#' @return Whether the pre-processing was run: `TRUE` or `FALSE`, invisibly. +#' @export +preprocess <- function(out_path = Sys.getenv("PREPROCESS_OUT_PATH")) { + if (out_path == "") { + cli::cli_abort(c( + "x" = "{.var out_path} should not be empty.", + "i" = "Have you set the {.envvar PREPROCESS_OUT_PATH} environment variable?" + )) + } + fs::dir_create(out_path) + + out_files <- c( + concepts = file.path(out_path, "omopcat_concepts.parquet"), + monthly_counts = file.path(out_path, "omopcat_monthly_counts.parquet"), + summary_stats = file.path(out_path, "omopcat_summary_stats.parquet") + ) + + # Only run pre-processing if the expected files don't exist + exists <- fs::file_exists(out_files) + + if (all(exists)) { + cli::cli_alert_info("All expected files already exist. Skipping pre-processing.") + cli::cli_alert_info("To force re-processing, delete the following files:") + cli::cli_ul(out_files[exists]) + + return(invisible(FALSE)) + } + + if (.running_in_production()) { + cli::cli_alert_info("Running in production mode") + .check_prod_env() + } + + cdm <- .setup_cdm_object() + + cli::cli_progress_message("Generating monthly_counts table") + monthly_counts <- generate_monthly_counts(cdm) + + cli::cli_progress_message("Generating summary_stats table") + summary_stats <- generate_summary_stats(cdm) + + cli::cli_progress_message("Generating concepts table") + concept_ids_with_data <- unique(c(monthly_counts$concept_id, summary_stats$concept_id)) + concepts_table <- generate_concepts(cdm, concept_ids = concept_ids_with_data) + + nanoparquet::write_parquet(concepts_table, out_files["concepts"]) + nanoparquet::write_parquet(monthly_counts, out_files["monthly_counts"]) + nanoparquet::write_parquet(summary_stats, out_files["summary_stats"]) + cli::cli_alert_success("Tables written to {.path {out_files}}") + + return(invisible(TRUE)) +} + +.running_in_production <- function() { + return(Sys.getenv("ENV") == "prod") +} + +.check_prod_env <- function() { + required_envvars <- c( + "DB_NAME", + "HOST", + "PORT", + "DB_USERNAME", + "DB_PASSWORD" + ) + + missing <- Sys.getenv(required_envvars) == "" + if (any(missing)) { + cli::cli_abort( + "Environment variable{?s} {.envvar {required_envvars[missing]}} not set", + call = rlang::caller_env() + ) + } +} + +#' Set up a CDM object from a database connection +#' +#' When running in production, sets up a CDM object from the database settings +#' configured through the relevant environment variables. +#' +#' When not in production, creates a CDM from one of the CDMConnector example +#' datasets (`"GiBleed"` by default), using [`CDMConnector::eunomia_dir()`]. +#' This is intended for use in testing and development. +#' +#' @param .envir Passed on to [`connect_to_db()`], controls the scope in which the database +#' connection should live. When it goes out of scope, the database connection is closed. 
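A minimal usage sketch for the `preprocess()` entry point defined above (assuming the package is installed; the output directory below is illustrative):

```r
# Development run: with ENV unset (or anything other than "prod"),
# .setup_cdm_object() falls back to a CDMConnector example dataset
# (downloaded on first use), so no database variables are required.
Sys.setenv(PREPROCESS_OUT_PATH = file.path(tempdir(), "omopcat-preprocessing"))

ran <- omopcat.preprocessing::preprocess()

# While the three parquet files exist, a second call is skipped
ran_again <- omopcat.preprocessing::preprocess()
stopifnot(isTRUE(ran), isFALSE(ran_again))
```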
+#' @noRd +.setup_cdm_object <- function(.envir = parent.frame()) { + if (.running_in_production()) { + name <- Sys.getenv("DB_NAME") + con <- connect_to_db( + RPostgres::Postgres(), + dbname = Sys.getenv("DB_NAME"), + host = Sys.getenv("HOST"), + port = Sys.getenv("PORT"), + user = Sys.getenv("DB_USERNAME"), + password = Sys.getenv("DB_PASSWORD"), + .envir = parent.frame() + ) + } else { + name <- Sys.getenv("DB_NAME", unset = "GiBleed") + duckdb_path <- CDMConnector::eunomia_dir(dataset_name = name) + rlang::check_installed("duckdb") + con <- connect_to_db(duckdb::duckdb(duckdb_path), .envir = .envir) + } + + # Load the data in a CDMConnector object + CDMConnector::cdm_from_con( + con = con, + cdm_schema = Sys.getenv("DB_CDM_SCHEMA", unset = "main"), + write_schema = Sys.getenv("DB_CDM_SCHEMA", unset = "main"), + cdm_name = name + ) +} diff --git a/preprocessing/R/summary_stats.R b/preprocessing/R/summary_stats.R new file mode 100644 index 0000000..9dd258f --- /dev/null +++ b/preprocessing/R/summary_stats.R @@ -0,0 +1,97 @@ +#' Generate the `omopcat_summary_stats` table +#' +#' @param cdm A [`CDMConnector`] object, e.g. from [`CDMConnector::cdm_from_con()`] +#' +#' @return A `data.frame` with the summary statistics +#' @keywords internal +generate_summary_stats <- function(cdm) { + omop_tables <- cdm[c("measurement", "observation")] + concept_cols <- c("measurement_concept_id", "observation_concept_id") + + # Combine results for all tables + stats <- purrr::map2(omop_tables, concept_cols, calculate_summary_stats) + stats <- dplyr::bind_rows(stats) + + # Map concept names to the concept_ids + concept_names <- dplyr::select(cdm$concept, "concept_id", "concept_name") |> + dplyr::filter(.data$concept_id %in% c(stats$concept_id, stats$value_as_concept_id)) |> + dplyr::collect() + stats |> + # Order is important here, first we get the names for the value_as_concept_ids + # from the categorical data summaries and record it as `value_as_string` + dplyr::left_join(concept_names, by = c("value_as_concept_id" = "concept_id")) |> + dplyr::rename(value_as_string = "concept_name") |> + # Then we get the names for the main concept_ids + dplyr::left_join(concept_names, by = c("concept_id" = "concept_id")) |> + dplyr::select("concept_id", "concept_name", !"value_as_concept_id") +} + +#' Calculate summary statistics for an OMOP table +#' +#' Calculates the mean and standard deviation for numeric concepts and the +#' frequency for categorical concepts. +#' +#' @param omop_table A table from the OMOP CDM +#' @param concept_name The name of the concept ID column +#' +#' @return A `data.frame` with the following columns: +#' - `concept_id`: The concept ID +#' - `summary_attribute`: The summary attribute (e.g. 
`"mean"`, `"sd"`, `"frequency"`) +#' - `value_as_number`: The value of the summary attribute +#' - `value_as_concept_id`: In case of a categorical concept, the concept ID for each category +#' @keywords internal +calculate_summary_stats <- function(omop_table, concept_name) { + stopifnot(is.character(concept_name)) + stopifnot(concept_name %in% colnames(omop_table)) + + omop_table <- dplyr::rename(omop_table, concept_id = dplyr::all_of(concept_name)) + + numeric_concepts <- dplyr::filter(omop_table, !is.na(.data$value_as_number)) + # beware CDM docs: NULL=no categorical result, 0=categorical result but no mapping + categorical_concepts <- dplyr::filter( + omop_table, + !is.null(.data$value_as_concept_id) & .data$value_as_concept_id != 0 + ) + + numeric_stats <- .summarise_numeric_concepts(numeric_concepts) |> dplyr::collect() + categorical_stats <- .summarise_categorical_concepts(categorical_concepts) |> + # Convert value_as_number to double to make it compatible with numeric stats + dplyr::mutate(value_as_number = as.double(.data$value_as_number)) |> + dplyr::collect() + dplyr::bind_rows(numeric_stats, categorical_stats) +} + +#' @importFrom stats sd +.summarise_numeric_concepts <- function(omop_table) { + # Calculate mean and sd + stats <- omop_table |> + dplyr::group_by(.data$concept_id) |> + dplyr::summarise( + mean = mean(.data$value_as_number, na.rm = TRUE), + sd = sd(.data$value_as_number, na.rm = TRUE) + ) + + # Wrangle output to expected format + stats |> + tidyr::pivot_longer( + cols = c("mean", "sd"), + names_to = "summary_attribute", + values_to = "value_as_number" + ) +} + +.summarise_categorical_concepts <- function(omop_table) { + # Calculate frequencies + frequencies <- omop_table |> + dplyr::count(.data$concept_id, .data$value_as_concept_id) + + # Wrangle output into the expected format + frequencies |> + dplyr::mutate(summary_attribute = "frequency") |> + dplyr::select( + "concept_id", + "summary_attribute", + value_as_number = "n", + "value_as_concept_id" + ) +} diff --git a/preprocessing/R/utils-db.R b/preprocessing/R/utils-db.R new file mode 100644 index 0000000..a634593 --- /dev/null +++ b/preprocessing/R/utils-db.R @@ -0,0 +1,14 @@ +#' Connect to a database +#' +#' General helper to connect to a database through [`DBI::dbConnect()`], while ensuring +#' that the connection is closed when the connection object goes out of scope. +#' +#' @param ... arguments passed on to [`DBI::dbConnect()`] +#' @param .envir passed on to [`withr::defer()`] +#' +#' @return A [`DBI::DBIConnection-class`] object +connect_to_db <- function(..., .envir = parent.frame()) { + con <- DBI::dbConnect(...) 
+ withr::defer(DBI::dbDisconnect(con), envir = .envir) + con +} diff --git a/preprocessing/data-raw/test_db/.gitignore b/preprocessing/data-raw/test_db/.gitignore new file mode 100644 index 0000000..9f0a7c3 --- /dev/null +++ b/preprocessing/data-raw/test_db/.gitignore @@ -0,0 +1,9 @@ +# Eunomia data +*.zip + +# duckdb databases +*.duckdb + +# duckdb temp files +# (in case of failure) +*.duckdb.wal diff --git a/man/calculate_monthly_counts.Rd b/preprocessing/man/calculate_monthly_counts.Rd similarity index 93% rename from man/calculate_monthly_counts.Rd rename to preprocessing/man/calculate_monthly_counts.Rd index bf7ac48..4f38b54 100644 --- a/man/calculate_monthly_counts.Rd +++ b/preprocessing/man/calculate_monthly_counts.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils_preprocessing_summarise.R +% Please edit documentation in R/monthly_counts.R \name{calculate_monthly_counts} \alias{calculate_monthly_counts} \title{Calculate monthly statistics for an OMOP concept} @@ -27,3 +27,4 @@ A \code{data.frame} with the following columns: \description{ Calculate monthly statistics for an OMOP concept } +\keyword{internal} diff --git a/man/calculate_summary_stats.Rd b/preprocessing/man/calculate_summary_stats.Rd similarity index 92% rename from man/calculate_summary_stats.Rd rename to preprocessing/man/calculate_summary_stats.Rd index ee02e37..a4e2bf3 100644 --- a/man/calculate_summary_stats.Rd +++ b/preprocessing/man/calculate_summary_stats.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils_preprocessing_summarise.R +% Please edit documentation in R/summary_stats.R \name{calculate_summary_stats} \alias{calculate_summary_stats} \title{Calculate summary statistics for an OMOP table} @@ -24,3 +24,4 @@ A \code{data.frame} with the following columns: Calculates the mean and standard deviation for numeric concepts and the frequency for categorical concepts. } +\keyword{internal} diff --git a/man/connect_to_db.Rd b/preprocessing/man/connect_to_db.Rd similarity index 70% rename from man/connect_to_db.Rd rename to preprocessing/man/connect_to_db.Rd index c254ab7..ff09c7c 100644 --- a/man/connect_to_db.Rd +++ b/preprocessing/man/connect_to_db.Rd @@ -1,8 +1,8 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils_preprocessing_db.R +% Please edit documentation in R/utils-db.R \name{connect_to_db} \alias{connect_to_db} -\title{Connec to a database} +\title{Connect to a database} \usage{ connect_to_db(..., .envir = parent.frame()) } @@ -15,6 +15,6 @@ connect_to_db(..., .envir = parent.frame()) A \code{\link[DBI:DBIConnection-class]{DBI::DBIConnection}} object } \description{ -General helper to connect to a databae through \code{\link[DBI:dbConnect]{DBI::dbConnect()}}, while ensuring +General helper to connect to a database through \code{\link[DBI:dbConnect]{DBI::dbConnect()}}, while ensuring that the connection is closed when the connection object goes out of scope. 
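A small sketch (not package code) of the scoping behaviour described here: because the disconnect is deferred to the caller's frame, the connection opened by `connect_to_db()` is closed when the calling function returns, not when the helper itself returns. `RSQLite` (listed in Suggests) provides an in-memory database; the helper is internal, hence the `:::`.

```r
library(DBI)

with_temporary_connection <- function() {
  # .envir defaults to parent.frame(), i.e. this function's frame
  con <- omopcat.preprocessing:::connect_to_db(RSQLite::SQLite(), ":memory:")
  dbWriteTable(con, "mtcars", mtcars)
  dbGetQuery(con, "SELECT COUNT(*) AS n FROM mtcars")
  # withr::defer() fires on exit: dbDisconnect(con) runs as this function returns
}

with_temporary_connection()
```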
} diff --git a/preprocessing/man/generate_concepts.Rd b/preprocessing/man/generate_concepts.Rd new file mode 100644 index 0000000..dce5168 --- /dev/null +++ b/preprocessing/man/generate_concepts.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/concepts.R +\name{generate_concepts} +\alias{generate_concepts} +\title{Generate the \code{omopcat_concepts} table} +\usage{ +generate_concepts(cdm, concept_ids) +} +\arguments{ +\item{cdm}{A \code{\link{CDMConnector}} object, e.g. from \code{\link[CDMConnector:cdm_from_con]{CDMConnector::cdm_from_con()}}} + +\item{concept_ids}{A vector of concept IDs} +} +\value{ +A \code{data.frame} with the monthly counts +} +\description{ +Generate the \code{omopcat_concepts} table +} +\keyword{internal} diff --git a/man/process_monthly_counts.Rd b/preprocessing/man/generate_monthly_counts.Rd similarity index 69% rename from man/process_monthly_counts.Rd rename to preprocessing/man/generate_monthly_counts.Rd index 5b085d9..75a454c 100644 --- a/man/process_monthly_counts.Rd +++ b/preprocessing/man/generate_monthly_counts.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils_preprocessing_db.R -\name{process_monthly_counts} -\alias{process_monthly_counts} +% Please edit documentation in R/monthly_counts.R +\name{generate_monthly_counts} +\alias{generate_monthly_counts} \title{Generate the 'omopcat_monthly_counts' table} \usage{ -process_monthly_counts(cdm) +generate_monthly_counts(cdm) } \arguments{ \item{cdm}{A \code{\link{CDMConnector}} object, e.g. from \code{\link[CDMConnector:cdm_from_con]{CDMConnector::cdm_from_con()}}} @@ -15,3 +15,4 @@ A \code{data.frame} with the monthly counts \description{ Generate the 'omopcat_monthly_counts' table } +\keyword{internal} diff --git a/man/process_summary_stats.Rd b/preprocessing/man/generate_summary_stats.Rd similarity index 51% rename from man/process_summary_stats.Rd rename to preprocessing/man/generate_summary_stats.Rd index 194de3e..197eb0a 100644 --- a/man/process_summary_stats.Rd +++ b/preprocessing/man/generate_summary_stats.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils_preprocessing_db.R -\name{process_summary_stats} -\alias{process_summary_stats} -\title{Generate the 'omopcat_summary_stats' table} +% Please edit documentation in R/summary_stats.R +\name{generate_summary_stats} +\alias{generate_summary_stats} +\title{Generate the \code{omopcat_summary_stats} table} \usage{ -process_summary_stats(cdm) +generate_summary_stats(cdm) } \arguments{ \item{cdm}{A \code{\link{CDMConnector}} object, e.g. 
from \code{\link[CDMConnector:cdm_from_con]{CDMConnector::cdm_from_con()}}} @@ -13,5 +13,6 @@ process_summary_stats(cdm) A \code{data.frame} with the summary statistics } \description{ -Generate the 'omopcat_summary_stats' table +Generate the \code{omopcat_summary_stats} table } +\keyword{internal} diff --git a/preprocessing/man/omopcat.preprocessing-package.Rd b/preprocessing/man/omopcat.preprocessing-package.Rd new file mode 100644 index 0000000..c07ce26 --- /dev/null +++ b/preprocessing/man/omopcat.preprocessing-package.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/omopcat.preprocessing-package.R +\docType{package} +\name{omopcat.preprocessing-package} +\alias{omopcat.preprocessing} +\alias{omopcat.preprocessing-package} +\title{omopcat.preprocessing: Preprocessing for the omopcat App} +\description{ +Preprocess data for the omopcat app. +} +\author{ +\strong{Maintainer}: Milan Malfait \email{m.malfait@ucl.ac.uk} + +Authors: +\itemize{ + \item Stefan Piatek + \item Baptiste Briot Ribeyre + \item Andy South +} + +Other contributors: +\itemize{ + \item SAFEHR [copyright holder, funder] +} + +} +\keyword{internal} diff --git a/preprocessing/man/preprocess.Rd b/preprocessing/man/preprocess.Rd new file mode 100644 index 0000000..9b89138 --- /dev/null +++ b/preprocessing/man/preprocess.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/preprocess.R +\name{preprocess} +\alias{preprocess} +\title{Run the pre-processing pipeline} +\usage{ +preprocess(out_path = Sys.getenv("PREPROCESS_OUT_PATH")) +} +\arguments{ +\item{out_path}{The directory where the pre-processed data will be written to. +Defaults to the \code{PREPROCESS_OUT_PATH} environment variable.} +} +\value{ +Whether the pre-processing was run: \code{TRUE} or \code{FALSE}, invisibly. +} +\description{ +The pre-processing pipeline generates the following files: +} +\details{ +\itemize{ +\item \code{out_path}/omopcat_concepts.parquet +\item \code{out_path}/omopcat_monthly_counts.parquet +\item \code{out_path}/omopcat_summary_stats.parquet +} + +If all these files already exist, the pipeline will not be run. 
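The three parquet files documented above are what the app consumes; a quick sketch (not package code) of inspecting them after a run, assuming `PREPROCESS_OUT_PATH` still points at the output directory:

```r
out_path <- Sys.getenv("PREPROCESS_OUT_PATH")

files <- file.path(out_path, c(
  "omopcat_concepts.parquet",
  "omopcat_monthly_counts.parquet",
  "omopcat_summary_stats.parquet"
))
stopifnot(all(file.exists(files)))

# nanoparquet is the same reader/writer the pipeline itself uses
tables <- lapply(files, nanoparquet::read_parquet)
names(tables) <- c("concepts", "monthly_counts", "summary_stats")

# e.g. how many concepts ended up with monthly counts
length(unique(tables$monthly_counts$concept_id))
```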
+} diff --git a/preprocessing/omopcat.preprocessing.Rproj b/preprocessing/omopcat.preprocessing.Rproj new file mode 100644 index 0000000..7f1b52b --- /dev/null +++ b/preprocessing/omopcat.preprocessing.Rproj @@ -0,0 +1,22 @@ +Version: 1.0 + +RestoreWorkspace: No +SaveWorkspace: No +AlwaysSaveHistory: Default + +EnableCodeIndexing: Yes +UseSpacesForTab: Yes +NumSpacesForTab: 2 +Encoding: UTF-8 + +RnwWeave: Sweave +LaTeX: XeLaTeX + +AutoAppendNewline: Yes +StripTrailingWhitespace: Yes +LineEndingConversion: Posix + +BuildType: Package +PackageUseDevtools: Yes +PackageInstallArgs: --no-multiarch --with-keep.source +PackageRoxygenize: rd,collate,namespace diff --git a/deploy/renv.lock.prod b/preprocessing/renv.lock similarity index 76% rename from deploy/renv.lock.prod rename to preprocessing/renv.lock index 3b0d940..033049a 100644 --- a/deploy/renv.lock.prod +++ b/preprocessing/renv.lock @@ -51,55 +51,6 @@ ], "Hash": "065ae649b05f1ff66bb0c793107508f5" }, - "DT": { - "Package": "DT", - "Version": "0.33", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "crosstalk", - "htmltools", - "htmlwidgets", - "httpuv", - "jquerylib", - "jsonlite", - "magrittr", - "promises" - ], - "Hash": "64ff3427f559ce3f2597a4fe13255cb6" - }, - "MASS": { - "Package": "MASS", - "Version": "7.3-61", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R", - "grDevices", - "graphics", - "methods", - "stats", - "utils" - ], - "Hash": "0cafd6f0500e5deba33be22c46bf6055" - }, - "Matrix": { - "Package": "Matrix", - "Version": "1.7-1", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R", - "grDevices", - "graphics", - "grid", - "lattice", - "methods", - "stats", - "utils" - ], - "Hash": "5122bb14d8736372411f955e1b16bc8a" - }, "R6": { "Package": "R6", "Version": "2.5.1", @@ -110,16 +61,6 @@ ], "Hash": "470851b6d5d0ac559e9d01bb352b4021" }, - "RColorBrewer": { - "Package": "RColorBrewer", - "Version": "1.1-3", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R" - ], - "Hash": "45f0398006e83a5b10b72a90663d8d8c" - }, "RPostgres": { "Package": "RPostgres", "Version": "1.4.7", @@ -139,6 +80,25 @@ ], "Hash": "beb7e18bf3f9e096f716a52a77ec793c" }, + "RSQLite": { + "Package": "RSQLite", + "Version": "2.3.7", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "DBI", + "R", + "bit64", + "blob", + "cpp11", + "memoise", + "methods", + "pkgconfig", + "plogr", + "rlang" + ], + "Hash": "46b45a4dd7bb0e0f4e3fc22245817240" + }, "Rcpp": { "Package": "Rcpp", "Version": "1.0.13-1", @@ -226,19 +186,22 @@ ], "Hash": "40415719b5a479b87949f3aa0aee737c" }, - "bsicons": { - "Package": "bsicons", - "Version": "0.1.2", + "brew": { + "Package": "brew", + "Version": "1.0-10", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "8f4a384e19dccd8c65356dc096847b76" + }, + "brio": { + "Package": "brio", + "Version": "1.1.5", "Source": "Repository", "Repository": "CRAN", "Requirements": [ - "R", - "cli", - "htmltools", - "rlang", - "utils" + "R" ], - "Hash": "d8f892fbd94d0b9b1f6d688b05b8633c" + "Hash": "c1ee497a6d999947c2c224ae46799b1a" }, "bslib": { "Package": "bslib", @@ -273,6 +236,19 @@ ], "Hash": "cd9a672193789068eb5a2aad65a0dedf" }, + "callr": { + "Package": "callr", + "Version": "3.7.6", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "R6", + "processx", + "utils" + ], + "Hash": "d7e13f49c19103ece9e58ad2d83a7354" + }, "checkmate": { "Package": "checkmate", "Version": "2.3.2", @@ -306,20 +282,6 @@ ], "Hash": 
"3f038e5ac7f41d4ac41ce658c85e3042" }, - "colorspace": { - "Package": "colorspace", - "Version": "2.1-1", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R", - "grDevices", - "graphics", - "methods", - "stats" - ], - "Hash": "d954cb1c57e8d8b756165d7ba18aa55a" - }, "commonmark": { "Package": "commonmark", "Version": "1.9.2", @@ -341,7 +303,7 @@ "Package": "cpp11", "Version": "0.5.0", "Source": "Repository", - "Repository": "CRAN", + "Repository": "RSPM", "Requirements": [ "R" ], @@ -359,39 +321,29 @@ ], "Hash": "859d96e65ef198fd43e82b9628d593ef" }, - "crosstalk": { - "Package": "crosstalk", - "Version": "1.2.1", + "credentials": { + "Package": "credentials", + "Version": "2.0.2", "Source": "Repository", "Repository": "CRAN", "Requirements": [ - "R6", - "htmltools", + "askpass", + "curl", "jsonlite", - "lazyeval" + "openssl", + "sys" ], - "Hash": "ab12c7b080a57475248a30f4db6298c0" + "Hash": "09fd631e607a236f8cc7f9604db32cb8" }, "curl": { "Package": "curl", - "Version": "5.2.3", + "Version": "6.0.0", "Source": "Repository", "Repository": "CRAN", "Requirements": [ "R" ], - "Hash": "d91263322a58af798f6cf3b13fd56dde" - }, - "data.table": { - "Package": "data.table", - "Version": "1.16.2", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R", - "methods" - ], - "Hash": "2e00b378fc3be69c865120d9f313039a" + "Hash": "ff51697d9205fe715f29e7171e874c6e" }, "dbplyr": { "Package": "dbplyr", @@ -421,6 +373,53 @@ ], "Hash": "39b2e002522bfd258039ee4e889e0fd1" }, + "desc": { + "Package": "desc", + "Version": "1.4.3", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "R6", + "cli", + "utils" + ], + "Hash": "99b79fcbd6c4d1ce087f5c5c758b384f" + }, + "devtools": { + "Package": "devtools", + "Version": "2.4.5", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "cli", + "desc", + "ellipsis", + "fs", + "lifecycle", + "memoise", + "miniUI", + "pkgbuild", + "pkgdown", + "pkgload", + "profvis", + "rcmdcheck", + "remotes", + "rlang", + "roxygen2", + "rversions", + "sessioninfo", + "stats", + "testthat", + "tools", + "urlchecker", + "usethis", + "utils", + "withr" + ], + "Hash": "ea5bc8b4a6a01e4f12d98b58329930bb" + }, "diffobj": { "Package": "diffobj", "Version": "0.3.5", @@ -440,13 +439,33 @@ "Package": "digest", "Version": "0.6.37", "Source": "Repository", - "Repository": "CRAN", + "Repository": "RSPM", "Requirements": [ "R", "utils" ], "Hash": "33698c4b3127fc9f506654607fb73676" }, + "downlit": { + "Package": "downlit", + "Version": "0.4.4", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "brio", + "desc", + "digest", + "evaluate", + "fansi", + "memoise", + "rlang", + "vctrs", + "withr", + "yaml" + ], + "Hash": "45a6a596bf0108ee1ff16a040a2df897" + }, "dplyr": { "Package": "dplyr", "Version": "1.1.4", @@ -470,6 +489,30 @@ ], "Hash": "fedd9d00c2944ff00a0e2696ccf048ec" }, + "duckdb": { + "Package": "duckdb", + "Version": "1.1.2", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "DBI", + "R", + "methods", + "utils" + ], + "Hash": "83a09ee9c8380fecfcea1daeaa99e3b2" + }, + "ellipsis": { + "Package": "ellipsis", + "Version": "0.3.2", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "rlang" + ], + "Hash": "bb0eec2fe32e88d9e2836c2f73ea2077" + }, "evaluate": { "Package": "evaluate", "Version": "1.0.1", @@ -492,13 +535,6 @@ ], "Hash": "962174cf2aeb5b9eea581522286a911f" }, - "farver": { - "Package": "farver", - "Version": "2.1.2", - "Source": 
"Repository", - "Repository": "CRAN", - "Hash": "680887028577f3fa2a81e410ed0d6e42" - }, "fastmap": { "Package": "fastmap", "Version": "1.2.0", @@ -518,22 +554,6 @@ ], "Hash": "c2efdd5f0bcd1ea861c2d4e2a883a67d" }, - "forcats": { - "Package": "forcats", - "Version": "1.0.0", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R", - "cli", - "glue", - "lifecycle", - "magrittr", - "rlang", - "tibble" - ], - "Hash": "1a0a9a3d5083d0d573c4214576f1e690" - }, "fs": { "Package": "fs", "Version": "1.6.5", @@ -556,30 +576,48 @@ ], "Hash": "15e9634c0fcd294799e9b2e929ed1b86" }, - "ggplot2": { - "Package": "ggplot2", - "Version": "3.5.1", + "gert": { + "Package": "gert", + "Version": "2.1.4", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "askpass", + "credentials", + "openssl", + "rstudioapi", + "sys", + "zip" + ], + "Hash": "ae855ad6d7be20dd7b05d43d25700398" + }, + "gh": { + "Package": "gh", + "Version": "1.4.1", "Source": "Repository", "Repository": "CRAN", "Requirements": [ - "MASS", "R", "cli", + "gitcreds", "glue", - "grDevices", - "grid", - "gtable", - "isoband", + "httr2", + "ini", + "jsonlite", "lifecycle", - "mgcv", - "rlang", - "scales", - "stats", - "tibble", - "vctrs", - "withr" + "rlang" + ], + "Hash": "fbbbc48eba7a6626a08bb365e44b563b" + }, + "gitcreds": { + "Package": "gitcreds", + "Version": "0.1.2", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R" ], - "Hash": "44c6a2f8202d5b7e878ea274b1092426" + "Hash": "ab08ac61f3e1be454ae21911eb8bc2fe" }, "glue": { "Package": "glue", @@ -610,22 +648,6 @@ ], "Hash": "11be24963e2f220c403eafd01e2259d5" }, - "gtable": { - "Package": "gtable", - "Version": "0.3.6", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R", - "cli", - "glue", - "grid", - "lifecycle", - "rlang", - "stats" - ], - "Hash": "de949855009e2d4d0e52a844e30617ae" - }, "here": { "Package": "here", "Version": "1.0.1", @@ -707,31 +729,33 @@ ], "Hash": "d55aa087c47a63ead0f6fc10f8fa1ee0" }, - "httr": { - "Package": "httr", - "Version": "1.4.7", + "httr2": { + "Package": "httr2", + "Version": "1.0.6", "Source": "Repository", "Repository": "CRAN", "Requirements": [ "R", "R6", + "cli", "curl", - "jsonlite", - "mime", - "openssl" + "glue", + "lifecycle", + "magrittr", + "openssl", + "rappdirs", + "rlang", + "vctrs", + "withr" ], - "Hash": "ac107251d9d9fd72f0ca8049988f1d7f" + "Hash": "3ef5d07ec78803475a94367d71b40c41" }, - "isoband": { - "Package": "isoband", - "Version": "0.2.7", + "ini": { + "Package": "ini", + "Version": "0.3.1", "Source": "Repository", "Repository": "CRAN", - "Requirements": [ - "grid", - "utils" - ], - "Hash": "0080607b4a1a7b28979aecef976d8bc2" + "Hash": "6154ec2223172bce8162d4153cda21f7" }, "jquerylib": { "Package": "jquerylib", @@ -769,17 +793,6 @@ ], "Hash": "acf380f300c721da9fde7df115a5f86f" }, - "labeling": { - "Package": "labeling", - "Version": "0.4.3", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "graphics", - "stats" - ], - "Hash": "b64ec208ac5bc1852b285f665d6368b3" - }, "later": { "Package": "later", "Version": "1.3.2", @@ -791,31 +804,6 @@ ], "Hash": "a3e051d405326b8b0012377434c62b37" }, - "lattice": { - "Package": "lattice", - "Version": "0.22-6", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R", - "grDevices", - "graphics", - "grid", - "stats", - "utils" - ], - "Hash": "cc5ac1ba4c238c7ca9fa6a87ca11a7e2" - }, - "lazyeval": { - "Package": "lazyeval", - "Version": "0.2.2", - "Source": "Repository", - "Repository": 
"CRAN", - "Requirements": [ - "R" - ], - "Hash": "d908914ae53b04d4c0c0fd72ecc35370" - }, "lifecycle": { "Package": "lifecycle", "Version": "1.0.4", @@ -852,19 +840,6 @@ ], "Hash": "7ce2733a9826b3aeb1775d56fd305472" }, - "markdown": { - "Package": "markdown", - "Version": "1.13", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R", - "commonmark", - "utils", - "xfun" - ], - "Hash": "074efab766a9d6360865ad39512f2a7e" - }, "memoise": { "Package": "memoise", "Version": "2.0.1", @@ -876,23 +851,6 @@ ], "Hash": "e2817ccf4a065c5d9d7f2cfbe7c1d78c" }, - "mgcv": { - "Package": "mgcv", - "Version": "1.9-1", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "Matrix", - "R", - "graphics", - "methods", - "nlme", - "splines", - "stats", - "utils" - ], - "Hash": "110ee9d83b496279960e162ac97764ce" - }, "mime": { "Package": "mime", "Version": "0.12", @@ -903,16 +861,17 @@ ], "Hash": "18e9c28c1d3ca1560ce30658b22ce104" }, - "munsell": { - "Package": "munsell", - "Version": "0.5.1", + "miniUI": { + "Package": "miniUI", + "Version": "0.1.1.1", "Source": "Repository", "Repository": "CRAN", "Requirements": [ - "colorspace", - "methods" + "htmltools", + "shiny", + "utils" ], - "Hash": "4fd8900853b746af55b81fda99da7695" + "Hash": "fec5f52652d60615fdb3957b3d74324a" }, "nanoparquet": { "Package": "nanoparquet", @@ -924,40 +883,6 @@ ], "Hash": "a2a0a0dc4cd6bd52d501c619c77cc321" }, - "nlme": { - "Package": "nlme", - "Version": "3.1-166", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R", - "graphics", - "lattice", - "stats", - "utils" - ], - "Hash": "ccbb8846be320b627e6aa2b4616a2ded" - }, - "omopbundles": { - "Package": "omopbundles", - "Version": "0.1.1", - "Source": "GitHub", - "RemoteType": "github", - "RemoteHost": "api.github.com", - "RemoteUsername": "SAFEHR-data", - "RemoteRepo": "omop-bundles", - "RemoteRef": "main", - "RemoteSha": "0d24c38f1bf4d032eab700741a9576f9a8255991", - "Requirements": [ - "dplyr", - "glue", - "purrr", - "readr", - "rlang", - "vroom" - ], - "Hash": "caecf4e78df185f39703c28a73658659" - }, "omopgenerics": { "Package": "omopgenerics", "Version": "0.3.1", @@ -1007,6 +932,21 @@ ], "Hash": "15da5a8412f317beeee6175fbc76f4bb" }, + "pkgbuild": { + "Package": "pkgbuild", + "Version": "1.4.5", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "R6", + "callr", + "cli", + "desc", + "processx" + ], + "Hash": "30eaaab94db72652e72e3475c1b55278" + }, "pkgconfig": { "Package": "pkgconfig", "Version": "2.0.3", @@ -1017,44 +957,71 @@ ], "Hash": "01f28d4278f15c76cddbea05899c5d6f" }, - "plogr": { - "Package": "plogr", - "Version": "0.2.0", - "Source": "Repository", - "Repository": "CRAN", - "Hash": "09eb987710984fc2905c7129c7d85e65" - }, - "plotly": { - "Package": "plotly", - "Version": "4.10.4", + "pkgdown": { + "Package": "pkgdown", + "Version": "2.1.1", "Source": "Repository", "Repository": "CRAN", "Requirements": [ "R", - "RColorBrewer", - "base64enc", - "crosstalk", - "data.table", + "bslib", + "callr", + "cli", + "desc", "digest", - "dplyr", - "ggplot2", - "htmltools", - "htmlwidgets", - "httr", + "downlit", + "fontawesome", + "fs", + "httr2", "jsonlite", - "lazyeval", - "magrittr", - "promises", + "openssl", "purrr", + "ragg", "rlang", - "scales", + "rmarkdown", "tibble", - "tidyr", - "tools", - "vctrs", - "viridisLite" + "whisker", + "withr", + "xml2", + "yaml" ], - "Hash": "a1ac5c03ad5ad12b9d1597e00e23c3dd" + "Hash": "df2912d5873422b55a13002510f02c9f" + }, + "pkgload": { + "Package": "pkgload", + 
"Version": "1.4.0", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "cli", + "desc", + "fs", + "glue", + "lifecycle", + "methods", + "pkgbuild", + "processx", + "rlang", + "rprojroot", + "utils", + "withr" + ], + "Hash": "2ec30ffbeec83da57655b850cf2d3e0e" + }, + "plogr": { + "Package": "plogr", + "Version": "0.2.0", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "09eb987710984fc2905c7129c7d85e65" + }, + "praise": { + "Package": "praise", + "Version": "1.0.0", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "a555924add98c99d2f411e37e7d25e9f" }, "prettyunits": { "Package": "prettyunits", @@ -1066,6 +1033,32 @@ ], "Hash": "6b01fc98b1e86c4f705ce9dcfd2f57c7" }, + "processx": { + "Package": "processx", + "Version": "3.8.4", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "R6", + "ps", + "utils" + ], + "Hash": "0c90a7d71988856bad2a2a45dd871bb9" + }, + "profvis": { + "Package": "profvis", + "Version": "0.4.0", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "htmlwidgets", + "rlang", + "vctrs" + ], + "Hash": "bffa126bf92987e677c12cfb5651fc1d" + }, "progress": { "Package": "progress", "Version": "1.2.3", @@ -1096,6 +1089,17 @@ ], "Hash": "434cd5388a3979e74be5c219bcd6e77d" }, + "ps": { + "Package": "ps", + "Version": "1.8.1", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "utils" + ], + "Hash": "b4404b1de13758dea1c0484ad0d48563" + }, "purrr": { "Package": "purrr", "Version": "1.0.2", @@ -1111,6 +1115,17 @@ ], "Hash": "1cba04a4e9414bdefc9dcaa99649a8dc" }, + "ragg": { + "Package": "ragg", + "Version": "1.3.3", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "systemfonts", + "textshaping" + ], + "Hash": "0595fe5e47357111f29ad19101c7d271" + }, "rappdirs": { "Package": "rappdirs", "Version": "0.3.3", @@ -1121,6 +1136,28 @@ ], "Hash": "5e3c5dc0b071b21fa128676560dbe94d" }, + "rcmdcheck": { + "Package": "rcmdcheck", + "Version": "1.4.0", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R6", + "callr", + "cli", + "curl", + "desc", + "digest", + "pkgbuild", + "prettyunits", + "rprojroot", + "sessioninfo", + "utils", + "withr", + "xopen" + ], + "Hash": "8f25ebe2ec38b1f2aef3b0d2ef76f6c4" + }, "readr": { "Package": "readr", "Version": "2.1.5", @@ -1144,6 +1181,20 @@ ], "Hash": "9de96463d2117f6ac49980577939dfb3" }, + "remotes": { + "Package": "remotes", + "Version": "2.5.0", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "methods", + "stats", + "tools", + "utils" + ], + "Hash": "3ee025083e66f18db6cf27b56e23e141" + }, "renv": { "Package": "renv", "Version": "1.0.11", @@ -1188,6 +1239,32 @@ ], "Hash": "df99277f63d01c34e95e3d2f06a79736" }, + "roxygen2": { + "Package": "roxygen2", + "Version": "7.3.2", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "R6", + "brew", + "cli", + "commonmark", + "cpp11", + "desc", + "knitr", + "methods", + "pkgload", + "purrr", + "rlang", + "stringi", + "stringr", + "utils", + "withr", + "xml2" + ], + "Hash": "6ee25f9054a70f44d615300ed531ba8d" + }, "rprojroot": { "Package": "rprojroot", "Version": "2.0.4", @@ -1198,6 +1275,25 @@ ], "Hash": "4c8415e0ec1e29f3f4f6fc108bef0144" }, + "rstudioapi": { + "Package": "rstudioapi", + "Version": "0.17.1", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "5f90cd73946d706cfe26024294236113" + }, + "rversions": { + "Package": "rversions", + "Version": "2.1.2", + "Source": "Repository", + 
"Repository": "CRAN", + "Requirements": [ + "curl", + "utils", + "xml2" + ], + "Hash": "a9881dfed103e83f9de151dc17002cd1" + }, "sass": { "Package": "sass", "Version": "0.4.9", @@ -1212,25 +1308,18 @@ ], "Hash": "d53dbfddf695303ea4ad66f86e99b95d" }, - "scales": { - "Package": "scales", - "Version": "1.3.0", + "sessioninfo": { + "Package": "sessioninfo", + "Version": "1.2.2", "Source": "Repository", "Repository": "CRAN", "Requirements": [ "R", - "R6", - "RColorBrewer", "cli", - "farver", - "glue", - "labeling", - "lifecycle", - "munsell", - "rlang", - "viridisLite" + "tools", + "utils" ], - "Hash": "c19df082ba346b0ffa6f833e92de34d1" + "Hash": "3f9796a8d0a0e8c6eb49a4b029359d1f" }, "shiny": { "Package": "shiny", @@ -1324,6 +1413,60 @@ "Repository": "CRAN", "Hash": "de342ebfebdbf40477d0758d05426646" }, + "systemfonts": { + "Package": "systemfonts", + "Version": "1.1.0", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "cpp11", + "lifecycle" + ], + "Hash": "213b6b8ed5afbf934843e6c3b090d418" + }, + "testthat": { + "Package": "testthat", + "Version": "3.2.1.1", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "R6", + "brio", + "callr", + "cli", + "desc", + "digest", + "evaluate", + "jsonlite", + "lifecycle", + "magrittr", + "methods", + "pkgload", + "praise", + "processx", + "ps", + "rlang", + "utils", + "waldo", + "withr" + ], + "Hash": "3f6e7e5e2220856ff865e4834766bf2b" + }, + "textshaping": { + "Package": "textshaping", + "Version": "0.4.0", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "cpp11", + "lifecycle", + "systemfonts" + ], + "Hash": "5142f8bc78ed3d819d26461b641627ce" + }, "tibble": { "Package": "tibble", "Version": "3.2.1", @@ -1414,6 +1557,51 @@ ], "Hash": "f561504ec2897f4d46f0c7657e488ae1" }, + "urlchecker": { + "Package": "urlchecker", + "Version": "1.0.1", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "cli", + "curl", + "tools", + "xml2" + ], + "Hash": "409328b8e1253c8d729a7836fe7f7a16" + }, + "usethis": { + "Package": "usethis", + "Version": "3.0.0", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "cli", + "clipr", + "crayon", + "curl", + "desc", + "fs", + "gert", + "gh", + "glue", + "jsonlite", + "lifecycle", + "purrr", + "rappdirs", + "rlang", + "rprojroot", + "rstudioapi", + "stats", + "utils", + "whisker", + "withr", + "yaml" + ], + "Hash": "b2fbf93c2127bedd2cbe9b799530d5d2" + }, "utf8": { "Package": "utf8", "Version": "1.2.4", @@ -1438,16 +1626,6 @@ ], "Hash": "c03fa420630029418f7e6da3667aac4a" }, - "viridisLite": { - "Package": "viridisLite", - "Version": "0.4.2", - "Source": "Repository", - "Repository": "CRAN", - "Requirements": [ - "R" - ], - "Hash": "c826c7c4241b6fc89ff55aaea3fa7491" - }, "vroom": { "Package": "vroom", "Version": "1.6.5", @@ -1476,7 +1654,7 @@ }, "waldo": { "Package": "waldo", - "Version": "0.6.0", + "Version": "0.6.1", "Source": "Repository", "Repository": "CRAN", "Requirements": [ @@ -1487,7 +1665,14 @@ "methods", "rlang" ], - "Hash": "53ec6571dc7a321797cde0abe007ff53" + "Hash": "52f574062a7b66e56926988c3fbdb3b7" + }, + "whisker": { + "Package": "whisker", + "Version": "0.4.1", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "c6abfa47a46d281a7d5159d0a8891e88" }, "withr": { "Package": "withr", @@ -1514,6 +1699,30 @@ ], "Hash": "8687398773806cfff9401a2feca96298" }, + "xml2": { + "Package": "xml2", + "Version": "1.3.6", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + 
"R", + "cli", + "methods", + "rlang" + ], + "Hash": "1d0336142f4cd25d8d23cd3ba7a8fb61" + }, + "xopen": { + "Package": "xopen", + "Version": "1.0.1", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "processx" + ], + "Hash": "423df1e86d5533fcb73c6b02b4923b49" + }, "xtable": { "Package": "xtable", "Version": "1.8-4", @@ -1532,6 +1741,13 @@ "Source": "Repository", "Repository": "CRAN", "Hash": "51dab85c6c98e50a18d7551e9d49f76c" + }, + "zip": { + "Package": "zip", + "Version": "2.3.1", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "fcc4bd8e6da2d2011eb64a5e5cc685ab" } } } diff --git a/preprocessing/renv/.gitignore b/preprocessing/renv/.gitignore new file mode 100644 index 0000000..0ec0cbb --- /dev/null +++ b/preprocessing/renv/.gitignore @@ -0,0 +1,7 @@ +library/ +local/ +cellar/ +lock/ +python/ +sandbox/ +staging/ diff --git a/preprocessing/renv/activate.R b/preprocessing/renv/activate.R new file mode 100644 index 0000000..0eb5108 --- /dev/null +++ b/preprocessing/renv/activate.R @@ -0,0 +1,1305 @@ + +local({ + + # the requested version of renv + version <- "1.0.11" + attr(version, "sha") <- NULL + + # the project directory + project <- Sys.getenv("RENV_PROJECT") + if (!nzchar(project)) + project <- getwd() + + # use start-up diagnostics if enabled + diagnostics <- Sys.getenv("RENV_STARTUP_DIAGNOSTICS", unset = "FALSE") + if (diagnostics) { + start <- Sys.time() + profile <- tempfile("renv-startup-", fileext = ".Rprof") + utils::Rprof(profile) + on.exit({ + utils::Rprof(NULL) + elapsed <- signif(difftime(Sys.time(), start, units = "auto"), digits = 2L) + writeLines(sprintf("- renv took %s to run the autoloader.", format(elapsed))) + writeLines(sprintf("- Profile: %s", profile)) + print(utils::summaryRprof(profile)) + }, add = TRUE) + } + + # figure out whether the autoloader is enabled + enabled <- local({ + + # first, check config option + override <- getOption("renv.config.autoloader.enabled") + if (!is.null(override)) + return(override) + + # if we're being run in a context where R_LIBS is already set, + # don't load -- presumably we're being run as a sub-process and + # the parent process has already set up library paths for us + rcmd <- Sys.getenv("R_CMD", unset = NA) + rlibs <- Sys.getenv("R_LIBS", unset = NA) + if (!is.na(rlibs) && !is.na(rcmd)) + return(FALSE) + + # next, check environment variables + # TODO: prefer using the configuration one in the future + envvars <- c( + "RENV_CONFIG_AUTOLOADER_ENABLED", + "RENV_AUTOLOADER_ENABLED", + "RENV_ACTIVATE_PROJECT" + ) + + for (envvar in envvars) { + envval <- Sys.getenv(envvar, unset = NA) + if (!is.na(envval)) + return(tolower(envval) %in% c("true", "t", "1")) + } + + # enable by default + TRUE + + }) + + # bail if we're not enabled + if (!enabled) { + + # if we're not enabled, we might still need to manually load + # the user profile here + profile <- Sys.getenv("R_PROFILE_USER", unset = "~/.Rprofile") + if (file.exists(profile)) { + cfg <- Sys.getenv("RENV_CONFIG_USER_PROFILE", unset = "TRUE") + if (tolower(cfg) %in% c("true", "t", "1")) + sys.source(profile, envir = globalenv()) + } + + return(FALSE) + + } + + # avoid recursion + if (identical(getOption("renv.autoloader.running"), TRUE)) { + warning("ignoring recursive attempt to run renv autoloader") + return(invisible(TRUE)) + } + + # signal that we're loading renv during R startup + options(renv.autoloader.running = TRUE) + on.exit(options(renv.autoloader.running = NULL), add = TRUE) + + # signal that we've consented to use renv + 
options(renv.consent = TRUE) + + # load the 'utils' package eagerly -- this ensures that renv shims, which + # mask 'utils' packages, will come first on the search path + library(utils, lib.loc = .Library) + + # unload renv if it's already been loaded + if ("renv" %in% loadedNamespaces()) + unloadNamespace("renv") + + # load bootstrap tools + ansify <- function(text) { + if (renv_ansify_enabled()) + renv_ansify_enhanced(text) + else + renv_ansify_default(text) + } + + renv_ansify_enabled <- function() { + + override <- Sys.getenv("RENV_ANSIFY_ENABLED", unset = NA) + if (!is.na(override)) + return(as.logical(override)) + + pane <- Sys.getenv("RSTUDIO_CHILD_PROCESS_PANE", unset = NA) + if (identical(pane, "build")) + return(FALSE) + + testthat <- Sys.getenv("TESTTHAT", unset = "false") + if (tolower(testthat) %in% "true") + return(FALSE) + + iderun <- Sys.getenv("R_CLI_HAS_HYPERLINK_IDE_RUN", unset = "false") + if (tolower(iderun) %in% "false") + return(FALSE) + + TRUE + + } + + renv_ansify_default <- function(text) { + text + } + + renv_ansify_enhanced <- function(text) { + + # R help links + pattern <- "`\\?(renv::(?:[^`])+)`" + replacement <- "`\033]8;;ide:help:\\1\a?\\1\033]8;;\a`" + text <- gsub(pattern, replacement, text, perl = TRUE) + + # runnable code + pattern <- "`(renv::(?:[^`])+)`" + replacement <- "`\033]8;;ide:run:\\1\a\\1\033]8;;\a`" + text <- gsub(pattern, replacement, text, perl = TRUE) + + # return ansified text + text + + } + + renv_ansify_init <- function() { + + envir <- renv_envir_self() + if (renv_ansify_enabled()) + assign("ansify", renv_ansify_enhanced, envir = envir) + else + assign("ansify", renv_ansify_default, envir = envir) + + } + + `%||%` <- function(x, y) { + if (is.null(x)) y else x + } + + catf <- function(fmt, ..., appendLF = TRUE) { + + quiet <- getOption("renv.bootstrap.quiet", default = FALSE) + if (quiet) + return(invisible()) + + msg <- sprintf(fmt, ...) + cat(msg, file = stdout(), sep = if (appendLF) "\n" else "") + + invisible(msg) + + } + + header <- function(label, + ..., + prefix = "#", + suffix = "-", + n = min(getOption("width"), 78)) + { + label <- sprintf(label, ...) + n <- max(n - nchar(label) - nchar(prefix) - 2L, 8L) + if (n <= 0) + return(paste(prefix, label)) + + tail <- paste(rep.int(suffix, n), collapse = "") + paste0(prefix, " ", label, " ", tail) + + } + + heredoc <- function(text, leave = 0) { + + # remove leading, trailing whitespace + trimmed <- gsub("^\\s*\\n|\\n\\s*$", "", text) + + # split into lines + lines <- strsplit(trimmed, "\n", fixed = TRUE)[[1L]] + + # compute common indent + indent <- regexpr("[^[:space:]]", lines) + common <- min(setdiff(indent, -1L)) - leave + text <- paste(substring(lines, common), collapse = "\n") + + # substitute in ANSI links for executable renv code + ansify(text) + + } + + startswith <- function(string, prefix) { + substring(string, 1, nchar(prefix)) == prefix + } + + bootstrap <- function(version, library) { + + friendly <- renv_bootstrap_version_friendly(version) + section <- header(sprintf("Bootstrapping renv %s", friendly)) + catf(section) + + # attempt to download renv + catf("- Downloading renv ... ", appendLF = FALSE) + withCallingHandlers( + tarball <- renv_bootstrap_download(version), + error = function(err) { + catf("FAILED") + stop("failed to download:\n", conditionMessage(err)) + } + ) + catf("OK") + on.exit(unlink(tarball), add = TRUE) + + # now attempt to install + catf("- Installing renv ... 
", appendLF = FALSE) + withCallingHandlers( + status <- renv_bootstrap_install(version, tarball, library), + error = function(err) { + catf("FAILED") + stop("failed to install:\n", conditionMessage(err)) + } + ) + catf("OK") + + # add empty line to break up bootstrapping from normal output + catf("") + + return(invisible()) + } + + renv_bootstrap_tests_running <- function() { + getOption("renv.tests.running", default = FALSE) + } + + renv_bootstrap_repos <- function() { + + # get CRAN repository + cran <- getOption("renv.repos.cran", "https://cloud.r-project.org") + + # check for repos override + repos <- Sys.getenv("RENV_CONFIG_REPOS_OVERRIDE", unset = NA) + if (!is.na(repos)) { + + # check for RSPM; if set, use a fallback repository for renv + rspm <- Sys.getenv("RSPM", unset = NA) + if (identical(rspm, repos)) + repos <- c(RSPM = rspm, CRAN = cran) + + return(repos) + + } + + # check for lockfile repositories + repos <- tryCatch(renv_bootstrap_repos_lockfile(), error = identity) + if (!inherits(repos, "error") && length(repos)) + return(repos) + + # retrieve current repos + repos <- getOption("repos") + + # ensure @CRAN@ entries are resolved + repos[repos == "@CRAN@"] <- cran + + # add in renv.bootstrap.repos if set + default <- c(FALLBACK = "https://cloud.r-project.org") + extra <- getOption("renv.bootstrap.repos", default = default) + repos <- c(repos, extra) + + # remove duplicates that might've snuck in + dupes <- duplicated(repos) | duplicated(names(repos)) + repos[!dupes] + + } + + renv_bootstrap_repos_lockfile <- function() { + + lockpath <- Sys.getenv("RENV_PATHS_LOCKFILE", unset = "renv.lock") + if (!file.exists(lockpath)) + return(NULL) + + lockfile <- tryCatch(renv_json_read(lockpath), error = identity) + if (inherits(lockfile, "error")) { + warning(lockfile) + return(NULL) + } + + repos <- lockfile$R$Repositories + if (length(repos) == 0) + return(NULL) + + keys <- vapply(repos, `[[`, "Name", FUN.VALUE = character(1)) + vals <- vapply(repos, `[[`, "URL", FUN.VALUE = character(1)) + names(vals) <- keys + + return(vals) + + } + + renv_bootstrap_download <- function(version) { + + sha <- attr(version, "sha", exact = TRUE) + + methods <- if (!is.null(sha)) { + + # attempting to bootstrap a development version of renv + c( + function() renv_bootstrap_download_tarball(sha), + function() renv_bootstrap_download_github(sha) + ) + + } else { + + # attempting to bootstrap a release version of renv + c( + function() renv_bootstrap_download_tarball(version), + function() renv_bootstrap_download_cran_latest(version), + function() renv_bootstrap_download_cran_archive(version) + ) + + } + + for (method in methods) { + path <- tryCatch(method(), error = identity) + if (is.character(path) && file.exists(path)) + return(path) + } + + stop("All download methods failed") + + } + + renv_bootstrap_download_impl <- function(url, destfile) { + + mode <- "wb" + + # https://bugs.r-project.org/bugzilla/show_bug.cgi?id=17715 + fixup <- + Sys.info()[["sysname"]] == "Windows" && + substring(url, 1L, 5L) == "file:" + + if (fixup) + mode <- "w+b" + + args <- list( + url = url, + destfile = destfile, + mode = mode, + quiet = TRUE + ) + + if ("headers" %in% names(formals(utils::download.file))) { + headers <- renv_bootstrap_download_custom_headers(url) + if (length(headers) && is.character(headers)) + args$headers <- headers + } + + do.call(utils::download.file, args) + + } + + renv_bootstrap_download_custom_headers <- function(url) { + + headers <- getOption("renv.download.headers") + if (is.null(headers)) 
+ return(character()) + + if (!is.function(headers)) + stopf("'renv.download.headers' is not a function") + + headers <- headers(url) + if (length(headers) == 0L) + return(character()) + + if (is.list(headers)) + headers <- unlist(headers, recursive = FALSE, use.names = TRUE) + + ok <- + is.character(headers) && + is.character(names(headers)) && + all(nzchar(names(headers))) + + if (!ok) + stop("invocation of 'renv.download.headers' did not return a named character vector") + + headers + + } + + renv_bootstrap_download_cran_latest <- function(version) { + + spec <- renv_bootstrap_download_cran_latest_find(version) + type <- spec$type + repos <- spec$repos + + baseurl <- utils::contrib.url(repos = repos, type = type) + ext <- if (identical(type, "source")) + ".tar.gz" + else if (Sys.info()[["sysname"]] == "Windows") + ".zip" + else + ".tgz" + name <- sprintf("renv_%s%s", version, ext) + url <- paste(baseurl, name, sep = "/") + + destfile <- file.path(tempdir(), name) + status <- tryCatch( + renv_bootstrap_download_impl(url, destfile), + condition = identity + ) + + if (inherits(status, "condition")) + return(FALSE) + + # report success and return + destfile + + } + + renv_bootstrap_download_cran_latest_find <- function(version) { + + # check whether binaries are supported on this system + binary <- + getOption("renv.bootstrap.binary", default = TRUE) && + !identical(.Platform$pkgType, "source") && + !identical(getOption("pkgType"), "source") && + Sys.info()[["sysname"]] %in% c("Darwin", "Windows") + + types <- c(if (binary) "binary", "source") + + # iterate over types + repositories + for (type in types) { + for (repos in renv_bootstrap_repos()) { + + # build arguments for utils::available.packages() call + args <- list(type = type, repos = repos) + + # add custom headers if available -- note that + # utils::available.packages() will pass this to download.file() + if ("headers" %in% names(formals(utils::download.file))) { + headers <- renv_bootstrap_download_custom_headers(repos) + if (length(headers) && is.character(headers)) + args$headers <- headers + } + + # retrieve package database + db <- tryCatch( + as.data.frame( + do.call(utils::available.packages, args), + stringsAsFactors = FALSE + ), + error = identity + ) + + if (inherits(db, "error")) + next + + # check for compatible entry + entry <- db[db$Package %in% "renv" & db$Version %in% version, ] + if (nrow(entry) == 0) + next + + # found it; return spec to caller + spec <- list(entry = entry, type = type, repos = repos) + return(spec) + + } + } + + # if we got here, we failed to find renv + fmt <- "renv %s is not available from your declared package repositories" + stop(sprintf(fmt, version)) + + } + + renv_bootstrap_download_cran_archive <- function(version) { + + name <- sprintf("renv_%s.tar.gz", version) + repos <- renv_bootstrap_repos() + urls <- file.path(repos, "src/contrib/Archive/renv", name) + destfile <- file.path(tempdir(), name) + + for (url in urls) { + + status <- tryCatch( + renv_bootstrap_download_impl(url, destfile), + condition = identity + ) + + if (identical(status, 0L)) + return(destfile) + + } + + return(FALSE) + + } + + renv_bootstrap_download_tarball <- function(version) { + + # if the user has provided the path to a tarball via + # an environment variable, then use it + tarball <- Sys.getenv("RENV_BOOTSTRAP_TARBALL", unset = NA) + if (is.na(tarball)) + return() + + # allow directories + if (dir.exists(tarball)) { + name <- sprintf("renv_%s.tar.gz", version) + tarball <- file.path(tarball, name) + } + + # 
bail if it doesn't exist + if (!file.exists(tarball)) { + + # let the user know we weren't able to honour their request + fmt <- "- RENV_BOOTSTRAP_TARBALL is set (%s) but does not exist." + msg <- sprintf(fmt, tarball) + warning(msg) + + # bail + return() + + } + + catf("- Using local tarball '%s'.", tarball) + tarball + + } + + renv_bootstrap_github_token <- function() { + for (envvar in c("GITHUB_TOKEN", "GITHUB_PAT", "GH_TOKEN")) { + envval <- Sys.getenv(envvar, unset = NA) + if (!is.na(envval)) + return(envval) + } + } + + renv_bootstrap_download_github <- function(version) { + + enabled <- Sys.getenv("RENV_BOOTSTRAP_FROM_GITHUB", unset = "TRUE") + if (!identical(enabled, "TRUE")) + return(FALSE) + + # prepare download options + token <- renv_bootstrap_github_token() + if (nzchar(Sys.which("curl")) && nzchar(token)) { + fmt <- "--location --fail --header \"Authorization: token %s\"" + extra <- sprintf(fmt, token) + saved <- options("download.file.method", "download.file.extra") + options(download.file.method = "curl", download.file.extra = extra) + on.exit(do.call(base::options, saved), add = TRUE) + } else if (nzchar(Sys.which("wget")) && nzchar(token)) { + fmt <- "--header=\"Authorization: token %s\"" + extra <- sprintf(fmt, token) + saved <- options("download.file.method", "download.file.extra") + options(download.file.method = "wget", download.file.extra = extra) + on.exit(do.call(base::options, saved), add = TRUE) + } + + url <- file.path("https://api.github.com/repos/rstudio/renv/tarball", version) + name <- sprintf("renv_%s.tar.gz", version) + destfile <- file.path(tempdir(), name) + + status <- tryCatch( + renv_bootstrap_download_impl(url, destfile), + condition = identity + ) + + if (!identical(status, 0L)) + return(FALSE) + + renv_bootstrap_download_augment(destfile) + + return(destfile) + + } + + # Add Sha to DESCRIPTION. This is stop gap until #890, after which we + # can use renv::install() to fully capture metadata. + renv_bootstrap_download_augment <- function(destfile) { + sha <- renv_bootstrap_git_extract_sha1_tar(destfile) + if (is.null(sha)) { + return() + } + + # Untar + tempdir <- tempfile("renv-github-") + on.exit(unlink(tempdir, recursive = TRUE), add = TRUE) + untar(destfile, exdir = tempdir) + pkgdir <- dir(tempdir, full.names = TRUE)[[1]] + + # Modify description + desc_path <- file.path(pkgdir, "DESCRIPTION") + desc_lines <- readLines(desc_path) + remotes_fields <- c( + "RemoteType: github", + "RemoteHost: api.github.com", + "RemoteRepo: renv", + "RemoteUsername: rstudio", + "RemotePkgRef: rstudio/renv", + paste("RemoteRef: ", sha), + paste("RemoteSha: ", sha) + ) + writeLines(c(desc_lines[desc_lines != ""], remotes_fields), con = desc_path) + + # Re-tar + local({ + old <- setwd(tempdir) + on.exit(setwd(old), add = TRUE) + + tar(destfile, compression = "gzip") + }) + invisible() + } + + # Extract the commit hash from a git archive. Git archives include the SHA1 + # hash as the comment field of the tarball pax extended header + # (see https://www.kernel.org/pub/software/scm/git/docs/git-archive.html) + # For GitHub archives this should be the first header after the default one + # (512 byte) header. 
+ renv_bootstrap_git_extract_sha1_tar <- function(bundle) { + + # open the bundle for reading + # We use gzcon for everything because (from ?gzcon) + # > Reading from a connection which does not supply a 'gzip' magic + # > header is equivalent to reading from the original connection + conn <- gzcon(file(bundle, open = "rb", raw = TRUE)) + on.exit(close(conn)) + + # The default pax header is 512 bytes long and the first pax extended header + # with the comment should be 51 bytes long + # `52 comment=` (11 chars) + 40 byte SHA1 hash + len <- 0x200 + 0x33 + res <- rawToChar(readBin(conn, "raw", n = len)[0x201:len]) + + if (grepl("^52 comment=", res)) { + sub("52 comment=", "", res) + } else { + NULL + } + } + + renv_bootstrap_install <- function(version, tarball, library) { + + # attempt to install it into project library + dir.create(library, showWarnings = FALSE, recursive = TRUE) + output <- renv_bootstrap_install_impl(library, tarball) + + # check for successful install + status <- attr(output, "status") + if (is.null(status) || identical(status, 0L)) + return(status) + + # an error occurred; report it + header <- "installation of renv failed" + lines <- paste(rep.int("=", nchar(header)), collapse = "") + text <- paste(c(header, lines, output), collapse = "\n") + stop(text) + + } + + renv_bootstrap_install_impl <- function(library, tarball) { + + # invoke using system2 so we can capture and report output + bin <- R.home("bin") + exe <- if (Sys.info()[["sysname"]] == "Windows") "R.exe" else "R" + R <- file.path(bin, exe) + + args <- c( + "--vanilla", "CMD", "INSTALL", "--no-multiarch", + "-l", shQuote(path.expand(library)), + shQuote(path.expand(tarball)) + ) + + system2(R, args, stdout = TRUE, stderr = TRUE) + + } + + renv_bootstrap_platform_prefix <- function() { + + # construct version prefix + version <- paste(R.version$major, R.version$minor, sep = ".") + prefix <- paste("R", numeric_version(version)[1, 1:2], sep = "-") + + # include SVN revision for development versions of R + # (to avoid sharing platform-specific artefacts with released versions of R) + devel <- + identical(R.version[["status"]], "Under development (unstable)") || + identical(R.version[["nickname"]], "Unsuffered Consequences") + + if (devel) + prefix <- paste(prefix, R.version[["svn rev"]], sep = "-r") + + # build list of path components + components <- c(prefix, R.version$platform) + + # include prefix if provided by user + prefix <- renv_bootstrap_platform_prefix_impl() + if (!is.na(prefix) && nzchar(prefix)) + components <- c(prefix, components) + + # build prefix + paste(components, collapse = "/") + + } + + renv_bootstrap_platform_prefix_impl <- function() { + + # if an explicit prefix has been supplied, use it + prefix <- Sys.getenv("RENV_PATHS_PREFIX", unset = NA) + if (!is.na(prefix)) + return(prefix) + + # if the user has requested an automatic prefix, generate it + auto <- Sys.getenv("RENV_PATHS_PREFIX_AUTO", unset = NA) + if (is.na(auto) && getRversion() >= "4.4.0") + auto <- "TRUE" + + if (auto %in% c("TRUE", "True", "true", "1")) + return(renv_bootstrap_platform_prefix_auto()) + + # empty string on failure + "" + + } + + renv_bootstrap_platform_prefix_auto <- function() { + + prefix <- tryCatch(renv_bootstrap_platform_os(), error = identity) + if (inherits(prefix, "error") || prefix %in% "unknown") { + + msg <- paste( + "failed to infer current operating system", + "please file a bug report at https://github.com/rstudio/renv/issues", + sep = "; " + ) + + warning(msg) + + } + + prefix + + } + + 
renv_bootstrap_platform_os <- function() { + + sysinfo <- Sys.info() + sysname <- sysinfo[["sysname"]] + + # handle Windows + macOS up front + if (sysname == "Windows") + return("windows") + else if (sysname == "Darwin") + return("macos") + + # check for os-release files + for (file in c("/etc/os-release", "/usr/lib/os-release")) + if (file.exists(file)) + return(renv_bootstrap_platform_os_via_os_release(file, sysinfo)) + + # check for redhat-release files + if (file.exists("/etc/redhat-release")) + return(renv_bootstrap_platform_os_via_redhat_release()) + + "unknown" + + } + + renv_bootstrap_platform_os_via_os_release <- function(file, sysinfo) { + + # read /etc/os-release + release <- utils::read.table( + file = file, + sep = "=", + quote = c("\"", "'"), + col.names = c("Key", "Value"), + comment.char = "#", + stringsAsFactors = FALSE + ) + + vars <- as.list(release$Value) + names(vars) <- release$Key + + # get os name + os <- tolower(sysinfo[["sysname"]]) + + # read id + id <- "unknown" + for (field in c("ID", "ID_LIKE")) { + if (field %in% names(vars) && nzchar(vars[[field]])) { + id <- vars[[field]] + break + } + } + + # read version + version <- "unknown" + for (field in c("UBUNTU_CODENAME", "VERSION_CODENAME", "VERSION_ID", "BUILD_ID")) { + if (field %in% names(vars) && nzchar(vars[[field]])) { + version <- vars[[field]] + break + } + } + + # join together + paste(c(os, id, version), collapse = "-") + + } + + renv_bootstrap_platform_os_via_redhat_release <- function() { + + # read /etc/redhat-release + contents <- readLines("/etc/redhat-release", warn = FALSE) + + # infer id + id <- if (grepl("centos", contents, ignore.case = TRUE)) + "centos" + else if (grepl("redhat", contents, ignore.case = TRUE)) + "redhat" + else + "unknown" + + # try to find a version component (very hacky) + version <- "unknown" + + parts <- strsplit(contents, "[[:space:]]")[[1L]] + for (part in parts) { + + nv <- tryCatch(numeric_version(part), error = identity) + if (inherits(nv, "error")) + next + + version <- nv[1, 1] + break + + } + + paste(c("linux", id, version), collapse = "-") + + } + + renv_bootstrap_library_root_name <- function(project) { + + # use project name as-is if requested + asis <- Sys.getenv("RENV_PATHS_LIBRARY_ROOT_ASIS", unset = "FALSE") + if (asis) + return(basename(project)) + + # otherwise, disambiguate based on project's path + id <- substring(renv_bootstrap_hash_text(project), 1L, 8L) + paste(basename(project), id, sep = "-") + + } + + renv_bootstrap_library_root <- function(project) { + + prefix <- renv_bootstrap_profile_prefix() + + path <- Sys.getenv("RENV_PATHS_LIBRARY", unset = NA) + if (!is.na(path)) + return(paste(c(path, prefix), collapse = "/")) + + path <- renv_bootstrap_library_root_impl(project) + if (!is.null(path)) { + name <- renv_bootstrap_library_root_name(project) + return(paste(c(path, prefix, name), collapse = "/")) + } + + renv_bootstrap_paths_renv("library", project = project) + + } + + renv_bootstrap_library_root_impl <- function(project) { + + root <- Sys.getenv("RENV_PATHS_LIBRARY_ROOT", unset = NA) + if (!is.na(root)) + return(root) + + type <- renv_bootstrap_project_type(project) + if (identical(type, "package")) { + userdir <- renv_bootstrap_user_dir() + return(file.path(userdir, "library")) + } + + } + + renv_bootstrap_validate_version <- function(version, description = NULL) { + + # resolve description file + # + # avoid passing lib.loc to `packageDescription()` below, since R will + # use the loaded version of the package by default anyhow. 
note that + # this function should only be called after 'renv' is loaded + # https://github.com/rstudio/renv/issues/1625 + description <- description %||% packageDescription("renv") + + # check whether requested version 'version' matches loaded version of renv + sha <- attr(version, "sha", exact = TRUE) + valid <- if (!is.null(sha)) + renv_bootstrap_validate_version_dev(sha, description) + else + renv_bootstrap_validate_version_release(version, description) + + if (valid) + return(TRUE) + + # the loaded version of renv doesn't match the requested version; + # give the user instructions on how to proceed + dev <- identical(description[["RemoteType"]], "github") + remote <- if (dev) + paste("rstudio/renv", description[["RemoteSha"]], sep = "@") + else + paste("renv", description[["Version"]], sep = "@") + + # display both loaded version + sha if available + friendly <- renv_bootstrap_version_friendly( + version = description[["Version"]], + sha = if (dev) description[["RemoteSha"]] + ) + + fmt <- heredoc(" + renv %1$s was loaded from project library, but this project is configured to use renv %2$s. + - Use `renv::record(\"%3$s\")` to record renv %1$s in the lockfile. + - Use `renv::restore(packages = \"renv\")` to install renv %2$s into the project library. + ") + catf(fmt, friendly, renv_bootstrap_version_friendly(version), remote) + + FALSE + + } + + renv_bootstrap_validate_version_dev <- function(version, description) { + expected <- description[["RemoteSha"]] + is.character(expected) && startswith(expected, version) + } + + renv_bootstrap_validate_version_release <- function(version, description) { + expected <- description[["Version"]] + is.character(expected) && identical(expected, version) + } + + renv_bootstrap_hash_text <- function(text) { + + hashfile <- tempfile("renv-hash-") + on.exit(unlink(hashfile), add = TRUE) + + writeLines(text, con = hashfile) + tools::md5sum(hashfile) + + } + + renv_bootstrap_load <- function(project, libpath, version) { + + # try to load renv from the project library + if (!requireNamespace("renv", lib.loc = libpath, quietly = TRUE)) + return(FALSE) + + # warn if the version of renv loaded does not match + renv_bootstrap_validate_version(version) + + # execute renv load hooks, if any + hooks <- getHook("renv::autoload") + for (hook in hooks) + if (is.function(hook)) + tryCatch(hook(), error = warnify) + + # load the project + renv::load(project) + + TRUE + + } + + renv_bootstrap_profile_load <- function(project) { + + # if RENV_PROFILE is already set, just use that + profile <- Sys.getenv("RENV_PROFILE", unset = NA) + if (!is.na(profile) && nzchar(profile)) + return(profile) + + # check for a profile file (nothing to do if it doesn't exist) + path <- renv_bootstrap_paths_renv("profile", profile = FALSE, project = project) + if (!file.exists(path)) + return(NULL) + + # read the profile, and set it if it exists + contents <- readLines(path, warn = FALSE) + if (length(contents) == 0L) + return(NULL) + + # set RENV_PROFILE + profile <- contents[[1L]] + if (!profile %in% c("", "default")) + Sys.setenv(RENV_PROFILE = profile) + + profile + + } + + renv_bootstrap_profile_prefix <- function() { + profile <- renv_bootstrap_profile_get() + if (!is.null(profile)) + return(file.path("profiles", profile, "renv")) + } + + renv_bootstrap_profile_get <- function() { + profile <- Sys.getenv("RENV_PROFILE", unset = "") + renv_bootstrap_profile_normalize(profile) + } + + renv_bootstrap_profile_set <- function(profile) { + profile <- 
renv_bootstrap_profile_normalize(profile) + if (is.null(profile)) + Sys.unsetenv("RENV_PROFILE") + else + Sys.setenv(RENV_PROFILE = profile) + } + + renv_bootstrap_profile_normalize <- function(profile) { + + if (is.null(profile) || profile %in% c("", "default")) + return(NULL) + + profile + + } + + renv_bootstrap_path_absolute <- function(path) { + + substr(path, 1L, 1L) %in% c("~", "/", "\\") || ( + substr(path, 1L, 1L) %in% c(letters, LETTERS) && + substr(path, 2L, 3L) %in% c(":/", ":\\") + ) + + } + + renv_bootstrap_paths_renv <- function(..., profile = TRUE, project = NULL) { + renv <- Sys.getenv("RENV_PATHS_RENV", unset = "renv") + root <- if (renv_bootstrap_path_absolute(renv)) NULL else project + prefix <- if (profile) renv_bootstrap_profile_prefix() + components <- c(root, renv, prefix, ...) + paste(components, collapse = "/") + } + + renv_bootstrap_project_type <- function(path) { + + descpath <- file.path(path, "DESCRIPTION") + if (!file.exists(descpath)) + return("unknown") + + desc <- tryCatch( + read.dcf(descpath, all = TRUE), + error = identity + ) + + if (inherits(desc, "error")) + return("unknown") + + type <- desc$Type + if (!is.null(type)) + return(tolower(type)) + + package <- desc$Package + if (!is.null(package)) + return("package") + + "unknown" + + } + + renv_bootstrap_user_dir <- function() { + dir <- renv_bootstrap_user_dir_impl() + path.expand(chartr("\\", "/", dir)) + } + + renv_bootstrap_user_dir_impl <- function() { + + # use local override if set + override <- getOption("renv.userdir.override") + if (!is.null(override)) + return(override) + + # use R_user_dir if available + tools <- asNamespace("tools") + if (is.function(tools$R_user_dir)) + return(tools$R_user_dir("renv", "cache")) + + # try using our own backfill for older versions of R + envvars <- c("R_USER_CACHE_DIR", "XDG_CACHE_HOME") + for (envvar in envvars) { + root <- Sys.getenv(envvar, unset = NA) + if (!is.na(root)) + return(file.path(root, "R/renv")) + } + + # use platform-specific default fallbacks + if (Sys.info()[["sysname"]] == "Windows") + file.path(Sys.getenv("LOCALAPPDATA"), "R/cache/R/renv") + else if (Sys.info()[["sysname"]] == "Darwin") + "~/Library/Caches/org.R-project.R/R/renv" + else + "~/.cache/R/renv" + + } + + renv_bootstrap_version_friendly <- function(version, shafmt = NULL, sha = NULL) { + sha <- sha %||% attr(version, "sha", exact = TRUE) + parts <- c(version, sprintf(shafmt %||% " [sha: %s]", substring(sha, 1L, 7L))) + paste(parts, collapse = "") + } + + renv_bootstrap_exec <- function(project, libpath, version) { + if (!renv_bootstrap_load(project, libpath, version)) + renv_bootstrap_run(version, libpath) + } + + renv_bootstrap_run <- function(version, libpath) { + + # perform bootstrap + bootstrap(version, libpath) + + # exit early if we're just testing bootstrap + if (!is.na(Sys.getenv("RENV_BOOTSTRAP_INSTALL_ONLY", unset = NA))) + return(TRUE) + + # try again to load + if (requireNamespace("renv", lib.loc = libpath, quietly = TRUE)) { + return(renv::load(project = getwd())) + } + + # failed to download or load renv; warn the user + msg <- c( + "Failed to find an renv installation: the project will not be loaded.", + "Use `renv::activate()` to re-initialize the project." + ) + + warning(paste(msg, collapse = "\n"), call. 
= FALSE) + + } + + renv_json_read <- function(file = NULL, text = NULL) { + + jlerr <- NULL + + # if jsonlite is loaded, use that instead + if ("jsonlite" %in% loadedNamespaces()) { + + json <- tryCatch(renv_json_read_jsonlite(file, text), error = identity) + if (!inherits(json, "error")) + return(json) + + jlerr <- json + + } + + # otherwise, fall back to the default JSON reader + json <- tryCatch(renv_json_read_default(file, text), error = identity) + if (!inherits(json, "error")) + return(json) + + # report an error + if (!is.null(jlerr)) + stop(jlerr) + else + stop(json) + + } + + renv_json_read_jsonlite <- function(file = NULL, text = NULL) { + text <- paste(text %||% readLines(file, warn = FALSE), collapse = "\n") + jsonlite::fromJSON(txt = text, simplifyVector = FALSE) + } + + renv_json_read_default <- function(file = NULL, text = NULL) { + + # find strings in the JSON + text <- paste(text %||% readLines(file, warn = FALSE), collapse = "\n") + pattern <- '["](?:(?:\\\\.)|(?:[^"\\\\]))*?["]' + locs <- gregexpr(pattern, text, perl = TRUE)[[1]] + + # if any are found, replace them with placeholders + replaced <- text + strings <- character() + replacements <- character() + + if (!identical(c(locs), -1L)) { + + # get the string values + starts <- locs + ends <- locs + attr(locs, "match.length") - 1L + strings <- substring(text, starts, ends) + + # only keep those requiring escaping + strings <- grep("[[\\]{}:]", strings, perl = TRUE, value = TRUE) + + # compute replacements + replacements <- sprintf('"\032%i\032"', seq_along(strings)) + + # replace the strings + mapply(function(string, replacement) { + replaced <<- sub(string, replacement, replaced, fixed = TRUE) + }, strings, replacements) + + } + + # transform the JSON into something the R parser understands + transformed <- replaced + transformed <- gsub("{}", "`names<-`(list(), character())", transformed, fixed = TRUE) + transformed <- gsub("[[{]", "list(", transformed, perl = TRUE) + transformed <- gsub("[]}]", ")", transformed, perl = TRUE) + transformed <- gsub(":", "=", transformed, fixed = TRUE) + text <- paste(transformed, collapse = "\n") + + # parse it + json <- parse(text = text, keep.source = FALSE, srcfile = NULL)[[1L]] + + # construct map between source strings, replaced strings + map <- as.character(parse(text = strings)) + names(map) <- as.character(parse(text = replacements)) + + # convert to list + map <- as.list(map) + + # remap strings in object + remapped <- renv_json_read_remap(json, map) + + # evaluate + eval(remapped, envir = baseenv()) + + } + + renv_json_read_remap <- function(json, map) { + + # fix names + if (!is.null(names(json))) { + lhs <- match(names(json), names(map), nomatch = 0L) + rhs <- match(names(map), names(json), nomatch = 0L) + names(json)[rhs] <- map[lhs] + } + + # fix values + if (is.character(json)) + return(map[[json]] %||% json) + + # handle true, false, null + if (is.name(json)) { + text <- as.character(json) + if (text == "true") + return(TRUE) + else if (text == "false") + return(FALSE) + else if (text == "null") + return(NULL) + } + + # recurse + if (is.recursive(json)) { + for (i in seq_along(json)) { + json[i] <- list(renv_json_read_remap(json[[i]], map)) + } + } + + json + + } + + # load the renv profile, if any + renv_bootstrap_profile_load(project) + + # construct path to library root + root <- renv_bootstrap_library_root(project) + + # construct library prefix for platform + prefix <- renv_bootstrap_platform_prefix() + + # construct full libpath + libpath <- file.path(root, 
prefix) + + # run bootstrap code + renv_bootstrap_exec(project, libpath, version) + + invisible() + +}) diff --git a/preprocessing/renv/settings.json b/preprocessing/renv/settings.json new file mode 100644 index 0000000..74369b9 --- /dev/null +++ b/preprocessing/renv/settings.json @@ -0,0 +1,21 @@ +{ + "bioconductor.version": null, + "external.libraries": [], + "ignored.packages": [ + "omopcat" + ], + "package.dependency.fields": [ + "Imports", + "Depends", + "LinkingTo" + ], + "ppm.enabled": null, + "ppm.ignored.urls": [], + "r.version": null, + "snapshot.type": "implicit", + "use.cache": true, + "vcs.ignore.cellar": true, + "vcs.ignore.library": true, + "vcs.ignore.local": true, + "vcs.manage.ignores": true +} diff --git a/preprocessing/tests/testthat.R b/preprocessing/tests/testthat.R new file mode 100644 index 0000000..b7d71c5 --- /dev/null +++ b/preprocessing/tests/testthat.R @@ -0,0 +1,12 @@ +# This file is part of the standard setup for testthat. +# It is recommended that you do not modify it. +# +# Where should you do additional test configuration? +# Learn more about the roles of various files in: +# * https://r-pkgs.org/testing-design.html#sec-tests-files-overview +# * https://testthat.r-lib.org/articles/special-files.html + +library(testthat) +library(omopcat.preprocessing) + +test_check("omopcat.preprocessing") diff --git a/preprocessing/tests/testthat/setup.R b/preprocessing/tests/testthat/setup.R new file mode 100644 index 0000000..b519a2b --- /dev/null +++ b/preprocessing/tests/testthat/setup.R @@ -0,0 +1,8 @@ +withr::local_envvar( + EUNOMIA_DATA_FOLDER = testthat::test_path("../../data-raw/test_db"), + TEST_DB_NAME = "GiBleed", + DB_CDM_SCHEMA = "main", + .local_envir = testthat::teardown_env() +) + +mock_cdm <- .setup_cdm_object(.envir = testthat::teardown_env()) diff --git a/preprocessing/tests/testthat/test-concepts.R b/preprocessing/tests/testthat/test-concepts.R new file mode 100644 index 0000000..740d30e --- /dev/null +++ b/preprocessing/tests/testthat/test-concepts.R @@ -0,0 +1,10 @@ +test_that("generate_concepts works on a CDM object", { + concepts <- generate_concepts(mock_cdm, concept_ids = c(1118088, 1569708, 3020630)) + expect_s3_class(concepts, "data.frame") + expect_equal(nrow(concepts), 3) + expect_true( + all(c("concept_id", "concept_name", "vocabulary_id", "domain_id", "concept_class_id") %in% names(concepts)) + ) + expect_type(concepts$concept_id, "integer") + expect_type(concepts$concept_name, "character") +}) diff --git a/preprocessing/tests/testthat/test-monthly_counts.R b/preprocessing/tests/testthat/test-monthly_counts.R new file mode 100644 index 0000000..223bc49 --- /dev/null +++ b/preprocessing/tests/testthat/test-monthly_counts.R @@ -0,0 +1,44 @@ +test_that("generate_monthly_counts works on a CDM object", { + monthly_counts <- generate_monthly_counts(mock_cdm) + expect_s3_class(monthly_counts, "data.frame") + expect_true(nrow(monthly_counts) > 0) + expect_named(monthly_counts, c( + "concept_id", "concept_name", "date_year", "date_month", "record_count", + "person_count", "records_per_person" + )) +}) + +## Set up a mock measurement OMOP table +## Measurements for 3 different patients on the same day, with 1 patient having 2 measurements +measurement <- data.frame( + measurement_id = 1:4, + person_id = c(1, 1, 2, 3), + measurement_type_concept_id = 12345, + measurement_concept_id = 1, + measurement_date = "2020-01-01", + value_as_number = c(2, 1, 2, 1), + value_as_concept_id = 0 +) + +test_that("calculate_monthly_counts produces the expected 
results", { + res <- calculate_monthly_counts(measurement, measurement_concept_id, measurement_date) + expect_s3_class(res, "data.frame") + expect_named(res, c("concept_id", "date_year", "date_month", "record_count", "person_count", "records_per_person")) + expect_equal(nrow(res), 1) + expect_equal(res$person_count, 3) + expect_equal(res$records_per_person, 4 / 3) +}) + +db <- dbplyr::src_memdb() +db_measurement <- dplyr::copy_to(db, measurement, name = "measurement", overwrite = TRUE) +test_that("calculate_monthly_counts works on Database-stored tables", { + ref <- calculate_monthly_counts(measurement, measurement_concept_id, measurement_date) + db_res <- calculate_monthly_counts(db_measurement, measurement_concept_id, measurement_date) + + expect_s3_class(db_res, "data.frame") + expect_named(db_res, c("concept_id", "date_year", "date_month", "record_count", "person_count", "records_per_person")) + expect_type(db_res$record_count, "integer") + expect_type(db_res$person_count, "integer") + expect_type(db_res$records_per_person, "double") + expect_identical(db_res, ref) +}) diff --git a/preprocessing/tests/testthat/test-preprocessing.R b/preprocessing/tests/testthat/test-preprocessing.R new file mode 100644 index 0000000..378d92b --- /dev/null +++ b/preprocessing/tests/testthat/test-preprocessing.R @@ -0,0 +1,56 @@ +# Disable cli messages +withr::local_options(usethis.quiet = TRUE, cli.default_handler = function(...) {}) + +test_that("preprocessing produces the expected files", { + testthat::skip_on_ci() # avoid re-downloading example database on GHA runners + out_path <- tempdir() + expected_files <- c( + file.path(out_path, "omopcat_concepts.parquet"), + file.path(out_path, "omopcat_monthly_counts.parquet"), + file.path(out_path, "omopcat_summary_stats.parquet") + ) + withr::defer(fs::file_delete(expected_files)) + + expect_no_error({ + success <- preprocess(out_path = out_path) + }) + expect_true(success) + expect_true(all(file.exists(expected_files))) +}) + +test_that("preprocessing fails without valid out_path", { + expect_error(preprocess(out_path = ""), "should not be empty") +}) + +test_that("preprocessing is skipped if files already exist", { + out_path <- tempdir() + out_files <- c( + file.path(out_path, "omopcat_concepts.parquet"), + file.path(out_path, "omopcat_monthly_counts.parquet"), + file.path(out_path, "omopcat_summary_stats.parquet") + ) + fs::file_create(out_files) + withr::defer(fs::file_delete(out_files)) + + expect_false(preprocess(out_path = out_path)) +}) + +test_that("preprocessing fails if envvars are missing", { + withr::local_envvar( + ENV = "prod", + DB_NAME = NULL, + HOST = NULL, + PORT = NULL, + DB_USERNAME = NULL, + DB_PASSWORD = NULL, + DB_CDM_SCHEMA = NULL + ) + expect_error(preprocess(out_path = tempfile()), "not set") +}) + +test_that("Setting up CDM object works for non-prod data", { + skip_if_not_installed("duckdb") + withr::local_envvar(ENV = "test") + cdm <- .setup_cdm_object() + expect_equal(CDMConnector::cdmVersion(cdm), "5.3") +}) diff --git a/preprocessing/tests/testthat/test-summary_stats.R b/preprocessing/tests/testthat/test-summary_stats.R new file mode 100644 index 0000000..7757858 --- /dev/null +++ b/preprocessing/tests/testthat/test-summary_stats.R @@ -0,0 +1,46 @@ +test_that("generate_summary_stats works on a CDM object", { + summary_stats <- generate_summary_stats(mock_cdm) + expect_s3_class(summary_stats, "data.frame") + expect_named(summary_stats, c( + "concept_id", "concept_name", "summary_attribute", "value_as_number", 
"value_as_string" + )) +}) + +## Mock measurments for 2 concepts +mock_measurements <- data.frame( + measurement_id = 1:8, + person_id = 1, + measurement_type_concept_id = rep(c(12345, 23456), each = 4), + measurement_concept_id = rep(c(1, 2), each = 4), + measurement_date = "2020-01-01", + value_as_number = c(2, 1, 2, 1, rep(NA, 4)), + value_as_concept_id = c(rep(0, 4), c(1, 1, 2, 3)) +) + +test_that("calculate_summary_stats produces the expected results", { + res <- calculate_summary_stats(mock_measurements, "measurement_concept_id") + expect_s3_class(res, "data.frame") + expect_named(res, c("concept_id", "summary_attribute", "value_as_number", "value_as_concept_id")) + expect_equal(nrow(res), 5) + mean <- res[res$summary_attribute == "mean", ][["value_as_number"]] + sd <- res[res$summary_attribute == "sd", ][["value_as_number"]] + expect_equal(mean, 1.5) + expect_equal(sd^2, 1 / 3) + + frequencies <- res[res$summary_attribute == "frequency", ] + expect_equal(nrow(frequencies), 3) + expect_equal(frequencies$value_as_number, c(2, 1, 1)) + expect_equal(frequencies$value_as_concept_id, c(1, 2, 3)) +}) + +db <- dbplyr::src_memdb() +db_measurement <- dplyr::copy_to(db, mock_measurements, name = "measurement", overwrite = TRUE) +test_that("calculate_summary_stats works with a database-stored table", { + ref <- calculate_summary_stats(mock_measurements, "measurement_concept_id") + db_res <- calculate_summary_stats(db_measurement, "measurement_concept_id") + + expect_s3_class(db_res, "data.frame") + expect_named(db_res, c("concept_id", "summary_attribute", "value_as_number", "value_as_concept_id")) + expect_identical(db_res, ref) + expect_type(db_res$value_as_number, "double") +}) diff --git a/scripts/01_setup_test_db.R b/scripts/01_setup_test_db.R deleted file mode 100644 index be5bb2f..0000000 --- a/scripts/01_setup_test_db.R +++ /dev/null @@ -1,22 +0,0 @@ -cli::cli_h1("Setting up test database") - -library(omopcat) - -# Create an duckdb database from Eunomia datasets -db_path <- CDMConnector::eunomia_dir( - dataset_name = Sys.getenv("TEST_DB_NAME"), - cdm_version = Sys.getenv("TEST_DB_OMOP_VERSION"), - database_file = tempfile(fileext = ".duckdb") -) - -con <- connect_to_test_duckdb(db_path) - -# Use 'cdm_from_con' to load the dataset and verify integrity -CDMConnector::cdm_from_con( - con = con, - cdm_schema = Sys.getenv("TEST_DB_CDM_SCHEMA"), - write_schema = Sys.getenv("TEST_DB_RESULTS_SCHEMA"), - cdm_name = Sys.getenv("TEST_DB_NAME") -) - -cli::cli_alert_success("Test database setup successfully") diff --git a/scripts/02_insert_dummy_tables.R b/scripts/02_insert_dummy_tables.R deleted file mode 100644 index e150d3c..0000000 --- a/scripts/02_insert_dummy_tables.R +++ /dev/null @@ -1,70 +0,0 @@ -# PRODUCED FOR A SPECIFIC DATASET: -# synthea-allergies-10k -# (but could work for others) - -cli::cli_h1("Inserting dummy tables") - - -# Setup --------------------------------------------------------------------------------------- - -library(readr) -library(omopcat) - -dir <- Sys.getenv("EUNOMIA_DATA_FOLDER") -name <- Sys.getenv("TEST_DB_NAME") -version <- Sys.getenv("TEST_DB_OMOP_VERSION") - -db_path <- glue::glue("{dir}/{name}_{version}_1.0.duckdb") - -con <- connect_to_test_duckdb(db_path) - - -# Insert dummy tables ------------------------------------------------------------------------- - -## Load dummy data and write tables to database -## We explicitly set the column types for columns that are needed later down the pipeline -dummy_measurements <- read_csv( - 
here::here("data-raw/test_db/dummy/measurement.csv"), - col_types = cols( - measurement_id = col_integer(), - person_id = col_integer(), - measurement_concept_id = col_integer(), - measurement_date = col_date(), - value_as_number = col_double(), - value_as_concept_id = col_integer(), - ) -) -write_table(dummy_measurements, con, "measurement", schema = Sys.getenv("TEST_DB_CDM_SCHEMA")) - -dummy_observations <- read_csv( - here::here( - "data-raw/test_db/dummy/observation.csv" - ), - col_types = cols( - observation_id = col_integer(), - person_id = col_integer(), - observation_concept_id = col_integer(), - observation_date = col_date(), - value_as_number = col_double(), - value_as_string = col_logical(), - value_as_concept_id = col_integer(), - ) -) -write_table(dummy_observations, con, "observation", schema = Sys.getenv("TEST_DB_CDM_SCHEMA")) - -# Sanity check: read the data back and make sure its consistent -db_measurements <- DBI::dbReadTable(con, "measurement") -stopifnot(all.equal(db_measurements, as.data.frame(dummy_measurements))) - -db_observations <- DBI::dbReadTable(con, "observation") -stopifnot(all.equal(db_observations, as.data.frame(dummy_observations))) - -# Load the CMD object to verify integrity of the schema after insertions -cdm <- CDMConnector::cdm_from_con( - con = con, - cdm_schema = Sys.getenv("TEST_DB_CDM_SCHEMA"), - write_schema = Sys.getenv("TEST_DB_RESULTS_SCHEMA"), - cdm_name = name -) - -cli::cli_alert_success("Dummy tables inserted successfully") diff --git a/scripts/03_analyse_omop_cdm.R b/scripts/03_analyse_omop_cdm.R deleted file mode 100644 index 999a883..0000000 --- a/scripts/03_analyse_omop_cdm.R +++ /dev/null @@ -1,63 +0,0 @@ -cli::cli_h1("Generating summarys statistics") - - -# Setup --------------------------------------------------------------------------------------- - -library(omopcat) - -if (Sys.getenv("ENV") == "prod") { - fs::dir_create({ - out_path <- file.path(Sys.getenv("OMOPCAT_DATA_PATH")) - }) - cli::cli_alert_info("out_path set to {out_path}") - - name <- Sys.getenv("DB_NAME") - con <- connect_to_db( - RPostgres::Postgres(), - dbname = Sys.getenv("DB_NAME"), - host = Sys.getenv("HOST"), - port = Sys.getenv("PORT"), - user = Sys.getenv("DB_USERNAME"), - password = Sys.getenv("DB_PASSWORD") - ) -} else { - dir <- Sys.getenv("EUNOMIA_DATA_FOLDER") - name <- Sys.getenv("TEST_DB_NAME") - version <- Sys.getenv("TEST_DB_OMOP_VERSION") - - db_path <- glue::glue("{dir}/{name}_{version}_1.0.duckdb") - con <- connect_to_test_duckdb(db_path) - - fs::dir_create( - out_path <- here::here("data/test_data") - ) -} - -# Calculate summary stats --------------------------------------------------------------------- - -# Load the data in a CDMConnector object -cdm <- CDMConnector::cdm_from_con( - con = con, - cdm_schema = Sys.getenv("DB_CDM_SCHEMA"), - write_schema = Sys.getenv("DB_CDM_SCHEMA"), - cdm_name = name -) - -cli::cli_progress_step("Calculating monthly counts...") -monthly_counts <- process_monthly_counts(cdm) -cli::cli_progress_step("Calculating summary statistics...") -summary_stats <- process_summary_stats(cdm) -ids <- unique(c(monthly_counts$concept_id, summary_stats$concept_id)) -concepts_table <- query_concepts_table(cdm, ids) - -all_tables <- list( - concepts = concepts_table, - monthly_counts = monthly_counts, - summary_stats = summary_stats -) -paths <- purrr::map_chr(names(all_tables), ~ file.path(out_path, glue::glue("omopcat_{.x}.parquet"))) - -# Write the tables to disk as parquet -purrr::walk2(all_tables, paths, ~ 
nanoparquet::write_parquet(.x, .y)) - -cli::cli_alert_success("Summary statistics generated successfully and written to {.file {paths}}") diff --git a/scripts/04_produce_dev_data.R b/scripts/04_produce_dev_data.R deleted file mode 100644 index 459e119..0000000 --- a/scripts/04_produce_dev_data.R +++ /dev/null @@ -1,43 +0,0 @@ -cli::cli_h1("Producing test data") - - -# Setup --------------------------------------------------------------------------------------- - -suppressPackageStartupMessages({ - library(dplyr) - library(omopcat) -}) - -data_path <- here::here("data/test_data") -stopifnot(dir.exists(data_path)) -out_path <- here::here("inst/dev_data") -stopifnot(dir.exists(out_path)) - -# Produce test data --------------------------------------------------------------------------- - -# Get the relevant tables and filter -table_names <- c("concepts", "monthly_counts", "summary_stats") -paths <- glue::glue("{data_path}/omopcat_{table_names}.parquet") -tables <- purrr::map(paths, read_parquet_sorted) -names(tables) <- table_names - -# Keep only concepts for which we have summary statistics -keep_concepts <- tables$summary_stats$concept_id -tables <- purrr::map(tables, ~ .x[.x$concept_id %in% keep_concepts, ]) - -# Keep only data from 2019 onwards -monthly_counts <- tables$monthly_counts -filtered_monthly <- monthly_counts[monthly_counts$date_year >= 2019, ] -tables$monthly_counts <- filtered_monthly - -# Filter the other tables to match the concepts left over after year filtering -tables <- purrr::map(tables, ~ .x[.x$concept_id %in% filtered_monthly$concept_id, ]) - -# Write all results to the test data folder -purrr::iwalk(tables, function(tbl, name) { - path <- glue::glue("{out_path}/omopcat_{name}.csv") - cli::cli_alert_info("Writing {name} to {path}") - readr::write_csv(tbl, file = path) -}) - -cli::cli_alert_success("Test data produced") diff --git a/scripts/create_dev_data.R b/scripts/create_dev_data.R index ee6cd8b..96d6306 100644 --- a/scripts/create_dev_data.R +++ b/scripts/create_dev_data.R @@ -1,20 +1,55 @@ -# Master script to set up the test data -# Generates the dummy data in `inst/test_data` for running the app in dev mode by calling -# the relevant scripts in the correct order. 
- -here::i_am("scripts/create_dev_data.R") - -Sys.setenv("ENV" = "dev") -# Path to download Eunomia datasets -Sys.setenv(EUNOMIA_DATA_FOLDER = file.path("data-raw/test_db/eunomia")) -# Name of the synthetic dataset to use -Sys.setenv(TEST_DB_NAME = "synthea-allergies-10k") -# OMOP CDM version -Sys.setenv(TEST_DB_OMOP_VERSION = "5.3") -# Schema name for data and results -Sys.setenv(DB_CDM_SCHEMA = "main") - -source(here::here("scripts/01_setup_test_db.R")) -source(here::here("scripts/02_insert_dummy_tables.R")) -source(here::here("scripts/03_analyse_omop_cdm.R")) -source(here::here("scripts/04_produce_dev_data.R")) +# Setup --------------------------------------------------------------------------------------- + +suppressPackageStartupMessages({ + library(dplyr) +}) + +data_path <- here::here("data/test_data") +stopifnot(dir.exists(data_path)) +out_path <- here::here("inst/dev_data") +stopifnot(dir.exists(out_path)) + +# Produce test data --------------------------------------------------------------------------- + +#' Read a parquet table and sort the results +#' +#' @param path path to the parquet file to be read +#' @inheritParams nanoparquet::read_parquet +#' +#' @return A `data.frame` with the results sorted by all columns +#' @importFrom dplyr arrange across everything +read_parquet_sorted <- function(path, options = nanoparquet::parquet_options()) { + if (!file.exists(path)) { + cli::cli_abort("File {.file {path}} not found") + } + + nanoparquet::read_parquet(path, options) |> + arrange(across(everything())) +} + +# Get the relevant tables and filter +table_names <- c("concepts", "monthly_counts", "summary_stats") +paths <- glue::glue("{data_path}/omopcat_{table_names}.parquet") +tables <- purrr::map(paths, read_parquet_sorted) +names(tables) <- table_names + +# Keep only concepts for which we have summary statistics +keep_concepts <- tables$summary_stats$concept_id +tables <- purrr::map(tables, ~ .x[.x$concept_id %in% keep_concepts, ]) + +# Keep only data from 2019 onwards +monthly_counts <- tables$monthly_counts +filtered_monthly <- monthly_counts[monthly_counts$date_year >= 2019, ] +tables$monthly_counts <- filtered_monthly + +# Filter the other tables to match the concepts left over after year filtering +tables <- purrr::map(tables, ~ .x[.x$concept_id %in% filtered_monthly$concept_id, ]) + +# Write all results to the test data folder +purrr::iwalk(tables, function(tbl, name) { + path <- glue::glue("{out_path}/omopcat_{name}.csv") + cli::cli_alert_info("Writing {name} to {path}") + readr::write_csv(tbl, file = path) +}) + +cli::cli_alert_success("Test data produced") diff --git a/scripts/create_prod_data.R b/scripts/create_prod_data.R deleted file mode 100644 index b3c68a8..0000000 --- a/scripts/create_prod_data.R +++ /dev/null @@ -1,50 +0,0 @@ -# Master script to create the prod data -# Generates the `data/prod_data/*.parquet` files - -here::i_am("scripts/create_prod_data.R") - -Sys.setenv("ENV" = "prod") - -required_envvars <- c( - "DB_NAME", - "HOST", - "PORT", - "DB_USERNAME", - "DB_PASSWORD", - "OMOPCAT_DATA_PATH" -) - -out_path <- Sys.getenv("OMOPCAT_DATA_PATH") - -check_envvars <- function(x) { - missing <- Sys.getenv(x) == "" - if (any(missing)) { - cli::cli_abort(c( - "x" = "Environment variable{?s} {.envvar {x[missing]}} not set", - "i" = "Make sure to define the environment variables in a local {.file .Renviron} file" - ), call = rlang::caller_env()) - } -} -check_envvars(required_envvars) - -expected_files <- c( - file.path(out_path, "omopcat_concepts.parquet"), - 
file.path(out_path, "omopcat_monthly_counts.parquet"), - file.path(out_path, "omopcat_summary_stats.parquet") -) - -# Only run pre-processing if the expected files don't exist -exists <- file.exists(expected_files) -if (!all(exists)) { - source(here::here("scripts/03_analyse_omop_cdm.R")) - # Sanity check: make sure the expected files were created - purrr::walk(expected_files, function(path) { - if (!file.exists(path)) { - cli::cli_abort("Expected file not found: {.file {path}}") - } - }) -} else { - cli::cli_alert_info("All expected files already exist. Skipping pre-processing.") - cli::cli_alert_info("To force re-processing, delete the following files:") - cli::cli_ul(expected_files[exists]) -} diff --git a/scripts/create_test_data.R b/scripts/create_test_data.R new file mode 100644 index 0000000..72374a6 --- /dev/null +++ b/scripts/create_test_data.R @@ -0,0 +1,13 @@ +# Generates the parquet files in data/test_data/ by running the preprocessing pipeline +# on the test database located at data-raw/test_db/eunomia +withr::local_envvar( + ENV = "test", + EUNOMIA_DATA_FOLDER = here::here("data-raw/test_db/eunomia"), + DB_NAME = "synthea-allergies-10k", + DB_CDM_SCHEMA = "main" +) + +out_path <- here::here("data/test_data") +omopcat.preprocessing::preprocess(out_path) + +cli::cli_alert_success("Test data written to {out_path}") diff --git a/tests/testthat/test-utils_preprocessing_summarise.R b/tests/testthat/test-utils_preprocessing_summarise.R deleted file mode 100644 index 2e5713c..0000000 --- a/tests/testthat/test-utils_preprocessing_summarise.R +++ /dev/null @@ -1,72 +0,0 @@ -## Set up a mock measurement OMOP table -## Measurements for 3 different patients on the same day, with 1 patient having 2 measurements -measurement <- data.frame( - measurement_id = 1:4, - person_id = c(1, 1, 2, 3), - measurement_type_concept_id = 12345, - measurement_concept_id = 1, - measurement_date = "2020-01-01", - value_as_number = c(2, 1, 2, 1), - value_as_concept_id = 0 -) - -test_that("calculate_monthly_counts produces the expected results", { - res <- calculate_monthly_counts(measurement, measurement_concept_id, measurement_date) - expect_s3_class(res, "data.frame") - expect_named(res, c("concept_id", "date_year", "date_month", "record_count", "person_count", "records_per_person")) - expect_equal(nrow(res), 1) - expect_equal(res$person_count, 3) - expect_equal(res$records_per_person, 4 / 3) -}) - -db <- dbplyr::src_memdb() -db_measurement <- dplyr::copy_to(db, measurement, name = "measurement", overwrite = TRUE) -test_that("calculate_monthly_counts works on Database-stored tables", { - ref <- calculate_monthly_counts(measurement, measurement_concept_id, measurement_date) - db_res <- calculate_monthly_counts(db_measurement, measurement_concept_id, measurement_date) - - expect_s3_class(db_res, "data.frame") - expect_named(db_res, c("concept_id", "date_year", "date_month", "record_count", "person_count", "records_per_person")) - expect_type(db_res$record_count, "integer") - expect_type(db_res$person_count, "integer") - expect_type(db_res$records_per_person, "double") - expect_identical(db_res, ref) -}) - -test_that("calculate_summary_stats produces the expected results", { - res <- calculate_summary_stats(measurement, "measurement_concept_id") - expect_s3_class(res, "data.frame") - expect_named(res, c("concept_id", "summary_attribute", "value_as_number", "value_as_concept_id")) - expect_equal(nrow(res), 2) - mean <- res[res$summary_attribute == "mean", ][["value_as_number"]] - sd <- res[res$summary_attribute 
== "sd", ][["value_as_number"]] - expect_equal(mean, 1.5) - expect_equal(sd^2, 1 / 3) -}) - -## Add a categorical concept -categorical_measurement <- data.frame( - measurement_id = 1:4, - person_id = c(1, 1, 2, 3), - measurement_type_concept_id = 12345, - measurement_concept_id = 2, - measurement_date = as.Date("2020-01-01"), - value_as_number = NA, - value_as_concept_id = c(1, 1, 2, 3) -) -measurement <- rbind(measurement, categorical_measurement) -test_that("calculate_summary_stats can handle categorical concepts", { - res <- calculate_summary_stats(measurement, "measurement_concept_id") - frequencies <- res[res$summary_attribute == "frequency", ] - expect_equal(nrow(frequencies), 3) - expect_equal(frequencies$value_as_number, c(2, 1, 1)) - expect_equal(frequencies$value_as_concept_id, c(1, 2, 3)) -}) - -db <- dbplyr::src_memdb() -db_measurement <- dplyr::copy_to(db, measurement, name = "measurement", overwrite = TRUE) -test_that("calculate_summary_stats works with a database-stored table", { - res <- calculate_summary_stats(db_measurement, "measurement_concept_id") - expect_s3_class(res, "data.frame") - expect_named(res, c("concept_id", "summary_attribute", "value_as_number", "value_as_concept_id")) -})
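A minimal sketch, not part of the diff itself, of how the relocated preprocessing tests could be run locally under the new per-package layout shown above. It assumes the working directory is the repository root; the test environment variables (EUNOMIA_DATA_FOLDER, TEST_DB_NAME, DB_CDM_SCHEMA) are already set by preprocessing/tests/testthat/setup.R, so nothing beyond these two calls should be needed:

# Hypothetical local workflow, assuming the layout introduced in this diff.
renv::restore(project = "preprocessing")  # install the packages pinned in preprocessing/renv.lock
testthat::test_local("preprocessing")     # run the suite under preprocessing/tests/testthat/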