Skip to content

Commit

Permalink
Changes (7c1d73f) from SAE environment
Browse files Browse the repository at this point in the history
  • Loading branch information
popdata-ocwa-user committed Mar 9, 2022
1 parent f866366 commit 55035f0
Show file tree
Hide file tree
Showing 59 changed files with 1,087 additions and 1,504 deletions.
2 changes: 1 addition & 1 deletion .Rbuildignore
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ data-raw/

^\.github$
^\.github/workflows/R-CMD-check\.yaml$
.Rprofile
^\.Rprofile$


^_pkgdown\.yml$
Expand Down
3 changes: 3 additions & 0 deletions .Rprofile
Original file line number Diff line number Diff line change
Expand Up @@ -31,3 +31,6 @@ if (Sys.getenv("USERDOMAIN") == "POPDATA") {

reassignInPackage("check_file", "digest", .custom_check_file)
}

## Silence a CRAN warning
Sys.setenv('_R_CHECK_SYSTEM_CLOCK_' = 0)
1 change: 0 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,4 @@
.Ruserdata

.dipr/
scratch/
docs
30 changes: 17 additions & 13 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,24 +1,28 @@
Package: dipr
Title: Provide functions to efficiently import SRE data
Version: 1.0.0.9000
Version: 1.2.1
Authors@R:
c(person(given = "Sam",
family = "Albers",
role = c("aut", "cre"),
email = "[email protected]"),
person(given = "Stephanie",
family = "Hazlitt",
role = "aut",
email = "[email protected]"),
person(given = "Craig",
family = "Hutton",
role = "ctb",
email = "[email protected]"),
person(given = "Andy",
family = "Teucher",
email = "[email protected]",
role = "ctb",
role = "aut",
comment = c(ORCID = "0000-0002-7840-692X")),
person(given = "Bonnie",
family = "Robert",
email = "[email protected]",
role = "aut"),
person(given = "Craig",
family = "Hutton",
role = "ctb",
email = "[email protected]"),
person(given = "Stephanie",
family = "Hazlitt",
role = "ctb",
email = "[email protected]"),
person(given = "Province of British Columbia",
role = "cph")
)
Expand All @@ -27,17 +31,17 @@ Description: Provides functions that aid in working with SRE data. Caching
of data into memory. Functions are also provided to convert to Apache
Arrow formats.
License: Apache License (== 2.0) | file LICENSE
URL: https://github.com/bcgov/dipr, https://projectsc.popdata.bc.ca/shares/dipr2
URL: https://github.com/bcgov/dipr, https://projectsc.popdata.bc.ca/shares/dipr
Imports:
arrow,
credentials,
cli,
data.table,
devtools,
remotes,
dplyr,
fs,
fst,
glue,
httr,
janitor,
lubridate,
readr (>= 2.0.0),
Expand Down
14 changes: 14 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,5 +1,13 @@
# Generated by roxygen2: do not edit by hand

S3method(add_linked_status_col,Dataset)
S3method(add_linked_status_col,arrow_dplyr_query)
S3method(add_linked_status_col,data.frame)
S3method(add_linked_status_col,default)
S3method(filter_across,Dataset)
S3method(filter_across,arrow_dplyr_query)
S3method(filter_across,data.frame)
S3method(filter_across,default)
export("%>%")
export(add_linked_status_col)
export(dat_to_arrow)
Expand All @@ -9,6 +17,7 @@ export(dat_to_feather)
export(dat_to_parquet)
export(delete_rtmp_dirs)
export(dipr_create_targets_project)
export(dipr_document_output_groups)
export(dipr_example)
export(dipr_examples)
export(dipr_icd10_categories)
Expand All @@ -17,17 +26,22 @@ export(dipr_icd_categories)
export(dipr_update)
export(dipr_use_export_doc)
export(dipr_write_parquet)
export(filter_across)
export(filter_linked)
export(get_core_dat_path)
export(get_core_dict_path)
export(get_gitlab_sre_repos)
export(group_ages)
export(insert_bcgov_apache_header)
export(insert_bcgov_cc_header)
export(install_sre_gitlab)
export(msp_unique)
export(ocwa_branch_export)
export(read_dat)
export(read_dat_dt)
export(read_health_dict)
export(read_nflt)
export(restore_rstudio_prefs)
export(set_gitlab_credentials)
import(data.table)
importFrom(dplyr,"%>%")
Expand Down
28 changes: 27 additions & 1 deletion NEWS.md
Original file line number Diff line number Diff line change
@@ -1,11 +1,37 @@
# dipr (development version)
# dipr 1.2.1
* Add Bonnie Robert and Andy Teucher as authors
* Document deprecated functions in one place

# dipr 1.2.0

## Breaking Changes
* Deprecate `msp_unique` and re-export it in the hawkeye package
* Deprecate `dipr_icd_categories`, `dipr_icd9_categories` and `dipr_icd10_categories` functions in favour of functionality added to hawkeye package

## Improvements
* Add new function `filter_across`
* Changed the internal method to install and update `{dipr}`, with new instructions in the README. The new method uses the gitlab API and will enable using `{dipr}` to install other SRE gitlab R packages using `install_sre_gitlab`. (#24)
* Add new function `get_gitlab_sre_repos`
* Add new function `filter_linked` and add `Dataset` methods for `add_linked_status_col` function so that it can be used in an arrow workflow.
* The template `DESCRIPTION` file now adds dependent packages to the `Depends` field rather than the `Imports` field.
* Fixed a bug where the template `.Rprofile` file used in `dipr_create_targets_project()` was not included in the package.
* Deprecate `msp_unique` and re-export it in the hawkeye package
* New function `restore_rstudio_prefs()` to help setup RStudio in a new SRE machine (#31)

# dipr 1.1.0

* Added function `ocwa_branch_export()` (#26) to create a clean branch to prepare the repo for import into OCWA by:
1. Creating a new branch
2. Removing files that can't be imported - these are listed in the `_ocwaignore` file in the root of the repo
3. Cleaning `README.md` to comment out references to images and links that won't be available in the SRE.
4. Committing the changes from 2 and 3 to the new branch and pushing that to GitHub. This branch can then be used as the basis for an import into OCWA.

* Added `dipr_create_targets_project` which will create a thin package-like targets folder structure.
* Adding `dipr_document_output_groups` as a convenience to document datasets
* Changes `get_core_dat_path()` and `get_core_dict_path()` to accommodate the new structure of provisioned data where the metadata are in the same directory as the data. (#17)
* Adds a new `data_format` argument to `dat_to_arrow_formats()` and friends, as well as `read_dat()` and the internal `dipr_reader()` (#17)
* Exposes `...` in `read_nflt()` to allow passing options to `readr::read_delim()` (#17)
* Comments are now removed from nflt files in `read_nflt`. Comments are by default expected to be denoted by `/*`, but this is customizable with the `comment` argument. (#21)

# dipr 1.0.0

Expand Down
47 changes: 42 additions & 5 deletions R/add.R
Original file line number Diff line number Diff line change
Expand Up @@ -16,17 +16,54 @@
#' This function is really meant to be used in a pipeline as a quick shortcut to add the
#' linked status column
#'
#' @param .data a data frame (or tibble) with a studyid column
#' @param .data a data frame (or tibble) or arrow Dataset with a studyid column
#' @param studyid_col the name of the study id column. Defaults to "studyid"
#'
#' @return the same data frame plus a `linked_status` column
#'
#' @export
add_linked_status_col <- function(.data, studyid_col = "studyid"){

UseMethod("add_linked_status_col")

}

#' @export
add_linked_status_col.default <- function(.data, studyid_col = "studyid"){
stop("No add_linked_status_col method for an object of class ", class(.data), call. = FALSE)
}


#' @export
add_linked_status_col.data.frame <- function(.data, studyid_col = "studyid") {
dplyr::mutate(
.data, linked_status = ifelse(
grepl("^s", !!rlang::sym(studyid_col)), "linked", "unlinked"
)
)
}

#' @export
add_linked_status_col.Dataset <- add_linked_status_col.data.frame

add_linked_status_col <- function(.data) {
if (!"studyid" %in% names(.data)) stop("No studyid column", call. = FALSE)

.data$linked_status <- ifelse(grepl("^s", .data$studyid), "linked", "unlinked")
#' @export
add_linked_status_col.arrow_dplyr_query <- add_linked_status_col.Dataset

.data
#' Wrapper to filter out unlinked studyids from either a data.frame or a Dataset
#'
#' This is a convenience function which automatically filters for only "linked" studyids
#' as defined by `add_linked_status_col()`.
#'
#' @inheritParams add_linked_status_col
#' @inheritDotParams add_linked_status_col
#'
#' @export
#'
#' @examples
#'
#' filter_linked(data.frame(studyid = c("sxxxx", "uxxxx")))
filter_linked <- function(.data, ...) {
d <- add_linked_status_col(.data, ...)
dplyr::filter(d, linked_status == "linked")
}
47 changes: 36 additions & 11 deletions R/core-table-paths.R
Original file line number Diff line number Diff line change
Expand Up @@ -15,37 +15,62 @@
#' Return the paths of all data within a version that match a table name
#'
#' @param core_version Core version of tables typically formatted as a YYYYMMDD string
#' @param core_table_name a string of the data set name
#' @param core_table_name an exact string of the data set name. Should match the folder inside the core-snapshot path. Leave as `NULL` if using `core_table_pattern` to match several folders.
#' @param core_table_pattern a string (can be a regex) to match a table name - can be a subdataset of `core_table_name` or a pattern to match a dataset that may be split over multiple dataset folders in the core-snapshot path.
#' @param pattern Defaults to dat.gz but can be extended using regex for more
#' flexibility
#' @param base_path Defaults to "R:/DATA/core-snapshot"
#'
#' @export

get_core_dat_path <- function(core_version, core_table_name, pattern = "\\.dat.gz$", base_path = "R:/DATA/core-snapshot") {
get_core_dat_path <- function(core_version, core_table_name = NULL, core_table_pattern = NULL,
pattern = "\\.dat.gz$", base_path = "R:/DATA/core-snapshot") {


path <- if (is.null(core_table_name)) {
file.path(base_path, core_version)
} else {
file.path(base_path, core_version, core_table_name, "dat")
}

core_table_dir <- list.files(
path = file.path(base_path, core_version),
pattern = core_table_name, full.names = TRUE
core_table_files <- list.files(
path = path,
pattern = pattern,
recursive = TRUE,
full.names = TRUE
)
list.files(core_table_dir, pattern = pattern, full.names = TRUE, recursive = TRUE)

if (!is.null(core_table_pattern)) {
core_table_files <- core_table_files[grepl(core_table_pattern, core_table_files)]
}

normalizePath(
core_table_files,
winslash = "/",
mustWork = TRUE
)

}

#' Get paths of data dictionaries
#'
#' @inheritParams get_core_dat_path
#' @param core_dict_name a string of the dictionary name. Often this is different from the data name
#' @param core_dict_name a string of the dataset name.
#' @export
get_core_dict_path <- function(core_version, core_dict_name, base_path = "R:/DATA/core-snapshot") {

core_version_docs <- file.path(base_path, core_version, "docs")
core_doc <- list.files(core_version_docs, pattern = core_dict_name,
core_version_docs <- normalizePath(
file.path(base_path, core_version, core_dict_name, "docs"),
winslash = "/",
mustWork = TRUE
)

core_doc <- list.files(core_version_docs, pattern = "\\.xlsx$|\\.nflt$",
full.names = TRUE,
recursive = TRUE,
ignore.case = TRUE)
core_doc <- core_doc[grepl("\\.xlsx$|\\.nflt$", core_doc)]

if (length(core_doc) > 1) core_doc <- core_doc[grepl("dictionary", core_doc)]
if (length(core_doc) > 1) core_doc <- core_doc[grepl("dictionary|\\.nflt$", core_doc)]

if(grepl("\\.xlsx$", core_doc) & length(core_doc) == 1) {
sheets <- readxl::excel_sheets(core_doc)
Expand Down
14 changes: 8 additions & 6 deletions R/dat-to-arrow-formats.R
Original file line number Diff line number Diff line change
Expand Up @@ -34,20 +34,19 @@ dat_to_arrow_formats <- function(data_path,
col_types = NULL,
col_select = NULL,
overwrite = TRUE,
data_format = c("fwf", "csv", "tsv", "csv2"),
...) {

#browser()
## Check if data dictionary is in a valid format
is_valid_data_dict(data_dict)
data_format = match.arg(data_format)

if(!dir.exists(output_dir)) dir.create(output_dir, recursive = TRUE)
#browser()

data_name <- gsub(".dat.gz", "", basename(data_path))

## Columns
if(!is.null(col_select)) col_select <- col_selector(data_dict, col_select)

d <- dipr_reader(data_path, data_dict, col_types, col_select)
d <- dipr_reader(data_path, data_dict, col_types, col_select, data_format = data_format)

tf <- file.path(tempdir(), paste0(data_name, ".", arrow_format))

Expand All @@ -64,6 +63,8 @@ dat_to_arrow_formats <- function(data_path,
arrow::write_feather(d, sink = tf)
}

if(!dir.exists(output_dir)) dir.create(output_dir, recursive = TRUE)

## copy over
if(!file.copy(tf, output_dir, overwrite = overwrite)) {
stop(glue::glue("File copy from {tf} to {output_dir} was unsuccessful"), call. = FALSE)
Expand Down Expand Up @@ -117,7 +118,7 @@ dat_to_feather <- function(...) {
#' @export
#'
#' @examples
#'
#' \dontrun{
#' data_dict_path <- dipr_example("starwars-dict.txt")
#' dict <- read.table(data_dict_path)
#' dat_path <- dipr_example("starwars-fwf.dat.gz")
Expand All @@ -129,6 +130,7 @@ dat_to_feather <- function(...) {
#' path = "starwars_arrow",
#' partitioning = "species",
#' chunk_size = 2)
#' }
#'
dat_to_datasets <- function(data_path, data_dict, chunk_size = 1000000, path, partitioning, ...) {
tdir <- file.path(tempdir(), gsub(".dat.gz", "", basename(data_path)), "arrow-tmp")
Expand Down
Loading

0 comments on commit 55035f0

Please sign in to comment.