Commit a903eb8

Changes (3befe44) from SAE environment

popdata-ocwa-user committed May 24, 2022
1 parent 02adbae commit a903eb8
Showing 39 changed files with 604 additions and 194 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
@@ -1,6 +1,6 @@
Package: dipr
Title: Provide functions to efficiently import SRE data
Version: 1.2.1
Version: 1.3.1
Authors@R:
c(person(given = "Sam",
family = "Albers",
7 changes: 6 additions & 1 deletion NAMESPACE
@@ -8,6 +8,9 @@ S3method(filter_across,Dataset)
S3method(filter_across,arrow_dplyr_query)
S3method(filter_across,data.frame)
S3method(filter_across,default)
S3method(flag_across,Dataset)
S3method(flag_across,arrow_dplyr_query)
S3method(flag_across,data.frame)
export("%>%")
export(add_linked_status_col)
export(dat_to_arrow)
@@ -28,10 +31,12 @@ export(dipr_use_export_doc)
export(dipr_write_parquet)
export(filter_across)
export(filter_linked)
export(flag_across)
export(get_core_dat_path)
export(get_core_dict_path)
export(get_gitlab_sre_repos)
export(group_ages)
export(health_dict_to_spec)
export(insert_bcgov_apache_header)
export(insert_bcgov_cc_header)
export(install_sre_gitlab)
@@ -43,6 +48,6 @@ export(read_health_dict)
export(read_nflt)
export(restore_rstudio_prefs)
export(set_gitlab_credentials)
import(data.table)
importFrom(dplyr,"%>%")
importFrom(rlang,":=")
importFrom(rlang,.data)
21 changes: 21 additions & 0 deletions NEWS.md
@@ -1,3 +1,24 @@
# dipr 1.3.1

* Fixed bug in `install_sre_gitlab()` where it would fail if the desired package didn't have a previous version installed.

# dipr 1.3.0

* Added new arguments `date_format` and `time_format` to `read_dat()`, `dat_to_arrow_formats()`, etc.,
to allow overriding the global date/time formats when reading in a dataset. Defaults are the same as in `readr` functions.

# dipr 1.2.3

* New function `health_dict_to_spec()` to create a `readr` column specification from a health dictionary. This allows parsing of most date formats used in the health datasets (#37).
* New function `flag_across()`, similar to `filter_across()`, but rather than filtering the dataset it adds a logical column indicating the result of the pattern search across the selected columns (#38).

# dipr 1.2.2

## Improvements
* Modify credential tests to only run in SRE
* Have `ocwa_branch_export` modify Rmd as well as md files

# dipr 1.2.1
* Add Bonnie Robert and Andy Teucher as authors
* Document deprecated functions in one place
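To illustrate the `flag_across()` entry above (its signature appears in the R/filter-arrow.R diff below), a minimal sketch using a toy data.frame with hypothetical column names and ICD-style codes; this is not from the package's own examples:

```r
library(dipr)

# Hypothetical toy data standing in for a health dataset.
dx <- data.frame(
  id  = 1:3,
  dx1 = c("E10", "J45", "E11"),
  dx2 = c("I10", "E11", "J45")
)

# filter_across() keeps only the rows where a code matches in any selected column;
# flag_across() keeps every row and adds a logical column instead.
filter_across(dx, code = c("E10", "E11"), cols = c("dx1", "dx2"))
flag_across(dx, code = c("E10", "E11"), cols = c("dx1", "dx2"),
            flag_name = "diabetes_flag")
```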
12 changes: 8 additions & 4 deletions R/dat-to-arrow-formats.R
@@ -35,6 +35,9 @@ dat_to_arrow_formats <- function(data_path,
col_select = NULL,
overwrite = TRUE,
data_format = c("fwf", "csv", "tsv", "csv2"),
tz = "UTC",
date_format = "%AD",
time_format = "%AT",
...) {

data_format = match.arg(data_format)
@@ -44,9 +47,9 @@
data_name <- gsub(".dat.gz", "", basename(data_path))

## Columns
if(!is.null(col_select)) col_select <- col_selector(data_dict, col_select)
if (!is.null(col_select)) col_select <- col_selector(data_dict, col_select)

d <- dipr_reader(data_path, data_dict, col_types, col_select, data_format = data_format)
d <- dipr_reader(data_path, data_dict, col_types, col_select, data_format, tz = tz, date_format = date_format, time_format = time_format, ...)

tf <- file.path(tempdir(), paste0(data_name, ".", arrow_format))

@@ -132,7 +135,8 @@ dat_to_feather <- function(...) {
#' chunk_size = 2)
#' }
#'
dat_to_datasets <- function(data_path, data_dict, chunk_size = 1000000, path, partitioning, ...) {
dat_to_datasets <- function(data_path, data_dict, chunk_size = 1000000, path, partitioning,
tz = "UTC", date_format = "%AD", time_format = "%AT", ...) {
tdir <- file.path(tempdir(), gsub(".dat.gz", "", basename(data_path)), "arrow-tmp")
dir.create(tdir, showWarnings = FALSE, recursive = TRUE)
f <- function(x, pos) {
@@ -146,7 +150,7 @@ dat_to_datasets <- function(data_path, data_dict, chunk_size = 1000000, path, pa
),
progress = TRUE,
lazy = FALSE,
locale = readr::locale(encoding = "latin1"), ...
locale = dipr_locale(tz = tz, date_format = date_format, time_format = time_format), ...
)

num_files <- length(list.files(tdir, pattern = ".parquet"))
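`dipr_reader()` and `dipr_locale()` are internal helpers that are not shown in this diff. A sketch of what a locale helper like `dipr_locale()` presumably does, inferred from the arguments passed above and the `readr::locale(encoding = "latin1")` call it replaces; the actual implementation may differ:

```r
# Assumed sketch only; the real internal helper is not part of this diff.
dipr_locale <- function(tz = "UTC", date_format = "%AD", time_format = "%AT") {
  readr::locale(
    encoding    = "latin1",   # SRE .dat files were previously read as latin1
    tz          = tz,
    date_format = date_format,
    time_format = time_format
  )
}
```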
3 changes: 1 addition & 2 deletions R/dipr-package.R
@@ -1,8 +1,7 @@
#' @keywords internal
"_PACKAGE"

#' @import data.table
#' @importFrom rlang .data
#' @importFrom rlang .data :=

# The following block is used by usethis to automatically manage
# roxygen namespace tags. Modify with care!
20 changes: 17 additions & 3 deletions R/dipr-update.R
@@ -94,10 +94,9 @@ update_needed <- function(repo, ref = NULL, token) {

gitlab_version <- read.dcf(textConnection(httr::content(resp, as = "parsed")), fields = "Version")

local_desc <- file.path(.libPaths()[1], repo, "DESCRIPTION")
installed_version <- get_installed_version(pkg = repo)

installed_version <- read.dcf(local_desc, fields = "Version")
should_install <- package_version(gitlab_version) > package_version(installed_version)
should_install <- package_version(gitlab_version) > installed_version

if (!should_install) stop(glue::glue("{repo} update not needed. You are using the most recently released version {installed_version}."), call. = FALSE)

@@ -106,6 +105,21 @@ update_needed <- function(repo, ref = NULL, token) {

}

get_installed_version <- function(pkg) {
# find lib pkg is in:
libs <- .libPaths()
pkg_paths <- vapply(libs, function(x) file.path(x, pkg), FUN.VALUE = "")
lib <- libs[dir.exists(pkg_paths)]

# return version 0.0 if not installed
if (!length(lib)) return(package_version("0.0"))

# in case > 1 path, pick the first one
local_desc <- file.path(lib[1], pkg, "DESCRIPTION")

package_version(read.dcf(local_desc, fields = "Version"))
}

#' Return a data.frame of all repos available to the project you are currently in
#'
#' Access information on the repo in the current project in a data.frame.
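The new `get_installed_version()` helper is what fixes the `install_sre_gitlab()` bug noted in NEWS: a package that is not installed now reports version `0.0` instead of erroring on a missing DESCRIPTION, so the version comparison in `update_needed()` always succeeds. A small worked example of that comparison:

```r
package_version("1.3.1") > package_version("0.0")    # TRUE  -> install proceeds
package_version("1.3.1") > package_version("1.3.1")  # FALSE -> "update not needed"
```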
81 changes: 60 additions & 21 deletions R/filter-arrow.R
@@ -25,11 +25,6 @@
#' @details
#' - Function currently returns a "data.frame" if passed a "data.frame". If passed a .Dataset or arrow_dplyr_query,
#' the result will be an object of class arrow_dplyr_query which can precede arrow::collect().
#' @export
#'
#'
#'

#' @export
filter_across <- function(.data, code, cols, partial = FALSE){

@@ -47,6 +42,65 @@ filter_across.default <- function(.data, code, cols, partial = FALSE){
#' @export
filter_across.Dataset <- function(.data, code = NULL, cols = NULL, partial = FALSE){

cond_str <- make_condition_string(.data, code = code, cols = cols, partial = partial)

dplyr::filter(.data, eval(str2lang(cond_str)))
}

#' @export
filter_across.data.frame <- filter_across.Dataset


#' @export
filter_across.arrow_dplyr_query <- filter_across.Dataset


## add to filter_icd() documentation (details)
## - ICD-9 categories were defined according to the BC Ministry of Health's ICD-9 Diagnostic Code Descriptions
## website: https://www2.gov.bc.ca/gov/content/health/practitioner-professional-resources/msp/physicians/diagnostic-code-descriptions-icd-9



#' Add a logical flag (new column) based on a pattern search across columns
#'
#' @inheritParams filter_across
#' @param cols a vector of column names to search in
#' @param flag_name the name of the new column to create
#'
#' @return An object of the same class as the input (methods for `data.frame`, `Dataset`, `arrow_dplyr_query`),
#' with a new logical column
#'
#' @seealso [filter_across()]
#' @export
flag_across <- function(.data, code = NULL, cols = NULL, partial = FALSE, flag_name = NULL) {
UseMethod("flag_across")
}

#' @export
flag_across.Dataset <- function(.data, code = NULL, cols = NULL, partial = FALSE, flag_name = NULL) {

if (!is.character(flag_name) || !length(flag_name) == 1L) {
stop("flag_name must be a length one character vector")
}

if (flag_name %in% names(.data)) {
stop(flag_name, " is already a column in data")
}

cond_str <- make_condition_string(.data, code = code, cols = cols, partial = partial)

dplyr::mutate(.data, {{flag_name}} := eval(str2lang(cond_str)))
}

#' @export
flag_across.data.frame <- flag_across.Dataset

#' @export
flag_across.arrow_dplyr_query <- flag_across.Dataset


make_condition_string <- function(.data, code, cols, partial) {

## restrict values of partial
if (!is.logical(partial)) stop("partial needs to be TRUE or FALSE in filter_across", call. = FALSE)

@@ -69,20 +123,5 @@ filter_across.Dataset <- function(.data, code = NULL, cols = NULL, partial = FAL

grepl_call <- ifelse(partial, "grepl('(%s)',%s)", "grepl('^(%s)$',%s)")

cond_str <- paste0(sprintf(grepl_call, paste0(code, collapse = ")|("), cols), collapse = " | ")

dplyr::filter(.data, eval(str2lang(cond_str)))

paste0(sprintf(grepl_call, paste0(code, collapse = ")|("), cols), collapse = " | ")
}

#' @export
filter_across.data.frame <- filter_across.Dataset


#' @export
filter_across.arrow_dplyr_query <- filter_across.Dataset


## add to filter_icd() documentation (details)
## - ICD-9 categories were defined according to the BC Ministry of Health's ICD-9 Diagnostic Code Descriptions
## website: https://www2.gov.bc.ca/gov/content/health/practitioner-professional-resources/msp/physicians/diagnostic-code-descriptions-icd-9
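A worked example of the condition string that `make_condition_string()` assembles from the `grepl` templates above, using hypothetical codes and column names (the helper itself is internal; `filter_across()` and `flag_across()` evaluate the resulting string):

```r
code <- c("E10", "E11")
cols <- c("dx1", "dx2")

# partial = FALSE -> anchored pattern:
# "grepl('^(E10)|(E11)$',dx1) | grepl('^(E10)|(E11)$',dx2)"
paste0(sprintf("grepl('^(%s)$',%s)", paste0(code, collapse = ")|("), cols),
       collapse = " | ")

# partial = TRUE -> unanchored pattern, matching anywhere in the cell:
# "grepl('(E10)|(E11)',dx1) | grepl('(E10)|(E11)',dx2)"
paste0(sprintf("grepl('(%s)',%s)", paste0(code, collapse = ")|("), cols),
       collapse = " | ")
```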
18 changes: 14 additions & 4 deletions R/read-dat.R
@@ -33,6 +33,11 @@
#' @param use_cache deprecated,
#' @param data_format the format of the input data. Default is `"fwf"`, other choices
#' are `"csv"`, `"csv2"`, `"tsv"`
#' @param tz what timezone should datetime fields use? Default UTC. This is recommended
#' to avoid timezone pain, but remember that the data is in UTC when doing analysis. See
#' OlsonNames() for a list of available timezones.
#' @param date_format date format for columns where date format is not specified in `col_types`
#' @param time_format time format for columns where time format is not specified in `col_types`
#' @inheritParams readr::read_fwf
#' @inheritDotParams readr::read_fwf
#'
@@ -43,7 +48,9 @@
#' dict <- read.table(data_dict_path)
#' dat_path <- dipr_example("starwars-fwf.dat.gz")
#' read_dat(data_path = dat_path,
#' data_dict = dict)
#' data_dict = dict,
#' col_types = "cddlcD",
#' date_format = "%Y%m%d")


read_dat <- function(data_path,
@@ -53,6 +60,9 @@ read_dat <- function(data_path,
col_select = NULL,
col_types = NULL,
data_format = c("fwf", "csv", "tsv", "csv2"),
tz = "UTC",
date_format = "%AD",
time_format = "%AT",
...) {

data_format = match.arg(data_format)
@@ -64,13 +74,13 @@

data_name <- gsub(".dat.gz", "", basename(data_path))

if(use_cache) stop("Caching directly in dipr is deprecated", call. = FALSE)
if (use_cache) stop("Caching directly in dipr is deprecated", call. = FALSE)

## Columns
if(!is.null(col_select)) col_select <- col_selector(data_dict, col_select)
if (!is.null(col_select)) col_select <- col_selector(data_dict, col_select)

cli::cli_alert_success("Reading {data_name}")
d <- dipr_reader(data_path, data_dict, col_types, col_select, data_format, ...)
d <- dipr_reader(data_path, data_dict, col_types, col_select, data_format, tz = tz, date_format = date_format, time_format = time_format, ...)

if (as.data.table) return(message('read_dat now only returns tibbles. To return a data.table object see read_dat_dt'))

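An illustration of the `tz` advice documented above: read with the default `tz = "UTC"` and convert only for display at analysis time. The timestamp value here is made up.

```r
x <- as.POSIXct("2022-05-24 18:30:00", tz = "UTC")
format(x, tz = "America/Vancouver", usetz = TRUE)
#> [1] "2022-05-24 11:30:00 PDT"
```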