Commit a903eb8

Changes (3befe44) from SAE environment

popdata-ocwa-user committed May 24, 2022
1 parent 02adbae commit a903eb8
Showing 39 changed files with 604 additions and 194 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
@@ -1,6 +1,6 @@
Package: dipr
Title: Provide functions to efficiently import SRE data
Version: 1.2.1
Version: 1.3.1
Authors@R:
c(person(given = "Sam",
family = "Albers",
7 changes: 6 additions & 1 deletion NAMESPACE
@@ -8,6 +8,9 @@ S3method(filter_across,Dataset)
S3method(filter_across,arrow_dplyr_query)
S3method(filter_across,data.frame)
S3method(filter_across,default)
S3method(flag_across,Dataset)
S3method(flag_across,arrow_dplyr_query)
S3method(flag_across,data.frame)
export("%>%")
export(add_linked_status_col)
export(dat_to_arrow)
@@ -28,10 +31,12 @@ export(dipr_use_export_doc)
export(dipr_write_parquet)
export(filter_across)
export(filter_linked)
export(flag_across)
export(get_core_dat_path)
export(get_core_dict_path)
export(get_gitlab_sre_repos)
export(group_ages)
export(health_dict_to_spec)
export(insert_bcgov_apache_header)
export(insert_bcgov_cc_header)
export(install_sre_gitlab)
@@ -43,6 +48,6 @@ export(read_health_dict)
export(read_nflt)
export(restore_rstudio_prefs)
export(set_gitlab_credentials)
import(data.table)
importFrom(dplyr,"%>%")
importFrom(rlang,":=")
importFrom(rlang,.data)
21 changes: 21 additions & 0 deletions NEWS.md
@@ -1,3 +1,24 @@
# dipr 1.3.1

* Fixed bug in `install_sre_gitlab()` where it would fail if the desired package didn't have a previous version installed.

# dipr 1.3.0

* Added new arguments `date_format` and `time_format` to `read_dat()`, `dat_to_arrow_formats()`, etc.,
to allow overriding the global date/time formats when reading in a dataset. Defaults are the same as in `readr` functions.

# dipr 1.2.3

* New function `health_dict_to_spec()` to create a `readr` column specification from a health dictionary. This allows parsing of most date formats used in the health datasets (#37).
* New function `flag_across()`, similar to `filter_across()`, but rather than filtering the dataset it adds a logical column indicating the result of the pattern search across the selected columns (#38).

# dipr 1.2.2

## Improvements
* Modify credential tests to only run in SRE
* Have `ocwa_branch_export` modify Rmd as well as md files

# dipr 1.2.1
* Add Bonnie Robert and Andy Teucher as authors
* Document deprecated functions in one place
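To illustrate the `flag_across()` entry above (its signature appears in the R/filter-arrow.R diff below), a minimal sketch using a toy data.frame with hypothetical column names and ICD-style codes; this is not from the package's own examples:

```r
library(dipr)

# Hypothetical toy data standing in for a health dataset.
dx <- data.frame(
  id  = 1:3,
  dx1 = c("E10", "J45", "E11"),
  dx2 = c("I10", "E11", "J45")
)

# filter_across() keeps only the rows where a code matches in any selected column;
# flag_across() keeps every row and adds a logical column instead.
filter_across(dx, code = c("E10", "E11"), cols = c("dx1", "dx2"))
flag_across(dx, code = c("E10", "E11"), cols = c("dx1", "dx2"),
            flag_name = "diabetes_flag")
```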
12 changes: 8 additions & 4 deletions R/dat-to-arrow-formats.R
@@ -35,6 +35,9 @@ dat_to_arrow_formats <- function(data_path,
col_select = NULL,
overwrite = TRUE,
data_format = c("fwf", "csv", "tsv", "csv2"),
tz = "UTC",
date_format = "%AD",
time_format = "%AT",
...) {

data_format = match.arg(data_format)
@@ -44,9 +47,9 @@
data_name <- gsub(".dat.gz", "", basename(data_path))

## Columns
if(!is.null(col_select)) col_select <- col_selector(data_dict, col_select)
if (!is.null(col_select)) col_select <- col_selector(data_dict, col_select)

d <- dipr_reader(data_path, data_dict, col_types, col_select, data_format = data_format)
d <- dipr_reader(data_path, data_dict, col_types, col_select, data_format, tz = tz, date_format = date_format, time_format = time_format, ...)

tf <- file.path(tempdir(), paste0(data_name, ".", arrow_format))

@@ -132,7 +135,8 @@ dat_to_feather <- function(...) {
#' chunk_size = 2)
#' }
#'
dat_to_datasets <- function(data_path, data_dict, chunk_size = 1000000, path, partitioning, ...) {
dat_to_datasets <- function(data_path, data_dict, chunk_size = 1000000, path, partitioning,
tz = "UTC", date_format = "%AD", time_format = "%AT", ...) {
tdir <- file.path(tempdir(), gsub(".dat.gz", "", basename(data_path)), "arrow-tmp")
dir.create(tdir, showWarnings = FALSE, recursive = TRUE)
f <- function(x, pos) {
@@ -146,7 +150,7 @@ dat_to_datasets <- function(data_path, data_dict, chunk_size = 1000000, path, pa
),
progress = TRUE,
lazy = FALSE,
locale = readr::locale(encoding = "latin1"), ...
locale = dipr_locale(tz = tz, date_format = date_format, time_format = time_format), ...
)

num_files <- length(list.files(tdir, pattern = ".parquet"))
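`dipr_reader()` and `dipr_locale()` are internal helpers that are not shown in this diff. A sketch of what a locale helper like `dipr_locale()` presumably does, inferred from the arguments passed above and the `readr::locale(encoding = "latin1")` call it replaces; the actual implementation may differ:

```r
# Assumed sketch only; the real internal helper is not part of this diff.
dipr_locale <- function(tz = "UTC", date_format = "%AD", time_format = "%AT") {
  readr::locale(
    encoding    = "latin1",   # SRE .dat files were previously read as latin1
    tz          = tz,
    date_format = date_format,
    time_format = time_format
  )
}
```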
3 changes: 1 addition & 2 deletions R/dipr-package.R
@@ -1,8 +1,7 @@
#' @keywords internal
"_PACKAGE"

#' @import data.table
#' @importFrom rlang .data
#' @importFrom rlang .data :=

# The following block is used by usethis to automatically manage
# roxygen namespace tags. Modify with care!
20 changes: 17 additions & 3 deletions R/dipr-update.R
@@ -94,10 +94,9 @@ update_needed <- function(repo, ref = NULL, token) {

gitlab_version <- read.dcf(textConnection(httr::content(resp, as = "parsed")), fields = "Version")

local_desc <- file.path(.libPaths()[1], repo, "DESCRIPTION")
installed_version <- get_installed_version(pkg = repo)

installed_version <- read.dcf(local_desc, fields = "Version")
should_install <- package_version(gitlab_version) > package_version(installed_version)
should_install <- package_version(gitlab_version) > installed_version

if (!should_install) stop(glue::glue("{repo} update not needed. You are using the most recently released version {installed_version}."), call. = FALSE)

@@ -106,6 +105,21 @@ update_needed <- function(repo, ref = NULL, token) {

}

get_installed_version <- function(pkg) {
# find lib pkg is in:
libs <- .libPaths()
pkg_paths <- vapply(libs, function(x) file.path(x, pkg), FUN.VALUE = "")
lib <- libs[dir.exists(pkg_paths)]

# return version 0.0 if not installed
if (!length(lib)) return(package_version("0.0"))

# in case > 1 path, pick the first one
local_desc <- file.path(lib[1], pkg, "DESCRIPTION")

package_version(read.dcf(local_desc, fields = "Version"))
}

#' Return a data.frame of all repos available to the project you are currently in
#'
#' Access information on the repo in the current project in a data.frame.
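The new `get_installed_version()` helper is what fixes the `install_sre_gitlab()` bug noted in NEWS: a package that is not installed now reports version `0.0` instead of erroring on a missing DESCRIPTION, so the version comparison in `update_needed()` always succeeds. A small worked example of that comparison:

```r
package_version("1.3.1") > package_version("0.0")    # TRUE  -> install proceeds
package_version("1.3.1") > package_version("1.3.1")  # FALSE -> "update not needed"
```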
81 changes: 60 additions & 21 deletions R/filter-arrow.R
@@ -25,11 +25,6 @@
#' @details
#' - Function currently returns a "data.frame" if passed a "data.frame". If passed a .Dataset or arrow_dplyr_query,
#' the result will be an object of class arrow_dplyr_query which can precede arrow::collect().
#' @export
#'
#'
#'

#' @export
filter_across <- function(.data, code, cols, partial = FALSE){

@@ -47,6 +42,65 @@ filter_across.default <- function(.data, code, cols, partial = FALSE){
#' @export
filter_across.Dataset <- function(.data, code = NULL, cols = NULL, partial = FALSE){

cond_str <- make_condition_string(.data, code = code, cols = cols, partial = partial)

dplyr::filter(.data, eval(str2lang(cond_str)))
}

#' @export
filter_across.data.frame <- filter_across.Dataset


#' @export
filter_across.arrow_dplyr_query <- filter_across.Dataset


## add to filter_icd() documentation (details)
## - ICD-9 categories were defined according to the BC Ministry of Health's ICD-9 Diagnostic Code Descriptions
## website: https://www2.gov.bc.ca/gov/content/health/practitioner-professional-resources/msp/physicians/diagnostic-code-descriptions-icd-9



#' Add a logical flag (new column) based on a pattern search across columns
#'
#' @inheritParams filter_across
#' @param cols a vector of column names to search in
#' @param flag_name the name of the new column to create
#'
#' @return An object of the same class as the input (methods for `data.frame`, `Dataset`, `arrow_dplyr_query`),
#' with a new logical column
#'
#' @seealso [filter_across()]
#' @export
flag_across <- function(.data, code = NULL, cols = NULL, partial = FALSE, flag_name = NULL) {
UseMethod("flag_across")
}

#' @export
flag_across.Dataset <- function(.data, code = NULL, cols = NULL, partial = FALSE, flag_name = NULL) {

if (!is.character(flag_name) || !length(flag_name) == 1L) {
stop("flag_name must be a length one character vector")
}

if (flag_name %in% names(.data)) {
stop(flag_name, " is already a column in data")
}

cond_str <- make_condition_string(.data, code = code, cols = cols, partial = partial)

dplyr::mutate(.data, {{flag_name}} := eval(str2lang(cond_str)))
}

#' @export
flag_across.data.frame <- flag_across.Dataset

#' @export
flag_across.arrow_dplyr_query <- flag_across.Dataset


make_condition_string <- function(.data, code, cols, partial) {

## restrict values of partial
if (!is.logical(partial)) stop("partial needs to be TRUE or FALSE in filter_across", call. = FALSE)

@@ -69,20 +123,5 @@ filter_across.Dataset <- function(.data, code = NULL, cols = NULL, partial = FAL

grepl_call <- ifelse(partial, "grepl('(%s)',%s)", "grepl('^(%s)$',%s)")

cond_str <- paste0(sprintf(grepl_call, paste0(code, collapse = ")|("), cols), collapse = " | ")

dplyr::filter(.data, eval(str2lang(cond_str)))

paste0(sprintf(grepl_call, paste0(code, collapse = ")|("), cols), collapse = " | ")
}

#' @export
filter_across.data.frame <- filter_across.Dataset


#' @export
filter_across.arrow_dplyr_query <- filter_across.Dataset


## add to filter_icd() documentation (details)
## - ICD-9 categories were defined according to the BC Ministry of Health's ICD-9 Diagnostic Code Descriptions
## website: https://www2.gov.bc.ca/gov/content/health/practitioner-professional-resources/msp/physicians/diagnostic-code-descriptions-icd-9
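A worked example of the condition string that `make_condition_string()` assembles from the `grepl` templates above, using hypothetical codes and column names (the helper itself is internal; `filter_across()` and `flag_across()` evaluate the resulting string):

```r
code <- c("E10", "E11")
cols <- c("dx1", "dx2")

# partial = FALSE -> anchored pattern:
# "grepl('^(E10)|(E11)$',dx1) | grepl('^(E10)|(E11)$',dx2)"
paste0(sprintf("grepl('^(%s)$',%s)", paste0(code, collapse = ")|("), cols),
       collapse = " | ")

# partial = TRUE -> unanchored pattern, matching anywhere in the cell:
# "grepl('(E10)|(E11)',dx1) | grepl('(E10)|(E11)',dx2)"
paste0(sprintf("grepl('(%s)',%s)", paste0(code, collapse = ")|("), cols),
       collapse = " | ")
```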
18 changes: 14 additions & 4 deletions R/read-dat.R
@@ -33,6 +33,11 @@
#' @param use_cache deprecated,
#' @param data_format the format of the input data. Default is `"fwf"`, other choices
#' are `"csv"`, `"csv2"`, `"tsv"`
#' @param tz what timezone should datetime fields use? Default UTC. This is recommended
#' to avoid timezone pain, but remember that the data is in UTC when doing analysis. See
#' OlsonNames() for a list of available timezones.
#' @param date_format date format for columns where date format is not specified in `col_types`
#' @param time_format time format for columns where time format is not specified in `col_types`
#' @inheritParams readr::read_fwf
#' @inheritDotParams readr::read_fwf
#'
@@ -43,7 +48,9 @@
#' dict <- read.table(data_dict_path)
#' dat_path <- dipr_example("starwars-fwf.dat.gz")
#' read_dat(data_path = dat_path,
#' data_dict = dict)
#' data_dict = dict,
#' col_types = "cddlcD",
#' date_format = "%Y%m%d")


read_dat <- function(data_path,
@@ -53,6 +60,9 @@ read_dat <- function(data_path,
col_select = NULL,
col_types = NULL,
data_format = c("fwf", "csv", "tsv", "csv2"),
tz = "UTC",
date_format = "%AD",
time_format = "%AT",
...) {

data_format = match.arg(data_format)
@@ -64,13 +74,13 @@

data_name <- gsub(".dat.gz", "", basename(data_path))

if(use_cache) stop("Caching directly in dipr is deprecated", call. = FALSE)
if (use_cache) stop("Caching directly in dipr is deprecated", call. = FALSE)

## Columns
if(!is.null(col_select)) col_select <- col_selector(data_dict, col_select)
if (!is.null(col_select)) col_select <- col_selector(data_dict, col_select)

cli::cli_alert_success("Reading {data_name}")
d <- dipr_reader(data_path, data_dict, col_types, col_select, data_format, ...)
d <- dipr_reader(data_path, data_dict, col_types, col_select, data_format, tz = tz, date_format = date_format, time_format = time_format, ...)

if (as.data.table) return(message('read_dat now only returns tibbles. To return a data.table object see read_dat_dt'))

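An illustration of the `tz` advice documented above: read with the default `tz = "UTC"` and convert only for display at analysis time. The timestamp value here is made up.

```r
x <- as.POSIXct("2022-05-24 18:30:00", tz = "UTC")
format(x, tz = "America/Vancouver", usetz = TRUE)
#> [1] "2022-05-24 11:30:00 PDT"
```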