Skip to content

Commit

Permalink
Changes (7c1d73f) from SAE environment
Browse files Browse the repository at this point in the history
  • Loading branch information
popdata-ocwa-user committed Mar 9, 2022
1 parent f866366 commit 55035f0
Show file tree
Hide file tree
Showing 59 changed files with 1,087 additions and 1,504 deletions.
2 changes: 1 addition & 1 deletion .Rbuildignore
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ data-raw/

^\.github$
^\.github/workflows/R-CMD-check\.yaml$
.Rprofile
^\.Rprofile$


^_pkgdown\.yml$
Expand Down
3 changes: 3 additions & 0 deletions .Rprofile
Original file line number Diff line number Diff line change
Expand Up @@ -31,3 +31,6 @@ if (Sys.getenv("USERDOMAIN") == "POPDATA") {

reassignInPackage("check_file", "digest", .custom_check_file)
}

## Silence a CRAN warning
Sys.setenv('_R_CHECK_SYSTEM_CLOCK_' = 0)
1 change: 0 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,4 @@
.Ruserdata

.dipr/
scratch/
docs
30 changes: 17 additions & 13 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,24 +1,28 @@
Package: dipr
Title: Provide functions to efficiently import SRE data
Version: 1.0.0.9000
Version: 1.2.1
Authors@R:
c(person(given = "Sam",
family = "Albers",
role = c("aut", "cre"),
email = "[email protected]"),
person(given = "Stephanie",
family = "Hazlitt",
role = "aut",
email = "[email protected]"),
person(given = "Craig",
family = "Hutton",
role = "ctb",
email = "[email protected]"),
person(given = "Andy",
family = "Teucher",
email = "[email protected]",
role = "ctb",
role = "aut",
comment = c(ORCID = "0000-0002-7840-692X")),
person(given = "Bonnie",
family = "Robert",
email = "[email protected]",
role = "aut"),
person(given = "Craig",
family = "Hutton",
role = "ctb",
email = "[email protected]"),
person(given = "Stephanie",
family = "Hazlitt",
role = "ctb",
email = "[email protected]"),
person(given = "Province of British Columbia",
role = "cph")
)
Expand All @@ -27,17 +31,17 @@ Description: Provides functions that aid in working with SRE data. Caching
of data into memory. Functions are also provided to convert to Apache
Arrow formats.
License: Apache License (== 2.0) | file LICENSE
URL: https://github.com/bcgov/dipr, https://projectsc.popdata.bc.ca/shares/dipr2
URL: https://github.com/bcgov/dipr, https://projectsc.popdata.bc.ca/shares/dipr
Imports:
arrow,
credentials,
cli,
data.table,
devtools,
remotes,
dplyr,
fs,
fst,
glue,
httr,
janitor,
lubridate,
readr (>= 2.0.0),
Expand Down
14 changes: 14 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,5 +1,13 @@
# Generated by roxygen2: do not edit by hand

S3method(add_linked_status_col,Dataset)
S3method(add_linked_status_col,arrow_dplyr_query)
S3method(add_linked_status_col,data.frame)
S3method(add_linked_status_col,default)
S3method(filter_across,Dataset)
S3method(filter_across,arrow_dplyr_query)
S3method(filter_across,data.frame)
S3method(filter_across,default)
export("%>%")
export(add_linked_status_col)
export(dat_to_arrow)
Expand All @@ -9,6 +17,7 @@ export(dat_to_feather)
export(dat_to_parquet)
export(delete_rtmp_dirs)
export(dipr_create_targets_project)
export(dipr_document_output_groups)
export(dipr_example)
export(dipr_examples)
export(dipr_icd10_categories)
Expand All @@ -17,17 +26,22 @@ export(dipr_icd_categories)
export(dipr_update)
export(dipr_use_export_doc)
export(dipr_write_parquet)
export(filter_across)
export(filter_linked)
export(get_core_dat_path)
export(get_core_dict_path)
export(get_gitlab_sre_repos)
export(group_ages)
export(insert_bcgov_apache_header)
export(insert_bcgov_cc_header)
export(install_sre_gitlab)
export(msp_unique)
export(ocwa_branch_export)
export(read_dat)
export(read_dat_dt)
export(read_health_dict)
export(read_nflt)
export(restore_rstudio_prefs)
export(set_gitlab_credentials)
import(data.table)
importFrom(dplyr,"%>%")
Expand Down
28 changes: 27 additions & 1 deletion NEWS.md
Original file line number Diff line number Diff line change
@@ -1,11 +1,37 @@
# dipr (development version)
# dipr 1.2.1
* Add Bonnie Robert and Andy Teucher as authors
* Document deprecated functions in one place

# dipr 1.2.0

## Breaking Changes
* Deprecate `msp_unique` and re-export it in the hawkeye package
* Deprecate `dipr_icd_categories`, `dipr_icd9_categories` and `dipr_icd10_categories` functions in favour of functionality added to hawkeye package

## Improvements
* Add new function `filter_across`
* Changed the internal method to install and update `{dipr}`, with new instructions in the README. The new method uses the gitlab API and will enable using `{dipr}` to install other SRE gitlab R packages using `install_sre_gitlab`. (#24)
* Add new function `get_gitlab_sre_repos`
* Add new function `filter_linked` and add `Dataset` methods for `add_linked_status_col` function so that it can be used in an arrow workflow.
* The template `DESCRIPTION` file now adds dependent packages to the `Depends` field rather than the `Imports` field.
* Fixed a bug where the template `.Rprofile` file used in `dipr_create_targets_project()` was not included in the package.
* Deprecate `msp_unique` and re-export it in the hawkeye package
* New function `restore_rstudio_prefs()` to help setup RStudio in a new SRE machine (#31)

# dipr 1.1.0

* Added function `ocwa_branch_export()` (#26) to create a clean branch to prepare the repo for import into OCWA by:
1. Creating a new branch
2. Removing files that can't be imported - these are listed in the `_ocwaignore` file in the root of the repo
3. Cleaning `README.md` to comment out references to images and links that won't be available in the SRE.
4. Committing the changes from 2 and 3 to the new branch and pushing that to GitHub. This branch can then be used as the basis for an import into OCWA.

* Added `dipr_create_targets_project` which will create a thin package-like targets folder structure.
* Adding `dipr_document_output_groups` as a convenience to document datasets
* Changes `get_core_dat_path()` and `get_core_dict_path()` to accommodate the new structure of provisioned data where the metadata are in the same directory as the data. (#17)
* Adds a new `data_format` argument to `dat_to_arrow_formats()` and friends, as well as `read_dat()` and the internal `dipr_reader()` (#17)
* Exposes `...` in `read_nflt()` to allow passing options to `readr::read_delim()` (#17)
* Comments are now removed from nflt files in `read_nflt`. Comments are by default expected to be denoted by `/*`, but this is customizable with the `comment` argument. (#21)

# dipr 1.0.0

Expand Down
47 changes: 42 additions & 5 deletions R/add.R
Original file line number Diff line number Diff line change
Expand Up @@ -16,17 +16,54 @@
#' This function is really meant to be used in a pipeline as a quick shortcut to add the
#' linked status column
#'
#' @param .data a data frame (or tibble) with a studyid column
#' @param .data a data frame (or tibble) or arrow Dataset with a studyid column
#' @param studyid_col the name of the study id column. Defaults to "studyid"
#'
#' @return the same data frame plus a `linked_status` column
#'
#' @export
add_linked_status_col <- function(.data, studyid_col = "studyid"){

UseMethod("add_linked_status_col")

}

#' @export
add_linked_status_col.default <- function(.data, studyid_col = "studyid"){
stop("No add_linked_status_col method for an object of class ", class(.data), call. = FALSE)
}


#' @export
add_linked_status_col.data.frame <- function(.data, studyid_col = "studyid") {
dplyr::mutate(
.data, linked_status = ifelse(
grepl("^s", !!rlang::sym(studyid_col)), "linked", "unlinked"
)
)
}

#' @export
add_linked_status_col.Dataset <- add_linked_status_col.data.frame

add_linked_status_col <- function(.data) {
if (!"studyid" %in% names(.data)) stop("No studyid column", call. = FALSE)

.data$linked_status <- ifelse(grepl("^s", .data$studyid), "linked", "unlinked")
#' @export
add_linked_status_col.arrow_dplyr_query <- add_linked_status_col.Dataset

.data
#' Wrapper to filter out unlinked studyids from either a data.frame or a Dataset
#'
#' This is a convenience function which automatically filters for only "linked" studyids
#' as defined by `add_linked_status_col()`.
#'
#' @inheritParams add_linked_status_col
#' @inheritDotParams add_linked_status_col
#'
#' @export
#'
#' @examples
#'
#' filter_linked(data.frame(studyid = c("sxxxx", "uxxxx")))
filter_linked <- function(.data, ...) {
d <- add_linked_status_col(.data, ...)
dplyr::filter(d, linked_status == "linked")
}
47 changes: 36 additions & 11 deletions R/core-table-paths.R
Original file line number Diff line number Diff line change
Expand Up @@ -15,37 +15,62 @@
#' Return the paths of all data within a version that match a table name
#'
#' @param core_version Core version of tables typically formatted as a YYYYMMDD string
#' @param core_table_name a string of the data set name
#' @param core_table_name an exact string of the data set name. Should match the folder inside the core-snapshot path. Leave as `NULL` if using `core_table_pattern` to match several folders.
#' @param core_table_pattern a string (can be a regex) to match a table name - can be a subdataset of `core_table_name` or a pattern to match a dataset that may be split over multiple dataset folders in the core-snapshot path.
#' @param pattern Defaults to dat.gz but can be extended using regex for more
#' flexibility
#' @param base_path Defaults to "R:/DATA/core-snapshot"
#'
#' @export

get_core_dat_path <- function(core_version, core_table_name, pattern = "\\.dat.gz$", base_path = "R:/DATA/core-snapshot") {
get_core_dat_path <- function(core_version, core_table_name = NULL, core_table_pattern = NULL,
pattern = "\\.dat.gz$", base_path = "R:/DATA/core-snapshot") {


path <- if (is.null(core_table_name)) {
file.path(base_path, core_version)
} else {
file.path(base_path, core_version, core_table_name, "dat")
}

core_table_dir <- list.files(
path = file.path(base_path, core_version),
pattern = core_table_name, full.names = TRUE
core_table_files <- list.files(
path = path,
pattern = pattern,
recursive = TRUE,
full.names = TRUE
)
list.files(core_table_dir, pattern = pattern, full.names = TRUE, recursive = TRUE)

if (!is.null(core_table_pattern)) {
core_table_files <- core_table_files[grepl(core_table_pattern, core_table_files)]
}

normalizePath(
core_table_files,
winslash = "/",
mustWork = TRUE
)

}

#' Get paths of data dictionaries
#'
#' @inheritParams get_core_dat_path
#' @param core_dict_name a string of the dictionary name. Often this is different from the data name
#' @param core_dict_name a string of the dataset name.
#' @export
get_core_dict_path <- function(core_version, core_dict_name, base_path = "R:/DATA/core-snapshot") {

core_version_docs <- file.path(base_path, core_version, "docs")
core_doc <- list.files(core_version_docs, pattern = core_dict_name,
core_version_docs <- normalizePath(
file.path(base_path, core_version, core_dict_name, "docs"),
winslash = "/",
mustWork = TRUE
)

core_doc <- list.files(core_version_docs, pattern = "\\.xlsx$|\\.nflt$",
full.names = TRUE,
recursive = TRUE,
ignore.case = TRUE)
core_doc <- core_doc[grepl("\\.xlsx$|\\.nflt$", core_doc)]

if (length(core_doc) > 1) core_doc <- core_doc[grepl("dictionary", core_doc)]
if (length(core_doc) > 1) core_doc <- core_doc[grepl("dictionary|\\.nflt$", core_doc)]

if(grepl("\\.xlsx$", core_doc) & length(core_doc) == 1) {
sheets <- readxl::excel_sheets(core_doc)
Expand Down
14 changes: 8 additions & 6 deletions R/dat-to-arrow-formats.R
Original file line number Diff line number Diff line change
Expand Up @@ -34,20 +34,19 @@ dat_to_arrow_formats <- function(data_path,
col_types = NULL,
col_select = NULL,
overwrite = TRUE,
data_format = c("fwf", "csv", "tsv", "csv2"),
...) {

#browser()
## Check if data dictionary is in a valid format
is_valid_data_dict(data_dict)
data_format = match.arg(data_format)

if(!dir.exists(output_dir)) dir.create(output_dir, recursive = TRUE)
#browser()

data_name <- gsub(".dat.gz", "", basename(data_path))

## Columns
if(!is.null(col_select)) col_select <- col_selector(data_dict, col_select)

d <- dipr_reader(data_path, data_dict, col_types, col_select)
d <- dipr_reader(data_path, data_dict, col_types, col_select, data_format = data_format)

tf <- file.path(tempdir(), paste0(data_name, ".", arrow_format))

Expand All @@ -64,6 +63,8 @@ dat_to_arrow_formats <- function(data_path,
arrow::write_feather(d, sink = tf)
}

if(!dir.exists(output_dir)) dir.create(output_dir, recursive = TRUE)

## copy over
if(!file.copy(tf, output_dir, overwrite = overwrite)) {
stop(glue::glue("File copy from {tf} to {output_dir} was unsuccessful"), call. = FALSE)
Expand Down Expand Up @@ -117,7 +118,7 @@ dat_to_feather <- function(...) {
#' @export
#'
#' @examples
#'
#' \dontrun{
#' data_dict_path <- dipr_example("starwars-dict.txt")
#' dict <- read.table(data_dict_path)
#' dat_path <- dipr_example("starwars-fwf.dat.gz")
Expand All @@ -129,6 +130,7 @@ dat_to_feather <- function(...) {
#' path = "starwars_arrow",
#' partitioning = "species",
#' chunk_size = 2)
#' }
#'
dat_to_datasets <- function(data_path, data_dict, chunk_size = 1000000, path, partitioning, ...) {
tdir <- file.path(tempdir(), gsub(".dat.gz", "", basename(data_path)), "arrow-tmp")
Expand Down
Loading

0 comments on commit 55035f0

Please sign in to comment.