Boehringer-Ingelheim · mingstat · Oct 17, 2024 · Oct 18, 2024 · Oct 18, 2024 · Oct 21, 2024
@@ -1,7 +1,7 @@
 Package: dv.loader
 Type: Package
 Title: Data loading module 
-Version: 2.0.0
+Version: 2.1.0
 Authors@R: c(
         person( "Boehringer-Ingelheim Pharma GmbH & Co.KG", role = c("cph", "fnd")),
         person( given = "Ming", family = "Yang", role = c("aut", "cre"), email = "[email protected]"),
@@ -13,10 +13,13 @@ License: Apache License (>= 2)
 Encoding: UTF-8
 LazyData: true
 Depends: R (>= 3.5.0)
-Imports: haven
+Imports:
+    checkmate,
+    haven
 Suggests: 
-    testthat,
+    testthat (>= 3.0.0),
     knitr,
     rmarkdown
 RoxygenNote: 7.3.0
 VignetteBuilder: knitr
+Config/testthat/edition: 3
@@ -1,5 +1,7 @@
 # Generated by roxygen2: do not edit by hand
 
 export(get_cre_path)
+export(get_file_paths)
 export(get_nfs_path)
 export(load_data)
+export(load_data_files)
@@ -1,3 +1,11 @@
+# dv.loader 2.1.0
+
+- Refactored code to improve readability and maintainability.
+
+- Fixed issue of partial matching when the `file_names` argument contains no file extensions.
+
+- Added argument `print_file_paths` to the `load_data()` function to optionally print the directory path and the names of the files being loaded.
+
 # dv.loader 2.0.0
 
 - GitHub release with QC report

@@ -1,61 +1,92 @@
-#' gets the NFS base path from an env var
-#' It assumes there is an env var
-#' called RXD_DATA which holds the path suffix.
-#' @return the NFS base path
+#' Get Base Path from an Environment Variable
+#'
+#' Reads the `RXD_DATA` environment variable, which is expected to hold the base
+#' path of the data directory. An error is raised if the variable is unset or
+#' empty, or if it does not point to an existing directory.
+#'
+#' @return [character(1)] The base path, normalized with `normalizePath()`.
+#'
 #' @export
 get_nfs_path <- function() {
   base_path <- Sys.getenv("RXD_DATA")
-  # check that RXD_DATA is set
+
   if (base_path == "") {
-    stop("Usage: get_nfs_path: RXD_DATA must be set")
+    stop("Environment variable RXD_DATA must be set")
   }
-  return(base_path)
+
+  checkmate::assert_directory_exists(base_path)
+
+  return(normalizePath(base_path))
 }
 
-#' gets the NFS base path from an env var
-#' alias for get_nfs_path to maintain backwards compatibility
+#' Get Base Path from an Environment Variable
+#'
+#' Alias for `get_nfs_path()`, kept for backwards compatibility. Both names refer
+#' to the same function, so the return value and the errors are identical.
+#'
+#' @return [character(1)] The normalized base path.
+#'
 #' @export
 get_cre_path <- get_nfs_path
 
-#' Loads data into memory based on study directory and one or more file_names.
-#' @param sub_dir A relative directory/folder that will be appended to a base path defined by `Sys.getenv("RXD_DATA")`.
-#' If the argument is left as NULL, the function will load data from the working directory `getwd()`.
-#' @param file_names Study file or file_names name(s) - can be a vector of strings.
-#' This is the only required argument.
-#' @param use_wd for "use working directory" - a flag used when importing local files
-#' not on NFS - default value is FALSE
-#' @param prefer_sas if set to TRUE, imports sas7bdat files first before looking for
-#' RDS files (the opposite of default behavior)
-#' @return a list of dataframes
-#' @export
+
+#' Load Data Files
+#'
+#' This function loads data files from a directory below the base path given by the
+#' `RXD_DATA` environment variable, or below the current working directory when
+#' `use_wd = TRUE`. It supports loading both RDS and SAS7BDAT files.
+#'
+#' @param sub_dir [character(1)] Optional sub-directory that is appended to the base directory.
+#' If NULL (the default), files are looked up in the base directory itself.
+#' @param file_names [character(1+)] Character vector of file names to load, with or without
+#' file extension. A name without extension is resolved to an RDS or SAS7BDAT file.
+#' @param use_wd [logical(1)] Whether to use the current working directory as the base directory
+#' instead of `RXD_DATA`. Default is FALSE.
+#' @param prefer_sas [logical(1)] Whether to prefer SAS7BDAT files over RDS files when a file name
+#' has no extension. Default is FALSE.
+#' @param print_file_paths [logical(1)] Whether to print the directory path and file names.
+#' Default is FALSE.
+#'
+#' @return A named list of data frames, named after the elements of `file_names`.
+#'
 #' @examples
-#' \dontrun{
-#' test_data_path <- "../inst/extdata/"
-#' data_list <- load_data(
-#'   sub_dir = test_data_path,
-#'   file_names = "dummyads2",
-#'   use_wd = TRUE
-#' )
-#' }
-load_data <- function(sub_dir = NULL, file_names, use_wd = FALSE, prefer_sas = FALSE) {
-  if (is.null(file_names)) {
-    stop("Usage: load_data: file_names: Must supply at least one file name")
+#' # Get the current value of the RXD_DATA environment variable
+#' base_dir <- Sys.getenv("RXD_DATA")
+#'
+#' # Set the RXD_DATA environment variable to the path of the haven package
+#' Sys.setenv(RXD_DATA = find.package("haven"))
+#'
+#' data_list <- load_data(sub_dir = "examples", file_names = c("iris.sas7bdat"))
+#' str(data_list)
+#'
+#' # Reset the RXD_DATA environment variable to its original value
+#' Sys.setenv(RXD_DATA = base_dir)
+#'
+#' @export
+load_data <- function(
+    sub_dir = NULL,
+    file_names,
+    use_wd = FALSE,
+    prefer_sas = FALSE,
+    print_file_paths = FALSE) {
+  checkmate::assert_character(sub_dir, len = 1, null.ok = TRUE)
+  checkmate::assert_character(file_names, min.len = 1)
+  # assert_flag() requires a single non-missing logical, so NA is rejected up front
+  checkmate::assert_flag(use_wd)
+  checkmate::assert_flag(prefer_sas)
+  checkmate::assert_flag(print_file_paths)
+
+  if (use_wd) {
+    base_dir <- getwd()
+  } else {
+    base_dir <- get_nfs_path()
   }
 
-  study_path <- "" # will be built using args
+  dir_path <- if (is.null(sub_dir)) base_dir else file.path(base_dir, sub_dir)
 
-  if (is.null(sub_dir)) {
-    study_path <- getwd()
-  } else {
-    if (use_wd) {
-      study_path <- file.path(getwd(), sub_dir)
-    } else {
-      study_path <- file.path(get_cre_path(), sub_dir)
-    }
+  file_paths <- get_file_paths(dir_path = dir_path, file_names = file_names, prefer_sas = prefer_sas)
+
+  if (isTRUE(print_file_paths)) {
+    cat("Loading data from", dir_path, "\n")
+    cat("Loading data file(s):", basename(file_paths), "\n")
   }
 
-  # create the output
-  data_list <- create_data_list(study_path, file_names, prefer_sas) # nolint
+  data_list <- load_data_files(file_paths)
+
+  # Name the elements after the requested file names rather than the resolved basenames
+  names(data_list) <- file_names
 
   return(data_list)
 }
@@ -1,81 +1,130 @@
-#' For each file name provided, reads in the first matching file and its meta data/attributes.
-#' Preference is given to RDS because its faster
-#' @param file_path the folder where the files are
-#' @param file_names CDISC names for the files
-#' @param prefer_sas if TRUE, imports .sas7bdat files first instead of .RDS files
-#' @return returns a list of dataframes with metadata as an attribute on each dataframe
-create_data_list <- function(file_path, file_names, prefer_sas) {
-  data_list <- lapply(file_names, function(x) {
-    extensions <- c("", ".rds", ".sas7bdat")
-    if (prefer_sas) {
-      extensions <- c("", ".sas7bdat", ".rds")
-    }
-
-    file_name_to_load <- NULL
-
-    candidates <- list.files(file_path)
-    uppercase_candidates <- Map(toupper, candidates)
-
-    for (ext in extensions) {
-      # Case insensitive file name match
-      uppercase_file_name <- toupper(paste0(x, ext))
-
-      match_count <- sum(uppercase_candidates == uppercase_file_name)
-      if (match_count > 1) {
-        stop(paste("create_data_list(): More than one case-insensitive file name match for", file_path, x))
+#' Get File Paths
+#'
+#' This function resolves file names to file paths inside a directory, handling both RDS and
+#' SAS7BDAT files. A name with an extension is looked up as given. A name without an extension
+#' is matched case-insensitively against `<name>.rds` and `<name>.sas7bdat`; RDS is tried first
+#' unless `prefer_sas = TRUE`. The name is compared literally, so characters such as `.` or `+`
+#' are not interpreted as regular expression syntax.
+#'
+#' @param dir_path [character(1)] The directory path where the files are located.
+#' @param file_names [character(1+)] A vector of file names, with or without file extension.
+#' @param prefer_sas [logical(1)] Whether to prefer SAS files over RDS files. Default is FALSE.
+#'
+#' @return [character] A vector of normalized file paths, one per element of `file_names`.
+#' An error is raised if a file cannot be found.
+#'
+#' @examples
+#' \dontrun{
+#' # Work in a dedicated directory so that the clean-up does not delete the session temp directory
+#' temp_dir <- tempfile("dv_loader_")
+#' dir.create(temp_dir)
+#'
+#' file_names <- c("adsl", "adae")
+#'
+#' file.create(file.path(temp_dir, paste0(file_names, ".rds")))
+#' file.create(file.path(temp_dir, paste0(file_names, ".sas7bdat")))
+#'
+#' list.files(temp_dir)
+#'
+#' get_file_paths(dir_path = temp_dir, file_names = file_names)
+#' get_file_paths(dir_path = temp_dir, file_names = file_names, prefer_sas = TRUE)
+#'
+#' unlink(temp_dir, recursive = TRUE)
+#' }
+#'
+#' @export
+get_file_paths <- function(dir_path, file_names, prefer_sas = FALSE) {
+  checkmate::assert_character(dir_path, len = 1)
+  checkmate::assert_character(file_names, min.len = 1)
+  # A flag is a single non-missing logical, so NA cannot slip through and silently drop files
+  checkmate::assert_flag(prefer_sas)
+
+  # Extensions in lookup order; the names are the labels used in the error message
+  extensions <- c(RDS = "rds", SAS = "sas7bdat")
+  if (prefer_sas) {
+    extensions <- rev(extensions)
+  }
+
+  # List the directory once instead of once per file name
+  candidates <- list.files(dir_path)
+  candidates_lower <- tolower(candidates)
+
+  file_paths <- lapply(file_names, function(file_name) {
+    file_path <- file.path(dir_path, file_name)
+    file_ext <- tools::file_ext(file_name)
+
+    if (file_ext == "") {
+      # Literal, case-insensitive comparison in order of preference
+      wanted <- tolower(paste0(file_name, ".", extensions))
+      matched <- candidates[match(wanted, candidates_lower)]
+      matched <- matched[!is.na(matched)]
+
+      if (length(matched) == 0) {
+        stop(dir_path, " does not contain ", paste(names(extensions), collapse = " or "), " file: ", file_name)
       }
-
-      index <- match(uppercase_file_name, uppercase_candidates)
-      if (!is.na(index)) {
-        file_name_to_load <- candidates[[index]]
-        break
+
+      return(file.path(dir_path, matched[1]))
+    } else {
+      if (file.exists(file_path)) {
+        return(file_path)
+      } else {
+        stop(dir_path, " does not contain: ", file_name)
       }
     }
-
-    if (is.null(file_name_to_load)) {
-      stop(paste("create_data_list(): No RDS or SAS files found for", file_path, x))
-    }
-
-    output <- read_file(file_path, file_name_to_load)
-
-    return(output)
   })
 
-  names(data_list) <- file_names
-
-  return(data_list)
+  return(normalizePath(unlist(file_paths)))
 }
 
 
-#' Reads RDS/SAS file and metadatas from first 6 items from file.info() its file path
-#' @param file_path a path to a file
-#' @param file_name name of a file
-#' @return a data object with an extra attribute of metadata
-read_file <- function(file_path, file_name) {
-  ext <- tools::file_ext(file_name)
 
-  if (!(toupper(ext) %in% c("RDS", "SAS7BDAT"))) {
-    stop("Usage error: read_file: file_name: file must either be RDS or SAS7BDAT.")
-  }
+#' Load Data Files
+#'
+#' This function reads data from multiple file paths and returns a list of data frames.
+#' It supports reading RDS and SAS7BDAT files; the file type is taken from the file
+#' extension, ignoring case.
+#'
+#' @param file_paths [character(1+)] A vector of paths to existing files.
+#'
+#' @return [list] A named list of data frames, where each name is the basename of the corresponding file path.
+#' Each element carries a `meta` attribute: a one-row data frame with the `file.info()` columns
+#' plus `path` and `file_name`.
+#'
+#' @examples
+#' path <- system.file("examples", "iris.sas7bdat", package = "haven")
+#' data_list <- load_data_files(file_paths = path)
+#' str(data_list)
+#'
+#' @export
+load_data_files <- function(file_paths) {
+  checkmate::assert_character(file_paths, min.len = 1)
+  checkmate::assert_file_exists(file_paths)
+
+  data_list <- lapply(file_paths, function(file_path) {
+    extension <- tools::file_ext(file_path)
+
+    if (tolower(extension) == "rds") {
+      data <- readRDS(file_path)
+    } else if (tolower(extension) == "sas7bdat") {
+      data <- haven::read_sas(file_path)
+    } else {
+      stop("Unsupported file extension: ", extension)
+    }
 
-  is_rds <- toupper(ext) == "RDS"
+    meta <- file.info(file_path, extra_cols = FALSE)
+    meta[["path"]] <- file_path
+    meta[["file_name"]] <- basename(file_path)
 
-  file <- file.path(file_path, file_name)
-  file_name <- tools::file_path_sans_ext(file_name)
+    # file.info() uses the path as row name; it is already kept in the `path` column.
+    # The loaded data itself is left untouched so that its own row names are preserved.
+    rownames(meta) <- NULL
+    attr(data, "meta") <- meta
 
-  # grab file info
-  meta <- file.info(file)[1L:6L]
-  meta[["path"]] <- row.names(meta)
-  meta[["file_name"]] <- file_name
-  meta <- data.frame(meta, stringsAsFactors = FALSE)
-  row.names(meta) <- NULL
+    return(data)
+  })
 
-  if (is_rds) {
-    out <- readRDS(file)
-  } else {
-    out <- haven::read_sas(file)
-  }
-  attr(out, "meta") <- meta
+  names(data_list) <- basename(file_paths)
 
-  return(out)
+  return(data_list)
 }