Merge pull request #26 from nhs-r-community/francisbarton/issue20

Fixes issue #20 by removing `batch_it_simple()`
nhs-r-community · Feb 8, 2024 · dd1b3df · dd1b3df
2 parents 8298f6e + 9c2eeff
commit dd1b3df
Show file tree

Hide file tree

Showing 11 changed files with 72 additions and 323 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -7,3 +7,4 @@
 ^docs$
 ^pkgdown$
 ^\.github$
+^\.lintr
diff --git a/.lintr b/.lintr
@@ -1,2 +1,5 @@
 linters: all_linters()
+exclusions: list(
+  "vignettes"
+  )
 encoding: "UTF-8"
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,17 +1,28 @@
 Package: NHSRpostcodetools
 Title: Package to work with England Postcodes in R
-Version: 0.0.0.9000
-Authors@R: c(
-          person("Zoë", "Turner", , "[email protected]", c("cre", "aut"), comment = c(ORCID = "0000-0003-1033-9158")),
-          person("Fran", "Barton", ,"[email protected]", "aut"),
-          person("NHS-R community", email = "[email protected]", role = "cph")
-          )
+Version: 0.0.0.9001
+Authors@R:
+    c(
+        person(
+            "Zoë", "Turner", , "[email protected]", c("cre", "aut"),
+            comment = c(ORCID = "0000-0003-1033-9158")
+        ),
+        person(
+            "Fran", "Barton", , "[email protected]", "aut",
+            comment = c(ORCID = "0000-0002-5650-1176")
+        ),
+        person(
+            "NHS-R community", email = "[email protected]", role = "cph"
+        )
+    )
 Maintainer: Zoë Turner <[email protected]>
 Description: Functions related to England Postcodes and geographical areas.
 License: MIT + file LICENSE
 Encoding: UTF-8
 Roxygen: list(markdown = TRUE)
-RoxygenNote: 7.2.3
+RoxygenNote: 7.3.1
+Depends: 
+    R (>= 4.1.0)
 Suggests: 
     knitr,
     lubridate,
@@ -20,9 +31,9 @@ Suggests:
 Config/testthat/edition: 3
 Imports: 
     assertthat,
-    dplyr,
-    httr2,
-    purrr,
+    dplyr (>= 1.1.0),
+    httr2 (>= 1.0.0),
+    purrr (>= 1.0.0),
     rlang,
     stringr,
     tibble,

diff --git a/NAMESPACE b/NAMESPACE
@@ -1,14 +1,10 @@
 # Generated by roxygen2: do not edit by hand
 
 export(batch_it)
-export(batch_it_simple)
 export(postcode_data_join)
 importFrom(dplyr,across)
 importFrom(rlang,.data)
 importFrom(rlang,`:=`)
 importFrom(tidyselect,all_of)
 importFrom(usethis,ui_info)
-importFrom(usethis,ui_nope)
-importFrom(usethis,ui_oops)
-importFrom(usethis,ui_stop)
 importFrom(utils,URLencode)
diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,22 @@
-# NHSRpostcodetools (development version)
+# NHSRpostcodetools development log
 
-* Package created with the functions from package [{myrmidon}](https://github.com/francisbarton/myrmidon) created by Fran Barton. Added `postcode_data_join.R` which uses the [postcodes.io](https://postcodes.io/) API to get additional postcode data and which removes the requirement to save the large file from the [Open Geography Portal](https://geoportal.statistics.gov.uk/datasets/postcode-to-output-area-to-lower-layer-super-output-area-to-middle-layer-super-output-area-to-local-authority-district-november-2018-lookup-in-the-uk-3/about) maintained by the ONS (Office of National Statistics).
+## Development version 0.0.0.9001 (8 Feb 2024)
 
+* Fixed issue #20 by removing `batch_it_simple()` and simplifying the code in
+    `batch_it()` so it in turn is simpler.
+    For the purposes of this package, it only has to do a simple thing.
+* Updated `lintr` rules to exclude vignette `.Rmd`s.
+* Ran `styler` over the package.
+* Updated DESCRIPTION file to add an OrcID and to add minimum R version and
+    some minimum package versions.
+
+## Development version 0.0.0.9000 (23 Nov 2023)
+
+* Package created with the functions from package [{myrmidon}][myr_gh] created by Fran Barton.
+* Added `postcode_data_join.R` which uses the [postcodes.io][pio_api] API to get additional postcode data and which removes the requirement to save the large file from the [Open Geography portal][ogp] maintained by the ONS (Office for National Statistics).
 * Added dependency function `batch_it()`
+
+
+[myr_gh]: https://github.com/francisbarton/myrmidon
+[pio_api]: https://postcodes.io
+[ogp]: https://geoportal.statistics.gov.uk/datasets/postcode-to-output-area-to-lower-layer-super-output-area-to-middle-layer-super-output-area-to-local-authority-district-november-2018-lookup-in-the-uk-3/about
diff --git a/R/NHSRpostcodetools-package.R b/R/NHSRpostcodetools-package.R
@@ -1,12 +1,9 @@
 #' @keywords internal
 "_PACKAGE"
 
-# The following block is used by usethis to automatically manage
-# roxygen namespace tags. Modify with care!
 #' @importFrom dplyr across
 #' @importFrom rlang .data `:=`
 #' @importFrom tidyselect all_of
-#' @importFrom usethis ui_info ui_nope ui_oops ui_stop
+#' @importFrom usethis ui_info
 #' @importFrom utils URLencode
-
 NULL
diff --git a/R/batch_it.R b/R/batch_it.R
@@ -1,252 +1,33 @@
-# batch_it() --------------------------------------------------------------
-
-
-#' Convert a list or vector to a batched list of its elements
-#'
-#' @description
-#' Batch up a long vector, or list of vectors. For example so they can be
-#' passed via a `map` function to services with length-limited APIs.
-#'
-#' @param x a vector, or a list flattenable to a vector
-#' @param batches numeric. The size (length) of batches to create. Can be a
-#'   single value or multiple values (see Examples). Should be a whole,
-#'   positive number, if provided, else `NULL`.
-#' @param proportion numeric. Proportional sizes of batches to be created.
-#'   For example `c(4, 6)` will create two batches of approximately 40% and
-#'   60% of the length of the target vector (`x`). When multiple
-#'   `proportion` values are provided, these are not repeated.
-#'   A single proportion less than 1 is repeated as many times as possible to
-#'   get near to the length of the target vector. For example, a `proportion`
-#'   of 0.1 will be treated as a tenth, and batch sizes will be rounded to
-#'   an integer size nearest to a tenth of the length of `x`.
-#' @param maximise Boolean, `FALSE` by default. If `TRUE`, a vector of batch
-#'   sizes will be partially repeated to fit maximally to the length
-#'   of the target vector. See examples below.
-#' @param quiet Boolean, `TRUE` by default. Whether to show informative
-#'   `ui_*` messages from `{usethis}`.
+#' Batch a vector or list into a list of elements with a maximum size
 #'
-#' @seealso [batch_it_simple()] which does the same thing but has fewer options
-#'   and works just fine for simpler cases.
-#'
-#' @returns All the elements of `x` batched into a list.
+#' @param x A vector or list
+#' @param batch_size numeric. The size (length) of batches to create. Should be
+#'   a single positive integer value (see Examples).
 #'
 #' @examples
-#' batch_it(seq(2L, 60L, 2L), 6L)
-#' batch_it(seq(2L, 60L, 2L), proportion = 0.2)
-#'
-#' batch_it(1:100, batches = c(20L, 30L, 50L))
-#' batch_it(letters, batches = c(4L, 6L))
-#' batch_it(letters, batches = c(4L, 6L), maximise = TRUE)
-#' batch_it(letters, proportion = c(4L, 6L))
-#'
 #' # ----
-#' as_year <- function(x) {
-#'   lubridate::as_date(
-#'     lubridate::ymd(paste0(x, "-01-01")):
-#'     lubridate::ymd(paste0(x, "-12-31"))
-#'   )
-#' }
-#' month_lengths <- function(year) {
-#'   lubridate::as_date(paste0(year, "-", 1:12, "-01")) |>
-#'     lubridate::days_in_month()
-#' }
-#' batch_it(x = as_year(2022L), batches = month_lengths(2022L))
+#' batch_it(letters, 6L)
+#' batch_it(letters, 27L)
 #'
 #' @export
-batch_it <- function(
-    x,
-    batches = NULL,
-    proportion = NULL,
-    maximise = FALSE,
-    quiet = TRUE) {
-
-  # Ensure x is a reasonable vector
-  if (is.list(x)) {
-    if (!quiet) ui_info("Flattening list to vector")
-    while (purrr::pluck_depth(x) > 2L) {
-      x <- purrr::list_flatten(x)
-    }
-    x <- purrr::list_c(x)
-  }
-
+batch_it <- function(x, batch_size) {
   assertthat::assert_that(
-    is.atomic(x),
-    msg = ui_stop("This function only works with lists or vectors")
+    is.list(as.list(x)),
+    msg = "x must be a vector or a list"
   )
 
-  if (length(batches) == 1L && (length(x) <= batches)) x
-
-  if (purrr::every(list(batches, proportion), rlang::is_null)) {
-    ui_stop("batch_it: Either `batches` or `proportion` must be supplied.")
-  }
-
-  # prefer batches if both are supplied
-  if (purrr::none(list(batches, proportion), rlang::is_null)) {
-    proportion <- NULL
-    if (!quiet) {
-      ui_info(
-        "batch_it: Values for both `batches` and `proportion` have been
-        supplied. The `batches` value is prioritised.")
-    }
-  }
-
-  if (length(x) > 10e6L) {
-    ui_nope(
-      "batch_it: Easy, tiger! That vector has more than a million
-      items. Are you sure you want to continue?"
-    )
-  }
-
-
-  # A sub-routine to handle proportion parameter
-  if (!is.null(proportion)) {
-    assertthat::assert_that(
-      is.numeric(proportion),
-      msg = ui_oops("batch_it: The proportion parameter is not numeric")
-    )
-    batches <- convert_proportion_to_batches(x, proportion)
-  }
-
-  # Just checking
-  assertthat::assert_that(is.numeric(batches),
-    msg = ui_oops("batch_it: Batch sizes provided are not numeric")
-  )
-
-  assertthat::assert_that(all(batches > 0L),
-    msg = ui_oops("batch_it: Batch sizes must be greater than zero")
+  assertthat::assert_that(
+    length(batch_size) == 1L,
+    round(batch_size) == batch_size,
+    batch_size >= 1L,
+    msg = "The batch_size parameter must be a single positive integer value"
   )
 
+  batch_size <- min(length(x), batch_size)
 
-  batches <- round(batches)
-  batches <- batches[which(batches > 0L)]
-  batches <- maximise_batches(x, batches, maximise)
-
-
-  # This shouldn't be able to happen...
-  if (sum(batches) > length(x)) {
-    ui_stop("Batch sizes ended up longer than the length of the vector")
-  }
-
-  if (length(x) - sum(batches) > 0L) {
-    if (!quiet) {
-      ui_info(
-        "The length of the target vector `x` is not an exact multiple of the
-        batch length(s) supplied. The remaining elements of `x` will be added
-        as a final batch."
-      )
-    }
-    batches <- c(batches, length(x) - sum(batches))
-  }
-
-  list_a <- c(0L, utils::head(batches, -1L)) |>
-    rlang::set_names(names(batches)) |>
-    purrr::accumulate(sum, .simplify = TRUE)
-  list_b <- batches |>
-    purrr::accumulate(sum, .simplify = TRUE)
-
-  purrr::map2(list_a, list_b, \(a, b) x[(a + 1L):b])
-}
-# end of main function
-
-
-
-# helper functions (internal) ---------------------------------------------
-
-
-#' @noRd
-convert_proportion_to_batches <- function(x, proportion) {
-  if (!all(proportion > 0L)) {
-    ui_stop("Proportions must be positive numbers")
-  }
-
-  if (length(proportion) == 1L && proportion < 1L) {
-    proportion <- rep(proportion, times = floor(1L / proportion))
-    if (sum(proportion) < 1L) {
-      proportion <- c(proportion, 1L - sum(proportion))
-    }
-  }
-
-  (proportion / sum(proportion)) * length(x)
-}
-
-
-#' @noRd
-maximise_batches <- function(x, batches, maximise) {
-  # If maximise = TRUE and `batches` has length > 0, partially repeat the
-  # batch lengths as far as possible within the length of x.
-  # If maximise = FALSE, only repeat the batch lengths in full as far as they
-  # will fit. then return the remainder as a final batch.
-  if (maximise) {
-    batches <- rep(batches, times = ceiling(length(x) / sum(batches)))
-    while (sum(batches) > length(x)) {
-      batches <- utils::head(batches, -1L)
-    }
-    batches
-  } else {
-    rep(batches, times = floor(length(x) / sum(batches)))
-  }
-}
-
-
-
-# batch_it_simple() -------------------------------------------------------
-
-
-
-#' Convert a list or vector to a batched list of its elements
-#'
-#' @rdname batch_it
-#'
-#' @param batch_size numeric. The size (length) of batches to create. Should be
-#'   a single value (see Examples). If supplied as a decimal (<1), it will be
-#'   interpreted as a proportion of `length(x)`.
-#'
-#' @examples
-#' # ----
-#' batch_it_simple(letters, 6L)
-#' batch_it_simple(letters, 0.45)
-#'
-#' @export
-batch_it_simple <- function(x, batch_size) {
-
-  # ensure x is a reasonable vector
-  if (is.list(x)) {
-    ui_info("Converting list to single vector")
-    x <- purrr::list_c(x)
-  }
-
-  if (!is.vector(x)) {
-    ui_stop("This function only works with lists or vectors")
-  }
-
-  if (length(x) > 10e6L) {
-    ui_nope(
-      "Easy, tiger! That vector has more than a million items.
-      Are you sure you want to continue?"
-    )
-  }
-
-  # ensure batch_size is an appropriate single positive number
-  if (length(batch_size) != 1L || batch_size <= 0L) {
-    ui_stop("The batch_size parameter must be a single positive value")
-  }
-
-  # if batch_size is supplied as a decimal between 0 and 1, interpret this as
-  # a proportion of the length of `x`, and convert to an integer
-  if (batch_size < 1L) {
-    batch_size <- ceiling(length(x) * batch_size)
-  }
-
-  if (batch_size > length(x)) {
-    batch_size <- length(x)
-  }
-
-  batch_size <- round(batch_size)
-  assertthat::assert_that(batch_size > 0L)
-
-  # do the batching by creating a vector of factors of length(x)
+  # Do the batching by creating a vector of factors of length(x),
   # then use this as the factor argument to split(x)
-  f <- rep(1L:ceiling(length(x) / batch_size), each = batch_size) |>
+  f <- rep(seq_len(ceiling(length(x) / batch_size)), each = batch_size) |>
     utils::head(length(x))
   unname(split(x, f))
 }
diff --git a/R/postcode_data_join.R b/R/postcode_data_join.R
@@ -105,7 +105,7 @@ postcode_data_join <- function(x, var = "postcode", fix_invalid = TRUE) {
 
           fixed_ac_data <- ac_results |>
             purrr::list_c() |>
-            batch_it_simple(100L) |>
+            batch_it(100L) |>
             purrr::map_df(bulk_lookup) |>
             unnest_codes() |>
             dplyr::rename(new_postcode = "postcode")