From 9c2eeff076c89fa6589fc14f79e3a61bb5c43d5d Mon Sep 17 00:00:00 2001 From: Fran Barton Date: Thu, 8 Feb 2024 05:05:30 +0000 Subject: [PATCH] Fixes issue #20 by removing `batch_it_simple()` --- .Rbuildignore | 1 + .lintr | 3 + DESCRIPTION | 31 ++-- NAMESPACE | 4 - NEWS.md | 21 ++- R/NHSRpostcodetools-package.R | 5 +- R/batch_it.R | 253 +++---------------------------- R/postcode_data_join.R | 2 +- man/NHSRpostcodetools-package.Rd | 2 +- man/batch_it.Rd | 67 +------- vignettes/NHSRpostcodetools.Rmd | 6 +- 11 files changed, 72 insertions(+), 323 deletions(-) diff --git a/.Rbuildignore b/.Rbuildignore index 7530c83..aff0459 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -7,3 +7,4 @@ ^docs$ ^pkgdown$ ^\.github$ +^\.lintr \ No newline at end of file diff --git a/.lintr b/.lintr index bb3cbb1..b0cec63 100644 --- a/.lintr +++ b/.lintr @@ -1,2 +1,5 @@ linters: all_linters() +exclusions: list( + "vignettes" + ) encoding: "UTF-8" diff --git a/DESCRIPTION b/DESCRIPTION index a3556e1..5789e76 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,17 +1,28 @@ Package: NHSRpostcodetools Title: Package to work with England Postcodes in R -Version: 0.0.0.9000 -Authors@R: c( - person("Zoë", "Turner", , "zoe.turner3@nhs.net", c("cre", "aut"), comment = c(ORCID = "0000-0003-1033-9158")), - person("Fran", "Barton", ,"fbarton@alwaysdata.net", "aut"), - person("NHS-R community", email = "nhs.rcommunity@nhs.net", role = "cph") - ) +Version: 0.0.0.9001 +Authors@R: + c( + person( + "Zoë", "Turner", , "zoe.turner3@nhs.net", c("cre", "aut"), + comment = c(ORCID = "0000-0003-1033-9158") + ), + person( + "Fran", "Barton", , "fbarton@alwaysdata.net", "aut", + comment = c(ORCID = "0000-0002-5650-1176") + ), + person( + "NHS-R community", email = "nhs.rcommunity@nhs.net", role = "cph" + ) + ) Maintainer: Zoë Turner Description: Functions related to England Postcodes and geographical areas. 
License: MIT + file LICENSE Encoding: UTF-8 Roxygen: list(markdown = TRUE) -RoxygenNote: 7.2.3 +RoxygenNote: 7.3.1 +Depends: + R (>= 4.1.0) Suggests: knitr, lubridate, @@ -20,9 +31,9 @@ Suggests: Config/testthat/edition: 3 Imports: assertthat, - dplyr, - httr2, - purrr, + dplyr (>= 1.1.0), + httr2 (>= 1.0.0), + purrr (>= 1.0.0), rlang, stringr, tibble, diff --git a/NAMESPACE b/NAMESPACE index 4b49c3f..15225e3 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,14 +1,10 @@ # Generated by roxygen2: do not edit by hand export(batch_it) -export(batch_it_simple) export(postcode_data_join) importFrom(dplyr,across) importFrom(rlang,.data) importFrom(rlang,`:=`) importFrom(tidyselect,all_of) importFrom(usethis,ui_info) -importFrom(usethis,ui_nope) -importFrom(usethis,ui_oops) -importFrom(usethis,ui_stop) importFrom(utils,URLencode) diff --git a/NEWS.md b/NEWS.md index 92ea733..2d0b0a4 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,22 @@ -# NHSRpostcodetools (development version) +# NHSRpostcodetools development log -* Package created with the functions from package [{myrmidon}](https://github.com/francisbarton/myrmidon) created by Fran Barton. Added `postcode_data_join.R` which uses the [postcodes.io](https://postcodes.io/) API to get additional postcode data and which removes the requirement to save the large file from the [Open Geography Portal](https://geoportal.statistics.gov.uk/datasets/postcode-to-output-area-to-lower-layer-super-output-area-to-middle-layer-super-output-area-to-local-authority-district-november-2018-lookup-in-the-uk-3/about) maintained by the ONS (Office of National Statistics). +## Development version 0.0.0.9001 (8 Feb 2024) +* Fixed issue #20 by removing `batch_it_simple()` and simplifying the code in + `batch_it()` so it in turn is simpler. + For the purposes of this package, it only has to do a simple thing. +* Updated `lintr` rules to exclude vignette `.Rmd`s. +* Ran `styler` over the package. 
+* Updated DESCRIPTION file to add an ORCID and to add minimum R version and + some minimum package versions. + +## Development version 0.0.0.9000 (23 Nov 2023) + +* Package created with the functions from package [{myrmidon}][myr_gh] created by Fran Barton. +* Added `postcode_data_join.R` which uses the [postcodes.io][pio_api] API to get additional postcode data and which removes the requirement to save the large file from the [Open Geography portal][ogp] maintained by the ONS (Office for National Statistics). * Added dependency function `batch_it()` + + +[myr_gh]: https://github.com/francisbarton/myrmidon +[pio_api]: https://postcodes.io +[ogp]: https://geoportal.statistics.gov.uk/datasets/postcode-to-output-area-to-lower-layer-super-output-area-to-middle-layer-super-output-area-to-local-authority-district-november-2018-lookup-in-the-uk-3/about \ No newline at end of file diff --git a/R/NHSRpostcodetools-package.R b/R/NHSRpostcodetools-package.R index 69c85a1..35fe153 100644 --- a/R/NHSRpostcodetools-package.R +++ b/R/NHSRpostcodetools-package.R @@ -1,12 +1,9 @@ #' @keywords internal "_PACKAGE" -# The following block is used by usethis to automatically manage -# roxygen namespace tags. Modify with care! #' @importFrom dplyr across #' @importFrom rlang .data `:=` #' @importFrom tidyselect all_of -#' @importFrom usethis ui_info ui_nope ui_oops ui_stop +#' @importFrom usethis ui_info #' @importFrom utils URLencode - NULL diff --git a/R/batch_it.R b/R/batch_it.R index 9af7ef0..75c7bfe 100644 --- a/R/batch_it.R +++ b/R/batch_it.R @@ -1,252 +1,33 @@ -# batch_it() -------------------------------------------------------------- - - -#' Convert a list or vector to a batched list of its elements -#' -#' @description -#' Batch up a long vector, or list of vectors. For example so they can be -#' passed via a `map` function to services with length-limited APIs. -#' -#' @param x a vector, or a list flattenable to a vector -#' @param batches numeric. 
The size (length) of batches to create. Can be a -#' single value or multiple values (see Examples). Should be a whole, -#' positive number, if provided, else `NULL`. -#' @param proportion numeric. Proportional sizes of batches to be created. -#' For example `c(4, 6)` will create two batches of approximately 40% and -#' 60% of the length of the target vector (`x`). When multiple -#' `proportion` values are provided, these are not repeated. -#' A single proportion less than 1 is repeated as many times as possible to -#' get near to the length of the target vector. For example, a `proportion` -#' of 0.1 will be treated as a tenth, and batch sizes will be rounded to -#' an integer size nearest to a tenth of the length of `x`. -#' @param maximise Boolean, `FALSE` by default. If `TRUE`, a vector of batch -#' sizes will be partially repeated to fit maximally to the length -#' of the target vector. See examples below. -#' @param quiet Boolean, `TRUE` by default. Whether to show informative -#' `ui_*` messages from `{usethis}`. +#' Batch a vector or list into a list of elements with a maximum size #' -#' @seealso [batch_it_simple()] which does the same thing but has fewer options -#' and works just fine for simpler cases. -#' -#' @returns All the elements of `x` batched into a list. +#' @param x A vector or list +#' @param batch_size numeric. The size (length) of batches to create. Should be +#' a single positive integer value (see Examples). 
#' #' @examples -#' batch_it(seq(2L, 60L, 2L), 6L) -#' batch_it(seq(2L, 60L, 2L), proportion = 0.2) -#' -#' batch_it(1:100, batches = c(20L, 30L, 50L)) -#' batch_it(letters, batches = c(4L, 6L)) -#' batch_it(letters, batches = c(4L, 6L), maximise = TRUE) -#' batch_it(letters, proportion = c(4L, 6L)) -#' #' # ---- -#' as_year <- function(x) { -#' lubridate::as_date( -#' lubridate::ymd(paste0(x, "-01-01")): -#' lubridate::ymd(paste0(x, "-12-31")) -#' ) -#' } -#' month_lengths <- function(year) { -#' lubridate::as_date(paste0(year, "-", 1:12, "-01")) |> -#' lubridate::days_in_month() -#' } -#' batch_it(x = as_year(2022L), batches = month_lengths(2022L)) +#' batch_it(letters, 6L) +#' batch_it(letters, 27L) #' #' @export -batch_it <- function( - x, - batches = NULL, - proportion = NULL, - maximise = FALSE, - quiet = TRUE) { - - # Ensure x is a reasonable vector - if (is.list(x)) { - if (!quiet) ui_info("Flattening list to vector") - while (purrr::pluck_depth(x) > 2L) { - x <- purrr::list_flatten(x) - } - x <- purrr::list_c(x) - } - +batch_it <- function(x, batch_size) { assertthat::assert_that( - is.atomic(x), - msg = ui_stop("This function only works with lists or vectors") + is.list(as.list(x)), + msg = "x must be a vector or a list" ) - if (length(batches) == 1L && (length(x) <= batches)) x - - if (purrr::every(list(batches, proportion), rlang::is_null)) { - ui_stop("batch_it: Either `batches` or `proportion` must be supplied.") - } - - # prefer batches if both are supplied - if (purrr::none(list(batches, proportion), rlang::is_null)) { - proportion <- NULL - if (!quiet) { - ui_info( - "batch_it: Values for both `batches` and `proportion` have been - supplied. The `batches` value is prioritised.") - } - } - - if (length(x) > 10e6L) { - ui_nope( - "batch_it: Easy, tiger! That vector has more than a million - items. Are you sure you want to continue?" 
- ) - } - - - # A sub-routine to handle proportion parameter - if (!is.null(proportion)) { - assertthat::assert_that( - is.numeric(proportion), - msg = ui_oops("batch_it: The proportion parameter is not numeric") - ) - batches <- convert_proportion_to_batches(x, proportion) - } - - # Just checking - assertthat::assert_that(is.numeric(batches), - msg = ui_oops("batch_it: Batch sizes provided are not numeric") - ) - - assertthat::assert_that(all(batches > 0L), - msg = ui_oops("batch_it: Batch sizes must be greater than zero") + assertthat::assert_that( + length(batch_size) == 1L, + round(batch_size) == batch_size, + batch_size >= 1L, + msg = "The batch_size parameter must be a single positive integer value" ) + batch_size <- min(length(x), batch_size) - batches <- round(batches) - batches <- batches[which(batches > 0L)] - batches <- maximise_batches(x, batches, maximise) - - - # This shouldn't be able to happen... - if (sum(batches) > length(x)) { - ui_stop("Batch sizes ended up longer than the length of the vector") - } - - if (length(x) - sum(batches) > 0L) { - if (!quiet) { - ui_info( - "The length of the target vector `x` is not an exact multiple of the - batch length(s) supplied. The remaining elements of `x` will be added - as a final batch." 
- ) - } - batches <- c(batches, length(x) - sum(batches)) - } - - list_a <- c(0L, utils::head(batches, -1L)) |> - rlang::set_names(names(batches)) |> - purrr::accumulate(sum, .simplify = TRUE) - list_b <- batches |> - purrr::accumulate(sum, .simplify = TRUE) - - purrr::map2(list_a, list_b, \(a, b) x[(a + 1L):b]) -} -# end of main function - - - -# helper functions (internal) --------------------------------------------- - - -#' @noRd -convert_proportion_to_batches <- function(x, proportion) { - if (!all(proportion > 0L)) { - ui_stop("Proportions must be positive numbers") - } - - if (length(proportion) == 1L && proportion < 1L) { - proportion <- rep(proportion, times = floor(1L / proportion)) - if (sum(proportion) < 1L) { - proportion <- c(proportion, 1L - sum(proportion)) - } - } - - (proportion / sum(proportion)) * length(x) -} - - -#' @noRd -maximise_batches <- function(x, batches, maximise) { - # If maximise = TRUE and `batches` has length > 0, partially repeat the - # batch lengths as far as possible within the length of x. - # If maximise = FALSE, only repeat the batch lengths in full as far as they - # will fit. then return the remainder as a final batch. - if (maximise) { - batches <- rep(batches, times = ceiling(length(x) / sum(batches))) - while (sum(batches) > length(x)) { - batches <- utils::head(batches, -1L) - } - batches - } else { - rep(batches, times = floor(length(x) / sum(batches))) - } -} - - - -# batch_it_simple() ------------------------------------------------------- - - - -#' Convert a list or vector to a batched list of its elements -#' -#' @rdname batch_it -#' -#' @param batch_size numeric. The size (length) of batches to create. Should be -#' a single value (see Examples). If supplied as a decimal (<1), it will be -#' interpreted as a proportion of `length(x)`. 
-#' -#' @examples -#' # ---- -#' batch_it_simple(letters, 6L) -#' batch_it_simple(letters, 0.45) -#' -#' @export -batch_it_simple <- function(x, batch_size) { - - # ensure x is a reasonable vector - if (is.list(x)) { - ui_info("Converting list to single vector") - x <- purrr::list_c(x) - } - - if (!is.vector(x)) { - ui_stop("This function only works with lists or vectors") - } - - if (length(x) > 10e6L) { - ui_nope( - "Easy, tiger! That vector has more than a million items. - Are you sure you want to continue?" - ) - } - - # ensure batch_size is an appropriate single positive number - if (length(batch_size) != 1L || batch_size <= 0L) { - ui_stop("The batch_size parameter must be a single positive value") - } - - # if batch_size is supplied as a decimal between 0 and 1, interpret this as - # a proportion of the length of `x`, and convert to an integer - if (batch_size < 1L) { - batch_size <- ceiling(length(x) * batch_size) - } - - if (batch_size > length(x)) { - batch_size <- length(x) - } - - batch_size <- round(batch_size) - assertthat::assert_that(batch_size > 0L) - - # do the batching by creating a vector of factors of length(x) + # Do the batching by creating a vector of factors of length(x), # then use this as the factor argument to split(x) - f <- rep(1L:ceiling(length(x) / batch_size), each = batch_size) |> + f <- rep(seq_len(ceiling(length(x) / batch_size)), each = batch_size) |> utils::head(length(x)) unname(split(x, f)) } diff --git a/R/postcode_data_join.R b/R/postcode_data_join.R index c830a34..7c4df98 100644 --- a/R/postcode_data_join.R +++ b/R/postcode_data_join.R @@ -105,7 +105,7 @@ postcode_data_join <- function(x, var = "postcode", fix_invalid = TRUE) { fixed_ac_data <- ac_results |> purrr::list_c() |> - batch_it_simple(100L) |> + batch_it(100L) |> purrr::map_df(bulk_lookup) |> unnest_codes() |> dplyr::rename(new_postcode = "postcode") diff --git a/man/NHSRpostcodetools-package.Rd b/man/NHSRpostcodetools-package.Rd index 9ae7389..3967fce 100644 --- 
a/man/NHSRpostcodetools-package.Rd +++ b/man/NHSRpostcodetools-package.Rd @@ -21,7 +21,7 @@ Useful links: Authors: \itemize{ - \item Fran Barton \email{fbarton@alwaysdata.net} + \item Fran Barton \email{fbarton@alwaysdata.net} (\href{https://orcid.org/0000-0002-5650-1176}{ORCID}) } Other contributors: diff --git a/man/batch_it.Rd b/man/batch_it.Rd index 6c909cd..0a4bb3d 100644 --- a/man/batch_it.Rd +++ b/man/batch_it.Rd @@ -2,75 +2,22 @@ % Please edit documentation in R/batch_it.R \name{batch_it} \alias{batch_it} -\alias{batch_it_simple} -\title{Convert a list or vector to a batched list of its elements} +\title{Batch a vector or list into a list of elements with a maximum size} \usage{ -batch_it(x, batches = NULL, proportion = NULL, maximise = FALSE, quiet = TRUE) - -batch_it_simple(x, batch_size) +batch_it(x, batch_size) } \arguments{ -\item{x}{a vector, or a list flattenable to a vector} - -\item{batches}{numeric. The size (length) of batches to create. Can be a -single value or multiple values (see Examples). Should be a whole, -positive number, if provided, else \code{NULL}.} - -\item{proportion}{numeric. Proportional sizes of batches to be created. -For example \code{c(4, 6)} will create two batches of approximately 40\% and -60\% of the length of the target vector (\code{x}). When multiple -\code{proportion} values are provided, these are not repeated. -A single proportion less than 1 is repeated as many times as possible to -get near to the length of the target vector. For example, a \code{proportion} -of 0.1 will be treated as a tenth, and batch sizes will be rounded to -an integer size nearest to a tenth of the length of \code{x}.} - -\item{maximise}{Boolean, \code{FALSE} by default. If \code{TRUE}, a vector of batch -sizes will be partially repeated to fit maximally to the length -of the target vector. See examples below.} - -\item{quiet}{Boolean, \code{TRUE} by default. 
Whether to show informative -\verb{ui_*} messages from \code{{usethis}}.} +\item{x}{A vector or list} \item{batch_size}{numeric. The size (length) of batches to create. Should be -a single value (see Examples). If supplied as a decimal (<1), it will be -interpreted as a proportion of \code{length(x)}.} -} -\value{ -All the elements of \code{x} batched into a list. +a single positive integer value (see Examples).} } \description{ -Batch up a long vector, or list of vectors. For example so they can be -passed via a \code{map} function to services with length-limited APIs. +Batch a vector or list into a list of elements with a maximum size } \examples{ -batch_it(seq(2L, 60L, 2L), 6L) -batch_it(seq(2L, 60L, 2L), proportion = 0.2) - -batch_it(1:100, batches = c(20L, 30L, 50L)) -batch_it(letters, batches = c(4L, 6L)) -batch_it(letters, batches = c(4L, 6L), maximise = TRUE) -batch_it(letters, proportion = c(4L, 6L)) - -# ---- -as_year <- function(x) { - lubridate::as_date( - lubridate::ymd(paste0(x, "-01-01")): - lubridate::ymd(paste0(x, "-12-31")) - ) -} -month_lengths <- function(year) { - lubridate::as_date(paste0(year, "-", 1:12, "-01")) |> - lubridate::days_in_month() -} -batch_it(x = as_year(2022L), batches = month_lengths(2022L)) - # ---- -batch_it_simple(letters, 6L) -batch_it_simple(letters, 0.45) +batch_it(letters, 6L) +batch_it(letters, 27L) } -\seealso{ -\code{\link[=batch_it_simple]{batch_it_simple()}} which does the same thing but has fewer options -and works just fine for simpler cases. 
-} diff --git a/vignettes/NHSRpostcodetools.Rmd b/vignettes/NHSRpostcodetools.Rmd index 3715e79..5fe39db 100644 --- a/vignettes/NHSRpostcodetools.Rmd +++ b/vignettes/NHSRpostcodetools.Rmd @@ -39,14 +39,13 @@ Join this vector to the postcode data ```{r} postcode_data_join(postcodes, fix_invalid = TRUE) - ``` ### A tibble of postcodes ```{r} test_df1 <- dplyr::tibble( - place = paste0("place_", 1:3), + place = paste0("place_", seq(3L)), postcode = postcodes ) ``` @@ -57,14 +56,12 @@ to be recognised. ```{r} postcode_data_join(test_df1, fix_invalid = TRUE) - ``` Note that the parameter `fix_invalid = TRUE` defaults to TRUE: ```{r} postcode_data_join(test_df1) - ``` And if it is set to FALSE the same message appears but the `new_postcode` is not @@ -72,5 +69,4 @@ populated and has `NA`. ```{r} postcode_data_join(test_df1, fix_invalid = FALSE) - ```