diff --git a/.Rbuildignore b/.Rbuildignore index 8a8c4820..6a4f537e 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -1,4 +1,5 @@ rds$ /tools/* /.github -scomps*.html$ \ No newline at end of file +scomps*.html$ +^LICENSE\.md$ diff --git a/DESCRIPTION b/DESCRIPTION index 739616b1..cb66f9d6 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -24,6 +24,7 @@ Imports: testthat, units Suggests: + exactextractr, future.batchtools, igraph, knitr, diff --git a/NAMESPACE b/NAMESPACE index a5832fae..c6adefb7 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -10,8 +10,9 @@ export(check_within_reference) export(clip_as_extent) export(clip_as_extent_ras) export(clip_as_extent_ras2) -export(distribute_process) +export(distribute_process_grid) export(distribute_process_hierarchy) +export(distribute_process_multirasters) export(estimate_demands) export(extent_to_polygon) export(extract_with) diff --git a/R/scale_process.R b/R/scale_process.R index a2d24696..5249025c 100644 --- a/R/scale_process.R +++ b/R/scale_process.R @@ -14,10 +14,10 @@ #' library(future) #' plan(multicore, workers = 4) #' # Does not run ... -#' # distribute_process() +#' # distribute_process_grid() #' @import future #' @export -distribute_process <- function( +distribute_process_grid <- function( grids, grid_target_id = NULL, fun, @@ -45,8 +45,6 @@ distribute_process <- function( grids_target <- grids[grid_target_ids %in% unlist(grids[["CGRIDID"]]),] grids_target_list <- split(grids_target, unlist(grids_target[["CGRIDID"]])) - # pgrs <- progressr::progressor(along = nrow(grids_target)) - results_distributed <- future.apply::future_lapply( grids_target_list, \(x) { @@ -70,7 +68,7 @@ distribute_process <- function( # post-processing detected_id <- grep("^id", names(par_fun), value = TRUE) - detected_point <- grep("^points", names(par_fun), value = TRUE) + detected_point <- grep("^(points|poly)", names(par_fun), value = TRUE) names(results_distributed)[1] <- par_fun[[detected_id]] results_distributed[[par_fun[[detected_id]]]] <- unlist(par_fun[[detected_point]][[par_fun[[detected_id]]]]) @@ -93,7 +91,7 @@ distribute_process <- function( #' library(future) #' plan(multicore, workers = 4) #' # Does not run ... -#' # distribute_process() +#' # distribute_process_hierarchy() #' @import future #' @import progressr #' @export @@ -133,7 +131,65 @@ distribute_process_hierarchy <- function( # post-processing detected_id <- grep("^id", names(par_fun), value = TRUE) - detected_point <- grep("^points", names(par_fun), value = TRUE) + detected_point <- grep("^(points|poly)", names(par_fun), value = TRUE) + names(results_distributed)[1] <- par_fun[[detected_id]] + results_distributed[[par_fun[[detected_id]]]] <- + unlist(par_fun[[detected_point]][[par_fun[[detected_id]]]]) + + return(results_distributed) +} + + + + +#' @title Process a given function over multiple large rasters +#' +#' @description Large raster files usually exceed the memory capacity in size. Cropping a large raster into a small subset even consumes a lot of memory and adds processing time. This function leverages terra SpatRaster proxy to distribute computation jobs over multiple cores. It is assumed that users have multiple large raster files in their disk, then each file path is assigned to a thread. Each thread will directly read raster values from the disk using C++ pointers that operate in terra functions. For use, it is strongly recommended to use vector data with small and confined spatial extent for computation to avoid out-of-memory error. For this, users may need to make subsets of input vector objects in advance. +#' @param filenames character(n). A vector or list of full file paths of raster files. n is the total number of raster files. +#' @param fun function supported in scomps. +#' @param ... Arguments passed to the argument \code{fun}. +#' @return a data.frame object with computation results. For entries of the results, consult the function used in \code{fun} argument. +#' @author Insang Song \email{geoissong@@gmail.com} +#' +#' @examples +#' library(future) +#' plan(multicore, workers = 4) +#' # Does not run ... +#' # distribute_process_multirasters() +#' @import future +#' @import progressr +#' @export +distribute_process_multirasters <- function( + filenames, + fun, + ...) { + par_fun <- list(...) + + if (any(sapply(filenames, \(x) !file.exists(x)))) { + stop("One or many of files do not exist in provided file paths. Check the paths again.\n") + } + + file_list <- split(filenames, filenames) + results_distributed <- future.apply::future_lapply( + file_list, + \(x) { + sf::sf_use_s2(FALSE) + + run_result <- tryCatch({ + res <- fun(...) + return(res) + }, + error = function(e) return(data.frame(ID = NA))) + return(run_result) + }, + future.seed = TRUE, + future.packages = c("terra", "sf", "dplyr", "scomps", "future")) + results_distributed <- do.call(dplyr::bind_rows, results_distributed) + results_distributed <- results_distributed[!is.na(results_distributed[["ID"]]),] + + # post-processing + detected_id <- grep("^id", names(par_fun), value = TRUE) + detected_point <- grep("^(points|poly)", names(par_fun), value = TRUE) names(results_distributed)[1] <- par_fun[[detected_id]] results_distributed[[par_fun[[detected_id]]]] <- unlist(par_fun[[detected_point]][[par_fun[[detected_id]]]]) @@ -141,3 +197,4 @@ distribute_process_hierarchy <- function( return(results_distributed) } + diff --git a/man/distribute_process_grid.Rd b/man/distribute_process_grid.Rd new file mode 100644 index 00000000..dd5349d0 --- /dev/null +++ b/man/distribute_process_grid.Rd @@ -0,0 +1,32 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in scomps_rmarkdown_litr.rmd. +\name{distribute_process_grid} +\alias{distribute_process_grid} +\title{Process a given function in the entire or partial computational grids} +\usage{ +distribute_process_grid(grids, grid_target_id = NULL, fun, ...) +} +\arguments{ +\item{grids}{sf/SpatVector object. Computational grids. It takes a strict assumption that the grid input is an output of \code{get_computational_regions}} + +\item{grid_target_id}{character(1) or numeric(2). Default is NULL. If NULL, all grid_ids are used. \code{"id_from:id_to"} format or \code{c(unique(grid_id)[id_from], unique(grid_id)[id_to])}} + +\item{fun}{function supported in scomps.} + +\item{...}{Arguments passed to the argument \code{fun}.} +} +\value{ +a data.frame object with computation results. For entries of the results, consult the function used in \code{fun} argument. +} +\description{ +Currently only accepting \link[future]{multicore} setting (single node, single process, and multiple threads). For details of the terminology in \code{future} package, refer to \link[future]{plan}. This function assumes that users have one raster file and a sizable and spatially distributed target locations. Each thread will process ceiling(|Ng|/|Nt|) grids where |Ng| denotes the number of grids and |Nt| denotes the number of threads. +} +\examples{ +library(future) +plan(multicore, workers = 4) +# Does not run ... +# distribute_process_grid() +} +\author{ +Insang Song \email{geoissong@gmail.com} +} \ No newline at end of file diff --git a/man/distribute_process_hierarchy.Rd b/man/distribute_process_hierarchy.Rd index ea3b1a1b..0187bd63 100644 --- a/man/distribute_process_hierarchy.Rd +++ b/man/distribute_process_hierarchy.Rd @@ -25,7 +25,7 @@ a data.frame object with computation results. For entries of the results, consul library(future) plan(multicore, workers = 4) # Does not run ... -# distribute_process() +# distribute_process_hierarchy() } \author{ Insang Song \email{geoissong@gmail.com} diff --git a/man/distribute_process_multirasters.Rd b/man/distribute_process_multirasters.Rd new file mode 100644 index 00000000..568d967d --- /dev/null +++ b/man/distribute_process_multirasters.Rd @@ -0,0 +1,30 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in scomps_rmarkdown_litr.rmd. +\name{distribute_process_multirasters} +\alias{distribute_process_multirasters} +\title{Process a given function over multiple large rasters} +\usage{ +distribute_process_multirasters(filenames, fun, ...) +} +\arguments{ +\item{filenames}{character(n). A vector or list of full file paths of raster files. n is the total number of raster files.} + +\item{fun}{function supported in scomps.} + +\item{...}{Arguments passed to the argument \code{fun}.} +} +\value{ +a data.frame object with computation results. For entries of the results, consult the function used in \code{fun} argument. +} +\description{ +Large raster files usually exceed the memory capacity in size. Cropping a large raster into a small subset even consumes a lot of memory and adds processing time. This function leverages terra SpatRaster proxy to distribute computation jobs over multiple cores. It is assumed that users have multiple large raster files in their disk, then each file path is assigned to a thread. Each thread will directly read raster values from the disk using C++ pointers that operate in terra functions. For use, it is strongly recommended to use vector data with small and confined spatial extent for computation to avoid out-of-memory error. For this, users may need to make subsets of input vector objects in advance. +} +\examples{ +library(future) +plan(multicore, workers = 4) +# Does not run ... +# distribute_process_multirasters() +} +\author{ +Insang Song \email{geoissong@gmail.com} +} \ No newline at end of file diff --git a/scomps_0.0.4.11012023.tar.gz b/scomps_0.0.4.11012023.tar.gz new file mode 100644 index 00000000..ddf6e0e3 Binary files /dev/null and b/scomps_0.0.4.11012023.tar.gz differ diff --git a/scomps_rmarkdown_litr.rmd b/scomps_rmarkdown_litr.rmd index 63f52347..81cb7e8f 100644 --- a/scomps_rmarkdown_litr.rmd +++ b/scomps_rmarkdown_litr.rmd @@ -1,7 +1,7 @@ --- title: "Creating the ``r params$package_name`` R package" author: "Insang Song" -date: "2023-10-04" +date: "2023-10-31" knit: litr::render output: litr::litr_html_document params: @@ -45,6 +45,7 @@ usethis::use_package("units") usethis::use_package("methods") usethis::use_package("progressr") usethis::use_package("logr", "Suggests") # Default is "Imports" +usethis::use_package("exactextractr", "Suggests") # Default is "Imports" usethis::use_package("future") usethis::use_package("future.apply") usethis::use_package("igraph", "Suggests") # Default is "Imports" @@ -1408,10 +1409,10 @@ extract_with_buffer_kernel <- function( #' library(future) #' plan(multicore, workers = 4) #' # Does not run ... -#' # distribute_process() +#' # distribute_process_grid() #' @import future #' @export -distribute_process <- function( +distribute_process_grid <- function( grids, grid_target_id = NULL, fun, @@ -1439,8 +1440,6 @@ distribute_process <- function( grids_target <- grids[grid_target_ids %in% unlist(grids[["CGRIDID"]]),] grids_target_list <- split(grids_target, unlist(grids_target[["CGRIDID"]])) - # pgrs <- progressr::progressor(along = nrow(grids_target)) - results_distributed <- future.apply::future_lapply( grids_target_list, \(x) { @@ -1464,7 +1463,7 @@ distribute_process <- function( # post-processing detected_id <- grep("^id", names(par_fun), value = TRUE) - detected_point <- grep("^points", names(par_fun), value = TRUE) + detected_point <- grep("^(points|poly)", names(par_fun), value = TRUE) names(results_distributed)[1] <- par_fun[[detected_id]] results_distributed[[par_fun[[detected_id]]]] <- unlist(par_fun[[detected_point]][[par_fun[[detected_id]]]]) @@ -1487,7 +1486,7 @@ distribute_process <- function( #' library(future) #' plan(multicore, workers = 4) #' # Does not run ... -#' # distribute_process() +#' # distribute_process_hierarchy() #' @import future #' @import progressr #' @export @@ -1527,7 +1526,65 @@ distribute_process_hierarchy <- function( # post-processing detected_id <- grep("^id", names(par_fun), value = TRUE) - detected_point <- grep("^points", names(par_fun), value = TRUE) + detected_point <- grep("^(points|poly)", names(par_fun), value = TRUE) + names(results_distributed)[1] <- par_fun[[detected_id]] + results_distributed[[par_fun[[detected_id]]]] <- + unlist(par_fun[[detected_point]][[par_fun[[detected_id]]]]) + + return(results_distributed) +} + + + + +#' @title Process a given function over multiple large rasters +#' +#' @description Large raster files usually exceed the memory capacity in size. Cropping a large raster into a small subset even consumes a lot of memory and adds processing time. This function leverages terra SpatRaster proxy to distribute computation jobs over multiple cores. It is assumed that users have multiple large raster files in their disk, then each file path is assigned to a thread. Each thread will directly read raster values from the disk using C++ pointers that operate in terra functions. For use, it is strongly recommended to use vector data with small and confined spatial extent for computation to avoid out-of-memory error. For this, users may need to make subsets of input vector objects in advance. +#' @param filenames character(n). A vector or list of full file paths of raster files. n is the total number of raster files. +#' @param fun function supported in scomps. +#' @param ... Arguments passed to the argument \code{fun}. +#' @return a data.frame object with computation results. For entries of the results, consult the function used in \code{fun} argument. +#' @author Insang Song \email{geoissong@@gmail.com} +#' +#' @examples +#' library(future) +#' plan(multicore, workers = 4) +#' # Does not run ... +#' # distribute_process_multirasters() +#' @import future +#' @import progressr +#' @export +distribute_process_multirasters <- function( + filenames, + fun, + ...) { + par_fun <- list(...) + + if (any(sapply(filenames, \(x) !file.exists(x)))) { + stop("One or many of files do not exist in provided file paths. Check the paths again.\n") + } + + file_list <- split(filenames, filenames) + results_distributed <- future.apply::future_lapply( + file_list, + \(x) { + sf::sf_use_s2(FALSE) + + run_result <- tryCatch({ + res <- fun(...) + return(res) + }, + error = function(e) return(data.frame(ID = NA))) + return(run_result) + }, + future.seed = TRUE, + future.packages = c("terra", "sf", "dplyr", "scomps", "future")) + results_distributed <- do.call(dplyr::bind_rows, results_distributed) + results_distributed <- results_distributed[!is.na(results_distributed[["ID"]]),] + + # post-processing + detected_id <- grep("^id", names(par_fun), value = TRUE) + detected_point <- grep("^(points|poly)", names(par_fun), value = TRUE) names(results_distributed)[1] <- par_fun[[detected_id]] results_distributed[[par_fun[[detected_id]]]] <- unlist(par_fun[[detected_point]][[par_fun[[detected_id]]]]) @@ -1535,6 +1592,7 @@ distribute_process_hierarchy <- function( return(results_distributed) } + ``` @@ -1547,7 +1605,6 @@ testthat::test_that("Processes are properly spawned and compute", { withr::local_package("dplyr") withr::local_package("progressr") withr::local_options(list(sf_use_s2 = FALSE)) - progressr::handlers(global = TRUE) ncpath <- system.file("shape/nc.shp", package = "sf") ncpoly <- terra::vect(ncpath) |> diff --git a/tests/testthat/tests.R b/tests/testthat/tests.R index b589d65f..b9744ee1 100644 --- a/tests/testthat/tests.R +++ b/tests/testthat/tests.R @@ -209,7 +209,6 @@ testthat::test_that("Processes are properly spawned and compute", { withr::local_package("dplyr") withr::local_package("progressr") withr::local_options(list(sf_use_s2 = FALSE)) - progressr::handlers(global = TRUE) ncpath <- system.file("shape/nc.shp", package = "sf") ncpoly <- terra::vect(ncpath) |> diff --git a/tools/tarballs/scomps_0.0.4.11012023.tar.gz b/tools/tarballs/scomps_0.0.4.11012023.tar.gz index 61ad0b3b..dd126211 100644 Binary files a/tools/tarballs/scomps_0.0.4.11012023.tar.gz and b/tools/tarballs/scomps_0.0.4.11012023.tar.gz differ