Skip to content

Commit

Permalink
- Add ResamplingSptCVCluto (#53)
Browse files Browse the repository at this point in the history
  • Loading branch information
pat-s authored Jul 30, 2020
1 parent 1f06e1b commit 1b20d94
Show file tree
Hide file tree
Showing 41 changed files with 1,651 additions and 110 deletions.
5 changes: 5 additions & 0 deletions .Rbuildignore
Original file line number Diff line number Diff line change
Expand Up @@ -18,3 +18,8 @@
^\.lintr$
^\.vscode$
^gfortran.*
^joss$
^man-roxygen$
^BinaryFiles$
^cluto$
^zzz$
20 changes: 14 additions & 6 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
## tic GitHub Actions template: linux-macos-windows-deploy
## tic GitHub Actions template: custom-deploy
## revision date: 2020-07-29
on:
push:
Expand All @@ -22,9 +22,11 @@ jobs:
config:
# use a different tic template type if you do not want to build on all listed platforms
- { os: windows-latest, r: "release" }
- { os: macOS-latest, r: "release", pkgdown: "true", latex: "true" }
# [Custom matrix env var]
- { os: macOS-latest, r: "release", pkgdown: "false", latex: "true" }
- { os: ubuntu-latest, r: "devel" }
- { os: ubuntu-latest, r: "release" }
# [Custom matrix env var]
- { os: ubuntu-latest, r: "release", pkgdown: "true" }

env:
# otherwise remotes::fun() errors cause the build to fail. Example: Unavailability of binaries
Expand Down Expand Up @@ -72,8 +74,8 @@ jobs:
uses: pat-s/[email protected]
with:
path: ${{ env.R_LIBS_USER }}
key: ${{ runner.os }}-r-${{ matrix.config.r }}-${{steps.date.outputs.date}}
restore-keys: ${{ runner.os }}-r-${{ matrix.config.r }}-${{steps.date.outputs.date}}
key: ${{ runner.os }}-r-${{ matrix.config.r }}-${{steps.date.outputs.date}}1
restore-keys: ${{ runner.os }}-r-${{ matrix.config.r }}-${{steps.date.outputs.date}}1

- name: "[Custom block] [Linux] Install spatial libraries"
if: runner.os == 'Linux'
Expand All @@ -83,6 +85,8 @@ jobs:
- name: "[Custom block] [macOS] Install spatial libraries"
if: runner.os == 'macOS'
run: |
# conflicts with gfortran from r-lib/actions when linking gcc
rm '/usr/local/bin/gfortran'
brew install ccache gdal geos proj udunits Caskroom/cask/xquartz
# for some strange Windows reason this step and the next one need to be decoupled
Expand All @@ -106,11 +110,15 @@ jobs:
echo -e 'options(Ncpus = 4, pkgType = "source", repos = structure(c(CRAN = "https://cloud.r-project.org/")))' > $HOME/.Rprofile
Rscript -e "remotes::install_github('ropensci/tic')" -e "print(tic::dsl_load())" -e "tic::prepare_all_stages()" -e "tic::before_install()" -e "tic::install()"
# [Custom block]
- name: "[Stage] Before Script"
run: Rscript -e 'tic::before_script()'

- name: "[Stage] Script"
run: Rscript -e 'tic::script()'

- name: "[Stage] After Success"
if: matrix.config.os == 'macOS-latest' && matrix.config.r == 'release'
if: matrix.config.os == 'ubuntu-latest' && matrix.config.r == 'release'
run: Rscript -e "tic::after_success()"

- name: "[Stage] Upload R CMD check artifacts"
Expand Down
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,4 +18,4 @@ repos:
rev: v2.4.0
hooks:
- id: check-added-large-files
args: ['--maxkb=800']
args: ['--maxkb=2300']
14 changes: 11 additions & 3 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -22,25 +22,31 @@ URL: https://mlr3spatiotempcv.mlr-org.com/,
https://github.com/mlr-org/mlr3spatiotempcv
BugReports: https://github.com/mlr-org/mlr3spatiotempcv/issues
Depends:
R (>= 3.1.0)
R (>= 3.5.0)
Imports:
checkmate,
cli,
data.table,
ggplot2,
mlr3,
mlr3misc (>= 0.1.7),
paradox,
R6
R6,
utils
Suggests:
bibtex,
blockCV (>= 2.1.1),
bookdown,
cowplot,
ggsci,
GSIF,
knitr,
plotly,
renv,
rmarkdown,
rpart,
sf,
skmeans,
testthat (>= 2.1.0),
vdiffr
VignetteBuilder:
Expand All @@ -51,16 +57,18 @@ Encoding: UTF-8
LazyData: true
NeedsCompilation: no
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.1.0
RoxygenNote: 7.1.1
Collate:
'Task_classif_ecuador.R'
'Task_classif_diplodia.R'
'Task_regr_cookfarm_profiles.R'
'ResamplingSpCVBlock.R'
'ResamplingSpCVBuffer.R'
'ResamplingSptCVCluto.R'
'ResamplingSpCVCoords.R'
'ResamplingSpCVEnv.R'
'ResamplingRepeatedSpCVCoords.R'
'ResamplingRepeatedSptCVCluto.R'
'ResamplingRepeatedSpCVEnv.R'
'ResamplingRepeatedSpCVBlock.R'
'TaskClassifST.R'
Expand Down
5 changes: 5 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,21 @@ S3method(autoplot,ResamplingRepeatedCV)
S3method(autoplot,ResamplingRepeatedSpCVBlock)
S3method(autoplot,ResamplingRepeatedSpCVCoords)
S3method(autoplot,ResamplingRepeatedSpCVEnv)
S3method(autoplot,ResamplingRepeatedSptCVCluto)
S3method(autoplot,ResamplingSpCVBlock)
S3method(autoplot,ResamplingSpCVBuffer)
S3method(autoplot,ResamplingSpCVCoords)
S3method(autoplot,ResamplingSpCVEnv)
S3method(autoplot,ResamplingSptCVCluto)
export(ResamplingRepeatedSpCVBlock)
export(ResamplingRepeatedSpCVCoords)
export(ResamplingRepeatedSpCVEnv)
export(ResamplingRepeatedSptCVCluto)
export(ResamplingSpCVBlock)
export(ResamplingSpCVBuffer)
export(ResamplingSpCVCoords)
export(ResamplingSpCVEnv)
export(ResamplingSptCVCluto)
export(TaskClassifST)
export(TaskRegrST)
export(autoplot)
Expand All @@ -30,3 +34,4 @@ importFrom(R6,R6Class)
importFrom(data.table,data.table)
importFrom(data.table,rbindlist)
importFrom(mlr3,as_data_backend)
importFrom(utils,globalVariables)
223 changes: 223 additions & 0 deletions R/ResamplingRepeatedSptCVCluto.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,223 @@
#' @title Repeated Spatio-Temporal Cluster Resampling
#'
#' @import mlr3
#'
#' @description Spatio-temporal cluster partitioning via the `vcluster`
#'   executable of the
#'   [CLUTO](http://glaros.dtc.umn.edu/gkhome/cluto/cluto/overview) clustering
#'   application.
#'
#'   This partitioning method relies on the external CLUTO library.
#'   To use it, CLUTO's executables need to be downloaded and installed into
#'   this package.
#'
#'   See \url{https://gist.github.com/pat-s/6430470cf817050e27d26c43c0e9be72}
#'   for an installation approach that should work on Windows and Linux.
#'   macOS is not supported by CLUTO.
#'
#'   Before using this method, please check the restrictive
#'   [copyright](http://glaros.dtc.umn.edu/gkhome/cluto/cluto/download) shown
#'   below.
#'
#' @details
#' By default, `-clmethod='direct'` is passed to the `vcluster` executable in
#' contrast to the upstream default `-clmethod='rb'`.
#' There is no evidence or research that this method is the best among the
#' available ones ("rb", "rbr", "direct", "agglo", "graph", "bagglo"). Also,
#' various other parameters can be set via argument `cluto_parameters` to
#' achieve different clustering results.
#'
#' Parameter `-clusterfile` is handled by \CRANpkg{skmeans} and cannot be
#' changed.
#'
#' In addition, a different seed (`-seed`, set to the repetition number) is
#' passed to `vcluster` in every repetition to enforce different clusters for
#' each repetition. By default, all repetitions would use the same seed and
#' hence be identical. Note that setting an R seed has no effect here.
#'
#' @section Copyright:
#'
#' CLUTO's copyright is as follows:
#'
#' The CLUTO package is copyrighted by the Regents of the University of
#' Minnesota.
#' It can be freely used for educational and research purposes by non-profit
#' institutions and US government agencies only.
#' Other organizations are allowed to use CLUTO only for evaluation purposes,
#' and any further uses will require prior approval.
#' The software may not be sold or redistributed without prior approval.
#' One may make copies of the software for their use provided that the copies,
#' are not sold or distributed, are used under the same terms and conditions.
#' As unestablished research software, this code is provided on an "as is" basis
#' without warranty of any kind, either expressed or implied.
#' The downloading, or executing any part of this software constitutes an
#' implicit agreement to these terms. These terms and conditions are subject to
#' change at any time without prior notice.
#'
#' @export
#' @examples
#' \dontrun{
#' library(mlr3)
#' library(mlr3spatiotempcv)
#' task = tsk("cookfarm")
#'
#' # Instantiate Resampling
#' rrcv = rsmp("repeated-spcv-cluto", folds = 3, repeats = 5)
#' rrcv$instantiate(task, time_var = "Date")
#'
#' # Iteration bookkeeping:
#' rrcv$iters
#' rrcv$folds(1:6)
#' rrcv$repeats(1:6)
#'
#' # Individual sets:
#' rrcv$train_set(1)
#' rrcv$test_set(1)
#' intersect(rrcv$train_set(1), rrcv$test_set(1))
#'
#' # Internal storage:
#' rrcv$instance # table
#' }
ResamplingRepeatedSptCVCluto = R6Class("ResamplingRepeatedSptCVCluto",
  inherit = mlr3::Resampling,

  public = list(
    #' @description
    #' Create a repeated spatio-temporal resampling instance that is
    #' partitioned via CLUTO clustering.
    #' @param id `character(1)`\cr
    #'   Identifier for the resampling strategy.
    initialize = function(id = "repeated-spcv-cluto") {
      ps = ParamSet$new(params = list(
        ParamInt$new("folds", lower = 1L, default = 10L, tags = "required"),
        ParamInt$new("repeats", lower = 1, default = 1L, tags = "required")
      ))
      ps$values = list(folds = 10L, repeats = 1)
      super$initialize(
        id = id,
        param_set = ps,
        man = "mlr3spatiotempcv::mlr_resamplings_repeated_SptCVCluto"
      )
    },

    #' @description Translates iteration numbers to fold number.
    #' @param iters `integer()`\cr
    #'   Iteration number.
    #' @return `integer()` of the same length as `iters`.
    folds = function(iters) {
      iters = assert_integerish(iters, any.missing = FALSE, coerce = TRUE)
      # The fold index cycles with period `folds` (not `repeats`); this is the
      # same mapping that `.get_train()` / `.get_test()` apply internally.
      ((iters - 1L) %% as.integer(self$param_set$values$folds)) + 1L
    },

    #' @description Translates iteration numbers to repetition number.
    #' @param iters `integer()`\cr
    #'   Iteration number.
    #' @return `integer()` of the same length as `iters`.
    repeats = function(iters) {
      iters = assert_integerish(iters, any.missing = FALSE, coerce = TRUE)
      ((iters - 1L) %/% as.integer(self$param_set$values$folds)) + 1L
    },

    #' @description
    #'  Materializes fixed training and test splits for a given task.
    #' @param task [Task]\cr
    #'  A task to instantiate.
    #' @param time_var [character]\cr
    #'  The name of the variable which represents the time dimension.
    #'  The column is looked up in `task$data()` and must be coercible via
    #'  [as.POSIXct()]; it is then converted to numeric seconds and used as
    #'  the third clustering dimension next to the task coordinates.
    #' @param clmethod [character]\cr
    #'   Name of the clustering method to use within `vcluster`.
    #'   See Details for more information.
    #' @param cluto_parameters [character]\cr
    #'   Additional parameters to pass to `vcluster`.
    #'   Must be given as a single character string, e.g.
    #'   `"param1='value1'param2='value2'"`.
    #'   See the CLUTO documentation for a full list of supported parameters.
    #' @param verbose [logical]\cr
    #'   Whether to show `vcluster` progress and summary output.
    #' @return The resampling object itself, invisibly.
    instantiate = function(task, time_var, clmethod = "direct",
      cluto_parameters = NULL, verbose = TRUE) {

      # skmeans is only a suggested dependency: fail early with a clear
      # message instead of ignoring the result of requireNamespace().
      if (!requireNamespace("skmeans", quietly = TRUE)) {
        stopf("Package 'skmeans' is required for this resampling method but is not installed")
      }

      assert_task(task)
      groups = task$groups

      if (!is.null(groups)) {
        stopf("Grouping is not supported for spatial resampling methods")
      }

      checkmate::assert_string(time_var)
      time_col = task$data()[[time_var]]
      if (is.null(time_col)) {
        stopf("Column '%s' (argument 'time_var') was not found in the task data", time_var)
      }

      time = as.POSIXct(time_col)
      # time in seconds since 1/1/1970
      time_num = as.numeric(time)

      # Clustering input: the task coordinates plus time as third dimension.
      data_matrix = data.matrix(data.frame(task$coordinates(), time_num))
      colnames(data_matrix) = c("x", "y", "z")

      instance = private$.sample(
        task$row_ids, data_matrix, clmethod, cluto_parameters, verbose
      )

      self$instance = instance
      self$task_hash = task$hash
      invisible(self)
    }
  ),

  active = list(

    #' @field iters `integer(1)`\cr
    #'   Returns the number of resampling iterations, depending on the
    #'   values stored in the `param_set`.
    iters = function() {
      pv = self$param_set$values
      as.integer(pv$repeats) * as.integer(pv$folds)
    }
  ),

  private = list(
    # Runs one CLUTO clustering per repetition and returns a data.table with
    # the columns `row_id`, `rep` and `fold` (fold = cluster assignment).
    .sample = function(ids, data_matrix, clmethod, cluto_parameters, verbose) {

      vcluster_loc = check_cluto_path()

      pv = self$param_set$values
      folds = as.integer(pv$folds)

      # NOTE(review): in the second branch `cluto_parameters` is appended
      # directly after the quoted clmethod value without a separating space.
      # Confirm that `vcluster` receives the intended tokens when
      # `cluto_parameters` is supplied.
      if (is.null(cluto_parameters)) {
        control_cluto = sprintf('-clmethod="%s"', clmethod)
      } else {
        control_cluto = sprintf('-clmethod="%s""%s"', clmethod, cluto_parameters)
      }

      mlr3misc::map_dtr(seq_len(pv$repeats), function(i) {
        # The repetition index doubles as the CLUTO seed so that every
        # repetition yields a different partitioning.
        clustering = skmeans::skmeans(data_matrix,
          k = folds,
          method = "CLUTO",
          control = list(
            vcluster = vcluster_loc,
            verbose = verbose,
            control = paste(control_cluto, sprintf("-seed='%s'", i))
          )
        )
        data.table(row_id = ids, rep = i, fold = clustering$cluster)
      })
    },

    # Row ids of all folds except the test fold of iteration `i`.
    .get_train = function(i) {
      i = as.integer(i) - 1L
      folds = as.integer(self$param_set$values$folds)
      rep = i %/% folds + 1L
      fold = i %% folds + 1L
      ii = data.table(rep = rep, fold = seq_len(folds)[-fold])
      self$instance[ii, "row_id", on = names(ii), nomatch = 0L][[1L]]
    },

    # Row ids of the test fold of iteration `i`.
    .get_test = function(i) {
      i = as.integer(i) - 1L
      folds = as.integer(self$param_set$values$folds)
      rep = i %/% folds + 1L
      fold = i %% folds + 1L
      ii = data.table(rep = rep, fold = fold)
      self$instance[ii, "row_id", on = names(ii), nomatch = 0L][[1L]]
    }
  )
)
Loading

0 comments on commit 1b20d94

Please sign in to comment.