Devel into master (#336)

NorskRegnesentral · Jun 5, 2023 · 8cf35f2 · 8cf35f2
1 parent a85cd23
commit 8cf35f2
Show file tree

Hide file tree

Showing 240 changed files with 41,435 additions and 7,885 deletions.
diff --git a/.Rprofile b/.Rprofile
@@ -0,0 +1,23 @@
+#' Helper function for package development
+#'
+#' This is a manual extension of [testthat::snapshot_review()] which works for the \code{.rds} files used in
+#' this package.
+#'
+#' @param path Character.
+#' Gives the relative path to the test files to review.
+#' @param ... Additional arguments passed to [waldo::compare()].
+#' @details Prints each comparison and then calls [browser()] before moving on to the next file.
+snapshot_review_man <- function(path, ...) {
+  changed <- testthat:::snapshot_meta(path) # testthat internal; its columns name/test/cur/new are used below
+  these_rds <- (tools::file_ext(changed$name) == "rds") # flag the entries whose file extension is "rds"
+  if (any(these_rds)) {
+    for (i in which(these_rds)) {
+      old <- readRDS(changed[i, "cur"]) # object stored at the "cur" path
+      new <- readRDS(changed[i, "new"]) # object stored at the "new" path
+
+      cat(paste0("Difference for check ", changed[i, "name"], " in test ", changed[i, "test"], "\n"))
+      print(waldo::compare(old, new, max_diffs = 50, ...)) # NOTE(review): max_diffs cannot also be passed via ...
+      browser() # stop here so the printed diff can be inspected
+    }
+  }
+}
diff --git a/.github/workflows/lint-changed-files.yaml b/.github/workflows/lint-changed-files.yaml
@@ -41,4 +41,4 @@ jobs:
           lintr::lint_package(exclusions = exclusions_list)
         shell: Rscript {0}
         env:
-          LINTR_ERROR_ON_LINT: true
+          LINTR_ERROR_ON_LINT: false
diff --git a/.github/workflows/lint.yaml b/.github/workflows/lint.yaml
@@ -29,4 +29,4 @@ jobs:
         run: lintr::lint_package()
         shell: Rscript {0}
         env:
-          LINTR_ERROR_ON_LINT: true
+          LINTR_ERROR_ON_LINT: false
diff --git a/.gitignore b/.gitignore
@@ -31,3 +31,6 @@ docs/*
 doc
 Meta
 docs
+/doc/
+/Meta/
+.idea
diff --git a/.lintr b/.lintr
@@ -2,18 +2,11 @@ linters: with_defaults(
         line_length_linter = lintr::line_length_linter(120),
         object_name_linter = NULL,
         object_usage_linter = NULL,
-        seq_linter = NULL,
-        cyclocomp_linter = lintr::cyclocomp_linter()
+        commented_code_linter = NULL
     )
 exclusions: list(
-        "inst/scripts/compare_shap_python.R",
-        "inst/scripts/create_lm_model_object.R",
-        "inst/scripts/create_xgboost_model_object.R",
-        "inst/scripts/example_ctree_model.R",
-        "inst/scripts/example_custom_model.R",
-        "inst/scripts/readme_example.R",
-        "inst/scripts/shap_python_script.py",
-        "inst/scripts/devel/compare_indep_implementations.R",
+        "inst/scripts",
+        "vignettes/understanding_shapr.R",
         "R/RcppExports.R",
         "R/zzz.R"
     )
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,5 +1,5 @@
 Package: shapr
-Version: 0.2.3
+Version: 0.2.3.9000
 Title: Prediction Explanation with Dependence-Aware Shapley Values
 Description: Complex machine learning models are often hard to interpret. However, in 
   many situations it is crucial to understand and explain why a model made a specific 
@@ -12,6 +12,7 @@ Authors@R: c(
     person("Nikolai", "Sellereite", email = "[email protected]", role = "aut", comment = c(ORCID = "0000-0002-4671-0337")),
     person("Martin", "Jullum", email = "[email protected]", role = c("cre", "aut"), comment = c(ORCID = "0000-0003-3908-5155")),
     person("Annabelle", "Redelmeier", email = "[email protected]", role = "aut"),
+    person("Jon", "Lachmann", email = "[email protected]", role = "aut"),
     person("Anders", "Løland", email = "[email protected]", role = "ctb"), 
     person("Jens Christian", "Wahl", email = "[email protected]", role = "ctb"), 
     person("Camilla", "Lingjærde", role = "ctb"),
@@ -32,22 +33,29 @@ Imports:
     Rcpp (>= 0.12.15),
     condMVNorm,
     mvnfast,
-    Matrix
+    Matrix,
+    future.apply
 Suggests: 
     ranger,
     xgboost,
     mgcv,
-    testthat, 
+    testthat (>= 3.0.0),
     knitr,
     rmarkdown,
     roxygen2,
-    MASS,
     ggplot2,
-    caret,
     gbm,
     party,
-    partykit
+    partykit,
+    waldo,
+    progressr,
+    future,
+    ggbeeswarm,
+    vdiffr,
+    forecast
 LinkingTo: 
     RcppArmadillo,
     Rcpp
 VignetteBuilder: knitr
+Config/testthat/edition: 3
+Roxygen: list(markdown = TRUE)
diff --git a/NAMESPACE b/NAMESPACE
@@ -1,59 +1,71 @@
 # Generated by roxygen2: do not edit by hand
 
-S3method(explain,combined)
-S3method(explain,copula)
-S3method(explain,ctree)
-S3method(explain,ctree_comb_mincrit)
-S3method(explain,empirical)
-S3method(explain,gaussian)
-S3method(explain,independence)
-S3method(get_model_specs,default)
+S3method(get_model_specs,Arima)
+S3method(get_model_specs,ar)
+S3method(get_model_specs,forecast_ARIMA)
 S3method(get_model_specs,gam)
 S3method(get_model_specs,glm)
 S3method(get_model_specs,lm)
 S3method(get_model_specs,ranger)
 S3method(get_model_specs,xgb.Booster)
+S3method(model_checker,Arima)
+S3method(model_checker,ar)
 S3method(model_checker,default)
+S3method(model_checker,forecast_ARIMA)
 S3method(model_checker,gam)
 S3method(model_checker,glm)
 S3method(model_checker,lm)
 S3method(model_checker,ranger)
 S3method(model_checker,xgb.Booster)
 S3method(plot,shapr)
+S3method(predict_model,Arima)
+S3method(predict_model,ar)
 S3method(predict_model,default)
+S3method(predict_model,forecast_ARIMA)
 S3method(predict_model,gam)
 S3method(predict_model,glm)
 S3method(predict_model,lm)
 S3method(predict_model,ranger)
 S3method(predict_model,xgb.Booster)
+S3method(prepare_data,categorical)
 S3method(prepare_data,copula)
 S3method(prepare_data,ctree)
 S3method(prepare_data,empirical)
 S3method(prepare_data,gaussian)
 S3method(prepare_data,independence)
+S3method(prepare_data,timeseries)
 S3method(print,shapr)
+S3method(setup_approach,categorical)
+S3method(setup_approach,combined)
+S3method(setup_approach,copula)
+S3method(setup_approach,ctree)
+S3method(setup_approach,empirical)
+S3method(setup_approach,gaussian)
+S3method(setup_approach,independence)
+S3method(setup_approach,timeseries)
 export(aicc_full_single_cpp)
-export(check_features)
-export(compute_shapley)
+export(compute_shapley_new)
+export(compute_vS)
 export(correction_matrix_cpp)
-export(create_ctree)
 export(explain)
+export(explain_forecast)
 export(feature_combinations)
 export(feature_matrix_cpp)
+export(finalize_explanation)
+export(get_cov_mat)
 export(get_data_specs)
 export(get_model_specs)
+export(get_mu_vec)
+export(get_supported_approaches)
 export(hat_matrix_cpp)
 export(mahalanobis_distance_cpp)
-export(make_dummies)
-export(model_checker)
 export(observation_impute_cpp)
 export(predict_model)
-export(prepare_and_predict)
 export(prepare_data)
-export(preprocess_data)
 export(rss_cpp)
-export(shapr)
-export(update_data)
+export(setup)
+export(setup_approach)
+export(setup_computation)
 export(weight_matrix_cpp)
 importFrom(Rcpp,sourceCpp)
 importFrom(data.table,":=")
@@ -76,6 +88,8 @@ importFrom(graphics,plot)
 importFrom(graphics,rect)
 importFrom(stats,as.formula)
 importFrom(stats,contrasts)
+importFrom(stats,embed)
+importFrom(stats,formula)
 importFrom(stats,model.frame)
 importFrom(stats,model.matrix)
 importFrom(stats,predict)

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,52 @@
+# shapr (development version)
+
+* Complete restructuring motivated by introducing a Python wrapper (`shaprpy`, [#325](https://github.com/NorskRegnesentral/shapr/pull/325)) for explaining predictions from Python models (from Python) utilizing almost all functionality of `shapr` (not merged to master yet). The restructuring splits the explanation tasks into smaller pieces, allowing the Python wrapper to move back and forth between Python and R, doing the prediction in Python, and almost everything else in R. This simplifies maintenance of `shaprpy` significantly.
+* As part of the restructuring, we also did a number of design changes, resulting in a series of breaking changes described below.
+
+### Breaking changes
+
+* Moved from explaining predictions using *two* functions (`shapr()` for initial setup + `explain()` for explanation for specific observations), to a *single* function call (also named `explain()`). The data used for training and to be explained have gotten explicit names (`x_train` and `x_explain`). The order of the input arguments has also been slightly changed (`model` is now the first argument).
+* Prediction and checking functions for custom models are now passed directly as arguments to `explain()` instead of being defined as functions of a specific class in the global env.
+* The previously exported function `make_dummies` used to explain `xgboost` models with categorical data, is removed to simplify the code base. This is rather handled with a custom prediction model.
+* The function `explain.ctree_comb_mincrit`, which allowed combining models with `approach=ctree` with different `mincrit` parameters, has been removed to simplify the code base. It may return in a completely general manner in a later version of `shapr`.
+
+### New features
+
+* Introduce batch computation of conditional expectations ([#244](https://github.com/NorskRegnesentral/shapr/issues/244)). 
+This essentially computes $v(S)$ for a portion of the $S$-subsets at a time, to reduce the amount of data needed to be held in memory. 
+The user can control the number of batches herself, but we set a reasonable value by default ([#327](https://github.com/NorskRegnesentral/shapr/pull/327)). 
+This allows models with a large number of features to be explained with a significantly lower RAM consumption (at the cost of a slight increase in the computation time).
+* Parallelization over batches ([#38](https://github.com/NorskRegnesentral/shapr/issues/38)) using the [future](https://future.futureverse.org/) framework.
+* Progress bar ([#293](https://github.com/NorskRegnesentral/shapr/pull/293)) using the [`progressr`](https://progressr.futureverse.org/) package. Must be activated by the user with `progressr::handlers(global = TRUE)` or by wrapping the call to `explain()` in `progressr::with_progress({})`.
+* Added `approach = 'categorical'` ([#256](https://github.com/NorskRegnesentral/shapr/issues/256), [#307](https://github.com/NorskRegnesentral/shapr/pull/307)) used to explain models with solely categorical features by directly using/estimating the joint distribution of all feature combinations.
+* Added `approach = 'timeseries'` ([#314](https://github.com/NorskRegnesentral/shapr/pull/314)) for explaining classifications based on time series data/models with the method described in Sec 4.3 of the [groupShapley paper](https://martinjullum.com/publication/jullum-2021-efficient/jullum-2021-efficient.pdf).
+* Implemented unique sampling of Shapley value subsets ([#227](https://github.com/NorskRegnesentral/shapr/issues/227))
+* Added new function `explain_forecast` to explain forecasts from time series models, at various prediction horizons ([#328](https://github.com/NorskRegnesentral/shapr/pull/328)). 
+Uses a different set of input arguments which are more appropriate for these models. 
+* Re-implementation of `approach = 'independence'` method providing significantly faster computation (no longer as a special case of the `empirical` method). 
+Also allows the method to be used on models with categorical data ([#315](https://github.com/NorskRegnesentral/shapr/pull/315)).
+* Added 'beeswarm' and 'waterfall' plots + new coloring scheme for all plots. See the [vignette](https://norskregnesentral.github.io/shapr/articles/understanding_shapr.html#ex) for examples.
+
+### Under the hood
+
+* The test base has been completely rewritten ([#249](https://github.com/NorskRegnesentral/shapr/issues/249)). 
+Now heavily utilizing [snapshots](https://testthat.r-lib.org/articles/snapshotting.html) on a large set of benchmark calls to `explain`, also using [vdiffr](https://vdiffr.r-lib.org/) for plot tests. 
+Test functions are only written for exported core functions. Internal functions are only tested through the exported ones. 
+* Update GitHub actions ([#335](https://github.com/NorskRegnesentral/shapr/pull/335)).
+
+
+### Minor improvements and bug fixes
+
+* The vignette/readme/tests now use the `datasets::airquality` dataset. 
+This avoids including a new package just for the dataset ([#248](https://github.com/NorskRegnesentral/shapr/issues/248)).
+* Allows lm/glm/gam models with interactions ([#303](https://github.com/NorskRegnesentral/shapr/pull/303)). 
+Previously, this was not possible with the prediction functions defined internally due to a bug.
+* Sampling of subsets is now also implemented for feature groups, not only for individual features.
+
+### Documentation improvements
+
+* The [vignette](https://norskregnesentral.github.io/shapr/articles/understanding_shapr.html) has been updated to reflect the new framework for explaining predictions, and all the new package features/functionality.
+
 # shapr 0.2.3 (GitHub only)
 
 * Development version
@@ -30,7 +79,7 @@ only a single function *predict_model*.
   passed to *shapr* and *explain*. The features in the data are checked for consistency with what can be extracted
   from the model object. If the model object is missing some of the necessary information, the info from the data
   is used instead. The system checks feature labels, classes, and any factor levels.
-* Due to the previous point, the *feature_labels* option previously used for custom models is removed.
+* Due to the previous point, the *feature_names* option previously used for custom models is removed.
 * Added a manual testing script for custom model (currently cannot be handled by testthat due to environment issues).
 * A few under-the-hood changes for checking in the *shapr* function.
 

diff --git a/R/approach.R b/R/approach.R
@@ -0,0 +1,74 @@
+#' Set up the framework for the chosen approach
+#'
+#' The different choices of `approach` take different (optional) parameters, which are forwarded from [explain()].
+#'
+#' @param ... `approach`-specific arguments. See below.
+#'
+#' @inheritParams default_doc_explain
+#'
+#' @export
+setup_approach <- function(internal, ...) {
+  approach <- internal$parameters$approach
+
+  this_class <- "" # placeholder object; only its class matters, as the S3 dispatch target below
+
+  if (length(approach) > 1) {
+    class(this_class) <- "combined" # several approaches given: dispatch to setup_approach.combined
+  } else {
+    class(this_class) <- approach # single approach: dispatch to the method named after it
+  }
+
+  UseMethod("setup_approach", this_class)
+}
+
+#' @inheritParams default_doc
+#' @export
+setup_approach.combined <- function(internal, ...) {
+  org_approach <- internal$parameters$approach # keep the full approach vector so it can be restored afterwards
+  unique_approaches <- unique(org_approach)
+
+  for (i in unique_approaches) { # run the setup once per distinct approach
+    internal$parameters$approach <- i # length-one approach, so the call below dispatches to that approach's method
+    internal <- setup_approach(internal, ...)
+  }
+  internal$parameters$approach <- org_approach # restore the original approach vector
+
+  return(internal)
+}
+
+#' Generate data used for predictions
+#'
+#' @param x Explainer object. See [explain()] for more information.
+#'
+#' @param seed Positive integer. If `NULL` the seed will be inherited from the calling environment.
+#'
+#' @param index_features Positive integer vector. Specifies the indices of combinations to apply to the present method.
+#' `NULL` means all combinations. Only used internally.
+#'
+#' @param ... Currently not used.
+#'
+#' @return A data.table containing simulated data passed to prediction().
+#'
+#' @export
+#' @keywords internal
+prepare_data <- function(internal, ...) { # NOTE(review): the roxygen block documents `x`, not `internal` -- confirm
+  this_class <- "" # placeholder object; only its class matters, as the S3 dispatch target
+  class(this_class) <- internal$parameters$approach # dispatch on the approach stored in the parameters
+  UseMethod("prepare_data", this_class)
+}
+
+#' @keywords internal
+insert_defaults <- function(internal, defaults) { # adds the entries of `defaults` missing from internal$parameters
+  par_names <- names(defaults)
+
+  overwrite_names <- par_names[!(par_names %in% names(internal$parameters))] # only the names not yet set
+
+  internal$parameters <- append(internal$parameters, defaults[overwrite_names]) # existing values are kept
+
+  return(internal)
+}
+
+#' @keywords internal
+get_factor_approaches <- function() { # NOTE(review): presumably approaches supporting factor features -- confirm
+  c("'independence' (not recommended)", "'ctree'", "'categorical'") # display strings, not bare approach names
+}
-Original file line number
+Diff line change
@@ Expand Up / @@ -31,3 +31,6 @@ docs/* @@
     doc
     Meta
     docs
+    /doc/
+    /Meta/
+    .idea