Skip to content

Commit

Permalink
Devel into master (#336)
Browse files Browse the repository at this point in the history
  • Loading branch information
martinju authored Jun 5, 2023
1 parent a85cd23 commit 8cf35f2
Show file tree
Hide file tree
Showing 240 changed files with 41,435 additions and 7,885 deletions.
23 changes: 23 additions & 0 deletions .Rprofile
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
#' Manually review changed `.rds` snapshots during package development
#'
#' A manual counterpart to [testthat::snapshot_review()] for the \code{.rds} snapshot files used in
#' this package. For every changed `.rds` snapshot, the current and the new version are read, their
#' differences are printed with [waldo::compare()], and [browser()] is entered so that each difference
#' can be inspected before moving on to the next one.
#'
#' @param path Character.
#' Gives the relative path to the test files to review
#' @param ... Additional arguments passed to [waldo::compare()]
#'
snapshot_review_man <- function(path, ...) {
  changed <- testthat:::snapshot_meta(path)
  rds_rows <- which(tools::file_ext(changed$name) == "rds")

  # Looping over an empty index vector is a no-op, so no separate emptiness check is needed
  for (row in rds_rows) {
    old <- readRDS(changed[row, "cur"])
    new <- readRDS(changed[row, "new"])

    cat("Difference for check ", changed[row, "name"], " in test ", changed[row, "test"], "\n", sep = "")
    print(waldo::compare(old, new, max_diffs = 50, ...))

    # Deliberate pause: lets the developer inspect `old` and `new` interactively
    browser()
  }
}
2 changes: 1 addition & 1 deletion .github/workflows/lint-changed-files.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -41,4 +41,4 @@ jobs:
lintr::lint_package(exclusions = exclusions_list)
shell: Rscript {0}
env:
LINTR_ERROR_ON_LINT: true
LINTR_ERROR_ON_LINT: false
2 changes: 1 addition & 1 deletion .github/workflows/lint.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -29,4 +29,4 @@ jobs:
run: lintr::lint_package()
shell: Rscript {0}
env:
LINTR_ERROR_ON_LINT: true
LINTR_ERROR_ON_LINT: false
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -31,3 +31,6 @@ docs/*
doc
Meta
docs
/doc/
/Meta/
.idea
13 changes: 3 additions & 10 deletions .lintr
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,11 @@ linters: with_defaults(
line_length_linter = lintr::line_length_linter(120),
object_name_linter = NULL,
object_usage_linter = NULL,
seq_linter = NULL,
cyclocomp_linter = lintr::cyclocomp_linter()
commented_code_linter = NULL
)
exclusions: list(
"inst/scripts/compare_shap_python.R",
"inst/scripts/create_lm_model_object.R",
"inst/scripts/create_xgboost_model_object.R",
"inst/scripts/example_ctree_model.R",
"inst/scripts/example_custom_model.R",
"inst/scripts/readme_example.R",
"inst/scripts/shap_python_script.py",
"inst/scripts/devel/compare_indep_implementations.R",
"inst/scripts",
"vignettes/understanding_shapr.R",
"R/RcppExports.R",
"R/zzz.R"
)
20 changes: 14 additions & 6 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
Package: shapr
Version: 0.2.3
Version: 0.2.3.9000
Title: Prediction Explanation with Dependence-Aware Shapley Values
Description: Complex machine learning models are often hard to interpret. However, in
many situations it is crucial to understand and explain why a model made a specific
Expand All @@ -12,6 +12,7 @@ Authors@R: c(
person("Nikolai", "Sellereite", email = "[email protected]", role = "aut", comment = c(ORCID = "0000-0002-4671-0337")),
person("Martin", "Jullum", email = "[email protected]", role = c("cre", "aut"), comment = c(ORCID = "0000-0003-3908-5155")),
person("Annabelle", "Redelmeier", email = "[email protected]", role = "aut"),
person("Jon", "Lachmann", email = "[email protected]", role = "aut"),
person("Anders", "Løland", email = "[email protected]", role = "ctb"),
person("Jens Christian", "Wahl", email = "[email protected]", role = "ctb"),
person("Camilla", "Lingjærde", role = "ctb"),
Expand All @@ -32,22 +33,29 @@ Imports:
Rcpp (>= 0.12.15),
condMVNorm,
mvnfast,
Matrix
Matrix,
future.apply
Suggests:
ranger,
xgboost,
mgcv,
testthat,
testthat (>= 3.0.0),
knitr,
rmarkdown,
roxygen2,
MASS,
ggplot2,
caret,
gbm,
party,
partykit
partykit,
waldo,
progressr,
future,
ggbeeswarm,
vdiffr,
forecast
LinkingTo:
RcppArmadillo,
Rcpp
VignetteBuilder: knitr
Config/testthat/edition: 3
Roxygen: list(markdown = TRUE)
48 changes: 31 additions & 17 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,59 +1,71 @@
# Generated by roxygen2: do not edit by hand

S3method(explain,combined)
S3method(explain,copula)
S3method(explain,ctree)
S3method(explain,ctree_comb_mincrit)
S3method(explain,empirical)
S3method(explain,gaussian)
S3method(explain,independence)
S3method(get_model_specs,default)
S3method(get_model_specs,Arima)
S3method(get_model_specs,ar)
S3method(get_model_specs,forecast_ARIMA)
S3method(get_model_specs,gam)
S3method(get_model_specs,glm)
S3method(get_model_specs,lm)
S3method(get_model_specs,ranger)
S3method(get_model_specs,xgb.Booster)
S3method(model_checker,Arima)
S3method(model_checker,ar)
S3method(model_checker,default)
S3method(model_checker,forecast_ARIMA)
S3method(model_checker,gam)
S3method(model_checker,glm)
S3method(model_checker,lm)
S3method(model_checker,ranger)
S3method(model_checker,xgb.Booster)
S3method(plot,shapr)
S3method(predict_model,Arima)
S3method(predict_model,ar)
S3method(predict_model,default)
S3method(predict_model,forecast_ARIMA)
S3method(predict_model,gam)
S3method(predict_model,glm)
S3method(predict_model,lm)
S3method(predict_model,ranger)
S3method(predict_model,xgb.Booster)
S3method(prepare_data,categorical)
S3method(prepare_data,copula)
S3method(prepare_data,ctree)
S3method(prepare_data,empirical)
S3method(prepare_data,gaussian)
S3method(prepare_data,independence)
S3method(prepare_data,timeseries)
S3method(print,shapr)
S3method(setup_approach,categorical)
S3method(setup_approach,combined)
S3method(setup_approach,copula)
S3method(setup_approach,ctree)
S3method(setup_approach,empirical)
S3method(setup_approach,gaussian)
S3method(setup_approach,independence)
S3method(setup_approach,timeseries)
export(aicc_full_single_cpp)
export(check_features)
export(compute_shapley)
export(compute_shapley_new)
export(compute_vS)
export(correction_matrix_cpp)
export(create_ctree)
export(explain)
export(explain_forecast)
export(feature_combinations)
export(feature_matrix_cpp)
export(finalize_explanation)
export(get_cov_mat)
export(get_data_specs)
export(get_model_specs)
export(get_mu_vec)
export(get_supported_approaches)
export(hat_matrix_cpp)
export(mahalanobis_distance_cpp)
export(make_dummies)
export(model_checker)
export(observation_impute_cpp)
export(predict_model)
export(prepare_and_predict)
export(prepare_data)
export(preprocess_data)
export(rss_cpp)
export(shapr)
export(update_data)
export(setup)
export(setup_approach)
export(setup_computation)
export(weight_matrix_cpp)
importFrom(Rcpp,sourceCpp)
importFrom(data.table,":=")
Expand All @@ -76,6 +88,8 @@ importFrom(graphics,plot)
importFrom(graphics,rect)
importFrom(stats,as.formula)
importFrom(stats,contrasts)
importFrom(stats,embed)
importFrom(stats,formula)
importFrom(stats,model.frame)
importFrom(stats,model.matrix)
importFrom(stats,predict)
Expand Down
51 changes: 50 additions & 1 deletion NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,52 @@
# shapr (development version)

* Complete restructuring motivated by introducing a Python wrapper (`shaprpy`, [#325](https://github.com/NorskRegnesentral/shapr/pull/325)) for explaining predictions from Python models (from Python) utilizing almost all functionality of `shapr` (not merged to master yet). The restructuring splits the explanation tasks into smaller pieces, allowing the Python wrapper to move back and forth between Python and R, doing the prediction in Python, and almost everything else in R. This simplifies maintenance of `shaprpy` significantly.
* As part of the restructuring, we also did a number of design changes, resulting in a series of breaking changes described below.

### Breaking changes

* Moved from explaining predictions using *two* functions (`shapr()` for initial setup + `explain()` for explanation for specific observations), to a *single* function call (also named `explain()`). The data used for training and to be explained have gotten explicit names (`x_train` and `x_explain`). The order of the input arguments has also been slightly changed (`model` is now the first argument).
* Prediction and checking functions for custom models are now passed directly as arguments to `explain()` instead of being defined as functions of a specific class in the global env.
* The previously exported function `make_dummies` used to explain `xgboost` models with categorical data, is removed to simplify the code base. This is rather handled with a custom prediction model.
* The function `explain.ctree_comb_mincrit`, which allowed combining models with `approach=ctree` with different `mincrit` parameters, has been removed to simplify the code base. It may return in a completely general manner in a later version of `shapr`.

### New features

* Introduce batch computation of conditional expectations ([#244](https://github.com/NorskRegnesentral/shapr/issues/244)).
This essentially computes $v(S)$ for a portion of the $S$-subsets at a time, to reduce the amount of data needed to be held in memory.
The user can control the number of batches herself, but we set a reasonable value by default ([#327](https://github.com/NorskRegnesentral/shapr/pull/327)).
This allows models with a large number of features to be explained with a significantly lower RAM consumption (at the cost of a slight increase in the computation time).
* Parallelization over batches ([#38](https://github.com/NorskRegnesentral/shapr/issues/38)) using the [future](https://future.futureverse.org/) framework.
* Progress bar ([#293](https://github.com/NorskRegnesentral/shapr/pull/293)) using the [`progressr`](https://progressr.futureverse.org/) package. Must be activated by the user with `progressr::handlers(global = TRUE)` or by wrapping the call to `explain()` in `progressr::with_progress({})`.
* Added `approach = 'categorical'` ([#256](https://github.com/NorskRegnesentral/shapr/issues/256), [#307](https://github.com/NorskRegnesentral/shapr/pull/307)) used to explain models with solely categorical features by directly using/estimating the joint distribution of all feature combinations.
* Added `approach='timeseries'` ([#314](https://github.com/NorskRegnesentral/shapr/pull/314)) for explaining classifications based on time series data/models with the method described in Sec 4.3 of the [groupShapley paper](https://martinjullum.com/publication/jullum-2021-efficient/jullum-2021-efficient.pdf).
* Implemented unique sampling of Shapley value subsets ([#227](https://github.com/NorskRegnesentral/shapr/issues/227))
* Added new function `explain_forecast` to explain forecasts from time series models, at various prediction horizons ([#328](https://github.com/NorskRegnesentral/shapr/pull/328)).
Uses a different set of input arguments, which is more appropriate for these models.
* Re-implementation of `approach = 'independence'` method providing significantly faster computation (no longer as a special case of the `empirical` method).
Also allows the method to be used on models with categorical data ([#315](https://github.com/NorskRegnesentral/shapr/pull/315)).
* Added 'beeswarm' and 'waterfall' plots + new coloring scheme for all plots. See the [vignette](https://norskregnesentral.github.io/shapr/articles/understanding_shapr.html#ex) for examples.

### Under the hood

* The test base has been completely rewritten ([#249](https://github.com/NorskRegnesentral/shapr/issues/249)).
Now heavily utilizing [snapshots](https://testthat.r-lib.org/articles/snapshotting.html) on a large set of benchmark calls to `explain`, also using [vdiffr](https://vdiffr.r-lib.org/) for plot tests.
Test functions are only written for exported core functions. Internal functions are only tested through the exported ones.
* Update GitHub actions ([#335](https://github.com/NorskRegnesentral/shapr/pull/335)).


### Minor improvements and bug fixes

* The vignette/readme/tests now use the `datasets::airquality` dataset.
This avoids including a new package just for the dataset ([#248](https://github.com/NorskRegnesentral/shapr/issues/248)).
* Allows lm/glm/gam models with interactions ([#303](https://github.com/NorskRegnesentral/shapr/pull/303)).
Previously, this was not possible with the prediction functions defined internally due to a bug.
* Sampling of subsets is now also implemented for feature groups, not only for individual features.

### Documentation improvements

* The [vignette](https://norskregnesentral.github.io/shapr/articles/understanding_shapr.html) has been updated to reflect the new framework for explaining predictions, and all the new package features/functionality.

# shapr 0.2.3 (GitHub only)

* Development version
Expand Down Expand Up @@ -30,7 +79,7 @@ only a single function *predict_model*.
passed to *shapr* and *explain*. The features in the data are checked for consistency with what can be extracted
from the model object. If the model object is missing some of the necessary information, the info from the data
is used instead. The system checks feature labels, classes, and any factor levels.
* Due to the previous point, the *feature_labels* option previously used for custom models is removed.
* Due to the previous point, the *feature_names* option previously used for custom models is removed.
* Added a manual testing script for custom model (currently cannot be handled by testthat due to environment issues).
* A few under-the-hood changes for checking in the *shapr* function.

Expand Down
74 changes: 74 additions & 0 deletions R/approach.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
#' Set up the framework for the chosen approach
#'
#' The different choices of `approach` take different (optional) parameters, which are forwarded from [explain()].
#'
#' This generic dispatches on `internal$parameters$approach`: a single approach name selects the method of
#' that name, while more than one approach name selects the `combined` method.
#'
#' @param ... `approach`-specific arguments. See below.
#'
#' @inheritParams default_doc_explain
#'
#' @export
setup_approach <- function(internal, ...) {
  approach <- internal$parameters$approach

  # A throwaway empty string carries the class that S3 dispatch is performed on
  this_class <- ""
  class(this_class) <- if (length(approach) > 1) "combined" else approach

  UseMethod("setup_approach", this_class)
}

#' @inheritParams default_doc
#' @export
setup_approach.combined <- function(internal, ...) {
  # Remember the full approach vector; it is temporarily replaced by one approach at a time below
  original_approach <- internal$parameters$approach

  # Run the single-approach setup once per distinct approach, threading `internal` through each call
  for (single_approach in unique(original_approach)) {
    internal$parameters$approach <- single_approach
    internal <- setup_approach(internal, ...)
  }

  # Restore the original (possibly repeated) approach vector before handing `internal` back
  internal$parameters$approach <- original_approach
  internal
}

#' Generate data used for predictions
#'
#' S3 generic which dispatches on the approach stored in `internal$parameters$approach`.
#'
#' @param x Explainer object. See [explain()] for more information.
#'
#' @param seed Positive integer. If `NULL` the seed will be inherited from the calling environment.
#'
#' @param index_features Positive integer vector. Specifies the indices of combinations to apply to the present method.
#' `NULL` means all combinations. Only used internally.
#'
#' @param ... Currently not used.
#'
#' @return A data.table containing simulated data passed to prediction().
#'
#' @export
#' @keywords internal
prepare_data <- function(internal, ...) {
  # An empty string tagged with the approach name serves purely as the object to dispatch on
  this_class <- `class<-`("", internal$parameters$approach)
  UseMethod("prepare_data", this_class)
}

#' @keywords internal
insert_defaults <- function(internal, defaults) {
  # Only defaults whose names are absent from `internal$parameters` are added;
  # parameters that are already present are left untouched
  default_names <- names(defaults)
  not_yet_set <- is.na(match(default_names, names(internal$parameters)))

  internal$parameters <- c(internal$parameters, defaults[default_names[not_yet_set]])
  internal
}

#' @keywords internal
get_factor_approaches <- function() {
  # Pre-quoted approach labels, returned as a character vector
  factor_approaches <- c(
    "'independence' (not recommended)",
    "'ctree'",
    "'categorical'"
  )
  factor_approaches
}
Loading

0 comments on commit 8cf35f2

Please sign in to comment.