diff --git a/.Rbuildignore b/.Rbuildignore index 4aff343ea..45476f458 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -25,7 +25,10 @@ inst/compare_lundberg\.xgb\.obj ^rebuild-long-running-vignette\.R$ ^vignettes/understanding_shapr_vaeac\.Rmd\.orig$ ^vignettes/understanding_shapr\.Rmd\.orig$ +^vignettes/understanding_shapr_regression\.Rmd\.orig$ ^vignettes/figure_main/*$ ^vignettes/cache_main/*$ ^vignettes/figure_vaeac/*$ ^vignettes/cache_vaeac/*$ +^vignettes/figure_regression/*$ +^vignettes/cache_regression/*$ diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index e9f228593..2b496dba9 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -35,6 +35,7 @@ jobs: fail-fast: false matrix: config: +# Temporarily disable all but the ubuntu release to reduce compute while debugging - {os: macOS-latest, r: 'release'} - {os: windows-latest, r: 'release'} - {os: ubuntu-20.04, r: 'devel', http-user-agent: 'release'} diff --git a/.gitignore b/.gitignore index 619cbb459..3cfe58bf8 100644 --- a/.gitignore +++ b/.gitignore @@ -24,3 +24,7 @@ docs /Meta/ .idea .DS_Store + +vignettes/cache_main/ +vignettes/cache_vaeac/ +vignettes/cache_regression/ diff --git a/DESCRIPTION b/DESCRIPTION index 3e5797f53..a823f1e19 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -57,7 +57,16 @@ Suggests: torch, GGally, progress, - coro + coro, + parsnip, + recipes, + workflows, + tune, + dials, + yardstick, + hardhat, + rsample, + rlang LinkingTo: RcppArmadillo, Rcpp diff --git a/NAMESPACE b/NAMESPACE index 66e963b8b..1fa9bc343 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -7,6 +7,7 @@ S3method(get_model_specs,gam) S3method(get_model_specs,glm) S3method(get_model_specs,lm) S3method(get_model_specs,ranger) +S3method(get_model_specs,workflow) S3method(get_model_specs,xgb.Booster) S3method(model_checker,Arima) S3method(model_checker,ar) @@ -16,6 +17,7 @@ S3method(model_checker,gam) S3method(model_checker,glm) S3method(model_checker,lm) S3method(model_checker,ranger) +S3method(model_checker,workflow) S3method(model_checker,xgb.Booster) S3method(plot,shapr) S3method(predict_model,Arima) @@ -26,6 +28,7 @@ S3method(predict_model,gam) S3method(predict_model,glm) S3method(predict_model,lm) S3method(predict_model,ranger) +S3method(predict_model,workflow) S3method(predict_model,xgb.Booster) S3method(prepare_data,categorical) S3method(prepare_data,copula) @@ -33,6 +36,8 @@ S3method(prepare_data,ctree) S3method(prepare_data,empirical) S3method(prepare_data,gaussian) S3method(prepare_data,independence) +S3method(prepare_data,regression_separate) +S3method(prepare_data,regression_surrogate) S3method(prepare_data,timeseries) S3method(prepare_data,vaeac) S3method(print,shapr) @@ -43,6 +48,8 @@ S3method(setup_approach,ctree) S3method(setup_approach,empirical) S3method(setup_approach,gaussian) S3method(setup_approach,independence) +S3method(setup_approach,regression_separate) +S3method(setup_approach,regression_surrogate) S3method(setup_approach,timeseries) S3method(setup_approach,vaeac) export(aicc_full_single_cpp) @@ -68,6 +75,7 @@ export(predict_model) export(prepare_data) export(prepare_data_copula_cpp) export(prepare_data_gaussian_cpp) +export(regression.train_model) export(rss_cpp) export(setup) export(setup_approach) diff --git a/R/approach_regression_separate.R b/R/approach_regression_separate.R new file mode 100644 index 000000000..7104db548 --- /dev/null +++ b/R/approach_regression_separate.R @@ -0,0 +1,516 @@ +# Shapr functions 
====================================================================================================== +#' @rdname setup_approach +#' +#' @param regression.model A `tidymodels` object of class `model_spec`. Default is a linear regression model, i.e., +#' [parsnip::linear_reg()]. See \href{https://www.tidymodels.org/find/parsnip/}{tidymodels} for all possible models, +#' and see the vignette for how to add new/own models. Note, to make it easier to call `explain()` from Python, the +#' `regression.model` parameter can also be a string specifying the model which will be parsed and evaluated. For +#' example, `"parsnip::rand_forest(mtry = hardhat::tune(), trees = 100, engine = 'ranger', mode = 'regression')"` +#' is also a valid input. It is essential to include the package prefix if the package is not loaded. +#' @param regression.tune_values Either `NULL` (default), a data.frame/data.table/tibble, or a function. +#' The data.frame must contain the possible hyperparameter value combinations to try. +#' The column names must match the names of the tunable parameters specified in `regression.model`. +#' If `regression.tune_values` is a function, then it should take one argument `x` which is the training data +#' for the current combination/coalition and return a data.frame/data.table/tibble with the properties described above. +#' Using a function allows the hyperparameter values to change based on the size of the combination. See the regression +#' vignette for several examples. +#' Note, to make it easier to call `explain()` from Python, the `regression.tune_values` can also be a string +#' containing an R function. For example, +#' `"function(x) return(dials::grid_regular(dials::mtry(c(1, ncol(x))), levels = 3))"` is also a valid input. +#' It is essential to include the package prefix if the package is not loaded. +#' @param regression.vfold_cv_para Either `NULL` (default) or a named list containing +#' the parameters to be sent to [rsample::vfold_cv()]. See the regression vignette for +#' several examples. +#' @param regression.recipe_func Either `NULL` (default) or a function that takes in a [recipes::recipe()] +#' object and returns a modified [recipes::recipe()] with potentially additional recipe steps. See the regression +#' vignette for several examples. +#' Note, to make it easier to call `explain()` from Python, the `regression.recipe_func` can also be a string +#' containing an R function. For example, +#' `"function(recipe) return(recipes::step_ns(recipe, recipes::all_numeric_predictors(), deg_free = 2))"` is also +#' a valid input. It is essential to include the package prefix if the package is not loaded. +#' @inheritParams default_doc_explain +#' +#' @export +#' @author Lars Henry Berge Olsen +setup_approach.regression_separate <- function(internal, + regression.model = parsnip::linear_reg(), + regression.tune_values = NULL, + regression.vfold_cv_para = NULL, + regression.recipe_func = NULL, + ...) 
{ + # Check that required libraries are installed + regression.check_namespaces() + + # Small printout to the user + if (internal$parameters$verbose == 2) message("Starting 'setup_approach.regression_separate'.") + if (internal$parameters$verbose == 2) regression.separate_time_mess() # TODO: maybe remove + + # Add the default parameter values for the non-user specified parameters for the separate regression approach + defaults <- + mget(c("regression.model", "regression.tune_values", "regression.vfold_cv_para", "regression.recipe_func")) + internal <- insert_defaults(internal, defaults) + + # Check the parameters to the regression approach + internal <- regression.check_parameters(internal = internal) + + # Small printout to the user + if (internal$parameters$verbose == 2) message("Done with 'setup_approach.regression_separate'.") + + return(internal) # Return the updated internal list +} + +#' @inheritParams default_doc +#' @rdname prepare_data +#' @export +#' @author Lars Henry Berge Olsen +prepare_data.regression_separate <- function(internal, index_features = NULL, ...) { + # Load `workflows`, needed when parallelized as we call predict with a workflow object. Its installation was checked above. + requireNamespace("workflows", quietly = TRUE) + + # Get the features in the batch + features <- internal$objects$X$features[index_features] + + # Small printout to the user about which batch is currently being worked on + if (internal$parameters$verbose == 2) regression.prep_message_batch(internal, index_features) + + # Initialize empty data table with specific column names and id_combination (transformed to integer later). The data + # table will contain the contribution function values for the coalitions given by `index_features` and all explicands. + dt_res_column_names <- c("id_combination", paste0("p_hat1_", seq_len(internal$parameters$n_explain))) + dt_res <- data.table(matrix(ncol = length(dt_res_column_names), nrow = 0, dimnames = list(NULL, dt_res_column_names))) + + # Iterate over the coalitions provided by index_features. + # Note that index_features will never be NULL and never contain the empty or grand coalitions. + for (comb_idx in seq_along(features)) { + # Get the column indices of the features in the current coalition/combination + current_comb <- features[[comb_idx]] + + # Extract the current training (and add y_hat as response) and explain data + current_x_train <- internal$data$x_train[, ..current_comb][, "y_hat" := internal$data$x_train_y_hat] + current_x_explain <- internal$data$x_explain[, ..current_comb] + + # Fit the current separate regression model to the current training data + if (internal$parameters$verbose == 2) regression.prep_message_comb(internal, index_features, comb_idx) + regression.current_fit <- regression.train_model( + x = current_x_train, + seed = internal$parameters$seed, + verbose = internal$parameters$verbose, + regression.model = internal$parameters$regression.model, + regression.tune = internal$parameters$regression.tune, + regression.tune_values = internal$parameters$regression.tune_values, + regression.vfold_cv_para = internal$parameters$regression.vfold_cv_para, + regression.recipe_func = internal$parameters$regression.recipe_func + ) + + # Compute the predicted response for the explicands, i.e., the v(S, x_i) for all explicands x_i. 
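+ # For a fitted tidymodels workflow in regression mode, predict() returns a tibble whose .pred column holds the numeric predictions, hence the $.pred below.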
+ pred_explicand <- predict(regression.current_fit, new_data = current_x_explain)$.pred + + # Add the new contribution function values for the current coalitions S to the result data table as a new row + dt_res <- rbind(dt_res, data.table(index_features[comb_idx], matrix(pred_explicand, nrow = 1)), use.names = FALSE) + } + + # Set id_combination to be the key + dt_res[, id_combination := as.integer(id_combination)] + data.table::setkey(dt_res, id_combination) + + # Return the estimated contribution function values + return(dt_res) +} + +# Train functions ====================================================================================================== +#' Train a tidymodels model via workflows +#' +#' Function that trains a `tidymodels` model via `workflows` based on the provided input parameters. +#' This function allows for cross-validating the hyperparameters of the model. +#' +#' @inheritParams setup_approach.regression_separate +#' @inheritParams explain +#' @param x Data.table containing the data. Either the training data or the explicands. If `x` is the explicands, +#' then `index_features` must be provided. +#' @param regression.tune Logical (default is `FALSE`). If `TRUE`, then we are to tune the hyperparameters based on +#' the values provided in `regression.tune_values`. Note that no checks are conducted as this is checked earlier in +#' `setup_approach.regression_separate` and `setup_approach.regression_surrogate`. +#' @param regression.response_var String (default is `y_hat`) containing the name of the response variable. +#' @param regression.surrogate_n_comb Integer (default is `NULL`). The number of times each training observation +#' has been augmented. If `NULL`, then we assume that we are doing separate regression. +#' +#' @return A trained `tidymodels` model based on the provided input parameters. +#' @export +#' @author Lars Henry Berge Olsen +#' @keywords internal +regression.train_model <- function(x, + seed = 1, + verbose = 0, + regression.model = parsnip::linear_reg(), + regression.tune = FALSE, + regression.tune_values = NULL, + regression.vfold_cv_para = NULL, + regression.recipe_func = NULL, + regression.response_var = "y_hat", + regression.surrogate_n_comb = NULL) { + # Create a recipe for the training data + regression.recipe <- recipes::recipe(as.formula(paste(regression.response_var, "~ .")), data = x) + + # Update the recipe if the user has provided a function for this. The user is responsible for ensuring that the function works. + # This function can, e.g., add transformations, normalization, dummy encoding, interactions, and so on. + if (!is.null(regression.recipe_func)) regression.recipe <- regression.recipe_func(regression.recipe) + + # Combine workflow, model specification, and recipe + regression.workflow <- + workflows::add_recipe(workflows::add_model(workflows::workflow(), regression.model), regression.recipe) + + # Check if we are to tune hyperparameters in the regression model, as we then need to update the workflow. + # If we are not doing any hyperparameter tuning, then the workflow above is enough. + if (regression.tune) { + # Set up the V-fold cross validation using the user provided parameters in `regression.vfold_cv_para`. + # Note that if `regression.vfold_cv_para` is NULL, then we use the default parameters of `vfold_cv()`. 
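+ # For example, regression.vfold_cv_para = list(v = 5) would give 5-fold cross validation.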
+ regression.folds <- do.call(rsample::vfold_cv, c(list(data = x), regression.vfold_cv_para)) + + # Check if we are doing surrogate regression, as we then need to update the fold indices: the augmented + # training data is highly correlated due to the augmentations, which would mess up the cross validation, + # since it assumes that the training and evaluation data are independent. The following code ensures + # that all augmentations of a single training observation are either in the training or the evaluation data. + if (!is.null(regression.surrogate_n_comb)) { + if (!is.null(regression.vfold_cv_para) && any(names(regression.vfold_cv_para) != "v")) { + stop("The `regression.vfold_cv_para` parameter supports only the `v` parameter for surrogate regression.") + } + + n <- nrow(x) / regression.surrogate_n_comb # Get the number of training observations (before augmentation) + n_folds <- nrow(regression.folds) # Get the number of folds + folds <- sample(rep(seq_len(n_folds), length.out = n)) # Sample in which fold the i'th obs is in the eval data + indices <- lapply(split(seq_len(n), folds), function(x) setdiff(seq_len(n), x)) # Sample the training indices + + # Loop over the folds, extend the indices to reflect the augmentation, and insert the updated training indices + for (fold_idx in seq(n_folds)) { + regression.folds$splits[[fold_idx]]$in_id <- + unlist(lapply( + indices[[fold_idx]], + function(idx) seq(regression.surrogate_n_comb * (idx - 1) + 1, regression.surrogate_n_comb * idx) + )) + } + } + + # Extract the grid of hyperparameter values. Note that regression.tune_values is either a data.frame or a function. + if (is.data.frame(regression.tune_values)) { + regression.grid <- regression.tune_values + } else { + regression.grid <- regression.tune_values(x[, -..regression.response_var]) + } + + # Add the hyperparameter tuning to the workflow + regression.results <- tune::tune_grid( + object = regression.workflow, + resamples = regression.folds, + grid = regression.grid, + metrics = yardstick::metric_set(yardstick::rmse) + ) + + # Small printout to the user + if (verbose == 2) regression.cv_message(regression.results = regression.results, regression.grid = regression.grid) + + # Set seed for reproducibility. Without this, we get different results depending on whether we run in parallel or sequentially + set.seed(seed) + + # Update the workflow by finalizing it using the hyperparameters that attained the best rmse + regression.workflow <- + tune::finalize_workflow(regression.workflow, tune::select_best(regression.results, metric = "rmse")) + } + + # Fit the model to the training data and return the trained model + return(parsnip::fit(regression.workflow, data = x)) +} + + +# Get functions ======================================================================================================== +#' Convert the string into an R object +#' +#' @param string A character vector/string containing the text to convert into R code. +#' +#' @author Lars Henry Berge Olsen +#' @keywords internal +regression.get_string_to_R <- function(string) { + return(eval(parse(text = string))) +} + +#' Get the predicted responses +#' +#' @inheritParams default_doc +#' +#' @return The same `internal` list, but with the added vectors `internal$data$x_train_y_hat` and +#' `internal$data$x_explain_y_hat` containing the predicted response of the training and explain data. 
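+#' These predicted responses are the values the regression models are fitted to, i.e., the regression approaches model the output of `model` rather than the original response variable.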
+#' +#' @author Lars Henry Berge Olsen +#' @keywords internal +regression.get_y_hat <- function(internal, model, predict_model) { + # Predict the response of the training and explain data. The former is the response the regression models are fitted to. + internal$data$x_train_y_hat <- predict_model(model, internal$data$x_train) + internal$data$x_explain_y_hat <- predict_model(model, internal$data$x_explain) + return(internal) +} + +#' Determine if the model is to be tuned +#' +#' That is, if the regression model contains hyperparameters we are to tune using cross validation. +#' See \href{https://www.tidymodels.org/find/parsnip/#model-args}{tidymodels} for default model hyperparameters. +#' +#' @inheritParams setup_approach.regression_separate +#' @inheritParams explain +#' +#' @return A boolean variable indicating if the regression model is to be tuned. +#' +#' @author Lars Henry Berge Olsen +#' @keywords internal +regression.get_tune <- function(regression.model, regression.tune_values, x_train) { + # Check that the regression model is a tidymodels object + if (is.null(regression.model) || !"model_spec" %in% class(regression.model)) { + stop("`regression.model` must be a tidymodels object with class 'model_spec'. See documentation.") + } + + # Check if we are to tune some model hyperparameters + regression.para <- lapply(regression.model$args, function(para) rlang::quo_get_expr(para)) + regression.para_tune <- lapply(regression.para, function(para) !is.null(para) && grepl("tune()", para)) + regression.para_tune_names <- names(regression.para_tune)[unlist(regression.para_tune)] + regression.tune <- any(unlist(regression.para_tune)) + + # Check that the user has provided tuning values + if (isTRUE(regression.tune) && is.null(regression.tune_values)) { + stop("`regression.tune_values` must be provided when `regression.model` contains hyperparameters to tune.") + } + + # Check that `regression.tune_values` is either a data.frame or a function + if (!is.null(regression.tune_values) && + !is.data.frame(regression.tune_values) && + !is.function(regression.tune_values)) { + stop("`regression.tune_values` must be of either class `data.frame` or `function`. See documentation.") + } + + # Get the grid values. If the user provided a function, then check that its output is a data.frame. + regression.tune_values_grid <- regression.tune_values + if (is.function(regression.tune_values)) { + regression.tune_values_grid <- regression.tune_values(x_train) + if (!is.data.frame(regression.tune_values_grid)) { + stop("The output of the user provided `regression.tune_values` function must be of class `data.frame`.") + } + } + + # Get the names of the hyperparameters the user provided values for + regression.tune_values_names <- names(regression.tune_values_grid) + + # Check that the user has provided values for the hyperparameters to tune + if (!(all(regression.tune_values_names %in% regression.para_tune_names) && + all(regression.para_tune_names %in% regression.tune_values_names))) { + stop(paste0( + "The tunable parameters in `regression.model` ('", + paste(regression.para_tune_names, collapse = "', '"), "') and `regression.tune_values` ('", + paste(regression.tune_values_names, collapse = "', '"), "') must match." 
+ )) + } + + # Return if we are to tune some model hyperparameters + return(regression.tune) +} + +# Check functions ====================================================================================================== +#' Check regression parameters +#' +#' @inheritParams default_doc +#' +#' @return The same `internal` list, but with the added logical indicator `internal$parameters$regression.tune` +#' specifying if we are to tune the regression model/models. +#' +#' @author Lars Henry Berge Olsen +#' @keywords internal +regression.check_parameters <- function(internal) { + # Convert the objects to R-objects if they are strings + if (is.character(internal$parameters$regression.model)) { + internal$parameters$regression.model <- regression.get_string_to_R(internal$parameters$regression.model) + } + if (is.character(internal$parameters$regression.tune_values)) { + internal$parameters$regression.tune_values <- regression.get_string_to_R(internal$parameters$regression.tune_values) + } + if (is.character(internal$parameters$regression.recipe_func)) { + internal$parameters$regression.recipe_func <- regression.get_string_to_R(internal$parameters$regression.recipe_func) + } + + # Check that `regression.recipe_func` is either NULL or a function that returns an object of class `recipe` + regression.check_recipe_func( + regression.recipe_func = internal$parameters$regression.recipe_func, + x_explain = internal$data$x_explain + ) + + # Check that `regression.vfold_cv_para` is either NULL or a named list that only contains recognized parameters + regression.check_vfold_cv_para(regression.vfold_cv_para = internal$parameters$regression.vfold_cv_para) + + # Check that `regression.surrogate_n_comb` is a valid value (only applicable for surrogate regression) + regression.check_sur_n_comb( + regression.surrogate_n_comb = internal$parameters$regression.surrogate_n_comb, + used_n_combinations = internal$parameters$used_n_combinations + ) + + # Check and get if we are to tune the hyperparameters of the regression model + internal$parameters$regression.tune <- regression.get_tune( + regression.model = internal$parameters$regression.model, + regression.tune_values = internal$parameters$regression.tune_values, + x_train = internal$data$x_train + ) + + return(internal) +} + +#' Check `regression.recipe_func` +#' +#' Check that `regression.recipe_func` is either `NULL` or a function that +#' returns an object of class `recipe`. +#' +#' @inheritParams explain +#' @inheritParams setup_approach.regression_separate +#' +#' @author Lars Henry Berge Olsen +#' @keywords internal +regression.check_recipe_func <- function(regression.recipe_func, x_explain) { + if (!is.null(regression.recipe_func) && !is.function(regression.recipe_func)) { + stop("`regression.recipe_func` must be a function. See documentation.") + } + + if (!is.null(regression.recipe_func) && is.function(regression.recipe_func)) { + x_temp <- copy(x_explain)[, "y_hat_temp" := 1] + regression.recipe_func_output <- regression.recipe_func(recipes::recipe(y_hat_temp ~ ., data = x_temp)) + if (!"recipe" %in% class(regression.recipe_func_output)) { + stop("The output of the `regression.recipe_func` must be of class `recipe`.") + } + } +} + +#' Check the parameters that are sent to [rsample::vfold_cv()] +#' +#' Check that `regression.vfold_cv_para` is either NULL or a named list that only contains recognized parameters. 
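+#' For example, `list(v = 5)` is a valid input, while a list with names that are not arguments of [rsample::vfold_cv()] results in an error.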
+#' +#' @inheritParams setup_approach.regression_separate +#' +#' @author Lars Henry Berge Olsen +#' @keywords internal +regression.check_vfold_cv_para <- function(regression.vfold_cv_para) { + if (!is.null(regression.vfold_cv_para)) { + # Check that regression.vfold_cv_para is a named list + if (!is.list(regression.vfold_cv_para) || is.null(names(regression.vfold_cv_para))) { + stop("`regression.vfold_cv_para` must be a named list. See documentation using '?shapr::explain()'.") + } + + # Check that all entries are parameters in the rsample::vfold_cv() function + unknown_para_names <- + names(regression.vfold_cv_para)[!names(regression.vfold_cv_para) %in% methods::formalArgs(rsample::vfold_cv)[-1]] + if (length(unknown_para_names) > 0) { + stop(paste0( + "The following parameters in `regression.vfold_cv_para` are not supported by `rsample::vfold_cv()`: '", + paste0(unknown_para_names, collapse = "', '"), "'." + )) + } + + # Ensure that we have at least two folds in the cross validation procedure + if ("v" %in% names(regression.vfold_cv_para) && regression.vfold_cv_para[["v"]] <= 1) { + stop("The parameter `v` in `regression.vfold_cv_para` must be strictly larger than 1.") + } + } +} + +#' Check that needed libraries are installed +#' +#' This function checks that the `parsnip`, `recipes`, `workflows`, `tune`, `dials`, +#' `yardstick`, `hardhat`, `rsample`, and `rlang` packages are available. +#' +#' @author Lars Henry Berge Olsen +#' @keywords internal +regression.check_namespaces <- function() { + namespaces <- c("parsnip", "recipes", "workflows", "tune", "dials", "yardstick", "hardhat", "rsample", "rlang") + for (namespace in namespaces) { + if (!requireNamespace(namespace, quietly = TRUE)) { + stop(paste0( + "`", namespace, "` is not installed. Please run `install.packages('", namespace, "')` to install ", + "it or run `install.packages('tidymodels')` to install all relevant packages." + )) + } + } +} + +# Message functions ==================================================================================================== +#' Produce time message for separate regression +#' @author Lars Henry Berge Olsen +#' @keywords internal +regression.separate_time_mess <- function() { + message(paste( + "When using `approach = 'regression_separate'` the `explanation$timing$timing_secs` object \n", + "can be misleading as `setup_computation` does not contain the training times of the \n", + "regression models as they are trained on the fly in `compute_vS`. This is to reduce memory \n", + "usage and to improve efficiency.\n" + )) # TODO: should we add the time somewhere else? +} + +#' Produce message about which batch prepare_data is working on +#' @inheritParams default_doc +#' @inheritParams default_doc_explain +#' @author Lars Henry Berge Olsen +#' @keywords internal +regression.prep_message_batch <- function(internal, index_features) { + message(paste0( + "Working on batch ", internal$objects$X[id_combination == index_features[1]]$batch, " of ", + internal$parameters$n_batches, " in `prepare_data.", internal$parameters$approach, "()`." + )) +} + +#' Produce message about which combination prepare_data is working on +#' @inheritParams default_doc +#' @inheritParams default_doc_explain +#' @param comb_idx Integer. The index of the combination in a specific batch. 
+#' @author Lars Henry Berge Olsen +#' @keywords internal +regression.prep_message_comb <- function(internal, index_features, comb_idx) { + message(paste0( + "Working on combination with id ", internal$objects$X$id_combination[index_features[comb_idx]], + " of ", internal$parameters$used_n_combinations, "." + )) +} + +#' Produce message with the results of the cross validation +#' +#' @param regression.results The results of the CV procedures. +#' @param regression.grid Object containing the hyperparameter values. +#' @param n_cv Integer (default is 10) specifying the number of CV hyperparameter configurations to print. +#' +#' @author Lars Henry Berge Olsen +#' @keywords internal +regression.cv_message <- function(regression.results, regression.grid, n_cv = 10) { + # Get the feature names and add evaluation metric rmse + feature_names <- names(regression.grid) + feature_names_rmse <- c(feature_names, "rmse", "rmse_std_err") + + # Let n_cv be the minimum of the provided value and the number of possible printouts + n_cv <- min(n_cv, nrow(regression.grid)) + + # Extract the n_cv best results + best_results <- tune::show_best(regression.results, n = n_cv, metric = "rmse") + + # Needed to make the printout tables prettier by ensuring the same column dimensions for all settings. + regression.grid_best <- best_results[, feature_names] + regression.grid_best$rmse <- round(best_results$mean, 2) + regression.grid_best$rmse_std <- round(best_results$std_err, 2) + width <- sapply(regression.grid_best, function(x) max(nchar(as.character(unique(x))))) + + # Message title of the results + message(paste0("Results of the ", best_results$n[1], "-fold cross validation (top ", n_cv, " best configurations):")) + + # Iterate over the n_cv best results and print out the hyperparameter values and the rmse and rmse_std_err + for (row_idx in seq_len(nrow(best_results))) { + best_result <- best_results[row_idx, ] + feature_values <- best_result[feature_names] + feature_values_rmse <- c( + feature_values, + format(round(best_result$mean, 2), nsmall = 2), format(round(best_result$std_err, 2), nsmall = 2) + ) + values_fixed_len <- sapply( + seq_along(feature_values_rmse), + function(x) format(as.character(feature_values_rmse[x]), width = width[x], justify = "left") + ) + message(paste0("#", row_idx, ": ", paste(paste(feature_names_rmse, "=", values_fixed_len), collapse = " "), "")) + } + + message("") # Empty message to get a blank line +} diff --git a/R/approach_regression_surrogate.R b/R/approach_regression_surrogate.R new file mode 100644 index 000000000..a61890694 --- /dev/null +++ b/R/approach_regression_surrogate.R @@ -0,0 +1,245 @@ +# Shapr functions ====================================================================================================== +#' @rdname setup_approach +#' +#' @inheritParams default_doc_explain +#' @inheritParams setup_approach.regression_separate +#' @param regression.surrogate_n_comb Integer (default is `internal$parameters$used_n_combinations` - 2) specifying the +#' number of unique combinations/coalitions to apply to each training observation. Maximum allowed value is +#' "`internal$parameters$used_n_combinations` - 2". By default, we use all coalitions, but this can take a lot of memory +#' in larger dimensions. Note that by "all", we mean all coalitions chosen by `shapr` to be used. This will be all +#' \eqn{2^{n_{\text{features}}}} coalitions (minus empty and grand coalition) if `shapr` is in the exact mode. 
If the +#' user sets a lower value than `internal$parameters$used_n_combinations`, then we sample this amount of unique +#' coalitions separately for each training observation. That is, on average, all coalitions will be equally represented in the training data. +#' +#' @export +#' @author Lars Henry Berge Olsen +setup_approach.regression_surrogate <- function(internal, + regression.model = parsnip::linear_reg(), + regression.tune_values = NULL, + regression.vfold_cv_para = NULL, + regression.recipe_func = NULL, + regression.surrogate_n_comb = + internal$parameters$used_n_combinations - 2, + ...) { + # Check that required libraries are installed + regression.check_namespaces() + + # Small printout to the user + if (internal$parameters$verbose == 2) message("Starting 'setup_approach.regression_surrogate'.") + + # Add the default parameter values for the non-user specified parameters for the surrogate regression approach + defaults <- mget(c( + "regression.model", "regression.tune_values", "regression.vfold_cv_para", + "regression.recipe_func", "regression.surrogate_n_comb" + )) + internal <- insert_defaults(internal, defaults) + + # Check the parameters to the regression approach + internal <- regression.check_parameters(internal) + + # Augment the training data + x_train_augmented <- regression.surrogate_aug_data( + internal = internal, x = internal$data$x_train, y_hat = internal$data$x_train_y_hat, augment_include_grand = TRUE + ) + + # Fit the surrogate regression model and store it in the internal list + if (internal$parameters$verbose == 2) message("Start training the surrogate model.") + internal$objects$regression.surrogate_model <- regression.train_model( + x = x_train_augmented, + seed = internal$parameters$seed, + verbose = internal$parameters$verbose, + regression.model = internal$parameters$regression.model, + regression.tune = internal$parameters$regression.tune, + regression.tune_values = internal$parameters$regression.tune_values, + regression.vfold_cv_para = internal$parameters$regression.vfold_cv_para, + regression.recipe_func = internal$parameters$regression.recipe_func, + regression.surrogate_n_comb = regression.surrogate_n_comb + 1 # Add 1 as augment_include_grand = TRUE above + ) + + # Small printout to the user + if (internal$parameters$verbose == 2) message("Done with 'setup_approach.regression_surrogate'.") + + return(internal) # Return the updated internal list +} + +#' @inheritParams default_doc +#' @rdname prepare_data +#' @export +#' @author Lars Henry Berge Olsen +prepare_data.regression_surrogate <- function(internal, index_features = NULL, ...) { + # Load `workflows`, needed when parallelized as we call predict with a workflow object. Its installation was checked above. 
+ requireNamespace("workflows", quietly = TRUE) + + # Small printout to the user about which batch that are currently worked on + if (internal$parameters$verbose == 2) regression.prep_message_batch(internal, index_features) + + # Augment the explicand data + x_explain_aug <- regression.surrogate_aug_data(internal, x = internal$data$x_explain, index_features = index_features) + + # Compute the predicted response for the explicands, i.e., v(S, x_i) for all explicands x_i and S in index_features + pred_explicand <- predict(internal$objects$regression.surrogate_model, new_data = x_explain_aug)$.pred + + # Insert the predicted contribution functions values into a data table of the correct setup + dt_res <- data.table(as.integer(index_features), matrix(pred_explicand, nrow = length(index_features))) + data.table::setnames(dt_res, c("id_combination", paste0("p_hat1_", seq_len(internal$parameters$n_explain)))) + data.table::setkey(dt_res, id_combination) # Set id_combination to be the key + + return(dt_res) +} + +# Augment function ===================================================================================================== +#' Augment the training data and the explicands +#' +#' @inheritParams default_doc +#' @inheritParams regression.train_model +#' @param y_hat Vector of numerics (optional) containing the predicted responses for the observations in `x`. +#' @param index_features Array of integers (optional) containing which coalitions to consider. Must be provided if +#' `x` is the explicands. +#' @param augment_add_id_comb Logical (default is `FALSE`). If `TRUE`, an additional column is adding containing +#' which coalition was applied. +#' @param augment_include_grand Logical (default is `FALSE`). If `TRUE`, then the grand coalition is included. +#' If `index_features` are provided, then `augment_include_grand` has no effect. Note that if we sample the +#' combinations then the grand coalition is equally likely to be samples as the other coalitions (or weighted if +#' `augment_comb_prob` is provided). +#' @param augment_masks_as_factor Logical (default is `FALSE`). If `TRUE`, then the binary masks are converted +#' to factors. If `FALSE`, then the binary masks are numerics. +#' @param augment_comb_prob Array of numerics (default is `NULL`). The length of the array must match the number of +#' combinations being considered, where each entry specifies the probability of sampling the corresponding coalition. +#' This is useful if we want to generate more training data for some specific coalitions. One possible choice would be +#' `augment_comb_prob = if (use_Shapley_weights) internal$objects$X$shapley_weight[2:actual_n_combinations] else NULL`. +#' @param augment_weights String (optional). Specifying which type of weights to add to the observations. +#' If `NULL` (default), then no weights are added. If `"Shapley"`, then the Shapley weights for the different +#' combinations are added to corresponding observations where the coalitions was applied. If `uniform`, then +#' all observations get an equal weight of one. +#' +#' @return A data.table containing the augmented data. 
+#' @author Lars Henry Berge Olsen +#' @keywords internal +regression.surrogate_aug_data <- function(internal, + x, + y_hat = NULL, + index_features = NULL, + augment_masks_as_factor = FALSE, + augment_include_grand = FALSE, + augment_add_id_comb = FALSE, + augment_comb_prob = NULL, + augment_weights = NULL) { + # Get some of the parameters + S <- internal$objects$S + actual_n_combinations <- internal$parameters$used_n_combinations - 2 # Remove empty and grand coalitions + regression.surrogate_n_comb <- internal$parameters$regression.surrogate_n_comb + if (!is.null(index_features)) regression.surrogate_n_comb <- length(index_features) # Applicable from prep_data() + if (augment_include_grand) { + actual_n_combinations <- actual_n_combinations + 1 # Add 1 to include the grand comb + regression.surrogate_n_comb <- regression.surrogate_n_comb + 1 + } + if (regression.surrogate_n_comb > actual_n_combinations) regression.surrogate_n_comb <- actual_n_combinations + + # Small checks + if (!is.null(augment_weights)) augment_weights <- match.arg(augment_weights, c("Shapley", "uniform")) + + if (!is.null(augment_comb_prob) && length(augment_comb_prob) != actual_n_combinations) { + stop(paste("`augment_comb_prob` must be of length", actual_n_combinations, ".")) + } + + if (!is.null(augment_weights) && augment_include_grand && augment_weights == "Shapley") { + stop(paste( + "`augment_include_grand = TRUE` and `augment_weights = 'Shapley'` cannot be combined", + "because this entails a too large weight for the grand coalition." + )) + } + + # Get the number of observations (either the same as n_train or n_explain) + n_obs <- nrow(x) + + # Get the names of the categorical/factor features and the continuous/non-categorical/numeric features. + feature_classes <- internal$objects$feature_specs$classes + feature_cat <- names(feature_classes)[feature_classes == "factor"] + feature_cont <- names(feature_classes)[feature_classes != "factor"] + + # Get the indices of the order of the cat and cont features + feature_cat_idx <- which(names(feature_classes) %in% feature_cat) + feature_cont_idx <- which(names(feature_classes) %in% feature_cont) + + # Check if we are to augment the training data or the explicands + if (is.null(index_features)) { + # Training: get matrix (n_obs x regression.surrogate_n_comb) containing the indices of the active coalitions + if (regression.surrogate_n_comb >= actual_n_combinations) { # Start from two to exclude the empty set + comb_active_idx <- matrix(rep(seq(2, actual_n_combinations + 1), times = n_obs), ncol = n_obs) + } else { + comb_active_idx <- sapply(seq(n_obs), function(x) { # Add 1 as we want to exclude the empty set + sample.int(n = actual_n_combinations, size = regression.surrogate_n_comb, prob = augment_comb_prob) + 1 + }) + } + } else { + # Explicands: get matrix of dimension n_obs x #index_features containing the indices of the active coalitions + comb_active_idx <- matrix(rep(index_features, times = n_obs), ncol = n_obs) + } + + # Extract the active coalitions for each observation. The number of rows is n_obs * regression.surrogate_n_comb, + # where the first regression.surrogate_n_comb rows are connected to the first observation and so on. Set the column names. + id_comb <- as.vector(comb_active_idx) + comb_active <- S[id_comb, , drop = FALSE] + colnames(comb_active) <- names(feature_classes) + + # Repeat the feature values as many times as there are active coalitions + x_augmented <- x[rep(seq_len(n_obs), each = regression.surrogate_n_comb), ] + + # Mask the categorical features. 
Add a new level called "level_masked" when the value is masked. + x_augmented[, (feature_cat) := lapply(seq_along(.SD), function(col) { + levels(.SD[[col]]) <- c(levels(.SD[[col]]), "level_masked") + .SD[[col]][comb_active[, feature_cat_idx[col]] == 0] <- "level_masked" + return(.SD[[col]]) + }), .SDcols = feature_cat] + + # Mask the continuous/non-categorical features + x_augmented[, (feature_cont) := + lapply(seq_along(.SD), function(col) .SD[[col]] * comb_active[, feature_cont_idx[col]]), + .SDcols = feature_cont + ] + + # Add new columns indicating when the continuous features are masked + if (length(feature_cont) > 0) { + masked_columns <- paste0("mask_", feature_cont) + x_augmented <- cbind(x_augmented, setNames(data.table(1 * (comb_active[, feature_cont_idx] == 0)), masked_columns)) + } + + # Convert the binary masks to factors if the user has specified so + if (augment_masks_as_factor) x_augmented[, (masked_columns) := lapply(.SD, as.factor), .SDcols = masked_columns] + + # Add either uniform weights or Shapley kernel weights + if (!is.null(augment_weights)) { + x_augmented[, "weight" := if (augment_weights == "Shapley") internal$objects$X$shapley_weight[id_comb] else 1] + } + + # Add the id_comb as a factor + if (augment_add_id_comb) x_augmented[, "id_comb" := factor(id_comb)] + + # Add repeated responses if provided + if (!is.null(y_hat)) x_augmented[, "y_hat" := rep(y_hat, each = regression.surrogate_n_comb)] + + # Return the augmented data + return(x_augmented) +} + + +# Check function ======================================================================================================= +#' Check the `regression.surrogate_n_comb` parameter +#' +#' Check that `regression.surrogate_n_comb` is either NULL or a valid integer. +#' +#' @inheritParams setup_approach.regression_surrogate +#' @param used_n_combinations Integer. The number of used combinations (including the empty and grand coalitions). +#' +#' @author Lars Henry Berge Olsen +#' @keywords internal +regression.check_sur_n_comb <- function(regression.surrogate_n_comb, used_n_combinations) { + if (!is.null(regression.surrogate_n_comb)) { + if (regression.surrogate_n_comb < 1 || used_n_combinations - 2 < regression.surrogate_n_comb) { + stop(paste0( + "`regression.surrogate_n_comb` (", regression.surrogate_n_comb, ") must be a positive integer less than or ", + "equal to `used_n_combinations` minus two (", used_n_combinations - 2, ")." 
+ )) + } + } +} diff --git a/R/approach_vaeac.R b/R/approach_vaeac.R index b7225914e..9468ee96e 100644 --- a/R/approach_vaeac.R +++ b/R/approach_vaeac.R @@ -1355,7 +1355,7 @@ vaeac_check_mask_gen <- function(mask_gen_coalitions, mask_gen_coalitions_prob, } } -#' Function the checks the verbose parameter +#' Function that checks the verbose parameter #' #' @inheritParams vaeac_train_model #' diff --git a/R/compute_vS.R b/R/compute_vS.R index 2ca5ed1ca..1c6deb190 100644 --- a/R/compute_vS.R +++ b/R/compute_vS.R @@ -11,18 +11,12 @@ compute_vS <- function(internal, model, predict_model, method = "future") { S_batch <- internal$objects$S_batch if (method == "future") { - ret <- future_compute_vS_batch( - S_batch = S_batch, - internal = internal, - model = model, - predict_model = predict_model - ) + ret <- future_compute_vS_batch(S_batch = S_batch, internal = internal, model = model, predict_model = predict_model) } else { # Doing the same as above without future, progress bar, or parallelization ret <- list() for (i in seq_along(S_batch)) { S <- S_batch[[i]] - ret[[i]] <- batch_compute_vS( S = S, internal = internal, @@ -41,7 +35,6 @@ future_compute_vS_batch <- function(S_batch, internal, model, predict_model) { } else { p <- NULL } - ret <- future.apply::future_lapply( X = S_batch, FUN = batch_compute_vS, @@ -54,21 +47,67 @@ future_compute_vS_batch <- function(S_batch, internal, model, predict_model) { return(ret) } - #' @keywords internal +#' @author Martin Jullum, Lars Henry Berge Olsen batch_compute_vS <- function(S, internal, model, predict_model, p = NULL) { - keep_samp_for_vS <- internal$parameters$keep_samp_for_vS + regression <- internal$parameters$regression + + # Check if we are to use regression or Monte Carlo integration to compute the contribution function values + if (regression) { + dt_vS <- batch_prepare_vS_regression(S = S, internal = internal) + } else { + # Here dt_vS is either only dt_vS or a list containing dt_vS and dt if internal$parameters$keep_samp_for_vS = TRUE + dt_vS <- batch_prepare_vS_MC(S = S, internal = internal, model = model, predict_model = predict_model) + } + + # Update the progress bar if provided + # TODO: Add a message to state what batch has been computed + if (!is.null(p)) p(amount = length(S), message = "Estimating v(S)") + + return(dt_vS) +} + +#' @keywords internal +#' @author Lars Henry Berge Olsen +batch_prepare_vS_regression <- function(S, internal) { + max_id_comb <- internal$parameters$n_combinations + x_explain_y_hat <- internal$data$x_explain_y_hat + + # Compute the contribution function values differently based on whether the grand coalition is in S or not + if (!(max_id_comb %in% S)) { + dt <- prepare_data(internal, index_features = S) + } else { + # Remove the grand coalition. NULL is for the special case where the batch only includes the grand coalition. 
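+ # (For the grand coalition, v(S, x_i) equals the model's own prediction of x_i, which is already stored in x_explain_y_hat, so no regression model needs to be fitted for it.)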
+ dt <- if (length(S) > 1) prepare_data(internal, index_features = S[S != max_id_comb]) else NULL + + # Add the results for the grand coalition (Need to add names in case the batch only contains the grand coalition) + dt <- rbind(dt, data.table(as.integer(max_id_comb), matrix(x_explain_y_hat, nrow = 1)), use.names = FALSE) + + # Need to add column names if batch S only contains the grand coalition + if (length(S) == 1) setnames(dt, c("id_combination", paste0("p_hat1_", seq_len(internal$parameters$n_explain)))) + } + + # Set id_combination to be the key + setkey(dt, id_combination) + + return(dt) +} + +#' @keywords internal +#' @author Martin Jullum, Lars Henry Berge Olsen +batch_prepare_vS_MC <- function(S, internal, model, predict_model) { + output_size <- internal$parameters$output_size feature_names <- internal$parameters$feature_names type <- internal$parameters$type horizon <- internal$parameters$horizon n_endo <- internal$data$n_endo - output_size <- internal$parameters$output_size explain_idx <- internal$parameters$explain_idx explain_lags <- internal$parameters$explain_lags y <- internal$data$y xreg <- internal$data$xreg + keep_samp_for_vS <- internal$parameters$keep_samp_for_vS - dt <- batch_prepare_vS(S = S, internal = internal) # Make it optional to store and return the dt_list + dt <- batch_prepare_vS_MC_auxiliary(S = S, internal = internal) # Make it optional to store and return the dt_list pred_cols <- paste0("p_hat", seq_len(output_size)) @@ -87,22 +126,13 @@ batch_compute_vS <- function(S, internal, model, predict_model, p = NULL) { xreg = xreg ) dt_vS <- compute_MCint(dt, pred_cols) - if (!is.null(p)) { - p( - amount = length(S), - message = "Estimating v(S)" - ) # TODO: Add a message to state what batch has been computed - } - if (keep_samp_for_vS) { - return(list(dt_vS = dt_vS, dt_samp_for_vS = dt)) - } else { - return(dt_vS = dt_vS) - } + # Also return the dt object if keep_samp_for_vS is TRUE + return(if (keep_samp_for_vS) list(dt_vS = dt_vS, dt_samp_for_vS = dt) else dt_vS) } #' @keywords internal -batch_prepare_vS <- function(S, internal) { +batch_prepare_vS_MC_auxiliary <- function(S, internal) { max_id_combination <- internal$parameters$n_combinations x_explain <- internal$data$x_explain n_explain <- internal$parameters$n_explain @@ -171,5 +201,5 @@ compute_MCint <- function(dt, pred_cols = "p_hat") { } # dt_mat[, id_combination := NULL] - dt_mat + return(dt_mat) } diff --git a/R/documentation.R b/R/documentation.R index 79df05266..608284687 100644 --- a/R/documentation.R +++ b/R/documentation.R @@ -39,3 +39,28 @@ default_doc <- function() { default_doc_explain <- function() { NULL } + + +#' Documentation of the approach-specific parameters in [shapr::explain()] +#' +#' @description +#' This helper function displays the specific arguments applicable to the different +#' approaches. Note that when calling [shapr::explain()] from Python, the parameters +#' are renamed from the form `approach.parameter_name` to `approach_parameter_name`. +#' That is, an underscore has replaced the dot as the dot is reserved in Python. 
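+#' For example, the R argument `regression.model` corresponds to `regression_model` in Python.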
+#' +#' @inheritDotParams setup_approach.independence -internal +#' @inheritDotParams setup_approach.empirical -internal -predict_model -model +#' @inheritDotParams setup_approach.categorical -internal +#' @inheritDotParams setup_approach.copula -internal +#' @inheritDotParams setup_approach.ctree -internal +#' @inheritDotParams setup_approach.gaussian -internal +#' @inheritDotParams setup_approach.regression_separate -internal +#' @inheritDotParams setup_approach.regression_surrogate -internal +#' @inheritDotParams setup_approach.timeseries -internal +#' @inheritDotParams setup_approach.vaeac -internal +#' +#' @author Lars Henry Berge Olsen and Martin Jullum +explain_tripledot_docs <- function(...) { + NULL +} diff --git a/R/explain.R b/R/explain.R index 008786ea4..3e1e10c97 100644 --- a/R/explain.R +++ b/R/explain.R @@ -18,8 +18,8 @@ #' #' @param approach Character vector of length `1` or one less than the number of features. #' All elements should, either be `"gaussian"`, `"copula"`, `"empirical"`, `"ctree"`, `"vaeac"`, -#' `"categorical"`, `"timeseries"`, or `"independence"`. -#' See details for more information. +#' `"categorical"`, `"timeseries"`, `"independence"`, `"regression_separate"`, or `"regression_surrogate"`. +#' The two regression approaches cannot be combined with any other approach. See details for more information. #' #' @param prediction_zero Numeric. #' The prediction value for unseen data, i.e. an estimate of the expected prediction without conditioning on any @@ -104,12 +104,16 @@ #' @inheritDotParams setup_approach.ctree #' @inheritDotParams setup_approach.vaeac #' @inheritDotParams setup_approach.categorical +#' @inheritDotParams setup_approach.regression_separate +#' @inheritDotParams setup_approach.regression_surrogate #' @inheritDotParams setup_approach.timeseries #' -#' @details The most important thing to notice is that `shapr` has implemented six different -#' approaches for estimating the conditional distributions of the data, namely `"empirical"`, +#' @details The most important thing to notice is that `shapr` has implemented eight different +#' Monte Carlo-based approaches for estimating the conditional distributions of the data, namely `"empirical"`, #' `"gaussian"`, `"copula"`, `"ctree"`, `"vaeac"`, `"categorical"`, `"timeseries"`, and `"independence"`. -#' In addition, the user also has the option of combining the different approaches. +#' `shapr` has also implemented two regression-based approaches, `"regression_separate"` and `"regression_surrogate"`; +#' see the separate vignette on the regression-based approaches for more information. +#' In addition, the user also has the option of combining the different Monte Carlo-based approaches. #' E.g., if you're in a situation where you have trained a model that consists of 10 features, #' and you'd like to use the `"gaussian"` approach when you condition on a single feature, #' the `"empirical"` approach if you condition on 2-5 features, and `"copula"` version @@ -251,9 +255,33 @@ #' ) #' print(explain_groups$shapley_values) #' +#' # Separate and surrogate regression approaches with linear regression models. +#' # More complex regression models can be used, and we can use CV to +#' # tune the hyperparameters of the regression models and preprocess +#' # the data before sending it to the model. See the regression vignette +#' # (Shapley value explanations using the regression paradigm) for more +#' # details about the `regression_separate` and `regression_surrogate` approaches. 
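+#' # In short, the separate approach fits one regression model per coalition, while the +#' # surrogate approach fits a single regression model to an augmented version of the training data.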
+#' explain_separate_lm <- explain( +#' model = model, +#' x_explain = x_explain, +#' x_train = x_train, +#' prediction_zero = p, +#' approach = "regression_separate", +#' regression.model = parsnip::linear_reg() +#' ) +#' +#' explain_surrogate_lm <- explain( +#' model = model, +#' x_explain = x_explain, +#' x_train = x_train, +#' prediction_zero = p, +#' approach = "regression_surrogate", +#' regression.model = parsnip::linear_reg() +#' ) +#' #' @export #' -#' @author Martin Jullum +#' @author Martin Jullum, Lars Henry Berge Olsen #' #' @references #' Aas, K., Jullum, M., & Løland, A. (2021). Explaining individual predictions when features are dependent: @@ -276,9 +304,7 @@ explain <- function(model, verbose = 0, ...) { # ... is further arguments passed to specific approaches - timing_list <- list( - init_time = Sys.time() - ) + timing_list <- list(init_time = Sys.time()) set.seed(seed) @@ -309,10 +335,7 @@ explain <- function(model, timing_list$setup <- Sys.time() # Gets predict_model (if not passed to explain) - predict_model <- get_predict_model( - predict_model = predict_model, - model = model - ) + predict_model <- get_predict_model(predict_model = predict_model, model = model) # Checks that predict_model gives correct format test_predict_model( @@ -325,6 +348,12 @@ explain <- function(model, timing_list$test_prediction <- Sys.time() + # Add the predicted response of the training and explain data to the internal list for regression-based methods. + # Use isTRUE as `regression` is not present (NULL) for non-regression methods (i.e., Monte Carlo-based methods). + if (isTRUE(internal$parameters$regression)) { + internal <- regression.get_y_hat(internal = internal, model = model, predict_model = predict_model) + } + # Sets up the Shapley (sampling) framework and prepares the # conditional expectation computation for the chosen approach # Note: model and predict_model are ONLY used by the AICc-methods of approach empirical to find optimal parameters timing_list$setup_computation <- Sys.time() - # Compute the v(S): - # Get the samples for the conditional distributions with the specified approach - # Predict with these samples - # Perform MC integration on these to estimate the conditional expectation (v(S)) + # MC: + # 1. Get the samples for the conditional distributions with the specified approach + # 2. Predict with these samples + # 3. Perform MC integration on these to estimate the conditional expectation (v(S)) + # Regression: + # 1. 
Directly estimate the conditional expectation (v(S)) using the fitted regression model(s) vS_list <- compute_vS(internal, model, predict_model) timing_list$compute_vS <- Sys.time() - # Compute Shapley values based on conditional expectations (v(S)) # Organize function output - output <- finalize_explanation( - vS_list = vS_list, - internal = internal - ) + output <- finalize_explanation(vS_list = vS_list, internal = internal) timing_list$shapley_computation <- Sys.time() - if (timing == TRUE) { - output$timing <- compute_time(timing_list) - } + # Compute the elapsed time for the different steps + if (timing == TRUE) output$timing <- compute_time(timing_list) # Temporary to avoid failing tests output <- remove_outputs_to_pass_tests(output) @@ -376,5 +402,8 @@ remove_outputs_to_pass_tests <- function(output) { NULL } + # Remove the `regression` parameter from the output list when we are not doing regression + if (isFALSE(output$internal$parameters$regression)) output$internal$parameters$regression <- NULL + return(output) } diff --git a/R/explain_forecast.R b/R/explain_forecast.R index f2a48eb5b..f182e0c63 100644 --- a/R/explain_forecast.R +++ b/R/explain_forecast.R @@ -197,12 +197,26 @@ explain_forecast <- function(model, } # Temporary to avoid failing tests - if (isFALSE(output$internal$parameters$vaeac.save_model)) { - output$internal$parameters$vaeac$models <- NULL - output$internal$parameters$vaeac$parameters$folder_to_save_model <- NULL - output$internal$parameters$vaeac$parameters$model_description <- NULL + output <- remove_outputs_pass_tests_fore(output) + + return(output) +} + +#' @keywords internal +#' @author Lars Henry Berge Olsen +remove_outputs_pass_tests_fore <- function(output) { + # Temporary to avoid failing tests related to vaeac approach + if (isFALSE(output$internal$parameters$vaeac.extra_parameters$vaeac.save_model)) { + output$internal$parameters[c( + "vaeac", "vaeac.sampler", "vaeac.model", "vaeac.activation_function", "vaeac.checkpoint" + )] <- NULL + output$internal$parameters$vaeac.extra_parameters[c("vaeac.folder_to_save_model", "vaeac.model_description")] <- + NULL } + # Remove the `regression` parameter from the output list when we are not doing regression + if (isFALSE(output$internal$parameters$regression)) output$internal$parameters$regression <- NULL + return(output) } diff --git a/R/finalize_explanation.R b/R/finalize_explanation.R index 31ae74432..8cf68a230 100644 --- a/R/finalize_explanation.R +++ b/R/finalize_explanation.R @@ -7,7 +7,6 @@ #' #' @export finalize_explanation <- function(vS_list, internal) { - keep_samp_for_vS <- internal$parameters$keep_samp_for_vS MSEv_uniform_comb_weights <- internal$parameters$MSEv_uniform_comb_weights processed_vS_list <- postprocess_vS_list( diff --git a/R/model.R b/R/model.R index b706cbb0d..551d09b79 100644 --- a/R/model.R +++ b/R/model.R @@ -4,7 +4,8 @@ #' [stats::lm()], #' [stats::glm()], #' [ranger::ranger()], -#' [mgcv::gam()] and +#' [mgcv::gam()], +#' [workflows::workflow()] (i.e., `tidymodels` models), and #' [xgboost::xgb.train()] with binary or continuous #' response. See details for more information. 
#' @@ -21,6 +22,7 @@ #' \item [stats::glm()] #' \item [ranger::ranger()] #' \item [mgcv::gam()] +#' \item [workflows::workflow()] #' \item [xgboost::xgb.train()] #' } #' diff --git a/R/model_workflow.R b/R/model_workflow.R new file mode 100644 index 000000000..4665e95d2 --- /dev/null +++ b/R/model_workflow.R @@ -0,0 +1,30 @@ +#' @rdname predict_model +#' @export +predict_model.workflow <- function(x, newdata, ...) { + if (!requireNamespace("workflows", quietly = TRUE)) { + stop("The `workflows` package is required for predicting `workflows`") + } + predict(x, as.data.frame(newdata))$.pred +} + +#' @rdname get_model_specs +#' @export +get_model_specs.workflow <- function(x) { + model_checker(x) # Checking if the model is supported + var_info <- x$pre$actions$recipe$recipe$var_info + predictors <- var_info$variable[var_info$role == "predictor"] + template <- x$pre$actions$recipe$recipe$template[predictors] + feature_specs <- list() + feature_specs$labels <- colnames(template) + feature_specs$classes <- sapply(template, class) + feature_specs$classes[feature_specs$classes == "integer"] <- "numeric" # Integers to numerics, see `get_data_specs()` + feature_specs$factor_levels <- sapply(template, levels) + return(feature_specs) +} + +#' @rdname model_checker +#' @export +model_checker.workflow <- function(x) { + if (x$fit$actions$model$spec$mode != "regression") stop("We only support models with `mode = 'regression'`.") + return(NULL) +} diff --git a/R/setup.R b/R/setup.R index b627cc6db..5f2f2b548 100644 --- a/R/setup.R +++ b/R/setup.R @@ -71,33 +71,17 @@ setup <- function(x_train, # Sets up and organizes data if (type == "forecast") { - internal$data <- get_data_forecast( - y, - xreg, - train_idx, - explain_idx, - explain_y_lags, - explain_xreg_lags, - horizon - ) - - internal$parameters$output_labels <- cbind( - rep(explain_idx, horizon), - rep(seq_len(horizon), each = length(explain_idx)) - ) + internal$data <- get_data_forecast(y, xreg, train_idx, explain_idx, explain_y_lags, explain_xreg_lags, horizon) + internal$parameters$output_labels <- + cbind(rep(explain_idx, horizon), rep(seq_len(horizon), each = length(explain_idx))) colnames(internal$parameters$output_labels) <- c("explain_idx", "horizon") internal$parameters$explain_idx <- explain_idx internal$parameters$explain_lags <- list(y = explain_y_lags, xreg = explain_xreg_lags) # TODO: Consider handling this parameter update somewhere else (like in get_extra_parameters?) 
-    if (group_lags) {
-      internal$parameters$group <- internal$data$group
-    }
+    if (group_lags) internal$parameters$group <- internal$data$group
   } else {
-    internal$data <- get_data(
-      x_train,
-      x_explain
-    )
+    internal$data <- get_data(x_train, x_explain)
   }
   internal$objects <- list(feature_specs = feature_specs)
@@ -106,7 +90,6 @@ setup <- function(x_train,
   internal <- get_extra_parameters(internal) # This includes both extra parameters and other objects
-
   internal <- check_and_set_parameters(internal)
   return(internal)
@@ -123,24 +106,14 @@ check_and_set_parameters <- function(internal) {
   is_groupwise <- internal$parameters$is_groupwise
   exact <- internal$parameters$exact
+  if (!is.null(group)) check_groups(feature_names, group)
-  if (!is.null(group)) {
-    check_groups(feature_names, group)
-  }
-
-  if (!exact) {
-    if (!is_groupwise) {
-      internal$parameters$used_n_combinations <- min(2^n_features, n_combinations)
-    } else {
-      internal$parameters$used_n_combinations <- min(2^n_groups, n_combinations)
-    }
-    check_n_combinations(internal)
+  if (exact) {
+    internal$parameters$used_n_combinations <- if (is_groupwise) 2^n_groups else 2^n_features
   } else {
-    if (!is_groupwise) {
-      internal$parameters$used_n_combinations <- 2^n_features
-    } else {
-      internal$parameters$used_n_combinations <- 2^n_groups
-    }
+    internal$parameters$used_n_combinations <-
+      if (is_groupwise) min(2^n_groups, n_combinations) else min(2^n_features, n_combinations)
+    check_n_combinations(internal)
   }
   # Check approach
@@ -152,6 +125,35 @@ check_and_set_parameters <- function(internal) {
   # Checking n_batches vs n_combinations etc
   check_n_batches(internal)
+  # Check regression if we are doing regression
+  if (internal$parameters$regression) internal <- regression.check(internal)
+
+  return(internal)
+}
+
+#' @keywords internal
+#' @author Lars Henry Berge Olsen
regression.check <- function(internal) {
+  # Check that the model outputs one-dimensional predictions
+  if (internal$parameters$output_size != 1) {
+    stop("`regression_separate` and `regression_surrogate` only support models with one-dimensional output")
+  }
+
+  # Check that we are NOT explaining a forecast model
+  if (internal$parameters$type == "forecast") {
+    stop("`regression_separate` and `regression_surrogate` do not support `forecast`.")
+  }
+
+  # Check that we are not to keep the Monte Carlo samples
+  if (internal$parameters$keep_samp_for_vS) {
+    stop(paste(
+      "`keep_samp_for_vS` must be `FALSE` for the `regression_separate` and `regression_surrogate`",
+      "approaches as there are no Monte Carlo samples to keep for these approaches."
+    ))
+  }
+
+  # Remove n_samples if we are doing regression, as we are not doing MC sampling
+  internal$parameters$n_samples <- NULL
   return(internal)
 }
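As a hedged illustration of the new guards above (the objects model, x_explain, x_train, and p0 are placeholders), a call like the following would now stop with the Monte Carlo message from regression.check():

explain(
  model = model,
  x_explain = x_explain,
  x_train = x_train,
  approach = "regression_separate",
  prediction_zero = p0,
  keep_samp_for_vS = TRUE # errors: no Monte Carlo samples exist for the regression approaches
)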
@@ -191,13 +193,9 @@ check_n_combinations <- function(internal) {
     }
   } else {
     if (!is_groupwise) {
-      if (n_combinations <= n_features) {
-        stop("`n_combinations` has to be greater than the number of features.")
-      }
+      if (n_combinations <= n_features) stop("`n_combinations` has to be greater than the number of features.")
     } else {
-      if (n_combinations <= n_groups) {
-        stop("`n_combinations` has to be greater than the number of groups.")
-      }
+      if (n_combinations <= n_groups) stop("`n_combinations` has to be greater than the number of groups.")
     }
   }
 }
@@ -255,7 +253,6 @@ check_data <- function(internal) {
   NA_classes <- any(is.na(model_feature_specs$classes))
   NA_factor_levels <- any(is.na(model_feature_specs$factor_levels))
-
   if (is.null(model_feature_specs)) {
     message(
       "Note: You passed a model to explain() which is not natively supported, and did not supply a ",
@@ -288,7 +285,6 @@ check_data <- function(internal) {
     model_feature_specs$factor_levels <- x_train_feature_specs$factor_levels
   }
-
   # Check model vs x_train (allowing different label ordering in specs from model)
   compare_feature_specs(model_feature_specs, x_train_feature_specs, "model", "x_train", sort_labels = TRUE)
@@ -523,6 +519,9 @@ get_parameters <- function(approach, prediction_zero, output_size = 1, n_combina
   # Setting exact based on n_combinations (TRUE if NULL)
   parameters$exact <- ifelse(is.null(parameters$n_combinations), TRUE, FALSE)
+  # Setting that we are using regression based on the approach name (any in case of several approaches)
+  parameters$regression <- any(grepl("regression", parameters$approach))
+
   return(parameters)
 }
@@ -603,8 +602,6 @@ get_data_specs <- function(x) {
   return(feature_specs)
 }
-
-
 #' Check that the group parameter has the right form and content
 #'
 #'
@@ -684,13 +681,17 @@ check_approach <- function(internal) {
     all(is.element(approach, supported_approaches)))
   ) {
     stop(
-      paste(
-        "`approach` must be one of the following: \n", paste0(supported_approaches, collapse = ", "), "\n",
-        "or a vector of length one less than the number of features (", n_features - 1, "),",
-        "with only the above strings."
+      paste0(
+        "`approach` must be one of the following: '", paste0(supported_approaches, collapse = "', '"), "'.\n",
+        "These can also be combined (except 'regression_surrogate' and 'regression_separate') by passing a vector ",
+        "of length one less than the number of features (", n_features - 1, ")."
      )
    )
  }
+
+  if (length(approach) > 1 && any(grepl("regression", approach))) {
+    stop("The `regression_separate` and `regression_surrogate` approaches cannot be combined with other approaches.")
+  }
 }

 #' @keywords internal
diff --git a/R/setup_computation.R b/R/setup_computation.R
index 195e1931e..dad9b6240 100644
--- a/R/setup_computation.R
+++ b/R/setup_computation.R
@@ -9,16 +9,11 @@ setup_computation <- function(internal, model, predict_model) {
   type <- internal$parameters$type
   # setup the Shapley framework
-  if (type == "forecast") {
-    internal <- shapley_setup_forecast(internal)
-  } else {
-    internal <- shapley_setup(internal)
-  }
+  internal <- if (type == "forecast") shapley_setup_forecast(internal) else shapley_setup(internal)
   # Setup for approach
   internal <- setup_approach(internal, model = model, predict_model = predict_model)
-
   return(internal)
 }
diff --git a/R/zzz.R b/R/zzz.R
index 47318dbd9..cc55b6de6 100644
--- a/R/zzz.R
+++ b/R/zzz.R
@@ -104,8 +104,11 @@
       "val_dataloader",
       "x_train",
       "x_train_preprocessed",
-      "x_train_torch"
+      "x_train_torch",
+      "..current_comb",
+      "..regression.response_var"
     )
   )
+  invisible()
 }
diff --git a/_pkgdown.yml b/_pkgdown.yml
index 33ffc2573..836c4da3c 100644
--- a/_pkgdown.yml
+++ b/_pkgdown.yml
@@ -12,6 +12,8 @@ navbar:
         href: articles/understanding_shapr.html
       - text: "Advanced usage of the `vaeac` approach"
         href: articles/understanding_shapr_vaeac.html
+      - text: "The separate and surrogate regression approaches"
+        href: articles/understanding_shapr_regression.html
   news:
     text: News
     href: news/index.html
diff --git a/inst/REFERENCES.bib b/inst/REFERENCES.bib
index 4f2e4df24..ccce694e2 100644
--- a/inst/REFERENCES.bib
+++ b/inst/REFERENCES.bib
@@ -143,14 +143,15 @@ @article{olsen2022using
   year={2022}
 }

-@article{olsen2023comparative,
-  title={A Comparative Study of Methods for Estimating Conditional Shapley Values and When to Use Them},
+@article{olsen2024comparative,
+  title={A comparative study of methods for estimating model-agnostic Shapley value explanations},
   author={Olsen, Lars Henry Berge and Glad, Ingrid Kristine and Jullum, Martin and Aas, Kjersti},
-  journal={arXiv preprint arXiv:2305.09536},
-  year={2023}
+  journal={Data Mining and Knowledge Discovery},
+  pages={1--48},
+  year={2024},
+  publisher={Springer}
 }
-
 @inproceedings{frye2020shapley,
   title={Shapley explainability on the data manifold},
   author={Christopher Frye and Damien de Mijolla and Tom Begley and Laurence Cowton and Megan Stanley and Ilya Feige},
@@ -181,3 +182,11 @@ @Manual{torch
   note = {R package version 0.11.0},
   url = {https://CRAN.R-project.org/package=torch},
 }
+
+@Manual{tidymodels,
+  title = {Tidymodels: a collection of packages for modeling and machine learning using tidyverse principles.},
+  author = {Max Kuhn and Hadley Wickham},
+  url = {https://www.tidymodels.org},
+  year = {2020},
+}
diff --git a/inst/scripts/check_model_workflow.R b/inst/scripts/check_model_workflow.R
new file mode 100644
index 000000000..01799eae1
--- /dev/null
+++ b/inst/scripts/check_model_workflow.R
@@ -0,0 +1,169 @@
+# Compare xgboost with parsnip version ----------------------------------------------------------------------------
+# Either use library(tidymodels) or separately specify the libraries
+library(parsnip)
+library(ggplot2)
+library(recipes)
+library(workflows)
+library(dials)
+library(hardhat)
+library(yardstick)
+
+data("airquality")
+data <- data.table::as.data.table(airquality)
+data <- data[complete.cases(data), ]
+
+x_var <- c("Solar.R", "Wind", "Temp", "Month")
"Month") +y_var <- "Ozone" +all_var <- c(y_var, x_var) + +ind_x_explain <- 1:20 +x_train <- data[-ind_x_explain, ..x_var] +y_train <- data[-ind_x_explain, get(y_var)] +x_explain <- data[ind_x_explain, ..x_var] +train <- data[-ind_x_explain, ..all_var] +test <- data[ind_x_explain, ..all_var] + +# Specifying the phi_0, i.e. the expected prediction without any features +p0 <- mean(y_train) + +# Fitting a basic xgboost model to the training data using tidymodels +set.seed(1) +model_xgboost <- xgboost::xgboost( + data = as.matrix(x_train), + label = y_train, + nround = 10, + verbose = FALSE +) + +set.seed(1) +model_workflow <- workflows::workflow() %>% + workflows::add_model(parsnip::boost_tree(trees = 10, engine = "xgboost", mode = "regression")) %>% + workflows::add_recipe(recipes::recipe(Ozone ~ ., data = train)) %>% + parsnip::fit(data = test) + +# See that the predictions are identical +all.equal(predict(model_workflow, x_train)$.pred, predict(model_xgboost, as.matrix(x_train))) + +explain_workflow = explain( + model = model_workflow, + x_explain = x_explain, + x_train = x_train, + approach = "empirical", + prediction_zero = p0, + n_batches = 4 +) + +explain_xgboost = explain( + model = model_xgboost, + x_explain = x_explain, + x_train = x_train, + approach = "empirical", + prediction_zero = p0, + n_batches = 4 +) + +# See that the shapley values are identical +all.equal(explain_workflow$shapley_values, explain_xgboost$shapley_values) + +# Other models in workflow --------------------------------------------------------------------------------------------- +set.seed(1) +data <- data.table::as.data.table(airquality) +data[, Month_factor := as.factor(Month)] +data[, Ozone_sub30 := (Ozone < 30) * 1] +data[, Ozone_sub30_factor := as.factor(Ozone_sub30)] +data[, Solar.R_factor := as.factor(cut(Solar.R, 10))] +data[, Wind_factor := as.factor(round(Wind))] + +data_complete <- data[complete.cases(airquality), ] +data_complete <- data_complete[sample(seq_len(.N))] # Sh + +x_var_mixed <- c("Solar.R", "Wind", "Temp", "Day", "Month_factor") +var_mixed <- c("Ozone", x_var_mixed) + +data_train <- head(data_complete, -3) +data_explain <- tail(data_complete, 3) + +x_train_mixed <- data_train[, ..x_var_mixed] +x_explain_mixed <- data_explain[, ..x_var_mixed] +train_mixed <- data_train[, ..var_mixed] + +model_decision_tree <- workflows::workflow() %>% + workflows::add_model(parsnip::decision_tree(engine = "rpart", mode = "regression")) %>% + workflows::add_recipe(recipes::recipe(Ozone ~ ., data = train_mixed) %>% + recipes::step_dummy(all_factor_predictors())) %>% + parsnip::fit(data = train_mixed) + +y_var_numeric <- "Ozone" +lm_formula_mixed <- as.formula(paste0(y_var_numeric, " ~ ", paste0(x_var_mixed, collapse = " + "))) +model_lm_mixed <- lm(lm_formula_mixed, data = data_complete) + +explain_decision_tree_ctree = explain( + model = model_decision_tree, + x_explain = x_explain_mixed, + x_train = x_train_mixed, + approach = "ctree", + prediction_zero = p0, + n_batches = 4 +) + +explain_decision_tree_lm = explain( + model = model_decision_tree, #model_lm_mixed + x_explain = x_explain_mixed, + x_train = x_train_mixed, + approach = "regression_separate", + regression.model = parsnip::linear_reg(), + prediction_zero = p0, + n_batches = 4 +) + +# + +# CV ------------------------------------------------------------------------------------------------------------------- +set.seed(1) +regression.workflow <- workflows::workflow() %>% + workflows::add_model(parsnip::rand_forest( + trees = hardhat::tune(), engine = 
"ranger", mode = "regression" + )) %>% + workflows::add_recipe(recipes::recipe(Ozone ~ ., data = train_mixed) %>% + recipes::step_dummy(all_factor_predictors())) + +# Add the hyperparameter tuning to the workflow +regression.results <- tune::tune_grid( + object = regression.workflow, + resamples = rsample::vfold_cv(data = train_mixed, v = 3), + grid = dials::grid_regular(dials::trees(c(50, 750)), levels = 3), + metrics = yardstick::metric_set(yardstick::rmse) +) + +# Update the workflow by finalizing it using the hyperparameters that attained the best rmse +regression.workflow <- tune::finalize_workflow(regression.workflow, tune::select_best(regression.results, "rmse")) + +# Fit the model to the augmented training data +model_rf_cv <- parsnip::fit(regression.workflow, data = train_mixed) + +# See that the model works with regression +explain_decision_model_rf_cv_rf = explain( + model = model_rf_cv, #model_lm_mixed + x_explain = x_explain_mixed, + x_train = x_train_mixed, + approach = "regression_separate", + regression.model = parsnip::rand_forest(engine = "ranger", mode = "regression"), + prediction_zero = p0, + n_batches = 4 +) + +# See that the model works with MC method too +explain_decision_model_rf_cv_ctree = explain( + model = model_rf_cv, #model_lm_mixed + x_explain = x_explain_mixed, + x_train = x_train_mixed, + approach = "ctree", + prediction_zero = p0, + n_batches = 4 +) + +# Quite similar +plot_MSEv_eval_crit(list(ctree = explain_decision_model_rf_cv_ctree, rf = explain_decision_model_rf_cv_rf)) +plot_SV_several_approaches(list(ctree = explain_decision_model_rf_cv_ctree, rf = explain_decision_model_rf_cv_rf)) + diff --git a/inst/scripts/example_plot_several_vaeacs_VLB_IWAE.R b/inst/scripts/example_plot_several_vaeacs_VLB_IWAE.R index b85ece096..a364a9ce4 100644 --- a/inst/scripts/example_plot_several_vaeacs_VLB_IWAE.R +++ b/inst/scripts/example_plot_several_vaeacs_VLB_IWAE.R @@ -119,7 +119,7 @@ vaeac_plot_eval_crit(explanation_list = explanation_list_named) # The function also works if we have only one method, but then one should only look at the method plot vaeac_plot_eval_crit(explanation_list = list("Paired samp. & large NN" = explanation_paired_sampling_TRUE), - plot_type = "method") + plot_type = "method") # Can alter the plot vaeac_plot_eval_crit( diff --git a/man/explain.Rd b/man/explain.Rd index a45f7266e..2b121b12d 100644 --- a/man/explain.Rd +++ b/man/explain.Rd @@ -40,8 +40,8 @@ needed to properly estimate the conditional expectations in the Shapley formula. \item{approach}{Character vector of length \code{1} or one less than the number of features. All elements should, either be \code{"gaussian"}, \code{"copula"}, \code{"empirical"}, \code{"ctree"}, \code{"vaeac"}, -\code{"categorical"}, \code{"timeseries"}, or \code{"independence"}. -See details for more information.} +\code{"categorical"}, \code{"timeseries"}, \code{"independence"}, \code{"regression_separate"}, or \code{"regression_surrogate"}. +The two regression approaches can not be combined with any other approach. See details for more information.} \item{prediction_zero}{Numeric. The prediction value for unseen data, i.e. 
diff --git a/man/explain.Rd b/man/explain.Rd
index a45f7266e..2b121b12d 100644
--- a/man/explain.Rd
+++ b/man/explain.Rd
@@ -40,8 +40,8 @@ needed to properly estimate the conditional expectations in the Shapley formula.}
 \item{approach}{Character vector of length \code{1} or one less than the number of features.
 All elements should either be \code{"gaussian"}, \code{"copula"}, \code{"empirical"}, \code{"ctree"}, \code{"vaeac"},
-\code{"categorical"}, \code{"timeseries"}, or \code{"independence"}.
-See details for more information.}
+\code{"categorical"}, \code{"timeseries"}, \code{"independence"}, \code{"regression_separate"}, or \code{"regression_surrogate"}.
+The two regression approaches cannot be combined with any other approach. See details for more information.}

 \item{prediction_zero}{Numeric.
 The prediction value for unseen data, i.e. an estimate of the expected prediction without conditioning on any
@@ -118,7 +118,7 @@ Use \code{0} (default) for no verbosity, \code{1} for low verbose, and \code{2} for high verbose.
 TODO: Make this clearer when we end up fixing this and if they should force a progressr bar.}

 \item{...}{
-  Arguments passed on to \code{\link[=setup_approach.empirical]{setup_approach.empirical}}, \code{\link[=setup_approach.independence]{setup_approach.independence}}, \code{\link[=setup_approach.gaussian]{setup_approach.gaussian}}, \code{\link[=setup_approach.copula]{setup_approach.copula}}, \code{\link[=setup_approach.ctree]{setup_approach.ctree}}, \code{\link[=setup_approach.vaeac]{setup_approach.vaeac}}, \code{\link[=setup_approach.categorical]{setup_approach.categorical}}, \code{\link[=setup_approach.timeseries]{setup_approach.timeseries}}
+  Arguments passed on to \code{\link[=setup_approach.empirical]{setup_approach.empirical}}, \code{\link[=setup_approach.independence]{setup_approach.independence}}, \code{\link[=setup_approach.gaussian]{setup_approach.gaussian}}, \code{\link[=setup_approach.copula]{setup_approach.copula}}, \code{\link[=setup_approach.ctree]{setup_approach.ctree}}, \code{\link[=setup_approach.vaeac]{setup_approach.vaeac}}, \code{\link[=setup_approach.categorical]{setup_approach.categorical}}, \code{\link[=setup_approach.regression_separate]{setup_approach.regression_separate}}, \code{\link[=setup_approach.regression_surrogate]{setup_approach.regression_surrogate}}, \code{\link[=setup_approach.timeseries]{setup_approach.timeseries}}
   \describe{
     \item{\code{empirical.type}}{Character. (default = \code{"fixed_sigma"})
Should be equal to either \code{"independence"}, \code{"fixed_sigma"}, \code{"AICc_each_k"}, or \code{"AICc_full"}.
@@ -191,6 +191,40 @@ If \code{joint_probability_dt} is not supplied, probabilities/frequencies are
estimated using \code{x_train}. If certain observations occur in \code{x_train} and NOT in \code{x_explain},
then epsilon is used as the proportion of times that these observations occur in the training data.
In theory, this proportion should be zero, but this causes an error later in the Shapley computation.}
+  \item{\code{regression.model}}{A \code{tidymodels} object of class \code{model_specs}. Default is a linear regression model, i.e.,
+\code{\link[parsnip:linear_reg]{parsnip::linear_reg()}}. See \href{https://www.tidymodels.org/find/parsnip/}{tidymodels} for all possible models,
+and see the vignette for how to add new/own models. Note, to make it easier to call \code{explain()} from Python, the
+\code{regression.model} parameter can also be a string specifying the model which will be parsed and evaluated. For
+example, \verb{"parsnip::rand_forest(mtry = hardhat::tune(), trees = 100, engine = "ranger", mode = "regression")"}
+is also a valid input. It is essential to include the package prefix if the package is not loaded.}
+  \item{\code{regression.tune_values}}{Either \code{NULL} (default), a data.frame/data.table/tibble, or a function.
+The data.frame must contain the possible hyperparameter value combinations to try.
+The column names must match the names of the tuneable parameters specified in \code{regression.model}.
+If \code{regression.tune_values} is a function, then it should take one argument \code{x} which is the training data
+for the current combination/coalition and returns a data.frame/data.table/tibble with the properties described above.
+Using a function allows the hyperparameter values to change based on the size of the combination.
See the regression
+vignette for several examples.
+Note, to make it easier to call \code{explain()} from Python, the \code{regression.tune_values} can also be a string
+containing an R function. For example,
+\code{"function(x) return(dials::grid_regular(dials::mtry(c(1, ncol(x))), levels = 3))"} is also a valid input.
+It is essential to include the package prefix if the package is not loaded.}
+  \item{\code{regression.vfold_cv_para}}{Either \code{NULL} (default) or a named list containing
+the parameters to be sent to \code{\link[rsample:vfold_cv]{rsample::vfold_cv()}}. See the regression vignette for
+several examples.}
+  \item{\code{regression.recipe_func}}{Either \code{NULL} (default) or a function that takes in a \code{\link[recipes:recipe]{recipes::recipe()}}
+object and returns a modified \code{\link[recipes:recipe]{recipes::recipe()}} with potentially additional recipe steps. See the regression
+vignette for several examples.
+Note, to make it easier to call \code{explain()} from Python, the \code{regression.recipe_func} can also be a string
+containing an R function. For example,
+\code{"function(recipe) return(recipes::step_ns(recipe, recipes::all_numeric_predictors(), deg_free = 2))"} is also
+a valid input. It is essential to include the package prefix if the package is not loaded.}
+  \item{\code{regression.surrogate_n_comb}}{Integer (default is \code{internal$parameters$used_n_combinations}) specifying the
+number of unique combinations/coalitions to apply to each training observation. Maximum allowed value is
+"\code{internal$parameters$used_n_combinations} - 2". By default, we use all coalitions, but this can take a lot of memory
+in larger dimensions. Note that by "all", we mean all coalitions chosen by \code{shapr} to be used. This will be all
+\eqn{2^{n_{\text{features}}}} coalitions (minus empty and grand coalition) if \code{shapr} is in the exact mode. If the
+user sets a lower value than \code{internal$parameters$used_n_combinations}, then we sample this amount of unique
+coalitions separately for each training observation. That is, on average, all coalitions should be equally trained.}
  \item{\code{timeseries.fixed_sigma_vec}}{Numeric. (Default = 2)
Represents the kernel bandwidth in the distance computation. TODO: What length should it have? 1?}
  \item{\code{timeseries.bounds}}{Numeric vector of length two. (Default = c(NULL, NULL))
@@ -234,10 +268,12 @@ Computes dependence-aware Shapley values for observations in \code{x_explain} from the specified
 \code{model} by using the method specified in \code{approach} to estimate the conditional expectation.
 }
 \details{
-The most important thing to notice is that \code{shapr} has implemented six different
-approaches for estimating the conditional distributions of the data, namely \code{"empirical"},
+The most important thing to notice is that \code{shapr} has implemented eight different
+Monte Carlo-based approaches for estimating the conditional distributions of the data, namely \code{"empirical"},
 \code{"gaussian"}, \code{"copula"}, \code{"ctree"}, \code{"vaeac"}, \code{"categorical"}, \code{"timeseries"}, and \code{"independence"}.
-In addition, the user also has the option of combining the different approaches.
+\code{shapr} has also implemented two regression-based approaches, \code{"regression_separate"} and \code{"regression_surrogate"};
+see the separate vignette on the regression-based approaches for more information.
+In addition, the user has the option of combining the different Monte Carlo-based approaches.
 E.g., if you're in a situation where you have trained a model that consists of 10 features,
 and you'd like to use the \code{"gaussian"} approach when you condition on a single feature,
 the \code{"empirical"} approach if you condition on 2-5 features, and \code{"copula"} version
@@ -349,11 +385,35 @@ explain_groups <- explain(
 )
 print(explain_groups$shapley_values)

+# Separate and surrogate regression approaches with linear regression models.
+# More complex regression models can be used, and we can use CV to
+# tune the hyperparameters of the regression models and preprocess
+# the data before sending it to the model. See the regression vignette
+# (Shapley value explanations using the regression paradigm) for more
+# details about the `regression_separate` and `regression_surrogate` approaches.
+explain_separate_lm <- explain(
+  model = model,
+  x_explain = x_explain,
+  x_train = x_train,
+  prediction_zero = p,
+  approach = "regression_separate",
+  regression.model = parsnip::linear_reg()
+)
+
+explain_surrogate_lm <- explain(
+  model = model,
+  x_explain = x_explain,
+  x_train = x_train,
+  prediction_zero = p,
+  approach = "regression_surrogate",
+  regression.model = parsnip::linear_reg()
+)
+
 }
 \references{
 Aas, K., Jullum, M., & Løland, A. (2021). Explaining individual predictions when features are dependent:
 More accurate approximations to Shapley values. Artificial Intelligence, 298, 103502.
 }
 \author{
-Martin Jullum
+Martin Jullum, Lars Henry Berge Olsen
 }
diff --git a/man/explain_forecast.Rd b/man/explain_forecast.Rd
index 57f351971..91565d96d 100644
--- a/man/explain_forecast.Rd
+++ b/man/explain_forecast.Rd
@@ -67,8 +67,8 @@ The forecast horizon to explain. Passed to the \code{predict_model} function.}
 \item{approach}{Character vector of length \code{1} or one less than the number of features.
 All elements should either be \code{"gaussian"}, \code{"copula"}, \code{"empirical"}, \code{"ctree"}, \code{"vaeac"},
-\code{"categorical"}, \code{"timeseries"}, or \code{"independence"}.
-See details for more information.}
+\code{"categorical"}, \code{"timeseries"}, \code{"independence"}, \code{"regression_separate"}, or \code{"regression_surrogate"}.
+The two regression approaches cannot be combined with any other approach. See details for more information.}

 \item{prediction_zero}{Numeric.
 The prediction value for unseen data, i.e.
 an estimate of the expected prediction without conditioning on any
@@ -301,5 +301,5 @@ Aas, K., Jullum, M., & Løland, A. (2021). Explaining individual predictions when features are dependent:
 More accurate approximations to Shapley values. Artificial Intelligence, 298, 103502.
 }
 \author{
-Martin Jullum
+Martin Jullum, Lars Henry Berge Olsen
 }
diff --git a/man/explain_tripledot_docs.Rd b/man/explain_tripledot_docs.Rd
new file mode 100644
index 000000000..a739b97b5
--- /dev/null
+++ b/man/explain_tripledot_docs.Rd
@@ -0,0 +1,133 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/documentation.R
+\name{explain_tripledot_docs}
+\alias{explain_tripledot_docs}
+\title{Documentation of the approach-specific parameters in \code{\link[=explain]{explain()}}}
+\usage{
+explain_tripledot_docs(...)
+}
+\arguments{
+\item{...}{
+  Arguments passed on to \code{\link[=setup_approach.independence]{setup_approach.independence}}, \code{\link[=setup_approach.empirical]{setup_approach.empirical}}, \code{\link[=setup_approach.categorical]{setup_approach.categorical}}, \code{\link[=setup_approach.copula]{setup_approach.copula}}, \code{\link[=setup_approach.ctree]{setup_approach.ctree}}, \code{\link[=setup_approach.gaussian]{setup_approach.gaussian}}, \code{\link[=setup_approach.regression_separate]{setup_approach.regression_separate}}, \code{\link[=setup_approach.regression_surrogate]{setup_approach.regression_surrogate}}, \code{\link[=setup_approach.timeseries]{setup_approach.timeseries}}, \code{\link[=setup_approach.vaeac]{setup_approach.vaeac}}
+  \describe{
+    \item{\code{empirical.type}}{Character. (default = \code{"fixed_sigma"})
+Should be equal to either \code{"independence"}, \code{"fixed_sigma"}, \code{"AICc_each_k"}, or \code{"AICc_full"}.
+TODO: Describe better what the methods do here.}
+    \item{\code{empirical.eta}}{Numeric. (default = 0.95)
+Needs to be \verb{0 < eta <= 1}.
+Represents the minimum proportion of the total empirical weight that data samples should use.
+If e.g. \code{eta = .8} we will choose the \code{K} samples with the largest weight so that the sum of the weights
+accounts for 80\\% of the total weight.
+\code{eta} is the \eqn{\eta} parameter in equation (15) of Aas et al (2021).}
+    \item{\code{empirical.fixed_sigma}}{Positive numeric scalar. (default = 0.1)
+Represents the kernel bandwidth in the distance computation used when conditioning on all different combinations.
+Only used when \code{empirical.type = "fixed_sigma"}.}
+    \item{\code{empirical.n_samples_aicc}}{Positive integer. (default = 1000)
+Number of samples to consider in AICc optimization.
+Only used when \code{empirical.type} is either \code{"AICc_each_k"} or \code{"AICc_full"}.}
+    \item{\code{empirical.eval_max_aicc}}{Positive integer. (default = 20)
+Maximum number of iterations when optimizing the AICc.
+Only used when \code{empirical.type} is either \code{"AICc_each_k"} or \code{"AICc_full"}.}
+    \item{\code{empirical.start_aicc}}{Numeric. (default = 0.1)
+Start value of the \code{sigma} parameter when optimizing the AICc.
+Only used when \code{empirical.type} is either \code{"AICc_each_k"} or \code{"AICc_full"}.}
+    \item{\code{empirical.cov_mat}}{Numeric matrix. (Optional, default = NULL)
+Containing the covariance matrix of the data generating distribution used to define the Mahalanobis distance.
+\code{NULL} means it is estimated from \code{x_train}.}
+    \item{\code{categorical.joint_prob_dt}}{Data.table. (Optional)
+Containing the joint probability distribution for each combination of feature
+values.
+\code{NULL} means it is estimated from the \code{x_train} and \code{x_explain}.}
+    \item{\code{categorical.epsilon}}{Numeric value. (Optional)
+If \code{joint_probability_dt} is not supplied, probabilities/frequencies are
+estimated using \code{x_train}. If certain observations occur in \code{x_train} and NOT in \code{x_explain},
+then epsilon is used as the proportion of times that these observations occur in the training data.
+In theory, this proportion should be zero, but this causes an error later in the Shapley computation.}
+    \item{\code{ctree.mincriterion}}{Numeric scalar or vector. (default = 0.95)
+Either a scalar or vector of length equal to the number of features in the model.
+Value is equal to 1 - \eqn{\alpha} where \eqn{\alpha} is the nominal level of the conditional independence tests.
+If it is a vector, this indicates which value to use when conditioning on various numbers of features.}
+    \item{\code{ctree.minsplit}}{Numeric scalar. (default = 20)
+Determines the minimum sum of weights, summed over the left and right daughter nodes, required for a split.}
+    \item{\code{ctree.minbucket}}{Numeric scalar. (default = 7)
+Determines the minimum sum of weights in a terminal node required for a split.}
+    \item{\code{ctree.sample}}{Boolean. (default = TRUE)
+If TRUE, then the method always samples \code{n_samples} observations from the leaf nodes (with replacement).
+If FALSE and the number of observations in the leaf node is less than \code{n_samples},
+the method will take all observations in the leaf.
+If FALSE and the number of observations in the leaf node is more than \code{n_samples},
+the method will sample \code{n_samples} observations (with replacement).
+This means that there will always be sampling in the leaf unless
+\code{sample} = FALSE AND the number of obs in the node is less than \code{n_samples}.}
+    \item{\code{gaussian.mu}}{Numeric vector. (Optional)
+Containing the mean of the data generating distribution.
+\code{NULL} means it is estimated from \code{x_train}.}
+    \item{\code{gaussian.cov_mat}}{Numeric matrix. (Optional)
+Containing the covariance matrix of the data generating distribution.
+\code{NULL} means it is estimated from \code{x_train}.}
+    \item{\code{regression.model}}{A \code{tidymodels} object of class \code{model_specs}. Default is a linear regression model, i.e.,
+\code{\link[parsnip:linear_reg]{parsnip::linear_reg()}}. See \href{https://www.tidymodels.org/find/parsnip/}{tidymodels} for all possible models,
+and see the vignette for how to add new/own models. Note, to make it easier to call \code{explain()} from Python, the
+\code{regression.model} parameter can also be a string specifying the model which will be parsed and evaluated. For
+example, \verb{"parsnip::rand_forest(mtry = hardhat::tune(), trees = 100, engine = "ranger", mode = "regression")"}
+is also a valid input. It is essential to include the package prefix if the package is not loaded.}
+    \item{\code{regression.tune_values}}{Either \code{NULL} (default), a data.frame/data.table/tibble, or a function.
+The data.frame must contain the possible hyperparameter value combinations to try.
+The column names must match the names of the tuneable parameters specified in \code{regression.model}.
+If \code{regression.tune_values} is a function, then it should take one argument \code{x} which is the training data
+for the current combination/coalition and returns a data.frame/data.table/tibble with the properties described above.
+Using a function allows the hyperparameter values to change based on the size of the combination. See the regression
+vignette for several examples.
+Note, to make it easier to call \code{explain()} from Python, the \code{regression.tune_values} can also be a string
+containing an R function. For example,
+\code{"function(x) return(dials::grid_regular(dials::mtry(c(1, ncol(x))), levels = 3))"} is also a valid input.
+It is essential to include the package prefix if the package is not loaded.}
+    \item{\code{regression.vfold_cv_para}}{Either \code{NULL} (default) or a named list containing
+the parameters to be sent to \code{\link[rsample:vfold_cv]{rsample::vfold_cv()}}. See the regression vignette for
See the regression vignette for +several examples.} + \item{\code{regression.recipe_func}}{Either \code{NULL} (default) or a function that that takes in a \code{\link[recipes:recipe]{recipes::recipe()}} +object and returns a modified \code{\link[recipes:recipe]{recipes::recipe()}} with potentially additional recipe steps. See the regression +vignette for several examples. +Note, to make it easier to call \code{explain()} from Python, the \code{regression.recipe_func} can also be a string +containing an R function. For example, +\code{"function(recipe) return(recipes::step_ns(recipe, recipes::all_numeric_predictors(), deg_free = 2))"} is also +a valid input. It is essential to include the package prefix if the package is not loaded.} + \item{\code{regression.surrogate_n_comb}}{Integer (default is \code{internal$parameters$used_n_combinations}) specifying the +number of unique combinations/coalitions to apply to each training observation. Maximum allowed value is +"\code{internal$parameters$used_n_combinations} - 2". By default, we use all coalitions, but this can take a lot of memory +in larger dimensions. Note that by "all", we mean all coalitions chosen by \code{shapr} to be used. This will be all +\eqn{2^{n_{\text{features}}}} coalitions (minus empty and grand coalition) if \code{shapr} is in the exact mode. If the +user sets a lower value than \code{internal$parameters$used_n_combinations}, then we sample this amount of unique +coalitions separately for each training observations. That is, on average, all coalitions should be equally trained.} + \item{\code{timeseries.fixed_sigma_vec}}{Numeric. (Default = 2) +Represents the kernel bandwidth in the distance computation. TODO: What length should it have? 1?} + \item{\code{timeseries.bounds}}{Numeric vector of length two. (Default = c(NULL, NULL)) +If one or both of these bounds are not NULL, we restrict the sampled time series to be +between these bounds. +This is useful if the underlying time series are scaled between 0 and 1, for example.} + \item{\code{vaeac.depth}}{Positive integer (default is \code{3}). The number of hidden layers +in the neural networks of the masked encoder, full encoder, and decoder.} + \item{\code{vaeac.width}}{Positive integer (default is \code{32}). The number of neurons in each +hidden layer in the neural networks of the masked encoder, full encoder, and decoder.} + \item{\code{vaeac.latent_dim}}{Positive integer (default is \code{8}). The number of dimensions in the latent space.} + \item{\code{vaeac.lr}}{Positive numeric (default is \code{0.001}). The learning rate used in the \code{\link[torch:optim_adam]{torch::optim_adam()}} optimizer.} + \item{\code{vaeac.activation_function}}{An \code{\link[torch:nn_module]{torch::nn_module()}} representing an activation function such as, e.g., +\code{\link[torch:nn_relu]{torch::nn_relu()}} (default), \code{\link[torch:nn_leaky_relu]{torch::nn_leaky_relu()}}, \code{\link[torch:nn_selu]{torch::nn_selu()}}, or \code{\link[torch:nn_sigmoid]{torch::nn_sigmoid()}}.} + \item{\code{vaeac.n_vaeacs_initialize}}{Positive integer (default is \code{4}). The number of different vaeac models to initiate +in the start. Pick the best performing one after \code{vaeac.extra_parameters$epochs_initiation_phase} +epochs (default is \code{2}) and continue training that one.} + \item{\code{vaeac.epochs}}{Positive integer (default is \code{100}). The number of epochs to train the final vaeac model. 
+This includes \code{vaeac.extra_parameters$epochs_initiation_phase}, where the default is \code{2}.}
+    \item{\code{vaeac.extra_parameters}}{Named list with extra parameters to the \code{vaeac} approach. See
+\code{\link[=vaeac_get_extra_para_default]{vaeac_get_extra_para_default()}} for description of possible additional parameters and their default values.}
+  }}
+}
+\description{
+This helper function displays the specific arguments applicable to the different
+approaches. Note that when calling \code{\link[=explain]{explain()}} from Python, the parameters
+are renamed from the form \code{approach.parameter_name} to \code{approach_parameter_name}.
+That is, an underscore has replaced the dot as the dot is reserved in Python.
+}
+\author{
+Lars Henry Berge Olsen and Martin Jullum
+}
diff --git a/man/finalize_explanation.Rd b/man/finalize_explanation.Rd
index ffff80604..ee74c8903 100644
--- a/man/finalize_explanation.Rd
+++ b/man/finalize_explanation.Rd
@@ -49,10 +49,12 @@ Computes dependence-aware Shapley values for observations in \code{x_explain} from the specified
 \code{model} by using the method specified in \code{approach} to estimate the conditional expectation.
 }
 \details{
-The most important thing to notice is that \code{shapr} has implemented six different
-approaches for estimating the conditional distributions of the data, namely \code{"empirical"},
+The most important thing to notice is that \code{shapr} has implemented eight different
+Monte Carlo-based approaches for estimating the conditional distributions of the data, namely \code{"empirical"},
 \code{"gaussian"}, \code{"copula"}, \code{"ctree"}, \code{"vaeac"}, \code{"categorical"}, \code{"timeseries"}, and \code{"independence"}.
-In addition, the user also has the option of combining the different approaches.
+\code{shapr} has also implemented two regression-based approaches, \code{"regression_separate"} and \code{"regression_surrogate"};
+see the separate vignette on the regression-based approaches for more information.
+In addition, the user has the option of combining the different Monte Carlo-based approaches.
 E.g., if you're in a situation where you have trained a model that consists of 10 features,
 and you'd like to use the \code{"gaussian"} approach when you condition on a single feature,
 the \code{"empirical"} approach if you condition on 2-5 features, and \code{"copula"} version
@@ -164,11 +166,35 @@ explain_groups <- explain(
 )
 print(explain_groups$shapley_values)

+# Separate and surrogate regression approaches with linear regression models.
+# More complex regression models can be used, and we can use CV to
+# tune the hyperparameters of the regression models and preprocess
+# the data before sending it to the model. See the regression vignette
+# (Shapley value explanations using the regression paradigm) for more
+# details about the `regression_separate` and `regression_surrogate` approaches.
+explain_separate_lm <- explain(
+  model = model,
+  x_explain = x_explain,
+  x_train = x_train,
+  prediction_zero = p,
+  approach = "regression_separate",
+  regression.model = parsnip::linear_reg()
+)
+
+explain_surrogate_lm <- explain(
+  model = model,
+  x_explain = x_explain,
+  x_train = x_train,
+  prediction_zero = p,
+  approach = "regression_surrogate",
+  regression.model = parsnip::linear_reg()
+)
+
 }
 \references{
 Aas, K., Jullum, M., & Løland, A. (2021). Explaining individual predictions when features are dependent:
 More accurate approximations to Shapley values. Artificial Intelligence, 298, 103502.
} \author{ -Martin Jullum +Martin Jullum, Lars Henry Berge Olsen } diff --git a/man/get_model_specs.Rd b/man/get_model_specs.Rd index dcad6b450..01f313952 100644 --- a/man/get_model_specs.Rd +++ b/man/get_model_specs.Rd @@ -1,7 +1,7 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/model.R, R/model_ar.R, R/model_arima.R, % R/model_glm.R, R/model_lm.R, R/model_mgcv_gam.R, R/model_ranger.R, -% R/model_xgboost.R +% R/model_workflow.R, R/model_xgboost.R \name{get_model_specs} \alias{get_model_specs} \alias{get_model_specs.default} @@ -12,6 +12,7 @@ \alias{get_model_specs.lm} \alias{get_model_specs.gam} \alias{get_model_specs.ranger} +\alias{get_model_specs.workflow} \alias{get_model_specs.xgb.Booster} \title{Fetches feature information from natively supported models} \usage{ @@ -33,6 +34,8 @@ get_model_specs(x) \method{get_model_specs}{ranger}(x) +\method{get_model_specs}{workflow}(x) + \method{get_model_specs}{xgb.Booster}(x) } \arguments{ diff --git a/man/model_checker.Rd b/man/model_checker.Rd index 313902a13..be755e88a 100644 --- a/man/model_checker.Rd +++ b/man/model_checker.Rd @@ -1,7 +1,7 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/model.R, R/model_ar.R, R/model_arima.R, % R/model_glm.R, R/model_lm.R, R/model_mgcv_gam.R, R/model_ranger.R, -% R/model_xgboost.R +% R/model_workflow.R, R/model_xgboost.R \name{model_checker} \alias{model_checker} \alias{model_checker.default} @@ -12,6 +12,7 @@ \alias{model_checker.lm} \alias{model_checker.gam} \alias{model_checker.ranger} +\alias{model_checker.workflow} \alias{model_checker.xgb.Booster} \title{Check that the type of model is supported by the native implementation of the model class} \usage{ @@ -33,6 +34,8 @@ model_checker(x) \method{model_checker}{ranger}(x) +\method{model_checker}{workflow}(x) + \method{model_checker}{xgb.Booster}(x) } \arguments{ diff --git a/man/predict_model.Rd b/man/predict_model.Rd index 9e2f06996..587b9e107 100644 --- a/man/predict_model.Rd +++ b/man/predict_model.Rd @@ -1,7 +1,7 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/model.R, R/model_ar.R, R/model_arima.R, % R/model_glm.R, R/model_lm.R, R/model_mgcv_gam.R, R/model_ranger.R, -% R/model_xgboost.R +% R/model_workflow.R, R/model_xgboost.R \name{predict_model} \alias{predict_model} \alias{predict_model.default} @@ -12,6 +12,7 @@ \alias{predict_model.lm} \alias{predict_model.gam} \alias{predict_model.ranger} +\alias{predict_model.workflow} \alias{predict_model.xgb.Booster} \title{Generate predictions for input data with specified model} \usage{ @@ -43,6 +44,8 @@ predict_model(x, newdata, ...) \method{predict_model}{ranger}(x, newdata, ...) +\method{predict_model}{workflow}(x, newdata, ...) + \method{predict_model}{xgb.Booster}(x, newdata, ...) } \arguments{ @@ -60,7 +63,8 @@ Performs prediction of response \code{\link[stats:lm]{stats::lm()}}, \code{\link[stats:glm]{stats::glm()}}, \code{\link[ranger:ranger]{ranger::ranger()}}, -\code{\link[mgcv:gam]{mgcv::gam()}} and +\code{\link[mgcv:gam]{mgcv::gam()}}, +\code{\link[workflows:workflow]{workflows::workflow()}} (i.e., \code{tidymodels} models), and \code{\link[xgboost:xgb.train]{xgboost::xgb.train()}} with binary or continuous response. See details for more information. 
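A brief hedged sketch of the new method (reusing the illustrative model_workflow and x_train objects from the check_model_workflow.R script above): predict_model() dispatches on the "workflow" class, wraps predict(), and returns the .pred column as a numeric vector.

p_hat <- predict_model(model_workflow, newdata = x_train)
head(p_hat) # numeric vector, identical to predict(model_workflow, x_train)$.pred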
@@ -74,6 +78,7 @@ The following models are currently supported:
 \item \code{\link[stats:glm]{stats::glm()}}
 \item \code{\link[ranger:ranger]{ranger::ranger()}}
 \item \code{\link[mgcv:gam]{mgcv::gam()}}
+\item \code{\link[workflows:workflow]{workflows::workflow()}}
 \item \code{\link[xgboost:xgb.train]{xgboost::xgb.train()}}
 }
diff --git a/man/prepare_data.Rd b/man/prepare_data.Rd
index 0d6098204..d7d6d7f39 100644
--- a/man/prepare_data.Rd
+++ b/man/prepare_data.Rd
@@ -1,8 +1,9 @@
 % Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/approach.R, R/approach_categorical.R,
 % R/approach_copula.R, R/approach_ctree.R, R/approach_empirical.R,
-% R/approach_gaussian.R, R/approach_independence.R, R/approach_timeseries.R,
-% R/approach_vaeac.R
+% R/approach_gaussian.R, R/approach_independence.R,
+% R/approach_regression_separate.R, R/approach_regression_surrogate.R,
+% R/approach_timeseries.R, R/approach_vaeac.R
 \name{prepare_data}
 \alias{prepare_data}
 \alias{prepare_data.categorical}
@@ -11,6 +12,8 @@
 \alias{prepare_data.empirical}
 \alias{prepare_data.gaussian}
 \alias{prepare_data.independence}
+\alias{prepare_data.regression_separate}
+\alias{prepare_data.regression_surrogate}
 \alias{prepare_data.timeseries}
 \alias{prepare_data.vaeac}
 \title{Generate data used for predictions and Monte Carlo integration}
@@ -29,6 +32,10 @@ prepare_data(internal, index_features = NULL, ...)
 \method{prepare_data}{independence}(internal, index_features = NULL, ...)

+\method{prepare_data}{regression_separate}(internal, index_features = NULL, ...)
+
+\method{prepare_data}{regression_surrogate}(internal, index_features = NULL, ...)
+
 \method{prepare_data}{timeseries}(internal, index_features = NULL, ...)

 \method{prepare_data}{vaeac}(internal, index_features = NULL, ...)
diff --git a/man/regression.check_namespaces.Rd b/man/regression.check_namespaces.Rd
new file mode 100644
index 000000000..772f44c03
--- /dev/null
+++ b/man/regression.check_namespaces.Rd
@@ -0,0 +1,16 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/approach_regression_separate.R
+\name{regression.check_namespaces}
+\alias{regression.check_namespaces}
+\title{Check that needed libraries are installed}
+\usage{
+regression.check_namespaces()
+}
+\description{
+This function checks that the \code{parsnip}, \code{recipes}, \code{workflows}, \code{tune}, \code{dials},
+\code{yardstick}, \code{hardhat}, \code{rsample}, and \code{rlang} packages are available.
+}
+\author{
+Lars Henry Berge Olsen
+}
+\keyword{internal}
diff --git a/man/regression.check_parameters.Rd b/man/regression.check_parameters.Rd
new file mode 100644
index 000000000..fbe747374
--- /dev/null
+++ b/man/regression.check_parameters.Rd
@@ -0,0 +1,24 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/approach_regression_separate.R
+\name{regression.check_parameters}
+\alias{regression.check_parameters}
+\title{Check regression parameters}
+\usage{
+regression.check_parameters(internal)
+}
+\arguments{
+\item{internal}{List.
+Holds all parameters, data, functions and computed objects used within \code{\link[=explain]{explain()}}
+The list contains one or more of the elements \code{parameters}, \code{data}, \code{objects}, \code{output}.}
+}
+\value{
+The same \code{internal} list, but with the added logical indicator \code{internal$parameters$regression.tune}
+stating whether we are to tune the regression model/models.
+}
+\description{
+Check regression parameters
+}
+\author{
+Lars Henry Berge Olsen
+}
+\keyword{internal}
diff --git a/man/regression.check_recipe_func.Rd b/man/regression.check_recipe_func.Rd
new file mode 100644
index 000000000..7a009a128
--- /dev/null
+++ b/man/regression.check_recipe_func.Rd
@@ -0,0 +1,28 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/approach_regression_separate.R
+\name{regression.check_recipe_func}
+\alias{regression.check_recipe_func}
+\title{Check \code{regression.recipe_func}}
+\usage{
+regression.check_recipe_func(regression.recipe_func, x_explain)
+}
+\arguments{
+\item{regression.recipe_func}{Either \code{NULL} (default) or a function that takes in a \code{\link[recipes:recipe]{recipes::recipe()}}
+object and returns a modified \code{\link[recipes:recipe]{recipes::recipe()}} with potentially additional recipe steps. See the regression
+vignette for several examples.
+Note, to make it easier to call \code{explain()} from Python, the \code{regression.recipe_func} can also be a string
+containing an R function. For example,
+\code{"function(recipe) return(recipes::step_ns(recipe, recipes::all_numeric_predictors(), deg_free = 2))"} is also
+a valid input. It is essential to include the package prefix if the package is not loaded.}
+
+\item{x_explain}{A matrix or data.frame/data.table.
+Contains the features whose predictions ought to be explained.}
+}
+\description{
+Check that regression.recipe_func is a function that returns the
+RHS of the formula for arbitrary feature name inputs.
+}
+\author{
+Lars Henry Berge Olsen
+}
+\keyword{internal}
diff --git a/man/regression.check_sur_n_comb.Rd b/man/regression.check_sur_n_comb.Rd
new file mode 100644
index 000000000..1ede6d346
--- /dev/null
+++ b/man/regression.check_sur_n_comb.Rd
@@ -0,0 +1,26 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/approach_regression_surrogate.R
+\name{regression.check_sur_n_comb}
+\alias{regression.check_sur_n_comb}
+\title{Check the \code{regression.surrogate_n_comb} parameter}
+\usage{
+regression.check_sur_n_comb(regression.surrogate_n_comb, used_n_combinations)
+}
+\arguments{
+\item{regression.surrogate_n_comb}{Integer (default is \code{internal$parameters$used_n_combinations}) specifying the
+number of unique combinations/coalitions to apply to each training observation. Maximum allowed value is
+"\code{internal$parameters$used_n_combinations} - 2". By default, we use all coalitions, but this can take a lot of memory
+in larger dimensions. Note that by "all", we mean all coalitions chosen by \code{shapr} to be used. This will be all
+\eqn{2^{n_{\text{features}}}} coalitions (minus empty and grand coalition) if \code{shapr} is in the exact mode. If the
+user sets a lower value than \code{internal$parameters$used_n_combinations}, then we sample this amount of unique
+coalitions separately for each training observation. That is, on average, all coalitions should be equally trained.}
+
+\item{used_n_combinations}{Integer. The number of used combinations (including the empty and grand coalitions).}
+}
+\description{
+Check that \code{regression.surrogate_n_comb} is either NULL or a valid integer.
+}
+\author{
+Lars Henry Berge Olsen
+}
+\keyword{internal}
diff --git a/man/regression.check_vfold_cv_para.Rd b/man/regression.check_vfold_cv_para.Rd
new file mode 100644
index 000000000..d26c7d922
--- /dev/null
+++ b/man/regression.check_vfold_cv_para.Rd
@@ -0,0 +1,20 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/approach_regression_separate.R
+\name{regression.check_vfold_cv_para}
+\alias{regression.check_vfold_cv_para}
+\title{Check the parameters that are sent to \code{\link[rsample:vfold_cv]{rsample::vfold_cv()}}}
+\usage{
+regression.check_vfold_cv_para(regression.vfold_cv_para)
+}
+\arguments{
+\item{regression.vfold_cv_para}{Either \code{NULL} (default) or a named list containing
+the parameters to be sent to \code{\link[rsample:vfold_cv]{rsample::vfold_cv()}}. See the regression vignette for
+several examples.}
+}
+\description{
+Check that \code{regression.vfold_cv_para} is either NULL or a named list that only contains recognized parameters.
+}
+\author{
+Lars Henry Berge Olsen
+}
+\keyword{internal}
diff --git a/man/regression.cv_message.Rd b/man/regression.cv_message.Rd
new file mode 100644
index 000000000..145e514a0
--- /dev/null
+++ b/man/regression.cv_message.Rd
@@ -0,0 +1,22 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/approach_regression_separate.R
+\name{regression.cv_message}
+\alias{regression.cv_message}
+\title{Produce message summarizing the cross-validation results}
+\usage{
+regression.cv_message(regression.results, regression.grid, n_cv = 10)
+}
+\arguments{
+\item{regression.results}{The results of the CV procedures.}
+
+\item{regression.grid}{Object containing the hyperparameter values.}
+
+\item{n_cv}{Integer (default is 10) specifying the number of CV hyperparameter configurations to print.}
+}
+\description{
+Produce message summarizing the cross-validation results
+}
+\author{
+Lars Henry Berge Olsen
+}
+\keyword{internal}
diff --git a/man/regression.get_string_to_R.Rd b/man/regression.get_string_to_R.Rd
new file mode 100644
index 000000000..8b306177d
--- /dev/null
+++ b/man/regression.get_string_to_R.Rd
@@ -0,0 +1,18 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/approach_regression_separate.R
+\name{regression.get_string_to_R}
+\alias{regression.get_string_to_R}
+\title{Convert the string into an R object}
+\usage{
+regression.get_string_to_R(string)
+}
+\arguments{
+\item{string}{A character vector/string containing the text to convert into R code.}
+}
+\description{
+Convert the string into an R object
+}
+\author{
+Lars Henry Berge Olsen
+}
+\keyword{internal}
diff --git a/man/regression.get_tune.Rd b/man/regression.get_tune.Rd
new file mode 100644
index 000000000..7c5440741
--- /dev/null
+++ b/man/regression.get_tune.Rd
@@ -0,0 +1,43 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/approach_regression_separate.R
+\name{regression.get_tune}
+\alias{regression.get_tune}
+\title{Determine if the model is to be tuned}
+\usage{
+regression.get_tune(regression.model, regression.tune_values, x_train)
+}
+\arguments{
+\item{regression.model}{A \code{tidymodels} object of class \code{model_specs}. Default is a linear regression model, i.e.,
+\code{\link[parsnip:linear_reg]{parsnip::linear_reg()}}. See \href{https://www.tidymodels.org/find/parsnip/}{tidymodels} for all possible models,
+and see the vignette for how to add new/own models.
Note, to make it easier to call \code{explain()} from Python, the
+\code{regression.model} parameter can also be a string specifying the model which will be parsed and evaluated. For
+example, \verb{"parsnip::rand_forest(mtry = hardhat::tune(), trees = 100, engine = "ranger", mode = "regression")"}
+is also a valid input. It is essential to include the package prefix if the package is not loaded.}
+
+\item{regression.tune_values}{Either \code{NULL} (default), a data.frame/data.table/tibble, or a function.
+The data.frame must contain the possible hyperparameter value combinations to try.
+The column names must match the names of the tuneable parameters specified in \code{regression.model}.
+If \code{regression.tune_values} is a function, then it should take one argument \code{x} which is the training data
+for the current combination/coalition and returns a data.frame/data.table/tibble with the properties described above.
+Using a function allows the hyperparameter values to change based on the size of the combination. See the regression
+vignette for several examples.
+Note, to make it easier to call \code{explain()} from Python, the \code{regression.tune_values} can also be a string
+containing an R function. For example,
+\code{"function(x) return(dials::grid_regular(dials::mtry(c(1, ncol(x))), levels = 3))"} is also a valid input.
+It is essential to include the package prefix if the package is not loaded.}
+
+\item{x_train}{Matrix or data.frame/data.table.
+Contains the data used to estimate the (conditional) distributions for the features
+needed to properly estimate the conditional expectations in the Shapley formula.}
+}
+\value{
+A boolean variable indicating if the regression model is to be tuned.
+}
+\description{
+That is, if the regression model contains hyperparameters we are to tune using cross validation.
+See \href{https://www.tidymodels.org/find/parsnip/#model-args}{tidymodels} for default model hyperparameters.
+}
+\author{
+Lars Henry Berge Olsen
+}
+\keyword{internal}
diff --git a/man/regression.get_y_hat.Rd b/man/regression.get_y_hat.Rd
new file mode 100644
index 000000000..6b03d3d49
--- /dev/null
+++ b/man/regression.get_y_hat.Rd
@@ -0,0 +1,32 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/approach_regression_separate.R
+\name{regression.get_y_hat}
+\alias{regression.get_y_hat}
+\title{Get the predicted responses}
+\usage{
+regression.get_y_hat(internal, model, predict_model)
+}
+\arguments{
+\item{internal}{List.
+Holds all parameters, data, functions and computed objects used within \code{\link[=explain]{explain()}}
+The list contains one or more of the elements \code{parameters}, \code{data}, \code{objects}, \code{output}.}
+
+\item{model}{Object.
+The model object that ought to be explained.
+See the documentation of \code{\link[=explain]{explain()}} for details.}
+
+\item{predict_model}{Function.
+The prediction function used when \code{model} is not natively supported.
+See the documentation of \code{\link[=explain]{explain()}} for details.}
+}
+\value{
+The same \code{internal} list, but with the added vectors \code{internal$data$x_train_y_hat} and
+\code{internal$data$x_explain_y_hat} containing the predicted responses of the training and explain data.
+} +\description{ +Get the predicted responses +} +\author{ +Lars Henry Berge Olsen +} +\keyword{internal} diff --git a/man/regression.prep_message_batch.Rd b/man/regression.prep_message_batch.Rd new file mode 100644 index 000000000..9b8a942e2 --- /dev/null +++ b/man/regression.prep_message_batch.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/approach_regression_separate.R +\name{regression.prep_message_batch} +\alias{regression.prep_message_batch} +\title{Produce message about which batch prepare_data is working on} +\usage{ +regression.prep_message_batch(internal, index_features) +} +\arguments{ +\item{internal}{List. +Holds all parameters, data, functions and computed objects used within \code{\link[=explain]{explain()}} +The list contains one or more of the elements \code{parameters}, \code{data}, \code{objects}, \code{output}.} + +\item{index_features}{Positive integer vector. Specifies the indices of combinations to +apply to the present method. \code{NULL} means all combinations. Only used internally.} +} +\description{ +Produce message about which batch prepare_data is working on +} +\author{ +Lars Henry Berge Olsen +} +\keyword{internal} diff --git a/man/regression.prep_message_comb.Rd b/man/regression.prep_message_comb.Rd new file mode 100644 index 000000000..84739b82a --- /dev/null +++ b/man/regression.prep_message_comb.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/approach_regression_separate.R +\name{regression.prep_message_comb} +\alias{regression.prep_message_comb} +\title{Produce message about which combination prepare_data is working on} +\usage{ +regression.prep_message_comb(internal, index_features, comb_idx) +} +\arguments{ +\item{internal}{List. +Holds all parameters, data, functions and computed objects used within \code{\link[=explain]{explain()}} +The list contains one or more of the elements \code{parameters}, \code{data}, \code{objects}, \code{output}.} + +\item{index_features}{Positive integer vector. Specifies the indices of combinations to +apply to the present method. \code{NULL} means all combinations. Only used internally.} + +\item{comb_idx}{Integer. 
The index of the combination in a specific batch.} +} +\description{ +Produce a message about which combination prepare_data is working on +} +\author{ +Lars Henry Berge Olsen +} +\keyword{internal} diff --git a/man/regression.separate_time_mess.Rd b/man/regression.separate_time_mess.Rd new file mode 100644 index 000000000..cf0438000 --- /dev/null +++ b/man/regression.separate_time_mess.Rd @@ -0,0 +1,15 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/approach_regression_separate.R +\name{regression.separate_time_mess} +\alias{regression.separate_time_mess} +\title{Produce time message for separate regression} +\usage{ +regression.separate_time_mess() +} +\description{ +Produce time message for separate regression +} +\author{ +Lars Henry Berge Olsen +} +\keyword{internal} diff --git a/man/regression.surrogate_aug_data.Rd b/man/regression.surrogate_aug_data.Rd new file mode 100644 index 000000000..8ebd0ccbd --- /dev/null +++ b/man/regression.surrogate_aug_data.Rd @@ -0,0 +1,62 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/approach_regression_surrogate.R +\name{regression.surrogate_aug_data} +\alias{regression.surrogate_aug_data} +\title{Augment the training data and the explicands} +\usage{ +regression.surrogate_aug_data( + internal, + x, + y_hat = NULL, + index_features = NULL, + augment_masks_as_factor = FALSE, + augment_include_grand = FALSE, + augment_add_id_comb = FALSE, + augment_comb_prob = NULL, + augment_weights = NULL +) +} +\arguments{ +\item{internal}{List. +Holds all parameters, data, functions and computed objects used within \code{\link[=explain]{explain()}}. +The list contains one or more of the elements \code{parameters}, \code{data}, \code{objects}, \code{output}.} + +\item{x}{Data.table containing the data. Either the training data or the explicands. If \code{x} is the explicands, +then \code{index_features} must be provided.} + +\item{y_hat}{Vector of numerics (optional) containing the predicted responses for the observations in \code{x}.} + +\item{index_features}{Array of integers (optional) containing which coalitions to consider. Must be provided if +\code{x} is the explicands.} + +\item{augment_masks_as_factor}{Logical (default is \code{FALSE}). If \code{TRUE}, then the binary masks are converted +to factors. If \code{FALSE}, then the binary masks are numerics.} + +\item{augment_include_grand}{Logical (default is \code{FALSE}). If \code{TRUE}, then the grand coalition is included. +If \code{index_features} are provided, then \code{augment_include_grand} has no effect. Note that if we sample the +combinations, then the grand coalition is equally likely to be sampled as the other coalitions (or weighted if +\code{augment_comb_prob} is provided).} + +\item{augment_add_id_comb}{Logical (default is \code{FALSE}). If \code{TRUE}, an additional column is added indicating +which coalition was applied.} + +\item{augment_comb_prob}{Array of numerics (default is \code{NULL}). The length of the array must match the number of +combinations being considered, where each entry specifies the probability of sampling the corresponding coalition. +This is useful if we want to generate more training data for some specific coalitions. One possible choice would be +\code{augment_comb_prob = if (use_Shapley_weights) internal$objects$X$shapley_weight[2:actual_n_combinations] else NULL}.} + +\item{augment_weights}{String (optional) specifying which type of weights to add to the observations.
+If \code{NULL} (default), then no weights are added. If \code{"Shapley"}, then the Shapley weights for the different +combinations are added to the corresponding observations where the coalition was applied. If \code{"uniform"}, then +all observations get an equal weight of one.} +} +\value{ +A data.table containing the augmented data. +} +\description{ +Augment the training data and the explicands +} +\author{ +Lars Henry Berge Olsen +} +\keyword{internal} diff --git a/man/regression.train_model.Rd b/man/regression.train_model.Rd new file mode 100644 index 000000000..8ee6b669a --- /dev/null +++ b/man/regression.train_model.Rd @@ -0,0 +1,83 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/approach_regression_separate.R +\name{regression.train_model} +\alias{regression.train_model} +\title{Train a tidymodels model via workflows} +\usage{ +regression.train_model( + x, + seed = 1, + verbose = 0, + regression.model = parsnip::linear_reg(), + regression.tune = FALSE, + regression.tune_values = NULL, + regression.vfold_cv_para = NULL, + regression.recipe_func = NULL, + regression.response_var = "y_hat", + regression.surrogate_n_comb = NULL +) +} +\arguments{ +\item{x}{Data.table containing the data. Either the training data or the explicands. If \code{x} is the explicands, +then \code{index_features} must be provided.} + +\item{seed}{Positive integer. +Specifies the seed before any randomness-based code is run. +If \code{NULL}, the seed will be inherited from the calling environment.} + +\item{verbose}{An integer specifying the level of verbosity. If \code{0}, \code{shapr} will stay silent. +If \code{1}, it will print information about performance. If \code{2}, some additional information will be printed out. +Use \code{0} (default) for no verbosity, \code{1} for low verbose, and \code{2} for high verbose. +TODO: Make this clearer when we end up fixing this and if they should force a progressr bar.} + +\item{regression.model}{A \code{tidymodels} object of class \code{model_spec}. Default is a linear regression model, i.e., +\code{\link[parsnip:linear_reg]{parsnip::linear_reg()}}. See \href{https://www.tidymodels.org/find/parsnip/}{tidymodels} for all possible models, +and see the vignette for how to add new/own models. Note that, to make it easier to call \code{explain()} from Python, the +\code{regression.model} parameter can also be a string specifying the model, which is then parsed and evaluated. For +example, \verb{"parsnip::rand_forest(mtry = hardhat::tune(), trees = 100, engine = "ranger", mode = "regression")"} +is also a valid input. It is essential to include the package prefix if the package is not loaded.} + +\item{regression.tune}{Logical (default is \code{FALSE}). If \code{TRUE}, then we are to tune the hyperparameters based on +the values provided in \code{regression.tune_values}. Note that no checks are conducted, as this is checked earlier in +\code{setup_approach.regression_separate} and \code{setup_approach.regression_surrogate}.} + +\item{regression.tune_values}{Either \code{NULL} (default), a data.frame/data.table/tibble, or a function. +The data.frame must contain the possible hyperparameter value combinations to try. +The column names must match the names of the tunable parameters specified in \code{regression.model}.
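For instance, a tunable model paired with a matching grid could look like the following (a sketch; the values mirror the snapshot tests later in this diff):

```r
# The `tree_depth` column name matches the parameter marked with
# hardhat::tune() in the model specification.
regression.model <- parsnip::decision_tree(
  tree_depth = hardhat::tune(), engine = "rpart", mode = "regression"
)
regression.tune_values <- data.frame(tree_depth = c(1, 2, 3))
```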
+If \code{regression.tune_values} is a function, then it should take one argument \code{x}, which is the training data +for the current combination/coalition, and return a data.frame/data.table/tibble with the properties described above. +Using a function allows the hyperparameter values to change based on the size of the combination. See the regression +vignette for several examples. +Note that, to make it easier to call \code{explain()} from Python, the \code{regression.tune_values} can also be a string +containing an R function. For example, +\code{"function(x) return(dials::grid_regular(dials::mtry(c(1, ncol(x))), levels = 3))"} is also a valid input. +It is essential to include the package prefix if the package is not loaded.} + +\item{regression.vfold_cv_para}{Either \code{NULL} (default) or a named list containing +the parameters to be sent to \code{\link[rsample:vfold_cv]{rsample::vfold_cv()}}. See the regression vignette for +several examples.} + +\item{regression.recipe_func}{Either \code{NULL} (default) or a function that takes in a \code{\link[recipes:recipe]{recipes::recipe()}} +object and returns a modified \code{\link[recipes:recipe]{recipes::recipe()}} with potentially additional recipe steps. See the regression +vignette for several examples. +Note that, to make it easier to call \code{explain()} from Python, the \code{regression.recipe_func} can also be a string +containing an R function. For example, +\code{"function(recipe) return(recipes::step_ns(recipe, recipes::all_numeric_predictors(), deg_free = 2))"} is also +a valid input. It is essential to include the package prefix if the package is not loaded.} + +\item{regression.response_var}{String (default is \code{"y_hat"}) containing the name of the response variable.} + +\item{regression.surrogate_n_comb}{Integer (default is \code{NULL}). The number of times each training observation +has been augmented. If \code{NULL}, then we assume that we are doing separate regression.} +} +\value{ +A trained \code{tidymodels} model based on the provided input parameters. +} +\description{ +Function that trains a \code{tidymodels} model via \code{workflows} based on the provided input parameters. +This function allows for cross-validating the hyperparameters of the model. +} +\author{ +Lars Henry Berge Olsen +} +\keyword{internal} diff --git a/man/setup.Rd b/man/setup.Rd index 1b096533d..fce91a6b0 100644 --- a/man/setup.Rd +++ b/man/setup.Rd @@ -43,8 +43,8 @@ Contains the features whose predictions ought to be explained.} \item{approach}{Character vector of length \code{1} or one less than the number of features. All elements should either be \code{"gaussian"}, \code{"copula"}, \code{"empirical"}, \code{"ctree"}, \code{"vaeac"}, -\code{"categorical"}, \code{"timeseries"}, or \code{"independence"}. -See details for more information.} +\code{"categorical"}, \code{"timeseries"}, \code{"independence"}, \code{"regression_separate"}, or \code{"regression_surrogate"}. +The two regression approaches cannot be combined with any other approach. See details for more information.} \item{prediction_zero}{Numeric. The prediction value for unseen data, i.e.
an estimate of the expected prediction without conditioning on any diff --git a/man/setup_approach.Rd b/man/setup_approach.Rd index c1b955efc..cf1ee8d0d 100644 --- a/man/setup_approach.Rd +++ b/man/setup_approach.Rd @@ -1,8 +1,9 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/approach.R, R/approach_categorical.R, % R/approach_copula.R, R/approach_ctree.R, R/approach_empirical.R, -% R/approach_gaussian.R, R/approach_independence.R, R/approach_timeseries.R, -% R/approach_vaeac.R +% R/approach_gaussian.R, R/approach_independence.R, +% R/approach_regression_separate.R, R/approach_regression_surrogate.R, +% R/approach_timeseries.R, R/approach_vaeac.R \name{setup_approach} \alias{setup_approach} \alias{setup_approach.categorical} @@ -11,6 +12,8 @@ \alias{setup_approach.empirical} \alias{setup_approach.gaussian} \alias{setup_approach.independence} +\alias{setup_approach.regression_separate} +\alias{setup_approach.regression_surrogate} \alias{setup_approach.timeseries} \alias{setup_approach.vaeac} \title{Set up the framework for the chosen approach} \usage{ setup_approach(internal, ...) @@ -53,6 +56,25 @@ setup_approach(internal, ...) \method{setup_approach}{independence}(internal, ...) +\method{setup_approach}{regression_separate}( + internal, + regression.model = parsnip::linear_reg(), + regression.tune_values = NULL, + regression.vfold_cv_para = NULL, + regression.recipe_func = NULL, + ... +) + +\method{setup_approach}{regression_surrogate}( + internal, + regression.model = parsnip::linear_reg(), + regression.tune_values = NULL, + regression.vfold_cv_para = NULL, + regression.recipe_func = NULL, + regression.surrogate_n_comb = internal$parameters$used_n_combinations - 2, + ... +) + \method{setup_approach}{timeseries}( internal, timeseries.fixed_sigma_vec = 2, @@ -156,6 +178,45 @@ Containing the mean of the data generating distribution. Containing the covariance matrix of the data generating distribution. \code{NULL} means it is estimated from the \code{x_train}.} +\item{regression.model}{A \code{tidymodels} object of class \code{model_spec}. Default is a linear regression model, i.e., +\code{\link[parsnip:linear_reg]{parsnip::linear_reg()}}. See \href{https://www.tidymodels.org/find/parsnip/}{tidymodels} for all possible models, +and see the vignette for how to add new/own models. Note that, to make it easier to call \code{explain()} from Python, the +\code{regression.model} parameter can also be a string specifying the model, which is then parsed and evaluated. For +example, \verb{"parsnip::rand_forest(mtry = hardhat::tune(), trees = 100, engine = "ranger", mode = "regression")"} +is also a valid input. It is essential to include the package prefix if the package is not loaded.} + +\item{regression.tune_values}{Either \code{NULL} (default), a data.frame/data.table/tibble, or a function. +The data.frame must contain the possible hyperparameter value combinations to try. +The column names must match the names of the tunable parameters specified in \code{regression.model}. +If \code{regression.tune_values} is a function, then it should take one argument \code{x}, which is the training data +for the current combination/coalition, and return a data.frame/data.table/tibble with the properties described above. +Using a function allows the hyperparameter values to change based on the size of the combination. See the regression +vignette for several examples. +Note that, to make it easier to call \code{explain()} from Python, the \code{regression.tune_values} can also be a string +containing an R function.
For example, +\code{"function(x) return(dials::grid_regular(dials::mtry(c(1, ncol(x))), levels = 3))"} is also a valid input. +It is essential to include the package prefix if the package is not loaded.} + +\item{regression.vfold_cv_para}{Either \code{NULL} (default) or a named list containing +the parameters to be sent to \code{\link[rsample:vfold_cv]{rsample::vfold_cv()}}. See the regression vignette for +several examples.} + +\item{regression.recipe_func}{Either \code{NULL} (default) or a function that takes in a \code{\link[recipes:recipe]{recipes::recipe()}} +object and returns a modified \code{\link[recipes:recipe]{recipes::recipe()}} with potentially additional recipe steps. See the regression +vignette for several examples. +Note that, to make it easier to call \code{explain()} from Python, the \code{regression.recipe_func} can also be a string +containing an R function. For example, +\code{"function(recipe) return(recipes::step_ns(recipe, recipes::all_numeric_predictors(), deg_free = 2))"} is also +a valid input. It is essential to include the package prefix if the package is not loaded.} + +\item{regression.surrogate_n_comb}{Integer (default is \code{internal$parameters$used_n_combinations - 2}) specifying the +number of unique combinations/coalitions to apply to each training observation. The maximum allowed value is +"\code{internal$parameters$used_n_combinations} - 2". By default, we use all coalitions, but this can take a lot of memory +in larger dimensions. Note that by "all", we mean all coalitions chosen by \code{shapr} to be used. This will be all +\eqn{2^{n_{\text{features}}}} coalitions (minus the empty and grand coalitions) if \code{shapr} is in the exact mode. If the +user sets a lower value than \code{internal$parameters$used_n_combinations}, then we sample this number of unique +coalitions separately for each training observation. That is, on average, all coalitions should be used equally often in the training.} + \item{timeseries.fixed_sigma_vec}{Numeric. (Default = 2) Represents the kernel bandwidth in the distance computation. TODO: What length should it have? 1?} diff --git a/man/vaeac_check_verbose.Rd b/man/vaeac_check_verbose.Rd index c5aecac15..73ab85049 100644 --- a/man/vaeac_check_verbose.Rd +++ b/man/vaeac_check_verbose.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/approach_vaeac.R \name{vaeac_check_verbose} \alias{vaeac_check_verbose} -\title{Function the checks the verbose parameter} +\title{Function that checks the verbose parameter} \usage{ vaeac_check_verbose(verbose) } @@ -14,7 +14,7 @@ vaeac_check_verbose(verbose) The function does not return anything. } \description{ -Function the checks the verbose parameter +Function that checks the verbose parameter } \author{ Lars Henry Berge Olsen diff --git a/python/README.md b/python/README.md index 6b5b1238a..b010fec77 100644 --- a/python/README.md +++ b/python/README.md @@ -60,3 +60,14 @@ print(df_shapley) For other models, one can provide a custom `predict_model` function (and optionally a custom `get_model_specs`) to `shaprpy.explain`. See `/examples` for runnable examples, including an example of a custom PyTorch model. + +The `/examples/regression_paradigm.py` file demonstrates how +to use the regression paradigm explained in +[Olsen et al. (2024)](https://link.springer.com/article/10.1007/s10618-024-01016-z). +We describe how to specify the regression model, how to enable automatic +cross-validation of the model's hyperparameters, and how to apply +pre-processing steps to the data before fitting the regression +models.
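Since the string-based parameters are evaluated as R code by `shaprpy`, it can help to see the R snippets they correspond to. A hedged illustration (hypothetical values, mirroring the tuned random forest example in `regression_paradigm.py` below):

```r
# R code equivalent of the strings passed from Python: a tunable random
# forest plus a tuning grid whose column names match the tuned parameters.
parsnip::rand_forest(
  mtry = hardhat::tune(), trees = hardhat::tune(),
  engine = "ranger", mode = "regression"
)
dials::grid_regular(dials::mtry(c(1, 8)), dials::trees(c(50, 750)), levels = 4)
```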
We refer to +[Olsen et al. (2024)](https://link.springer.com/article/10.1007/s10618-024-01016-z) +for guidance on when to use the different paradigms, method classes, and methods. + diff --git a/python/examples/regression_paradigm.py b/python/examples/regression_paradigm.py new file mode 100644 index 000000000..c5daab4c4 --- /dev/null +++ b/python/examples/regression_paradigm.py @@ -0,0 +1,194 @@ +# In this file, we demonstrate how to use the regression-based method from Python. +# For more details, we refer the reader to the vignette. There are two differences +# between the Python and R versions. +# First, in R, the parameter names have the structure 'regression.parameter_name', +# while in Python they have the structure 'regression_parameter_name'. This means +# that we should use e.g., 'regression_recipe_func' in Python and NOT 'regression.recipe_func'. +# Second, the parameters 'regression_model', 'regression_recipe_func', and 'regression_tune_values' +# must be provided as strings of functional R code. The latter only needs to be +# a string if it is a function, i.e., it can also be, e.g., a pandas data frame. + +import xgboost as xgb +from shaprpy import explain +from shaprpy.datasets import load_california_housing + +dfx_train, dfx_test, dfy_train, dfy_test = load_california_housing() + +# Fit model +model = xgb.XGBRegressor() +model.fit(dfx_train, dfy_train.values.flatten()) + +# Dictionary to store the explanations +explanation_list = {} + +# Explain the model using the empirical approach +explanation_list["empirical"] = explain( + model=model, + x_train=dfx_train, + x_explain=dfx_test, + approach='empirical', + prediction_zero=dfy_train.mean().item() +) + +# Explain the model using several separate regression methods +# Linear regression +explanation_list["sep_lm"] = explain( + model=model, + x_train=dfx_train, + x_explain=dfx_test, + approach='regression_separate', + prediction_zero=dfy_train.mean().item(), + verbose=2, + n_batches=1, + regression_model='parsnip::linear_reg()' +) + +# Principal component regression with three principal components +explanation_list["sep_pca"] = explain( + model=model, + x_train=dfx_train, + x_explain=dfx_test, + approach='regression_separate', + prediction_zero=dfy_train.mean().item(), + verbose=2, + n_batches=1, + regression_model='parsnip::linear_reg()', + regression_recipe_func='''function(regression_recipe) { + return(recipes::step_pca(regression_recipe, recipes::all_numeric_predictors(), num_comp = 3)) + }''' +) + +# Linear regression with natural splines (three degrees of freedom) +explanation_list["sep_splines"] = explain( + model=model, + x_train=dfx_train, + x_explain=dfx_test, + approach='regression_separate', + prediction_zero=dfy_train.mean().item(), + verbose=2, + n_batches=1, + regression_model='parsnip::linear_reg()', + regression_recipe_func='''function(regression_recipe) { + return(recipes::step_ns(regression_recipe, recipes::all_numeric_predictors(), deg_free = 3)) + }''' +) + +# Decision tree with cross-validated tree depth +explanation_list["sep_tree_cv"] = explain( + model=model, + x_train=dfx_train, + x_explain=dfx_test, + approach='regression_separate', + prediction_zero=dfy_train.mean().item(), + verbose=2, + n_batches=1, + regression_model="parsnip::decision_tree(tree_depth = hardhat::tune(), engine = 'rpart', mode = 'regression')", + regression_tune_values='dials::grid_regular(dials::tree_depth(), levels = 4)', + regression_vfold_cv_para={'v': 5} +) + +# XGBoost with default parameters +explanation_list["sep_xgboost"] = explain(
+ model=model, + x_train=dfx_train, + x_explain=dfx_test, + approach='regression_separate', + prediction_zero=dfy_train.mean().item(), + verbose=2, + n_batches=1, + regression_model="parsnip::boost_tree(engine = 'xgboost', mode = 'regression')" +) + +# XGBoost with cross-validated number of trees +explanation_list["sep_xgboost_cv"] = explain( + model=model, + x_train=dfx_train, + x_explain=dfx_test, + approach='regression_separate', + prediction_zero=dfy_train.mean().item(), + verbose=2, + n_batches=1, + regression_model="parsnip::boost_tree(trees = hardhat::tune(), engine = 'xgboost', mode = 'regression')", + regression_tune_values='expand.grid(trees = c(10, 15, 25, 50, 100, 500))', + regression_vfold_cv_para={'v': 5} +) + +# Explain the model using several surrogate regression methods +# Linear regression +explanation_list["sur_lm"] = explain( + model=model, + x_train=dfx_train, + x_explain=dfx_test, + approach='regression_surrogate', + prediction_zero=dfy_train.mean().item(), + verbose=2, + n_batches=1, + regression_model='parsnip::linear_reg()' +) + +# Using random forest with default parameters as the surrogate model +explanation_list["sur_rf"] = explain( + model=model, + x_train=dfx_train, + x_explain=dfx_test, + approach='regression_surrogate', + prediction_zero=dfy_train.mean().item(), + verbose=2, + n_batches=1, + regression_model="parsnip::rand_forest(engine = 'ranger', mode = 'regression')" +) + +# Using random forest with parameters tuned by cross-validation as the surrogate model +explanation_list["sur_rf_cv"] = explain( + model=model, + x_train=dfx_train, + x_explain=dfx_test, + approach='regression_surrogate', + prediction_zero=dfy_train.mean().item(), + verbose=2, + n_batches=1, + regression_model="""parsnip::rand_forest( + mtry = hardhat::tune(), trees = hardhat::tune(), engine = 'ranger', mode = 'regression' + )""", + regression_tune_values="""dials::grid_regular( + dials::mtry(c(1, 8)), + dials::trees(c(50, 750)), + levels = 4 + )""", + regression_vfold_cv_para={'v': 4} +) + +# Print the MSEv evaluation criterion scores +print("Method", "MSEv", "Time") +for method, explanation in explanation_list.items(): + print(method, round(explanation[4]["MSEv"]["MSEv"].iloc[0], 3), round(explanation[3]["total_time_secs"], 3)) + +""" +Method MSEv Time +empirical 0.826 1.096 +sep_lm 1.623 12.093 +sep_pca 1.626 16.435 +sep_splines 1.626 15.072 +sep_tree_cv 1.436 275.002 +sep_xgboost 0.769 13.870 +sep_xgboost_cv 0.802 312.758 +sur_lm 1.772 0.548 +sur_rf 0.886 41.250 +""" + +explanation_list["sep_xgboost"][0] + +""" + none MedInc HouseAge AveRooms AveBedrms Population AveOccup \ +1 2.205937 -0.496421 0.195272 -0.077923 0.010124 -0.219369 -0.316029 +2 2.205938 -0.163246 0.014565 -0.415945 -0.114073 0.084315 0.144754 +3 2.205938 0.574157 0.258926 0.090818 -0.665126 0.354005 0.869530 +4 2.205938 0.311416 -0.105142 0.211300 0.031939 -0.180331 -0.059839 +5 2.205938 0.077537 -0.150997 -0.117875 0.087118 -0.085118 0.414764 + Latitude Longitude +1 -0.434240 -0.361774 +2 -0.483618 -0.324016 +3 0.276002 0.957242 +4 0.028560 0.049815 +5 -0.242943 0.006815 +""" \ No newline at end of file diff --git a/python/examples/sklearn_regressor.py b/python/examples/sklearn_regressor.py index 4c2d0b39d..6f7d59067 100644 --- a/python/examples/sklearn_regressor.py +++ b/python/examples/sklearn_regressor.py @@ -46,7 +46,7 @@ 'B': ['AveBedrms','Population','AveOccup'], 'C': ['Latitude','Longitude']} -df_shapley_g, pred_explain_g, internal_g, timing_g = explain( +df_shapley_g,
pred_explain_g, internal_g, timing_g, MSEv_g = explain( model = model, x_train = dfx_train, x_explain = dfx_test, diff --git a/python/shaprpy/explain.py b/python/shaprpy/explain.py index c2fad550e..1e0642227 100644 --- a/python/shaprpy/explain.py +++ b/python/shaprpy/explain.py @@ -2,11 +2,11 @@ import numpy as np import pandas as pd from typing import Callable -from datetime import datetime, timedelta +from datetime import datetime import rpy2.robjects as ro from rpy2.robjects.packages import importr from rpy2.rinterface import NULL, NA -from .utils import r2py, py2r, recurse_r_tree +from shaprpy.utils import r2py, py2r, recurse_r_tree from rpy2.robjects.vectors import StrVector, ListVector data_table = importr('data.table') @@ -36,6 +36,7 @@ def explain( MSEv_uniform_comb_weights: bool = True, timing: bool = True, verbose: int | None = 0, + **kwargs, ): '''Explain the output of machine learning models with more accurately estimated Shapley values. @@ -44,7 +45,7 @@ Parameters ---------- - model: The model whose predictions we want to explain. + model: The model whose predictions we want to explain. `shaprpy` natively supports `sklearn`, `xgboost` and `keras` models. Unsupported models can still be explained by passing `predict_model` and (optionally) `get_model_specs`. x_explain: Contains the features whose predictions ought to be explained. @@ -54,14 +55,14 @@ `n_features` equals the total number of features in the model. All elements should either be `"gaussian"`, `"copula"`, `"empirical"`, `"ctree"`, `"categorical"`, `"timeseries"`, or `"independence"`. prediction_zero: The prediction value for unseen data, i.e. an estimate of the expected prediction without conditioning on any - features. Typically we set this value equal to the mean of the response variable in our training data, but other + features. Typically we set this value equal to the mean of the response variable in our training data, but other choices such as the mean of the predictions in the training data are also reasonable. n_combinations: If `group = None`, `n_combinations` represents the number of unique feature combinations to sample. If `group != None`, `n_combinations` represents the number of unique group combinations to sample. If `n_combinations = None`, the exact method is used and all combinations are considered. The maximum number of combinations equals `2^m`, where `m` is the number of features. group: If `None`, regular feature wise Shapley values are computed. - If a dict is provided, group wise Shapley values are computed. `group` then contains lists of unique feature names with the + If a dict is provided, group wise Shapley values are computed. `group` then contains lists of unique feature names with the features included in each of the different groups. The length of the dict equals the number of groups. n_samples: Indicating the maximum number of samples to use in the Monte Carlo integration for every conditional expectation. @@ -78,7 +79,7 @@ and a pandas.DataFrame to compute predictions for. The function must give the prediction as a numpy.Array. `None` (the default) uses functions specified internally. Can also be used to override the default function for natively supported model classes. - get_model_specs: An optional function for checking model/data consistency when `model` is not natively supported. + get_model_specs: An optional function for checking model/data consistency when `model` is not natively supported.
This method has yet to be implemented for keras models. The function takes `model` as an argument and provides a `dict` with 3 elements: - labels: list[str] with the names of each feature. @@ -94,6 +95,10 @@ timing: Indicates whether the timing of the different parts of the explain call should be saved and returned. verbose: An integer specifying the level of verbosity. If `0` (default), `shapr` will stay silent. If `1`, it will print information about performance. If `2`, some additional information will be printed out. + kwargs: Further arguments passed to specific approaches. See the R documentation of the function + `explain_tripledot_docs` for more information about the approach-specific arguments + (https://norskregnesentral.github.io/shapr/reference/explain_tripledot_docs.html). Note that the parameters + in R are called 'approach.parameter_name', but in Python the equivalent would be 'approach_parameter_name'. Returns ------- @@ -110,20 +115,27 @@ only over the explicands, and only over the coalitions. ''' - timing_list = { - "init_time": datetime.now() - } + timing_list = {"init_time": datetime.now()} base.set_seed(seed) + # Gets and checks feature specs from the model rfeature_specs = get_feature_specs(get_model_specs, model) # Fixes the conversion from dict to a named list of vectors in R - if group is None: - r_group = NULL - else: - r_group = ListVector({key: StrVector(value) for key, value in group.items()}) + r_group = NULL if group is None else ListVector({key: StrVector(value) for key, value in group.items()}) + # Fixes method-specific argument names by replacing the first occurrence of "_" with "." + if len(kwargs) > 0: + kwargs = change_first_underscore_to_dot(kwargs) + + # Convert from dict to a named list of vectors in R if `regression.vfold_cv_para` is provided by the user + if 'regression.vfold_cv_para' in kwargs: + kwargs['regression.vfold_cv_para'] = ListVector(kwargs['regression.vfold_cv_para']) + + # Sets up and organizes input parameters + # Checks the input parameters and their compatibility + # Checks data/model compatibility rinternal = shapr.setup( x_train = py2r(x_train), x_explain = py2r(x_explain), @@ -140,40 +152,54 @@ timing = timing, verbose = verbose, is_python=True, + **kwargs ) timing_list["setup"] = datetime.now() - predict_model = get_predict_model( - x_test = x_train.head(2), - predict_model = predict_model, - model = model, - ) + # Gets predict_model (if not passed to explain) and checks that predict_model gives the correct format + predict_model = get_predict_model(x_test=x_train.head(2), predict_model=predict_model, model=model) timing_list["test_prediction"] = datetime.now() + # Add the predicted responses of the training and explain data to the internal list for regression-based methods + using_regression_paradigm = rinternal.rx2("parameters").rx2("regression")[0] + if using_regression_paradigm: + rinternal = regression_get_y_hat(rinternal, model, predict_model, x_train, x_explain) + + # Sets up the Shapley framework and prepares the conditional expectation computation for the chosen approach rinternal = shapr.setup_computation(rinternal, NULL, NULL) + + # Compute the v(S): + # MC: + # 1. Get the samples for the conditional distributions with the specified approach + # 2. Predict with these samples + # 3. Perform MC integration on these to estimate the conditional expectation (v(S)) + # Regression: + # 1.
Directly estimate the conditional expectation (v(S)) using the fitted regression model(s) rvS_list = compute_vS(rinternal, model, predict_model) - - timing_list["compute_vS"] = datetime.now() + timing_list["compute_vS"] = datetime.now() - routput = shapr.finalize_explanation( - vS_list = rvS_list, - internal = rinternal, - ) + # Compute Shapley values based on conditional expectations (v(S)) + # Organize function output + routput = shapr.finalize_explanation(vS_list=rvS_list, internal=rinternal) timing_list["shapley_computation"] = datetime.now() - if timing==True: - timing = compute_time(timing_list) - else: - timing = None + # Compute the elapsed time for the different steps + timing = compute_time(timing_list) if timing else None + # If regression, then delete the regression/tidymodels objects in routput as they cannot be converted to python + if using_regression_paradigm: + routput = regression_remove_objects(routput) + + # Convert R objects to Python objects df_shapley = r2py(base.as_data_frame(routput.rx2('shapley_values'))) pred_explain = r2py(routput.rx2('pred_explain')) internal = recurse_r_tree(routput.rx2('internal')) MSEv = recurse_r_tree(routput.rx2('MSEv')) + return df_shapley, pred_explain, internal, timing, MSEv @@ -181,24 +207,28 @@ def compute_vS(rinternal, model, predict_model): S_batch = rinternal.rx2('objects').rx2('S_batch') ret = ro.ListVector({}) for i, S in enumerate(S_batch): - ret.rx2[i+1] = batch_compute_vS( - S=S, - rinternal=rinternal, - model=model, - predict_model=predict_model, - ) + ret.rx2[i+1] = batch_compute_vS(S=S, rinternal=rinternal, model=model, predict_model=predict_model) return ret def batch_compute_vS(S, rinternal, model, predict_model): + regression = rinternal.rx2('parameters').rx2('regression')[0] + + # Check if we are to use regression or Monte Carlo integration to compute the contribution function values + if regression: + dt_vS = shapr.batch_prepare_vS_regression(S=S, internal=rinternal) + else: + # dt_vS is either only dt_vS or a list containing dt_vS and dt if internal$parameters$keep_samp_for_vS = TRUE + dt_vS = batch_prepare_vS_MC(S=S, rinternal=rinternal, model=model, predict_model=predict_model) + + return dt_vS + + +def batch_prepare_vS_MC(S, rinternal, model, predict_model): keep_samp_for_vS = rinternal.rx2('parameters').rx2('keep_samp_for_vS')[0] feature_names = list(rinternal.rx2('parameters').rx2('feature_names')) - dt = shapr.batch_prepare_vS(S=S, internal=rinternal) - dt = compute_preds( - dt, - feature_names=feature_names, - predict_model=predict_model, - model=model) + dt = shapr.batch_prepare_vS_MC_auxiliary(S=S, internal=rinternal) + dt = compute_preds(dt=dt, feature_names=feature_names, predict_model=predict_model, model=model) dt_vS = shapr.compute_MCint(dt) if keep_samp_for_vS: @@ -211,9 +241,9 @@ def compute_preds(dt, feature_names, predict_model, model): preds = predict_model(model, r2py(dt).loc[:,feature_names]) return ro.r.cbind(dt, p_hat=ro.FloatVector(preds.tolist())) - -def get_feature_specs(get_model_specs, model): + +def get_feature_specs(get_model_specs, model): model_class0 = type(model) if (get_model_specs is not None) and (not callable(get_model_specs)): @@ -230,7 +260,7 @@ def get_feature_specs(get_model_specs, model): feature_specs = get_model_specs(model) except Exception as e: raise RuntimeError(f'The get_model_specs function of class `{model_class0}` is invalid.\nA basic function test threw the following error:\n{e}') - + if not isinstance(feature_specs, dict): raise ValueError(f'`get_model_specs` 
returned an object of type `{type(feature_specs)}`, but it should be of type `dict`') if set(feature_specs.keys()) != set(["labels","classes","factor_levels"]): @@ -248,6 +278,7 @@ def strvec_or_na(v): if v is None: return NA def listvec_or_na(v): if v is None: return NA return ro.ListVector({k:list(val) for k,val in v.items()}) + rfeature_specs = ro.ListVector({ 'labels': py2r_or_na(feature_specs['labels']), 'classes': strvec_or_na(feature_specs['classes']), @@ -267,7 +298,7 @@ def get_predict_model(x_test, predict_model, model): predict_model = prebuilt_predict_model(model) if predict_model is None: raise ValueError(f'No pre-built predict_model for model of type {type(model)}. Please pass a custom predict_model to shaprpy.explain(...).') - + try: tmp = py2r(predict_model(model, x_test)) except Exception as e: @@ -325,7 +356,7 @@ def prebuilt_predict_model(model): return lambda m, x: m.predict(xgb.DMatrix(x)) except: pass - + # Look for keras try: from keras.models import Model @@ -353,3 +384,38 @@ def compute_time(timing_list): } return timing_output + + +def regression_get_y_hat(rinternal, model, predict_model, x_train, x_explain): + x_train_y_hat = predict_model(model, x_train) + x_explain_y_hat = predict_model(model, x_explain) + + # Extract data list, add the predicted responses, and then update rinternal (direct assignment did not work) + data = rinternal.rx2['data'] + data.rx2['x_train_y_hat'] = ro.FloatVector(x_train_y_hat.tolist()) + data.rx2['x_explain_y_hat'] = ro.FloatVector(x_explain_y_hat.tolist()) + rinternal.rx2['data'] = data + + return rinternal + + +def regression_remove_objects(routput): + tmp_internal = routput.rx2("internal") + tmp_parameters = tmp_internal.rx2("parameters") + objects = ro.StrVector(("regression", "regression.model", "regression.tune_values", "regression.vfold_cv_para", + "regression.recipe_func", "regression.tune", "regression.surrogate_n_comb")) + tmp_parameters.rx[objects] = NULL + tmp_internal.rx2["parameters"] = tmp_parameters + if tmp_parameters.rx2("approach")[0] == "regression_surrogate": + tmp_objects = tmp_internal.rx2("objects") + tmp_objects.rx["regression.surrogate_model"] = NULL + tmp_internal.rx2["objects"] = tmp_objects + routput.rx2["internal"] = tmp_internal + return routput + + +def change_first_underscore_to_dot(kwargs): + kwargs_tmp = {} + for k, v in kwargs.items(): + kwargs_tmp[k.replace('_', '.', 1)] = v + return kwargs_tmp \ No newline at end of file diff --git a/rebuild_long_running_vignette.R b/rebuild_long_running_vignette.R index 448e1205c..a75a3a7a4 100644 --- a/rebuild_long_running_vignette.R +++ b/rebuild_long_running_vignette.R @@ -5,10 +5,14 @@ old_wd <- getwd() setwd("vignettes/") -knitr::knit("understanding_shapr_vaeac.Rmd.orig", output = "understanding_shapr_vaeac.Rmd") -#knitr::purl("understanding_shapr_vaeac.Rmd.orig", output = "understanding_shapr_vaeac.R") # Don't need this knitr::knit("understanding_shapr.Rmd.orig", output = "understanding_shapr.Rmd") -#knitr::purl("understanding_shapr.Rmd.orig", output = "understanding_shapr.R") # Don't need this +# knitr::purl("understanding_shapr.Rmd.orig", output = "understanding_shapr.R") # Don't need this + +knitr::knit("understanding_shapr_vaeac.Rmd.orig", output = "understanding_shapr_vaeac.Rmd") +# knitr::purl("understanding_shapr_vaeac.Rmd.orig", output = "understanding_shapr_vaeac.R") # Don't need this + +knitr::knit("understanding_shapr_regression.Rmd.orig", output = "understanding_shapr_regression.Rmd") +# knitr::purl("understanding_shapr_regression.Rmd.orig", output =
"understanding_shapr_regression.R") # Don't need this setwd(old_wd) diff --git a/tests/testthat/_snaps/regression-output.md b/tests/testthat/_snaps/regression-output.md new file mode 100644 index 000000000..4b8f56c25 --- /dev/null +++ b/tests/testthat/_snaps/regression-output.md @@ -0,0 +1,165 @@ +# output_lm_numeric_lm_separate + + Code + (out <- code) + Output + none Solar.R Wind Temp Month Day + + 1: 42.44 -8.577 7.845 14.4756 0.6251 -1.7664 + 2: 42.44 4.818 -4.811 -11.6350 -1.0423 -1.2086 + 3: 42.44 7.406 -25.587 0.3353 -0.4718 0.7491 + +# output_lm_numeric_lm_separate_n_comb + + Code + (out <- code) + Output + none Solar.R Wind Temp Month Day + + 1: 42.44 -7.806 14.811 5.751 4.056 -4.2111 + 2: 42.44 5.056 -7.055 -16.887 5.976 -0.9692 + 3: 42.44 7.020 -33.059 2.395 3.782 2.2943 + +# output_lm_categorical_lm_separate + + Code + (out <- code) + Output + none Month_factor Ozone_sub30_factor Solar.R_factor Wind_factor + + 1: 42.44 -9.806 18.60 -11.788 2.489 + 2: 42.44 -7.256 -18.88 24.751 -13.445 + 3: 42.44 15.594 -26.01 5.887 -13.834 + +# output_lm_mixed_lm_separate + + Code + (out <- code) + Output + none Solar.R Wind Temp Day Month_factor + + 1: 42.44 -8.782 8.165 20.389 -1.2383 -7.950 + 2: 42.44 4.623 -3.551 -6.199 -0.9110 -9.345 + 3: 42.44 8.029 -25.200 -4.821 0.4172 10.975 + +# output_lm_mixed_splines_separate + + Code + (out <- code) + Output + none Solar.R Wind Temp Day Month_factor + + 1: 42.44 -8.083 7.102 18.732 1.483 -8.651 + 2: 42.44 6.147 -4.314 -6.445 -2.136 -8.635 + 3: 42.44 7.536 -22.504 -5.081 -2.170 11.619 + +# output_lm_mixed_decision_tree_cv_separate + + Code + (out <- code) + Output + none Solar.R Wind Temp Day Month_factor + + 1: 42.44 -8.131 12.303 9.935 1.6221 -5.145 + 2: 42.44 2.907 -5.119 -7.128 1.7841 -7.827 + 3: 42.44 6.237 -9.010 -17.927 -0.6915 10.791 + +# output_lm_mixed_decision_tree_cv_separate_parallel + + Code + (out <- code) + Output + none Solar.R Wind Temp Day Month_factor + + 1: 42.44 -8.131 12.303 9.935 1.6221 -5.145 + 2: 42.44 2.907 -5.119 -7.128 1.7841 -7.827 + 3: 42.44 6.237 -9.010 -17.927 -0.6915 10.791 + +# output_lm_mixed_xgboost_separate + + Code + (out <- code) + Output + none Solar.R Wind Temp Day Month_factor + + 1: 42.44 -13.991 14.352 16.490 1.82 -8.088 + 2: 42.44 8.183 -1.463 -16.499 3.63 -9.233 + 3: 42.44 3.364 -14.946 0.401 -11.32 11.905 + +# output_lm_numeric_lm_surrogate + + Code + (out <- code) + Output + none Solar.R Wind Temp Month Day + + 1: 42.44 -9.273 9.578 16.536 -1.2690 -2.9707 + 2: 42.44 2.623 -5.766 -6.717 -1.4694 -2.5496 + 3: 42.44 6.801 -24.090 -1.295 0.1202 0.8953 + +# output_lm_numeric_lm_surrogate_n_comb + + Code + (out <- code) + Output + none Solar.R Wind Temp Month Day + + 1: 42.44 -9.6804 12.2171 11.4871 0.74529 -2.1671 + 2: 42.44 0.6882 0.3332 -12.8835 1.93235 -3.9496 + 3: 42.44 7.8022 -26.0731 -0.2148 0.04831 0.8691 + +# output_lm_numeric_lm_surrogate_reg_surr_n_comb + + Code + (out <- code) + Output + none Solar.R Wind Temp Month Day + + 1: 42.44 -9.6804 12.2171 11.4871 0.74529 -2.1671 + 2: 42.44 0.6882 0.3332 -12.8835 1.93235 -3.9496 + 3: 42.44 7.8022 -26.0731 -0.2148 0.04831 0.8691 + +# output_lm_categorical_lm_surrogate + + Code + (out <- code) + Output + none Month_factor Ozone_sub30_factor Solar.R_factor Wind_factor + + 1: 42.44 -7.137 16.29 -9.895 0.2304 + 2: 42.44 -6.018 -16.28 23.091 -15.6258 + 3: 42.44 10.042 -18.58 2.415 -12.2431 + +# output_lm_mixed_lm_surrogate + + Code + (out <- code) + Output + none Solar.R Wind Temp Day Month_factor + + 1: 42.44 -7.427 10.831 16.477 -0.6280 -8.669 + 2: 42.44 
3.916 -4.232 -4.849 -0.8776 -9.341 + 3: 42.44 5.629 -24.012 -2.274 -0.4774 10.534 + +# output_lm_mixed_decision_tree_cv_surrogate + + Code + (out <- code) + Output + none Solar.R Wind Temp Day Month_factor + + 1: 42.44 -4.219 -4.219 27.460 -4.219 -4.219 + 2: 42.44 -3.077 -3.077 -3.077 -3.077 -3.077 + 3: 42.44 -6.716 -6.716 -6.716 -6.716 16.262 + +# output_lm_mixed_xgboost_surrogate + + Code + (out <- code) + Output + none Solar.R Wind Temp Day Month_factor + + 1: 42.44 -11.165 8.002 20.61 2.030 -8.896 + 2: 42.44 4.143 -1.515 -11.23 2.025 -8.806 + 3: 42.44 6.515 -18.268 -4.06 -3.992 9.204 + diff --git a/tests/testthat/_snaps/regression-output/output_lm_categorical_lm_separate.rds b/tests/testthat/_snaps/regression-output/output_lm_categorical_lm_separate.rds new file mode 100644 index 000000000..0bf5e6e52 Binary files /dev/null and b/tests/testthat/_snaps/regression-output/output_lm_categorical_lm_separate.rds differ diff --git a/tests/testthat/_snaps/regression-output/output_lm_categorical_lm_surrogate.rds b/tests/testthat/_snaps/regression-output/output_lm_categorical_lm_surrogate.rds new file mode 100644 index 000000000..f859e3d75 Binary files /dev/null and b/tests/testthat/_snaps/regression-output/output_lm_categorical_lm_surrogate.rds differ diff --git a/tests/testthat/_snaps/regression-output/output_lm_mixed_decision_tree_cv_separate.rds b/tests/testthat/_snaps/regression-output/output_lm_mixed_decision_tree_cv_separate.rds new file mode 100644 index 000000000..54e491a34 Binary files /dev/null and b/tests/testthat/_snaps/regression-output/output_lm_mixed_decision_tree_cv_separate.rds differ diff --git a/tests/testthat/_snaps/regression-output/output_lm_mixed_decision_tree_cv_separate_parallel.rds b/tests/testthat/_snaps/regression-output/output_lm_mixed_decision_tree_cv_separate_parallel.rds new file mode 100644 index 000000000..959f84115 Binary files /dev/null and b/tests/testthat/_snaps/regression-output/output_lm_mixed_decision_tree_cv_separate_parallel.rds differ diff --git a/tests/testthat/_snaps/regression-output/output_lm_mixed_decision_tree_cv_surrogate.rds b/tests/testthat/_snaps/regression-output/output_lm_mixed_decision_tree_cv_surrogate.rds new file mode 100644 index 000000000..fb0af97eb Binary files /dev/null and b/tests/testthat/_snaps/regression-output/output_lm_mixed_decision_tree_cv_surrogate.rds differ diff --git a/tests/testthat/_snaps/regression-output/output_lm_mixed_lm_separate.rds b/tests/testthat/_snaps/regression-output/output_lm_mixed_lm_separate.rds new file mode 100644 index 000000000..b45d28996 Binary files /dev/null and b/tests/testthat/_snaps/regression-output/output_lm_mixed_lm_separate.rds differ diff --git a/tests/testthat/_snaps/regression-output/output_lm_mixed_lm_surrogate.rds b/tests/testthat/_snaps/regression-output/output_lm_mixed_lm_surrogate.rds new file mode 100644 index 000000000..46e511c58 Binary files /dev/null and b/tests/testthat/_snaps/regression-output/output_lm_mixed_lm_surrogate.rds differ diff --git a/tests/testthat/_snaps/regression-output/output_lm_mixed_splines_separate.rds b/tests/testthat/_snaps/regression-output/output_lm_mixed_splines_separate.rds new file mode 100644 index 000000000..2a7766305 Binary files /dev/null and b/tests/testthat/_snaps/regression-output/output_lm_mixed_splines_separate.rds differ diff --git a/tests/testthat/_snaps/regression-output/output_lm_mixed_xgboost_separate.rds b/tests/testthat/_snaps/regression-output/output_lm_mixed_xgboost_separate.rds new file mode 100644 index 000000000..c187df49e Binary 
files /dev/null and b/tests/testthat/_snaps/regression-output/output_lm_mixed_xgboost_separate.rds differ diff --git a/tests/testthat/_snaps/regression-output/output_lm_mixed_xgboost_surrogate.rds b/tests/testthat/_snaps/regression-output/output_lm_mixed_xgboost_surrogate.rds new file mode 100644 index 000000000..4fc7f83c5 Binary files /dev/null and b/tests/testthat/_snaps/regression-output/output_lm_mixed_xgboost_surrogate.rds differ diff --git a/tests/testthat/_snaps/regression-output/output_lm_numeric_lm_separate.rds b/tests/testthat/_snaps/regression-output/output_lm_numeric_lm_separate.rds new file mode 100644 index 000000000..365fd2c69 Binary files /dev/null and b/tests/testthat/_snaps/regression-output/output_lm_numeric_lm_separate.rds differ diff --git a/tests/testthat/_snaps/regression-output/output_lm_numeric_lm_separate_n_comb.rds b/tests/testthat/_snaps/regression-output/output_lm_numeric_lm_separate_n_comb.rds new file mode 100644 index 000000000..f14c13b35 Binary files /dev/null and b/tests/testthat/_snaps/regression-output/output_lm_numeric_lm_separate_n_comb.rds differ diff --git a/tests/testthat/_snaps/regression-output/output_lm_numeric_lm_surrogate.rds b/tests/testthat/_snaps/regression-output/output_lm_numeric_lm_surrogate.rds new file mode 100644 index 000000000..373f99a3d Binary files /dev/null and b/tests/testthat/_snaps/regression-output/output_lm_numeric_lm_surrogate.rds differ diff --git a/tests/testthat/_snaps/regression-output/output_lm_numeric_lm_surrogate_n_comb.rds b/tests/testthat/_snaps/regression-output/output_lm_numeric_lm_surrogate_n_comb.rds new file mode 100644 index 000000000..d5bf3bb59 Binary files /dev/null and b/tests/testthat/_snaps/regression-output/output_lm_numeric_lm_surrogate_n_comb.rds differ diff --git a/tests/testthat/_snaps/regression-output/output_lm_numeric_lm_surrogate_reg_surr_n_comb.rds b/tests/testthat/_snaps/regression-output/output_lm_numeric_lm_surrogate_reg_surr_n_comb.rds new file mode 100644 index 000000000..96fcf7828 Binary files /dev/null and b/tests/testthat/_snaps/regression-output/output_lm_numeric_lm_surrogate_reg_surr_n_comb.rds differ diff --git a/tests/testthat/_snaps/regression-setup.md b/tests/testthat/_snaps/regression-setup.md new file mode 100644 index 000000000..6cf8babcf --- /dev/null +++ b/tests/testthat/_snaps/regression-setup.md @@ -0,0 +1,210 @@ +# regression erroneous input: `approach` + + Code + explain(model = model_lm_numeric, x_explain = x_explain_numeric, x_train = x_train_numeric, + prediction_zero = p0, n_batches = 1, timing = FALSE, approach = c( + "regression_surrogate", "gaussian", "independence", "empirical"), ) + Condition + Error in `check_approach()`: + ! The `regression_separate` and `regression_surrogate` approaches cannot be combined with other approaches. + +--- + + Code + explain(model = model_lm_numeric, x_explain = x_explain_numeric, x_train = x_train_numeric, + prediction_zero = p0, n_batches = 1, timing = FALSE, approach = c( + "regression_separate", "gaussian", "independence", "empirical"), ) + Condition + Error in `check_approach()`: + ! The `regression_separate` and `regression_surrogate` approaches cannot be combined with other approaches. + +# regression erroneous input: `regression.model` + + Code + explain(model = model_lm_numeric, x_explain = x_explain_numeric, x_train = x_train_numeric, + prediction_zero = p0, n_batches = 1, timing = FALSE, approach = "regression_separate", + regression.model = NULL) + Condition + Error in `regression.get_tune()`: + ! 
`regression.model` must be a tidymodels object with class 'model_spec'. See documentation. + +--- + + Code + explain(model = model_lm_numeric, x_explain = x_explain_numeric, x_train = x_train_numeric, + prediction_zero = p0, n_batches = 1, timing = FALSE, approach = "regression_separate", + regression.model = lm) + Condition + Error in `regression.get_tune()`: + ! `regression.model` must be a tidymodels object with class 'model_spec'. See documentation. + +--- + + Code + explain(model = model_lm_numeric, x_explain = x_explain_numeric, x_train = x_train_numeric, + prediction_zero = p0, n_batches = 1, timing = FALSE, approach = "regression_separate", + regression.model = parsnip::decision_tree(tree_depth = tune(), engine = "rpart", + mode = "regression")) + Condition + Error in `regression.get_tune()`: + ! `regression.tune_values` must be provided when `regression.model` contains hyperparameters to tune. + +--- + + Code + explain(model = model_lm_numeric, x_explain = x_explain_numeric, x_train = x_train_numeric, + prediction_zero = p0, n_batches = 1, timing = FALSE, approach = "regression_separate", + regression.model = parsnip::decision_tree(tree_depth = tune(), engine = "rpart", + mode = "regression"), regression.tune_values = data.frame(num_terms = c(1, 2, + 3))) + Condition + Error in `regression.get_tune()`: + ! The tunable parameters in `regression.model` ('tree_depth') and `regression.tune_values` ('num_terms') must match. + +--- + + Code + explain(model = model_lm_numeric, x_explain = x_explain_numeric, x_train = x_train_numeric, + prediction_zero = p0, n_batches = 1, timing = FALSE, approach = "regression_separate", + regression.model = parsnip::decision_tree(tree_depth = tune(), engine = "rpart", + mode = "regression"), regression.tune_values = data.frame(tree_depth = c(1, 2, + 3), num_terms = c(1, 2, 3))) + Condition + Error in `regression.get_tune()`: + ! The tunable parameters in `regression.model` ('tree_depth') and `regression.tune_values` ('tree_depth', 'num_terms') must match. + +--- + + Code + explain(model = model_lm_numeric, x_explain = x_explain_numeric, x_train = x_train_numeric, + prediction_zero = p0, n_batches = 1, timing = FALSE, approach = "regression_separate", + regression.model = parsnip::decision_tree(tree_depth = 2, engine = "rpart", + mode = "regression"), regression.tune_values = data.frame(tree_depth = c(1, + 2, 3))) + Condition + Error in `regression.get_tune()`: + ! The tunable parameters in `regression.model` ('') and `regression.tune_values` ('tree_depth') must match. + +--- + + Code + explain(model = model_lm_numeric, x_explain = x_explain_numeric, x_train = x_train_numeric, + prediction_zero = p0, n_batches = 1, timing = FALSE, approach = "regression_surrogate", + regression.tune_values = data.frame(tree_depth = c(1, 2, 3))) + Condition + Error in `regression.get_tune()`: + ! The tunable parameters in `regression.model` ('') and `regression.tune_values` ('tree_depth') must match. + +# regression erroneous input: `regression.tune_values` + + Code + explain(model = model_lm_numeric, x_explain = x_explain_numeric, x_train = x_train_numeric, + prediction_zero = p0, n_batches = 1, timing = FALSE, approach = "regression_separate", + regression.model = parsnip::decision_tree(tree_depth = 2, engine = "rpart", + mode = "regression"), regression.tune_values = as.matrix(data.frame( + tree_depth = c(1, 2, 3)))) + Condition + Error in `regression.get_tune()`: + ! `regression.tune_values` must be of either class `data.frame` or `function`. See documentation. 
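For contrast with the errors above, a valid `regression.tune_values` of either accepted class might look like the following sketch (the function form corresponds to the corrected grid example from the documentation earlier in this diff):

```r
# A fixed grid as a data.frame ...
regression.tune_values <- data.frame(tree_depth = c(1, 2, 3))
# ... or a data-adaptive grid as a function, where the mtry range depends
# on the number of features in the training data `x` for the coalition.
regression.tune_values <- function(x) {
  dials::grid_regular(dials::mtry(c(1, ncol(x))), levels = 3)
}
```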
+ +--- + + Code + explain(model = model_lm_numeric, x_explain = x_explain_numeric, x_train = x_train_numeric, + prediction_zero = p0, n_batches = 1, timing = FALSE, approach = "regression_separate", + regression.model = parsnip::decision_tree(tree_depth = tune(), engine = "rpart", + mode = "regression"), regression.tune_values = function(x) c(1, 2, 3)) + Condition + Error in `regression.get_tune()`: + ! The output of the user provided `regression.tune_values` function must be of class `data.frame`. + +--- + + Code + explain(model = model_lm_numeric, x_explain = x_explain_numeric, x_train = x_train_numeric, + prediction_zero = p0, n_batches = 1, timing = FALSE, approach = "regression_separate", + regression.model = parsnip::decision_tree(tree_depth = tune(), engine = "rpart", + mode = "regression"), regression.tune_values = function(x) data.frame( + wrong_name = c(1, 2, 3))) + Condition + Error in `regression.get_tune()`: + ! The tunable parameters in `regression.model` ('tree_depth') and `regression.tune_values` ('wrong_name') must match. + +# regression erroneous input: `regression.vfold_cv_para` + + Code + explain(model = model_lm_numeric, x_explain = x_explain_numeric, x_train = x_train_numeric, + prediction_zero = p0, n_batches = 1, timing = FALSE, approach = "regression_separate", + regression.model = parsnip::decision_tree(tree_depth = tune(), engine = "rpart", + mode = "regression"), regression.tune_values = data.frame(tree_depth = c(1, 2, + 3)), regression.vfold_cv_para = 10) + Condition + Error in `regression.check_vfold_cv_para()`: + ! `regression.vfold_cv_para` must be a named list. See documentation using '?shapr::explain()'. + +--- + + Code + explain(model = model_lm_numeric, x_explain = x_explain_numeric, x_train = x_train_numeric, + prediction_zero = p0, n_batches = 1, timing = FALSE, approach = "regression_separate", + regression.model = parsnip::decision_tree(tree_depth = tune(), engine = "rpart", + mode = "regression"), regression.tune_values = data.frame(tree_depth = c(1, 2, + 3)), regression.vfold_cv_para = list(10)) + Condition + Error in `regression.check_vfold_cv_para()`: + ! `regression.vfold_cv_para` must be a named list. See documentation using '?shapr::explain()'. + +--- + + Code + explain(model = model_lm_numeric, x_explain = x_explain_numeric, x_train = x_train_numeric, + prediction_zero = p0, n_batches = 1, timing = FALSE, approach = "regression_separate", + regression.model = parsnip::decision_tree(tree_depth = tune(), engine = "rpart", + mode = "regression"), regression.tune_values = data.frame(tree_depth = c(1, 2, + 3)), regression.vfold_cv_para = list(hey = 10)) + Condition + Error in `regression.check_vfold_cv_para()`: + ! The following parameters in `regression.vfold_cv_para` are not supported by `rsample::vfold_cv()`: 'hey'. + +# regression erroneous input: `regression.recipe_func` + + Code + explain(model = model_lm_numeric, x_explain = x_explain_numeric, x_train = x_train_numeric, + prediction_zero = p0, n_batches = 1, timing = FALSE, approach = "regression_separate", + regression.recipe_func = 3) + Condition + Error in `regression.check_recipe_func()`: + ! `regression.recipe_func` must be a function. See documentation. + +--- + + Code + explain(model = model_lm_numeric, x_explain = x_explain_numeric, x_train = x_train_numeric, + prediction_zero = p0, n_batches = 1, timing = FALSE, approach = "regression_surrogate", + regression.recipe_func = function(x) { + return(2) + }) + Condition + Error in `regression.check_recipe_func()`: + ! 
The output of the `regression.recipe_func` must be of class `recipe`. + +# regression erroneous input: `regression.surrogate_n_comb` + + Code + explain(model = model_lm_numeric, x_explain = x_explain_numeric, x_train = x_train_numeric, + prediction_zero = p0, n_batches = 1, timing = FALSE, approach = "regression_surrogate", + regression.surrogate_n_comb = 2^ncol(x_explain_numeric) - 1) + Condition + Error in `regression.check_sur_n_comb()`: + ! `regression.surrogate_n_comb` (31) must be a positive integer less than or equal to `used_n_combinations` minus two (30). + +--- + + Code + explain(model = model_lm_numeric, x_explain = x_explain_numeric, x_train = x_train_numeric, + prediction_zero = p0, n_batches = 1, timing = FALSE, approach = "regression_surrogate", + regression.surrogate_n_comb = 0) + Condition + Error in `regression.check_sur_n_comb()`: + ! `regression.surrogate_n_comb` (0) must be a positive integer less than or equal to `used_n_combinations` minus two (30). + diff --git a/tests/testthat/_snaps/setup.md b/tests/testthat/_snaps/setup.md index 2d040cb13..72c288315 100644 --- a/tests/testthat/_snaps/setup.md +++ b/tests/testthat/_snaps/setup.md @@ -178,9 +178,8 @@ timing = FALSE) Condition Error in `check_approach()`: - ! `approach` must be one of the following: - categorical, copula, ctree, empirical, gaussian, independence, timeseries, vaeac - or a vector of length one less than the number of features ( 4 ), with only the above strings. + ! `approach` must be one of the following: 'categorical', 'copula', 'ctree', 'empirical', 'gaussian', 'independence', 'regression_separate', 'regression_surrogate', 'timeseries', 'vaeac'. + These can also be combined (except 'regression_surrogate' and 'regression_separate') by passing a vector of length one less than the number of features (4). --- @@ -191,9 +190,8 @@ timing = FALSE) Condition Error in `check_approach()`: - ! `approach` must be one of the following: - categorical, copula, ctree, empirical, gaussian, independence, timeseries, vaeac - or a vector of length one less than the number of features ( 4 ), with only the above strings. + ! `approach` must be one of the following: 'categorical', 'copula', 'ctree', 'empirical', 'gaussian', 'independence', 'regression_separate', 'regression_surrogate', 'timeseries', 'vaeac'. + These can also be combined (except 'regression_surrogate' and 'regression_separate') by passing a vector of length one less than the number of features (4). --- @@ -204,9 +202,8 @@ timing = FALSE) Condition Error in `check_approach()`: - ! `approach` must be one of the following: - categorical, copula, ctree, empirical, gaussian, independence, timeseries, vaeac - or a vector of length one less than the number of features ( 4 ), with only the above strings. + ! `approach` must be one of the following: 'categorical', 'copula', 'ctree', 'empirical', 'gaussian', 'independence', 'regression_separate', 'regression_surrogate', 'timeseries', 'vaeac'. + These can also be combined (except 'regression_surrogate' and 'regression_separate') by passing a vector of length one less than the number of features (4). 
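To illustrate the convention stated in the message above, a sketch with the five-feature model used in these tests (the combined vector has length four, and the regression approaches must stand alone):

```r
# Valid: one approach per conditioning level for a five-feature model.
approach <- c("gaussian", "ctree", "empirical", "independence")
# Valid: a regression approach used on its own.
approach <- "regression_separate"
```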
# erroneous input: `prediction_zero`
diff --git a/tests/testthat/test-regression-output.R b/tests/testthat/test-regression-output.R
new file mode 100644
index 000000000..5b97e46e8
--- /dev/null
+++ b/tests/testthat/test-regression-output.R
@@ -0,0 +1,262 @@
+# Separate regression ==================================================================================================
+test_that("output_lm_numeric_lm_separate", {
+  expect_snapshot_rds(
+    shapr::explain(
+      model = model_lm_numeric,
+      x_explain = x_explain_numeric,
+      x_train = x_train_numeric,
+      approach = "regression_separate",
+      prediction_zero = p0,
+      n_batches = 1,
+      timing = FALSE,
+      regression.model = parsnip::linear_reg()
+    ),
+    "output_lm_numeric_lm_separate"
+  )
+})
+
+test_that("output_lm_numeric_lm_separate_n_comb", {
+  expect_snapshot_rds(
+    shapr::explain(
+      model = model_lm_numeric,
+      x_explain = x_explain_numeric,
+      x_train = x_train_numeric,
+      approach = "regression_separate",
+      prediction_zero = p0,
+      n_batches = 4,
+      n_combinations = 10,
+      timing = FALSE,
+      regression.model = parsnip::linear_reg()
+    ),
+    "output_lm_numeric_lm_separate_n_comb"
+  )
+})
+
+test_that("output_lm_categorical_lm_separate", {
+  expect_snapshot_rds(
+    shapr::explain(
+      model = model_lm_categorical,
+      x_explain = x_explain_categorical,
+      x_train = x_train_categorical,
+      approach = "regression_separate",
+      prediction_zero = p0,
+      n_batches = 4,
+      timing = FALSE,
+      regression.model = parsnip::linear_reg()
+    ),
+    "output_lm_categorical_lm_separate"
+  )
+})
+
+test_that("output_lm_mixed_lm_separate", {
+  expect_snapshot_rds(
+    shapr::explain(
+      model = model_lm_mixed,
+      x_explain = x_explain_mixed,
+      x_train = x_train_mixed,
+      approach = "regression_separate",
+      prediction_zero = p0,
+      n_batches = 1,
+      timing = FALSE,
+      regression.model = parsnip::linear_reg()
+    ),
+    "output_lm_mixed_lm_separate"
+  )
+})
+
+test_that("output_lm_mixed_splines_separate", {
+  expect_snapshot_rds(
+    shapr::explain(
+      model = model_lm_mixed,
+      x_explain = x_explain_mixed,
+      x_train = x_train_mixed,
+      approach = "regression_separate",
+      prediction_zero = p0,
+      n_batches = 1,
+      timing = FALSE,
+      regression.model = parsnip::linear_reg(),
+      regression.recipe_func = function(regression.recipe) {
+        recipes::step_ns(regression.recipe, recipes::all_numeric_predictors(), deg_free = 2)
+      }
+    ),
+    "output_lm_mixed_splines_separate"
+  )
+})
+
+test_that("output_lm_mixed_decision_tree_cv_separate", {
+  expect_snapshot_rds(
+    shapr::explain(
+      model = model_lm_mixed,
+      x_explain = x_explain_mixed,
+      x_train = x_train_mixed,
+      prediction_zero = p0,
+      n_batches = 4,
+      timing = FALSE,
+      approach = "regression_separate",
+      regression.model = parsnip::decision_tree(tree_depth = hardhat::tune(), engine = "rpart", mode = "regression"),
+      regression.tune_values = data.frame(tree_depth = c(1, 2)),
+      regression.vfold_cv_para = list(v = 2)
+    ),
+    "output_lm_mixed_decision_tree_cv_separate"
+  )
+})
+
+test_that("output_lm_mixed_decision_tree_cv_separate_parallel", {
+  future::plan("multisession", workers = 2)
+  expect_snapshot_rds(
+    shapr::explain(
+      model = model_lm_mixed,
+      x_explain = x_explain_mixed,
+      x_train = x_train_mixed,
+      prediction_zero = p0,
+      n_batches = 4,
+      timing = FALSE,
+      approach = "regression_separate",
+      regression.model = parsnip::decision_tree(tree_depth = hardhat::tune(), engine = "rpart", mode = "regression"),
+      regression.tune_values = data.frame(tree_depth = c(1, 2)),
+      regression.vfold_cv_para = list(v = 2)
+    ),
+    
"output_lm_mixed_decision_tree_cv_separate_parallel" + ) + future::plan("sequential") +}) + +test_that("output_lm_mixed_xgboost_separate", { + expect_snapshot_rds( + shapr::explain( + model = model_lm_mixed, + x_explain = x_explain_mixed, + x_train = x_train_mixed, + prediction_zero = p0, + n_batches = 4, + timing = FALSE, + approach = "regression_separate", + regression.model = parsnip::boost_tree(engine = "xgboost", mode = "regression"), + regression.recipe_func = function(regression.recipe) { + return(recipes::step_dummy(regression.recipe, recipes::all_factor_predictors())) + } + ), + "output_lm_mixed_xgboost_separate" + ) +}) + +# Surrogate regression ================================================================================================= +test_that("output_lm_numeric_lm_surrogate", { + expect_snapshot_rds( + shapr::explain( + model = model_lm_numeric, + x_explain = x_explain_numeric, + x_train = x_train_numeric, + approach = "regression_surrogate", + prediction_zero = p0, + n_batches = 1, + timing = FALSE, + regression.model = parsnip::linear_reg() + ), + "output_lm_numeric_lm_surrogate" + ) +}) + +test_that("output_lm_numeric_lm_surrogate_n_comb", { + expect_snapshot_rds( + shapr::explain( + model = model_lm_numeric, + x_explain = x_explain_numeric, + x_train = x_train_numeric, + approach = "regression_surrogate", + prediction_zero = p0, + n_batches = 4, + n_combinations = 10, + timing = FALSE, + regression.model = parsnip::linear_reg() + ), + "output_lm_numeric_lm_surrogate_n_comb" + ) +}) + +test_that("output_lm_numeric_lm_surrogate_reg_surr_n_comb", { + expect_snapshot_rds( + shapr::explain( + model = model_lm_numeric, + x_explain = x_explain_numeric, + x_train = x_train_numeric, + approach = "regression_surrogate", + prediction_zero = p0, + n_batches = 4, + n_combinations = 10, + timing = FALSE, + regression.model = parsnip::linear_reg(), + regression.surrogate_n_comb = 8 + ), + "output_lm_numeric_lm_surrogate_reg_surr_n_comb" + ) +}) + +test_that("output_lm_categorical_lm_surrogate", { + expect_snapshot_rds( + shapr::explain( + model = model_lm_categorical, + x_explain = x_explain_categorical, + x_train = x_train_categorical, + approach = "regression_surrogate", + prediction_zero = p0, + n_batches = 2, + timing = FALSE, + regression.model = parsnip::linear_reg() + ), + "output_lm_categorical_lm_surrogate" + ) +}) + +test_that("output_lm_mixed_lm_surrogate", { + expect_snapshot_rds( + shapr::explain( + model = model_lm_mixed, + x_explain = x_explain_mixed, + x_train = x_train_mixed, + approach = "regression_surrogate", + prediction_zero = p0, + n_batches = 4, + timing = FALSE, + regression.model = parsnip::linear_reg() + ), + "output_lm_mixed_lm_surrogate" + ) +}) + +test_that("output_lm_mixed_decision_tree_cv_surrogate", { + expect_snapshot_rds( + shapr::explain( + model = model_lm_mixed, + x_explain = x_explain_mixed, + x_train = x_train_mixed, + prediction_zero = p0, + n_batches = 4, + timing = FALSE, + approach = "regression_surrogate", + regression.model = parsnip::decision_tree(tree_depth = hardhat::tune(), engine = "rpart", mode = "regression"), + regression.tune_values = data.frame(tree_depth = c(1, 2)), + regression.vfold_cv_para = list(v = 2) + ), + "output_lm_mixed_decision_tree_cv_surrogate" + ) +}) + +test_that("output_lm_mixed_xgboost_surrogate", { + expect_snapshot_rds( + shapr::explain( + model = model_lm_mixed, + x_explain = x_explain_mixed, + x_train = x_train_mixed, + prediction_zero = p0, + n_batches = 4, + timing = FALSE, + approach = 
"regression_surrogate", + regression.model = parsnip::boost_tree(engine = "xgboost", mode = "regression"), + regression.recipe_func = function(regression.recipe) { + recipes::step_dummy(regression.recipe, recipes::all_factor_predictors()) + } + ), + "output_lm_mixed_xgboost_surrogate" + ) +}) diff --git a/tests/testthat/test-regression-setup.R b/tests/testthat/test-regression-setup.R new file mode 100644 index 000000000..43f4b3fc4 --- /dev/null +++ b/tests/testthat/test-regression-setup.R @@ -0,0 +1,360 @@ +test_that("regression erroneous input: `approach`", { + set.seed(123) + + expect_snapshot( + { + # Include regression_surrogate + explain( + model = model_lm_numeric, + x_explain = x_explain_numeric, + x_train = x_train_numeric, + prediction_zero = p0, + n_batches = 1, + timing = FALSE, + approach = c("regression_surrogate", "gaussian", "independence", "empirical"), + ) + }, + error = TRUE + ) + + expect_snapshot( + { + # Include regression_separate + explain( + model = model_lm_numeric, + x_explain = x_explain_numeric, + x_train = x_train_numeric, + prediction_zero = p0, + n_batches = 1, + timing = FALSE, + approach = c("regression_separate", "gaussian", "independence", "empirical"), + ) + }, + error = TRUE + ) +}) + +test_that("regression erroneous input: `regression.model`", { + set.seed(123) + + expect_snapshot( + { + # no regression model passed + explain( + model = model_lm_numeric, + x_explain = x_explain_numeric, + x_train = x_train_numeric, + prediction_zero = p0, + n_batches = 1, + timing = FALSE, + approach = "regression_separate", + regression.model = NULL + ) + }, + error = TRUE + ) + + expect_snapshot( + { + # not a tidymodels object of class model_spec + explain( + model = model_lm_numeric, + x_explain = x_explain_numeric, + x_train = x_train_numeric, + prediction_zero = p0, + n_batches = 1, + timing = FALSE, + approach = "regression_separate", + regression.model = lm + ) + }, + error = TRUE + ) + + expect_snapshot( + { + # regression.tune_values` must be provided when `regression.model` contains hyperparameters to tune. 
+      explain(
+        model = model_lm_numeric,
+        x_explain = x_explain_numeric,
+        x_train = x_train_numeric,
+        prediction_zero = p0,
+        n_batches = 1,
+        timing = FALSE,
+        approach = "regression_separate",
+        regression.model = parsnip::decision_tree(tree_depth = tune(), engine = "rpart", mode = "regression")
+      )
+    },
+    error = TRUE
+  )
+
+  expect_snapshot(
+    {
+      # The tunable parameters and the parameter values do not match
+      explain(
+        model = model_lm_numeric,
+        x_explain = x_explain_numeric,
+        x_train = x_train_numeric,
+        prediction_zero = p0,
+        n_batches = 1,
+        timing = FALSE,
+        approach = "regression_separate",
+        regression.model = parsnip::decision_tree(tree_depth = tune(), engine = "rpart", mode = "regression"),
+        regression.tune_values = data.frame(num_terms = c(1, 2, 3))
+      )
+    },
+    error = TRUE
+  )
+
+  expect_snapshot(
+    {
+      # The tunable parameters and the parameter values do not match
+      explain(
+        model = model_lm_numeric,
+        x_explain = x_explain_numeric,
+        x_train = x_train_numeric,
+        prediction_zero = p0,
+        n_batches = 1,
+        timing = FALSE,
+        approach = "regression_separate",
+        regression.model = parsnip::decision_tree(tree_depth = tune(), engine = "rpart", mode = "regression"),
+        regression.tune_values = data.frame(tree_depth = c(1, 2, 3), num_terms = c(1, 2, 3))
+      )
+    },
+    error = TRUE
+  )
+
+  expect_snapshot(
+    {
+      # Provide regression.tune_values but the parameter has already been specified in the regression.model
+      explain(
+        model = model_lm_numeric,
+        x_explain = x_explain_numeric,
+        x_train = x_train_numeric,
+        prediction_zero = p0,
+        n_batches = 1,
+        timing = FALSE,
+        approach = "regression_separate",
+        regression.model = parsnip::decision_tree(tree_depth = 2, engine = "rpart", mode = "regression"),
+        regression.tune_values = data.frame(tree_depth = c(1, 2, 3))
+      )
+    },
+    error = TRUE
+  )
+
+  expect_snapshot(
+    {
+      # Provide regression.tune_values but not a model where these are to be used
+      explain(
+        model = model_lm_numeric,
+        x_explain = x_explain_numeric,
+        x_train = x_train_numeric,
+        prediction_zero = p0,
+        n_batches = 1,
+        timing = FALSE,
+        approach = "regression_surrogate",
+        regression.tune_values = data.frame(tree_depth = c(1, 2, 3))
+      )
+    },
+    error = TRUE
+  )
+})
+
+
+test_that("regression erroneous input: `regression.tune_values`", {
+  set.seed(123)
+
+  expect_snapshot(
+    {
+      # Provide hyperparameter values, but the hyperparameter has not been declared as a tunable parameter
+      explain(
+        model = model_lm_numeric,
+        x_explain = x_explain_numeric,
+        x_train = x_train_numeric,
+        prediction_zero = p0,
+        n_batches = 1,
+        timing = FALSE,
+        approach = "regression_separate",
+        regression.model = parsnip::decision_tree(tree_depth = 2, engine = "rpart", mode = "regression"),
+        regression.tune_values = as.matrix(data.frame(tree_depth = c(1, 2, 3)))
+      )
+    },
+    error = TRUE
+  )
+
+  expect_snapshot(
+    {
+      # The regression.tune_values function must return a data.frame
+      explain(
+        model = model_lm_numeric,
+        x_explain = x_explain_numeric,
+        x_train = x_train_numeric,
+        prediction_zero = p0,
+        n_batches = 1,
+        timing = FALSE,
+        approach = "regression_separate",
+        regression.model = parsnip::decision_tree(tree_depth = tune(), engine = "rpart", mode = "regression"),
+        regression.tune_values = function(x) c(1, 2, 3)
+      )
+    },
+    error = TRUE
+  )
+
+  expect_snapshot(
+    {
+      # The regression.tune_values function must return a data.frame with correct names
+      explain(
+        model = model_lm_numeric,
+        x_explain = x_explain_numeric,
+        x_train = x_train_numeric,
+        prediction_zero = p0,
+        n_batches = 
1, + timing = FALSE, + approach = "regression_separate", + regression.model = parsnip::decision_tree(tree_depth = tune(), engine = "rpart", mode = "regression"), + regression.tune_values = function(x) data.frame(wrong_name = c(1, 2, 3)) + ) + }, + error = TRUE + ) +}) + +test_that("regression erroneous input: `regression.vfold_cv_para`", { + set.seed(123) + + expect_snapshot( + { + # `regression.vfold_cv_para` is not a list + explain( + model = model_lm_numeric, + x_explain = x_explain_numeric, + x_train = x_train_numeric, + prediction_zero = p0, + n_batches = 1, + timing = FALSE, + approach = "regression_separate", + regression.model = parsnip::decision_tree(tree_depth = tune(), engine = "rpart", mode = "regression"), + regression.tune_values = data.frame(tree_depth = c(1, 2, 3)), + regression.vfold_cv_para = 10 + ) + }, + error = TRUE + ) + + expect_snapshot( + { + # `regression.vfold_cv_para` is not a named list + explain( + model = model_lm_numeric, + x_explain = x_explain_numeric, + x_train = x_train_numeric, + prediction_zero = p0, + n_batches = 1, + timing = FALSE, + approach = "regression_separate", + regression.model = parsnip::decision_tree(tree_depth = tune(), engine = "rpart", mode = "regression"), + regression.tune_values = data.frame(tree_depth = c(1, 2, 3)), + regression.vfold_cv_para = list(10) + ) + }, + error = TRUE + ) + + expect_snapshot( + { + # Unrecognized parameter + explain( + model = model_lm_numeric, + x_explain = x_explain_numeric, + x_train = x_train_numeric, + prediction_zero = p0, + n_batches = 1, + timing = FALSE, + approach = "regression_separate", + regression.model = parsnip::decision_tree(tree_depth = tune(), engine = "rpart", mode = "regression"), + regression.tune_values = data.frame(tree_depth = c(1, 2, 3)), + regression.vfold_cv_para = list(hey = 10) + ) + }, + error = TRUE + ) +}) + + +test_that("regression erroneous input: `regression.recipe_func`", { + set.seed(123) + + expect_snapshot( + { + # regression.recipe_func is not a function + explain( + model = model_lm_numeric, + x_explain = x_explain_numeric, + x_train = x_train_numeric, + prediction_zero = p0, + n_batches = 1, + timing = FALSE, + approach = "regression_separate", + regression.recipe_func = 3 + ) + }, + error = TRUE + ) + + expect_snapshot( + { + # regression.recipe_func must output a recipe + explain( + model = model_lm_numeric, + x_explain = x_explain_numeric, + x_train = x_train_numeric, + prediction_zero = p0, + n_batches = 1, + timing = FALSE, + approach = "regression_surrogate", + regression.recipe_func = function(x) { + return(2) + } + ) + }, + error = TRUE + ) +}) + +test_that("regression erroneous input: `regression.surrogate_n_comb`", { + set.seed(123) + + expect_snapshot( + { + # regression.surrogate_n_comb must be between 1 and 2^n_features - 2 + explain( + model = model_lm_numeric, + x_explain = x_explain_numeric, + x_train = x_train_numeric, + prediction_zero = p0, + n_batches = 1, + timing = FALSE, + approach = "regression_surrogate", + regression.surrogate_n_comb = 2^ncol(x_explain_numeric) - 1 + ) + }, + error = TRUE + ) + + expect_snapshot( + { + # regression.surrogate_n_comb must be between 1 and 2^n_features - 2 + explain( + model = model_lm_numeric, + x_explain = x_explain_numeric, + x_train = x_train_numeric, + prediction_zero = p0, + n_batches = 1, + timing = FALSE, + approach = "regression_surrogate", + regression.surrogate_n_comb = 0 + ) + }, + error = TRUE + ) +}) diff --git a/vignettes/.gitignore b/vignettes/.gitignore index 53f693548..f48855dd4 100644 --- 
a/vignettes/.gitignore +++ b/vignettes/.gitignore @@ -2,3 +2,4 @@ *.R cache_main/ cache_vaeac/ +cache_regression/ diff --git a/vignettes/cache_main/__packages b/vignettes/cache_main/__packages new file mode 100644 index 000000000..ab530a493 --- /dev/null +++ b/vignettes/cache_main/__packages @@ -0,0 +1,4 @@ +shapr +xgboost +data.table +gbm diff --git a/vignettes/figure_main/unnamed-chunk-12-1.png b/vignettes/figure_main/unnamed-chunk-12-1.png index 8ed2b2f9e..f39f175bb 100644 Binary files a/vignettes/figure_main/unnamed-chunk-12-1.png and b/vignettes/figure_main/unnamed-chunk-12-1.png differ diff --git a/vignettes/figure_main/unnamed-chunk-12-2.png b/vignettes/figure_main/unnamed-chunk-12-2.png index f4df03213..48ef4a1b4 100644 Binary files a/vignettes/figure_main/unnamed-chunk-12-2.png and b/vignettes/figure_main/unnamed-chunk-12-2.png differ diff --git a/vignettes/figure_main/unnamed-chunk-12-3.png b/vignettes/figure_main/unnamed-chunk-12-3.png index 3a015862d..af70e02ca 100644 Binary files a/vignettes/figure_main/unnamed-chunk-12-3.png and b/vignettes/figure_main/unnamed-chunk-12-3.png differ diff --git a/vignettes/figure_main/unnamed-chunk-13-1.png b/vignettes/figure_main/unnamed-chunk-13-1.png index fcba65389..4dde3b845 100644 Binary files a/vignettes/figure_main/unnamed-chunk-13-1.png and b/vignettes/figure_main/unnamed-chunk-13-1.png differ diff --git a/vignettes/figure_main/unnamed-chunk-13-2.png b/vignettes/figure_main/unnamed-chunk-13-2.png index f7b80dd32..c6fe23ef8 100644 Binary files a/vignettes/figure_main/unnamed-chunk-13-2.png and b/vignettes/figure_main/unnamed-chunk-13-2.png differ diff --git a/vignettes/figure_main/unnamed-chunk-14-1.png b/vignettes/figure_main/unnamed-chunk-14-1.png index d57cf2214..c3e047ece 100644 Binary files a/vignettes/figure_main/unnamed-chunk-14-1.png and b/vignettes/figure_main/unnamed-chunk-14-1.png differ diff --git a/vignettes/figure_main/unnamed-chunk-2-1.png b/vignettes/figure_main/unnamed-chunk-2-1.png index b8a19b268..ac95b5818 100644 Binary files a/vignettes/figure_main/unnamed-chunk-2-1.png and b/vignettes/figure_main/unnamed-chunk-2-1.png differ diff --git a/vignettes/figure_main/unnamed-chunk-20-1.png b/vignettes/figure_main/unnamed-chunk-20-1.png index bd1817166..f915a7961 100644 Binary files a/vignettes/figure_main/unnamed-chunk-20-1.png and b/vignettes/figure_main/unnamed-chunk-20-1.png differ diff --git a/vignettes/figure_main/unnamed-chunk-22-1.png b/vignettes/figure_main/unnamed-chunk-22-1.png index 40ee44d83..dd32ab9fe 100644 Binary files a/vignettes/figure_main/unnamed-chunk-22-1.png and b/vignettes/figure_main/unnamed-chunk-22-1.png differ diff --git a/vignettes/figure_main/unnamed-chunk-23-1.png b/vignettes/figure_main/unnamed-chunk-23-1.png index d746251a5..4f01679fd 100644 Binary files a/vignettes/figure_main/unnamed-chunk-23-1.png and b/vignettes/figure_main/unnamed-chunk-23-1.png differ diff --git a/vignettes/figure_main/unnamed-chunk-23-2.png b/vignettes/figure_main/unnamed-chunk-23-2.png index d746251a5..4f01679fd 100644 Binary files a/vignettes/figure_main/unnamed-chunk-23-2.png and b/vignettes/figure_main/unnamed-chunk-23-2.png differ diff --git a/vignettes/figure_main/unnamed-chunk-3-1.png b/vignettes/figure_main/unnamed-chunk-3-1.png index 148f14fa9..90868c1fb 100644 Binary files a/vignettes/figure_main/unnamed-chunk-3-1.png and b/vignettes/figure_main/unnamed-chunk-3-1.png differ diff --git a/vignettes/figure_main/unnamed-chunk-4-1.png b/vignettes/figure_main/unnamed-chunk-4-1.png index 0163012f2..df0fde471 100644 
Binary files a/vignettes/figure_main/unnamed-chunk-4-1.png and b/vignettes/figure_main/unnamed-chunk-4-1.png differ diff --git a/vignettes/figure_main/unnamed-chunk-5-1.png b/vignettes/figure_main/unnamed-chunk-5-1.png index 2a9708a94..0290ecd84 100644 Binary files a/vignettes/figure_main/unnamed-chunk-5-1.png and b/vignettes/figure_main/unnamed-chunk-5-1.png differ diff --git a/vignettes/figure_main/unnamed-chunk-6-1.png b/vignettes/figure_main/unnamed-chunk-6-1.png index e0187283e..271c82ed9 100644 Binary files a/vignettes/figure_main/unnamed-chunk-6-1.png and b/vignettes/figure_main/unnamed-chunk-6-1.png differ diff --git a/vignettes/figure_main/unnamed-chunk-73-1.png b/vignettes/figure_main/unnamed-chunk-73-1.png new file mode 100644 index 000000000..06198e489 Binary files /dev/null and b/vignettes/figure_main/unnamed-chunk-73-1.png differ diff --git a/vignettes/figure_main/unnamed-chunk-74-1.png b/vignettes/figure_main/unnamed-chunk-74-1.png new file mode 100644 index 000000000..d3fdbbb72 Binary files /dev/null and b/vignettes/figure_main/unnamed-chunk-74-1.png differ diff --git a/vignettes/figure_main/unnamed-chunk-75-1.png b/vignettes/figure_main/unnamed-chunk-75-1.png new file mode 100644 index 000000000..a89870996 Binary files /dev/null and b/vignettes/figure_main/unnamed-chunk-75-1.png differ diff --git a/vignettes/figure_main/unnamed-chunk-76-1.png b/vignettes/figure_main/unnamed-chunk-76-1.png new file mode 100644 index 000000000..5f0855eb7 Binary files /dev/null and b/vignettes/figure_main/unnamed-chunk-76-1.png differ diff --git a/vignettes/figure_main/unnamed-chunk-77-1.png b/vignettes/figure_main/unnamed-chunk-77-1.png new file mode 100644 index 000000000..8d83a6146 Binary files /dev/null and b/vignettes/figure_main/unnamed-chunk-77-1.png differ diff --git a/vignettes/figure_main/unnamed-chunk-83-1.png b/vignettes/figure_main/unnamed-chunk-83-1.png new file mode 100644 index 000000000..1ec5f05c5 Binary files /dev/null and b/vignettes/figure_main/unnamed-chunk-83-1.png differ diff --git a/vignettes/figure_main/unnamed-chunk-83-2.png b/vignettes/figure_main/unnamed-chunk-83-2.png new file mode 100644 index 000000000..1b8a127b3 Binary files /dev/null and b/vignettes/figure_main/unnamed-chunk-83-2.png differ diff --git a/vignettes/figure_main/unnamed-chunk-83-3.png b/vignettes/figure_main/unnamed-chunk-83-3.png new file mode 100644 index 000000000..475691c3c Binary files /dev/null and b/vignettes/figure_main/unnamed-chunk-83-3.png differ diff --git a/vignettes/figure_main/unnamed-chunk-84-1.png b/vignettes/figure_main/unnamed-chunk-84-1.png new file mode 100644 index 000000000..fca266803 Binary files /dev/null and b/vignettes/figure_main/unnamed-chunk-84-1.png differ diff --git a/vignettes/figure_main/unnamed-chunk-84-2.png b/vignettes/figure_main/unnamed-chunk-84-2.png new file mode 100644 index 000000000..a10a1c91b Binary files /dev/null and b/vignettes/figure_main/unnamed-chunk-84-2.png differ diff --git a/vignettes/figure_main/unnamed-chunk-85-1.png b/vignettes/figure_main/unnamed-chunk-85-1.png new file mode 100644 index 000000000..db1e1cc22 Binary files /dev/null and b/vignettes/figure_main/unnamed-chunk-85-1.png differ diff --git a/vignettes/figure_main/unnamed-chunk-91-1.png b/vignettes/figure_main/unnamed-chunk-91-1.png new file mode 100644 index 000000000..b449a1bde Binary files /dev/null and b/vignettes/figure_main/unnamed-chunk-91-1.png differ diff --git a/vignettes/figure_main/unnamed-chunk-93-1.png b/vignettes/figure_main/unnamed-chunk-93-1.png new file mode 
100644 index 000000000..3460d65df Binary files /dev/null and b/vignettes/figure_main/unnamed-chunk-93-1.png differ diff --git a/vignettes/figure_main/unnamed-chunk-94-1.png b/vignettes/figure_main/unnamed-chunk-94-1.png new file mode 100644 index 000000000..3d396956f Binary files /dev/null and b/vignettes/figure_main/unnamed-chunk-94-1.png differ diff --git a/vignettes/figure_main/unnamed-chunk-94-2.png b/vignettes/figure_main/unnamed-chunk-94-2.png new file mode 100644 index 000000000..3d396956f Binary files /dev/null and b/vignettes/figure_main/unnamed-chunk-94-2.png differ diff --git a/vignettes/figure_main/vaeac-plot-1-1.png b/vignettes/figure_main/vaeac-plot-1-1.png index c745af293..c4cd18e88 100644 Binary files a/vignettes/figure_main/vaeac-plot-1-1.png and b/vignettes/figure_main/vaeac-plot-1-1.png differ diff --git a/vignettes/figure_main/vaeac-plot-2-1.png b/vignettes/figure_main/vaeac-plot-2-1.png index 1270b1b0c..8fc2362bc 100644 Binary files a/vignettes/figure_main/vaeac-plot-2-1.png and b/vignettes/figure_main/vaeac-plot-2-1.png differ diff --git a/vignettes/figure_main/vaeac-plot-3-1.png b/vignettes/figure_main/vaeac-plot-3-1.png index 61fd06d98..92434e4a5 100644 Binary files a/vignettes/figure_main/vaeac-plot-3-1.png and b/vignettes/figure_main/vaeac-plot-3-1.png differ diff --git a/vignettes/figure_regression/MSEv-sum-1.png b/vignettes/figure_regression/MSEv-sum-1.png new file mode 100644 index 000000000..946229591 Binary files /dev/null and b/vignettes/figure_regression/MSEv-sum-1.png differ diff --git a/vignettes/figure_regression/MSEv-sum-2-1.png b/vignettes/figure_regression/MSEv-sum-2-1.png new file mode 100644 index 000000000..7c9cff6c6 Binary files /dev/null and b/vignettes/figure_regression/MSEv-sum-2-1.png differ diff --git a/vignettes/figure_regression/SV-sum-1.png b/vignettes/figure_regression/SV-sum-1.png new file mode 100644 index 000000000..a5e3156d6 Binary files /dev/null and b/vignettes/figure_regression/SV-sum-1.png differ diff --git a/vignettes/figure_regression/SV-sum-2-1.png b/vignettes/figure_regression/SV-sum-2-1.png new file mode 100644 index 000000000..67a259a0a Binary files /dev/null and b/vignettes/figure_regression/SV-sum-2-1.png differ diff --git a/vignettes/figure_regression/SV-sum-2.png b/vignettes/figure_regression/SV-sum-2.png new file mode 100644 index 000000000..b5c6c6360 Binary files /dev/null and b/vignettes/figure_regression/SV-sum-2.png differ diff --git a/vignettes/figure_regression/SV-sum-3.png b/vignettes/figure_regression/SV-sum-3.png new file mode 100644 index 000000000..c7a0578de Binary files /dev/null and b/vignettes/figure_regression/SV-sum-3.png differ diff --git a/vignettes/figure_regression/decision-tree-plot-1.png b/vignettes/figure_regression/decision-tree-plot-1.png new file mode 100644 index 000000000..c211b764b Binary files /dev/null and b/vignettes/figure_regression/decision-tree-plot-1.png differ diff --git a/vignettes/figure_regression/dt-cv-plot-1.png b/vignettes/figure_regression/dt-cv-plot-1.png new file mode 100644 index 000000000..e3f0c1901 Binary files /dev/null and b/vignettes/figure_regression/dt-cv-plot-1.png differ diff --git a/vignettes/figure_regression/lm-emp-msev-1.png b/vignettes/figure_regression/lm-emp-msev-1.png new file mode 100644 index 000000000..a79ef864e Binary files /dev/null and b/vignettes/figure_regression/lm-emp-msev-1.png differ diff --git a/vignettes/figure_regression/mixed-plot-1.png b/vignettes/figure_regression/mixed-plot-1.png new file mode 100644 index 000000000..def0c68ad Binary 
files /dev/null and b/vignettes/figure_regression/mixed-plot-1.png differ diff --git a/vignettes/figure_regression/mixed-plot-2-1.png b/vignettes/figure_regression/mixed-plot-2-1.png new file mode 100644 index 000000000..bbf7975cf Binary files /dev/null and b/vignettes/figure_regression/mixed-plot-2-1.png differ diff --git a/vignettes/figure_regression/mixed-plot-3-1.png b/vignettes/figure_regression/mixed-plot-3-1.png new file mode 100644 index 000000000..a31e191b2 Binary files /dev/null and b/vignettes/figure_regression/mixed-plot-3-1.png differ diff --git a/vignettes/figure_regression/mixed-plot-4-1.png b/vignettes/figure_regression/mixed-plot-4-1.png new file mode 100644 index 000000000..134ddaf59 Binary files /dev/null and b/vignettes/figure_regression/mixed-plot-4-1.png differ diff --git a/vignettes/figure_regression/ppr-plot-1.png b/vignettes/figure_regression/ppr-plot-1.png new file mode 100644 index 000000000..80e82d1a7 Binary files /dev/null and b/vignettes/figure_regression/ppr-plot-1.png differ diff --git a/vignettes/figure_regression/preproc-plot-1.png b/vignettes/figure_regression/preproc-plot-1.png new file mode 100644 index 000000000..d69b210e8 Binary files /dev/null and b/vignettes/figure_regression/preproc-plot-1.png differ diff --git a/vignettes/figure_regression/surrogate-plot-1.png b/vignettes/figure_regression/surrogate-plot-1.png new file mode 100644 index 000000000..ffc2d7584 Binary files /dev/null and b/vignettes/figure_regression/surrogate-plot-1.png differ diff --git a/vignettes/figure_vaeac/check-n_combinations-and-more-batches-1.png b/vignettes/figure_vaeac/check-n_combinations-and-more-batches-1.png index 145c8f245..98add3289 100644 Binary files a/vignettes/figure_vaeac/check-n_combinations-and-more-batches-1.png and b/vignettes/figure_vaeac/check-n_combinations-and-more-batches-1.png differ diff --git a/vignettes/figure_vaeac/continue-training-1.png b/vignettes/figure_vaeac/continue-training-1.png index 443dcc38c..0e5fb697e 100644 Binary files a/vignettes/figure_vaeac/continue-training-1.png and b/vignettes/figure_vaeac/continue-training-1.png differ diff --git a/vignettes/figure_vaeac/continue-training-2.png b/vignettes/figure_vaeac/continue-training-2.png index 731cd3805..149759141 100644 Binary files a/vignettes/figure_vaeac/continue-training-2.png and b/vignettes/figure_vaeac/continue-training-2.png differ diff --git a/vignettes/figure_vaeac/continue-training-3.png b/vignettes/figure_vaeac/continue-training-3.png index ed7c68be3..1120e4ed3 100644 Binary files a/vignettes/figure_vaeac/continue-training-3.png and b/vignettes/figure_vaeac/continue-training-3.png differ diff --git a/vignettes/figure_vaeac/continue-training-4.png b/vignettes/figure_vaeac/continue-training-4.png index 1e6be8da9..6ee322459 100644 Binary files a/vignettes/figure_vaeac/continue-training-4.png and b/vignettes/figure_vaeac/continue-training-4.png differ diff --git a/vignettes/figure_vaeac/continue-training-5.png b/vignettes/figure_vaeac/continue-training-5.png index ec9a4b7db..7ad958b75 100644 Binary files a/vignettes/figure_vaeac/continue-training-5.png and b/vignettes/figure_vaeac/continue-training-5.png differ diff --git a/vignettes/figure_vaeac/continue-training-6.png b/vignettes/figure_vaeac/continue-training-6.png new file mode 100644 index 000000000..3eed9e190 Binary files /dev/null and b/vignettes/figure_vaeac/continue-training-6.png differ diff --git a/vignettes/figure_vaeac/continue-training-7.png b/vignettes/figure_vaeac/continue-training-7.png new file mode 100644 index 
000000000..73e30b758 Binary files /dev/null and b/vignettes/figure_vaeac/continue-training-7.png differ diff --git a/vignettes/figure_vaeac/early-stopping-1-1.png b/vignettes/figure_vaeac/early-stopping-1-1.png index c81b7bf88..464cc6f9d 100644 Binary files a/vignettes/figure_vaeac/early-stopping-1-1.png and b/vignettes/figure_vaeac/early-stopping-1-1.png differ diff --git a/vignettes/figure_vaeac/early-stopping-2-1.png b/vignettes/figure_vaeac/early-stopping-2-1.png index e2c268815..5c7c98dde 100644 Binary files a/vignettes/figure_vaeac/early-stopping-2-1.png and b/vignettes/figure_vaeac/early-stopping-2-1.png differ diff --git a/vignettes/figure_vaeac/early-stopping-3-1.png b/vignettes/figure_vaeac/early-stopping-3-1.png index 6f39e740d..cc3a3fd5e 100644 Binary files a/vignettes/figure_vaeac/early-stopping-3-1.png and b/vignettes/figure_vaeac/early-stopping-3-1.png differ diff --git a/vignettes/figure_vaeac/early-stopping-3-2.png b/vignettes/figure_vaeac/early-stopping-3-2.png index 61168a938..3c6fe34fe 100644 Binary files a/vignettes/figure_vaeac/early-stopping-3-2.png and b/vignettes/figure_vaeac/early-stopping-3-2.png differ diff --git a/vignettes/figure_vaeac/first-vaeac-plots-1.png b/vignettes/figure_vaeac/first-vaeac-plots-1.png index 871d72873..dd10cc011 100644 Binary files a/vignettes/figure_vaeac/first-vaeac-plots-1.png and b/vignettes/figure_vaeac/first-vaeac-plots-1.png differ diff --git a/vignettes/figure_vaeac/paired-sampling-plotting-1.png b/vignettes/figure_vaeac/paired-sampling-plotting-1.png index 6ab4794f1..4e5f4052a 100644 Binary files a/vignettes/figure_vaeac/paired-sampling-plotting-1.png and b/vignettes/figure_vaeac/paired-sampling-plotting-1.png differ diff --git a/vignettes/figure_vaeac/paired-sampling-plotting-2.png b/vignettes/figure_vaeac/paired-sampling-plotting-2.png index 792c3a742..0117a8e8e 100644 Binary files a/vignettes/figure_vaeac/paired-sampling-plotting-2.png and b/vignettes/figure_vaeac/paired-sampling-plotting-2.png differ diff --git a/vignettes/figure_vaeac/unnamed-chunk-2-1.png b/vignettes/figure_vaeac/unnamed-chunk-2-1.png new file mode 100644 index 000000000..fbeb8ce59 Binary files /dev/null and b/vignettes/figure_vaeac/unnamed-chunk-2-1.png differ diff --git a/vignettes/figure_vaeac/unnamed-chunk-2-2.png b/vignettes/figure_vaeac/unnamed-chunk-2-2.png new file mode 100644 index 000000000..87a440d7c Binary files /dev/null and b/vignettes/figure_vaeac/unnamed-chunk-2-2.png differ diff --git a/vignettes/figure_vaeac/unnamed-chunk-2-3.png b/vignettes/figure_vaeac/unnamed-chunk-2-3.png new file mode 100644 index 000000000..b3a9b8d09 Binary files /dev/null and b/vignettes/figure_vaeac/unnamed-chunk-2-3.png differ diff --git a/vignettes/figure_vaeac/vaeac-grouping-of-features-1.png b/vignettes/figure_vaeac/vaeac-grouping-of-features-1.png index f9ca0bf7e..0e618dd3d 100644 Binary files a/vignettes/figure_vaeac/vaeac-grouping-of-features-1.png and b/vignettes/figure_vaeac/vaeac-grouping-of-features-1.png differ diff --git a/vignettes/figure_vaeac/vaeac-mixed-data-1.png b/vignettes/figure_vaeac/vaeac-mixed-data-1.png index eb3866df0..81e21d290 100644 Binary files a/vignettes/figure_vaeac/vaeac-mixed-data-1.png and b/vignettes/figure_vaeac/vaeac-mixed-data-1.png differ diff --git a/vignettes/figure_vaeac/vaeac-mixed-data-2.png b/vignettes/figure_vaeac/vaeac-mixed-data-2.png index d020c12d4..4c4bce005 100644 Binary files a/vignettes/figure_vaeac/vaeac-mixed-data-2.png and b/vignettes/figure_vaeac/vaeac-mixed-data-2.png differ diff --git 
a/vignettes/figure_vaeac/vaeac-mixed-data-3.png b/vignettes/figure_vaeac/vaeac-mixed-data-3.png index 085b3815d..ce652685a 100644
Binary files a/vignettes/figure_vaeac/vaeac-mixed-data-3.png and b/vignettes/figure_vaeac/vaeac-mixed-data-3.png differ
diff --git a/vignettes/understanding_shapr.Rmd b/vignettes/understanding_shapr.Rmd
index 279e83666..a0bd1ab0d 100644
--- a/vignettes/understanding_shapr.Rmd
+++ b/vignettes/understanding_shapr.Rmd
@@ -139,6 +139,11 @@ implements the `ctree` method from @redelmeier2020explaining, and the
of @lundberg2017unified is also available. The methods may also be
combined, such that e.g. one method is used when conditioning on a small
number of features, while another method is used otherwise.
+The `shapr` package also supports directly estimating the contribution
+function using regression. We briefly introduce the regression-based
+methods below, but we refer to the separate regression vignette
+(Shapley value explanations using the regression paradigm) and
+@olsen2024comparative for an in-depth explanation of the regression paradigm.
@@ -336,11 +341,36 @@ calculated. For example, the expected value of $X_1$ given $X_2 = 1$ and
$X_3 = 2$ is
$$E(X_1|X_2, X_3) = \sum_{x}x P(X_1 = x | X_2=1, X_3=2) = \sum_{x} x \frac{P(X_1 = x, X_2 = 1, X_3 = 2)}{P(X_2=1, X_3=2)}.$$.
+
+
+
+## Separate and Surrogate Regression Approaches
+
+Another paradigm for estimating the contribution function is the regression
+paradigm. In contrast to the methods above, which belong to the Monte Carlo
+paradigm, the regression-based methods use regression models to estimate the
+contribution function $v(S) = E[f(\boldsymbol{x})|\boldsymbol{x}_S = \boldsymbol{x}_S^*]$ directly.
+The separate regression method class fits a separate regression model for
+each coalition $S$, while the surrogate regression method class fits a single
+regression model to simultaneously predict the contribution function for all
+coalitions. We refer to @olsen2024comparative for when one should use the
+different paradigms, method classes, and methods.
+
+In a separate vignette (Shapley value explanations using the regression paradigm),
+we elaborate and demonstrate the regression paradigm.
+We describe how to specify the regression model, enable automatic cross-validation
+of the model's hyperparameters, and apply pre-processing steps to the data before
+fitting the regression models. @olsen2024comparative divides the regression
+paradigm into the separate and surrogate regression method classes. In the
+separate vignette, we briefly introduce the two method classes. For an in-depth
+explanation, we refer the reader to Sections 3.5 and 3.6 in @olsen2024comparative.
+
+
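+For orientation, the following minimal sketch shows how one of the
+regression-based methods could be invoked. It is an illustration only: it
+assumes the `model`, `x_explain`, `x_train`, and `p0` objects defined in the
+examples below, the object name `explanation_regression` is ours, and the
+regression vignette demonstrates the full range of options.
+
+```r
+# Sketch: estimate each v(S) with a separate linear regression model
+explanation_regression <- explain(
+  model = model,
+  x_explain = x_explain,
+  x_train = x_train,
+  approach = "regression_separate",
+  prediction_zero = p0,
+  regression.model = parsnip::linear_reg()
+)
+```
+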
-# Examples +# Examples {#examples} `shapr` supports computation of Shapley values with any predictive model which takes a set of numeric features and produces a numeric outcome. @@ -357,6 +387,7 @@ used instead by setting the argument `approach` to either `"gaussian"`, below. + ```r library(xgboost) library(data.table) @@ -373,6 +404,9 @@ x_train <- data[-ind_x_explain, ..x_var] y_train <- data[-ind_x_explain, get(y_var)] x_explain <- data[ind_x_explain, ..x_var] +# Set seed for reproducibility +set.seed(123) + # Fitting a basic xgboost model to the training data model <- xgboost::xgboost( data = as.matrix(x_train), @@ -395,6 +429,7 @@ explanation <- explain( ) #> Note: Feature classes extracted from the model contains NA. #> Assuming feature classes from the data are correct. +#> #> Setting parameter 'n_batches' to 2 as a fair trade-off between memory consumption and computation time. #> Reducing 'n_batches' typically reduces the computation time at the cost of increased memory consumption. @@ -639,7 +674,7 @@ explanation_timeseries <- explain( ## MSEv evaluation criterion We can use the $\operatorname{MSE}_{v}$ criterion proposed by @frye2020shapley, -and later used by, e.g., @olsen2022using and @olsen2023comparative, to evaluate +and later used by, e.g., @olsen2022using and @olsen2024comparative, to evaluate and rank the approaches/methods. The $\operatorname{MSE}_{v}$ is given by ```{=tex} \begin{align} @@ -689,7 +724,7 @@ First, we can only use the $\operatorname{MSE}_{v}$ criterion to rank the method their closeness to the optimum since the minimum value of the $\operatorname{MSE}_{v}$ criterion is unknown. Second, the criterion evaluates the contribution functions and not the Shapley values. -Note that @olsen2023comparative observed a relatively linear relationship between the +Note that @olsen2024comparative observed a relatively linear relationship between the $\operatorname{MSE}_{v}$ criterion and the mean absolute error $(\operatorname{MAE})$ between the true and estimated Shapley values in extensive simulation studies where the true Shapley values were known. That is, a method that achieves a low $\operatorname{MSE}_{v}$ score also tends to @@ -719,10 +754,10 @@ Start by explaining the predictions by using different methods and combining the ```r # We use more explicands here for more stable confidence intervals -ind_x_explain <- 1:25 -x_train <- data[-ind_x_explain, ..x_var] -y_train <- data[-ind_x_explain, get(y_var)] -x_explain <- data[ind_x_explain, ..x_var] +ind_x_explain_many <- 1:25 +x_train <- data[-ind_x_explain_many, ..x_var] +y_train <- data[-ind_x_explain_many, get(y_var)] +x_explain <- data[ind_x_explain_many, ..x_var] # Fitting a basic xgboost model to the training data model <- xgboost::xgboost( @@ -828,8 +863,7 @@ MSEv_plots <- plot_MSEv_eval_crit(explanation_list_named, # 5 plots are made names(MSEv_plots) -#> [1] "MSEv_explicand_bar" "MSEv_explicand_line_point" "MSEv_combination_bar" "MSEv_combination_line_point" -#> [5] "MSEv_bar" +#> [1] "MSEv_explicand_bar" "MSEv_explicand_line_point" "MSEv_combination_bar" "MSEv_combination_line_point" "MSEv_bar" ``` The main plot if interest is the `MSEv_bar`, which displays the $\operatorname{MSE}_{v}$ evaluation criterion for each method averaged over both the combinations/coalitions and test observations/explicands. However, we can also look at the other plots where we have only averaged over the observations or the combinations (both as bar and line plots). 
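For example, a minimal sketch of displaying just that overall bar plot (assuming the `MSEv_plots` list created in the chunk above):

```r
# Show only the overall MSEv bar plot, i.e., the criterion averaged over
# both the combinations/coalitions and the explicands
MSEv_plots$MSEv_bar
```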
@@ -1083,9 +1117,9 @@ comes after the last day in the data, this forecast starts from index ```r -data <- data.table::as.data.table(airquality) +data_ts2 <- data.table::as.data.table(airquality) -model_ar_temp <- ar(data$Temp, order = 2) +model_ar_temp <- ar(data_ts2$Temp, order = 2) predict(model_ar_temp, n.ahead = 2)$pred #> Time Series: @@ -1121,9 +1155,9 @@ variable separately. ```r -explanation <- explain_forecast( +explanation_forecast <- explain_forecast( model = model_ar_temp, - y = data[, "Temp"], + y = data_ts2[, "Temp"], train_idx = 2:152, explain_idx = 153, explain_y_lags = 2, @@ -1137,11 +1171,11 @@ explanation <- explain_forecast( #> Note: Feature names extracted from the model contains NA. #> Consistency checks between model and data is therefore disabled. -print(explanation) -#> explain_idx horizon none Temp.1 Temp.2 -#> -#> 1: 153 1 77.88 -6.622 -0.1788 -#> 2: 153 2 77.88 -6.025 -0.3327 +print(explanation_forecast) +#> explain_idx horizon none Temp.1 Temp.2 +#> +#> 1: 153 1 77.79 -6.578 -0.134 +#> 2: 153 2 77.79 -5.980 -0.288 ``` The results are presented per value of `explain_idx` and forecast @@ -1160,13 +1194,13 @@ just fit on the 151 first observations, leaving two observations of ```r -data <- data.table::as.data.table(airquality) +data_ts3 <- data.table::as.data.table(airquality) -data_fit <- data[seq_len(151), ] +data_fit <- data_ts3[seq_len(151), ] model_arimax_temp <- arima(data_fit$Temp, order = c(2, 0, 0), xreg = data_fit$Wind) -newxreg <- data[-seq_len(151), "Wind", drop = FALSE] +newxreg <- data_ts3[-seq_len(151), "Wind", drop = FALSE] predict(model_arimax_temp, n.ahead = 2, newxreg = newxreg)$pred #> Time Series: @@ -1187,10 +1221,10 @@ during the forecasting period. ```r -explanation <- explain_forecast( +explanation_forecast <- explain_forecast( model = model_ar_temp, y = data_fit[, "Temp"], - xreg = data[, "Wind"], + xreg = data_ts3[, "Wind"], train_idx = 2:150, explain_idx = 151, explain_y_lags = 2, @@ -1205,7 +1239,7 @@ explanation <- explain_forecast( #> Note: Feature names extracted from the model contains NA. #> Consistency checks between model and data is therefore disabled. -print(explanation$shapley_values) +print(explanation_forecast$shapley_values) #> explain_idx horizon none Temp.1 Temp.2 Wind.1 Wind.F1 Wind.F2 #> #> 1: 151 1 77.96 -0.67793 -0.67340 -1.2688 0.493408 NA @@ -1364,10 +1398,13 @@ models fitted with the following functions: - `ranger::ranger` - `mgcv::gam` - `xgboost::xgboost`/`xgboost::xgb.train` +- `workflows::workflow` Any continuous response regression model or binary classification model of these model classes, can be explained with the package directly as -exemplified above. Moreover, essentially any feature dependent +exemplified above, while we give an example for the `workflows::workflow` +in the [`tidymodels`/`workflows`](#workflow_example) section. +Moreover, essentially any feature dependent prediction model can be explained by the package by specifying two (or one) simple additional functions for your model. @@ -1485,7 +1522,6 @@ explanation_custom_minimal <- explain( ) #> Note: You passed a model to explain() which is not natively supported, and did not supply a 'get_model_specs' function to explain(). #> Consistency checks between model and data is therefore disabled. -#> #> Setting parameter 'n_batches' to 2 as a fair trade-off between memory consumption and computation time. #> Reducing 'n_batches' typically reduces the computation time at the cost of increased memory consumption. 
@@ -1495,6 +1531,59 @@ plot(explanation_custom_minimal, index_x_explain = c(1, 6))

![](figure_main/unnamed-chunk-23-2.png)

+### Tidymodels and workflows {#workflow_example}
+In this section, we demonstrate how to use `shapr` to explain `tidymodels` models fitted using `workflows`.
+In the example [above](#examples), we directly used the `xgboost` package to fit the `xgboost` model.
+However, we can also fit the `xgboost` model using the `tidymodels` package. The two fits are near-identical,
+as `tidymodels` calls `xgboost` internally, which we demonstrate in the example below. Note that we can replace
+`xgboost` (i.e., `parsnip::boost_tree`) with any other `tidymodels` model in the `workflows` procedure outlined below.
+
+
+```r
+# Fitting a basic xgboost model to the training data using tidymodels
+set.seed(123) # Set the same seed as above
+all_var <- c(y_var, x_var)
+train <- data[-ind_x_explain, ..all_var]
+
+# Fitting the `tidymodels` model using `workflows`
+model_tidymodels <- parsnip::fit(
+  workflows::add_recipe(
+    workflows::add_model(
+      workflows::workflow(),
+      parsnip::boost_tree(trees = 20, engine = "xgboost", mode = "regression")
+    ),
+    recipes::recipe(Ozone ~ ., data = train)
+  ),
+  data = train
+)
+
+# # We can also specify the same model using pipes `%>%` (if the pipe operator is loaded)
+# model_tidymodels <-
+#   workflows::workflow() %>%
+#   workflows::add_model(parsnip::boost_tree(trees = 20, engine = "xgboost", mode = "regression")) %>%
+#   workflows::add_recipe(recipes::recipe(Ozone ~ ., data = train)) %>%
+#   parsnip::fit(data = train)
+
+# Compare the predictions of the two models
+all.equal(predict(model_tidymodels, x_train)$.pred, predict(model, as.matrix(x_train)))
+#> [1] "Mean relative difference: 0.018699"
+
+# Create the Shapley values for the tidymodels version
+explanation_tidymodels <- explain(
+  model = model_tidymodels,
+  x_explain = x_explain,
+  x_train = x_train,
+  approach = "empirical",
+  prediction_zero = p0,
+  n_batches = 4
+)
+
+# Compare the Shapley value explanations
+all.equal(explanation$shapley_values, explanation_tidymodels$shapley_values)
+#> [1] "Different number of rows"
+```
+
+
## The parameters of the `vaeac` approach

The `vaeac` approach is a very flexible method that supports mixed data. The main
diff --git a/vignettes/understanding_shapr.Rmd.orig b/vignettes/understanding_shapr.Rmd.orig
index d8fe633a5..32699e239 100644
--- a/vignettes/understanding_shapr.Rmd.orig
+++ b/vignettes/understanding_shapr.Rmd.orig
@@ -24,6 +24,7 @@ knitr::opts_chunk$set(
  warning = FALSE,
  message = TRUE
)
+options("digits" = 5)
```

```{r setup, include=FALSE, warning=FALSE}
@@ -153,6 +154,11 @@ implements the `ctree` method from @redelmeier2020explaining, and the
of @lundberg2017unified is also available. The methods may also be
combined, such that e.g. one method is used when conditioning on a small
number of features, while another method is used otherwise.
+The `shapr` package also supports directly estimating the contribution
+function using regression. We briefly introduce the regression-based
+methods below, but we refer to the separate regression vignette
+(Shapley value explanations using the regression paradigm) and
+@olsen2024comparative for an in-depth explanation of the regression paradigm.
@@ -350,11 +356,36 @@ calculated. For example, the expected value of $X_1$ given $X_2 = 1$ and
$X_3 = 2$ is
$$E(X_1|X_2, X_3) = \sum_{x}x P(X_1 = x | X_2=1, X_3=2) = \sum_{x} x \frac{P(X_1 = x, X_2 = 1, X_3 = 2)}{P(X_2=1, X_3=2)}.$$.
+
+
+
+## Separate and Surrogate Regression Approaches
+
+Another paradigm for estimating the contribution function is the regression
+paradigm. In contrast to the methods above, which belong to the Monte Carlo
+paradigm, the regression-based methods use regression models to estimate the
+contribution function $v(S) = E[f(\boldsymbol{x})|\boldsymbol{x}_S = \boldsymbol{x}_S^*]$ directly.
+The separate regression method class fits a separate regression model for
+each coalition $S$, while the surrogate regression method class fits a single
+regression model to simultaneously predict the contribution function for all
+coalitions. We refer to @olsen2024comparative for when one should use the
+different paradigms, method classes, and methods.
+
+In a separate vignette (Shapley value explanations using the regression paradigm),
+we elaborate and demonstrate the regression paradigm.
+We describe how to specify the regression model, enable automatic cross-validation
+of the model's hyperparameters, and apply pre-processing steps to the data before
+fitting the regression models. @olsen2024comparative divides the regression
+paradigm into the separate and surrogate regression method classes. In the
+separate vignette, we briefly introduce the two method classes. For an in-depth
+explanation, we refer the reader to Sections 3.5 and 3.6 in @olsen2024comparative.
+
+
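+For orientation, a minimal sketch of invoking one of the regression-based
+methods (an illustration only; `model`, `x_explain`, `x_train`, and `p0` are
+defined as in the examples below, and `explanation_regression` is a name we
+use only here):
+
+```{r, eval = FALSE}
+# Sketch: estimate each v(S) with a separate linear regression model
+explanation_regression <- explain(
+  model = model,
+  x_explain = x_explain,
+  x_train = x_train,
+  approach = "regression_separate",
+  prediction_zero = p0,
+  regression.model = parsnip::linear_reg()
+)
+```
+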
-# Examples +# Examples {#examples} `shapr` supports computation of Shapley values with any predictive model which takes a set of numeric features and produces a numeric outcome. @@ -387,6 +418,9 @@ x_train <- data[-ind_x_explain, ..x_var] y_train <- data[-ind_x_explain, get(y_var)] x_explain <- data[ind_x_explain, ..x_var] +# Set seed for reproducibility +set.seed(123) + # Fitting a basic xgboost model to the training data model <- xgboost::xgboost( data = as.matrix(x_train), @@ -612,7 +646,7 @@ explanation_timeseries <- explain( ## MSEv evaluation criterion We can use the $\operatorname{MSE}_{v}$ criterion proposed by @frye2020shapley, -and later used by, e.g., @olsen2022using and @olsen2023comparative, to evaluate +and later used by, e.g., @olsen2022using and @olsen2024comparative, to evaluate and rank the approaches/methods. The $\operatorname{MSE}_{v}$ is given by ```{=tex} \begin{align} @@ -662,7 +696,7 @@ First, we can only use the $\operatorname{MSE}_{v}$ criterion to rank the method their closeness to the optimum since the minimum value of the $\operatorname{MSE}_{v}$ criterion is unknown. Second, the criterion evaluates the contribution functions and not the Shapley values. -Note that @olsen2023comparative observed a relatively linear relationship between the +Note that @olsen2024comparative observed a relatively linear relationship between the $\operatorname{MSE}_{v}$ criterion and the mean absolute error $(\operatorname{MAE})$ between the true and estimated Shapley values in extensive simulation studies where the true Shapley values were known. That is, a method that achieves a low $\operatorname{MSE}_{v}$ score also tends to @@ -691,10 +725,10 @@ each observation, as each combination is a different prediction tasks. Start by explaining the predictions by using different methods and combining them into lists. ```{r} # We use more explicands here for more stable confidence intervals -ind_x_explain <- 1:25 -x_train <- data[-ind_x_explain, ..x_var] -y_train <- data[-ind_x_explain, get(y_var)] -x_explain <- data[ind_x_explain, ..x_var] +ind_x_explain_many <- 1:25 +x_train <- data[-ind_x_explain_many, ..x_var] +y_train <- data[-ind_x_explain_many, get(y_var)] +x_explain <- data[ind_x_explain_many, ..x_var] # Fitting a basic xgboost model to the training data model <- xgboost::xgboost( @@ -985,9 +1019,9 @@ comes after the last day in the data, this forecast starts from index 153. ```{r} -data <- data.table::as.data.table(airquality) +data_ts2 <- data.table::as.data.table(airquality) -model_ar_temp <- ar(data$Temp, order = 2) +model_ar_temp <- ar(data_ts2$Temp, order = 2) predict(model_ar_temp, n.ahead = 2)$pred ``` @@ -1017,9 +1051,9 @@ model with multiple variables, as it is then possible to explain each variable separately. ```{r} -explanation <- explain_forecast( +explanation_forecast <- explain_forecast( model = model_ar_temp, - y = data[, "Temp"], + y = data_ts2[, "Temp"], train_idx = 2:152, explain_idx = 153, explain_y_lags = 2, @@ -1031,7 +1065,7 @@ explanation <- explain_forecast( timing = FALSE ) -print(explanation) +print(explanation_forecast) ``` The results are presented per value of `explain_idx` and forecast @@ -1049,13 +1083,13 @@ just fit on the 151 first observations, leaving two observations of `Wind` to be used as exogenous values during the prediction phase. 
```{r}
-data <- data.table::as.data.table(airquality)
+data_ts3 <- data.table::as.data.table(airquality)

-data_fit <- data[seq_len(151), ]
+data_fit <- data_ts3[seq_len(151), ]

model_arimax_temp <- arima(data_fit$Temp, order = c(2, 0, 0), xreg = data_fit$Wind)

-newxreg <- data[-seq_len(151), "Wind", drop = FALSE]
+newxreg <- data_ts3[-seq_len(151), "Wind", drop = FALSE]

predict(model_arimax_temp, n.ahead = 2, newxreg = newxreg)$pred
```
@@ -1070,10 +1104,10 @@ first lag of the exogenous variable, but also the contemporary effect
during the forecasting period.

```{r}
-explanation <- explain_forecast(
+explanation_forecast <- explain_forecast(
  model = model_ar_temp,
  y = data_fit[, "Temp"],
-  xreg = data[, "Wind"],
+  xreg = data_ts3[, "Wind"],
  train_idx = 2:150,
  explain_idx = 151,
  explain_y_lags = 2,
@@ -1086,7 +1120,7 @@ explanation <- explain_forecast(
  timing = FALSE
)

-print(explanation$shapley_values)
+print(explanation_forecast$shapley_values)
```
@@ -1194,10 +1228,13 @@ models fitted with the following functions:
- `ranger::ranger`
- `mgcv::gam`
- `xgboost::xgboost`/`xgboost::xgb.train`
+- `workflows::workflow`

Any continuous response regression model or binary classification
model of these model classes, can be explained with the package directly as
-exemplified above. Moreover, essentially any feature dependent
+exemplified above, while we give an example for the `workflows::workflow`
+in the [`tidymodels`/`workflows`](#workflow_example) section.
+Moreover, essentially any feature dependent
prediction model can be explained by the package by specifying two (or
one) simple additional functions for your model.
@@ -1310,6 +1347,56 @@ explanation_custom_minimal <- explain(
plot(explanation_custom_minimal, index_x_explain = c(1, 6))
```

+### Tidymodels and workflows {#workflow_example}
+In this section, we demonstrate how to use `shapr` to explain `tidymodels` models fitted using `workflows`.
+In the example [above](#examples), we directly used the `xgboost` package to fit the `xgboost` model.
+However, we can also fit the `xgboost` model using the `tidymodels` package. The two fits are near-identical,
+as `tidymodels` calls `xgboost` internally, which we demonstrate in the example below. Note that we can replace
+`xgboost` (i.e., `parsnip::boost_tree`) with any other `tidymodels` model in the `workflows` procedure outlined below.
+
+```{r}
+# Fitting a basic xgboost model to the training data using tidymodels
+set.seed(123) # Set the same seed as above
+all_var <- c(y_var, x_var)
+train <- data[-ind_x_explain, ..all_var]
+
+# Fitting the `tidymodels` model using `workflows`
+model_tidymodels <- parsnip::fit(
+  workflows::add_recipe(
+    workflows::add_model(
+      workflows::workflow(),
+      parsnip::boost_tree(trees = 20, engine = "xgboost", mode = "regression")
+    ),
+    recipes::recipe(Ozone ~ ., data = train)
+  ),
+  data = train
+)
+
+# # We can also specify the same model using pipes `%>%` (if the pipe operator is loaded)
+# model_tidymodels <-
+#   workflows::workflow() %>%
+#   workflows::add_model(parsnip::boost_tree(trees = 20, engine = "xgboost", mode = "regression")) %>%
+#   workflows::add_recipe(recipes::recipe(Ozone ~ ., data = train)) %>%
+#   parsnip::fit(data = train)
+
+# Compare the predictions of the two models
+all.equal(predict(model_tidymodels, x_train)$.pred, predict(model, as.matrix(x_train)))
+
+# Create the Shapley values for the tidymodels version
+explanation_tidymodels <- explain(
+  model = model_tidymodels,
+  x_explain = x_explain,
+  x_train = x_train,
+  approach = "empirical",
+  prediction_zero = p0,
+  n_batches = 4
+)
+
+# Compare the Shapley value explanations
+all.equal(explanation$shapley_values, explanation_tidymodels$shapley_values)
+```
+
+
## The parameters of the `vaeac` approach

The `vaeac` approach is a very flexible method that supports mixed data. The main
diff --git a/vignettes/understanding_shapr_regression.Rmd b/vignettes/understanding_shapr_regression.Rmd
new file mode 100644
index 000000000..964ae972b
--- /dev/null
+++ b/vignettes/understanding_shapr_regression.Rmd
@@ -0,0 +1,2191 @@
+---
+title: "Shapley value explanations using the regression paradigm"
+author: "Lars Henry Berge Olsen"
+output:
+  rmarkdown::html_vignette:
+    toc: true
+bibliography: ../inst/REFERENCES.bib
+vignette: >
+  %\VignetteEncoding{UTF-8}
+  %\VignetteIndexEntry{Shapley value explanations using the regression paradigm}
+  %\VignetteEngine{knitr::rmarkdown}
+editor_options:
+  markdown:
+    wrap: 72
+  toc: true
+---
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+This vignette elaborates and demonstrates the regression
+paradigm explained in @olsen2024comparative. We describe
+how to specify the regression model, how to enable automatic
+cross-validation of the model's hyperparameters, and how to
+apply pre-processing steps to the data before fitting the
+regression models. We refer to @olsen2024comparative for when
+one should use the different paradigms, method classes, and methods.
+
+@olsen2024comparative divides the regression paradigm into
+the separate and surrogate regression method classes. In this
+vignette, we briefly introduce the two method classes. For an
+in-depth explanation, we refer the reader to Sections 3.5 and
+3.6 in @olsen2024comparative.
+
+Briefly stated, the regression paradigm uses regression
+models to directly estimate the contribution function
+$v(S) = E[f(\boldsymbol{x})|\boldsymbol{x}_S = \boldsymbol{x}_S^*]$.
+The separate regression method class fits a separate regression
+model for each coalition $S$, while the surrogate regression
+method class fits a single regression model to simultaneously
+predict the contribution function for all coalitions.
+
+The `shapr` package supports any regression model from the
+popular `tidymodels` package developed by @tidymodels. The
+The [`tidymodels`](https://www.tidymodels.org/) framework is a
+collection of packages for modeling and machine learning
+using [`tidyverse`](https://www.tidyverse.org/) principles.
+Some packages included in the `tidymodels` framework are
+`parsnip`, `recipes`, `workflows`, `tune`, and `rsample`;
+see the [setup](#setup) section below for more examples.
+Furthermore, click [here](https://www.tidymodels.org/find/parsnip/)
+to access the complete list of supported regression models
+in the `tidymodels` package. There are currently 80 supported
+models, but we can apply a wide range of data pre-processing
+steps to increase this number, or we can add regression models
+not already implemented in `tidymodels`. For the former setting,
+we can either apply the linear regression model directly to
+the data or pre-process the data to compute principal components
+(principal component regression), which we do in the
+[pre-process](#separate_preproc) section. For the latter setting,
+we demonstrate how to incorporate the projection pursuit regression
+model into the `tidymodels` framework in the
+[add new regression methods](#new) section.
+
+Note that our framework does not currently support model
+formulas with special terms. For example, we do not support
+`parsnip::gen_additive_mod` (i.e., `mgcv::gam()`) as it uses
+a non-standard notation in its formulas (in this case, the
+`s(feature, k = 2)` function). See `?parsnip::model_formula()`
+for more information. However, this hurdle is overcome by
+data pre-processing steps containing spline functions, which
+we apply in the [pre-process](#separate_preproc) section for the
+separate regression method class.
+
+In the [mixed data](#mixed) section, we demonstrate that the
+regression-based methods work on mixed data, too. However, we must
+add a pre-processing step for the regression models that do not
+natively support categorical data to encode the categorical features.
+
+Note that we use the same data and predictive models in this
+vignette as in the main vignette.
+
+See the end of the [continuous data](#summary) and
+[mixed data](#summary_mixed) sections for summary figures of all the
+methods used in this vignette to compute the Shapley value explanations.
+
+
+# The separate regression method class {#separate}
+
+In the `regression_separate` methods, we train a new regression
+model $g_S(\boldsymbol{x}_S)$ to estimate the conditional expectation
+for each coalition of features.
+
+The idea is to estimate
+$v(S) = E[f(\boldsymbol{x})|\boldsymbol{x}_S = \boldsymbol{x}_S^*] = E[f(\boldsymbol{x}_{\bar{S}},\boldsymbol{x}_S)|\boldsymbol{x}_S=\boldsymbol{x}_S^*]$
+separately for each coalition $S$ using regression. Let
+$\mathcal{D} = \{ \boldsymbol{x}^{[i]}, y^{[i]} \}_{i=1}^{N_{\text{train}}}$
+denote the training data, where $\boldsymbol{x}^{[i]}$ is the $i$th
+$M$-dimensional input and $y^{[i]}$ is the associated response.
+For each coalition $S \subseteq \{1,2,\dots,M\}$, the corresponding
+training data set is
+\begin{align*}
+  \mathcal{D}_S
+  =
+  \{\boldsymbol{x}_S^{[i]}, f(\underbrace{\boldsymbol{x}_{\bar{S}}^{[i]}, \boldsymbol{x}_S^{[i]}}_{\boldsymbol{x}^{[i]}})\}_{i=1}^{N_{\text{train}}}
+  =
+  \{\boldsymbol{x}_S^{[i]}, \underbrace{f(\boldsymbol{x}^{[i]})}_{z^{[i]}}\}_{i=1}^{N_{\text{train}}}
+  =
+  \{\boldsymbol{x}_S^{[i]}, z^{[i]}\}_{i=1}^{N_{\text{train}}}.
+\end{align*}
+
+For each data set $\mathcal{D}_S$, we train a regression model
+$g_S(\boldsymbol{x}_S)$ with respect to the mean squared error
+loss function.
+That is, we fit a regression model where the
+prediction $f(\boldsymbol{x})$ acts as the response and
+the feature subset of coalition $S$, $\boldsymbol{x}_S$, acts
+as the available features. The optimal model, with
+respect to the loss function, is
+$g^*_S(\boldsymbol{x}_S) = E[z|\boldsymbol{x}_S] = E[f(\boldsymbol{x}_{\bar{S}}, \boldsymbol{x}_S)|\boldsymbol{x}_S]$,
+which corresponds to the contribution function $v(S)$. The
+regression model $g_S$ aims for this optimum; hence, it
+estimates the contribution function, i.e.,
+$g_S(\boldsymbol{x}_S) = \hat{v}(S) \approx v(S) = E[f(\boldsymbol{x}_{\bar{S}}, \boldsymbol{x}_S) | \boldsymbol{x}_S = \boldsymbol{x}_S^*]$.
+
+
+## Code {#separate_code}
+
+In this supplementary vignette, we use the same data and explain
+the same model type as in the main vignette. We train a simple
+`xgboost` model on the `airquality` dataset and demonstrate how
+to use `shapr` and the separate regression method class to
+explain the individual predictions.
+
+
+### Setup {#setup}
+
+First, we set up the `airquality` dataset and train an `xgboost`
+model, whose predictions we want to explain using the Shapley value
+explanation framework. We import all packages in the `tidymodels`
+framework in the code chunk below, but we could also have loaded
+the individual packages directly. In this vignette, we use the
+following packages in the `tidymodels` framework: `parsnip`,
+`recipes`, `workflows`, `dials`, `hardhat`, `tibble`, `rlang`,
+and `ggplot2`. We use the `package::function()` notation
+throughout this vignette to indicate which package each
+`tidymodels` function originates from.
+
+
+```r
+# Either use `library(tidymodels)` or separately load the libraries indicated above
+library(tidymodels)
+
+# Other libraries
+library(xgboost)
+library(data.table)
+library(shapr)
+
+# Ensure that shapr's functions are prioritized; otherwise, we need to use the `shapr::`
+# prefix when calling explain(). The `conflicted` package is imported by `tidymodels`.
+conflicted::conflicts_prefer(shapr::explain, shapr::prepare_data)
+
+data("airquality")
+data <- data.table::as.data.table(airquality)
+data <- data[complete.cases(data), ]
+
+x_var <- c("Solar.R", "Wind", "Temp", "Month")
+y_var <- "Ozone"
+
+ind_x_explain <- 1:20
+x_train <- data[-ind_x_explain, ..x_var]
+y_train <- data[-ind_x_explain, get(y_var)]
+x_explain <- data[ind_x_explain, ..x_var]
+
+# Fitting a basic xgboost model to the training data
+set.seed(123) # Set seed for reproducibility
+model <- xgboost::xgboost(
+  data = as.matrix(x_train),
+  label = y_train,
+  nround = 20,
+  verbose = FALSE
+)
+
+# Specifying phi_0, i.e., the expected prediction without any features
+p0 <- mean(y_train)
+
+# List to store all the explanation objects
+explanation_list <- list()
+```
+
+To make the rest of the vignette easier to follow, we create some helper
+functions that plot and summarize the results of the explanation methods.
+Understanding this code block is not essential, and it can be skipped.
+
+
+```r
+# Plot the MSEv criterion scores as horizontal bars and add a dashed line at one method's score
+plot_MSEv_scores <- function(explanation_list, method_line = NULL) {
+  fig <- plot_MSEv_eval_crit(explanation_list) +
+    ggplot2::theme(legend.position = "none") +
+    ggplot2::coord_flip() +
+    ggplot2::theme(plot.title = ggplot2::element_text(size = rel(0.95)))
+  fig <- fig + ggplot2::scale_x_discrete(limits = rev(levels(fig$data$Method)))
+  if (!is.null(method_line) && method_line %in% fig$data$Method) {
+    fig <- fig + ggplot2::geom_hline(
+      yintercept = fig$data$MSEv[fig$data$Method == method_line],
+      linetype = "dashed",
+      color = "black"
+    )
+  }
+  return(fig)
+}
+
+# Extract the MSEv criterion scores and elapsed times
+print_MSEv_scores_and_time <- function(explanation_list) {
+  res <- as.data.frame(t(sapply(
+    explanation_list,
+    function(explanation) {
+      round(c(explanation$MSEv$MSEv$MSEv, explanation$timing$total_time_secs), 2)
+    }
+  )))
+  colnames(res) <- c("MSEv", "Time")
+  return(res)
+}
+
+# Extract the k best methods, sorted from best (lowest MSEv) to worst
+get_k_best_methods <- function(explanation_list, k) {
+  res <- print_MSEv_scores_and_time(explanation_list)
+  return(rownames(res)[order(res$MSEv)[seq(k)]])
+}
+```
+
+To establish a baseline for the regression methods, we compare them
+with the Monte Carlo-based `empirical` approach with default
+hyperparameters. In the last section, we include all Monte
+Carlo-based methods implemented in `shapr` to make an extensive
+comparison.
+
+
+```r
+# Compute the Shapley value explanations using the empirical method
+explanation_list$MC_empirical <- explain(
+  model = model,
+  x_explain = x_explain,
+  x_train = x_train,
+  approach = "empirical",
+  prediction_zero = p0,
+  n_batches = 4
+)
+#> Note: Feature classes extracted from the model contains NA.
+#> Assuming feature classes from the data are correct.
+```
+
+
+### Linear regression model
+Then we compute the Shapley value explanations using a linear
+regression model and the separate regression method class.
+
+
+```r
+explanation_list$sep_lm <- explain(
+  model = model,
+  x_explain = x_explain,
+  x_train = x_train,
+  prediction_zero = p0,
+  n_batches = 4,
+  approach = "regression_separate",
+  regression.model = parsnip::linear_reg()
+)
+#> Note: Feature classes extracted from the model contains NA.
+#> Assuming feature classes from the data are correct.
+```
+
+A linear model is often not flexible enough to properly model the
+contribution function. Thus, it can produce inaccurate Shapley value
+explanations. The figure below shows that the `empirical` approach
+outperforms the linear regression model approach quite significantly
+with respect to the $\operatorname{MSE}_v$ evaluation criterion.
+
+
+```r
+plot_MSEv_scores(explanation_list)
+```
+
+![](figure_regression/lm-emp-msev-1.png)
+
+
+### Pre-processing {#separate_preproc}
+
+This section describes how to pre-process the data before
+fitting the separate regression models. We demonstrate this
+for the linear regression model, but we can apply the same
+pre-processing to other regression methods.
+
+The `recipes` package in the `tidymodels` framework contains
+many functions for pre-processing the data before fitting the model,
+for example, normalization, interactions, encodings, and
+transformations (e.g., log, splines, pls, pca). Click
+[here](https://recipes.tidymodels.org/reference/index.html)
+to access a complete list of all available functions.
+The list also contains selector functions that help us choose which
+features each step function applies to. E.g., `recipes::all_predictors()`,
+`recipes::all_numeric_predictors()`, and `recipes::all_factor_predictors()`
+select all features, only the numerical features, and only the
+factor features, respectively. We can also specify the names of
+the features to which the functions are applied. However, as the
+included features change in each coalition, we need to check that
+the feature we want to apply the function to is present in the
+dataset. We give an example of this below.
+
+First, we demonstrate how to compute the principal components
+and use (up to) the first two components for each separate
+linear regression model. We write "up to" as we can only compute
+a single principal component for the singleton coalitions, i.e.,
+the feature itself. This regression model is called principal
+component regression.
+
+
+```r
+explanation_list$sep_pcr <- explain(
+  model = model,
+  x_explain = x_explain,
+  x_train = x_train,
+  prediction_zero = p0,
+  n_batches = 4,
+  approach = "regression_separate",
+  regression.model = parsnip::linear_reg(),
+  regression.recipe_func = function(regression_recipe) {
+    return(recipes::step_pca(regression_recipe, recipes::all_numeric_predictors(), num_comp = 2))
+  }
+)
+#> Note: Feature classes extracted from the model contains NA.
+#> Assuming feature classes from the data are correct.
+```
+
+Second, we apply a pre-processing step that computes the basis
+expansions of the features using natural splines with two degrees
+of freedom. This is similar to fitting a generalized additive model.
+
+
+```r
+explanation_list$sep_splines <- explain(
+  model = model,
+  x_explain = x_explain,
+  x_train = x_train,
+  prediction_zero = p0,
+  n_batches = 4,
+  approach = "regression_separate",
+  regression.model = parsnip::linear_reg(),
+  regression.recipe_func = function(regression_recipe) {
+    return(recipes::step_ns(regression_recipe, recipes::all_numeric_predictors(), deg_free = 2))
+  }
+)
+#> Note: Feature classes extracted from the model contains NA.
+#> Assuming feature classes from the data are correct.
+```
+
+Finally, we provide an example where we include an interaction
+between the features `Solar.R` and `Wind`, log-transform `Solar.R`,
+scale `Wind` to be between 0 and 1 and then take its square root,
+include polynomials of the third degree for `Temp`, and apply the
+Box-Cox transformation to `Month`. These transformations are only
+applied when the corresponding features are present in the
+separate model's coalition.
+
+Furthermore, we stress that the purpose of this example is to highlight
+the framework's flexibility, NOT that the transformations below are reasonable.
+
+
+```r
+# Example function of how to apply step functions from the recipes package to specific features
+regression.recipe_func <- function(recipe) {
+  # Get the names of the present features
+  feature_names <- recipe$var_info$variable[recipe$var_info$role == "predictor"]
+
+  # If Solar.R and Wind are present, then we add the interaction between them
+  if (all(c("Solar.R", "Wind") %in% feature_names)) {
+    recipe <- recipes::step_interact(recipe, terms = ~ Solar.R:Wind)
+  }
+
+  # If Solar.R is present, then log-transform it
+  if ("Solar.R" %in% feature_names) recipe <- recipes::step_log(recipe, Solar.R)
+
+  # If Wind is present, then scale it to be between 0 and 1 and then sqrt-transform it
+  if ("Wind" %in% feature_names) recipe <- recipes::step_sqrt(recipes::step_range(recipe, Wind), Wind)
+
+  # If Temp is present, then expand it using orthogonal polynomials of degree 3
+  if ("Temp" %in% feature_names) recipe <- recipes::step_poly(recipe, Temp, degree = 3)
+
+  # If Month is present, then Box-Cox transform it
+  if ("Month" %in% feature_names) recipe <- recipes::step_BoxCox(recipe, Month)
+
+  # Finally, we normalize all features (not needed as LM does this internally)
+  recipe <- recipes::step_normalize(recipe, recipes::all_numeric_predictors())
+
+  return(recipe)
+}
+
+# Compute the Shapley values using the pre-processing steps defined above
+explanation_list$sep_recipe_example <- explain(
+  model = model,
+  x_explain = x_explain,
+  x_train = x_train,
+  prediction_zero = p0,
+  n_batches = 4,
+  approach = "regression_separate",
+  regression.model = parsnip::linear_reg(),
+  regression.recipe_func = regression.recipe_func
+)
+#> Note: Feature classes extracted from the model contains NA.
+#> Assuming feature classes from the data are correct.
+```
+
+We can examine the $\operatorname{MSE}_v$ evaluation scores, and we
+see that the method using natural splines significantly outperforms
+the other methods.
+
+
+```r
+# Compare the MSEv criterion of the different explanation methods
+plot_MSEv_scores(explanation_list, method_line = "MC_empirical")
+```
+
+![](figure_regression/preproc-plot-1.png)
+
+```r
+
+# Print the MSEv scores and the elapsed time (in seconds) for the different methods
+print_MSEv_scores_and_time(explanation_list)
+#>                      MSEv Time
+#> MC_empirical       179.43 2.22
+#> sep_lm             745.21 0.77
+#> sep_pcr            784.91 1.32
+#> sep_splines        165.13 1.09
+#> sep_recipe_example 687.45 1.74
+```
+
+
+### Other regression models
+
+In the following example, we use a decision tree
+model instead of the simple linear regression model.
+
+The `tidymodels` framework supports several implementations
+of the decision tree model. We use `engine = "rpart"`
+to specify that we want to use the implementation in the
+`rpart` package, and we use `mode = "regression"` to
+specify that we are doing regression. The `tidymodels`
+framework uses the default hyperparameter values set in
+`rpart` when we do not specify them. By searching for
+"decision tree" in the [list of tidymodels](https://www.tidymodels.org/find/parsnip/),
+we see that the default hyperparameter values for the
+[`decision_tree_rpart`](https://parsnip.tidymodels.org/reference/details_decision_tree_rpart.html)
+model are `tree_depth = 30`, `min_n = 2`, and `cost_complexity = 0.01`.
+
+
+```r
+# Decision tree with specified parameters (stumps)
+explanation_list$sep_tree_stump <- explain(
+  model = model,
+  x_explain = x_explain,
+  x_train = x_train,
+  prediction_zero = p0,
+  n_batches = 4,
+  approach = "regression_separate",
+  regression.model = parsnip::decision_tree(
+    tree_depth = 1,
+    min_n = 2,
+    cost_complexity = 0.01,
+    engine = "rpart",
+    mode = "regression"
+  )
+)
+#> Note: Feature classes extracted from the model contains NA.
+#> Assuming feature classes from the data are correct.
+
+# Decision tree with default parameters
+explanation_list$sep_tree_default <- explain(
+  model = model,
+  x_explain = x_explain,
+  x_train = x_train,
+  prediction_zero = p0,
+  n_batches = 4,
+  approach = "regression_separate",
+  regression.model = parsnip::decision_tree(engine = "rpart", mode = "regression")
+)
+#> Note: Feature classes extracted from the model contains NA.
+#> Assuming feature classes from the data are correct.
+```
+
+We can also set `regression.model = parsnip::decision_tree(tree_depth = 1, min_n = 2, cost_complexity = 0.01) %>% parsnip::set_engine("rpart") %>% parsnip::set_mode("regression")`
+if we want to use the pipe function (`%>%`).
+
+We can now compare the two new methods. The decision tree with
+default parameters outperforms the linear model approach concerning
+the $\operatorname{MSE}_v$ criterion and is on the same level as
+the empirical approach. Using stumps, i.e., trees of depth one,
+gave a worse method.
+
+
+```r
+# Compare the MSEv criterion of the different explanation methods
+plot_MSEv_scores(explanation_list, method_line = "MC_empirical")
+```
+
+![](figure_regression/decision-tree-plot-1.png)
+
+```r
+# Print the MSEv scores and the elapsed time (in seconds) for the different methods
+print_MSEv_scores_and_time(explanation_list)
+#>                      MSEv Time
+#> MC_empirical       179.43 2.22
+#> sep_lm             745.21 0.77
+#> sep_pcr            784.91 1.32
+#> sep_splines        165.13 1.09
+#> sep_recipe_example 687.45 1.74
+#> sep_tree_stump     218.05 1.03
+#> sep_tree_default   177.68 0.89
+```
+
+
+### Cross-validation {#separate_cv}
+
+Another option is to use cross-validation to tune the hyperparameters.
+To do this, we need to specify three things:
+
+1. In `regression.model`, we need to specify which parameters to tune
+in the model. We do this by setting the parameter equal to `hardhat::tune()`.
+For example, if we want to tune the `tree_depth` parameter in the
+`parsnip::decision_tree` model while using default values for the
+other parameters, then we set `parsnip::decision_tree(tree_depth = hardhat::tune())`.
+2. In `regression.tune_values`, we must provide either a data.frame (can also
+be a data.table or tibble) containing the possible hyperparameter values or a
+function that takes in the training data for each combination/coalition and
+outputs a data.frame containing the possible hyperparameter values. The latter
+allows us to use different hyperparameter values for different coalition sizes,
+which is essential if a hyperparameter's domain changes with the coalition size.
+See, e.g., the example below where we want to tune the `mtry` parameter in
+`ranger` (random forest). The column names of `regression.tune_values` (or the
+output if it is a function) must match the tuneable hyperparameters specified
+in `regression.model`. For the example above, `regression.tune_values` must be
+a one-column data.frame with the column name `tree_depth`.
+We can either manually specify the hyperparameter values or use the
+`dials` package, e.g., `dials::grid_regular(dials::tree_depth(), levels = 5)`,
+or provide a function that outputs a data.frame of the same form.
+3. Specifying the `regression.vfold_cv_para` parameter is optional. If used,
+then `regression.vfold_cv_para` must be a list specifying the parameters to
+send to the cross-validation function [rsample::vfold_cv()]. Use `?rsample::vfold_cv`
+to see the default parameters. The names of the objects in the `regression.vfold_cv_para`
+list must match the parameter names in `rsample::vfold_cv()`. For example, if
+we want 5-fold cross-validation, we set `regression.vfold_cv_para = list(v = 5)`.
+
+First, let us look at some ways to specify `regression.tune_values`.
+Note that `dials` has several other grid functions, e.g., `dials::grid_random()`
+and `dials::grid_latin_hypercube()`.
+
+
+```r
+# Possible ways to define the `regression.tune_values` object.
+# function(x) dials::grid_regular(dials::tree_depth(), levels = 4)
+dials::grid_regular(dials::tree_depth(), levels = 4)
+data.table(tree_depth = c(1, 5, 10, 15)) # Can also use data.frame or tibble
+
+# For several hyperparameters
+# function(x) dials::grid_regular(dials::tree_depth(), dials::cost_complexity(), levels = 3)
+dials::grid_regular(dials::tree_depth(), dials::cost_complexity(), levels = 3)
+expand.grid(tree_depth = c(1, 3, 5), cost_complexity = c(0.001, 0.05, 0.01))
+```
+
+We will now demonstrate how to use cross-validation to fine-tune
+the separate decision tree regression method. In the following
+examples, we consider two versions. In the first example, we use
+cross-validation to tune the `tree_depth` parameter using the
+`dials::grid_regular()` function. In the second example, we tune
+both the `tree_depth` and `cost_complexity` parameters, but we
+will manually specify the possible hyperparameter values this time.
+
+
+```r
+# Decision tree with cross validated depth (default values for the other parameters)
+explanation_list$sep_tree_cv <- explain(
+  model = model,
+  x_explain = x_explain,
+  x_train = x_train,
+  prediction_zero = p0,
+  n_batches = 4,
+  approach = "regression_separate",
+  regression.model = parsnip::decision_tree(
+    tree_depth = hardhat::tune(), engine = "rpart", mode = "regression"
+  ),
+  regression.tune_values = dials::grid_regular(dials::tree_depth(), levels = 4),
+  regression.vfold_cv_para = list(v = 5)
+)
+#> Note: Feature classes extracted from the model contains NA.
+#> Assuming feature classes from the data are correct.
+
+# Use trees with cross-validation on the depth and cost complexity. Manually set the values.
+explanation_list$sep_tree_cv_2 <- explain(
+  model = model,
+  x_explain = x_explain,
+  x_train = x_train,
+  prediction_zero = p0,
+  n_batches = 4,
+  approach = "regression_separate",
+  regression.model = parsnip::decision_tree(
+    tree_depth = hardhat::tune(),
+    cost_complexity = hardhat::tune(),
+    engine = "rpart",
+    mode = "regression"
+  ),
+  regression.tune_values =
+    expand.grid(tree_depth = c(1, 3, 5), cost_complexity = c(0.001, 0.01, 0.1)),
+  regression.vfold_cv_para = list(v = 5)
+)
+#> Note: Feature classes extracted from the model contains NA.
+#> Assuming feature classes from the data are correct.
+```
+
+We also include one example with a random forest model where
+the tunable hyperparameter `mtry` depends on the coalition size.
+
+Thus, `regression.tune_values` must be a function that returns
+a data.frame where the hyperparameter values for `mtry` change
+based on the coalition size. If we do not let `regression.tune_values`
+be a function, then `tidymodels` will throw an error for any `mtry`
+value higher than 1, as such a value is invalid for the singleton
+coalitions. Furthermore, by setting `verbose = 2`, we receive messages
+about which batch and coalition/combination `shapr` processes
+and the results of the cross-validation procedure. Note that the tested
+hyperparameter value combinations change based on the coalition size.
+
+
+```r
+# Using random forest with default parameters
+explanation_list$sep_rf <- explain(
+  model = model,
+  x_explain = x_explain,
+  x_train = x_train,
+  prediction_zero = p0,
+  n_batches = 4,
+  approach = "regression_separate",
+  regression.model = parsnip::rand_forest(engine = "ranger", mode = "regression")
+)
+#> Note: Feature classes extracted from the model contains NA.
+#> Assuming feature classes from the data are correct.
+
+# Using random forest with parameters tuned by cross-validation
+explanation_list$sep_rf_cv <- explain(
+  model = model,
+  x_explain = x_explain,
+  x_train = x_train,
+  prediction_zero = p0,
+  n_batches = 1, # One batch to get printouts in chronological order
+  verbose = 2, # To get printouts
+  approach = "regression_separate",
+  regression.model = parsnip::rand_forest(
+    mtry = hardhat::tune(), trees = hardhat::tune(), engine = "ranger", mode = "regression"
+  ),
+  regression.tune_values =
+    function(x) {
+      dials::grid_regular(dials::mtry(c(1, ncol(x))), dials::trees(c(50, 750)), levels = 3)
+    },
+  regression.vfold_cv_para = list(v = 5)
+)
+#> Note: Feature classes extracted from the model contains NA.
+#> Assuming feature classes from the data are correct.
+#> Starting 'setup_approach.regression_separate'.
+#> When using `approach = 'regression_separate'` the `explanation$timing$timing_secs` object
+#> can be missleading as `setup_computation` does not contain the training times of the
+#> regression models as they are trained on the fly in `compute_vS`. This is to reduce memory
+#> usage and to improve efficency.
+#> Done with 'setup_approach.regression_separate'.
+#> Working on batch 1 of 1 in `prepare_data.regression_separate()`.
+#> Working on combination with id 2 of 16.
+#> Results of the 5-fold cross validation (top 3 best configurations):
+#> #1: mtry = 1 trees = 750 rmse = 34.85 rmse_std_err = 2.99
+#> #2: mtry = 1 trees = 400 rmse = 34.95 rmse_std_err = 3.05
+#> #3: mtry = 1 trees = 50 rmse = 34.99 rmse_std_err = 2.81
+#>
+#> Working on combination with id 3 of 16.
+#> Results of the 5-fold cross validation (top 3 best configurations):
+#> #1: mtry = 1 trees = 50 rmse = 27.48 rmse_std_err = 1.50
+#> #2: mtry = 1 trees = 750 rmse = 27.52 rmse_std_err = 1.29
+#> #3: mtry = 1 trees = 400 rmse = 27.74 rmse_std_err = 1.30
+#>
+#> Working on combination with id 4 of 16.
+#> Results of the 5-fold cross validation (top 3 best configurations):
+#> #1: mtry = 1 trees = 400 rmse = 23.60 rmse_std_err = 3.17
+#> #2: mtry = 1 trees = 750 rmse = 23.63 rmse_std_err = 3.17
+#> #3: mtry = 1 trees = 50 rmse = 24.24 rmse_std_err = 3.37
+#>
+#> Working on combination with id 5 of 16.
+#> Results of the 5-fold cross validation (top 3 best configurations):
+#> #1: mtry = 1 trees = 400 rmse = 33.31 rmse_std_err = 2.81
+#> #2: mtry = 1 trees = 750 rmse = 33.34 rmse_std_err = 2.81
+#> #3: mtry = 1 trees = 50 rmse = 33.41 rmse_std_err = 2.87
+#>
+#> Working on combination with id 6 of 16.
+#> Results of the 5-fold cross validation (top 6 best configurations): +#> #1: mtry = 1 trees = 50 rmse = 21.25 rmse_std_err = 2.24 +#> #2: mtry = 1 trees = 400 rmse = 21.69 rmse_std_err = 2.38 +#> #3: mtry = 1 trees = 750 rmse = 21.81 rmse_std_err = 2.40 +#> #4: mtry = 2 trees = 400 rmse = 22.38 rmse_std_err = 2.11 +#> #5: mtry = 2 trees = 750 rmse = 22.68 rmse_std_err = 2.04 +#> #6: mtry = 2 trees = 50 rmse = 22.91 rmse_std_err = 1.97 +#> +#> Working on combination with id 7 of 16. +#> Results of the 5-fold cross validation (top 6 best configurations): +#> #1: mtry = 2 trees = 50 rmse = 22.18 rmse_std_err = 2.93 +#> #2: mtry = 2 trees = 400 rmse = 22.28 rmse_std_err = 2.74 +#> #3: mtry = 1 trees = 750 rmse = 22.31 rmse_std_err = 2.90 +#> #4: mtry = 2 trees = 750 rmse = 22.35 rmse_std_err = 2.76 +#> #5: mtry = 1 trees = 400 rmse = 22.40 rmse_std_err = 2.80 +#> #6: mtry = 1 trees = 50 rmse = 22.62 rmse_std_err = 2.71 +#> +#> Working on combination with id 8 of 16. +#> Results of the 5-fold cross validation (top 6 best configurations): +#> #1: mtry = 1 trees = 50 rmse = 29.35 rmse_std_err = 2.17 +#> #2: mtry = 1 trees = 400 rmse = 29.45 rmse_std_err = 2.37 +#> #3: mtry = 1 trees = 750 rmse = 29.57 rmse_std_err = 2.32 +#> #4: mtry = 2 trees = 750 rmse = 30.43 rmse_std_err = 2.21 +#> #5: mtry = 2 trees = 400 rmse = 30.49 rmse_std_err = 2.18 +#> #6: mtry = 2 trees = 50 rmse = 30.51 rmse_std_err = 2.19 +#> +#> Working on combination with id 9 of 16. +#> Results of the 5-fold cross validation (top 6 best configurations): +#> #1: mtry = 1 trees = 750 rmse = 18.61 rmse_std_err = 1.56 +#> #2: mtry = 2 trees = 400 rmse = 18.63 rmse_std_err = 1.56 +#> #3: mtry = 1 trees = 400 rmse = 18.80 rmse_std_err = 1.55 +#> #4: mtry = 2 trees = 750 rmse = 19.00 rmse_std_err = 1.70 +#> #5: mtry = 1 trees = 50 rmse = 19.02 rmse_std_err = 1.86 +#> #6: mtry = 2 trees = 50 rmse = 19.50 rmse_std_err = 1.72 +#> +#> Working on combination with id 10 of 16. +#> Results of the 5-fold cross validation (top 6 best configurations): +#> #1: mtry = 1 trees = 400 rmse = 23.61 rmse_std_err = 1.61 +#> #2: mtry = 1 trees = 50 rmse = 23.72 rmse_std_err = 1.49 +#> #3: mtry = 1 trees = 750 rmse = 23.79 rmse_std_err = 1.64 +#> #4: mtry = 2 trees = 750 rmse = 23.86 rmse_std_err = 0.83 +#> #5: mtry = 2 trees = 400 rmse = 23.91 rmse_std_err = 0.80 +#> #6: mtry = 2 trees = 50 rmse = 24.74 rmse_std_err = 0.68 +#> +#> Working on combination with id 11 of 16. +#> Results of the 5-fold cross validation (top 6 best configurations): +#> #1: mtry = 1 trees = 400 rmse = 22.99 rmse_std_err = 4.29 +#> #2: mtry = 1 trees = 750 rmse = 23.08 rmse_std_err = 4.33 +#> #3: mtry = 1 trees = 50 rmse = 23.16 rmse_std_err = 4.28 +#> #4: mtry = 2 trees = 50 rmse = 23.80 rmse_std_err = 3.70 +#> #5: mtry = 2 trees = 400 rmse = 23.85 rmse_std_err = 3.72 +#> #6: mtry = 2 trees = 750 rmse = 24.07 rmse_std_err = 3.79 +#> +#> Working on combination with id 12 of 16. 
+#> Results of the 5-fold cross validation (top 9 best configurations):
+#> #1: mtry = 1 trees = 50 rmse = 16.86 rmse_std_err = 2.19
+#> #2: mtry = 1 trees = 400 rmse = 16.90 rmse_std_err = 1.83
+#> #3: mtry = 1 trees = 750 rmse = 16.91 rmse_std_err = 1.93
+#> #4: mtry = 2 trees = 50 rmse = 17.47 rmse_std_err = 1.47
+#> #5: mtry = 2 trees = 750 rmse = 17.53 rmse_std_err = 1.77
+#> #6: mtry = 2 trees = 400 rmse = 17.82 rmse_std_err = 1.67
+#> #7: mtry = 3 trees = 50 rmse = 18.03 rmse_std_err = 1.84
+#> #8: mtry = 3 trees = 750 rmse = 18.47 rmse_std_err = 1.91
+#> #9: mtry = 3 trees = 400 rmse = 18.49 rmse_std_err = 1.82
+#>
+#> Working on combination with id 13 of 16.
+#> Results of the 5-fold cross validation (top 9 best configurations):
+#> #1: mtry = 1 trees = 50 rmse = 19.27 rmse_std_err = 2.13
+#> #2: mtry = 2 trees = 750 rmse = 19.80 rmse_std_err = 1.59
+#> #3: mtry = 1 trees = 750 rmse = 20.03 rmse_std_err = 1.95
+#> #4: mtry = 2 trees = 400 rmse = 20.21 rmse_std_err = 1.59
+#> #5: mtry = 3 trees = 50 rmse = 20.42 rmse_std_err = 1.64
+#> #6: mtry = 1 trees = 400 rmse = 20.49 rmse_std_err = 2.13
+#> #7: mtry = 2 trees = 50 rmse = 20.59 rmse_std_err = 1.26
+#> #8: mtry = 3 trees = 400 rmse = 20.61 rmse_std_err = 1.68
+#> #9: mtry = 3 trees = 750 rmse = 20.85 rmse_std_err = 1.74
+#>
+#> Working on combination with id 14 of 16.
+#> Results of the 5-fold cross validation (top 9 best configurations):
+#> #1: mtry = 1 trees = 750 rmse = 21.96 rmse_std_err = 3.12
+#> #2: mtry = 1 trees = 400 rmse = 22.36 rmse_std_err = 2.96
+#> #3: mtry = 1 trees = 50 rmse = 22.53 rmse_std_err = 3.01
+#> #4: mtry = 2 trees = 750 rmse = 22.59 rmse_std_err = 2.53
+#> #5: mtry = 2 trees = 400 rmse = 22.76 rmse_std_err = 2.39
+#> #6: mtry = 2 trees = 50 rmse = 22.80 rmse_std_err = 2.41
+#> #7: mtry = 3 trees = 400 rmse = 23.19 rmse_std_err = 2.26
+#> #8: mtry = 3 trees = 750 rmse = 23.42 rmse_std_err = 2.07
+#> #9: mtry = 3 trees = 50 rmse = 23.69 rmse_std_err = 2.22
+#>
+#> Working on combination with id 15 of 16.
+#> Results of the 5-fold cross validation (top 9 best configurations):
+#> #1: mtry = 1 trees = 400 rmse = 18.33 rmse_std_err = 2.07
+#> #2: mtry = 1 trees = 750 rmse = 18.59 rmse_std_err = 2.25
+#> #3: mtry = 2 trees = 750 rmse = 18.78 rmse_std_err = 1.59
+#> #4: mtry = 2 trees = 400 rmse = 18.81 rmse_std_err = 1.58
+#> #5: mtry = 3 trees = 50 rmse = 18.93 rmse_std_err = 1.53
+#> #6: mtry = 3 trees = 400 rmse = 19.11 rmse_std_err = 1.57
+#> #7: mtry = 3 trees = 750 rmse = 19.17 rmse_std_err = 1.71
+#> #8: mtry = 2 trees = 50 rmse = 19.18 rmse_std_err = 1.33
+#> #9: mtry = 1 trees = 50 rmse = 19.94 rmse_std_err = 2.02
+#>
+```
+
+We can look at the $\operatorname{MSE}_v$ evaluation criterion,
+and we see that cross-validation improves the decision tree
+methods, while the random forest method is essentially unchanged
+(in fact, marginally worse here). The two cross-validated decision
+tree methods are comparable, but the second version outperforms
+the first version by a small margin. This comparison is somewhat
+unfair for the `empirical` approach, which also has hyperparameters
+we could potentially tune. However, `shapr` does not currently
+provide a function to do this automatically. In the figure below,
+we include a vertical line at the $\operatorname{MSE}_v$ score of
+the `empirical` method for easier comparison.
+
+
+```r
+plot_MSEv_scores(explanation_list, method_line = "MC_empirical")
+```
+
+![](figure_regression/dt-cv-plot-1.png)
+
+Furthermore, we must consider that cross-validation drastically
+increases the elapsed time (seconds) and determine if the
+increased precision is worth the extra computational time.
+We also see that the complex random forest method performs
+significantly worse than the simple decision tree method.
+This result indicates that even though we do hyperparameter
+tuning, we still overfit the data.
+
+
+```r
+# Print the MSEv scores and the elapsed time (in seconds) for the different methods
+print_MSEv_scores_and_time(explanation_list)
+#>                      MSEv  Time
+#> MC_empirical       179.43  2.22
+#> sep_lm             745.21  0.77
+#> sep_pcr            784.91  1.32
+#> sep_splines        165.13  1.09
+#> sep_recipe_example 687.45  1.74
+#> sep_tree_stump     218.05  1.03
+#> sep_tree_default   177.68  0.89
+#> sep_tree_cv        169.96 17.31
+#> sep_tree_cv_2      166.17 35.01
+#> sep_rf             210.99  1.58
+#> sep_rf_cv          212.88 38.41
+```
+
+
+### Parallelization {#separate_parallelization}
+
+The `future` package can train the separate regression models
+in parallel. More specifically, we parallelize both the
+training step (when we fit the models) and the prediction
+step (when we compute $v(S)$). In the main vignette, we also
+explain how to enable progress bars.
+
+In the code chunk below, we consider four regression-based
+methods. The first method uses `xgboost` models with default
+hyperparameter values, while the remaining three use
+cross-validation to tune the number of trees. The second and
+third methods specify the same potential hyperparameter values,
+but we run the former sequentially while the latter is run in
+parallel to speed up the computations. The fourth model is run
+in parallel but also tunes the depth of the trees and not only
+the number of trees.
+
+A small side note: If we set `verbose = 2`, we can see which
+`trees` value `shapr` chooses for each coalition. We would then
+see that the values 25, 50, 100, and 500 are never chosen.
+Thus, we can remove these values without influencing the result
+and instead do a finer grid search among the lower values.
+We do this in the fourth method.
+
+
+```r
+# Regular xgboost with default parameters
+explanation_list$sep_xgboost <- explain(
+  model = model,
+  x_explain = x_explain,
+  x_train = x_train,
+  prediction_zero = p0,
+  n_batches = 4,
+  approach = "regression_separate",
+  regression.model = parsnip::boost_tree(engine = "xgboost", mode = "regression")
+)
+#> Note: Feature classes extracted from the model contains NA.
+#> Assuming feature classes from the data are correct.
+
+# Cross validate the number of trees
+explanation_list$sep_xgboost_cv <- explain(
+  model = model,
+  x_explain = x_explain,
+  x_train = x_train,
+  prediction_zero = p0,
+  n_batches = 4,
+  approach = "regression_separate",
+  regression.model =
+    parsnip::boost_tree(trees = hardhat::tune(), engine = "xgboost", mode = "regression"),
+  regression.tune_values = expand.grid(trees = c(10, 15, 25, 50, 100, 500)),
+  regression.vfold_cv_para = list(v = 5)
+)
+#> Note: Feature classes extracted from the model contains NA.
+#> Assuming feature classes from the data are correct.
+
+# Cross validate the number of trees in parallel on two threads
+future::plan(future::multisession, workers = 2)
+explanation_list$sep_xgboost_cv_par <- explain(
+  model = model,
+  x_explain = x_explain,
+  x_train = x_train,
+  prediction_zero = p0,
+  n_batches = 4,
+  approach = "regression_separate",
+  regression.model =
+    parsnip::boost_tree(trees = hardhat::tune(), engine = "xgboost", mode = "regression"),
+  regression.tune_values = expand.grid(trees = c(10, 15, 25, 50, 100, 500)),
+  regression.vfold_cv_para = list(v = 5)
+)
+#> Note: Feature classes extracted from the model contains NA.
+#> Assuming feature classes from the data are correct.
+
+# Use a finer grid of low values for `trees` and also tune `tree_depth`
+future::plan(future::multisession, workers = 4) # Change to 4 threads due to more complex CV
+explanation_list$sep_xgboost_cv_2_par <- explain(
+  model = model,
+  x_explain = x_explain,
+  x_train = x_train,
+  prediction_zero = p0,
+  n_batches = 4,
+  approach = "regression_separate",
+  regression.model = parsnip::boost_tree(
+    trees = hardhat::tune(),
+    tree_depth = hardhat::tune(),
+    engine = "xgboost",
+    mode = "regression"
+  ),
+  regression.tune_values = expand.grid(trees = c(8, 10, 12, 15), tree_depth = c(4, 6, 8)),
+  regression.vfold_cv_para = list(v = 5)
+)
+#> Note: Feature classes extracted from the model contains NA.
+#> Assuming feature classes from the data are correct.
+future::plan(future::sequential) # To return to non-parallel computation
+```
+
+Looking at the elapsed time, we see that the parallel version
+with two workers is faster than the sequential version. Note
+that the elapsed time of the parallel version is not halved, as
+creating the parallel processes adds some overhead, which is
+significant in this small example. However, parallelization will
+yield considerable relative time improvements in more complex
+situations, e.g., in settings with more training observations and
+more features (i.e., more coalitions to compute), or with more
+time-consuming cross-validation (i.e., more folds, more
+hyperparameters to tune, or more hyperparameter values to consider).
+Furthermore, we see that conducting the cross-validation has
+lowered the $\operatorname{MSE}_v$ criterion drastically. Finally,
+note that we obtain the same value whether we run the
+cross-validation in parallel or sequentially.
+
+
+```r
+# Print the MSEv scores and the elapsed time (in seconds) for the different methods
+print_MSEv_scores_and_time(explanation_list)
+#>                        MSEv  Time
+#> MC_empirical         179.43  2.22
+#> sep_lm               745.21  0.77
+#> sep_pcr              784.91  1.32
+#> sep_splines          165.13  1.09
+#> sep_recipe_example   687.45  1.74
+#> sep_tree_stump       218.05  1.03
+#> sep_tree_default     177.68  0.89
+#> sep_tree_cv          169.96 17.31
+#> sep_tree_cv_2        166.17 35.01
+#> sep_rf               210.99  1.58
+#> sep_rf_cv            212.88 38.41
+#> sep_xgboost          197.72  0.99
+#> sep_xgboost_cv       164.69 20.72
+#> sep_xgboost_cv_par   164.69 17.53
+#> sep_xgboost_cv_2_par 146.51 21.94
+```
+
+
+# The surrogate regression method class {#surrogate}
+
+Since the `regression_separate` methods train a new
+regression model $g_S(\boldsymbol{x}_S)$ for each coalition
+$S \subseteq \{1,2,\dots,M\}$, a total of $2^M-2$
+models have to be trained, which can be time-consuming
+for slowly fitted models. The minus two corresponds to
+the empty and grand coalitions; for the $M = 4$ features
+in our example, this amounts to $2^4-2 = 14$ trained models.
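+
+The surrogate method class introduced next avoids this cost by
+training a single model on an augmented data set. As a rough
+illustration of the augmentation idea, consider the following
+minimal sketch. It uses a hypothetical masking scheme for $M = 2$
+features (mask excluded features with zeros and add coalition
+indicator columns); the exact augmentation used by `shapr` is
+described in Section 3.6.1 of @olsen2024comparative and differs
+in the details.
+
+
+```r
+# Hypothetical sketch of surrogate data augmentation for M = 2 features.
+# Each training row is repeated once per non-trivial coalition S; features
+# outside S are masked (here: set to 0), and binary indicator columns
+# record which features are members of S.
+M <- 2
+x <- data.frame(x1 = c(1.5, 2.0), x2 = c(3.0, 4.0))
+z <- c(10, 12) # The predictions f(x), acting as the response
+coalitions <- list(1, 2) # All S except the empty and grand coalitions
+augmented <- do.call(rbind, lapply(coalitions, function(S) {
+  x_masked <- x
+  x_masked[, setdiff(seq_len(M), S)] <- 0 # Mask the features not in S
+  ind <- matrix(as.integer(seq_len(M) %in% S), nrow(x), M, byrow = TRUE)
+  colnames(ind) <- paste0("I", seq_len(M))
+  cbind(x_masked, ind, z = z)
+}))
+augmented # One row per (training observation, coalition) pair
+```
+
+A single regression model trained on such a table can then predict
+the contribution function for all coalitions simultaneously.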
+
+The `regression_surrogate` method class builds on the
+ideas from the `regression_separate` class, but instead of
+fitting a new regression model for each coalition, we
+train a single regression model $g(\tilde{\boldsymbol{x}}_S)$
+for all coalitions $S \subseteq \{1,2,\dots,M\}$ (except the
+empty and grand coalitions), where $\tilde{\boldsymbol{x}}_S$
+is an augmented version of $\boldsymbol{x}_S$. See Section 3.6.1
+in @olsen2024comparative for more details and examples.
+
+All the examples above for the separate regression method class
+also apply to the surrogate regression method class.
+
+
+## Code {#surrogate_code}
+
+We demonstrate the surrogate method class using several
+regression models below. More specifically, we use linear
+regression, random forest, and `xgboost`, where the latter two
+are used both with default hyperparameters and with (partial)
+cross-validation.
+
+
+```r
+# Compute the Shapley value explanations using a surrogate linear regression model
+explanation_list$sur_lm <- explain(
+  model = model,
+  x_explain = x_explain,
+  x_train = x_train,
+  prediction_zero = p0,
+  n_batches = 4,
+  approach = "regression_surrogate",
+  regression.model = parsnip::linear_reg()
+)
+#> Note: Feature classes extracted from the model contains NA.
+#> Assuming feature classes from the data are correct.
+
+# Using xgboost with default parameters as the surrogate model
+explanation_list$sur_xgboost <- explain(
+  model = model,
+  x_explain = x_explain,
+  x_train = x_train,
+  prediction_zero = p0,
+  n_batches = 4,
+  approach = "regression_surrogate",
+  regression.model = parsnip::boost_tree(engine = "xgboost", mode = "regression")
+)
+#> Note: Feature classes extracted from the model contains NA.
+#> Assuming feature classes from the data are correct.
+
+# Using xgboost with parameters tuned by cross-validation as the surrogate model
+explanation_list$sur_xgboost_cv <- explain(
+  model = model,
+  x_explain = x_explain,
+  x_train = x_train,
+  prediction_zero = p0,
+  n_batches = 4,
+  approach = "regression_surrogate",
+  regression.model = parsnip::boost_tree(
+    trees = hardhat::tune(),
+    tree_depth = hardhat::tune(),
+    engine = "xgboost",
+    mode = "regression"
+  ),
+  regression.tune_values = expand.grid(trees = c(5, 15, 25), tree_depth = c(2, 6, 10)),
+  regression.vfold_cv_para = list(v = 5)
+)
+#> Note: Feature classes extracted from the model contains NA.
+#> Assuming feature classes from the data are correct.
+
+# Using random forest with default parameters as the surrogate model
+explanation_list$sur_rf <- explain(
+  model = model,
+  x_explain = x_explain,
+  x_train = x_train,
+  prediction_zero = p0,
+  n_batches = 4,
+  approach = "regression_surrogate",
+  regression.model = parsnip::rand_forest(engine = "ranger", mode = "regression")
+)
+#> Note: Feature classes extracted from the model contains NA.
+#> Assuming feature classes from the data are correct.
+
+# Using random forest with parameters tuned by cross-validation as the surrogate model
+explanation_list$sur_rf_cv <- explain(
+  model = model,
+  x_explain = x_explain,
+  x_train = x_train,
+  prediction_zero = p0,
+  n_batches = 4,
+  approach = "regression_surrogate",
+  regression.model = parsnip::rand_forest(
+    mtry = hardhat::tune(), trees = hardhat::tune(), engine = "ranger", mode = "regression"
+  ),
+  regression.tune_values = dials::grid_regular(
+    dials::mtry(c(1, ncol(x_explain))),
+    dials::trees(c(50, 750)),
+    levels = 6
+  ),
+  regression.vfold_cv_para = list(v = 5)
+)
+#> Note: Feature classes extracted from the model contains NA.
+#> Assuming feature classes from the data are correct.
+```
+
+
+### Parallelization {#surrogate_parallelization}
+
+The code chunk below demonstrates how to run the surrogate
+regression method class in parallel using the `future` package.
+The setup procedure is identical to the one we specified for
+the [separate regression method class](#separate_parallelization).
+The training step of the surrogate regression model can be run
+in parallel if we tune some of its hyperparameters, as it is the
+cross-validation procedure in the training step that we
+parallelize; hence, no parallelization is applied when training
+a surrogate model with fully specified hyperparameters.
+Furthermore, we parallelize the prediction step (when we compute
+$v(S)$) in the same way as for the separate regression method
+class. Note that parallelization will introduce some overhead,
+which can cause it to be slower than running the code sequentially
+for smaller problems.
+
+
+```r
+# Cross validate the number of trees in parallel on four threads
+future::plan(future::multisession, workers = 4)
+explanation_list$sur_rf_cv_par <- explain(
+  model = model,
+  x_explain = x_explain,
+  x_train = x_train,
+  prediction_zero = p0,
+  n_batches = 4,
+  approach = "regression_surrogate",
+  regression.model = parsnip::rand_forest(
+    mtry = hardhat::tune(), trees = hardhat::tune(), engine = "ranger", mode = "regression"
+  ),
+  regression.tune_values = dials::grid_regular(
+    dials::mtry(c(1, ncol(x_explain))),
+    dials::trees(c(50, 750)),
+    levels = 6
+  ),
+  regression.vfold_cv_para = list(v = 5)
+)
+#> Note: Feature classes extracted from the model contains NA.
+#> Assuming feature classes from the data are correct.
+future::plan(future::sequential) # To return to non-parallel computation
+
+# Check that we get identical Shapley value explanations
+all.equal(
+  explanation_list$sur_rf_cv$shapley_values,
+  explanation_list$sur_rf_cv_par$shapley_values
+)
+#> [1] TRUE
+```
+
+By looking at the $\operatorname{MSE}_v$ evaluation criterion
+and the elapsed time, we see that most of the surrogate methods
+(except the linear regression model and the default random forest)
+outperform `empirical`, but they are not on the same level as the
+best separate regression methods. Furthermore, parallelization
+(4 workers) obtained the same $\operatorname{MSE}_v$ score as the
+sequential run, but was in fact slightly slower here due to the
+parallelization overhead discussed above. The identical scores
+mean that the fitted surrogate models are identical and
+independent of whether they were run sequentially or in parallel.
+
+
+```r
+# Print the MSEv scores and the elapsed time (in seconds) for the different methods
+print_MSEv_scores_and_time(explanation_list)
+#>                        MSEv  Time
+#> MC_empirical         179.43  2.22
+#> sep_lm               745.21  0.77
+#> sep_pcr              784.91  1.32
+#> sep_splines          165.13  1.09
+#> sep_recipe_example   687.45  1.74
+#> sep_tree_stump       218.05  1.03
+#> sep_tree_default     177.68  0.89
+#> sep_tree_cv          169.96 17.31
+#> sep_tree_cv_2        166.17 35.01
+#> sep_rf               210.99  1.58
+#> sep_rf_cv            212.88 38.41
+#> sep_xgboost          197.72  0.99
+#> sep_xgboost_cv       164.69 20.72
+#> sep_xgboost_cv_par   164.69 17.53
+#> sep_xgboost_cv_2_par 146.51 21.94
+#> sur_lm               649.61  0.31
+#> sur_xgboost          169.92  0.26
+#> sur_xgboost_cv       169.87  2.37
+#> sur_rf               195.10  0.52
+#> sur_rf_cv            171.84 30.55
+#> sur_rf_cv_par        171.84 33.24
+
+# Compare the MSEv criterion of the different explanation methods.
+# Include vertical line corresponding to the MSEv of the empirical method.
+plot_MSEv_scores(explanation_list, method_line = "MC_empirical")
+```
+
+![](figure_regression/surrogate-plot-1.png)
+
+
+# Add new regression methods {#new}
+
+Even though the `tidymodels` framework contains many
+[models](https://www.tidymodels.org/find/parsnip/), we might
+want to add additional methods. In the following section, we
+demonstrate how to add the projection pursuit regression (PPR)
+model as a new method that can be used by `shapr` to compute the
+Shapley value explanations, both as a separate and surrogate method.
+
+We use the `ppr()` implementation in the `stats` package to fit
+the PPR model. The model has several hyperparameters that can be
+tuned, but the main hyperparameter is the number of terms `nterms`.
+The following is based on the [`tidymodels` guide](https://www.tidymodels.org/learn/develop/models/)
+on adding new regression models. We refer to that guide for more
+details and explanations of the code below.
+ + +```r +# Step 1: register the model, modes, and arguments +parsnip::set_new_model(model = "ppr_reg") +parsnip::set_model_mode(model = "ppr_reg", mode = "regression") +parsnip::set_model_engine(model = "ppr_reg", mode = "regression", eng = "ppr") +parsnip::set_dependency("ppr_reg", eng = "ppr", pkg = "stats") + +# If your function has several parameters, then we add one of these functions for each parameter +parsnip::set_model_arg( + model = "ppr_reg", + eng = "ppr", + original = "nterms", # The original parameter name used in stats::ppr + parsnip = "num_terms", # Change parameter name to match tidymodels' name convention + func = list(pkg = "dials", fun = "num_terms"), # list(pkg = "stats", fun = "ppr"), + has_submodel = FALSE +) + +# Step 2: create the model function +ppr_reg <- function(mode = "regression", engine = "ppr", num_terms = NULL) { + # Check for correct mode + if (mode != "regression") rlang::abort("`mode` should be 'regression'") + + # Check for correct engine + if (engine != "ppr") rlang::abort("`engine` should be 'ppr'") + + # Capture the arguments in quosures + args <- list(num_terms = rlang::enquo(num_terms)) + + # Save some empty slots for future parts of the specification + parsnip::new_model_spec( + "ppr_reg", + args = args, + eng_args = NULL, + mode = mode, + method = NULL, + engine = engine + ) +} + +# Step 3: add a fit module +parsnip::set_fit( + model = "ppr_reg", + eng = "ppr", + mode = "regression", + value = list( + interface = "formula", + protect = c("formula", "data", "weights"), + func = c(pkg = "stats", fun = "ppr"), + defaults = list() + ) +) + +parsnip::set_encoding( + model = "ppr_reg", + eng = "ppr", + mode = "regression", + options = list( + predictor_indicators = "traditional", + compute_intercept = TRUE, + remove_intercept = TRUE, + allow_sparse_x = FALSE + ) +) + +# Step 4: add modules for prediction +parsnip::set_pred( + model = "ppr_reg", + eng = "ppr", + mode = "regression", + type = "numeric", + value = list( + pre = NULL, + post = NULL, + func = c(fun = "predict"), + args = list( + object = quote(object$fit), + newdata = quote(new_data), + type = "numeric" + ) + ) +) + +# Step 5: add tuning function (used by tune::tune_grid()) +tunable.ppr_reg <- function(x, ...) { + tibble::tibble( + name = c("num_terms"), + call_info = list(list(pkg = NULL, fun = "num_terms")), + source = "model_spec", + component = "ppr_reg", + component_id = "main" + ) +} + +# Step 6: add updating function (used by tune::finalize_workflow()) +update.ppr_reg <- function(object, parameters = NULL, num_terms = NULL, ...) { + rlang::check_installed("parsnip") + eng_args <- parsnip::update_engine_parameters(object$eng_args, fresh = TRUE, ...) + args <- list(num_terms = rlang::enquo(num_terms)) + args <- parsnip::update_main_parameters(args, parameters) + parsnip::new_model_spec( + "ppr_reg", + args = args, + eng_args = eng_args, + mode = object$mode, + method = NULL, + engine = object$engine + ) +} +``` + +We can now use the PPR model to compute the Shapley value +explanations. We can use it as a separate and surrogate +regression method, and we can either set the number of +terms `num_terms` to a specific value or use cross-validation +to tune the hyperparameter. We do all four combinations below. 
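+
+But first, as a quick and optional sanity check (a sketch, assuming
+the registration above ran without errors), we can fit the newly
+registered model directly with `parsnip` before handing it to
+`explain()`. The helper object name below is arbitrary.
+
+
+```r
+# Optional check (hypothetical usage): fit the registered PPR model directly
+ppr_check <- parsnip::fit(
+  ppr_reg(num_terms = 2),
+  Ozone ~ .,
+  data = cbind(x_train, Ozone = y_train)
+)
+head(predict(ppr_check, new_data = x_explain)$.pred)
+```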
+
+
+```r
+# PPR separate with specified number of terms
+explanation_list$sep_ppr <- explain(
+  model = model,
+  x_explain = x_explain,
+  x_train = x_train,
+  prediction_zero = p0,
+  n_batches = 4,
+  approach = "regression_separate",
+  regression.model = ppr_reg(num_terms = 2)
+)
+#> Note: Feature classes extracted from the model contains NA.
+#> Assuming feature classes from the data are correct.
+
+# PPR separate with cross-validated number of terms
+explanation_list$sep_ppr_cv <- explain(
+  model = model,
+  x_explain = x_explain,
+  x_train = x_train,
+  prediction_zero = p0,
+  n_batches = 4,
+  approach = "regression_separate",
+  regression.model = ppr_reg(num_terms = hardhat::tune()),
+  regression.tune_values = dials::grid_regular(dials::num_terms(c(1, 4)), levels = 3),
+  regression.vfold_cv_para = list(v = 5)
+)
+#> Note: Feature classes extracted from the model contains NA.
+#> Assuming feature classes from the data are correct.
+
+# PPR surrogate with specified number of terms
+explanation_list$sur_ppr <- explain(
+  model = model,
+  x_explain = x_explain,
+  x_train = x_train,
+  prediction_zero = p0,
+  n_batches = 4,
+  approach = "regression_surrogate",
+  regression.model = ppr_reg(num_terms = 3)
+)
+#> Note: Feature classes extracted from the model contains NA.
+#> Assuming feature classes from the data are correct.
+
+# PPR surrogate with cross-validated number of terms
+explanation_list$sur_ppr_cv <- explain(
+  model = model,
+  x_explain = x_explain,
+  x_train = x_train,
+  prediction_zero = p0,
+  n_batches = 4,
+  approach = "regression_surrogate",
+  regression.model = ppr_reg(num_terms = hardhat::tune()),
+  regression.tune_values = dials::grid_regular(dials::num_terms(c(1, 8)), levels = 4),
+  regression.vfold_cv_para = list(v = 5)
+)
+#> Note: Feature classes extracted from the model contains NA.
+#> Assuming feature classes from the data are correct.
+```
+
+We can then compare the $\operatorname{MSE}_v$ scores and some of
+the Shapley value explanations. We see that cross-validation
+improves the evaluation criterion for the separate PPR method, but
+not for the surrogate version, while it increases the running time
+in both cases.
+
+
+```r
+# Print the MSEv scores and the elapsed time (in seconds) for the different methods
+print_MSEv_scores_and_time(explanation_list)
+#>                        MSEv  Time
+#> MC_empirical         179.43  2.22
+#> sep_lm               745.21  0.77
+#> sep_pcr              784.91  1.32
+#> sep_splines          165.13  1.09
+#> sep_recipe_example   687.45  1.74
+#> sep_tree_stump       218.05  1.03
+#> sep_tree_default     177.68  0.89
+#> sep_tree_cv          169.96 17.31
+#> sep_tree_cv_2        166.17 35.01
+#> sep_rf               210.99  1.58
+#> sep_rf_cv            212.88 38.41
+#> sep_xgboost          197.72  0.99
+#> sep_xgboost_cv       164.69 20.72
+#> sep_xgboost_cv_par   164.69 17.53
+#> sep_xgboost_cv_2_par 146.51 21.94
+#> sur_lm               649.61  0.31
+#> sur_xgboost          169.92  0.26
+#> sur_xgboost_cv       169.87  2.37
+#> sur_rf               195.10  0.52
+#> sur_rf_cv            171.84 30.55
+#> sur_rf_cv_par        171.84 33.24
+#> sep_ppr              327.23  1.41
+#> sep_ppr_cv           269.74 15.46
+#> sur_ppr              395.42  0.29
+#> sur_ppr_cv           415.62  1.86
+
+# Compare the MSEv criterion of the different explanation methods
+plot_MSEv_scores(explanation_list, method_line = "MC_empirical")
+```
+
+![](figure_regression/ppr-plot-1.png)
+
+
+# Summary figures {#summary}
+
+In this section, we compute the Shapley value explanations for the
+Monte Carlo-based methods in the `shapr` package and compare the results
+with all the regression-based methods above. The purpose of this vignette
+is to demonstrate the rich possibilities that the regression paradigm and
+the `tidymodels` framework add to the `shapr` package.
+
+In the code chunk below, we compute the Shapley value explanations using
+the different Monte Carlo-based methods.
+
+
+```r
+explanation_list_MC <- list()
+
+# Compute the Shapley value explanations using the independence method
+explanation_list_MC$MC_independence <- explain(
+  model = model,
+  x_explain = x_explain,
+  x_train = x_train,
+  n_batches = 4,
+  approach = "independence",
+  prediction_zero = p0
+)
+#> Note: Feature classes extracted from the model contains NA.
+#> Assuming feature classes from the data are correct.
+
+# Copy the Shapley value explanations for the empirical method
+explanation_list_MC$MC_empirical <- explanation_list$MC_empirical
+
+# Compute the Shapley value explanations using the gaussian method
+explanation_list_MC$MC_gaussian <- explain(
+  model = model,
+  x_explain = x_explain,
+  x_train = x_train,
+  n_batches = 4,
+  approach = "gaussian",
+  prediction_zero = p0
+)
+#> Note: Feature classes extracted from the model contains NA.
+#> Assuming feature classes from the data are correct.
+
+# Compute the Shapley value explanations using the copula method
+explanation_list_MC$MC_copula <- explain(
+  model = model,
+  x_explain = x_explain,
+  x_train = x_train,
+  n_batches = 4,
+  approach = "copula",
+  prediction_zero = p0
+)
+#> Note: Feature classes extracted from the model contains NA.
+#> Assuming feature classes from the data are correct.
+
+# Compute the Shapley value explanations using the ctree method
+explanation_list_MC$MC_ctree <- explain(
+  model = model,
+  x_explain = x_explain,
+  x_train = x_train,
+  n_batches = 4,
+  approach = "ctree",
+  prediction_zero = p0
+)
+#> Note: Feature classes extracted from the model contains NA.
+#> Assuming feature classes from the data are correct.
+
+# Compute the Shapley value explanations using the vaeac method
+explanation_list_MC$MC_vaeac <- explain(
+  model = model,
+  x_explain = x_explain,
+  x_train = x_train,
+  n_batches = 4,
+  approach = "vaeac",
+  prediction_zero = p0,
+  vaeac.epochs = 10
+)
+#> Note: Feature classes extracted from the model contains NA.
+#> Assuming feature classes from the data are correct.
+
+# Combine the two explanation lists
+explanation_list$MC_empirical <- NULL
+explanation_list <- c(explanation_list_MC, explanation_list)
+```
+
+We then compare the regression and Monte Carlo-based methods by
+plotting the $\operatorname{MSE}_v$ evaluation criterion. We continue
+to include a vertical line corresponding to the $\operatorname{MSE}_v$ of
+the `MC_empirical` method to make the comparison easier.
+
+
+```r
+# Print the MSEv scores and the elapsed time (in seconds) for the different methods
+print_MSEv_scores_and_time(explanation_list)
+#>                        MSEv  Time
+#> MC_independence      206.92  0.50
+#> MC_empirical         179.43  2.22
+#> MC_gaussian          245.19  0.49
+#> MC_copula            247.29  0.46
+#> MC_ctree             191.82  1.72
+#> MC_vaeac             141.88 72.61
+#> sep_lm               745.21  0.77
+#> sep_pcr              784.91  1.32
+#> sep_splines          165.13  1.09
+#> sep_recipe_example   687.45  1.74
+#> sep_tree_stump       218.05  1.03
+#> sep_tree_default     177.68  0.89
+#> sep_tree_cv          169.96 17.31
+#> sep_tree_cv_2        166.17 35.01
+#> sep_rf               210.99  1.58
+#> sep_rf_cv            212.88 38.41
+#> sep_xgboost          197.72  0.99
+#> sep_xgboost_cv       164.69 20.72
+#> sep_xgboost_cv_par   164.69 17.53
+#> sep_xgboost_cv_2_par 146.51 21.94
+#> sur_lm               649.61  0.31
+#> sur_xgboost          169.92  0.26
+#> sur_xgboost_cv       169.87  2.37
+#> sur_rf               195.10  0.52
+#> sur_rf_cv            171.84 30.55
+#> sur_rf_cv_par        171.84 33.24
+#> sep_ppr              327.23  1.41
+#> sep_ppr_cv           269.74 15.46
+#> sur_ppr              395.42  0.29
+#> sur_ppr_cv           415.62  1.86
+
+# Compare the MSEv criterion of the different explanation methods
+# Include vertical line corresponding to the MSEv of the MC_empirical method
+plot_MSEv_scores(explanation_list, method_line = "MC_empirical")
+```
+
+![](figure_regression/MSEv-sum-1.png)
+
+The `vaeac` approach is the best-performing method according to the
+$\operatorname{MSE}_v$ evaluation criterion, while `sep_xgboost_cv_2_par`
+is the best-performing regression-based method. However, we should note that
+the `vaeac` method is much slower and that the difference between the
+$\operatorname{MSE}_v$ values is minuscule and inside the confidence intervals.
+
+We can also sort the methods by the $\operatorname{MSE}_v$ criterion
+to make their ranking easier to see.
+
+
+```r
+order <- get_k_best_methods(explanation_list, k = length(explanation_list))
+plot_MSEv_scores(explanation_list[order], method_line = "MC_empirical")
+```
+
+![](figure_regression/MSEv-sum-2-1.png)
+
+We can also examine the different Shapley value explanations for the first six
+explicands (two at a time), and we still sort the methods from best to worst.
+Most methods agree in the general directions, especially for the most important
+features (the features with the largest absolute Shapley values), but there are
+some differences for the less important features. These tendencies/discrepancies
+are often more visible for the methods with larger (i.e., worse)
+$\operatorname{MSE}_v$ values.
+
+
+```r
+plot_SV_several_approaches(explanation_list[order], index_explicands = c(1, 2), facet_ncol = 1)
+```
+
+![](figure_regression/SV-sum-1.png)
+
+```r
+plot_SV_several_approaches(explanation_list[order], index_explicands = c(3, 4), facet_ncol = 1)
+```
+
+![](figure_regression/SV-sum-2.png)
+
+```r
+plot_SV_several_approaches(explanation_list[order], index_explicands = c(5, 6), facet_ncol = 1)
+```
+
+![](figure_regression/SV-sum-3.png)
+
+Here, we focus on the five best methods (and `MC_empirical`) to make it
+easier to analyze the individual Shapley value explanations, and we see
+quite strong agreement between the different methods.
+ + +```r +# Extract the 5 best methods (and empirical) +best_methods <- get_k_best_methods(explanation_list, k = 5) +if (!"MC_empirical" %in% best_methods) best_methods <- c(best_methods, "MC_empirical") +plot_SV_several_approaches(explanation_list[best_methods], index_explicands = 1:4) +``` + +![](figure_regression/SV-sum-2-1.png) + + +# Mixed data {#mixed} + +In this section, we replicate and extend the mixed data example +from the main vignette by demonstrating the separate and surrogate +regression methods. Of the Monte Carlo-based methods, only the +`independence` (not recommended), `ctree`, and `vaeac` methods support +mixed data. We can divide the regression models into two groups based +on whether the model can handle categorical features by default or if +we need to apply pre-processing of the categorical features. By +pre-processing, we mean that we need to convert the categorical features +into numerical values using, for example, dummy features. We demonstrate +this below using the `regression.recipe_func` function. + +## Mixed data: setup + +First, we copy the setup from the main vignette. + + +```r +# convert the month variable to a factor +data_cat <- copy(data)[, Month_factor := as.factor(Month)] + +data_train_cat <- data_cat[-ind_x_explain, ] +data_explain_cat <- data_cat[ind_x_explain, ] + +x_var_cat <- c("Solar.R", "Wind", "Temp", "Month_factor") + +x_train_cat <- data_train_cat[, ..x_var_cat] +x_explain_cat <- data_explain_cat[, ..x_var_cat] + +p0_cat <- mean(y_train) + +# Fitting an lm model here as xgboost does not handle categorical features directly +formula <- as.formula(paste0(y_var, " ~ ", paste0(x_var_cat, collapse = " + "))) +model_cat <- lm(formula, data_train_cat) + +# We could also consider other models such as random forest which supports mixed data +# model_cat <- ranger(formula, data_train_cat) + +# List to store the explanations for this mixed data setup +explanation_list_mixed <- list() +``` + + +## Mixed data: Monte Carlo-based methods + +Second, we compute the explanations using the Monte Carlo-based methods. + + +```r +explanation_list_mixed$MC_independence <- explain( + model = model_cat, + x_explain = x_explain_cat, + x_train = x_train_cat, + prediction_zero = p0_cat, + n_batches = 4, + approach = "independence" +) + +explanation_list_mixed$MC_ctree <- explain( + model = model_cat, + x_explain = x_explain_cat, + x_train = x_train_cat, + prediction_zero = p0_cat, + n_batches = 4, + approach = "ctree" +) + +explanation_list_mixed$MC_vaeac <- explain( + model = model_cat, + x_explain = x_explain_cat, + x_train = x_train_cat, + prediction_zero = p0_cat, + n_batches = 4, + approach = "vaeac" +) +``` + + +## Mixed data: separate regression methods + +Third, we compute the Shapley value explanations using separate +regression methods. We use many of the same regression models +as we did above for the continuous data examples. 
+ + +```r +# Standard linear regression +explanation_list_mixed$sep_lm <- explain( + model = model_cat, + x_explain = x_explain_cat, + x_train = x_train_cat, + prediction_zero = p0_cat, + n_batches = 4, + approach = "regression_separate", + regression.model = parsnip::linear_reg() +) + +# Linear regression where we have added splines to the numerical features +explanation_list_mixed$sep_splines <- explain( + model = model_cat, + x_explain = x_explain_cat, + x_train = x_train_cat, + prediction_zero = p0_cat, + n_batches = 4, + approach = "regression_separate", + regression.model = parsnip::linear_reg(), + regression.recipe_func = function(regression_recipe) { + return(step_ns(regression_recipe, all_numeric_predictors(), deg_free = 2)) + } +) + +# Decision tree with default parameters +explanation_list_mixed$sep_tree <- explain( + model = model_cat, + x_explain = x_explain_cat, + x_train = x_train_cat, + prediction_zero = p0_cat, + n_batches = 4, + approach = "regression_separate", + regression.model = parsnip::decision_tree(engine = "rpart", mode = "regression") +) + +# Use trees with cross-validation on the depth and cost complexity. Manually set the values. +explanation_list_mixed$sep_tree_cv <- explain( + model = model_cat, + x_explain = x_explain_cat, + x_train = x_train_cat, + prediction_zero = p0_cat, + n_batches = 4, + approach = "regression_separate", + regression.model = parsnip::decision_tree( + tree_depth = hardhat::tune(), + cost_complexity = hardhat::tune(), + engine = "rpart", + mode = "regression" + ), + regression.tune_values = + expand.grid(tree_depth = c(1, 3, 5), cost_complexity = c(0.001, 0.01, 0.1)), + regression.vfold_cv_para = list(v = 5) +) + +# Random forest with default hyperparameters. Do NOT need to use dummy features. +explanation_list_mixed$sep_rf <- explain( + model = model_cat, + x_explain = x_explain_cat, + x_train = x_train_cat, + prediction_zero = p0_cat, + n_batches = 4, + approach = "regression_separate", + regression.model = parsnip::rand_forest(engine = "ranger", mode = "regression") +) + +# Random forest with cross validated hyperparameters. 
explanation_list_mixed$sep_rf_cv <- explain(
  model = model_cat,
  x_explain = x_explain_cat,
  x_train = x_train_cat,
  prediction_zero = p0_cat,
  n_batches = 4,
  approach = "regression_separate",
  regression.model = parsnip::rand_forest(
    mtry = hardhat::tune(), trees = hardhat::tune(), engine = "ranger", mode = "regression"
  ),
  regression.tune_values =
    function(x) {
      dials::grid_regular(dials::mtry(c(1, ncol(x))), dials::trees(c(50, 750)), levels = 4)
    },
  regression.vfold_cv_para = list(v = 5)
)

# Xgboost with default hyperparameters, but we have to dummy encode the factors
explanation_list_mixed$sep_xgboost <- explain(
  model = model_cat,
  x_explain = x_explain_cat,
  x_train = x_train_cat,
  prediction_zero = p0_cat,
  n_batches = 4,
  approach = "regression_separate",
  regression.model = parsnip::boost_tree(engine = "xgboost", mode = "regression"),
  regression.recipe_func = function(regression_recipe) {
    return(step_dummy(regression_recipe, all_factor_predictors()))
  }
)

# Xgboost with cross-validated hyperparameters, where we dummy encode the factors
explanation_list_mixed$sep_xgboost_cv <- explain(
  model = model_cat,
  x_explain = x_explain_cat,
  x_train = x_train_cat,
  prediction_zero = p0_cat,
  n_batches = 4,
  approach = "regression_separate",
  regression.model = parsnip::boost_tree(
    trees = hardhat::tune(),
    tree_depth = hardhat::tune(),
    engine = "xgboost",
    mode = "regression"
  ),
  regression.recipe_func = function(regression_recipe) {
    return(step_dummy(regression_recipe, all_factor_predictors()))
  },
  regression.tune_values = expand.grid(trees = c(5, 15, 25), tree_depth = c(2, 6, 10)),
  regression.vfold_cv_para = list(v = 5)
)
```


## Mixed data: surrogate regression methods

Fourth, we compute the Shapley value explanations using surrogate
regression methods. We use the same regression models as we did
above for the separate regression method class.


```r
# Standard linear regression
explanation_list_mixed$sur_lm <- explain(
  model = model_cat,
  x_explain = x_explain_cat,
  x_train = x_train_cat,
  prediction_zero = p0_cat,
  n_batches = 4,
  approach = "regression_surrogate",
  regression.model = parsnip::linear_reg()
)

# Linear regression where we have added splines to the numerical features
# NOTE that we remove the augmented mask variables to avoid a rank-deficient fit
explanation_list_mixed$sur_splines <- explain(
  model = model_cat,
  x_explain = x_explain_cat,
  x_train = x_train_cat,
  prediction_zero = p0_cat,
  n_batches = 4,
  approach = "regression_surrogate",
  regression.model = parsnip::linear_reg(),
  regression.recipe_func = function(recipe) {
    return(step_ns(recipe, all_numeric_predictors(), -starts_with("mask_"), deg_free = 2))
  }
)

# Decision tree with default parameters
explanation_list_mixed$sur_tree <- explain(
  model = model_cat,
  x_explain = x_explain_cat,
  x_train = x_train_cat,
  prediction_zero = p0_cat,
  n_batches = 4,
  approach = "regression_surrogate",
  regression.model = parsnip::decision_tree(engine = "rpart", mode = "regression")
)

# Use trees with cross-validation on the depth and cost complexity. Manually set the values.
explanation_list_mixed$sur_tree_cv <- explain(
  model = model_cat,
  x_explain = x_explain_cat,
  x_train = x_train_cat,
  prediction_zero = p0_cat,
  n_batches = 4,
  approach = "regression_surrogate",
  regression.model = parsnip::decision_tree(
    tree_depth = hardhat::tune(),
    cost_complexity = hardhat::tune(),
    engine = "rpart",
    mode = "regression"
  ),
  regression.tune_values =
    expand.grid(tree_depth = c(1, 3, 5), cost_complexity = c(0.001, 0.01, 0.1)),
  regression.vfold_cv_para = list(v = 5)
)

# Random forest with default hyperparameters. Do NOT need to use dummy features.
explanation_list_mixed$sur_rf <- explain(
  model = model_cat,
  x_explain = x_explain_cat,
  x_train = x_train_cat,
  prediction_zero = p0_cat,
  n_batches = 4,
  approach = "regression_surrogate",
  regression.model = parsnip::rand_forest(engine = "ranger", mode = "regression")
)

# Random forest with cross validated hyperparameters.
explanation_list_mixed$sur_rf_cv <- explain(
  model = model_cat,
  x_explain = x_explain_cat,
  x_train = x_train_cat,
  prediction_zero = p0_cat,
  n_batches = 4,
  approach = "regression_surrogate",
  regression.model = parsnip::rand_forest(
    mtry = hardhat::tune(), trees = hardhat::tune(), engine = "ranger", mode = "regression"
  ),
  regression.tune_values = expand.grid(mtry = c(1, 2, 4), trees = c(50, 250, 500, 750)),
  regression.vfold_cv_para = list(v = 5)
)

# Xgboost with default hyperparameters, but we have to dummy encode the factors
explanation_list_mixed$sur_xgboost <- explain(
  model = model_cat,
  x_explain = x_explain_cat,
  x_train = x_train_cat,
  prediction_zero = p0_cat,
  n_batches = 4,
  approach = "regression_surrogate",
  regression.model = parsnip::boost_tree(engine = "xgboost", mode = "regression"),
  regression.recipe_func = function(regression_recipe) {
    return(step_dummy(regression_recipe, all_factor_predictors()))
  }
)

# Xgboost with cross-validated hyperparameters, where we dummy encode the factors
explanation_list_mixed$sur_xgboost_cv <- explain(
  model = model_cat,
  x_explain = x_explain_cat,
  x_train = x_train_cat,
  prediction_zero = p0_cat,
  n_batches = 4,
  approach = "regression_surrogate",
  regression.model = parsnip::boost_tree(
    trees = hardhat::tune(),
    tree_depth = hardhat::tune(),
    engine = "xgboost",
    mode = "regression"
  ),
  regression.recipe_func = function(regression_recipe) {
    return(step_dummy(regression_recipe, all_factor_predictors()))
  },
  regression.tune_values = expand.grid(trees = c(5, 15, 25), tree_depth = c(2, 6, 10)),
  regression.vfold_cv_para = list(v = 5)
)
```


## Mixed data: summary {#summary_mixed}

Fifth, and finally, we compare the results. The surrogate random
forest model performs well and outperforms its cross-validated
version, but note the wide confidence interval. Several of the
regression-based methods outperform the Monte Carlo-based methods:
three separate regression methods and three surrogate regression
methods obtain lower $\operatorname{MSE}_v$ scores than the best
Monte Carlo-based method.


```r
# Print the MSEv scores and the elapsed time (in seconds) for the different methods
print_MSEv_scores_and_time(explanation_list_mixed)
#> MSEv Time
#> MC_independence 641.82 0.69
#> MC_ctree 554.50 2.36
#> MC_vaeac 629.43 147.26
#> sep_lm 550.06 1.53
#> sep_splines 541.36 1.80
#> sep_tree 753.84 0.84
#> sep_tree_cv 756.27 41.75
#> sep_rf 521.79 1.10
#> sep_rf_cv 609.58 51.42
#> sep_xgboost 792.17 1.13
#> sep_xgboost_cv 595.98 26.29
#> sur_lm 610.61 0.51
#> sur_splines 596.86 0.55
#> sur_tree 677.04 0.38
#> sur_tree_cv 789.37 3.34
#> sur_rf 414.15 0.55
#> sur_rf_cv 533.06 15.50
#> sur_xgboost 606.92 0.40
#> sur_xgboost_cv 429.06 3.05

# Compare the MSEv criterion of the different explanation methods
# Include vertical line corresponding to the MSEv of the MC_ctree method
plot_MSEv_scores(explanation_list_mixed, method_line = "MC_ctree")
```

![](figure_regression/mixed-plot-1.png)

The two best-performing methods are the surrogate random forest
method and the surrogate xgboost method with cross-validation.
The Monte Carlo-based methods perform worse; the best of them,
`ctree`, ranks seventh overall.

We can also sort the methods by the $\operatorname{MSE}_v$ criterion
to make their ranking easier to see.


```r
order <- get_k_best_methods(explanation_list_mixed, k = length(explanation_list_mixed))
plot_MSEv_scores(explanation_list_mixed[order], method_line = "MC_ctree")
```

![](figure_regression/mixed-plot-2-1.png)

We also look at some of the Shapley value explanations and
see that many methods produce similar explanations.


```r
plot_SV_several_approaches(explanation_list_mixed[order], index_explicands = c(1, 2), facet_ncol = 1)
```

![](figure_regression/mixed-plot-3-1.png)

We can also focus on the Shapley value explanations for the five best
methods according to the $\operatorname{MSE}_v$ criterion, together
with the `ctree` method, the best-performing Monte Carlo-based method.


```r
best_methods <- get_k_best_methods(explanation_list_mixed, k = 5)
if (!"MC_ctree" %in% best_methods) best_methods <- c(best_methods, "MC_ctree")
plot_SV_several_approaches(explanation_list_mixed[best_methods], index_explicands = 1:4)
```

![](figure_regression/mixed-plot-4-1.png)


# Regression arguments as strings

In this section, we demonstrate that the `regression.model`,
`regression.tune_values`, and `regression.recipe_func`
parameters can be provided as strings. This is convenient when
the `explain()` function is called from Python, as the user then
only has to specify strings containing R code instead of creating
the R objects in Python. In the code chunk below, we see
that we obtain identical $\operatorname{MSE}_v$ scores for
the string and non-string versions.


```r
explanation_list_str <- list()
explanation_list_str$sep_lm <- explain(
  model = model,
  x_explain = x_explain,
  x_train = x_train,
  prediction_zero = p0,
  n_batches = 4,
  approach = "regression_separate",
  regression.model = "parsnip::linear_reg()"
)
#> Note: Feature classes extracted from the model contains NA.
#> Assuming feature classes from the data are correct.
+ +explanation_list_str$sep_pcr <- explain( + model = model, + x_explain = x_explain, + x_train = x_train, + prediction_zero = p0, + n_batches = 4, + approach = "regression_separate", + regression.model = "parsnip::linear_reg()", + regression.recipe_func = "function(regression_recipe) { + return(recipes::step_pca(regression_recipe, recipes::all_numeric_predictors(), num_comp = 2)) + }" +) +#> Note: Feature classes extracted from the model contains NA. +#> Assuming feature classes from the data are correct. + +explanation_list_str$sep_splines <- explain( + model = model, + x_explain = x_explain, + x_train = x_train, + prediction_zero = p0, + n_batches = 4, + approach = "regression_separate", + regression.model = parsnip::linear_reg(), + regression.recipe_func = "function(regression_recipe) { + return(recipes::step_ns(regression_recipe, recipes::all_numeric_predictors(), deg_free = 2)) + }" +) +#> Note: Feature classes extracted from the model contains NA. +#> Assuming feature classes from the data are correct. + +explanation_list_str$sep_tree_cv <- explain( + model = model, + x_explain = x_explain, + x_train = x_train, + prediction_zero = p0, + n_batches = 4, + approach = "regression_separate", + regression.model = "parsnip::decision_tree( + tree_depth = hardhat::tune(), engine = 'rpart', mode = 'regression' + )", + regression.tune_values = "dials::grid_regular(dials::tree_depth(), levels = 4)", + regression.vfold_cv_para = list(v = 5) +) +#> Note: Feature classes extracted from the model contains NA. +#> Assuming feature classes from the data are correct. + +# Using random forest with parameters tuned by cross-validation +explanation_list_str$sep_rf_cv <- explain( + model = model, + x_explain = x_explain, + x_train = x_train, + prediction_zero = p0, + n_batches = 1, # As we used this for the non-string version + approach = "regression_separate", + regression.model = "parsnip::rand_forest( + mtry = hardhat::tune(), trees = hardhat::tune(), engine = 'ranger', mode = 'regression' + )", + regression.tune_values = + "function(x) { + dials::grid_regular(dials::mtry(c(1, ncol(x))), dials::trees(c(50, 750)), levels = 3) + }", + regression.vfold_cv_para = list(v = 5) +) +#> Note: Feature classes extracted from the model contains NA. +#> Assuming feature classes from the data are correct. + +# Using random forest with parameters tuned by cross-validation as the surrogate model +explanation_list_str$sur_rf_cv <- explain( + model = model, + x_explain = x_explain, + x_train = x_train, + prediction_zero = p0, + n_batches = 4, + approach = "regression_surrogate", + regression.model = "parsnip::rand_forest( + mtry = hardhat::tune(), trees = hardhat::tune(), engine = 'ranger', mode = 'regression' + )", + regression.tune_values = "dials::grid_regular( + dials::mtry(c(1, ncol(x_explain))), + dials::trees(c(50, 750)), + levels = 6 + )", + regression.vfold_cv_para = list(v = 5) +) +#> Note: Feature classes extracted from the model contains NA. +#> Assuming feature classes from the data are correct. + +# See that the evaluation scores match the non-string versions. 
print_MSEv_scores_and_time(explanation_list_str)
#> MSEv Time
#> sep_lm 745.21 1.14
#> sep_pcr 784.91 1.19
#> sep_splines 165.13 1.15
#> sep_tree_cv 169.96 20.65
#> sep_rf_cv 212.88 39.29
#> sur_rf_cv 171.84 30.51
print_MSEv_scores_and_time(explanation_list[names(explanation_list_str)])
#> MSEv Time
#> sep_lm 745.21 0.77
#> sep_pcr 784.91 1.32
#> sep_splines 165.13 1.09
#> sep_tree_cv 169.96 17.31
#> sep_rf_cv 212.88 38.41
#> sur_rf_cv 171.84 30.55
```


# Vignette summary

This vignette demonstrates the rich possibilities that the regression
paradigm and the `tidymodels` framework add to the `shapr` package.
We have seen that the regression-based methods are on par with or
outperform the Monte Carlo-based methods regarding the
$\operatorname{MSE}_v$ evaluation criterion. Furthermore, we have seen
that the regression-based methods are relatively computationally fast
and that parallelization can be used to speed up the computations.

# References
diff --git a/vignettes/understanding_shapr_regression.Rmd.orig b/vignettes/understanding_shapr_regression.Rmd.orig
new file mode 100644
index 000000000..8db1271ee
--- /dev/null
+++ b/vignettes/understanding_shapr_regression.Rmd.orig
@@ -0,0 +1,1760 @@
+---
+title: "Shapley value explanations using the regression paradigm"
+author: "Lars Henry Berge Olsen"
+output:
+  rmarkdown::html_vignette:
+    toc: true
+bibliography: ../inst/REFERENCES.bib
+vignette: >
+  %\VignetteEncoding{UTF-8}
+  %\VignetteIndexEntry{Shapley value explanations using the regression paradigm}
+  %\VignetteEngine{knitr::rmarkdown}
+editor_options:
+  markdown:
+    wrap: 72
+  toc: true
+---
+
+```{r, include = FALSE}
+knitr::opts_chunk$set(
+  collapse = TRUE,
+  comment = "#>",
+  fig.cap = "",
+  fig.width = 7,
+  fig.height = 5,
+  fig.path = "figure_regression/", # Ensure that figures are saved in the right folder (build vignette manually)
+  cache.path = "cache_regression/", # Ensure that cached objects are saved in the right folder
+  warning = FALSE,
+  message = TRUE
+)
+```
+
+This vignette elaborates on and demonstrates the regression
+paradigm explained in @olsen2024comparative. We describe
+how to specify the regression model, how to enable automatic
+cross-validation of the model's hyperparameters, and how to apply
+pre-processing steps to the data before fitting the regression
+models. We refer to @olsen2024comparative for a discussion of when
+one should use the different paradigms, method classes, and methods.
+
+@olsen2024comparative divides the regression paradigm into
+the separate and surrogate regression method classes. In this
+vignette, we briefly introduce the two method classes. For an
+in-depth explanation, we refer the reader to Sections 3.5 and
+3.6 in @olsen2024comparative.
+
+Briefly stated, the regression paradigm uses regression
+models to directly estimate the contribution function
+$v(S) = E[f(\boldsymbol{x})|\boldsymbol{x}_S = \boldsymbol{x}_S^*]$.
+The separate regression method class fits a separate regression
+model for each coalition $S$, while the surrogate regression
+method class fits a single regression model to simultaneously
+predict the contribution function for all coalitions.
+
+The `shapr` package supports any regression model from the
+popular `tidymodels` package developed by @tidymodels. The
+[`tidymodels`](https://www.tidymodels.org/) framework is a
+collection of packages for modeling and machine learning
+using [`tidyverse`](https://www.tidyverse.org/) principles.
+Some packages included in the `tidymodels` framework are
+`parsnip`, `recipes`, `workflows`, `tune`, and `rsample`;
+see the [setup](#setup) section below for more examples.
+Furthermore, click [here](https://www.tidymodels.org/find/parsnip/)
+to access the complete list of supported regression models
+in the `tidymodels` package. There are currently 80 supported
+models, but we can apply a wide range of data pre-processing
+steps to increase this number or add regression models not
+already implemented in `tidymodels`. In the former setting,
+we can either apply the linear regression model directly to
+the data or pre-process the data to compute principal components
+(principal component regression), which we do in the
+[pre-process](#separate_preproc) section. For the latter setting,
+we demonstrate how to incorporate the projection pursuit regression
+model into the `tidymodels` framework in the
+[add new regression methods](#new) section.
+
+Note that our framework does not currently support model
+formulas with special terms. For example, we do not support
+`parsnip::gen_additive_mod` (i.e., `mgcv::gam()`) as it uses
+a non-standard notation in its formulas (in this case, the
+`s(feature, k = 2)` function). See `?parsnip::model_formula()`
+for more information. However, this hurdle is overcome by
+data pre-processing steps containing spline functions, which
+we apply in the [pre-process](#separate_preproc) section for the
+separate regression method class.
+
+In the [mixed data](#mixed) section, we demonstrate that the
+regression-based methods work on mixed data, too. However, for
+regression models that do not natively support categorical data,
+we must add a pre-processing step that encodes the categorical
+features.
+
+Note that we use the same data and predictive models in this
+vignette as in the main vignette.
+
+See the end of the [continuous data](#summary) and
+[mixed data](#summary_mixed) sections for summary figures of all the
+methods used in this vignette to compute the Shapley value explanations.
+
+
+# The separate regression method class {#separate}
+
+In the `regression_separate` methods, we train a new regression
+model $g_S(\boldsymbol{x}_S)$ to estimate the conditional expectation
+for each coalition of features.
+
+The idea is to estimate
+$v(S) = E[f(\boldsymbol{x})|\boldsymbol{x}_S = \boldsymbol{x}_S^*] = E[f(\boldsymbol{x}_{\bar{S}},\boldsymbol{x}_S)|\boldsymbol{x}_S=\boldsymbol{x}_S^*]$
+separately for each coalition $S$ using regression. Let
+$\mathcal{D} = \{ \boldsymbol{x}^{[i]}, y^{[i]} \}_{i=1}^{N_{\text{train}}}$
+denote the training data, where $\boldsymbol{x}^{[i]}$ is the $i$th
+$M$-dimensional input and $y^{[i]}$ is the associated response.
+For each coalition $S \subseteq \{1,2,\dots,M\}$, the corresponding
+training data set is
+\begin{align*}
+  \mathcal{D}_S
+  =
+  \{\boldsymbol{x}_S^{[i]}, f(\underbrace{\boldsymbol{x}_{\bar{S}}^{[i]}, \boldsymbol{x}_S^{[i]}}_{\boldsymbol{x}^{[i]}})\}_{i=1}^{N_{\text{train}}}
+  =
+  \{\boldsymbol{x}_S^{[i]}, \underbrace{f(\boldsymbol{x}^{[i]})}_{z^{[i]}}\}_{i=1}^{N_{\text{train}}}
+  =
+  \{\boldsymbol{x}_S^{[i]}, z^{[i]}\}_{i=1}^{N_{\text{train}}}.
+\end{align*}
+
+For each data set $\mathcal{D}_S$, we train a regression model
+$g_S(\boldsymbol{x}_S)$ with respect to the mean squared error
+loss function. That is, we fit a regression model where the
+prediction $f(\boldsymbol{x})$ acts as the response and the
+feature subset of coalition $S$, $\boldsymbol{x}_S$, acts as
+the available features.
+The optimal model, with respect to this loss function, is
+$g^*_S(\boldsymbol{x}_S) = E[z|\boldsymbol{x}_S] = E[f(\boldsymbol{x}_{\bar{S}}, \boldsymbol{x}_S)|\boldsymbol{x}_S]$,
+which corresponds to the contribution function $v(S)$. The
+regression model $g_S$ aims for this optimal model; hence, it
+estimates the contribution function, i.e.,
+$g_S(\boldsymbol{x}_S) = \hat{v}(S) \approx v(S) = E[f(\boldsymbol{x}_{\bar{S}}, \boldsymbol{x}_S) | \boldsymbol{x}_S = \boldsymbol{x}_S^*]$.
+
+
+## Code {#separate_code}
+
+In this supplementary vignette, we use the same data and explain
+the same model type as in the main vignette. We train a simple
+`xgboost` model on the `airquality` dataset and demonstrate how
+to use `shapr` and the separate regression method class to
+explain the individual predictions.
+
+
+### Setup {#setup}
+
+First, we set up the `airquality` dataset and train an `xgboost`
+model, whose predictions we want to explain using the Shapley value
+explanation framework. We import all packages in the `tidymodels`
+framework in the code chunk below, but we could also have specified
+them individually. In this vignette, we use the following packages
+from the `tidymodels` framework: `parsnip`, `recipes`, `workflows`,
+`dials`, `hardhat`, `tibble`, `rlang`, and `ggplot2`. We use the
+`package::function()` notation throughout this vignette to indicate
+which `tidymodels` package each function originates from.
+
+```{r setup, message = FALSE, cache = TRUE}
+# Either use `library(tidymodels)` or separately specify the libraries indicated above
+library(tidymodels)
+
+# Other libraries
+library(xgboost)
+library(data.table)
+library(shapr)
+
+# Ensure that shapr's functions are prioritized; otherwise, we need to use the `shapr::`
+# prefix when calling explain(). The `conflicted` package is imported by `tidymodels`.
+conflicted::conflicts_prefer(shapr::explain, shapr::prepare_data)
+
+data("airquality")
+data <- data.table::as.data.table(airquality)
+data <- data[complete.cases(data), ]
+
+x_var <- c("Solar.R", "Wind", "Temp", "Month")
+y_var <- "Ozone"
+
+ind_x_explain <- 1:20
+x_train <- data[-ind_x_explain, ..x_var]
+y_train <- data[-ind_x_explain, get(y_var)]
+x_explain <- data[ind_x_explain, ..x_var]
+
+# Fitting a basic xgboost model to the training data
+set.seed(123) # Set seed for reproducibility
+model <- xgboost::xgboost(
+  data = as.matrix(x_train),
+  label = y_train,
+  nround = 20,
+  verbose = FALSE
+)
+
+# Specifying the phi_0, i.e., the expected prediction without any features
+p0 <- mean(y_train)
+
+# List to store all the explanation objects
+explanation_list <- list()
+```
+
+To make the rest of the vignette easier to follow, we create some helper
+functions that plot and summarize the results of the explanation methods.
+This code block is optional to understand and can be skipped.
+
+```{r setup-help, cache = TRUE}
+# Plot the MSEv criterion scores as horizontal bars and add a dashed line at one method's score
+plot_MSEv_scores <- function(explanation_list, method_line = NULL) {
+  fig <- plot_MSEv_eval_crit(explanation_list) +
+    ggplot2::theme(legend.position = "none") +
+    ggplot2::coord_flip() +
+    ggplot2::theme(plot.title = ggplot2::element_text(size = ggplot2::rel(0.95)))
+  fig <- fig + ggplot2::scale_x_discrete(limits = rev(levels(fig$data$Method)))
+  if (!is.null(method_line) && method_line %in% fig$data$Method) {
+    fig <- fig + ggplot2::geom_hline(
+      yintercept = fig$data$MSEv[fig$data$Method == method_line],
+      linetype = "dashed",
+      color = "black"
+    )
+  }
+  return(fig)
+}
+
+# Extract the MSEv criterion scores and elapsed times
+print_MSEv_scores_and_time <- function(explanation_list) {
+  res <- as.data.frame(t(sapply(
+    explanation_list,
+    function(explanation) {
+      round(c(explanation$MSEv$MSEv$MSEv, explanation$timing$total_time_secs), 2)
+    }
+  )))
+  colnames(res) <- c("MSEv", "Time")
+  return(res)
+}
+
+# Extract the k best methods, ranked from best to worst
+get_k_best_methods <- function(explanation_list, k) {
+  res <- print_MSEv_scores_and_time(explanation_list)
+  return(rownames(res)[order(res$MSEv)[seq(k)]])
+}
+```
+
+To establish a baseline for the regression methods, we compare them
+with the Monte Carlo-based `empirical` approach with default
+hyperparameters. In the last section, we include all Monte
+Carlo-based methods implemented in `shapr` to make an extensive
+comparison.
+
+```{r empirical, cache=TRUE}
+# Compute the Shapley value explanations using the empirical method
+explanation_list$MC_empirical <- explain(
+  model = model,
+  x_explain = x_explain,
+  x_train = x_train,
+  approach = "empirical",
+  prediction_zero = p0,
+  n_batches = 4
+)
+```
+
+
+### Linear regression model
+Then we compute the Shapley value explanations using a linear
+regression model and the separate regression method class.
+
+```{r explain-sep-lm, cache=TRUE}
+explanation_list$sep_lm <- explain(
+  model = model,
+  x_explain = x_explain,
+  x_train = x_train,
+  prediction_zero = p0,
+  n_batches = 4,
+  approach = "regression_separate",
+  regression.model = parsnip::linear_reg()
+)
+```
+
+A linear model is often not flexible enough to properly model the
+contribution function. Thus, it can produce inaccurate Shapley value
+explanations. The figure below shows that the `empirical` approach
+outperforms the linear regression model approach quite significantly
+concerning the $\operatorname{MSE}_v$ evaluation criterion.
+
+```{r lm-emp-msev, cache=TRUE}
+plot_MSEv_scores(explanation_list)
+```
+
+
+### Pre-processing {#separate_preproc}
+
+This section describes how to pre-process the data before
+fitting the separate regression models. We demonstrate this
+for the linear regression model, but we can apply this
+pre-processing to other regression methods.
+
+The `recipes` package in the `tidymodels` framework contains
+many functions to pre-process the data before fitting the model,
+for example, normalization, interactions, encodings, and
+transformations (e.g., log, splines, PLS, PCA). Click
+[here](https://recipes.tidymodels.org/reference/index.html)
+to access a complete list of all available functions.
The list +also contains functions for helping us select which features +to apply the functions to, e.g., `recipes::all_predictors()`, +`recipes::all_numeric_predictors()`, and `recipes::all_factor_predictors()` +apply the functions to all features, only the numerical features, +and only the factor features, respectively. We can also specify +the names of the features to which the functions are applied. +However, as the included features change in each coalition, +we need to check that the feature we want to apply the function +to is present in the dataset. We give an example of this below. + +First, we demonstrate how to compute the principal components +and use (up to) the first two components for each separate +linear regression model. We write "up to" as we can only compute +a single principal component for the singleton coalitions, i.e., +the feature itself. This regression model is called principal +component regression. + +```{r pcr, cache=TRUE} +explanation_list$sep_pcr <- explain( + model = model, + x_explain = x_explain, + x_train = x_train, + prediction_zero = p0, + n_batches = 4, + approach = "regression_separate", + regression.model = parsnip::linear_reg(), + regression.recipe_func = function(regression_recipe) { + return(recipes::step_pca(regression_recipe, recipes::all_numeric_predictors(), num_comp = 2)) + } +) +``` + +Second, we apply a pre-processing step that computes the basis +expansions of the features using natural splines with two degrees +of freedom. This is similar to fitting a generalized additive model. + +```{r natural-splines, cache=TRUE} +explanation_list$sep_splines <- explain( + model = model, + x_explain = x_explain, + x_train = x_train, + prediction_zero = p0, + n_batches = 4, + approach = "regression_separate", + regression.model = parsnip::linear_reg(), + regression.recipe_func = function(regression_recipe) { + return(recipes::step_ns(regression_recipe, recipes::all_numeric_predictors(), deg_free = 2)) + } +) +``` + +Finally, we provide an example where we include interactions +between the features `Solar.R` and `Wind`, log-transform `Solar.R`, +convert `Wind` to be between 0 and 1 and then take the square root, +include polynomials of the third degree for `Temp`, and apply the +Box-Cox transformation to `Month`. These transformations are only +applied when the features are present for the different separate models. + +Furthermore, we stress that the purpose of this example is to highlight +the framework's flexibility, NOT that the transformations below are reasonable. 
+ +```{r recipe-func, cache=TRUE} +# Example function of how to apply step functions from the recipes package to specific features +regression.recipe_func <- function(recipe) { + # Get the names of the present features + feature_names <- recipe$var_info$variable[recipe$var_info$role == "predictor"] + + # If Solar.R and Wind is present, then we add the interaction between them + if (all(c("Solar.R", "Wind") %in% feature_names)) { + recipe <- recipes::step_interact(recipe, terms = ~ Solar.R:Wind) + } + + # If Solar.R is present, then log transform it + if ("Solar.R" %in% feature_names) recipe <- recipes::step_log(recipe, Solar.R) + + # If Wind is present, then scale it to be between 0 and 1 and then sqrt transform it + if ("Wind" %in% feature_names) recipe <- recipes::step_sqrt(recipes::step_range(recipe, Wind)) + + # If Temp is present, then expand it using orthogonal polynomials of degree 3 + if ("Temp" %in% feature_names) recipe <- recipes::step_poly(recipe, Temp, degree = 3) + + # If Month is present, then Box-Cox transform it + if ("Month" %in% feature_names) recipe <- recipes::step_BoxCox(recipe, Month) + + # Finally we normalize all features (not needed as LM does this internally) + recipe <- recipes::step_normalize(recipe, recipes::all_numeric_predictors()) + + return(recipe) +} + +# Compute the Shapley values using the pre-processing steps defined above +explanation_list$sep_reicpe_example <- explain( + model = model, + x_explain = x_explain, + x_train = x_train, + prediction_zero = p0, + n_batches = 4, + approach = "regression_separate", + regression.model = parsnip::linear_reg(), + regression.recipe_func = regression.recipe_func +) +``` + +We can examine the $\operatorname{MSE}_v$ evaluation scores, and we +see that the method using natural splines significantly outperforms +the other methods. + +```{r preproc-plot, cache=TRUE} +# Compare the MSEv criterion of the different explanation methods +plot_MSEv_scores(explanation_list, method_line = "MC_empirical") + +# Print the MSEv scores and the elapsed time (in seconds) for the different methods +print_MSEv_scores_and_time(explanation_list) +``` + + +### Other regression models + +In the following example, we use a decision tree +model instead of the simple linear regression model. + +The `tidymodels` framework supports several implementations +of the decision tree model. We use `set_engine("rpart")` +to specify that we want to use the implementation in the +`rpart` package, and we use `set_mode("regression")` to +specify that we are doing regression. The `tidymodels` +framework uses the default hyperparameter values set in +`rpart` when we do not specify them. By searching for +"decision tree" in the [list of tidymodels](https://www.tidymodels.org/find/parsnip/), +we see that the default hyperparameter values for the +[`decision_tree_rpart`](https://parsnip.tidymodels.org//reference/details_decision_tree_rpart.html) +model are `tree_depth = 30`, `min_n = 2`, and `cost_complexity = 0.01`. 
+
+```{r decision-tree, cache=TRUE}
+# Decision tree with specified parameters (stumps)
+explanation_list$sep_tree_stump <- explain(
+  model = model,
+  x_explain = x_explain,
+  x_train = x_train,
+  prediction_zero = p0,
+  n_batches = 4,
+  approach = "regression_separate",
+  regression.model = parsnip::decision_tree(
+    tree_depth = 1,
+    min_n = 2,
+    cost_complexity = 0.01,
+    engine = "rpart",
+    mode = "regression"
+  )
+)
+
+# Decision tree with default parameters
+explanation_list$sep_tree_default <- explain(
+  model = model,
+  x_explain = x_explain,
+  x_train = x_train,
+  prediction_zero = p0,
+  n_batches = 4,
+  approach = "regression_separate",
+  regression.model = parsnip::decision_tree(engine = "rpart", mode = "regression")
+)
+```
+
+We can also set `regression.model = parsnip::decision_tree(tree_depth = 1, min_n = 2, cost_complexity = 0.01) %>% parsnip::set_engine("rpart") %>% parsnip::set_mode("regression")`
+if we want to use the pipe function (`%>%`).
+
+We can now compare the two new methods. The decision tree with
+default parameters outperforms the linear model approach concerning
+the $\operatorname{MSE}_v$ criterion and is on the same level as
+the empirical approach. We obtain a worse method by using stumps,
+i.e., trees of depth one.
+
+```{r decision-tree-plot, cache=TRUE}
+# Compare the MSEv criterion of the different explanation methods
+plot_MSEv_scores(explanation_list, method_line = "MC_empirical")
+# Print the MSEv scores and the elapsed time (in seconds) for the different methods
+print_MSEv_scores_and_time(explanation_list)
+```
+
+
+### Cross-validation {#separate_cv}
+
+Another option is to use cross-validation to tune the hyperparameters.
+To do this, we need to specify three things:
+
+1. In `regression.model`, we need to specify which parameters to tune
+in the model. We do this by setting the parameter equal to `hardhat::tune()`.
+For example, if we want to tune the `tree_depth` parameter in the
+`parsnip::decision_tree` model while using the default values for the other
+parameters, we set `parsnip::decision_tree(tree_depth = hardhat::tune())`.
+2. In `regression.tune_values`, we must provide either a data.frame (can also
+be a data.table or tibble) containing the possible hyperparameter values or a
+function that takes in the training data for each combination/coalition and
+outputs a data.frame containing the possible hyperparameter values. The latter
+allows us to use different hyperparameter values for different coalition sizes,
+which is essential if a hyperparameter's domain changes with the coalition size.
+See the example below, where we want to tune the `mtry` parameter in
+`ranger` (random forest). The column names of `regression.tune_values` (or the
+output if it is a function) must match the tuneable hyperparameters specified
+in `regression.model`. For the example above, `regression.tune_values` must be
+a one-column data.frame with the column name `tree_depth`. We can either
+manually specify the hyperparameter values or use the `dials` package, e.g.,
+`dials::grid_regular(dials::tree_depth(), levels = 5)`. Alternatively, it can
+be a function that outputs a data.frame of the same form.
+3. Specifying the `regression.vfold_cv_para` parameter is optional. If used,
+then `regression.vfold_cv_para` must be a list specifying the parameters to
+send to the cross-validation function `rsample::vfold_cv()`. Use `?rsample::vfold_cv`
+to see the default parameters.
+The names of the objects in the `regression.vfold_cv_para`
+list must match the parameter names in `rsample::vfold_cv()`. For example, if
+we want 5-fold cross-validation, we set `regression.vfold_cv_para = list(v = 5)`.
+
+First, let us look at some ways to specify `regression.tune_values`.
+Note that `dials` has several other grid functions, e.g., `dials::grid_random()`
+and `dials::grid_latin_hypercube()`.
+
+```{r echo=TRUE, results='hide'}
+# Possible ways to define the `regression.tune_values` object.
+# function(x) dials::grid_regular(dials::tree_depth(), levels = 4)
+dials::grid_regular(dials::tree_depth(), levels = 4)
+data.table(tree_depth = c(1, 5, 10, 15)) # Can also use data.frame or tibble
+
+# For several hyperparameters
+# function(x) dials::grid_regular(dials::tree_depth(), dials::cost_complexity(), levels = 3)
+dials::grid_regular(dials::tree_depth(), dials::cost_complexity(), levels = 3)
+expand.grid(tree_depth = c(1, 3, 5), cost_complexity = c(0.001, 0.05, 0.01))
+```
+
+We will now demonstrate how to use cross-validation to fine-tune
+the separate decision tree regression method. In the following
+examples, we consider two versions. In the first example, we use
+cross-validation to tune the `tree_depth` parameter using the
+`dials::grid_regular()` function. In the second example, we tune
+both the `tree_depth` and `cost_complexity` parameters, but we
+manually specify the possible hyperparameter values this time.
+
+```{r dt-cv, cache=TRUE}
+# Decision tree with cross-validated depth (default values for the other parameters)
+explanation_list$sep_tree_cv <- explain(
+  model = model,
+  x_explain = x_explain,
+  x_train = x_train,
+  prediction_zero = p0,
+  n_batches = 4,
+  approach = "regression_separate",
+  regression.model = parsnip::decision_tree(
+    tree_depth = hardhat::tune(), engine = "rpart", mode = "regression"
+  ),
+  regression.tune_values = dials::grid_regular(dials::tree_depth(), levels = 4),
+  regression.vfold_cv_para = list(v = 5)
+)
+
+# Use trees with cross-validation on the depth and cost complexity. Manually set the values.
+explanation_list$sep_tree_cv_2 <- explain(
+  model = model,
+  x_explain = x_explain,
+  x_train = x_train,
+  prediction_zero = p0,
+  n_batches = 4,
+  approach = "regression_separate",
+  regression.model = parsnip::decision_tree(
+    tree_depth = hardhat::tune(),
+    cost_complexity = hardhat::tune(),
+    engine = "rpart",
+    mode = "regression"
+  ),
+  regression.tune_values =
+    expand.grid(tree_depth = c(1, 3, 5), cost_complexity = c(0.001, 0.01, 0.1)),
+  regression.vfold_cv_para = list(v = 5)
+)
+```
+
+We also include one example with a random forest model where
+the tunable hyperparameter `mtry` depends on the coalition size.
+Thus, `regression.tune_values` must be a function that returns
+a data.frame where the hyperparameter values for `mtry` change
+based on the coalition size. If we do not let `regression.tune_values`
+be a function, then `tidymodels` will throw an error for any `mtry`
+higher than 1. Furthermore, by setting `verbose = 2`, we receive
+messages about which batch and coalition/combination `shapr`
+processes, as well as the results of the cross-validation procedure.
+Note that the tested hyperparameter value combinations change based
+on the coalition size.
+
+```{r rf-cv, cache=TRUE}
+# Using random forest with default parameters
+explanation_list$sep_rf <- explain(
+  model = model,
+  x_explain = x_explain,
+  x_train = x_train,
+  prediction_zero = p0,
+  n_batches = 4,
+  approach = "regression_separate",
+  regression.model = parsnip::rand_forest(engine = "ranger", mode = "regression")
+)
+
+# Using random forest with parameters tuned by cross-validation
+explanation_list$sep_rf_cv <- explain(
+  model = model,
+  x_explain = x_explain,
+  x_train = x_train,
+  prediction_zero = p0,
+  n_batches = 1, # One batch to get printouts in chronological order
+  verbose = 2, # To get printouts
+  approach = "regression_separate",
+  regression.model = parsnip::rand_forest(
+    mtry = hardhat::tune(), trees = hardhat::tune(), engine = "ranger", mode = "regression"
+  ),
+  regression.tune_values =
+    function(x) {
+      dials::grid_regular(dials::mtry(c(1, ncol(x))), dials::trees(c(50, 750)), levels = 3)
+    },
+  regression.vfold_cv_para = list(v = 5)
+)
+```
+
+We can look at the $\operatorname{MSE}_v$ evaluation criterion,
+and we see that cross-validation improves both the decision tree
+and the random forest methods. The two cross-validated decision
+tree methods are comparable, but the second version outperforms
+the first version by a small margin. This comparison is somewhat
+unfair to the `empirical` approach, which also has hyperparameters
+we could potentially tune. However, `shapr` does not currently
+provide a function to do this automatically. In the figure below,
+we include a vertical line at the $\operatorname{MSE}_v$ score of
+the `empirical` method for easier comparison.
+
+```{r dt-cv-plot, cache=TRUE}
+plot_MSEv_scores(explanation_list, method_line = "MC_empirical")
+```
+
+Furthermore, we must consider that cross-validation drastically
+increases the elapsed time (in seconds) and determine whether the
+increased precision is worth the extra computational time.
+We also see that the complex random forest method performs
+significantly worse than the simple decision tree method.
+This result indicates that even though we do hyperparameter
+tuning, we still overfit the data.
+
+```{r dt-cv-print, cache=TRUE}
+# Print the MSEv scores and the elapsed time (in seconds) for the different methods
+print_MSEv_scores_and_time(explanation_list)
+```
+
+
+### Parallelization {#separate_parallelization}
+
+The `future` package can train the separate regression models
+in parallel. More specifically, we parallelize both the
+training step (when we fit the models) and the prediction
+step (when we compute $v(S)$). In the main vignette, we also
+explain how to enable progress bars.
+
+In the code chunk below, we consider four regression-based
+methods. The first method uses `xgboost` models with default
+hyperparameter values, while the remaining three use
+cross-validation to tune the number of trees. The second and
+third methods specify the same potential hyperparameter values,
+but we run the former sequentially and the latter in parallel
+to speed up the computations. The fourth method is run in
+parallel but also tunes the depth of the trees and not only
+the number of trees.
+
+A small side note: If we set `verbose = 2`, we can see which
+`trees` value `shapr` chooses for each coalition. We would then
+see that the values 25, 50, 100, and 500 are never chosen.
+Thus, we can remove these values without influencing the result
+and instead do a finer grid search among the lower values.
+We do this in the fourth method.
+
+```{r xgboost, cache=TRUE}
+# Regular xgboost with default parameters
+explanation_list$sep_xgboost <- explain(
+  model = model,
+  x_explain = x_explain,
+  x_train = x_train,
+  prediction_zero = p0,
+  n_batches = 4,
+  approach = "regression_separate",
+  regression.model = parsnip::boost_tree(engine = "xgboost", mode = "regression")
+)
+
+# Cross validate the number of trees
+explanation_list$sep_xgboost_cv <- explain(
+  model = model,
+  x_explain = x_explain,
+  x_train = x_train,
+  prediction_zero = p0,
+  n_batches = 4,
+  approach = "regression_separate",
+  regression.model =
+    parsnip::boost_tree(trees = hardhat::tune(), engine = "xgboost", mode = "regression"),
+  regression.tune_values = expand.grid(trees = c(10, 15, 25, 50, 100, 500)),
+  regression.vfold_cv_para = list(v = 5)
+)
+
+# Cross validate the number of trees in parallel on two threads
+future::plan(future::multisession, workers = 2)
+explanation_list$sep_xgboost_cv_par <- explain(
+  model = model,
+  x_explain = x_explain,
+  x_train = x_train,
+  prediction_zero = p0,
+  n_batches = 4,
+  approach = "regression_separate",
+  regression.model =
+    parsnip::boost_tree(trees = hardhat::tune(), engine = "xgboost", mode = "regression"),
+  regression.tune_values = expand.grid(trees = c(10, 15, 25, 50, 100, 500)),
+  regression.vfold_cv_para = list(v = 5)
+)
+
+# Use a finer grid of low values for `trees` and also tune `tree_depth`
+future::plan(future::multisession, workers = 4) # Change to 4 threads due to more complex CV
+explanation_list$sep_xgboost_cv_2_par <- explain(
+  model = model,
+  x_explain = x_explain,
+  x_train = x_train,
+  prediction_zero = p0,
+  n_batches = 4,
+  approach = "regression_separate",
+  regression.model = parsnip::boost_tree(
+    trees = hardhat::tune(),
+    tree_depth = hardhat::tune(),
+    engine = "xgboost",
+    mode = "regression"
+  ),
+  regression.tune_values = expand.grid(trees = c(8, 10, 12, 15), tree_depth = c(4, 6, 8)),
+  regression.vfold_cv_para = list(v = 5)
+)
+future::plan(future::sequential) # To return to non-parallel computation
+```
+
+Looking at the elapsed time, we see that the parallel version
+with two workers is faster than the sequential version. Note
+that the elapsed time of the parallel version is not reduced
+by a factor of two, as the creation of the parallel processes
+introduces some overhead, which is significant in this small
+example. However, parallelization will yield considerable
+relative time improvements in more complex situations, for
+example, in settings with more training observations, more
+features (i.e., more coalitions to compute), and more
+time-consuming cross-validation (i.e., more folds, more
+hyperparameters to tune, or more hyperparameter values to
+consider). Furthermore, we see that conducting the
+cross-validation has lowered the $\operatorname{MSE}_v$ criterion
+drastically. Finally, note that we obtain the same value whether
+we run the cross-validation in parallel or sequentially.
+
+```{r xgboost-print, cache=TRUE}
+# Print the MSEv scores and the elapsed time (in seconds) for the different methods
+print_MSEv_scores_and_time(explanation_list)
+```
+
+
+# The surrogate regression method class {#surrogate}
+
+Since the `regression_separate` methods train a new
+regression model $g_S(\boldsymbol{x}_S)$ for each coalition
+$S \subseteq \{1,2,\dots,M\}$, a total of $2^M-2$
+models have to be trained, which can be time-consuming
+for slowly fitted models. The minus two corresponds to
+the empty and grand coalitions.
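+
+To get a feeling for this computational cost, the short code chunk
+below (a standalone sketch that is not needed for the analyses in
+this vignette) tabulates the number of separate regression models to
+train as a function of the number of features $M$.
+
+```{r n-separate-models, cache=TRUE}
+# Number of separate regression models (excluding the empty and grand
+# coalitions) to train as a function of the number of features M
+M <- 2:10
+data.frame(M = M, n_models = 2^M - 2)
+```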
+
+The `regression_surrogate` method class builds on the
+ideas from the `regression_separate` class, but instead of
+fitting a new regression model for each coalition, we
+train a single regression model $g(\tilde{\boldsymbol{x}}_S)$
+for all coalitions $S \subseteq \{1,2,\dots,M\}$ (except the
+empty and grand coalitions), where $\tilde{\boldsymbol{x}}_S$
+is an augmented version of $\boldsymbol{x}_S$. See Section 3.6.1
+in @olsen2024comparative for more details and examples.
+
+We can also apply all the examples above for the separate
+regression method class to the surrogate regression method class.
+
+
+## Code {#surrogate_code}
+
+We demonstrate the surrogate method class using several
+regression models below. More specifically, we use linear
+regression, random forest, and `xgboost`, where we fit the
+latter two models both with default hyperparameters and with
+some of their hyperparameters tuned by cross-validation.
+
+```{r surrogate, cache=TRUE}
+# Compute the Shapley value explanations using a surrogate linear regression model
+explanation_list$sur_lm <- explain(
+  model = model,
+  x_explain = x_explain,
+  x_train = x_train,
+  prediction_zero = p0,
+  n_batches = 4,
+  approach = "regression_surrogate",
+  regression.model = parsnip::linear_reg()
+)
+
+# Using xgboost with default parameters as the surrogate model
+explanation_list$sur_xgboost <- explain(
+  model = model,
+  x_explain = x_explain,
+  x_train = x_train,
+  prediction_zero = p0,
+  n_batches = 4,
+  approach = "regression_surrogate",
+  regression.model = parsnip::boost_tree(engine = "xgboost", mode = "regression")
+)
+
+# Using xgboost with parameters tuned by cross-validation as the surrogate model
+explanation_list$sur_xgboost_cv <- explain(
+  model = model,
+  x_explain = x_explain,
+  x_train = x_train,
+  prediction_zero = p0,
+  n_batches = 4,
+  approach = "regression_surrogate",
+  regression.model = parsnip::boost_tree(
+    trees = hardhat::tune(),
+    tree_depth = hardhat::tune(),
+    engine = "xgboost",
+    mode = "regression"
+  ),
+  regression.tune_values = expand.grid(trees = c(5, 15, 25), tree_depth = c(2, 6, 10)),
+  regression.vfold_cv_para = list(v = 5)
+)
+
+# Using random forest with default parameters as the surrogate model
+explanation_list$sur_rf <- explain(
+  model = model,
+  x_explain = x_explain,
+  x_train = x_train,
+  prediction_zero = p0,
+  n_batches = 4,
+  approach = "regression_surrogate",
+  regression.model = parsnip::rand_forest(engine = "ranger", mode = "regression")
+)
+
+# Using random forest with parameters tuned by cross-validation as the surrogate model
+explanation_list$sur_rf_cv <- explain(
+  model = model,
+  x_explain = x_explain,
+  x_train = x_train,
+  prediction_zero = p0,
+  n_batches = 4,
+  approach = "regression_surrogate",
+  regression.model = parsnip::rand_forest(
+    mtry = hardhat::tune(), trees = hardhat::tune(), engine = "ranger", mode = "regression"
+  ),
+  regression.tune_values = dials::grid_regular(
+    dials::mtry(c(1, ncol(x_explain))),
+    dials::trees(c(50, 750)),
+    levels = 6
+  ),
+  regression.vfold_cv_para = list(v = 5)
+)
+```
+
+
+### Parallelization {#surrogate_parallelization}
+
+The code chunk below demonstrates how to run the surrogate
+regression method class in parallel using the `future` package.
+The setup procedure is identical to the one we specified for
+the [separate regression method class](#separate_parallelization).
+The training step of the surrogate regression model can be run
+in parallel if we tune some of its hyperparameters.
+We parallelize the cross-validation procedure in the training step;
+hence, we apply no parallelization in the training step of a
+surrogate model with fixed (i.e., not tuned) hyperparameters.
+Furthermore, we parallelize the prediction step (when we compute
+$v(S)$) in the same way as for the separate regression method class.
+Note that parallelization introduces some overhead, which can cause
+it to be slower than running the code sequentially for smaller
+problems.
+
+```{r surrogate-cv-par, cache=TRUE}
+# Cross validate the number of trees in parallel on four threads
+future::plan(future::multisession, workers = 4)
+explanation_list$sur_rf_cv_par <- explain(
+  model = model,
+  x_explain = x_explain,
+  x_train = x_train,
+  prediction_zero = p0,
+  n_batches = 4,
+  approach = "regression_surrogate",
+  regression.model = parsnip::rand_forest(
+    mtry = hardhat::tune(), trees = hardhat::tune(), engine = "ranger", mode = "regression"
+  ),
+  regression.tune_values = dials::grid_regular(
+    dials::mtry(c(1, ncol(x_explain))),
+    dials::trees(c(50, 750)),
+    levels = 6
+  ),
+  regression.vfold_cv_para = list(v = 5)
+)
+future::plan(future::sequential) # To return to non-parallel computation
+
+# Check that we get identical Shapley value explanations
+all.equal(
+  explanation_list$sur_rf_cv$shapley_values,
+  explanation_list$sur_rf_cv_par$shapley_values
+)
+```
+
+By looking at the $\operatorname{MSE}_v$ evaluation criterion
+and the elapsed time, we see that the surrogate methods
+(except the linear regression model) are comparable to or
+outperform `empirical`, but they are not on the same level as
+the best separate regression methods. Furthermore, parallelization
+(4 cores) decreased the elapsed time while obtaining the same
+$\operatorname{MSE}_v$ score. The identical scores mean that the
+fitted surrogate models are identical, regardless of whether they
+were trained sequentially or in parallel.
+
+```{r surrogate-plot, cache=TRUE}
+# Print the MSEv scores and the elapsed time (in seconds) for the different methods
+print_MSEv_scores_and_time(explanation_list)
+
+# Compare the MSEv criterion of the different explanation methods.
+# Include vertical line corresponding to the MSEv of the empirical method.
+plot_MSEv_scores(explanation_list, method_line = "MC_empirical")
+```
+
+
+# Add new regression methods {#new}
+
+Even though the `tidymodels` framework contains many
+[models](https://www.tidymodels.org/find/parsnip/), we might
+want to add additional methods. In the following section, we
+demonstrate how to add the projection pursuit regression (PPR)
+model as a new method that `shapr` can use to compute the Shapley
+value explanations, both as a separate and a surrogate method.
+
+We use the `ppr()` implementation in the `stats` package to fit
+the PPR model. The model has several hyperparameters that can be
+tuned, but the main hyperparameter is the number of terms `nterms`.
+The following is based on the [`tidymodels` guide](https://www.tidymodels.org/learn/develop/models/)
+on adding new regression models. We refer to that guide for more
+details and explanations of the code below.
+ +```{r ppr-setup, cache=TRUE} +# Step 1: register the model, modes, and arguments +parsnip::set_new_model(model = "ppr_reg") +parsnip::set_model_mode(model = "ppr_reg", mode = "regression") +parsnip::set_model_engine(model = "ppr_reg", mode = "regression", eng = "ppr") +parsnip::set_dependency("ppr_reg", eng = "ppr", pkg = "stats") + +# If your function has several parameters, then we add one of these functions for each parameter +parsnip::set_model_arg( + model = "ppr_reg", + eng = "ppr", + original = "nterms", # The original parameter name used in stats::ppr + parsnip = "num_terms", # Change parameter name to match tidymodels' name convention + func = list(pkg = "dials", fun = "num_terms"), # list(pkg = "stats", fun = "ppr"), + has_submodel = FALSE +) + +# Step 2: create the model function +ppr_reg <- function(mode = "regression", engine = "ppr", num_terms = NULL) { + # Check for correct mode + if (mode != "regression") rlang::abort("`mode` should be 'regression'") + + # Check for correct engine + if (engine != "ppr") rlang::abort("`engine` should be 'ppr'") + + # Capture the arguments in quosures + args <- list(num_terms = rlang::enquo(num_terms)) + + # Save some empty slots for future parts of the specification + parsnip::new_model_spec( + "ppr_reg", + args = args, + eng_args = NULL, + mode = mode, + method = NULL, + engine = engine + ) +} + +# Step 3: add a fit module +parsnip::set_fit( + model = "ppr_reg", + eng = "ppr", + mode = "regression", + value = list( + interface = "formula", + protect = c("formula", "data", "weights"), + func = c(pkg = "stats", fun = "ppr"), + defaults = list() + ) +) + +parsnip::set_encoding( + model = "ppr_reg", + eng = "ppr", + mode = "regression", + options = list( + predictor_indicators = "traditional", + compute_intercept = TRUE, + remove_intercept = TRUE, + allow_sparse_x = FALSE + ) +) + +# Step 4: add modules for prediction +parsnip::set_pred( + model = "ppr_reg", + eng = "ppr", + mode = "regression", + type = "numeric", + value = list( + pre = NULL, + post = NULL, + func = c(fun = "predict"), + args = list( + object = quote(object$fit), + newdata = quote(new_data), + type = "numeric" + ) + ) +) + +# Step 5: add tuning function (used by tune::tune_grid()) +tunable.ppr_reg <- function(x, ...) { + tibble::tibble( + name = c("num_terms"), + call_info = list(list(pkg = NULL, fun = "num_terms")), + source = "model_spec", + component = "ppr_reg", + component_id = "main" + ) +} + +# Step 6: add updating function (used by tune::finalize_workflow()) +update.ppr_reg <- function(object, parameters = NULL, num_terms = NULL, ...) { + rlang::check_installed("parsnip") + eng_args <- parsnip::update_engine_parameters(object$eng_args, fresh = TRUE, ...) + args <- list(num_terms = rlang::enquo(num_terms)) + args <- parsnip::update_main_parameters(args, parameters) + parsnip::new_model_spec( + "ppr_reg", + args = args, + eng_args = eng_args, + mode = object$mode, + method = NULL, + engine = object$engine + ) +} +``` + +We can now use the PPR model to compute the Shapley value +explanations. We can use it as a separate and surrogate +regression method, and we can either set the number of +terms `num_terms` to a specific value or use cross-validation +to tune the hyperparameter. We do all four combinations below. 
+
+```{r ppr-train, cache=TRUE}
+# PPR separate with specified number of terms
+explanation_list$sep_ppr <- explain(
+  model = model,
+  x_explain = x_explain,
+  x_train = x_train,
+  prediction_zero = p0,
+  n_batches = 4,
+  approach = "regression_separate",
+  regression.model = ppr_reg(num_terms = 2)
+)
+
+# PPR separate with cross-validated number of terms
+explanation_list$sep_ppr_cv <- explain(
+  model = model,
+  x_explain = x_explain,
+  x_train = x_train,
+  prediction_zero = p0,
+  n_batches = 4,
+  approach = "regression_separate",
+  regression.model = ppr_reg(num_terms = hardhat::tune()),
+  regression.tune_values = dials::grid_regular(dials::num_terms(c(1, 4)), levels = 3),
+  regression.vfold_cv_para = list(v = 5)
+)
+
+# PPR surrogate with specified number of terms
+explanation_list$sur_ppr <- explain(
+  model = model,
+  x_explain = x_explain,
+  x_train = x_train,
+  prediction_zero = p0,
+  n_batches = 4,
+  approach = "regression_surrogate",
+  regression.model = ppr_reg(num_terms = 3)
+)
+
+# PPR surrogate with cross-validated number of terms
+explanation_list$sur_ppr_cv <- explain(
+  model = model,
+  x_explain = x_explain,
+  x_train = x_train,
+  prediction_zero = p0,
+  n_batches = 4,
+  approach = "regression_surrogate",
+  regression.model = ppr_reg(num_terms = hardhat::tune()),
+  regression.tune_values = dials::grid_regular(dials::num_terms(c(1, 8)), levels = 4),
+  regression.vfold_cv_para = list(v = 5)
+)
+```
+
+We can then compare the $\operatorname{MSE}_v$ scores and some of the
+Shapley value explanations. We see that cross-validation improves the
+evaluation criterion but also increases the running time.
+
+```{r ppr-plot, cache=TRUE}
+# Print the MSEv scores and the elapsed time (in seconds) for the different methods
+print_MSEv_scores_and_time(explanation_list)
+
+# Compare the MSEv criterion of the different explanation methods
+plot_MSEv_scores(explanation_list, method_line = "MC_empirical")
+```
+
+
+# Summary figures {#summary}
+
+In this section, we compute the Shapley value explanations for the
+Monte Carlo-based methods in the `shapr` package and compare the results
+with all the regression-based methods above. The purpose of this vignette
+is to demonstrate the rich possibilities that the regression paradigm and
+the `tidymodels` framework add to the `shapr` package.
+
+In the code chunk below, we compute the Shapley value explanations using
+the different Monte Carlo-based methods.
+
+```{r MC, cache=TRUE}
+explanation_list_MC <- list()
+
+# Compute the Shapley value explanations using the independence method
+explanation_list_MC$MC_independence <- explain(
+  model = model,
+  x_explain = x_explain,
+  x_train = x_train,
+  n_batches = 4,
+  approach = "independence",
+  prediction_zero = p0
+)
+
+# Copy the Shapley value explanations for the empirical method
+explanation_list_MC$MC_empirical <- explanation_list$MC_empirical
+
+# Compute the Shapley value explanations using the gaussian method
+explanation_list_MC$MC_gaussian <- explain(
+  model = model,
+  x_explain = x_explain,
+  x_train = x_train,
+  n_batches = 4,
+  approach = "gaussian",
+  prediction_zero = p0
+)
+
+# Compute the Shapley value explanations using the copula method
+explanation_list_MC$MC_copula <- explain(
+  model = model,
+  x_explain = x_explain,
+  x_train = x_train,
+  n_batches = 4,
+  approach = "copula",
+  prediction_zero = p0
+)
+
+# Compute the Shapley value explanations using the ctree method
+explanation_list_MC$MC_ctree <- explain(
+  model = model,
+  x_explain = x_explain,
+  x_train = x_train,
+  n_batches = 4,
+  approach = "ctree",
+  prediction_zero = p0
+)
+
+# Compute the Shapley value explanations using the vaeac method
+explanation_list_MC$MC_vaeac <- explain(
+  model = model,
+  x_explain = x_explain,
+  x_train = x_train,
+  n_batches = 4,
+  approach = "vaeac",
+  prediction_zero = p0,
+  vaeac.epochs = 10
+)
+
+# Combine the two explanation lists
+explanation_list$MC_empirical <- NULL
+explanation_list <- c(explanation_list_MC, explanation_list)
+```
+
+We then compare the regression-based and Monte Carlo-based methods by
+plotting the $\operatorname{MSE}_v$ evaluation criterion. We continue to
+include a vertical line corresponding to the $\operatorname{MSE}_v$ of
+the `MC_empirical` method to make the comparison easier.
+
+```{r MSEv-sum, cache=TRUE}
+# Print the MSEv scores and the elapsed time (in seconds) for the different methods
+print_MSEv_scores_and_time(explanation_list)
+
+# Compare the MSEv criterion of the different explanation methods
+# Include vertical line corresponding to the MSEv of the MC_empirical method
+plot_MSEv_scores(explanation_list, method_line = "MC_empirical")
+```
+
+The `vaeac` approach is the best-performing method according to the
+$\operatorname{MSE}_v$ evaluation criterion, while `sep_xgboost_cv_2_par`
+is the best-performing regression-based method. However, we should note that
+the `vaeac` method is much slower and that the difference between the
+$\operatorname{MSE}_v$ values is minuscule and inside the confidence intervals.
+
+We can also sort the methods by the $\operatorname{MSE}_v$ criterion to
+make the ranking easier to inspect.
+
+```{r MSEv-sum-2, cache=TRUE}
+order <- get_k_best_methods(explanation_list, k = length(explanation_list))
+plot_MSEv_scores(explanation_list[order], method_line = "MC_empirical")
+```
+
+We can also examine the different Shapley value explanations for the first six
+explicands (two at a time), still sorting the methods from best to worst.
+Most methods agree on the general directions, especially for the most important
+features (the features with the largest absolute Shapley values), but there are
+some differences for the less important features. These discrepancies are often
+more visible for the methods with larger (worse) $\operatorname{MSE}_v$ values.
+
+```{r SV-sum, cache=TRUE}
+plot_SV_several_approaches(explanation_list[order], index_explicands = c(1, 2), facet_ncol = 1)
+plot_SV_several_approaches(explanation_list[order], index_explicands = c(3, 4), facet_ncol = 1)
+plot_SV_several_approaches(explanation_list[order], index_explicands = c(5, 6), facet_ncol = 1)
+```
+
+Here, we focus on the five best methods (and `MC_empirical`) to make it
+easier to analyze the individual Shapley value explanations, and we see
+quite strong agreement between the different methods.
+
+```{r SV-sum-2, cache = TRUE}
+# Extract the 5 best methods (and empirical)
+best_methods <- get_k_best_methods(explanation_list, k = 5)
+if (!"MC_empirical" %in% best_methods) best_methods <- c(best_methods, "MC_empirical")
+plot_SV_several_approaches(explanation_list[best_methods], index_explicands = 1:4)
+```
+
+
+# Mixed data {#mixed}
+
+In this section, we replicate and extend the mixed data example
+from the main vignette by demonstrating the separate and surrogate
+regression methods. Of the Monte Carlo-based methods, only the
+`independence` (not recommended), `ctree`, and `vaeac` methods support
+mixed data. We can divide the regression models into two groups based
+on whether the model can handle categorical features by default or if
+we need to apply pre-processing of the categorical features. By
+pre-processing, we mean that we need to convert the categorical features
+into numerical values using, for example, dummy features. We demonstrate
+this below using the `regression.recipe_func` argument.
+
+## Mixed data: setup
+
+First, we copy the setup from the main vignette.
+
+```{r mixed-setup, cache = TRUE}
+# Convert the month variable to a factor
+data_cat <- copy(data)[, Month_factor := as.factor(Month)]
+
+data_train_cat <- data_cat[-ind_x_explain, ]
+data_explain_cat <- data_cat[ind_x_explain, ]
+
+x_var_cat <- c("Solar.R", "Wind", "Temp", "Month_factor")
+
+x_train_cat <- data_train_cat[, ..x_var_cat]
+x_explain_cat <- data_explain_cat[, ..x_var_cat]
+
+p0_cat <- mean(y_train)
+
+# Fitting an lm model here as xgboost does not handle categorical features directly
+formula <- as.formula(paste0(y_var, " ~ ", paste0(x_var_cat, collapse = " + ")))
+model_cat <- lm(formula, data_train_cat)
+
+# We could also consider other models such as random forest, which supports mixed data
+# model_cat <- ranger(formula, data_train_cat)
+
+# List to store the explanations for this mixed data setup
+explanation_list_mixed <- list()
+```
+
+
+## Mixed data: Monte Carlo-based methods
+
+Second, we compute the explanations using the Monte Carlo-based methods.
+
+```{r mixed-MC, cache = TRUE}
+explanation_list_mixed$MC_independence <- explain(
+  model = model_cat,
+  x_explain = x_explain_cat,
+  x_train = x_train_cat,
+  prediction_zero = p0_cat,
+  n_batches = 4,
+  approach = "independence"
+)
+
+explanation_list_mixed$MC_ctree <- explain(
+  model = model_cat,
+  x_explain = x_explain_cat,
+  x_train = x_train_cat,
+  prediction_zero = p0_cat,
+  n_batches = 4,
+  approach = "ctree"
+)
+
+explanation_list_mixed$MC_vaeac <- explain(
+  model = model_cat,
+  x_explain = x_explain_cat,
+  x_train = x_train_cat,
+  prediction_zero = p0_cat,
+  n_batches = 4,
+  approach = "vaeac"
+)
+```
+
+
+## Mixed data: separate regression methods
+
+Third, we compute the Shapley value explanations using separate
+regression methods. We use many of the same regression models
+as we did above for the continuous data examples.
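+
+Before fitting the models, the following chunk (added here purely for
+illustration; `explain()` applies the recipe internally, so this is not a
+required step) previews what the dummy encoding used by the xgboost models
+below does to the categorical training data.
+
+```{r mixed-dummy-demo, cache = TRUE}
+# Build and prep a recipe that dummy encodes the factor features
+rec <- recipes::recipe(~., data = x_train_cat)
+rec <- recipes::step_dummy(rec, recipes::all_factor_predictors())
+rec_prepped <- recipes::prep(rec, training = x_train_cat)
+
+# Month_factor is replaced by one indicator column per non-reference level
+head(recipes::bake(rec_prepped, new_data = NULL))
+```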
+
+```{r mixed-separate, cache = TRUE}
+# Standard linear regression
+explanation_list_mixed$sep_lm <- explain(
+  model = model_cat,
+  x_explain = x_explain_cat,
+  x_train = x_train_cat,
+  prediction_zero = p0_cat,
+  n_batches = 4,
+  approach = "regression_separate",
+  regression.model = parsnip::linear_reg()
+)
+
+# Linear regression where we have added splines to the numerical features
+explanation_list_mixed$sep_splines <- explain(
+  model = model_cat,
+  x_explain = x_explain_cat,
+  x_train = x_train_cat,
+  prediction_zero = p0_cat,
+  n_batches = 4,
+  approach = "regression_separate",
+  regression.model = parsnip::linear_reg(),
+  regression.recipe_func = function(regression_recipe) {
+    return(step_ns(regression_recipe, all_numeric_predictors(), deg_free = 2))
+  }
+)
+
+# Decision tree with default parameters
+explanation_list_mixed$sep_tree <- explain(
+  model = model_cat,
+  x_explain = x_explain_cat,
+  x_train = x_train_cat,
+  prediction_zero = p0_cat,
+  n_batches = 4,
+  approach = "regression_separate",
+  regression.model = parsnip::decision_tree(engine = "rpart", mode = "regression")
+)
+
+# Decision tree with cross-validation over tree depth and cost complexity,
+# with manually specified candidate values
+explanation_list_mixed$sep_tree_cv <- explain(
+  model = model_cat,
+  x_explain = x_explain_cat,
+  x_train = x_train_cat,
+  prediction_zero = p0_cat,
+  n_batches = 4,
+  approach = "regression_separate",
+  regression.model = parsnip::decision_tree(
+    tree_depth = hardhat::tune(),
+    cost_complexity = hardhat::tune(),
+    engine = "rpart",
+    mode = "regression"
+  ),
+  regression.tune_values =
+    expand.grid(tree_depth = c(1, 3, 5), cost_complexity = c(0.001, 0.01, 0.1)),
+  regression.vfold_cv_para = list(v = 5)
+)
+
+# Random forest with default hyperparameters. No dummy features needed,
+# as ranger handles factors directly.
+explanation_list_mixed$sep_rf <- explain(
+  model = model_cat,
+  x_explain = x_explain_cat,
+  x_train = x_train_cat,
+  prediction_zero = p0_cat,
+  n_batches = 4,
+  approach = "regression_separate",
+  regression.model = parsnip::rand_forest(engine = "ranger", mode = "regression")
+)
+
+# Random forest with cross-validated hyperparameters
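+# (Added note) Here, regression.tune_values is a function of the coalition's
+# training data x: the upper bound of mtry must not exceed the number of
+# features in the coalition, which varies between coalitions in the separate
+# regression method class.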
+explanation_list_mixed$sep_rf_cv <- explain(
+  model = model_cat,
+  x_explain = x_explain_cat,
+  x_train = x_train_cat,
+  prediction_zero = p0_cat,
+  n_batches = 4,
+  approach = "regression_separate",
+  regression.model = parsnip::rand_forest(
+    mtry = hardhat::tune(), trees = hardhat::tune(), engine = "ranger", mode = "regression"
+  ),
+  regression.tune_values =
+    function(x) {
+      dials::grid_regular(dials::mtry(c(1, ncol(x))), dials::trees(c(50, 750)), levels = 4)
+    },
+  regression.vfold_cv_para = list(v = 5)
+)
+
+# Xgboost with default hyperparameters, but we have to dummy encode the factors
+explanation_list_mixed$sep_xgboost <- explain(
+  model = model_cat,
+  x_explain = x_explain_cat,
+  x_train = x_train_cat,
+  prediction_zero = p0_cat,
+  n_batches = 4,
+  approach = "regression_separate",
+  regression.model = parsnip::boost_tree(engine = "xgboost", mode = "regression"),
+  regression.recipe_func = function(regression_recipe) {
+    return(step_dummy(regression_recipe, all_factor_predictors()))
+  }
+)
+
+# Xgboost with cross-validated hyperparameters; we again dummy encode the factors
+explanation_list_mixed$sep_xgboost_cv <- explain(
+  model = model_cat,
+  x_explain = x_explain_cat,
+  x_train = x_train_cat,
+  prediction_zero = p0_cat,
+  n_batches = 4,
+  approach = "regression_separate",
+  regression.model = parsnip::boost_tree(
+    trees = hardhat::tune(),
+    tree_depth = hardhat::tune(),
+    engine = "xgboost",
+    mode = "regression"
+  ),
+  regression.recipe_func = function(regression_recipe) {
+    return(step_dummy(regression_recipe, all_factor_predictors()))
+  },
+  regression.tune_values = expand.grid(trees = c(5, 15, 25), tree_depth = c(2, 6, 10)),
+  regression.vfold_cv_para = list(v = 5)
+)
+```
+
+
+## Mixed data: surrogate regression methods
+
+Fourth, we compute the Shapley value explanations using surrogate
+regression methods. We use the same regression models as we did
+above for the separate regression method class.
+
+```{r mixed-surrogate, cache = TRUE}
+# Standard linear regression
+explanation_list_mixed$sur_lm <- explain(
+  model = model_cat,
+  x_explain = x_explain_cat,
+  x_train = x_train_cat,
+  prediction_zero = p0_cat,
+  n_batches = 4,
+  approach = "regression_surrogate",
+  regression.model = parsnip::linear_reg()
+)
+
+# Linear regression where we have added splines to the numerical features
+# NOTE: we remove the augmented mask variables to avoid a rank-deficient fit
+explanation_list_mixed$sur_splines <- explain(
+  model = model_cat,
+  x_explain = x_explain_cat,
+  x_train = x_train_cat,
+  prediction_zero = p0_cat,
+  n_batches = 4,
+  approach = "regression_surrogate",
+  regression.model = parsnip::linear_reg(),
+  regression.recipe_func = function(recipe) {
+    return(step_ns(recipe, all_numeric_predictors(), -starts_with("mask_"), deg_free = 2))
+  }
+)
+
+# Decision tree with default parameters
+explanation_list_mixed$sur_tree <- explain(
+  model = model_cat,
+  x_explain = x_explain_cat,
+  x_train = x_train_cat,
+  prediction_zero = p0_cat,
+  n_batches = 4,
+  approach = "regression_surrogate",
+  regression.model = parsnip::decision_tree(engine = "rpart", mode = "regression")
+)
+
+# Decision tree with cross-validation over tree depth and cost complexity,
+# with manually specified candidate values
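+# (Added note) expand.grid() enumerates all 3 x 3 = 9 combinations of tree
+# depth and cost complexity; each one is evaluated with 5-fold cross-validation.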
+explanation_list_mixed$sur_tree_cv <- explain(
+  model = model_cat,
+  x_explain = x_explain_cat,
+  x_train = x_train_cat,
+  prediction_zero = p0_cat,
+  n_batches = 4,
+  approach = "regression_surrogate",
+  regression.model = parsnip::decision_tree(
+    tree_depth = hardhat::tune(),
+    cost_complexity = hardhat::tune(),
+    engine = "rpart",
+    mode = "regression"
+  ),
+  regression.tune_values =
+    expand.grid(tree_depth = c(1, 3, 5), cost_complexity = c(0.001, 0.01, 0.1)),
+  regression.vfold_cv_para = list(v = 5)
+)
+
+# Random forest with default hyperparameters. No dummy features needed,
+# as ranger handles factors directly.
+explanation_list_mixed$sur_rf <- explain(
+  model = model_cat,
+  x_explain = x_explain_cat,
+  x_train = x_train_cat,
+  prediction_zero = p0_cat,
+  n_batches = 4,
+  approach = "regression_surrogate",
+  regression.model = parsnip::rand_forest(engine = "ranger", mode = "regression")
+)
+
+# Random forest with cross-validated hyperparameters
+explanation_list_mixed$sur_rf_cv <- explain(
+  model = model_cat,
+  x_explain = x_explain_cat,
+  x_train = x_train_cat,
+  prediction_zero = p0_cat,
+  n_batches = 4,
+  approach = "regression_surrogate",
+  regression.model = parsnip::rand_forest(
+    mtry = hardhat::tune(), trees = hardhat::tune(), engine = "ranger", mode = "regression"
+  ),
+  regression.tune_values = expand.grid(mtry = c(1, 2, 4), trees = c(50, 250, 500, 750)),
+  regression.vfold_cv_para = list(v = 5)
+)
+
+# Xgboost with default hyperparameters, but we have to dummy encode the factors
+explanation_list_mixed$sur_xgboost <- explain(
+  model = model_cat,
+  x_explain = x_explain_cat,
+  x_train = x_train_cat,
+  prediction_zero = p0_cat,
+  n_batches = 4,
+  approach = "regression_surrogate",
+  regression.model = parsnip::boost_tree(engine = "xgboost", mode = "regression"),
+  regression.recipe_func = function(regression_recipe) {
+    return(step_dummy(regression_recipe, all_factor_predictors()))
+  }
+)
+
+# Xgboost with cross-validated hyperparameters; we again dummy encode the factors
+explanation_list_mixed$sur_xgboost_cv <- explain(
+  model = model_cat,
+  x_explain = x_explain_cat,
+  x_train = x_train_cat,
+  prediction_zero = p0_cat,
+  n_batches = 4,
+  approach = "regression_surrogate",
+  regression.model = parsnip::boost_tree(
+    trees = hardhat::tune(),
+    tree_depth = hardhat::tune(),
+    engine = "xgboost",
+    mode = "regression"
+  ),
+  regression.recipe_func = function(regression_recipe) {
+    return(step_dummy(regression_recipe, all_factor_predictors()))
+  },
+  regression.tune_values = expand.grid(trees = c(5, 15, 25), tree_depth = c(2, 6, 10)),
+  regression.vfold_cv_para = list(v = 5)
+)
+```
+
+
+## Mixed data: summary {#summary_mixed}
+
+Fifth, and finally, we compare the results. The surrogate random
+forest model performs well and outperforms its cross-validated
+version, but note the wide confidence interval. Several of the
+regression-based methods outperform the Monte Carlo-based methods;
+more specifically, three separate and three surrogate regression
+methods do.
+
+```{r mixed-plot, cache = TRUE}
+# Print the MSEv scores and the elapsed time (in seconds) for the different methods
+print_MSEv_scores_and_time(explanation_list_mixed)
+
+# Compare the MSEv criterion of the different explanation methods
+# Include vertical line corresponding to the MSEv of the ctree method
+plot_MSEv_scores(explanation_list_mixed, method_line = "MC_ctree")
+```
+
+The best-performing methods are the surrogate random forest
+and the cross-validated surrogate xgboost methods.
The Monte Carlo-based methods perform worse, with `ctree` performing
+best among them, ranking seventh overall.
+
+We can also sort the methods by the $\operatorname{MSE}_v$ criterion to
+make the ranking easier to inspect.
+
+```{r mixed-plot-2, cache = TRUE}
+order <- get_k_best_methods(explanation_list_mixed, k = length(explanation_list_mixed))
+plot_MSEv_scores(explanation_list_mixed[order], method_line = "MC_ctree")
+```
+
+We also look at some of the Shapley value explanations and
+see that many methods produce similar explanations.
+
+```{r mixed-plot-3, cache = TRUE}
+plot_SV_several_approaches(explanation_list_mixed[order], index_explicands = c(1, 2), facet_ncol = 1)
+```
+
+We can also focus on the Shapley value explanations for the best five
+methods according to the $\operatorname{MSE}_v$ criterion. We also
+include the `ctree` method, the best-performing Monte Carlo-based method.
+
+```{r mixed-plot-4, cache = TRUE}
+best_methods <- get_k_best_methods(explanation_list_mixed, k = 5)
+if (!"MC_ctree" %in% best_methods) best_methods <- c(best_methods, "MC_ctree")
+plot_SV_several_approaches(explanation_list_mixed[best_methods], index_explicands = 1:4)
+```
+
+
+# Regression arguments as strings
+
+In this section, we demonstrate that the `regression.model`,
+`regression.tune_values`, and `regression.recipe_func`
+parameters can be provided as strings. This is convenient when
+the `explain()` function is called from Python, as the user then
+only has to specify strings containing R code instead of creating
+the R objects in Python. In the code chunk below, we see
+that we obtain identical $\operatorname{MSE}_v$ scores for
+the string and non-string versions.
+
+```{r R-vs-string, cache = TRUE}
+explanation_list_str <- list()
+explanation_list_str$sep_lm <- explain(
+  model = model,
+  x_explain = x_explain,
+  x_train = x_train,
+  prediction_zero = p0,
+  n_batches = 4,
+  approach = "regression_separate",
+  regression.model = "parsnip::linear_reg()"
+)
+
+explanation_list_str$sep_pcr <- explain(
+  model = model,
+  x_explain = x_explain,
+  x_train = x_train,
+  prediction_zero = p0,
+  n_batches = 4,
+  approach = "regression_separate",
+  regression.model = "parsnip::linear_reg()",
+  regression.recipe_func = "function(regression_recipe) {
+    return(recipes::step_pca(regression_recipe, recipes::all_numeric_predictors(), num_comp = 2))
+  }"
+)
+
+explanation_list_str$sep_splines <- explain(
+  model = model,
+  x_explain = x_explain,
+  x_train = x_train,
+  prediction_zero = p0,
+  n_batches = 4,
+  approach = "regression_separate",
+  regression.model = parsnip::linear_reg(),
+  regression.recipe_func = "function(regression_recipe) {
+    return(recipes::step_ns(regression_recipe, recipes::all_numeric_predictors(), deg_free = 2))
+  }"
+)
+
+explanation_list_str$sep_tree_cv <- explain(
+  model = model,
+  x_explain = x_explain,
+  x_train = x_train,
+  prediction_zero = p0,
+  n_batches = 4,
+  approach = "regression_separate",
+  regression.model = "parsnip::decision_tree(
+    tree_depth = hardhat::tune(), engine = 'rpart', mode = 'regression'
+  )",
+  regression.tune_values = "dials::grid_regular(dials::tree_depth(), levels = 4)",
+  regression.vfold_cv_para = list(v = 5)
+)
+
+# Using random forest with parameters tuned by cross-validation
+explanation_list_str$sep_rf_cv <- explain(
+  model = model,
+  x_explain = x_explain,
+  x_train = x_train,
+  prediction_zero = p0,
+  n_batches = 1, # As we used this for the non-string version
+  approach =
"regression_separate", + regression.model = "parsnip::rand_forest( + mtry = hardhat::tune(), trees = hardhat::tune(), engine = 'ranger', mode = 'regression' + )", + regression.tune_values = + "function(x) { + dials::grid_regular(dials::mtry(c(1, ncol(x))), dials::trees(c(50, 750)), levels = 3) + }", + regression.vfold_cv_para = list(v = 5) +) + +# Using random forest with parameters tuned by cross-validation as the surrogate model +explanation_list_str$sur_rf_cv <- explain( + model = model, + x_explain = x_explain, + x_train = x_train, + prediction_zero = p0, + n_batches = 4, + approach = "regression_surrogate", + regression.model = "parsnip::rand_forest( + mtry = hardhat::tune(), trees = hardhat::tune(), engine = 'ranger', mode = 'regression' + )", + regression.tune_values = "dials::grid_regular( + dials::mtry(c(1, ncol(x_explain))), + dials::trees(c(50, 750)), + levels = 6 + )", + regression.vfold_cv_para = list(v = 5) +) + +# See that the evaluation scores match the non-string versions. +print_MSEv_scores_and_time(explanation_list_str) +print_MSEv_scores_and_time(explanation_list[names(explanation_list_str)]) +``` + + +# Vignette summary + +This vignette demonstrates the rich possibilities that the regression +paradigm and the `tidymodels` framework add to the `shapr` package. +We have seen that regression-based methods are on par with or outperform +the Monte Carlo-based methods regarding the $\operatorname{MSE}_v$ +evaluation criterion. Furthermore, we have seen that the regression-based +methods are relatively computationally fast and that parallelization can +be used to speed up the computations. + +# References