Adding the separate and surrogate regression method classes #379

Merged · 399 commits · Apr 17, 2024
Commits (399)
0e9d773
update docs
martinju Feb 15, 2024
c350892
update test files
martinju Feb 15, 2024
87f1a53
accept also vaeac tests
martinju Feb 15, 2024
7abd4d8
Started shapr vaeac vignette skip build. Need to change its filename …
LHBO Feb 15, 2024
0ef4b65
Test if it works now
LHBO Feb 15, 2024
1f120f7
to run properly on GHA which only have 4 cores
martinju Feb 16, 2024
dbf01e5
Merge branch 'Lars/VAEAC_SHAPR' of github.com:LHBO/shapr into Lars/VA…
martinju Feb 16, 2024
72db0d8
fix long names in vignette cache, remove formalArgs ++
martinju Feb 16, 2024
b1fdd58
move long running vignette script out of R/
martinju Feb 16, 2024
bb2a85a
.
martinju Feb 16, 2024
10fb862
actually run the chunks in the vignettes
martinju Feb 16, 2024
68fb88d
vignette figures + more name fixing
martinju Feb 16, 2024
25db7b3
gitignore vignette cache. Will do it manually locally anyway
martinju Feb 16, 2024
cd0df49
more vignette file updates
martinju Feb 16, 2024
5d28b8a
put the original vignette into the long-vignette format as well
martinju Feb 16, 2024
4a3b4aa
fix typo making tests on GHA fail
martinju Feb 16, 2024
7e5bec3
forgot to update rds files
martinju Feb 16, 2024
70a3048
Added main vignette to the .Rbuildignore file too
LHBO Feb 16, 2024
b7a85b3
Added vignettes figure and cache map to buildignore
LHBO Feb 16, 2024
4060144
Updated what to remove from shapr object when not saving vaeac model.…
LHBO Feb 16, 2024
93311fa
Updated questions before publishing on CRAN
LHBO Feb 16, 2024
b61ac8f
Updated snapshots for vaeac test functions
LHBO Feb 16, 2024
96e87a0
Set separate figure and cache paths for the vignettes
LHBO Feb 16, 2024
4ce7e98
Updated broken links in vaeac vignette
LHBO Feb 16, 2024
31baa9c
Updated the figure parameters
LHBO Feb 16, 2024
6be1ee0
Run the vignettes and deleted old files
LHBO Feb 16, 2024
e4e1b58
stylr
LHBO Feb 16, 2024
d8aecdf
Updated rbuildignore. Forgot to update after I split the vignettes in…
LHBO Feb 19, 2024
7f9e81c
Adding script to show that getbatch is much faster than getitem
LHBO Feb 19, 2024
d1622bf
Updated from getitem to getbatch and updated some comments
LHBO Feb 19, 2024
ce0e39d
Updated the roxygen and comments of the torch modules
LHBO Feb 19, 2024
3849779
Typo
LHBO Feb 19, 2024
c481d02
Reran building the vaeac vignette.
LHBO Feb 19, 2024
4fe7b66
Updates needed for GPU to run
Feb 20, 2024
c68e5c4
Changed from vaeac_continue_train_model to vaeac_train_model_continue…
Feb 20, 2024
fb9b204
Update the printouts to state which unit the model is using (CPU or GPU).
Feb 20, 2024
28567ea
More GPU fixes
Feb 20, 2024
94e579a
Fixed such that GPU VAEAC can continue training.
Feb 20, 2024
8ffc5f9
Updated the vignette
Feb 20, 2024
8ad5c4a
styler+lintr
Feb 20, 2024
e79e12d
rerun vignette with messages ++ (Vignettes must be recompiled with GPU!)
martinju Feb 21, 2024
9061e6c
exclude all vignettes from lintr
martinju Feb 21, 2024
f7cc967
remove purl call
martinju Feb 21, 2024
f429cb7
Fixed typo and too wide lines
LHBO Feb 21, 2024
da6fe6a
Removed unnecessary `keep_samp_for_vS`
LHBO Feb 22, 2024
3de36ac
Adding logical indicating if we are to use regression and regression …
LHBO Feb 22, 2024
3d83ff1
Split `batch_compute_vS()` based on regression or MC
LHBO Feb 22, 2024
0c7ac84
Added `batch_prepare_vS_regression()` which ensures that the grand co…
LHBO Feb 22, 2024
e373f6b
Added the regression_separate approach. Works without CV. Need to che…
LHBO Feb 22, 2024
e4fdad2
Added tidymodels to DESCRIPTION. NEED TO FIND OUT WHAT TO KEEP
LHBO Feb 23, 2024
1534ef8
ADDED TO regression stuff to ignore
LHBO Feb 23, 2024
15034bd
Added tidymodels to references
LHBO Feb 23, 2024
141a8da
Use general prepare_data function
LHBO Feb 23, 2024
3fbc300
Updated cross-validation
LHBO Feb 23, 2024
95888cc
Updated gitignore
LHBO Feb 23, 2024
0470815
Added manuals
LHBO Feb 23, 2024
2d00e67
Added vignette
LHBO Feb 23, 2024
086ce5b
Updated rebuild_long_running_vginette.R
LHBO Feb 23, 2024
cdea349
Added vignette figure folder
LHBO Feb 23, 2024
1d93d1d
Small vignette update
LHBO Feb 23, 2024
627614e
styler + lintr
LHBO Feb 23, 2024
e31bf77
styler + lintr
LHBO Feb 23, 2024
d91460a
Typo in package names in Description
LHBO Feb 26, 2024
bbf4a7d
Added new dependencies in DESCRIPTION
LHBO Feb 26, 2024
bcc3c2c
Fixed non-working approach check related to that regression approach …
LHBO Feb 26, 2024
3413738
Updated to support surrogate regression too
LHBO Feb 27, 2024
a0e497b
Missing comma in DESCRIPTION
LHBO Feb 27, 2024
ac9acc1
Tidied up reg sep
LHBO Feb 27, 2024
70fa894
Added first version of reg sur
LHBO Feb 27, 2024
7b9ac8e
Shortened the code
LHBO Feb 27, 2024
4cfa622
Renamed objects to separate sur and sep
LHBO Feb 27, 2024
794887b
Shortened code in explain
LHBO Feb 27, 2024
415450a
Added roxygen documentation
LHBO Feb 27, 2024
6befc84
Added check for regression in setup
LHBO Feb 28, 2024
4191c94
Fixed bug and made code shorter
LHBO Feb 28, 2024
8993845
Refactored surrogate
LHBO Feb 28, 2024
fd9ea1d
Started refactor separate
LHBO Feb 28, 2024
0e00829
Small visual differences
LHBO Feb 28, 2024
e89952a
Updated manuals
LHBO Feb 29, 2024
38a6a8e
Updated separate regression works now
LHBO Feb 29, 2024
69f6c29
Changed printing
LHBO Feb 29, 2024
b666bd2
Starting to MAYBE add NN_OLSEN
LHBO Feb 29, 2024
eeb6c26
Fixed bug where it did not work for grand comb in separate batch
LHBO Feb 29, 2024
5612fdc
Stopped using pipes %>% to make the code work in parallel using futur…
LHBO Feb 29, 2024
6e69e22
Created own function in explain that removes the unneeded stuff from t…
LHBO Feb 29, 2024
e38d4dc
Prettier printouts
LHBO Feb 29, 2024
faedbb5
Removed comments
LHBO Feb 29, 2024
b828ef7
Moved from right to left. I do not think I will ever be completely ha…
LHBO Feb 29, 2024
901e05c
Forgot to change function name
LHBO Mar 1, 2024
be24df5
Updated vignette
LHBO Mar 1, 2024
92458be
Added testing of ppr. DELETE THIS FILE IN THE END
LHBO Mar 4, 2024
963855b
Added PPR as a new method that gets loaded together with SHAPR. TEST …
LHBO Mar 5, 2024
6ba16ae
Started on potentially making tidymodels/workflow as a potential mode…
LHBO Mar 5, 2024
9c4d68a
Removed old file
LHBO Mar 5, 2024
16f24e1
Update such that PPR gets loaded together with shapr
LHBO Mar 5, 2024
3211178
Moved where to load ppr
LHBO Mar 5, 2024
73306b9
Find out if I should remove the ppr approach
LHBO Mar 5, 2024
5bbe272
Added example of how to include new model in vignette
LHBO Mar 5, 2024
e3fcdf5
Merge master into branch part one
LHBO Mar 6, 2024
08ec72e
Updated regression train for surrogate regression to improve the cros…
LHBO Mar 8, 2024
15fcf03
Changed to always include the grand coalition when training surrogate…
LHBO Mar 8, 2024
ac65a20
Updated the vignette
LHBO Mar 8, 2024
6be6eb4
Vignette works. Need some polishing though
LHBO Mar 8, 2024
e946fd8
Styler + lintr
LHBO Mar 8, 2024
781c025
Added option for `regression_tune_values` to be a function. Needed fo…
LHBO Mar 11, 2024
c1b2fd3
Fix bug in vaeac
LHBO Mar 11, 2024
8677c6a
Started looking at NN_Olsen
LHBO Mar 11, 2024
67e85d8
Added mixed data to the vignette
LHBO Mar 11, 2024
46b0910
Fixed bug related to separate/surrogate regression when using several…
LHBO Mar 12, 2024
6a91ce7
Added file that tests all the parameters for the regression approaches.
LHBO Mar 12, 2024
8b024fb
Added file with test functions for the output of the regression methods
LHBO Mar 12, 2024
ab6170c
Added more test and fixed prefix to `tune()` function
LHBO Mar 12, 2024
d9135f1
Added test for surrogate regression on `regresserion_surr_n_comb`
LHBO Mar 12, 2024
633ab92
Added vignette figures
LHBO Mar 12, 2024
9c54741
Added more examples to the vignette
LHBO Mar 12, 2024
97f2385
Include prefix of tune() and remove the pipe-function
LHBO Mar 12, 2024
6f5d3e1
Removed files
LHBO Mar 12, 2024
a3bc72e
Updated description
LHBO Mar 12, 2024
a50f659
Added regression to rebuild long running vignette
LHBO Mar 12, 2024
3d96fad
styler + lintr
LHBO Mar 12, 2024
c02e5bf
Added TOC to vignette
LHBO Mar 12, 2024
15a2367
Removed make_ppr_reg() from onLoad
LHBO Mar 12, 2024
8133885
Update vignette plots
LHBO Mar 12, 2024
62c64c8
Updated the vignette to the .orig style
LHBO Mar 12, 2024
8e3a727
Updated manuals 1
LHBO Mar 12, 2024
e1a9e15
Fixed "Found if() conditions comparing class() to string:".
LHBO Mar 12, 2024
06ae057
Changed namespace
LHBO Mar 12, 2024
0751063
added manuals 2
LHBO Mar 12, 2024
cc3b109
Renamed model to workflow and finalized the model functions. NEED…
LHBO Mar 13, 2024
e76a550
Updated the manuals
LHBO Mar 13, 2024
ba16a3d
Added workflows:::fit.workflow, but I am unsure if I should.
LHBO Mar 13, 2024
47402c4
Fixed warnings in check
LHBO Mar 13, 2024
fa34acd
Updated the vignette
LHBO Mar 13, 2024
84687e4
Updated model_workflow to work with recipes and fixed predict function
LHBO Mar 14, 2024
f4bdd0d
Added function where I experimented with the model workflow.
LHBO Mar 14, 2024
5b266ab
Updated vignettes
LHBO Mar 14, 2024
d0adcb2
Updated libraries
LHBO Mar 14, 2024
98cbf76
Fixed ::: check message
LHBO Mar 14, 2024
7a10b40
Updated Manuals
LHBO Mar 14, 2024
f53ce16
Fixed namespace check in the regression approaches
LHBO Mar 14, 2024
ad4312e
Fixed Undefined global functions or variables
LHBO Mar 14, 2024
0151f11
Added tests for regression
LHBO Mar 14, 2024
70cef48
Fixed bug with surrogate when all features were categorical
LHBO Mar 14, 2024
3e31a55
Fixed bug with regression_recipe_func
LHBO Mar 14, 2024
e0ff09d
Fixed vignette
LHBO Mar 14, 2024
c590a58
Updated the test snapshots for regression methods
LHBO Mar 14, 2024
827a4a5
Remove `regression` from output for the MC methods
LHBO Mar 14, 2024
1515247
Surrogate save the augmented training data
LHBO Mar 14, 2024
c3bfa23
Make it optional to store the augmented training data
LHBO Mar 14, 2024
e22df68
Removed x_augment from the snapfiles (took too much space)
LHBO Mar 14, 2024
6bc4de7
Shorten the test names
LHBO Mar 14, 2024
f05f5f8
Go back to the way it was before with approach error message
LHBO Mar 14, 2024
93fac47
Manuals
LHBO Mar 14, 2024
0c9c0e0
Merged master into branch
LHBO Mar 14, 2024
fa9642c
Styler
LHBO Mar 14, 2024
afc758e
Typo
LHBO Mar 14, 2024
c86f50b
Remove regression from explain_forecast too
LHBO Mar 14, 2024
fb2feba
Added check for forecast
LHBO Mar 14, 2024
20334db
Update the snaps with correct answers
LHBO Mar 14, 2024
3352928
Update explain forecast to remove objects from output
LHBO Mar 14, 2024
4488193
Update printout for approaches
LHBO Mar 14, 2024
bfec276
Manuals
LHBO Mar 14, 2024
92051eb
Delete man/MCAR_mask_generator.Rd
LHBO Mar 14, 2024
cb6554d
Delete man/Specified_masks_mask_generator.Rd
LHBO Mar 14, 2024
bba2d05
Delete man/Specified_prob_mask_generator.Rd
LHBO Mar 14, 2024
e596ac1
Renamed manuals
LHBO Mar 14, 2024
c6d4f99
Fixed manual file names now?
LHBO Mar 14, 2024
28a5bdc
Enabling all tests
LHBO Mar 14, 2024
3047b09
Update R-CMD-check.yaml
martinju Mar 15, 2024
2acbe61
Fixed such that we get the same surrogate model independent if the CV…
LHBO Mar 18, 2024
e766320
Ensure that we can parallelize both training and prediction step in su…
LHBO Mar 18, 2024
8ba57a8
Updated the vignette
LHBO Mar 18, 2024
6d504cd
Update 1 snap as it changed with the extra set.seed
LHBO Mar 18, 2024
4336f80
Update manuals
LHBO Mar 18, 2024
84c171e
Styler + lintr
LHBO Mar 18, 2024
16139d9
Fixed R-CMD to match how Martin did it
LHBO Mar 18, 2024
1b59fac
Roxygen
LHBO Mar 18, 2024
b13d747
Specify that the regression approaches are only applicable when calle…
LHBO Mar 18, 2024
ef82612
Added test related to regression approaches and python
LHBO Mar 18, 2024
e35cdd2
set bash shell on windows
martinju Mar 20, 2024
9ca5424
delete unwanted cache files
martinju Apr 2, 2024
05ac0a5
Changed from `regression_` to `regression.` in function parameters.
LHBO Apr 3, 2024
cd0a09d
Removed option to save x_augmented for regression_surrogate
LHBO Apr 3, 2024
fef0f6f
Changed so all regression functions start with `regression.`
LHBO Apr 3, 2024
3fd76f4
`regression_` to `regression.` inside functions too for consistency
LHBO Apr 3, 2024
3d1a1e5
Went back to old code design (several lines vs one line)
LHBO Apr 4, 2024
dc0ecff
Refactored `batch_compute_vS()`
LHBO Apr 4, 2024
96b22ad
Updated warning message
LHBO Apr 4, 2024
f678bb4
Added `@inheritDotParams setup_approach.regression_`
LHBO Apr 4, 2024
522707a
Added missing parameter in `batch_prepare_vS_MC`.
LHBO Apr 4, 2024
3d6ca2c
Fixed names
LHBO Apr 4, 2024
1664bca
Fixed `tune::select_best()` error with `metric` not being specified.
LHBO Apr 4, 2024
c5dd174
Add `metric = 'remse'` to `tune::show_best()` to not get a warning.
LHBO Apr 4, 2024
176261b
Changed main vignette such that `explanation` is not overwritten, and…
LHBO Apr 4, 2024
6732326
Added workflows example in the main vignette
LHBO Apr 8, 2024
2a7f0e1
Update the vignette based on Martin's feedback
LHBO Apr 9, 2024
8aaff55
Styler
LHBO Apr 9, 2024
6dcb264
Typo
LHBO Apr 9, 2024
bba4757
Typo
LHBO Apr 9, 2024
a04f388
Fixed the error?
LHBO Apr 9, 2024
556f85b
Fixed the error now?
LHBO Apr 9, 2024
5cb1f4f
Fixed bug in compute_vS.R where `dt` was not returned.
LHBO Apr 10, 2024
cebd3d7
Updated setup test snaps (changed text since we now use `regression.`…
LHBO Apr 10, 2024
c1182ed
Update snaps. Changes due to changing names.
LHBO Apr 10, 2024
c9e9804
Accept changes due to changed warning message.
LHBO Apr 10, 2024
7b804bc
Updated such that the `regression.model`,
LHBO Apr 10, 2024
41f0a90
Add string section in the Vignette
LHBO Apr 10, 2024
4d87fbf
styler
LHBO Apr 10, 2024
1fffa33
Manual updates
LHBO Apr 10, 2024
5f553ad
Rerun reg vignette
LHBO Apr 10, 2024
f4df345
Merged master into branch
LHBO Apr 11, 2024
e95d939
Update when `regression.get_y_hat` is called.
LHBO Apr 11, 2024
1d157ed
Added to documentation in model.R that we can also explain workflows/…
LHBO Apr 11, 2024
e8842d2
Removed check that said that regression did not work from python
LHBO Apr 11, 2024
50c3eed
changed `batch_prepare_vS` to `batch_prepare_vS_MC_auxiliary` to make…
LHBO Apr 12, 2024
1e9d85b
Fixed the last places that were named `regression_` instead of `regre…
LHBO Apr 12, 2024
b7228af
Added some comments
LHBO Apr 12, 2024
2c0fe7d
Changed several things in python's explain:
LHBO Apr 12, 2024
b22da04
update test snaps as I changed one parameter name
LHBO Apr 12, 2024
d59ebe5
Missed the `MSEv_g` object
LHBO Apr 12, 2024
53686b0
Small typo
LHBO Apr 12, 2024
4ff04aa
Fixed such that the `regression.vfold_cv_para` dict is converted to R…
LHBO Apr 12, 2024
c3bf277
Typo
LHBO Apr 12, 2024
d163e57
Added info about regression-based approaches in the readme in python
LHBO Apr 12, 2024
0ae63af
Added example file demonstrating how to use the regression paradigm f…
LHBO Apr 12, 2024
8bb9926
Pycharm added requirements file
LHBO Apr 12, 2024
c98bb08
Added MSEv output of the methods
LHBO Apr 12, 2024
f7e4ac6
Updated the manuals
LHBO Apr 12, 2024
b7f7fe9
Added section in main vignette about regression paradigm. Not compile…
LHBO Apr 12, 2024
5803e2b
Updated reference to the comparative study paper that is now published
LHBO Apr 14, 2024
4acc193
Martin: added the title of the regression vignette.
LHBO Apr 15, 2024
5cb94da
Martin: reverted to 2 indents where it was like that before.
LHBO Apr 15, 2024
24e76bf
Martin: better comments in `explain.R`.
LHBO Apr 15, 2024
d596189
Martin: deleted python/requirements.txt as both of us were unsure if …
LHBO Apr 15, 2024
048e1f7
bugfix and rerun main vignette
martinju Apr 15, 2024
1c56989
fix and rerun regression vignette
martinju Apr 15, 2024
1db3e5e
Added `explain_tripledot_docs` function to make documentation from Py…
LHBO Apr 16, 2024
3088ba6
Changed comment in py-explain
LHBO Apr 16, 2024
cff264d
Added sep and sur example in `explain` and added updated manuals.
LHBO Apr 16, 2024
de3e9b9
Updated kwargs docu in python.
LHBO Apr 16, 2024
b9b7f0e
Update manuals
LHBO Apr 16, 2024
5ba6bb9
Martin: merged his updates into mine
LHBO Apr 16, 2024
c9fd319
Typo vignette
LHBO Apr 16, 2024
cd3e69a
Fixed conflicts in reg vignette
LHBO Apr 16, 2024
8b54165
Martin: mistake in which dataset to use. Gave NA NA
LHBO Apr 16, 2024
ca2516c
Built the regression vignette (was broken after Martin commit)
LHBO Apr 16, 2024
fe1b756
Updated the main vignette too
LHBO Apr 16, 2024
79b1ee1
Added title of the regression vignette in the main vignette
LHBO Apr 16, 2024
fe9c4a5
Made shaprpy-explain easier for Martin to look at. `timedelta` is not…
LHBO Apr 16, 2024
4e751dd
Same as previous
LHBO Apr 16, 2024
3 changes: 3 additions & 0 deletions .Rbuildignore
@@ -25,7 +25,10 @@ inst/compare_lundberg\.xgb\.obj
^rebuild-long-running-vignette\.R$
^vignettes/understanding_shapr_vaeac\.Rmd\.orig$
^vignettes/understanding_shapr\.Rmd\.orig$
^vignettes/understanding_shapr_regression\.Rmd\.orig$
^vignettes/figure_main/*$
^vignettes/cache_main/*$
^vignettes/figure_vaeac/*$
^vignettes/cache_vaeac/*$
^vignettes/figure_regression/*$
^vignettes/cache_regression/*$
1 change: 1 addition & 0 deletions .github/workflows/R-CMD-check.yaml
@@ -35,6 +35,7 @@ jobs:
fail-fast: false
matrix:
config:
# Temporarily disable all but ubuntu release to reduce compute while debugging
- {os: macOS-latest, r: 'release'}
- {os: windows-latest, r: 'release'}
- {os: ubuntu-20.04, r: 'devel', http-user-agent: 'release'}
4 changes: 4 additions & 0 deletions .gitignore
@@ -24,3 +24,7 @@ docs
/Meta/
.idea
.DS_Store

vignettes/cache_main/
vignettes/cache_vaeac/
vignettes/cache_regression/
11 changes: 10 additions & 1 deletion DESCRIPTION
@@ -57,7 +57,16 @@ Suggests:
torch,
GGally,
progress,
coro
coro,
parsnip,
recipes,
workflows,
tune,
dials,
yardstick,
hardhat,
rsample,
rlang
LinkingTo:
RcppArmadillo,
Rcpp
8 changes: 8 additions & 0 deletions NAMESPACE
@@ -7,6 +7,7 @@ S3method(get_model_specs,gam)
S3method(get_model_specs,glm)
S3method(get_model_specs,lm)
S3method(get_model_specs,ranger)
S3method(get_model_specs,workflow)
S3method(get_model_specs,xgb.Booster)
S3method(model_checker,Arima)
S3method(model_checker,ar)
@@ -16,6 +17,7 @@ S3method(model_checker,gam)
S3method(model_checker,glm)
S3method(model_checker,lm)
S3method(model_checker,ranger)
S3method(model_checker,workflow)
S3method(model_checker,xgb.Booster)
S3method(plot,shapr)
S3method(predict_model,Arima)
@@ -26,13 +28,16 @@ S3method(predict_model,gam)
S3method(predict_model,glm)
S3method(predict_model,lm)
S3method(predict_model,ranger)
S3method(predict_model,workflow)
S3method(predict_model,xgb.Booster)
S3method(prepare_data,categorical)
S3method(prepare_data,copula)
S3method(prepare_data,ctree)
S3method(prepare_data,empirical)
S3method(prepare_data,gaussian)
S3method(prepare_data,independence)
S3method(prepare_data,regression_separate)
S3method(prepare_data,regression_surrogate)
S3method(prepare_data,timeseries)
S3method(prepare_data,vaeac)
S3method(print,shapr)
@@ -43,6 +48,8 @@ S3method(setup_approach,ctree)
S3method(setup_approach,empirical)
S3method(setup_approach,gaussian)
S3method(setup_approach,independence)
S3method(setup_approach,regression_separate)
S3method(setup_approach,regression_surrogate)
S3method(setup_approach,timeseries)
S3method(setup_approach,vaeac)
export(aicc_full_single_cpp)
@@ -68,6 +75,7 @@ export(predict_model)
export(prepare_data)
export(prepare_data_copula_cpp)
export(prepare_data_gaussian_cpp)
export(regression.train_model)
export(rss_cpp)
export(setup)
export(setup_approach)
516 changes: 516 additions & 0 deletions R/approach_regression_separate.R

Large diffs are not rendered by default.

245 changes: 245 additions & 0 deletions R/approach_regression_surrogate.R
@@ -0,0 +1,245 @@
# Shapr functions ======================================================================================================
#' @rdname setup_approach
#'
#' @inheritParams default_doc_explain
#' @inheritParams setup_approach.regression_separate
#' @param regression.surrogate_n_comb Integer (default is `internal$parameters$used_n_combinations` - 2) specifying
#' the number of unique combinations/coalitions to apply to each training observation. The maximum allowed value is
#' `internal$parameters$used_n_combinations` - 2. By default, we use all coalitions, but this can require a lot of
#' memory in higher dimensions. Note that by "all", we mean all coalitions chosen by `shapr` to be used. This is all
#' \eqn{2^{n_{\text{features}}}} coalitions (minus the empty and grand coalitions) when `shapr` is in exact mode. If
#' the user sets a lower value than `internal$parameters$used_n_combinations`, then we sample this number of unique
#' coalitions separately for each training observation. That is, on average, all coalitions should be equally trained.
#'
#' @export
#' @author Lars Henry Berge Olsen
setup_approach.regression_surrogate <- function(internal,
                                                regression.model = parsnip::linear_reg(),
                                                regression.tune_values = NULL,
                                                regression.vfold_cv_para = NULL,
                                                regression.recipe_func = NULL,
                                                regression.surrogate_n_comb =
                                                  internal$parameters$used_n_combinations - 2,
                                                ...) {
  # Check that required libraries are installed
  regression.check_namespaces()

  # Small printout to the user
  if (internal$parameters$verbose == 2) message("Starting 'setup_approach.regression_surrogate'.")

  # Add the default parameter values for the non-user-specified parameters for the surrogate regression approach
  defaults <- mget(c(
    "regression.model", "regression.tune_values", "regression.vfold_cv_para",
    "regression.recipe_func", "regression.surrogate_n_comb"
  ))
  internal <- insert_defaults(internal, defaults)

  # Check the parameters to the regression approach
  internal <- regression.check_parameters(internal)

  # Augment the training data
  x_train_augmented <- regression.surrogate_aug_data(
    internal = internal, x = internal$data$x_train, y_hat = internal$data$x_train_y_hat, augment_include_grand = TRUE
  )

  # Fit the surrogate regression model and store it in the internal list
  if (internal$parameters$verbose == 2) message("Start training the surrogate model.")
  internal$objects$regression.surrogate_model <- regression.train_model(
    x = x_train_augmented,
    seed = internal$parameters$seed,
    verbose = internal$parameters$verbose,
    regression.model = internal$parameters$regression.model,
    regression.tune = internal$parameters$regression.tune,
    regression.tune_values = internal$parameters$regression.tune_values,
    regression.vfold_cv_para = internal$parameters$regression.vfold_cv_para,
    regression.recipe_func = internal$parameters$regression.recipe_func,
    regression.surrogate_n_comb = regression.surrogate_n_comb + 1 # Add 1 as augment_include_grand = TRUE above
  )

  # Small printout to the user
  if (internal$parameters$verbose == 2) message("Done with 'setup_approach.regression_surrogate'.")

  return(internal) # Return the updated internal list
}

#' @inheritParams default_doc
#' @rdname prepare_data
#' @export
#' @author Lars Henry Berge Olsen
prepare_data.regression_surrogate <- function(internal, index_features = NULL, ...) {
  # Load `workflows`, needed when parallelized as we call predict with a workflow object. Checked installed above.
  requireNamespace("workflows", quietly = TRUE)

  # Small printout to the user about which batch is currently being worked on
  if (internal$parameters$verbose == 2) regression.prep_message_batch(internal, index_features)

  # Augment the explicand data
  x_explain_aug <- regression.surrogate_aug_data(internal, x = internal$data$x_explain, index_features = index_features)

  # Compute the predicted response for the explicands, i.e., v(S, x_i) for all explicands x_i and S in index_features
  pred_explicand <- predict(internal$objects$regression.surrogate_model, new_data = x_explain_aug)$.pred

  # Insert the predicted contribution function values into a data table with the correct structure
  dt_res <- data.table(as.integer(index_features), matrix(pred_explicand, nrow = length(index_features)))
  data.table::setnames(dt_res, c("id_combination", paste0("p_hat1_", seq_len(internal$parameters$n_explain))))
  data.table::setkey(dt_res, id_combination) # Set id_combination to be the key

  return(dt_res)
}
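The reshaping at the end of `prepare_data.regression_surrogate` relies on R's column-major `matrix()` fill: the predictions arrive grouped per explicand (each explicand's coalition block is contiguous), so each explicand ends up as one column, with one row per coalition. A minimal Python sketch of the same bookkeeping (function and variable names are illustrative, not part of the package):

```python
def predictions_to_vS_table(pred, index_features, n_explain):
    """Reshape a flat prediction vector into a v(S) table.

    `pred` is ordered explicand-by-explicand: the first len(index_features)
    entries belong to explicand 0, the next block to explicand 1, and so on.
    The result maps each coalition id to the list of v(S, x_i) values, one
    per explicand -- the same layout as shapr's `dt_res` data.table.
    """
    n_coal = len(index_features)
    assert len(pred) == n_coal * n_explain
    table = {}
    for j, comb_id in enumerate(index_features):
        # Row for coalition `comb_id`: pick entry j from each explicand's block
        table[comb_id] = [pred[i * n_coal + j] for i in range(n_explain)]
    return table

# Coalitions 2 and 3, two explicands: explicand 0 yields (10, 11), explicand 1 yields (20, 21)
table = predictions_to_vS_table([10.0, 11.0, 20.0, 21.0], index_features=[2, 3], n_explain=2)
```

This mirrors why `nrow = length(index_features)` is the right shape in the R code: with column-major filling, "one column per explicand" falls out for free.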

# Augment function =====================================================================================================
#' Augment the training data and the explicands
#'
#' @inheritParams default_doc
#' @inheritParams regression.train_model
#' @param y_hat Vector of numerics (optional) containing the predicted responses for the observations in `x`.
#' @param index_features Array of integers (optional) containing which coalitions to consider. Must be provided if
#' `x` is the explicands.
#' @param augment_add_id_comb Logical (default is `FALSE`). If `TRUE`, an additional column is added indicating
#' which coalition was applied.
#' @param augment_include_grand Logical (default is `FALSE`). If `TRUE`, then the grand coalition is included.
#' If `index_features` is provided, then `augment_include_grand` has no effect. Note that if we sample the
#' combinations, then the grand coalition is equally likely to be sampled as the other coalitions (or weighted if
#' `augment_comb_prob` is provided).
#' @param augment_masks_as_factor Logical (default is `FALSE`). If `TRUE`, then the binary masks are converted
#' to factors. If `FALSE`, then the binary masks are numerics.
#' @param augment_comb_prob Array of numerics (default is `NULL`). The length of the array must match the number of
#' combinations being considered, where each entry specifies the probability of sampling the corresponding coalition.
#' This is useful if we want to generate more training data for some specific coalitions. One possible choice would be
#' `augment_comb_prob = if (use_Shapley_weights) internal$objects$X$shapley_weight[2:actual_n_combinations] else NULL`.
#' @param augment_weights String (optional). Specifies which type of weights to add to the observations.
#' If `NULL` (default), then no weights are added. If `"Shapley"`, then the Shapley weights for the different
#' combinations are added to the corresponding observations where the coalitions were applied. If `"uniform"`, then
#' all observations get an equal weight of one.
#'
#' @return A data.table containing the augmented data.
#' @author Lars Henry Berge Olsen
#' @keywords internal
regression.surrogate_aug_data <- function(internal,
                                          x,
                                          y_hat = NULL,
                                          index_features = NULL,
                                          augment_masks_as_factor = FALSE,
                                          augment_include_grand = FALSE,
                                          augment_add_id_comb = FALSE,
                                          augment_comb_prob = NULL,
                                          augment_weights = NULL) {
  # Get some of the parameters
  S <- internal$objects$S
  actual_n_combinations <- internal$parameters$used_n_combinations - 2 # Remove empty and grand coalitions
  regression.surrogate_n_comb <- internal$parameters$regression.surrogate_n_comb
  if (!is.null(index_features)) regression.surrogate_n_comb <- length(index_features) # Applicable from prep_data()
  if (augment_include_grand) {
    actual_n_combinations <- actual_n_combinations + 1 # Add 1 to include the grand coalition
    regression.surrogate_n_comb <- regression.surrogate_n_comb + 1
  }
  if (regression.surrogate_n_comb > actual_n_combinations) regression.surrogate_n_comb <- actual_n_combinations

  # Small checks
  if (!is.null(augment_weights)) augment_weights <- match.arg(augment_weights, c("Shapley", "uniform"))

  if (!is.null(augment_comb_prob) && length(augment_comb_prob) != actual_n_combinations) {
    stop(paste("`augment_comb_prob` must be of length", actual_n_combinations, "."))
  }

  if (!is.null(augment_weights) && augment_include_grand && augment_weights == "Shapley") {
    stop(paste(
      "`augment_include_grand = TRUE` and `augment_weights = 'Shapley'` cannot occur together",
      "because this entails too large a weight for the grand coalition."
    ))
  }

  # Get the number of observations (either the same as n_train or n_explain)
  n_obs <- nrow(x)

  # Get the names of the categorical/factor features and the continuous/non-categorical/numeric features
  feature_classes <- internal$objects$feature_specs$classes
  feature_cat <- names(feature_classes)[feature_classes == "factor"]
  feature_cont <- names(feature_classes)[feature_classes != "factor"]

  # Get the indices of the categorical and continuous features
  feature_cat_idx <- which(names(feature_classes) %in% feature_cat)
  feature_cont_idx <- which(names(feature_classes) %in% feature_cont)

  # Check whether we are augmenting the training data or the explicands
  if (is.null(index_features)) {
    # Training: get matrix (regression.surrogate_n_comb x n_obs) containing the indices of the active coalitions
    if (regression.surrogate_n_comb >= actual_n_combinations) { # Start from two to exclude the empty set
      comb_active_idx <- matrix(rep(seq(2, actual_n_combinations + 1), times = n_obs), ncol = n_obs)
    } else {
      comb_active_idx <- sapply(seq(n_obs), function(x) { # Add 1 as we want to exclude the empty set
        sample.int(n = actual_n_combinations, size = regression.surrogate_n_comb, prob = augment_comb_prob) + 1
      })
    }
  } else {
    # Explicands: get matrix of dimension #index_features x n_obs containing the indices of the active coalitions
    comb_active_idx <- matrix(rep(index_features, times = n_obs), ncol = n_obs)
  }

  # Extract the active coalitions for each observation. The number of rows is n_obs * regression.surrogate_n_comb,
  # where the first regression.surrogate_n_comb rows belong to the first observation and so on. Set the column names.
  id_comb <- as.vector(comb_active_idx)
  comb_active <- S[id_comb, , drop = FALSE]
  colnames(comb_active) <- names(feature_classes)

  # Repeat the feature values as many times as there are active coalitions
  x_augmented <- x[rep(seq_len(n_obs), each = regression.surrogate_n_comb), ]

  # Mask the categorical features. Add a new level called "level_masked" when the value is masked.
  x_augmented[, (feature_cat) := lapply(seq_along(.SD), function(col) {
    levels(.SD[[col]]) <- c(levels(.SD[[col]]), "level_masked")
    .SD[[col]][comb_active[, feature_cat_idx[col]] == 0] <- "level_masked"
    return(.SD[[col]])
  }), .SDcols = feature_cat]

  # Mask the continuous/non-categorical features
  x_augmented[, (feature_cont) :=
    lapply(seq_along(.SD), function(col) .SD[[col]] * comb_active[, feature_cont_idx[col]]),
  .SDcols = feature_cont
  ]

  # Add new columns indicating when the continuous features are masked
  if (length(feature_cont) > 0) {
    masked_columns <- paste0("mask_", feature_cont)
    x_augmented <- cbind(x_augmented, setNames(data.table(1 * (comb_active[, feature_cont_idx] == 0)), masked_columns))
  }

  # Convert the binary masks to factors if the user has specified so
  if (augment_masks_as_factor) x_augmented[, (masked_columns) := lapply(.SD, as.factor), .SDcols = masked_columns]

  # Add either uniform weights or Shapley kernel weights
  if (!is.null(augment_weights)) {
    x_augmented[, "weight" := if (augment_weights == "Shapley") internal$objects$X$shapley_weight[id_comb] else 1]
  }

  # Add the id_comb as a factor
  if (augment_add_id_comb) x_augmented[, "id_comb" := factor(id_comb)]

  # Add repeated responses if provided
  if (!is.null(y_hat)) x_augmented[, "y_hat" := rep(y_hat, each = regression.surrogate_n_comb)]

  # Return the augmented data
  return(x_augmented)
}
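The core transformation in `regression.surrogate_aug_data` — repeat each observation once per coalition, zero out the continuous features outside the coalition, and append mask-indicator columns — can be sketched in a few lines of plain Python. This is an illustrative simplification only (it skips factor levels, Shapley weights, and coalition sampling, and all names are hypothetical):

```python
def augment_with_coalitions(x, S):
    """Build the surrogate training/prediction design matrix.

    x: list of observations, each a list of continuous feature values.
    S: list of coalition masks (1 = feature known, 0 = feature masked).
    Returns one row per (observation, coalition) pair: the masked feature
    values followed by per-feature mask indicators (1 where masked).
    """
    rows = []
    for obs in x:                # the first len(S) rows belong to observation 0, etc.
        for mask in S:
            masked = [v * m for v, m in zip(obs, mask)]      # zero out features not in S
            indicators = [1 - m for m in mask]               # 1 where the feature is masked
            rows.append(masked + indicators)
    return rows

# Two observations, two coalitions: {feature 1} and {feature 2}
x = [[1.0, 2.0], [3.0, 4.0]]
S = [[1, 0], [0, 1]]
aug = augment_with_coalitions(x, S)   # 4 rows, each of length 4
```

A single regression model fit on such augmented rows can then predict v(S, x) for any coalition, which is exactly what makes the surrogate approach need only one model rather than one per coalition.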


# Check function =======================================================================================================
#' Check the `regression.surrogate_n_comb` parameter
#'
#' Check that `regression.surrogate_n_comb` is either NULL or a valid integer.
#'
#' @inheritParams setup_approach.regression_surrogate
#' @param used_n_combinations Integer. The number of used combinations (including the empty and grand coalitions).
#'
#' @author Lars Henry Berge Olsen
#' @keywords internal
regression.check_sur_n_comb <- function(regression.surrogate_n_comb, used_n_combinations) {
  if (!is.null(regression.surrogate_n_comb)) {
    if (regression.surrogate_n_comb < 1 || used_n_combinations - 2 < regression.surrogate_n_comb) {
      stop(paste0(
        "`regression.surrogate_n_comb` (", regression.surrogate_n_comb, ") must be a positive integer less than or ",
        "equal to `used_n_combinations` minus two (", used_n_combinations - 2, ")."
      ))
    }
  }
}
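The valid range enforced here is [1, `used_n_combinations` - 2], since the empty and grand coalitions are excluded. A hedged Python restatement of the same bounds logic (hypothetical function name, not the package's API):

```python
def check_surrogate_n_comb(n_comb, used_n_combinations):
    """Mirror of the bounds check in shapr's regression.check_sur_n_comb:
    a user-supplied value must lie in [1, used_n_combinations - 2]."""
    if n_comb is None:
        return  # None means "use the default", which is always valid
    if n_comb < 1 or n_comb > used_n_combinations - 2:
        raise ValueError(
            f"`n_comb` ({n_comb}) must be a positive integer less than or "
            f"equal to used_n_combinations minus two ({used_n_combinations - 2})."
        )

check_surrogate_n_comb(6, 8)      # valid: 1 <= 6 <= 8 - 2
check_surrogate_n_comb(None, 8)   # also valid
```

Failing early with an explicit range in the message, as the R code does, saves the user from a cryptic error deep inside model training.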
2 changes: 1 addition & 1 deletion R/approach_vaeac.R
@@ -1355,7 +1355,7 @@ vaeac_check_mask_gen <- function(mask_gen_coalitions, mask_gen_coalitions_prob,
}
}

#' Function the checks the verbose parameter
#' Function that checks the verbose parameter
#'
#' @inheritParams vaeac_train_model
#'