Harmonize batch distribution ++ #359
Changes from 38 commits
@@ -208,6 +208,7 @@ check_n_batches <- function(internal) {
   n_combinations <- internal$parameters$n_combinations
   is_groupwise <- internal$parameters$is_groupwise
   n_groups <- internal$parameters$n_groups
+  n_unique_approaches <- internal$parameters$n_unique_approaches

   if (!is_groupwise) {
     actual_n_combinations <- ifelse(is.null(n_combinations), 2^n_features, n_combinations)
@@ -217,10 +218,18 @@ check_n_batches <- function(internal) {

   if (n_batches >= actual_n_combinations) {
     stop(paste0(
-      "`n_batches` (", n_batches, ") must be smaller than the number feature combinations/`n_combinations` (",
+      "`n_batches` (", n_batches, ") must be smaller than the number of feature combinations/`n_combinations` (",
       actual_n_combinations, ")"
     ))
   }

+  if (n_batches < n_unique_approaches) {
+    stop(paste0(
+      "`n_batches` (", n_batches, ") must be larger than the number of unique approaches in `approach` (",
+      n_unique_approaches, "). Note that the last approach in `approach` is not included as it is not used ",
+      "to do any computations as described in the vignette."
+    ))
+  }
 }

Review comment (on the corrected error message): This makes the previous tests fail, as the error messages are now different. Let Martin decide if we keep this alteration or not.
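For context, a minimal sketch (mine, not part of the diff) of a call the new check rejects. It borrows the objects from the demo script further down; with five entries in `approach`, the last is dropped, leaving four unique approaches:

# Assumes the model/data fixtures from the demo script below (helper-lm.R objects).
explain(
  model = model_lm_numeric,
  x_explain = x_explain_numeric,
  x_train = x_train_numeric,
  approach = c("independence", "empirical", "gaussian", "copula", "empirical"),
  prediction_zero = p0,
  n_batches = 3 # 3 < 4 unique approaches, so check_n_batches() now stops
)
# Error: `n_batches` (3) must be larger than the number of unique approaches in `approach` (4). ...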
@@ -368,6 +377,18 @@ get_extra_parameters <- function(internal) {
     internal$parameters$n_groups <- NULL
   }

+  # Get the number of unique approaches
+  if (length(internal$parameters$approach) > 1) {
+    internal$parameters$n_approaches <- length(internal$parameters$approach)
+    # Remove the last approach, as `explain` forces the user to specify it even though it is
+    # never used: there, all variables are conditioned on, so no estimation is needed.
+    internal$parameters$n_unique_approaches <-
+      length(unique(internal$parameters$approach[-internal$parameters$n_approaches]))
+  } else {
+    internal$parameters$n_approaches <- 1
+    internal$parameters$n_unique_approaches <- 1
+  }

   return(internal)
 }
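As a standalone illustration (my own, not PR code) of the counting above, using the five-element `approach` vector from the demo scripts:

approach <- c("independence", "empirical", "gaussian", "copula", "empirical")
n_approaches <- length(approach) # 5
# Drop the last element; it is never used for estimation.
n_unique_approaches <- length(unique(approach[-n_approaches])) # 4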
@@ -675,33 +696,33 @@ set_defaults <- function(internal) {
   # Set defaults for certain arguments (based on other input)

   approach <- internal$parameters$approach
+  n_unique_approaches <- internal$parameters$n_unique_approaches
   used_n_combinations <- internal$parameters$used_n_combinations
   n_batches <- internal$parameters$n_batches

   # n_batches
   if (is.null(n_batches)) {
-    internal$parameters$n_batches <- get_default_n_batches(approach, used_n_combinations)
+    internal$parameters$n_batches <- get_default_n_batches(approach, n_unique_approaches, used_n_combinations)
   }

   return(internal)
 }

 #' @keywords internal
-get_default_n_batches <- function(approach, n_combinations) {
+get_default_n_batches <- function(approach, n_unique_approaches, n_combinations) {
   used_approach <- names(sort(table(approach), decreasing = TRUE))[1] # Most frequently used approach (when several are present)

   if (used_approach %in% c("ctree", "gaussian", "copula")) {
     suggestion <- ceiling(n_combinations / 10)
     this_min <- 10
     this_max <- 1000
-    min_checked <- max(c(this_min, suggestion))
-    ret <- min(c(this_max, min_checked))
   } else {
     suggestion <- ceiling(n_combinations / 100)
     this_min <- 2
     this_max <- 100
-    min_checked <- max(c(this_min, suggestion))
-    ret <- min(c(this_max, min_checked))
   }
+  min_checked <- max(c(this_min, suggestion, n_unique_approaches))
+  ret <- min(c(this_max, min_checked, n_combinations - 1))

   message(
     paste0(
       "Setting parameter 'n_batches' to ", ret, " as a fair trade-off between memory consumption and ",

Review comment: We subtract 1 as `n_batches` must be strictly smaller than the number of feature combinations (otherwise `check_n_batches()` errors).
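A worked example (my illustration) of the new clamping for a "gaussian" run with `n_combinations = 32` and four unique approaches:

suggestion <- ceiling(32 / 10) # 4
min_checked <- max(c(10, suggestion, 4)) # 10: at least one batch per unique approach
ret <- min(c(1000, min_checked, 32 - 1)) # 10: and strictly fewer batches than combinations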
In setup_computation.R:
@@ -622,6 +622,7 @@ create_S_batch_new <- function(internal, seed = NULL) {

   X <- internal$objects$X

+  if (!is.null(seed)) set.seed(seed)

   if (length(approach0) > 1) {
     X[!(n_features %in% c(0, n_features0)), approach := approach0[n_features]]
@@ -632,6 +633,57 @@ create_S_batch_new <- function(internal, seed = NULL) {
       pmax(1, round(.N / (n_combinations - 2) * n_batches)),
       n_S_per_approach = .N
     ), by = approach]

+  # DELETE THIS COMMENT:
+  # The fix below is simple, but I feel like it is double work, as one first does lines 631-635
+  # and then later changes the output from said lines. A better idea would likely be to look at
+  # the logic in said lines.
+  # We can now use the additional (new) parameter
+  # `n_unique_approaches = internal$parameters$n_unique_approaches`.
+  # So instead of doing
+  # `pmax(1, round(.N / (n_combinations - 2) * n_batches))`
+  # one could maybe do something like
+  # `round(.N / (n_combinations - 2) * (n_batches - n_unique_approaches)) + 1`.
+  # Here we subtract `n_unique_approaches` as we know that at least `n_unique_approaches` of the
+  # `n_batches` have been locked to a specific approach, so we only want to divide the remaining
+  # batches among the approaches. We add 1 as each method needs to have 1 batch, and these
+  # correspond to the `n_unique_approaches` batches we subtracted before.
+  # But this will break too.
+  # Consider the same example as in `demonstrate_combined_appraoches_bugs.R`.
+  # There `n_combinations = 32`, `n_unique_approaches = 2`, and `.N = c(5, 25)`.
+  # If we let `n_batches = 5`, then my proposal breaks, as
+  # round(.N / (n_combinations - 2) * (n_batches - n_unique_approaches)) + 1
+  # gives c(1, 3), which sums to 4 < 5.
+  # This is because before we round we have c(0.5, 2.5), which are both rounded down.
+  # So my conclusion is that it might be easiest to do what is done above,
+  # or use my proposed approach and add batches until the correct amount has been reached.
+  # Discuss with Martin.
+  # Furthermore, we can end up in situations with both too few and too many batches,
+  # so we have to check for both.
+  # Consider the same situation but with `n_batches = 15` and `n_combinations = 32`; then
+  # round(c(5, 25) / (n_combinations - 2) * n_batches)
+  # yields c(2, 12), whose sum (14) is smaller than `n_batches`.
+
+  # Ensure that the number of batches corresponds to `n_batches`
+  if (sum(batch_count_dt$n_batches_per_approach) != n_batches) {
+    # Ensure that the number of batches is not larger than `n_batches`:
+    # remove one batch from the approach with the most batches.
+    while (sum(batch_count_dt$n_batches_per_approach) > n_batches) {
+      approach_to_subtract_batch <- which.max(batch_count_dt$n_batches_per_approach)
+      batch_count_dt$n_batches_per_approach[approach_to_subtract_batch] <-
+        batch_count_dt$n_batches_per_approach[approach_to_subtract_batch] - 1
+    }
+
+    # Ensure that the number of batches is not lower than `n_batches`:
+    # add one batch to the approach with the most coalitions per batch.
+    while (sum(batch_count_dt$n_batches_per_approach) < n_batches) {
+      approach_to_add_batch <- which.max(batch_count_dt$n_S_per_approach /
+        batch_count_dt$n_batches_per_approach)
+      batch_count_dt$n_batches_per_approach[approach_to_add_batch] <-
+        batch_count_dt$n_batches_per_approach[approach_to_add_batch] + 1
+    }
+  }
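To see the rebalancing above in action, here is a toy run (my own illustration, not PR code) using the numbers from the scratch comment: `n_combinations = 32`, `n_batches = 15`, and `.N = c(5, 25)`:

library(data.table)

n_batches <- 15
batch_count_dt <- data.table(
  approach = c("independence", "ctree"),
  n_batches_per_approach = pmax(1, round(c(5, 25) / (32 - 2) * 15)), # c(2, 12)
  n_S_per_approach = c(5, 25)
)
sum(batch_count_dt$n_batches_per_approach) # 14, one short of n_batches

# Only the "too few" loop fires here; it gives the extra batch to independence,
# which currently has the most coalitions per batch (5/2 = 2.5 > 25/12).
while (sum(batch_count_dt$n_batches_per_approach) < n_batches) {
  i <- which.max(batch_count_dt$n_S_per_approach / batch_count_dt$n_batches_per_approach)
  batch_count_dt$n_batches_per_approach[i] <- batch_count_dt$n_batches_per_approach[i] + 1
}
batch_count_dt$n_batches_per_approach # c(3, 12), now summing to n_batches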
Review comment (on the rebalancing above): @LHBO Not sure whether this extra complexity is really worth it to cover the edge cases where the n_batches does not add up. An alternative is to provide a warning that the number of batches is changed (if specified by the user), or otherwise just change it. What do you think?

Reply: @martinju, I agree that it is not very elegant code, but it does not take long to run and is only run once. If this is not that important, then I am okay with just giving a warning.
   batch_count_dt[, n_leftover_first_batch := n_S_per_approach %% n_batches_per_approach]
   data.table::setorder(batch_count_dt, -n_leftover_first_batch)
@@ -640,7 +692,6 @@ create_S_batch_new <- function(internal, seed = NULL) {

   # Randomize the order before ordering, spreading the batches on the different approaches
   # as evenly as possible with respect to shapley_weight
-  set.seed(seed)
   X[, randomorder := sample(.N)]
   data.table::setorder(X, randomorder) # To avoid smaller id_combinations always preceding large ones
   data.table::setorder(X, shapley_weight)
New scratch file (referenced above as `demonstrate_combined_appraoches_bugs.R`):
@@ -0,0 +1,139 @@
# Use the data objects from the helper-lm.R file.
# Here we want to illustrate three bugs related to combined approaches (before the bugfix).

# First, we see that setting `n_batches` lower than the number of unique approaches
# produces some inconsistencies in shapr.
# After the bugfix, we force the user to choose a valid value for `n_batches`.
explanation_1 <- explain(
  model = model_lm_numeric,
  x_explain = x_explain_numeric,
  x_train = x_train_numeric,
  approach = c("independence", "empirical", "gaussian", "copula", "empirical"),
  prediction_zero = p0,
  n_batches = 3,
  timing = FALSE,
  seed = 1
)
# It says shapr is using 3 batches
explanation_1$internal$parameters$n_batches

# But shapr has actually used 4.
# This is because shapr can only handle one type of approach for each batch.
# Hence, the number of batches must be at least as large as the number of unique approaches
# (excluding the last approach, which is not used, as we then condition on all features).
length(explanation_1$internal$objects$S_batch)

# Note that after the bugfix, we give an error if `n_batches` < the number of unique approaches.

# Second, we look at another situation where the number of unique approaches is two and we set
# `n_batches = 2`, but shapr still uses three batches. This is due to how shapr decides how many
# batches each approach should get. Right now, it decides based on the proportion of the
# coalitions each approach is responsible for. In this setting, independence is responsible for
# 5 coalitions and ctree for 25 coalitions. So, initially shapr sets that ctree should get the
# two batches while independence gets 0, but this is then changed to 1 without considering that
# it now breaks the consistency with `n_batches`.
# This is done in the function `create_S_batch_new()` in setup_computation.R.
explanation_2 <- explain(
  model = model_lm_numeric,
  x_explain = x_explain_numeric,
  x_train = x_train_numeric,
  approach = c("independence", "ctree", "ctree", "ctree", "ctree"),
  prediction_zero = p0,
  n_batches = 2,
  timing = FALSE,
  seed = 1
)

# It says shapr is using 2 batches
explanation_2$internal$parameters$n_batches

# But shapr has actually used 3
length(explanation_2$internal$objects$S_batch)

# These are equal after the bugfix
# The same type of bug, but in the opposite direction
explanation_3 <- explain(
  model = model_lm_numeric,
  x_explain = x_explain_numeric,
  x_train = x_train_numeric,
  approach = c("independence", "ctree", "ctree", "ctree", "ctree"),
  prediction_zero = p0,
  n_batches = 15,
  timing = FALSE,
  seed = 1
)

# It says shapr is using 15 batches
explanation_3$internal$parameters$n_batches

# But shapr has actually used 14
length(explanation_3$internal$objects$S_batch)

# These are equal after the bugfix
# Bug number three caused shapr not to be reproducible, as setting the seed did not work for
# combined approaches. This was due to a `set.seed(NULL)` call which ruins all of the earlier
# set.seed procedures.

# Check that setting the seed works for a combination of approaches.
# Here `n_batches` is set to `4`, so there is one batch for each method,
# i.e., no randomness.
# In the first example we get no bug, as there is no randomness in assigning the batches.
explanation_combined_1 <- explain(
  model = model_lm_numeric,
  x_explain = x_explain_numeric,
  x_train = x_train_numeric,
  approach = c("independence", "empirical", "gaussian", "copula", "empirical"),
  prediction_zero = p0,
  timing = FALSE,
  seed = 1
)

explanation_combined_2 <- explain(
  model = model_lm_numeric,
  x_explain = x_explain_numeric,
  x_train = x_train_numeric,
  approach = c("independence", "empirical", "gaussian", "copula", "empirical"),
  prediction_zero = p0,
  timing = FALSE,
  seed = 1
)

# Check that they are equal
all.equal(explanation_combined_1, explanation_combined_2)

# Here `n_batches` is set to `10`, so NOT one batch for each method,
# i.e., there is randomness in assigning the batches.
explanation_combined_3 <- explain(
  model = model_lm_numeric,
  x_explain = x_explain_numeric,
  x_train = x_train_numeric,
  approach = c("independence", "empirical", "gaussian", "copula", "ctree"),
  prediction_zero = p0,
  timing = FALSE,
  seed = 1
)

explanation_combined_4 <- explain(
  model = model_lm_numeric,
  x_explain = x_explain_numeric,
  x_train = x_train_numeric,
  approach = c("independence", "empirical", "gaussian", "copula", "ctree"),
  prediction_zero = p0,
  timing = FALSE,
  seed = 1
)

# Check that they are not equal (before the bugfix)
all.equal(explanation_combined_3, explanation_combined_4)
explanation_combined_3$internal$objects$X
explanation_combined_4$internal$objects$X

# These are equal after the bugfix
Another new scratch file, demonstrating the exact-mode bug:
@@ -0,0 +1,54 @@
# In this code we demonstrate that (before the bugfix) the `explain()` function
# does not enter the exact mode when n_combinations equals 2^m.
# The mode is only changed if n_combinations is strictly larger than 2^m.
# This means that we end up using all coalitions when n_combinations is 2^m,
# but without using the exact Shapley kernel weights.
# The bugfix replaces `>` with `>=` in the places where the code tests whether
# n_combinations is larger than or equal to 2^m. Then the text/messages printed by
# shapr and the code correspond.
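# Minimal sketch (not from the PR script) of the boundary fix described above,
# assuming m = 4 features:
m <- 4
n_combinations <- 2^m # 16, i.e. all coalitions
n_combinations > 2^m # FALSE: the old check never switches to exact mode here
n_combinations >= 2^m # TRUE: the fixed check enters exact mode as intended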
library(xgboost)
library(data.table)

data("airquality")
data <- data.table::as.data.table(airquality)
data <- data[complete.cases(data), ]

x_var <- c("Solar.R", "Wind", "Temp", "Month")
y_var <- "Ozone"

ind_x_explain <- 1:6
x_train <- data[-ind_x_explain, ..x_var]
y_train <- data[-ind_x_explain, get(y_var)]
x_explain <- data[ind_x_explain, ..x_var]
# Fitting a basic xgboost model to the training data
model <- xgboost::xgboost(
  data = as.matrix(x_train),
  label = y_train,
  nround = 20,
  verbose = FALSE
)

# Specifying phi_0, i.e. the expected prediction without any features
p0 <- mean(y_train)
||
# Shapr sets the default number of batches to be 10 for this dataset for the | ||
# "ctree", "gaussian", and "copula" approaches. Thus, setting `n_combinations` | ||
# to any value lower of equal to 10 causes the error. | ||
any_number_equal_or_below_10 = 8 | ||
|
||
# Before the bugfix, shapr:::check_n_batches() throws the error: | ||
# Error in check_n_batches(internal) : | ||
# `n_batches` (10) must be smaller than the number feature combinations/`n_combinations` (8) | ||
# Bug only occures for "ctree", "gaussian", and "copula" as they are treated different in | ||
# `get_default_n_batches()`, I am not certain why. Ask Martin about the logic behind that. | ||
explanation <- explain( | ||
model = model, | ||
x_explain = x_explain, | ||
x_train = x_train, | ||
n_samples = 2, # Low value for fast computations | ||
approach = "gaussian", | ||
prediction_zero = p0, | ||
n_combinations = any_number_equal_or_below_10 | ||
) |
Review comment: We do not need the if-else anymore, so a simpler version of the code could be:
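The suggested code itself was not captured here. One possible reading (my assumption) is that the comment targets `get_default_n_batches()`, where, after the shared `min_checked`/`ret` lines, the branch-specific constants could be selected without the if-else. A sketch under that assumption:

# Hypothetical simplification; not the reviewer's actual suggestion.
get_default_n_batches <- function(approach, n_unique_approaches, n_combinations) {
  used_approach <- names(sort(table(approach), decreasing = TRUE))[1]
  heavy <- used_approach %in% c("ctree", "gaussian", "copula") # approaches needing more batches
  suggestion <- ceiling(n_combinations / ifelse(heavy, 10, 100))
  min_checked <- max(c(ifelse(heavy, 10, 2), suggestion, n_unique_approaches))
  ret <- min(c(ifelse(heavy, 1000, 100), min_checked, n_combinations - 1))
  message(paste0("Setting parameter 'n_batches' to ", ret, ".")) # message as before
  ret
}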