Harmonize batch distribution ++ #359
Changes from 38 commits
@@ -208,6 +208,7 @@ check_n_batches <- function(internal) {
   n_combinations <- internal$parameters$n_combinations
   is_groupwise <- internal$parameters$is_groupwise
   n_groups <- internal$parameters$n_groups
+  n_unique_approaches <- internal$parameters$n_unique_approaches

   if (!is_groupwise) {
     actual_n_combinations <- ifelse(is.null(n_combinations), 2^n_features, n_combinations)
@@ -217,10 +218,18 @@ check_n_batches <- function(internal) {

   if (n_batches >= actual_n_combinations) {
     stop(paste0(
-      "`n_batches` (", n_batches, ") must be smaller than the number feature combinations/`n_combinations` (",
+      "`n_batches` (", n_batches, ") must be smaller than the number of feature combinations/`n_combinations` (",
       actual_n_combinations, ")"
     ))
   }

+  if (n_batches < n_unique_approaches) {
+    stop(paste0(
+      "`n_batches` (", n_batches, ") must be larger than the number of unique approaches in `approach` (",
+      n_unique_approaches, "). Note that the last approach in `approach` is not included as it is not used ",
+      "to do any computations as described in the vignette."
+    ))
+  }
 }

Review comment (on the corrected error message): This makes the previous tests fail, as the error messages are now different. Let Martin decide if we keep this alteration or not.
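For context, a minimal sketch (mine, not part of the diff) of a call the new check rejects. It borrows the objects from the demo script further down; with five entries in `approach`, the last is dropped, leaving four unique approaches:

# Assumes the model/data fixtures from the demo script below (helper-lm.R objects).
explain(
  model = model_lm_numeric,
  x_explain = x_explain_numeric,
  x_train = x_train_numeric,
  approach = c("independence", "empirical", "gaussian", "copula", "empirical"),
  prediction_zero = p0,
  n_batches = 3 # 3 < 4 unique approaches, so check_n_batches() now stops
)
# Error: `n_batches` (3) must be larger than the number of unique approaches in `approach` (4). ...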
@@ -368,6 +377,18 @@ get_extra_parameters <- function(internal) {
     internal$parameters$n_groups <- NULL
   }

+  # Get the number of unique approaches
+  if (length(internal$parameters$approach) > 1) {
+    internal$parameters$n_approaches <- length(internal$parameters$approach)
+    # Remove the last approach, as `explain` forces the user to specify it even though it is
+    # never used: there, all variables are conditioned on, so no estimation is needed.
+    internal$parameters$n_unique_approaches <-
+      length(unique(internal$parameters$approach[-internal$parameters$n_approaches]))
+  } else {
+    internal$parameters$n_approaches <- 1
+    internal$parameters$n_unique_approaches <- 1
+  }

   return(internal)
 }
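As a standalone illustration (my own, not PR code) of the counting above, using the five-element `approach` vector from the demo scripts:

approach <- c("independence", "empirical", "gaussian", "copula", "empirical")
n_approaches <- length(approach) # 5
# Drop the last element; it is never used for estimation.
n_unique_approaches <- length(unique(approach[-n_approaches])) # 4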
@@ -675,33 +696,33 @@ set_defaults <- function(internal) {
   # Set defaults for certain arguments (based on other input)

   approach <- internal$parameters$approach
+  n_unique_approaches <- internal$parameters$n_unique_approaches
   used_n_combinations <- internal$parameters$used_n_combinations
   n_batches <- internal$parameters$n_batches

   # n_batches
   if (is.null(n_batches)) {
-    internal$parameters$n_batches <- get_default_n_batches(approach, used_n_combinations)
+    internal$parameters$n_batches <- get_default_n_batches(approach, n_unique_approaches, used_n_combinations)
   }

   return(internal)
 }

 #' @keywords internal
-get_default_n_batches <- function(approach, n_combinations) {
+get_default_n_batches <- function(approach, n_unique_approaches, n_combinations) {
   used_approach <- names(sort(table(approach), decreasing = TRUE))[1] # Most frequently used approach (when several are present)

   if (used_approach %in% c("ctree", "gaussian", "copula")) {
     suggestion <- ceiling(n_combinations / 10)
     this_min <- 10
     this_max <- 1000
-    min_checked <- max(c(this_min, suggestion))
-    ret <- min(c(this_max, min_checked))
   } else {
     suggestion <- ceiling(n_combinations / 100)
     this_min <- 2
     this_max <- 100
-    min_checked <- max(c(this_min, suggestion))
-    ret <- min(c(this_max, min_checked))
   }
+  min_checked <- max(c(this_min, suggestion, n_unique_approaches))
+  ret <- min(c(this_max, min_checked, n_combinations - 1))

   message(
     paste0(
       "Setting parameter 'n_batches' to ", ret, " as a fair trade-off between memory consumption and ",

Review comment: We subtract 1 as `n_batches` must be strictly smaller than the number of feature combinations (otherwise `check_n_batches()` errors).
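A worked example (my illustration) of the new clamping for a "gaussian" run with `n_combinations = 32` and four unique approaches:

suggestion <- ceiling(32 / 10) # 4
min_checked <- max(c(10, suggestion, 4)) # 10: at least one batch per unique approach
ret <- min(c(1000, min_checked, 32 - 1)) # 10: and strictly fewer batches than combinations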
In setup_computation.R:
@@ -622,6 +622,7 @@ create_S_batch_new <- function(internal, seed = NULL) {

   X <- internal$objects$X

+  if (!is.null(seed)) set.seed(seed)

   if (length(approach0) > 1) {
     X[!(n_features %in% c(0, n_features0)), approach := approach0[n_features]]
@@ -632,6 +633,57 @@ create_S_batch_new <- function(internal, seed = NULL) {
       pmax(1, round(.N / (n_combinations - 2) * n_batches)),
       n_S_per_approach = .N
     ), by = approach]

+  # DELETE THIS COMMENT:
+  # The fix below is simple, but I feel like it is double work, as one first does lines 631-635
+  # and then later changes the output from said lines. A better idea would likely be to look at
+  # the logic in said lines.
+  # We can now use the additional (new) parameter
+  # `n_unique_approaches = internal$parameters$n_unique_approaches`.
+  # So instead of doing
+  # `pmax(1, round(.N / (n_combinations - 2) * n_batches))`
+  # one could maybe do something like
+  # `round(.N / (n_combinations - 2) * (n_batches - n_unique_approaches)) + 1`.
+  # Here we subtract `n_unique_approaches` as we know that at least `n_unique_approaches` of the
+  # `n_batches` have been locked to a specific approach, so we only want to divide the remaining
+  # batches among the approaches. We add 1 as each method needs to have 1 batch, and these
+  # correspond to the `n_unique_approaches` batches we subtracted before.
+  # But this will break too.
+  # Consider the same example as in `demonstrate_combined_appraoches_bugs.R`.
+  # There `n_combinations = 32`, `n_unique_approaches = 2`, and `.N = c(5, 25)`.
+  # If we let `n_batches = 5`, then my proposal breaks, as
+  # round(.N / (n_combinations - 2) * (n_batches - n_unique_approaches)) + 1
+  # gives c(1, 3), which sums to 4 < 5.
+  # This is because before we round we have c(0.5, 2.5), which are both rounded down.
+  # So my conclusion is that it might be easiest to do what is done above,
+  # or use my proposed approach and add batches until the correct amount has been reached.
+  # Discuss with Martin.
+  # Furthermore, we can end up in situations with both too few and too many batches,
+  # so we have to check for both.
+  # Consider the same situation but with `n_batches = 15` and `n_combinations = 32`; then
+  # round(c(5, 25) / (n_combinations - 2) * n_batches)
+  # yields c(2, 12), whose sum (14) is smaller than `n_batches`.
+
+  # Ensure that the number of batches corresponds to `n_batches`
+  if (sum(batch_count_dt$n_batches_per_approach) != n_batches) {
+    # Ensure that the number of batches is not larger than `n_batches`:
+    # remove one batch from the approach with the most batches.
+    while (sum(batch_count_dt$n_batches_per_approach) > n_batches) {
+      approach_to_subtract_batch <- which.max(batch_count_dt$n_batches_per_approach)
+      batch_count_dt$n_batches_per_approach[approach_to_subtract_batch] <-
+        batch_count_dt$n_batches_per_approach[approach_to_subtract_batch] - 1
+    }
+
+    # Ensure that the number of batches is not lower than `n_batches`:
+    # add one batch to the approach with the most coalitions per batch.
+    while (sum(batch_count_dt$n_batches_per_approach) < n_batches) {
+      approach_to_add_batch <- which.max(batch_count_dt$n_S_per_approach /
+        batch_count_dt$n_batches_per_approach)
+      batch_count_dt$n_batches_per_approach[approach_to_add_batch] <-
+        batch_count_dt$n_batches_per_approach[approach_to_add_batch] + 1
+    }
+  }
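To see the rebalancing above in action, here is a toy run (my own illustration, not PR code) using the numbers from the scratch comment: `n_combinations = 32`, `n_batches = 15`, and `.N = c(5, 25)`:

library(data.table)

n_batches <- 15
batch_count_dt <- data.table(
  approach = c("independence", "ctree"),
  n_batches_per_approach = pmax(1, round(c(5, 25) / (32 - 2) * 15)), # c(2, 12)
  n_S_per_approach = c(5, 25)
)
sum(batch_count_dt$n_batches_per_approach) # 14, one short of n_batches

# Only the "too few" loop fires here; it gives the extra batch to independence,
# which currently has the most coalitions per batch (5/2 = 2.5 > 25/12).
while (sum(batch_count_dt$n_batches_per_approach) < n_batches) {
  i <- which.max(batch_count_dt$n_S_per_approach / batch_count_dt$n_batches_per_approach)
  batch_count_dt$n_batches_per_approach[i] <- batch_count_dt$n_batches_per_approach[i] + 1
}
batch_count_dt$n_batches_per_approach # c(3, 12), now summing to n_batches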
Review comment (on the rebalancing above): @LHBO Not sure whether this extra complexity is really worth it to cover the edge cases where the n_batches does not add up. An alternative is to provide a warning that the number of batches is changed (if specified by the user), or otherwise just change it. What do you think?

Reply: @martinju, I agree that it is not very elegant code, but it does not take long to run and is only run once. If this is not that important, then I am okay with just giving a warning.
   batch_count_dt[, n_leftover_first_batch := n_S_per_approach %% n_batches_per_approach]
   data.table::setorder(batch_count_dt, -n_leftover_first_batch)
@@ -640,7 +692,6 @@ create_S_batch_new <- function(internal, seed = NULL) {

   # Randomize the order before ordering, spreading the batches on the different approaches
   # as evenly as possible with respect to shapley_weight
-  set.seed(seed)
   X[, randomorder := sample(.N)]
   data.table::setorder(X, randomorder) # To avoid smaller id_combinations always preceding large ones
   data.table::setorder(X, shapley_weight)
New scratch file (referenced above as `demonstrate_combined_appraoches_bugs.R`):
@@ -0,0 +1,139 @@
# Use the data objects from the helper-lm.R file.
# Here we want to illustrate three bugs related to combined approaches (before the bugfix).

# First, we see that setting `n_batches` lower than the number of unique approaches
# produces some inconsistencies in shapr.
# After the bugfix, we force the user to choose a valid value for `n_batches`.
explanation_1 <- explain(
  model = model_lm_numeric,
  x_explain = x_explain_numeric,
  x_train = x_train_numeric,
  approach = c("independence", "empirical", "gaussian", "copula", "empirical"),
  prediction_zero = p0,
  n_batches = 3,
  timing = FALSE,
  seed = 1
)
# It says shapr is using 3 batches
explanation_1$internal$parameters$n_batches

# But shapr has actually used 4.
# This is because shapr can only handle one type of approach for each batch.
# Hence, the number of batches must be at least as large as the number of unique approaches
# (excluding the last approach, which is not used, as we then condition on all features).
length(explanation_1$internal$objects$S_batch)

# Note that after the bugfix, we give an error if `n_batches` < the number of unique approaches.

# Second, we look at another situation where the number of unique approaches is two and we set
# `n_batches = 2`, but shapr still uses three batches. This is due to how shapr decides how many
# batches each approach should get. Right now, it decides based on the proportion of the
# coalitions each approach is responsible for. In this setting, independence is responsible for
# 5 coalitions and ctree for 25 coalitions. So, initially shapr sets that ctree should get the
# two batches while independence gets 0, but this is then changed to 1 without considering that
# it now breaks the consistency with `n_batches`.
# This is done in the function `create_S_batch_new()` in setup_computation.R.
explanation_2 <- explain(
  model = model_lm_numeric,
  x_explain = x_explain_numeric,
  x_train = x_train_numeric,
  approach = c("independence", "ctree", "ctree", "ctree", "ctree"),
  prediction_zero = p0,
  n_batches = 2,
  timing = FALSE,
  seed = 1
)

# It says shapr is using 2 batches
explanation_2$internal$parameters$n_batches

# But shapr has actually used 3
length(explanation_2$internal$objects$S_batch)

# These are equal after the bugfix
# The same type of bug, but in the opposite direction
explanation_3 <- explain(
  model = model_lm_numeric,
  x_explain = x_explain_numeric,
  x_train = x_train_numeric,
  approach = c("independence", "ctree", "ctree", "ctree", "ctree"),
  prediction_zero = p0,
  n_batches = 15,
  timing = FALSE,
  seed = 1
)

# It says shapr is using 15 batches
explanation_3$internal$parameters$n_batches

# But shapr has actually used 14
length(explanation_3$internal$objects$S_batch)

# These are equal after the bugfix
# Bug number three caused shapr not to be reproducible, as setting the seed did not work for
# combined approaches. This was due to a `set.seed(NULL)` call which ruins all of the earlier
# set.seed procedures.

# Check that setting the seed works for a combination of approaches.
# Here `n_batches` is set to `4`, so there is one batch for each method,
# i.e., no randomness.
# In the first example we get no bug, as there is no randomness in assigning the batches.
explanation_combined_1 <- explain(
  model = model_lm_numeric,
  x_explain = x_explain_numeric,
  x_train = x_train_numeric,
  approach = c("independence", "empirical", "gaussian", "copula", "empirical"),
  prediction_zero = p0,
  timing = FALSE,
  seed = 1
)

explanation_combined_2 <- explain(
  model = model_lm_numeric,
  x_explain = x_explain_numeric,
  x_train = x_train_numeric,
  approach = c("independence", "empirical", "gaussian", "copula", "empirical"),
  prediction_zero = p0,
  timing = FALSE,
  seed = 1
)

# Check that they are equal
all.equal(explanation_combined_1, explanation_combined_2)

# Here `n_batches` is set to `10`, so NOT one batch for each method,
# i.e., there is randomness in assigning the batches.
explanation_combined_3 <- explain(
  model = model_lm_numeric,
  x_explain = x_explain_numeric,
  x_train = x_train_numeric,
  approach = c("independence", "empirical", "gaussian", "copula", "ctree"),
  prediction_zero = p0,
  timing = FALSE,
  seed = 1
)

explanation_combined_4 <- explain(
  model = model_lm_numeric,
  x_explain = x_explain_numeric,
  x_train = x_train_numeric,
  approach = c("independence", "empirical", "gaussian", "copula", "ctree"),
  prediction_zero = p0,
  timing = FALSE,
  seed = 1
)

# Check that they are not equal (before the bugfix)
all.equal(explanation_combined_3, explanation_combined_4)
explanation_combined_3$internal$objects$X
explanation_combined_4$internal$objects$X

# These are equal after the bugfix
Another new scratch file, demonstrating the exact-mode bug:
@@ -0,0 +1,54 @@
# In this code we demonstrate that (before the bugfix) the `explain()` function
# does not enter the exact mode when n_combinations equals 2^m.
# The mode is only changed if n_combinations is strictly larger than 2^m.
# This means that we end up using all coalitions when n_combinations is 2^m,
# but without using the exact Shapley kernel weights.
# The bugfix replaces `>` with `>=` in the places where the code tests whether
# n_combinations is larger than or equal to 2^m. Then the text/messages printed by
# shapr and the code correspond.
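# Minimal sketch (not from the PR script) of the boundary fix described above,
# assuming m = 4 features:
m <- 4
n_combinations <- 2^m # 16, i.e. all coalitions
n_combinations > 2^m # FALSE: the old check never switches to exact mode here
n_combinations >= 2^m # TRUE: the fixed check enters exact mode as intended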
library(xgboost)
library(data.table)

data("airquality")
data <- data.table::as.data.table(airquality)
data <- data[complete.cases(data), ]

x_var <- c("Solar.R", "Wind", "Temp", "Month")
y_var <- "Ozone"

ind_x_explain <- 1:6
x_train <- data[-ind_x_explain, ..x_var]
y_train <- data[-ind_x_explain, get(y_var)]
x_explain <- data[ind_x_explain, ..x_var]
# Fitting a basic xgboost model to the training data
model <- xgboost::xgboost(
  data = as.matrix(x_train),
  label = y_train,
  nround = 20,
  verbose = FALSE
)

# Specifying phi_0, i.e. the expected prediction without any features
p0 <- mean(y_train)
||
# Shapr sets the default number of batches to be 10 for this dataset for the | ||
# "ctree", "gaussian", and "copula" approaches. Thus, setting `n_combinations` | ||
# to any value lower of equal to 10 causes the error. | ||
any_number_equal_or_below_10 = 8 | ||
|
||
# Before the bugfix, shapr:::check_n_batches() throws the error: | ||
# Error in check_n_batches(internal) : | ||
# `n_batches` (10) must be smaller than the number feature combinations/`n_combinations` (8) | ||
# Bug only occures for "ctree", "gaussian", and "copula" as they are treated different in | ||
# `get_default_n_batches()`, I am not certain why. Ask Martin about the logic behind that. | ||
explanation <- explain( | ||
model = model, | ||
x_explain = x_explain, | ||
x_train = x_train, | ||
n_samples = 2, # Low value for fast computations | ||
approach = "gaussian", | ||
prediction_zero = p0, | ||
n_combinations = any_number_equal_or_below_10 | ||
) |
Review comment: We do not need the if-else anymore, so a simpler version of the code could be:
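The suggested code itself was not captured here. One possible reading (my assumption) is that the comment targets `get_default_n_batches()`, where, after the shared `min_checked`/`ret` lines, the branch-specific constants could be selected without the if-else. A sketch under that assumption:

# Hypothetical simplification; not the reviewer's actual suggestion.
get_default_n_batches <- function(approach, n_unique_approaches, n_combinations) {
  used_approach <- names(sort(table(approach), decreasing = TRUE))[1]
  heavy <- used_approach %in% c("ctree", "gaussian", "copula") # approaches needing more batches
  suggestion <- ceiling(n_combinations / ifelse(heavy, 10, 100))
  min_checked <- max(c(ifelse(heavy, 10, 2), suggestion, n_unique_approaches))
  ret <- min(c(ifelse(heavy, 1000, 100), min_checked, n_combinations - 1))
  message(paste0("Setting parameter 'n_batches' to ", ret, ".")) # message as before
  ret
}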