From 7f67a02dc447b4a41dc79cfb3e28d7b65cc805bb Mon Sep 17 00:00:00 2001 From: Martin Date: Wed, 6 Dec 2023 21:02:45 +0100 Subject: [PATCH 01/26] initial stuff on permutation alternative no antitetic sampling yet --- R/explain.R | 4 + R/finalize_explanation.R | 50 ++++++- R/setup.R | 30 +++- R/setup_computation.R | 178 ++++++++++++++++++++++- inst/scripts/devel/testing_permutation.R | 126 ++++++++++++++++ 5 files changed, 377 insertions(+), 11 deletions(-) create mode 100644 inst/scripts/devel/testing_permutation.R diff --git a/R/explain.R b/R/explain.R index 0ce8a4ea9..b55f5ea37 100644 --- a/R/explain.R +++ b/R/explain.R @@ -247,8 +247,10 @@ explain <- function(model, x_explain, x_train, approach, + shap_approach = "kernel", prediction_zero, n_combinations = NULL, + n_permutations = NULL, group = NULL, n_samples = 1e3, n_batches = NULL, @@ -276,8 +278,10 @@ explain <- function(model, x_train = x_train, x_explain = x_explain, approach = approach, + shap_approach = shap_approach, prediction_zero = prediction_zero, n_combinations = n_combinations, + n_permutations = n_permutations, group = group, n_samples = n_samples, n_batches = n_batches, diff --git a/R/finalize_explanation.R b/R/finalize_explanation.R index 5f3d5ae62..3cf4979aa 100644 --- a/R/finalize_explanation.R +++ b/R/finalize_explanation.R @@ -8,6 +8,7 @@ #' @export finalize_explanation <- function(vS_list, internal) { keep_samp_for_vS <- internal$parameters$keep_samp_for_vS + shap_approach <- internal$parameters$shap_approach processed_vS_list <- postprocess_vS_list( vS_list = vS_list, @@ -20,7 +21,11 @@ finalize_explanation <- function(vS_list, internal) { # internal$timing$postprocessing <- Sys.time() # Compute the Shapley values - dt_shapley <- compute_shapley_new(internal, processed_vS_list$dt_vS) + if(shap_approach == "permutation"){ + dt_shapley <- compute_shapley_permutation(internal, processed_vS_list$dt_vS) + } else { + dt_shapley <- compute_shapley_new(internal, processed_vS_list$dt_vS) + } # internal$timing$shapley_computation <- Sys.time() @@ -101,6 +106,49 @@ get_p <- function(dt_vS, internal) { return(p) } +compute_shapley_permutation <- function(internal,dt_vS){ + feature_names <- internal$parameters$feature_names + X_perm <- internal$objects$X_perm + n_permutations <- internal$parameters$n_permutations + n_features <- internal$parameters$n_features + n_explain <- internal$parameters$n_explain + max_id_combination <- internal$parameters$n_combinations + S <- internal$objects$S + phi0 <- internal$parameters$prediction_zero + + + apply_cols <- names(dt_vS)[-1] + + kshap <- matrix(0,ncol=n_explain,nrow=n_features) + + for(i in seq(n_permutations)){ + # Find id combinations that are permuted + these_id_combs <- c(1,X_perm[permute_id==i,id_combination],max_id_combination) + + # Find the feature to map the contributions to + mapping_mat <- apply(S[these_id_combs,],FUN=diff,MARGIN=2) + contributes_to <- apply(mapping_mat,FUN=function(x) which(x==1),MARGIN=1) + reorder_vec <- order(contributes_to) + + #### LOOK HERE: OK, something is off here, I think it might be the merging that is messing up the order + # I need the order to be such that the X_perm always have the smallest feature numbers first. + + # Find the corresponding rows in dt_vS and get the contribution + these_vS <- dt_vS[id_combination %in% these_id_combs] + these_contribs <- these_vS[,lapply(.SD,diff),.SDcols=apply_cols] + + reordered_contribs <- as.matrix(these_contribs[reorder_vec,]) + kshap <- kshap + reordered_contribs + } + kshap <- kshap/n_permutations + + + + dt_shapley <- data.table::data.table(cbind(none=phi0,t(kshap))) + names(dt_shapley)[-1] <- feature_names + return(dt_shapley) +} + #' Compute shapley values #' @param explainer An `explain` object. #' @param dt_vS The contribution matrix. diff --git a/R/setup.R b/R/setup.R index e1c12e48e..2fd2111b0 100644 --- a/R/setup.R +++ b/R/setup.R @@ -20,9 +20,11 @@ setup <- function(x_train, x_explain, approach, + shap_approach, prediction_zero, output_size = 1, n_combinations, + n_permutations, group, n_samples, n_batches, @@ -46,9 +48,11 @@ setup <- function(x_train, internal$parameters <- get_parameters( approach = approach, + shap_approach = shap_approach, prediction_zero = prediction_zero, output_size = output_size, n_combinations = n_combinations, + n_permutations = n_permutations, group = group, n_samples = n_samples, n_batches = n_batches, @@ -372,13 +376,18 @@ get_extra_parameters <- function(internal) { } #' @keywords internal -get_parameters <- function(approach, prediction_zero, output_size = 1, n_combinations, group, n_samples, +get_parameters <- function(approach, shap_approach, prediction_zero, output_size = 1, n_combinations, n_permutations, group, n_samples, n_batches, seed, keep_samp_for_vS, type, horizon, train_idx, explain_idx, explain_y_lags, explain_xreg_lags, group_lags = NULL, timing, is_python, ...) { # Check input type for approach # approach is checked more comprehensively later + # Check if shap_approach is a character equal to either "kernel" or "permutation" + if (!is.character(shap_approach) || !(shap_approach %in% c("kernel", "permutation"))) { + stop("`shap_approach` must be a character equal to either 'kernel' or 'permutation'.") + } + # n_combinations if (!is.null(n_combinations) && !(is.wholenumber(n_combinations) && @@ -388,6 +397,16 @@ get_parameters <- function(approach, prediction_zero, output_size = 1, n_combina stop("`n_combinations` must be NULL or a single positive integer.") } + # n_combinations + if (!is.null(n_permutations) && + !(is.wholenumber(n_permutations) && + length(n_permutations) == 1 && + !is.na(n_permutations) && + n_permutations > 0)) { + stop("`n_permutations` must be NULL or a single positive integer.") + } + + # group (checked more thoroughly later) if (!is.null(group) && !is.list(group)) { @@ -479,8 +498,10 @@ get_parameters <- function(approach, prediction_zero, output_size = 1, n_combina # Getting basic input parameters parameters <- list( approach = approach, + shap_approach = shap_approach, prediction_zero = prediction_zero, n_combinations = n_combinations, + n_permutations = n_permutations, group = group, n_samples = n_samples, n_batches = n_batches, @@ -499,7 +520,12 @@ get_parameters <- function(approach, prediction_zero, output_size = 1, n_combina # Setting exact based on n_combinations (TRUE if NULL) - parameters$exact <- ifelse(is.null(parameters$n_combinations), TRUE, FALSE) + if(shap_approach=="permutation"){ + parameters$exact <- ifelse(is.null(parameters$n_permutations), TRUE, FALSE) + parameters$n_combinations <- 3*parameters$n_permutations # Temporary setting this parameter to avoid errors in the code + } else { + parameters$exact <- ifelse(is.null(parameters$n_combinations), TRUE, FALSE) + } return(parameters) } diff --git a/R/setup_computation.R b/R/setup_computation.R index 23418b50a..359a7cddc 100644 --- a/R/setup_computation.R +++ b/R/setup_computation.R @@ -133,10 +133,25 @@ shapley_setup <- function(internal) { exact <- internal$parameters$exact n_features0 <- internal$parameters$n_features n_combinations <- internal$parameters$n_combinations + n_permutations <- internal$parameters$n_permutations is_groupwise <- internal$parameters$is_groupwise + shap_approach <- internal$parameters$shap_approach group_num <- internal$objects$group_num + if(shap_approach == "permutation"){ + +# TODO: Add grouping to the permutation approach later + X_list <- feature_combinations_perm( + m = n_features0, + exact = exact, + n_permutations = n_permutations + ) + X <- X_list$X + internal$objects$X_perm <- X_list$X_perm + W <- NULL + + } else { X <- feature_combinations( m = n_features0, exact = exact, @@ -152,6 +167,13 @@ shapley_setup <- function(internal) { is_groupwise = is_groupwise ) + # Updating parameters$exact as done in feature_combinations + if (!exact && n_combinations >= 2^n_features0) { + internal$parameters$exact <- TRUE + } + } + + ## Get feature matrix --------- S <- feature_matrix_cpp( features = X[["features"]], @@ -159,12 +181,6 @@ shapley_setup <- function(internal) { ) #### Updating parameters #### - - # Updating parameters$exact as done in feature_combinations - if (!exact && n_combinations >= 2^n_features0) { - internal$parameters$exact <- TRUE - } - internal$parameters$n_combinations <- nrow(S) # Updating this parameter in the end based on what is actually used. # This will be obsolete later @@ -180,6 +196,147 @@ shapley_setup <- function(internal) { return(internal) } +feature_combinations_perm <- function(m, exact = TRUE, n_permutations = NULL) { + + if (!exact) { + # Switch to exact for feature-wise method + if (n_permutations >= factorial(m)) { + n_combinations <- factorial(m) + exact <- TRUE + message( + paste0( + "Success with message:\n", + "n_permutations is larger than or equal to m! = ", factorial(m), ". \n", + "Using exact instead.\n" + ) + ) + } + } + + if (exact) { + perm_dt <- feature_permute_exact(m) + } else { + perm_dt <- feature_permute_samp(m, n_permutations) + } + + ret <- X_from_perm_dt(perm_dt) + + ret$perm_dt = perm_dt + + return(ret) +} + + +feature_permute_samp <- function(m, n_permutations) { + x <- seq(m) + perms <- vector("list", n_permutations) + perms[[1]] <- sample(x) + + for (i in 2:n_permutations) { + perm <- sample(x) + while (any(sapply(perms[1:(i-1)], function(p) all(p == perm)))) { + perm <- sample(x) + } + perms[[i]] <- perm + } + + # Create data.table with id_combination and features columns + dt <- data.table( + permute_id = seq(n_permutations), + perm = perms + ) + + return(dt) +} + +feature_permute_exact <- function(m) { + + # Generate all possible combinations + perms <- all_permutations(seq(m)) + + # Create data.table with id_combination and features columns + dt <- data.table( + permute_id = seq_along(perms), + perm = as.list(perms) + ) + + return(dt) +} + +all_permutations <- function(x) { + if (length(x) <= 1) { + return(list(x)) + } else { + perms <- list() + for (i in seq_along(x)) { + sub_perms <- all_permutations(x[-i]) + for (j in seq_along(sub_perms)) { + perms[[length(perms) + 1]] <- c(x[i], sub_perms[[j]]) + } + } + return(perms) + } +} + + +X_from_perm_dt <- function(perm_dt) { + m <- length(perm_dt[["perm"]][[1]]) + tmp <- list() + + for(i in 1:nrow(perm_dt)){ + perm0 <- perm_dt[i, perm][[1]] + for (j in seq_len(m-1)){ + tmp[[length(tmp) + 1]] <- sort(perm0[1:j]) + } + } + + feature_sample_all <- c(list(integer(0)), tmp, list(c(1:m))) + X_perm <- data.table(n_features = sapply(feature_sample_all, length)) + X_perm[, n_features := as.integer(n_features)] + + # Get number of occurences and duplicated rows------- + is_duplicate <- NULL # due to NSE notes in R CMD check + r <- shapr:::helper_feature(m, feature_sample_all) + X_perm[, is_duplicate := r[["is_duplicate"]]] + X_perm[, features := feature_sample_all] + X_perm[, permute_id := c(NA,rep(seq_len(nrow(perm_dt)),each = m-1),Inf)] + data.table::setkeyv(X_perm, c("n_features","permute_id")) + X_perm[!is.finite(permute_id), permute_id := NA] + + # Temporary create a character string to merge back on below (cannot merge on lists) + X_perm[, features_tmp := sapply(features, paste, collapse = " ")] + X_perm[, rowid:=.I] + + if (any(X_perm[["is_duplicate"]])) { + X <- X_perm[is_duplicate == FALSE] + } else { + X <- copy(X_perm) + } + X[,id_combination:=.I] + X[, is_duplicate := NULL] + + # Merge back to get the id_combination mapping also in X_perm + X_perm <- merge(X_perm,X[,.(features_tmp,id_combination)],by="features_tmp",all.x=TRUE) + data.table::setorderv(X_perm, "rowid") + X_perm[, rowid:=NULL] + X[, rowid:=NULL] + + X_perm[, features_tmp := NULL] + X[, features_tmp := NULL] + + setcolorder(X,c("id_combination","permute_id", "features", "n_features")) + + + ret <- list(X=X[],X_perm=X_perm[]) + return(ret) +} + + + + + + + #' Define feature combinations, and fetch additional information about each unique combination #' #' @param m Positive integer. Total number of features. @@ -619,6 +776,7 @@ create_S_batch_new <- function(internal, seed = NULL) { approach0 <- internal$parameters$approach n_combinations <- internal$parameters$n_combinations n_batches <- internal$parameters$n_batches + shap_approach <- internal$parameters$shap_approach X <- internal$objects$X @@ -643,7 +801,9 @@ create_S_batch_new <- function(internal, seed = NULL) { set.seed(seed) X[, randomorder := sample(.N)] data.table::setorder(X, randomorder) # To avoid smaller id_combinations always proceeding large ones - data.table::setorder(X, shapley_weight) + if(shap_approach!="permutation") { + data.table::setorder(X, shapley_weight) + } batch_counter <- 0 for (i in seq_along(approach_vec)) { @@ -656,7 +816,9 @@ create_S_batch_new <- function(internal, seed = NULL) { # Spreading the batches X[, randomorder := sample(.N)] data.table::setorder(X, randomorder) - data.table::setorder(X, shapley_weight) + if(shap_approach!="permutation") { + data.table::setorder(X, shapley_weight) + } X[!(n_features %in% c(0, n_features0)), batch := ceiling(.I / .N * n_batches)] } diff --git a/inst/scripts/devel/testing_permutation.R b/inst/scripts/devel/testing_permutation.R new file mode 100644 index 000000000..444c9a4b9 --- /dev/null +++ b/inst/scripts/devel/testing_permutation.R @@ -0,0 +1,126 @@ +library(xgboost) +library(shapr) + +data("airquality") +data <- data.table::as.data.table(airquality) +data <- data[complete.cases(data), ] + +set.seed(123) +data[,V1:=rnorm(.N)] +data[,V2:=rnorm(.N)] +data[,V3:=rnorm(.N)] +data[,V4:=rnorm(.N)] +data[,V5:=rnorm(.N)] + +x_var <- c("Solar.R", "Wind", "Temp", "Month","Day","V1","V2","V3","V4","V5") +y_var <- "Ozone" + +ind_x_explain <- 1:6 +x_train <- data[-ind_x_explain, ..x_var] +y_train <- data[-ind_x_explain, get(y_var)] +x_explain <- data[ind_x_explain, ..x_var] + +# Looking at the dependence between the features +cor(x_train) + +# Fitting a basic xgboost model to the training data +model <- xgboost( + data = as.matrix(x_train), + label = y_train, + nround = 20, + verbose = FALSE +) + +# Specifying the phi_0, i.e. the expected prediction without any features +p0 <- mean(y_train) + +# Computing the actual Shapley values with kernelSHAP accounting for feature dependence using +# the empirical (conditional) distribution approach with bandwidth parameter sigma = 0.1 (default) +n_permutations_vec <- c(4,8,16,32,64,128) + + +exp_full <- explain( + model = model, + x_explain = x_explain, + x_train = x_train, + approach = "gaussian",seed = 123,n_batches=1, + shap_approach = "kernel", + n_combinations = NULL, + prediction_zero = p0 +) + +exp_full$timing + + +exp_list <- exp_kern_list <- list() +mse_vec <- mse_kern_vec <- NULL +for (i in seq_along(n_permutations_vec)) { + exp_list[[i]] <- explain( + model = model, + x_explain = x_explain, + x_train = x_train, + approach = "gaussian",seed = 123,n_batches=1, + shap_approach = "permutation", + n_permutations = n_permutations_vec[i], + prediction_zero = p0 + ) + + exp_kern_list[[i]] <- explain( + model = model, + x_explain = x_explain, + x_train = x_train, + approach = "gaussian",seed = 123,n_batches=1, + shap_approach = "kernel", + n_combinations = exp_list[[i]]$internal$parameters$n_combinations, + prediction_zero = p0 + ) + + print(exp_list[[i]]$timing) + print(exp_kern_list[[i]]$timing) + + mse_vec[i] <- mean(unlist((exp_list[[i]]$shapley_values[,..x_var]-exp_full$shapley_values[,..x_var])^2)) + mse_kern_vec[i] <- mean(unlist((exp_kern_list[[i]]$shapley_values[,..x_var]-exp_full$shapley_values[,..x_var])^2)) + +} + +mse_vec +mse_kern_vec + + +explanation_perm <- explain( + model = model, + x_explain = x_explain, + x_train = x_train, + approach = "gaussian",seed = 123,n_batches=1, + prediction_zero = p0,shap_approach = "permutation",n_permutations = 24 +) + +explanation_kernel <- explain( + model = model, + x_explain = x_explain, + x_train = x_train, + approach = "gaussian",seed = 123,n_batches=1, + n_combinations = explanation_perm$internal$parameters$n_combinations, + prediction_zero = p0 +) + +explanation_perm$internal$objects$X +explanation_kernel$internal$objects$X + +explanation_perm$internal$output$dt_vS +explanation_kernel$internal$output$dt_vS + +explanation_perm$shapley_values +explanation_kernel$shapley_values + +explanation_perm$timing +explanation_kernel$timing + +#### Some TODO notes: + +# Implement antithetic sampling for both permutation and kernel method +# run basic simulations to show improved convergence vs kernel method (based on same n_combinations) +# if the tests shows improved convergence, the setup and computation code needs increased efficiency + + + From f43765f091367b94294f891495692f37029c5c8b Mon Sep 17 00:00:00 2001 From: Martin Date: Sat, 9 Dec 2023 17:45:16 +0100 Subject: [PATCH 02/26] brute force adding of paried sampling to permutation approach --- R/explain.R | 2 + R/setup.R | 11 ++- R/setup_computation.R | 28 +++++-- inst/scripts/devel/testing_permutation.R | 96 ++++++++++++++++-------- 4 files changed, 96 insertions(+), 41 deletions(-) diff --git a/R/explain.R b/R/explain.R index b55f5ea37..9d346c3e6 100644 --- a/R/explain.R +++ b/R/explain.R @@ -248,6 +248,7 @@ explain <- function(model, x_train, approach, shap_approach = "kernel", + paired_shap_sampling = FALSE, prediction_zero, n_combinations = NULL, n_permutations = NULL, @@ -279,6 +280,7 @@ explain <- function(model, x_explain = x_explain, approach = approach, shap_approach = shap_approach, + paired_shap_sampling = paired_shap_sampling, prediction_zero = prediction_zero, n_combinations = n_combinations, n_permutations = n_permutations, diff --git a/R/setup.R b/R/setup.R index 2fd2111b0..f16e6bfa9 100644 --- a/R/setup.R +++ b/R/setup.R @@ -21,6 +21,7 @@ setup <- function(x_train, x_explain, approach, shap_approach, + paired_shap_sampling, prediction_zero, output_size = 1, n_combinations, @@ -49,6 +50,7 @@ setup <- function(x_train, internal$parameters <- get_parameters( approach = approach, shap_approach = shap_approach, + paired_shap_sampling = paired_shap_sampling, prediction_zero = prediction_zero, output_size = output_size, n_combinations = n_combinations, @@ -376,7 +378,7 @@ get_extra_parameters <- function(internal) { } #' @keywords internal -get_parameters <- function(approach, shap_approach, prediction_zero, output_size = 1, n_combinations, n_permutations, group, n_samples, +get_parameters <- function(approach, shap_approach,paired_shap_sampling, prediction_zero, output_size = 1, n_combinations, n_permutations, group, n_samples, n_batches, seed, keep_samp_for_vS, type, horizon, train_idx, explain_idx, explain_y_lags, explain_xreg_lags, group_lags = NULL, timing, is_python, ...) { # Check input type for approach @@ -388,6 +390,10 @@ get_parameters <- function(approach, shap_approach, prediction_zero, output_size stop("`shap_approach` must be a character equal to either 'kernel' or 'permutation'.") } + if (!is.logical(paired_shap_sampling)) { + stop("`paired_shap_sampling` must be a logical.") + } + # n_combinations if (!is.null(n_combinations) && !(is.wholenumber(n_combinations) && @@ -499,6 +505,7 @@ get_parameters <- function(approach, shap_approach, prediction_zero, output_size parameters <- list( approach = approach, shap_approach = shap_approach, + paired_shap_sampling = paired_shap_sampling, prediction_zero = prediction_zero, n_combinations = n_combinations, n_permutations = n_permutations, @@ -522,7 +529,7 @@ get_parameters <- function(approach, shap_approach, prediction_zero, output_size # Setting exact based on n_combinations (TRUE if NULL) if(shap_approach=="permutation"){ parameters$exact <- ifelse(is.null(parameters$n_permutations), TRUE, FALSE) - parameters$n_combinations <- 3*parameters$n_permutations # Temporary setting this parameter to avoid errors in the code + parameters$n_combinations <- 3*parameters$n_permutations # TODO: Do this properly. (Temporary setting this parameter to avoid errors in the code) } else { parameters$exact <- ifelse(is.null(parameters$n_combinations), TRUE, FALSE) } diff --git a/R/setup_computation.R b/R/setup_computation.R index 359a7cddc..5cb3868fb 100644 --- a/R/setup_computation.R +++ b/R/setup_computation.R @@ -136,6 +136,7 @@ shapley_setup <- function(internal) { n_permutations <- internal$parameters$n_permutations is_groupwise <- internal$parameters$is_groupwise shap_approach <- internal$parameters$shap_approach + paired_shap_sampling = internal$parameters$paired_shap_sampling group_num <- internal$objects$group_num @@ -145,7 +146,8 @@ shapley_setup <- function(internal) { X_list <- feature_combinations_perm( m = n_features0, exact = exact, - n_permutations = n_permutations + n_permutations = n_permutations, + paired_shap_sampling = paired_shap_sampling ) X <- X_list$X internal$objects$X_perm <- X_list$X_perm @@ -196,7 +198,7 @@ shapley_setup <- function(internal) { return(internal) } -feature_combinations_perm <- function(m, exact = TRUE, n_permutations = NULL) { +feature_combinations_perm <- function(m, exact = TRUE, n_permutations = NULL,paired_shap_sampling=FALSE) { if (!exact) { # Switch to exact for feature-wise method @@ -216,7 +218,7 @@ feature_combinations_perm <- function(m, exact = TRUE, n_permutations = NULL) { if (exact) { perm_dt <- feature_permute_exact(m) } else { - perm_dt <- feature_permute_samp(m, n_permutations) + perm_dt <- feature_permute_samp(m, n_permutations,paired_shap_sampling) } ret <- X_from_perm_dt(perm_dt) @@ -227,17 +229,27 @@ feature_combinations_perm <- function(m, exact = TRUE, n_permutations = NULL) { } -feature_permute_samp <- function(m, n_permutations) { +feature_permute_samp <- function(m, n_permutations,paired_shap_sampling) { x <- seq(m) - perms <- vector("list", n_permutations) + perms <- perms_paired <- vector("list", n_permutations) perms[[1]] <- sample(x) + perms[[2]] <- rev(perms[[1]]) - for (i in 2:n_permutations) { + if(paired_shap_sampling==TRUE){ + for_seq <- seq(3,n_permutations,by=2) + } else { + for_seq <- seq(2,n_permutations,by=1) + } + + for (i in for_seq) { perm <- sample(x) - while (any(sapply(perms[1:(i-1)], function(p) all(p == perm)))) { - perm <- sample(x) + while (any(sapply(perms[1:(i-1)], function(p) all(p == perm)))) { # not perfect as does not check the rev_perm, but OK for now + perm <- sample(x) # repeat until we get a unique sample } perms[[i]] <- perm + if(paired_shap_sampling==TRUE){ + perms[[i+1]] <- rev(perm) + } } # Create data.table with id_combination and features columns diff --git a/inst/scripts/devel/testing_permutation.R b/inst/scripts/devel/testing_permutation.R index 444c9a4b9..adc6c2f44 100644 --- a/inst/scripts/devel/testing_permutation.R +++ b/inst/scripts/devel/testing_permutation.R @@ -1,5 +1,5 @@ library(xgboost) -library(shapr) +#library(shapr) data("airquality") data <- data.table::as.data.table(airquality) @@ -43,47 +43,66 @@ exp_full <- explain( model = model, x_explain = x_explain, x_train = x_train, - approach = "gaussian",seed = 123,n_batches=1, + approach = "gaussian",seed = 123,n_batches=4, shap_approach = "kernel", n_combinations = NULL, - prediction_zero = p0 + prediction_zero = p0,n_samples = 10000 ) exp_full$timing -exp_list <- exp_kern_list <- list() -mse_vec <- mse_kern_vec <- NULL +exp_perm_list <- exp_perm_paired_list <- exp_kern_list <- list() +mse_perm_mat <- mse_perm_paired_mat <- mse_kern_mat <- matrix(NA,ncol=5,nrow=length(n_permutations_vec)) for (i in seq_along(n_permutations_vec)) { - exp_list[[i]] <- explain( - model = model, - x_explain = x_explain, - x_train = x_train, - approach = "gaussian",seed = 123,n_batches=1, - shap_approach = "permutation", - n_permutations = n_permutations_vec[i], - prediction_zero = p0 - ) - - exp_kern_list[[i]] <- explain( - model = model, - x_explain = x_explain, - x_train = x_train, - approach = "gaussian",seed = 123,n_batches=1, - shap_approach = "kernel", - n_combinations = exp_list[[i]]$internal$parameters$n_combinations, - prediction_zero = p0 - ) - - print(exp_list[[i]]$timing) - print(exp_kern_list[[i]]$timing) - - mse_vec[i] <- mean(unlist((exp_list[[i]]$shapley_values[,..x_var]-exp_full$shapley_values[,..x_var])^2)) - mse_kern_vec[i] <- mean(unlist((exp_kern_list[[i]]$shapley_values[,..x_var]-exp_full$shapley_values[,..x_var])^2)) + exp_perm_list[[i]] <- exp_perm_paired_list[[i]] <- exp_kern_list[[i]] <- list() + for (j in 1:5){ + exp_perm_list[[i]][[j]] <- explain( + model = model, + x_explain = x_explain, + x_train = x_train, + approach = "gaussian",seed = 123+j,n_batches=1, + shap_approach = "permutation", + n_permutations = n_permutations_vec[i], + prediction_zero = p0 + ) + + exp_perm_paired_list[[i]][[j]] <- explain( + model = model, + x_explain = x_explain, + x_train = x_train, + approach = "gaussian",seed = 123+j,n_batches=1, + shap_approach = "permutation",paired_shap_sampling = TRUE, + n_permutations = n_permutations_vec[i], + prediction_zero = p0 + ) + + + exp_kern_list[[i]][[j]] <- explain( + model = model, + x_explain = x_explain, + x_train = x_train, + approach = "gaussian",seed = 123+j,n_batches=1, + shap_approach = "kernel", + n_combinations = exp_perm_list[[i]][[j]]$internal$parameters$n_combinations, + prediction_zero = p0 + ) + +# print(exp_list[[i]]$timing) +# print(exp_kern_list[[i]]$timing) + + mse_perm_mat[i,j] <- mean(unlist((exp_perm_list[[i]][[j]]$shapley_values[,..x_var]-exp_full$shapley_values[,..x_var])^2)) + mse_perm_paired_mat[i,j] <- mean(unlist((exp_perm_paired_list[[i]][[j]]$shapley_values[,..x_var]-exp_full$shapley_values[,..x_var])^2)) + mse_kern_mat[i,j] <- mean(unlist((exp_kern_list[[i]][[j]]$shapley_values[,..x_var]-exp_full$shapley_values[,..x_var])^2)) + + print(rowMeans(mse_perm_mat,na.rm = TRUE)) + print(rowMeans(mse_perm_paired_mat,na.rm = TRUE)) + print(rowMeans(mse_kern_mat,na.rm = TRUE)) + } } -mse_vec +mse_perm_mat mse_kern_vec @@ -95,6 +114,15 @@ explanation_perm <- explain( prediction_zero = p0,shap_approach = "permutation",n_permutations = 24 ) +explanation_perm_paired <- explain( + model = model, + x_explain = x_explain, + x_train = x_train, + approach = "gaussian",seed = 123,n_batches=1, + prediction_zero = p0,shap_approach = "permutation",paired_shap_sampling = TRUE,n_permutations = 24 +) + + explanation_kernel <- explain( model = model, x_explain = x_explain, @@ -105,15 +133,21 @@ explanation_kernel <- explain( ) explanation_perm$internal$objects$X +explanation_perm_paired$internal$objects$X explanation_kernel$internal$objects$X explanation_perm$internal$output$dt_vS +explanation_perm_paired$internal$output$dt_vS explanation_kernel$internal$output$dt_vS explanation_perm$shapley_values +explanation_perm_paired$shapley_values + explanation_kernel$shapley_values explanation_perm$timing +explanation_perm_paired$timing + explanation_kernel$timing #### Some TODO notes: From 2626e72c71c8ca6dc2ac90660fb4287e7fcbfd4b Mon Sep 17 00:00:00 2001 From: Martin Date: Sun, 10 Dec 2023 11:53:45 +0100 Subject: [PATCH 03/26] paired sampling also for kernel + more scripting --- R/setup_computation.R | 24 ++++++--- inst/scripts/devel/testing_permutation.R | 66 +++++++++++++++++++----- 2 files changed, 72 insertions(+), 18 deletions(-) diff --git a/R/setup_computation.R b/R/setup_computation.R index 5cb3868fb..d02b10d09 100644 --- a/R/setup_computation.R +++ b/R/setup_computation.R @@ -159,7 +159,8 @@ shapley_setup <- function(internal) { exact = exact, n_combinations = n_combinations, weight_zero_m = 10^6, - group_num = group_num + group_num = group_num, + paired_shap_sampling = paired_shap_sampling ) # Get weighted matrix ---------------- @@ -386,7 +387,7 @@ X_from_perm_dt <- function(perm_dt) { #' #' # Subsample of combinations #' x <- feature_combinations(exact = FALSE, m = 10, n_combinations = 1e2) -feature_combinations <- function(m, exact = TRUE, n_combinations = 200, weight_zero_m = 10^6, group_num = NULL) { +feature_combinations <- function(m, exact = TRUE, n_combinations = 200, weight_zero_m = 10^6, group_num = NULL, paired_shap_sampling = FALSE) { m_group <- length(group_num) # The number of groups # Force user to use a natural number for n_combinations if m > 13 @@ -454,7 +455,7 @@ feature_combinations <- function(m, exact = TRUE, n_combinations = 200, weight_z if (exact) { dt <- feature_exact(m, weight_zero_m) } else { - dt <- feature_not_exact(m, n_combinations, weight_zero_m) + dt <- feature_not_exact(m, n_combinations, weight_zero_m,unique_sampling = TRUE,paired_shap_sampling = paired_shap_sampling) stopifnot( data.table::is.data.table(dt), !is.null(dt[["p"]]) @@ -492,7 +493,7 @@ feature_exact <- function(m, weight_zero_m = 10^6) { } #' @keywords internal -feature_not_exact <- function(m, n_combinations = 200, weight_zero_m = 10^6, unique_sampling = TRUE) { +feature_not_exact <- function(m, n_combinations = 200, weight_zero_m = 10^6, unique_sampling = TRUE,paired_shap_sampling = FALSE) { # Find weights for given number of features ---------- n_features <- seq(m - 1) n <- sapply(n_features, choose, n = m) @@ -505,17 +506,28 @@ feature_not_exact <- function(m, n_combinations = 200, weight_zero_m = 10^6, uni if (unique_sampling) { while (unique_samples < n_combinations - 2) { + if(paired_shap_sampling==TRUE){ + n_samps <- ceiling((n_combinations - unique_samples - 2)/2) + } else { + n_samps <- n_combinations - unique_samples - 2 + } + # Sample number of chosen features ---------- n_features_sample <- sample( x = n_features, - size = n_combinations - unique_samples - 2, # Sample -2 as we add zero and m samples below + size = n_samps, # Sample -2 as we add zero and m samples below replace = TRUE, prob = p ) # Sample specific set of features ------- feature_sample <- sample_features_cpp(m, n_features_sample) - feature_sample_all <- c(feature_sample_all, feature_sample) + if(paired_shap_sampling==TRUE){ + feature_sample_paired <- lapply(feature_sample, function(x) seq(m)[-x]) + feature_sample_all <- c(feature_sample_all, feature_sample,feature_sample_paired) + } else { + feature_sample_all <- c(feature_sample_all, feature_sample) + } unique_samples <- length(unique(feature_sample_all)) } } else { diff --git a/inst/scripts/devel/testing_permutation.R b/inst/scripts/devel/testing_permutation.R index adc6c2f44..bfbf2db32 100644 --- a/inst/scripts/devel/testing_permutation.R +++ b/inst/scripts/devel/testing_permutation.R @@ -36,7 +36,7 @@ p0 <- mean(y_train) # Computing the actual Shapley values with kernelSHAP accounting for feature dependence using # the empirical (conditional) distribution approach with bandwidth parameter sigma = 0.1 (default) -n_permutations_vec <- c(4,8,16,32,64,128) +n_permutations_vec <- c(4,8,16,32,64) exp_full <- explain( @@ -51,12 +51,12 @@ exp_full <- explain( exp_full$timing - -exp_perm_list <- exp_perm_paired_list <- exp_kern_list <- list() -mse_perm_mat <- mse_perm_paired_mat <- mse_kern_mat <- matrix(NA,ncol=5,nrow=length(n_permutations_vec)) +reps <- 10 +exp_perm_list <- exp_perm_paired_list <- exp_kern_list <- exp_kern_paired_list <- list() +mse_perm_mat <- mse_perm_paired_mat <- mse_kern_mat <- mse_kern_paired_mat <- matrix(NA,ncol=reps,nrow=length(n_permutations_vec)) for (i in seq_along(n_permutations_vec)) { - exp_perm_list[[i]] <- exp_perm_paired_list[[i]] <- exp_kern_list[[i]] <- list() - for (j in 1:5){ + exp_perm_list[[i]] <- exp_perm_paired_list[[i]] <- exp_kern_list[[i]] <- exp_kern_paired_list[[i]] <- list() + for (j in 1:reps){ exp_perm_list[[i]][[j]] <- explain( model = model, x_explain = x_explain, @@ -88,22 +88,51 @@ for (i in seq_along(n_permutations_vec)) { prediction_zero = p0 ) + exp_kern_paired_list[[i]][[j]] <- explain( + model = model, + x_explain = x_explain, + x_train = x_train, + approach = "gaussian",seed = 123+j,n_batches=1, + shap_approach = "kernel",paired_shap_sampling = TRUE, + n_combinations = exp_perm_list[[i]][[j]]$internal$parameters$n_combinations, + prediction_zero = p0 + ) + + # print(exp_list[[i]]$timing) # print(exp_kern_list[[i]]$timing) mse_perm_mat[i,j] <- mean(unlist((exp_perm_list[[i]][[j]]$shapley_values[,..x_var]-exp_full$shapley_values[,..x_var])^2)) mse_perm_paired_mat[i,j] <- mean(unlist((exp_perm_paired_list[[i]][[j]]$shapley_values[,..x_var]-exp_full$shapley_values[,..x_var])^2)) mse_kern_mat[i,j] <- mean(unlist((exp_kern_list[[i]][[j]]$shapley_values[,..x_var]-exp_full$shapley_values[,..x_var])^2)) + mse_kern_paired_mat[i,j] <- mean(unlist((exp_kern_paired_list[[i]][[j]]$shapley_values[,..x_var]-exp_full$shapley_values[,..x_var])^2)) print(rowMeans(mse_perm_mat,na.rm = TRUE)) print(rowMeans(mse_perm_paired_mat,na.rm = TRUE)) print(rowMeans(mse_kern_mat,na.rm = TRUE)) + print(rowMeans(mse_kern_paired_mat,na.rm = TRUE)) + } } -mse_perm_mat -mse_kern_vec +print(rowMeans(mse_perm_mat,na.rm = TRUE)) +print(rowMeans(mse_perm_paired_mat,na.rm = TRUE)) +print(rowMeans(mse_kern_mat,na.rm = TRUE)) +print(rowMeans(mse_kern_paired_mat,na.rm = TRUE)) +#> print(rowMeans(mse_perm_mat,na.rm = TRUE)) +#[1] 3.83370 2.16886 1.03013 0.45260 0.21826 +#> print(rowMeans(mse_perm_paired_mat,na.rm = TRUE)) +#[1] 0.696851 0.327208 0.188237 0.094352 0.053114 +#> print(rowMeans(mse_kern_mat,na.rm = TRUE)) +#[1] 4.12236 2.08821 0.82681 0.36273 0.20536 +#> print(rowMeans(mse_kern_paired_mat,na.rm = TRUE)) +#[1] 0.622408 0.222473 0.088097 0.048416 0.024772 + +# OK, so paired sampling improve significantly. +# Permutation and kernel apporach seems to be about the same efficiency +# A question might be whether the accuracy is the same accorss all features for the +# kenrel approach, of whther the permutation apporach is better in this regard. explanation_perm <- explain( @@ -132,28 +161,41 @@ explanation_kernel <- explain( prediction_zero = p0 ) +explanation_kernel_paired <- explain( + model = model, + x_explain = x_explain, + x_train = x_train, + approach = "gaussian",seed = 123,n_batches=1,paired_shap_sampling = TRUE, + n_combinations = explanation_perm$internal$parameters$n_combinations, + prediction_zero = p0 +) + + explanation_perm$internal$objects$X explanation_perm_paired$internal$objects$X explanation_kernel$internal$objects$X +explanation_kernel_paired$internal$objects$X + explanation_perm$internal$output$dt_vS explanation_perm_paired$internal$output$dt_vS explanation_kernel$internal$output$dt_vS +explanation_kernel_paired$internal$output$dt_vS explanation_perm$shapley_values explanation_perm_paired$shapley_values - explanation_kernel$shapley_values +explanation_kernel_paired$shapley_values + explanation_perm$timing explanation_perm_paired$timing - explanation_kernel$timing #### Some TODO notes: -# Implement antithetic sampling for both permutation and kernel method -# run basic simulations to show improved convergence vs kernel method (based on same n_combinations) +# double check the results with a better simulation study to show improved convergence +# using permutation with paired sampling vs kernel method w/wo paired sampling (based on same n_combinations) # if the tests shows improved convergence, the setup and computation code needs increased efficiency From 3c53992b664dfe743591ab40f7c1db23fcd62f37 Mon Sep 17 00:00:00 2001 From: Martin Date: Wed, 13 Dec 2023 10:01:32 +0100 Subject: [PATCH 04/26] starting to set up the linear_gaussian explainer here --- R/explain_linear.R | 122 ++++++++++++++++++++++ R/model_setup_R.R | 12 +++ R/setup.R | 4 +- R/setup_computation.R | 11 ++ inst/scripts/devel/devel_explain_linear.R | 37 +++++++ 5 files changed, 184 insertions(+), 2 deletions(-) create mode 100644 R/explain_linear.R create mode 100644 inst/scripts/devel/devel_explain_linear.R diff --git a/R/explain_linear.R b/R/explain_linear.R new file mode 100644 index 000000000..44ce8646a --- /dev/null +++ b/R/explain_linear.R @@ -0,0 +1,122 @@ +#' Explain the output of a linear model with Shapley values +#' +#' @inheritParams explain +#' +#' @export +#' +#' @author Martin Jullum +#' +explain_linear <- function(model, + x_explain, + x_train, + paired_shap_sampling = FALSE, + prediction_zero, + n_permutations = NULL, + group = NULL, + n_samples = 1e3, + n_batches = NULL, + seed = 1, + keep_samp_for_vS = FALSE, + predict_model = NULL, + get_model_specs = NULL, + MSEv_uniform_comb_weights = TRUE, + timing = TRUE, + ...) { # ... is further arguments passed to specific approaches + + timing_list <- list( + init_time = Sys.time() + ) + + set.seed(seed) + + # Gets and check feature specs from the model + feature_specs <- get_feature_specs(get_model_specs, model) + + linear_model_coef <- get_linear_coeff(model) + + + # Sets up and organizes input parameters + # Checks the input parameters and their compatability + # Checks data/model compatability + internal <- setup( + type = "linear_gaussian", + x_train = x_train, + x_explain = x_explain, + approach = "gaussian", # always set to "gaussian" although we never really use this argument for linear_gaussian + shap_approach = "permutation", # Always use the permute shap_approach + paired_shap_sampling = paired_shap_sampling, + prediction_zero = prediction_zero, + n_combinations = NULL, # We always set the n_permutations instead + n_permutations = n_permutations, + group = group, + n_samples = n_samples, + n_batches = n_batches, + seed = seed, + keep_samp_for_vS = keep_samp_for_vS, + feature_specs = feature_specs, + MSEv_uniform_comb_weights = MSEv_uniform_comb_weights, + timing = timing, + linear_model_coef = linear_model_coef, # TODO: Make this a proper input argument in setup(). For now this is just included through ... so no checking performed + ... + ) + + timing_list$setup <- Sys.time() + + # Gets predict_model (if not passed to explain) + predict_model <- get_predict_model( + predict_model = predict_model, + model = model + ) + + #TODO Make test for the linear model, checking whether predict() gives the + # same predictions as using the coefficients + + # Checks that predict_model gives correct format + test_predict_model( + x_test = head(internal$data$x_train, 2), + predict_model = predict_model, + model = model, + internal = internal + ) + + timing_list$test_prediction <- Sys.time() + + + # Sets up the Shapley (sampling) framework and prepares the + # conditional expectation computation for the chosen approach + # Note: model and predict_model are ONLY used by the AICc-methods of approach empirical to find optimal parameters + internal <- setup_computation_linear_gaussian(internal) + + timing_list$setup_computation <- Sys.time() + + + # Compute the v(S): + # Get the samples for the conditional distributions with the specified approach + # Predict with these samples + # Perform MC integration on these to estimate the conditional expectation (v(S)) + vS_list <- compute_vS(internal, model, predict_model) + + timing_list$compute_vS <- Sys.time() + + + # Compute Shapley values based on conditional expectations (v(S)) + # Organize function output + output <- finalize_explanation( + vS_list = vS_list, + internal = internal + ) + + timing_list$shapley_computation <- Sys.time() + + if (timing == TRUE) { + output$timing <- compute_time(timing_list) + } + + # Temporary to avoid failing tests + + output$internal$objects$id_combination_mapper_dt <- NULL + output$internal$objects$cols_per_horizon <- NULL + output$internal$objects$W_list <- NULL + + return(output) +} diff --git a/R/model_setup_R.R b/R/model_setup_R.R index 87446700a..6c8a21a37 100644 --- a/R/model_setup_R.R +++ b/R/model_setup_R.R @@ -66,3 +66,15 @@ get_feature_specs <- function(get_model_specs, model) { return(feature_specs) } + + + +get_linear_coeff <- function(model){ + # Checks that model is of class 'lm' + if(class(model)[1] != "lm"){ + stop("explain_linear only supports explaining models of class 'lm'") + } + + # Extracts the coefficients from a linear model + return(model$coefficients) +} diff --git a/R/setup.R b/R/setup.R index f16e6bfa9..0de701c3d 100644 --- a/R/setup.R +++ b/R/setup.R @@ -450,8 +450,8 @@ get_parameters <- function(approach, shap_approach,paired_shap_sampling, predict } # type - if (!(type %in% c("normal", "forecast"))) { - stop("`type` must be either `normal` or `forecast`.\n") + if (!(type %in% c("normal", "forecast","linear_gaussian"))) { + stop("`type` must be either `normal`, `forecast` or `linear_gaussian`.\n") } # parameters only used for type "forecast" diff --git a/R/setup_computation.R b/R/setup_computation.R index d02b10d09..36cfc43d9 100644 --- a/R/setup_computation.R +++ b/R/setup_computation.R @@ -22,6 +22,17 @@ setup_computation <- function(internal, model, predict_model) { return(internal) } +setup_computation_linear_gaussian <- function(internal){ + + # setup the Shapley framework + internal <- shapley_setup(internal) + + # Setup for approach + internal <- setup_linear_gaussian(internal) # Not created yet + + +} + #' @keywords internal shapley_setup_forecast <- function(internal) { exact <- internal$parameters$exact diff --git a/inst/scripts/devel/devel_explain_linear.R b/inst/scripts/devel/devel_explain_linear.R new file mode 100644 index 000000000..20f37a5cc --- /dev/null +++ b/inst/scripts/devel/devel_explain_linear.R @@ -0,0 +1,37 @@ +set.seed(12345) + +data <- data.table::as.data.table(airquality) + +data_complete <- data[complete.cases(airquality), ] +data_complete <- data_complete[sample(seq_len(.N))] # Sh + +y_var_numeric <- "Ozone" + +x_var_numeric <- c("Solar.R", "Wind", "Temp", "Month", "Day") + +data_train <- head(data_complete, -3) +data_explain <- tail(data_complete, 3) + +x_train_numeric <- data_train[, ..x_var_numeric] + +x_explain_numeric <- data_explain[, ..x_var_numeric] + +lm_formula_numeric <- as.formula(paste0(y_var_numeric, " ~ ", paste0(x_var_numeric, collapse = " + "))) + +model_lm_numeric <- lm(lm_formula_numeric, data = data_complete) + +p0 <- data_train[, mean(get(y_var_numeric))] + +test <-explain(model = model_lm_numeric, + x_explain = x_explain_numeric, + x_train = x_train_numeric, + approach = "gaussian", + prediction_zero = p0) + +test2 <-explain_linear(model = model_lm_numeric, + x_explain = x_explain_numeric, + x_train = x_train_numeric, + prediction_zero = p0) + + + From 126941c983ce0ee6c4705508b8095eec0e961ef3 Mon Sep 17 00:00:00 2001 From: Martin Date: Wed, 13 Dec 2023 13:31:38 +0100 Subject: [PATCH 05/26] more work on linear_gaussian explainer --- R/explain_linear.R | 5 +++-- R/setup.R | 2 +- inst/scripts/devel/devel_explain_linear.R | 1 + 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/R/explain_linear.R b/R/explain_linear.R index 44ce8646a..f48121bd4 100644 --- a/R/explain_linear.R +++ b/R/explain_linear.R @@ -11,6 +11,7 @@ explain_linear <- function(model, x_train, paired_shap_sampling = FALSE, prediction_zero, + n_combinations = NULL, n_permutations = NULL, group = NULL, n_samples = 1e3, @@ -34,7 +35,7 @@ explain_linear <- function(model, linear_model_coef <- get_linear_coeff(model) - + null_object <- NULL # Sets up and organizes input parameters # Checks the input parameters and their compatability # Checks data/model compatability @@ -46,7 +47,7 @@ explain_linear <- function(model, shap_approach = "permutation", # Always use the permute shap_approach paired_shap_sampling = paired_shap_sampling, prediction_zero = prediction_zero, - n_combinations = NULL, # We always set the n_permutations instead + n_combinations = n_combinations, # We always set the n_permutations instead n_permutations = n_permutations, group = group, n_samples = n_samples, diff --git a/R/setup.R b/R/setup.R index 437a51971..e8b1796b2 100644 --- a/R/setup.R +++ b/R/setup.R @@ -542,7 +542,7 @@ get_parameters <- function(approach, shap_approach,paired_shap_sampling, predict # Setting exact based on n_combinations (TRUE if NULL) if(shap_approach=="permutation"){ parameters$exact <- ifelse(is.null(parameters$n_permutations), TRUE, FALSE) - parameters$n_combinations <- 3*parameters$n_permutations # TODO: Do this properly. (Temporary setting this parameter to avoid errors in the code) + #parameters$n_combinations <- 3*parameters$n_permutations # TODO: Do this properly. (Temporary setting this parameter to avoid errors in the code) } else { parameters$exact <- ifelse(is.null(parameters$n_combinations), TRUE, FALSE) } diff --git a/inst/scripts/devel/devel_explain_linear.R b/inst/scripts/devel/devel_explain_linear.R index 20f37a5cc..1c164646a 100644 --- a/inst/scripts/devel/devel_explain_linear.R +++ b/inst/scripts/devel/devel_explain_linear.R @@ -26,6 +26,7 @@ test <-explain(model = model_lm_numeric, x_explain = x_explain_numeric, x_train = x_train_numeric, approach = "gaussian", + shap_approach = "permutation", prediction_zero = p0) test2 <-explain_linear(model = model_lm_numeric, From 37bcb31000e1f4acaf5b3e1f28eb0ef81cdaf48a Mon Sep 17 00:00:00 2001 From: Martin Date: Wed, 13 Dec 2023 13:44:09 +0100 Subject: [PATCH 06/26] . --- R/linear_gaussian_funcs.R | 123 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 123 insertions(+) create mode 100644 R/linear_gaussian_funcs.R diff --git a/R/linear_gaussian_funcs.R b/R/linear_gaussian_funcs.R new file mode 100644 index 000000000..150daac91 --- /dev/null +++ b/R/linear_gaussian_funcs.R @@ -0,0 +1,123 @@ +# Here I put both the setup functios and the compute_linear_gaussian functions + + + + +#' @rdname setup_approach +#' +#' @inheritParams default_doc_explain +#' @inheritParams setup_approach.gaussian +#' +#' @export +setup_linear_gaussian <- function(internal, + gaussian.mu = NULL, + gaussian.cov_mat = NULL, ...) { + # For consistency + defaults <- mget(c("gaussian.mu", "gaussian.cov_mat")) + internal <- insert_defaults(internal, defaults) + + x_train <- internal$data$x_train + feature_specs <- internal$objects$feature_specs + + # Checking if factor features are present + if (any(feature_specs$classes == "factor")) { + factor_features <- names(which(feature_specs$classes == "factor")) + factor_approaches <- get_factor_approaches() + stop(paste0( + "The following feature(s) are factor(s): ", factor_features, ".\n", + "approach = 'linear_gaussian' does not support factor features.\n", + "Please change approach to one of ", paste0(factor_approaches, collapse = ", "), "." + )) + } + + # If gaussian.mu is not provided directly in internal list, use mean of training data + if (is.null(internal$parameters$gaussian.mu)) { + internal$parameters$gaussian.mu <- get_mu_vec(x_train) + } + + # If gaussian.cov_mat is not provided directly in internal list, use sample covariance of training data + if (is.null(internal$parameters$gaussian.cov_mat)) { + internal$parameters$gaussian.cov_mat <- get_cov_mat(x_train) + } + + return(internal) +} + + + +#' @rdname prepare_data +#' @export +prepare_data.linear_gaussian <- function(internal, index_features = NULL, ...) { + x_train <- internal$data$x_train + x_explain <- internal$data$x_explain + n_explain <- internal$parameters$n_explain + gaussian.cov_mat <- internal$parameters$gaussian.cov_mat + gaussian.mu <- internal$parameters$gaussian.mu + n_features <- internal$parameters$n_features + linear_model_coef <- internal$parameters$linear_model_coef + S <- internal$objects$S + + + X <- internal$objects$X + + x_explain0 <- as.matrix(x_explain) + dt_l <- list() + + + #TODO: Move this test to somewhere else. + # if (!is.null(index_features)) { + # stop("approach = 'linear_gaussian' can only be applied with n_batches = 1.") + # } + + # To get rid of the smallest and largest subset + S <- S[index_features,] + Sbar <- 1-S + + + # Now using the formulas form True to the model or true to the data paper: https://arxiv.org/pdf/2006.16234.pdf + # Consider whether doing this with sparse matrices is faster + PS_full <- diag(n_features) + PS <- apply(S,FUN = function(x)PS_full[which(x==1),,drop = FALSE],MARGIN = 1) + PSbar <- apply(1-S,FUN = function(x)PS_full[which(x==1),,drop = FALSE],MARGIN = 1) + + # TODO: Should do this in rcpp for speed up later + US_list <- QS_list <- QSbar_list <- list() + for (i in seq_len(nrow(S))){ + US_list[[i]] <- t(PSbar[[i]])%*%PSbar[[i]]%*%gaussian.cov_mat%*%t(PS[[i]])%*%solve(PS[[i]]%*%gaussian.cov_mat%*%t(PS[[i]])) + QS_list[[i]] <- t(PS[[i]])%*%PS[[i]] + QSbar_list[[i]] <- t(PSbar[[i]])%*%PSbar[[i]] # Make this every time as well since it is fast to do, even though it could be extracted from QS_list if it is always present. + } + + Tmu_list <- Tx_list <- list() + for (i in seq_len(nrow(S))){ + + } + + # OK, I just stop here now, as I realize I should do this in 2 different ways: + # 1. new explain function which uses permutations, then computes the Tmu and Tx above in a new function (not prepare_data) + # to then compute the Shapley values with formula (9) in the true to the model or data paper + # 2. Create a lingauss-approach which computes the v(S) with the formula in the appendix of our paper, since to compare with + # kernelSHAP, I will be needing that v(S) formula directly + # 3. I can then compare the different approaches (the 2 permutation approaches with different samplign schemes using 1, and with kernelshap using 2.) + + + for (i in seq_len(n_explain)) { + l <- lapply( + X = features, + FUN = sample_gaussian, + n_samples = n_samples, + mu = gaussian.mu, + cov_mat = gaussian.cov_mat, + m = n_features, + x_explain = x_explain0[i, , drop = FALSE] + ) + + dt_l[[i]] <- data.table::rbindlist(l, idcol = "id_combination") + dt_l[[i]][, w := 1 / n_samples] + dt_l[[i]][, id := i] + if (!is.null(index_features)) dt_l[[i]][, id_combination := index_features[id_combination]] + } + + dt <- data.table::rbindlist(dt_l, use.names = TRUE, fill = TRUE) + return(dt) +} From f2ccea2f9d2daeb77f7c4075b5e8a4885a689789 Mon Sep 17 00:00:00 2001 From: Martin Date: Fri, 12 Jan 2024 12:50:28 +0100 Subject: [PATCH 07/26] doc --- NAMESPACE | 3 + man/explain.Rd | 3 + man/explain_linear.Rd | 117 ++++++++++++++++++++++++++++++++++++ man/feature_combinations.Rd | 3 +- man/prepare_data.Rd | 6 +- man/setup.Rd | 3 + man/setup_approach.Rd | 11 +++- 7 files changed, 143 insertions(+), 3 deletions(-) create mode 100644 man/explain_linear.Rd diff --git a/NAMESPACE b/NAMESPACE index 956c374c8..7f4a215a0 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -33,6 +33,7 @@ S3method(prepare_data,ctree) S3method(prepare_data,empirical) S3method(prepare_data,gaussian) S3method(prepare_data,independence) +S3method(prepare_data,linear_gaussian) S3method(prepare_data,timeseries) S3method(print,shapr) S3method(setup_approach,categorical) @@ -49,6 +50,7 @@ export(compute_vS) export(correction_matrix_cpp) export(explain) export(explain_forecast) +export(explain_linear) export(feature_combinations) export(feature_matrix_cpp) export(finalize_explanation) @@ -67,6 +69,7 @@ export(rss_cpp) export(setup) export(setup_approach) export(setup_computation) +export(setup_linear_gaussian) export(weight_matrix_cpp) importFrom(Rcpp,sourceCpp) importFrom(data.table,":=") diff --git a/man/explain.Rd b/man/explain.Rd index e7c9deb4d..6506bd804 100644 --- a/man/explain.Rd +++ b/man/explain.Rd @@ -9,8 +9,11 @@ explain( x_explain, x_train, approach, + shap_approach = "kernel", + paired_shap_sampling = FALSE, prediction_zero, n_combinations = NULL, + n_permutations = NULL, group = NULL, n_samples = 1000, n_batches = NULL, diff --git a/man/explain_linear.Rd b/man/explain_linear.Rd new file mode 100644 index 000000000..99b4549b3 --- /dev/null +++ b/man/explain_linear.Rd @@ -0,0 +1,117 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/explain_linear.R +\name{explain_linear} +\alias{explain_linear} +\title{Explain the output of a linear model with Shapley values} +\usage{ +explain_linear( + model, + x_explain, + x_train, + paired_shap_sampling = FALSE, + prediction_zero, + n_combinations = NULL, + n_permutations = NULL, + group = NULL, + n_samples = 1000, + n_batches = NULL, + seed = 1, + keep_samp_for_vS = FALSE, + predict_model = NULL, + get_model_specs = NULL, + MSEv_uniform_comb_weights = TRUE, + timing = TRUE, + ... +) +} +\arguments{ +\item{model}{The model whose predictions we want to explain. +Run \code{\link[=get_supported_models]{get_supported_models()}} +for a table of which models \code{explain} supports natively. Unsupported models +can still be explained by passing \code{predict_model} and (optionally) \code{get_model_specs}, +see details for more information.} + +\item{x_explain}{A matrix or data.frame/data.table. +Contains the the features, whose predictions ought to be explained.} + +\item{x_train}{Matrix or data.frame/data.table. +Contains the data used to estimate the (conditional) distributions for the features +needed to properly estimate the conditional expectations in the Shapley formula.} + +\item{prediction_zero}{Numeric. +The prediction value for unseen data, i.e. an estimate of the expected prediction without conditioning on any +features. +Typically we set this value equal to the mean of the response variable in our training data, but other choices +such as the mean of the predictions in the training data are also reasonable.} + +\item{n_combinations}{Integer. +If \code{group = NULL}, \code{n_combinations} represents the number of unique feature combinations to sample. +If \code{group != NULL}, \code{n_combinations} represents the number of unique group combinations to sample. +If \code{n_combinations = NULL}, the exact method is used and all combinations are considered. +The maximum number of combinations equals \code{2^m}, where \code{m} is the number of features.} + +\item{group}{List. +If \code{NULL} regular feature wise Shapley values are computed. +If provided, group wise Shapley values are computed. \code{group} then has length equal to +the number of groups. The list element contains character vectors with the features included +in each of the different groups.} + +\item{n_samples}{Positive integer. +Indicating the maximum number of samples to use in the +Monte Carlo integration for every conditional expectation. See also details.} + +\item{n_batches}{Positive integer (or NULL). +Specifies how many batches the total number of feature combinations should be split into when calculating the +contribution function for each test observation. +The default value is NULL which uses a reasonable trade-off between RAM allocation and computation speed, +which depends on \code{approach} and \code{n_combinations}. +For models with many features, increasing the number of batches reduces the RAM allocation significantly. +This typically comes with a small increase in computation time.} + +\item{seed}{Positive integer. +Specifies the seed before any randomness based code is being run. +If \code{NULL} the seed will be inherited from the calling environment.} + +\item{keep_samp_for_vS}{Logical. +Indicates whether the samples used in the Monte Carlo estimation of v_S should be returned +(in \code{internal$output})} + +\item{predict_model}{Function. +The prediction function used when \code{model} is not natively supported. +(Run \code{\link[=get_supported_models]{get_supported_models()}} for a list of natively supported +models.) +The function must have two arguments, \code{model} and \code{newdata} which specify, respectively, the model +and a data.frame/data.table to compute predictions for. The function must give the prediction as a numeric vector. +\code{NULL} (the default) uses functions specified internally. +Can also be used to override the default function for natively supported model classes.} + +\item{get_model_specs}{Function. +An optional function for checking model/data consistency when \code{model} is not natively supported. +(Run \code{\link[=get_supported_models]{get_supported_models()}} for a list of natively supported +models.) +The function takes \code{model} as argument and provides a list with 3 elements: +\describe{ +\item{labels}{Character vector with the names of each feature.} +\item{classes}{Character vector with the classes of each features.} +\item{factor_levels}{Character vector with the levels for any categorical features.} +} +If \code{NULL} (the default) internal functions are used for natively supported model classes, and the checking is +disabled for unsupported model classes. +Can also be used to override the default function for natively supported model classes.} + +\item{MSEv_uniform_comb_weights}{Logical. If \code{TRUE} (default), then the function weights the combinations +uniformly when computing the MSEv criterion. If \code{FALSE}, then the function use the Shapley kernel weights to +weight the combinations when computing the MSEv criterion. Note that the Shapley kernel weights are replaced by the +sampling frequency when not all combinations are considered.} + +\item{timing}{Logical. +Whether the timing of the different parts of the \code{explain()} should saved in the model object.} + +\item{...}{Further arguments passed to specific approaches} +} +\description{ +Explain the output of a linear model with Shapley values +} +\author{ +Martin Jullum +} diff --git a/man/feature_combinations.Rd b/man/feature_combinations.Rd index f6b6c4220..e6635be33 100644 --- a/man/feature_combinations.Rd +++ b/man/feature_combinations.Rd @@ -9,7 +9,8 @@ feature_combinations( exact = TRUE, n_combinations = 200, weight_zero_m = 10^6, - group_num = NULL + group_num = NULL, + paired_shap_sampling = FALSE ) } \arguments{ diff --git a/man/prepare_data.Rd b/man/prepare_data.Rd index 097fef9b8..f677bc9b5 100644 --- a/man/prepare_data.Rd +++ b/man/prepare_data.Rd @@ -1,7 +1,8 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/approach.R, R/approach_categorical.R, % R/approach_copula.R, R/approach_ctree.R, R/approach_empirical.R, -% R/approach_gaussian.R, R/approach_independence.R, R/approach_timeseries.R +% R/approach_gaussian.R, R/approach_independence.R, R/approach_timeseries.R, +% R/linear_gaussian_funcs.R \name{prepare_data} \alias{prepare_data} \alias{prepare_data.categorical} @@ -11,6 +12,7 @@ \alias{prepare_data.gaussian} \alias{prepare_data.independence} \alias{prepare_data.timeseries} +\alias{prepare_data.linear_gaussian} \title{Generate data used for predictions and Monte Carlo integration} \usage{ prepare_data(internal, index_features = NULL, ...) @@ -28,6 +30,8 @@ prepare_data(internal, index_features = NULL, ...) \method{prepare_data}{independence}(internal, index_features = NULL, ...) \method{prepare_data}{timeseries}(internal, index_features = NULL, ...) + +\method{prepare_data}{linear_gaussian}(internal, index_features = NULL, ...) } \arguments{ \item{internal}{Not used.} diff --git a/man/setup.Rd b/man/setup.Rd index 45a4ef170..efbceec93 100644 --- a/man/setup.Rd +++ b/man/setup.Rd @@ -8,9 +8,12 @@ setup( x_train, x_explain, approach, + shap_approach, + paired_shap_sampling, prediction_zero, output_size = 1, n_combinations, + n_permutations, group, n_samples, n_batches, diff --git a/man/setup_approach.Rd b/man/setup_approach.Rd index 86cb164bb..d9d4b3fc3 100644 --- a/man/setup_approach.Rd +++ b/man/setup_approach.Rd @@ -1,7 +1,8 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/approach.R, R/approach_categorical.R, % R/approach_copula.R, R/approach_ctree.R, R/approach_empirical.R, -% R/approach_gaussian.R, R/approach_independence.R, R/approach_timeseries.R +% R/approach_gaussian.R, R/approach_independence.R, R/approach_timeseries.R, +% R/linear_gaussian_funcs.R \name{setup_approach} \alias{setup_approach} \alias{setup_approach.categorical} @@ -11,6 +12,7 @@ \alias{setup_approach.gaussian} \alias{setup_approach.independence} \alias{setup_approach.timeseries} +\alias{setup_linear_gaussian} \title{Set up the framework chosen approach} \usage{ setup_approach(internal, ...) @@ -57,6 +59,13 @@ setup_approach(internal, ...) timeseries.bounds = c(NULL, NULL), ... ) + +setup_linear_gaussian( + internal, + gaussian.mu = NULL, + gaussian.cov_mat = NULL, + ... +) } \arguments{ \item{internal}{Not used.} From abc11471a96820b9ab0fde4eed2c0c5b85cd3cbc Mon Sep 17 00:00:00 2001 From: Martin Date: Mon, 15 Jan 2024 08:55:14 +0100 Subject: [PATCH 08/26] bugfix --- R/setup.R | 10 ++++++---- inst/scripts/devel/devel_explain_linear.R | 2 ++ 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/R/setup.R b/R/setup.R index e8b1796b2..b960deab0 100644 --- a/R/setup.R +++ b/R/setup.R @@ -166,6 +166,7 @@ check_n_combinations <- function(internal) { n_combinations <- internal$parameters$n_combinations n_features <- internal$parameters$n_features n_groups <- internal$parameters$n_groups + shap_approach <- internal$parameters$shap_approach type <- internal$parameters$type @@ -176,12 +177,13 @@ check_n_combinations <- function(internal) { xreg <- internal$data$xreg if (!is_groupwise) { - if (n_combinations <= n_features) { + if (shap_approach=="kernel" && n_combinations <= n_features) { stop(paste0( "`n_combinations` (", n_combinations, ") has to be greater than the number of components to decompose ", " the forecast onto:\n", "`horizon` (", horizon, ") + `explain_y_lags` (", explain_y_lags, ") ", - "+ sum(`explain_xreg_lags`) (", sum(explain_xreg_lags), ").\n" + "+ sum(`explain_xreg_lags`) (", sum(explain_xreg_lags), ")\n", + "for shap_approach = 'kernel'." )) } } else { @@ -195,8 +197,8 @@ check_n_combinations <- function(internal) { } } else { if (!is_groupwise) { - if (n_combinations <= n_features) { - stop("`n_combinations` has to be greater than the number of features.") + if (shap_approach=="kernel" && n_combinations <= n_features) { + stop("`n_combinations` has to be greater than the number of features for shap_approach = 'kernel'.") } } else { if (n_combinations <= n_groups) { diff --git a/inst/scripts/devel/devel_explain_linear.R b/inst/scripts/devel/devel_explain_linear.R index 1c164646a..99420778d 100644 --- a/inst/scripts/devel/devel_explain_linear.R +++ b/inst/scripts/devel/devel_explain_linear.R @@ -29,6 +29,8 @@ test <-explain(model = model_lm_numeric, shap_approach = "permutation", prediction_zero = p0) +debugonce(explain_linear) + test2 <-explain_linear(model = model_lm_numeric, x_explain = x_explain_numeric, x_train = x_train_numeric, From 211121c2ec7153ebeba2c22ed5d7a3c334840b09 Mon Sep 17 00:00:00 2001 From: Martin Date: Mon, 15 Jan 2024 14:35:01 +0100 Subject: [PATCH 09/26] working permute version --- R/finalize_explanation.R | 7 +++-- inst/scripts/devel/devel_permute.R | 44 ++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+), 3 deletions(-) create mode 100644 inst/scripts/devel/devel_permute.R diff --git a/R/finalize_explanation.R b/R/finalize_explanation.R index de1cb4730..21fac3c74 100644 --- a/R/finalize_explanation.R +++ b/R/finalize_explanation.R @@ -118,19 +118,20 @@ get_p <- function(dt_vS, internal) { compute_shapley_permutation <- function(internal,dt_vS){ feature_names <- internal$parameters$feature_names X_perm <- internal$objects$X_perm - n_permutations <- internal$parameters$n_permutations n_features <- internal$parameters$n_features n_explain <- internal$parameters$n_explain max_id_combination <- internal$parameters$n_combinations S <- internal$objects$S phi0 <- internal$parameters$prediction_zero + n_permutations_used <- X_perm[,max(permute_id,na.rm = TRUE)] + apply_cols <- names(dt_vS)[-1] kshap <- matrix(0,ncol=n_explain,nrow=n_features) - for(i in seq(n_permutations)){ + for(i in seq(n_permutations_used)){ # Find id combinations that are permuted these_id_combs <- c(1,X_perm[permute_id==i,id_combination],max_id_combination) @@ -149,7 +150,7 @@ compute_shapley_permutation <- function(internal,dt_vS){ reordered_contribs <- as.matrix(these_contribs[reorder_vec,]) kshap <- kshap + reordered_contribs } - kshap <- kshap/n_permutations + kshap <- kshap/n_permutations_used diff --git a/inst/scripts/devel/devel_permute.R b/inst/scripts/devel/devel_permute.R new file mode 100644 index 000000000..65e8c5fd7 --- /dev/null +++ b/inst/scripts/devel/devel_permute.R @@ -0,0 +1,44 @@ +set.seed(12345) + +data <- data.table::as.data.table(airquality) + +data_complete <- data[complete.cases(airquality), ] +data_complete <- data_complete[sample(seq_len(.N))] # Sh + +y_var_numeric <- "Ozone" + +x_var_numeric <- c("Solar.R", "Wind", "Temp", "Month", "Day") + +data_train <- head(data_complete, -3) +data_explain <- tail(data_complete, 3) + +x_train_numeric <- data_train[, ..x_var_numeric] + +x_explain_numeric <- data_explain[, ..x_var_numeric] + +lm_formula_numeric <- as.formula(paste0(y_var_numeric, " ~ ", paste0(x_var_numeric, collapse = " + "))) + +model_lm_numeric <- lm(lm_formula_numeric, data = data_complete) + +p0 <- data_train[, mean(get(y_var_numeric))] + +test_kernel <-explain(model = model_lm_numeric, + x_explain = x_explain_numeric, + x_train = x_train_numeric, + approach = "empirical", + shap_approach = "kernel", + prediction_zero = p0) + +test_permute <-explain(model = model_lm_numeric, + x_explain = x_explain_numeric, + x_train = x_train_numeric, + approach = "empirical", + shap_approach = "permutation", + prediction_zero = p0) + + +test_kernel$shapley_values-test_permute$shapley_values + + + +debugonce(explain) From e7e8229d10cf11f658bd740e62966daee4d4f8dc Mon Sep 17 00:00:00 2001 From: Martin Date: Mon, 15 Jan 2024 15:23:07 +0100 Subject: [PATCH 10/26] script for testing pure permuting --- inst/scripts/devel/devel_permute.R | 42 +++++++++++++++++++++++++++++- 1 file changed, 41 insertions(+), 1 deletion(-) diff --git a/inst/scripts/devel/devel_permute.R b/inst/scripts/devel/devel_permute.R index 65e8c5fd7..2331f5aaa 100644 --- a/inst/scripts/devel/devel_permute.R +++ b/inst/scripts/devel/devel_permute.R @@ -40,5 +40,45 @@ test_permute <-explain(model = model_lm_numeric, test_kernel$shapley_values-test_permute$shapley_values +permute_list <- kernel_list <- list() +for(i in 1:10){ + set.seed(1+i) + + test_kernel <-explain(model = model_lm_numeric, + x_explain = x_explain_numeric, + x_train = x_train_numeric, + approach = "empirical", + shap_approach = "kernel", + prediction_zero = p0,n_combinations = 2^5-3,seed = i) + + + test_permute <-explain(model = model_lm_numeric, + x_explain = x_explain_numeric, + x_train = x_train_numeric, + approach = "empirical", + shap_approach = "permutation", + prediction_zero = p0,n_permutations = factorial(5)-3,seed=i) + permute_list[[i]] <- test_permute$shapley_values + kernel_list[[i]] <- test_kernel$shapley_values + + print(i) + +} + +permute_list <- lapply(permute_list,as.matrix) +kernel_list <- lapply(kernel_list,as.matrix) + +# permute is way more precise than kernel when sampling + +Reduce(`+`,permute_list)/length(permute_list) + +Reduce(`+`,kernel_list)/length(kernel_list) + +explain(model = model_lm_numeric, + x_explain = x_explain_numeric, + x_train = x_train_numeric, + approach = "empirical", + shap_approach = "kernel", + prediction_zero = p0) + -debugonce(explain) From b5953377302e680321d915c1a2fa00effaa7f012 Mon Sep 17 00:00:00 2001 From: Martin Date: Mon, 15 Jan 2024 15:23:47 +0100 Subject: [PATCH 11/26] force paired sampling and test linear_gaussian_model --- R/explain_linear.R | 6 +-- R/get_predict_model.R | 47 +++++++++++++++++++++++ inst/scripts/devel/devel_explain_linear.R | 5 ++- man/explain_linear.Rd | 1 - man/test_predict_linear_model.Rd | 31 +++++++++++++++ 5 files changed, 85 insertions(+), 5 deletions(-) create mode 100644 man/test_predict_linear_model.Rd diff --git a/R/explain_linear.R b/R/explain_linear.R index f48121bd4..dc76052e0 100644 --- a/R/explain_linear.R +++ b/R/explain_linear.R @@ -9,7 +9,6 @@ explain_linear <- function(model, x_explain, x_train, - paired_shap_sampling = FALSE, prediction_zero, n_combinations = NULL, n_permutations = NULL, @@ -45,7 +44,7 @@ explain_linear <- function(model, x_explain = x_explain, approach = "gaussian", # always set to "gaussian" although we never really use this argument for linear_gaussian shap_approach = "permutation", # Always use the permute shap_approach - paired_shap_sampling = paired_shap_sampling, + paired_shap_sampling = TRUE, # Always use paired sampling since simplified computation of the required Q and U objects requires it prediction_zero = prediction_zero, n_combinations = n_combinations, # We always set the n_permutations instead n_permutations = n_permutations, @@ -73,10 +72,11 @@ explain_linear <- function(model, # same predictions as using the coefficients # Checks that predict_model gives correct format - test_predict_model( + test_predict_linear_model( x_test = head(internal$data$x_train, 2), predict_model = predict_model, model = model, + linear_model_coef = linear_model_coef, internal = internal ) diff --git a/R/get_predict_model.R b/R/get_predict_model.R index 93577e8b9..2df4eab87 100644 --- a/R/get_predict_model.R +++ b/R/get_predict_model.R @@ -80,3 +80,50 @@ test_predict_model <- function(x_test, predict_model, model, internal) { ) } } + +#' Model testing function +#' +#' @inheritParams default_doc +#' @keywords internal +test_predict_linear_model <- function(x_test, predict_model, model,linear_model_coef, internal) { + # Tests prediction with some data + + if(class(model)[1]!="lm"){ + stop(paste0("explain_linear_gaussian is only applicable with 'model' of class 'lm'.")) + } + + tmp <- tryCatch(predict_model(model, x_test), error = errorfun) + if (class(tmp)[1] == "error") { + stop(paste0( + "The predict_model function of class `", class(model), "` is invalid.\n", + "A basic function test threw the following error:\n", as.character(tmp[[1]]) + )) + } + + + + + if (!((all(sapply(tmp, is.numeric))) && + (length(tmp) == 2 || (!is.null(dim(tmp)) && nrow(tmp) == 2 && ncol(tmp) == internal$parameters$output_size)))) { + stop( + paste0( + "The predict_model function of class `", class(model), + "` does not return a numeric output of the desired length.\n", + "See the 'Advanced usage' section of the vignette:\n", + "vignette('understanding_shapr', package = 'shapr')\n\n", + "for more information on running shapr with custom models.\n" + ) + ) + } + + manual_pred <- as.vector(cbind(1,as.matrix(x_test))%*%linear_model_coef) + + if(isFALSE(all.equal(manual_pred,tmp,check.attributes = FALSE))){ + stop( + "Prediction with the extracted model coefficients does not match the prediction with the predict_model function.\n", + "This suggests interactions, quadratic effects or other non-linearities in the model.\n", + "explain_linear_gaussian is only applicable with pure linear models.\n", + ) + } +} + diff --git a/inst/scripts/devel/devel_explain_linear.R b/inst/scripts/devel/devel_explain_linear.R index 99420778d..bc72c0989 100644 --- a/inst/scripts/devel/devel_explain_linear.R +++ b/inst/scripts/devel/devel_explain_linear.R @@ -29,12 +29,15 @@ test <-explain(model = model_lm_numeric, shap_approach = "permutation", prediction_zero = p0) + + debugonce(explain_linear) test2 <-explain_linear(model = model_lm_numeric, x_explain = x_explain_numeric, x_train = x_train_numeric, - prediction_zero = p0) + prediction_zero = p0, + n_permutations = 3) diff --git a/man/explain_linear.Rd b/man/explain_linear.Rd index 99b4549b3..82936687c 100644 --- a/man/explain_linear.Rd +++ b/man/explain_linear.Rd @@ -8,7 +8,6 @@ explain_linear( model, x_explain, x_train, - paired_shap_sampling = FALSE, prediction_zero, n_combinations = NULL, n_permutations = NULL, diff --git a/man/test_predict_linear_model.Rd b/man/test_predict_linear_model.Rd new file mode 100644 index 000000000..7067426a0 --- /dev/null +++ b/man/test_predict_linear_model.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_predict_model.R +\name{test_predict_linear_model} +\alias{test_predict_linear_model} +\title{Model testing function} +\usage{ +test_predict_linear_model( + x_test, + predict_model, + model, + linear_model_coef, + internal +) +} +\arguments{ +\item{predict_model}{Function. +The prediction function used when \code{model} is not natively supported. +See the documentation of \code{\link[=explain]{explain()}} for details.} + +\item{model}{Objects. +The model object that ought to be explained. +See the documentation of \code{\link[=explain]{explain()}} for details.} + +\item{internal}{List. +Holds all parameters, data, functions and computed objects used within \code{\link[=explain]{explain()}} +The list contains one or more of the elements \code{parameters}, \code{data}, \code{objects}, \code{output}.} +} +\description{ +Model testing function +} +\keyword{internal} From 0338ce39ea4be0d0c01bb55af697f03b39408c02 Mon Sep 17 00:00:00 2001 From: Martin Date: Mon, 15 Jan 2024 16:14:00 +0100 Subject: [PATCH 12/26] starting to setup the mapping function, just initials --- R/finalize_explanation.R | 3 -- R/linear_gaussian_funcs.R | 84 +++++++++++++++++++++++++++++---------- 2 files changed, 62 insertions(+), 25 deletions(-) diff --git a/R/finalize_explanation.R b/R/finalize_explanation.R index 21fac3c74..e91a01fb9 100644 --- a/R/finalize_explanation.R +++ b/R/finalize_explanation.R @@ -140,9 +140,6 @@ compute_shapley_permutation <- function(internal,dt_vS){ contributes_to <- apply(mapping_mat,FUN=function(x) which(x==1),MARGIN=1) reorder_vec <- order(contributes_to) - #### LOOK HERE: OK, something is off here, I think it might be the merging that is messing up the order - # I need the order to be such that the X_perm always have the smallest feature numbers first. - # Find the corresponding rows in dt_vS and get the contribution these_vS <- dt_vS[id_combination %in% these_id_combs] these_contribs <- these_vS[,lapply(.SD,diff),.SDcols=apply_cols] diff --git a/R/linear_gaussian_funcs.R b/R/linear_gaussian_funcs.R index 150daac91..172f87ce2 100644 --- a/R/linear_gaussian_funcs.R +++ b/R/linear_gaussian_funcs.R @@ -12,6 +12,19 @@ setup_linear_gaussian <- function(internal, gaussian.mu = NULL, gaussian.cov_mat = NULL, ...) { + + x_train <- internal$data$x_train + n_explain <- internal$parameters$n_explain + gaussian.cov_mat <- internal$parameters$gaussian.cov_mat + gaussian.mu <- internal$parameters$gaussian.mu + n_features <- internal$parameters$n_features + linear_model_coef <- internal$parameters$linear_model_coef + S <- internal$objects$S + X <- internal$objects$X + X_perm <- internal$objects$X_perm + + max_id_combinations <- X[,.N] + # For consistency defaults <- mget(c("gaussian.mu", "gaussian.cov_mat")) internal <- insert_defaults(internal, defaults) @@ -31,13 +44,58 @@ setup_linear_gaussian <- function(internal, } # If gaussian.mu is not provided directly in internal list, use mean of training data - if (is.null(internal$parameters$gaussian.mu)) { - internal$parameters$gaussian.mu <- get_mu_vec(x_train) + if (is.null(gaussian.mu)) { + gaussian.mu <- get_mu_vec(x_train) } # If gaussian.cov_mat is not provided directly in internal list, use sample covariance of training data - if (is.null(internal$parameters$gaussian.cov_mat)) { - internal$parameters$gaussian.cov_mat <- get_cov_mat(x_train) + if (is.null(gaussian.cov_mat)) { + gaussian.cov_mat <- get_cov_mat(x_train) + } + + # Computing the US and QS objects + + + + + # To get rid of the smallest and largest subset + S <- S[-c(1,max_id_combinations),] + Sbar <- 1-S + + + # Now using the formulas form True to the model or true to the data paper: https://arxiv.org/pdf/2006.16234.pdf + # Consider whether doing this with sparse matrices is faster + PS_full <- diag(n_features) + PS <- apply(S,FUN = function(x)PS_full[which(x==1),,drop = FALSE],MARGIN = 1) + PSbar <- apply(1-S,FUN = function(x)PS_full[which(x==1),,drop = FALSE],MARGIN = 1) + + # TODO: Should do this in rcpp for speed up later + US_list <- QS_list <- QSbar_list <- list() + for (i in seq_len(nrow(S))){ + US_list[[i]] <- t(PSbar[[i]])%*%PSbar[[i]]%*%gaussian.cov_mat%*%t(PS[[i]])%*%solve(PS[[i]]%*%gaussian.cov_mat%*%t(PS[[i]])) + QS_list[[i]] <- t(PS[[i]])%*%PS[[i]] + QSbar_list[[i]] <- t(PSbar[[i]])%*%PSbar[[i]] # Make this every time as well since it is fast to do, even though it could be extracted from QS_list if it is always present. Change this later + } + + ### Computing the Tmu and Tx objects + # Loop over each feature and create Tmu and Tx lists + # Need to do some mapping from the permutations to the SRbar, SRbar+i, SRbar+i, SRbar+i subsets, probably similar to + # That done in compute_shapley_permutation() + + Tmu_list <- Tx_list <- list() + for(j in seq_len(n_features)){ + for(i in seq(n_permutations_used)){ + + #### JUST SOME COPIED CODE BELOW. NEED TO CONTINUE FROM HERE, figuring out how to map the permutations to the subsets + + # Find id combinations that are permuted + these_id_combs <- c(1,X_perm[permute_id==i,id_combination],max_id_combination) + + # Find the feature to map the contributions to + mapping_mat <- apply(S[these_id_combs,],FUN=diff,MARGIN=2) + contributes_to <- apply(mapping_mat,FUN=function(x) which(x==1),MARGIN=1) + reorder_vec <- order(contributes_to) + } return(internal) @@ -69,24 +127,6 @@ prepare_data.linear_gaussian <- function(internal, index_features = NULL, ...) { # stop("approach = 'linear_gaussian' can only be applied with n_batches = 1.") # } - # To get rid of the smallest and largest subset - S <- S[index_features,] - Sbar <- 1-S - - - # Now using the formulas form True to the model or true to the data paper: https://arxiv.org/pdf/2006.16234.pdf - # Consider whether doing this with sparse matrices is faster - PS_full <- diag(n_features) - PS <- apply(S,FUN = function(x)PS_full[which(x==1),,drop = FALSE],MARGIN = 1) - PSbar <- apply(1-S,FUN = function(x)PS_full[which(x==1),,drop = FALSE],MARGIN = 1) - - # TODO: Should do this in rcpp for speed up later - US_list <- QS_list <- QSbar_list <- list() - for (i in seq_len(nrow(S))){ - US_list[[i]] <- t(PSbar[[i]])%*%PSbar[[i]]%*%gaussian.cov_mat%*%t(PS[[i]])%*%solve(PS[[i]]%*%gaussian.cov_mat%*%t(PS[[i]])) - QS_list[[i]] <- t(PS[[i]])%*%PS[[i]] - QSbar_list[[i]] <- t(PSbar[[i]])%*%PSbar[[i]] # Make this every time as well since it is fast to do, even though it could be extracted from QS_list if it is always present. - } Tmu_list <- Tx_list <- list() for (i in seq_len(nrow(S))){ From 0f39bd751432563cb290c478716a93c49f8eea4b Mon Sep 17 00:00:00 2001 From: Martin Date: Tue, 16 Jan 2024 10:30:35 +0100 Subject: [PATCH 13/26] more work on linear explainer --- R/explain_linear.R | 15 +---------- R/linear_gaussian_funcs.R | 52 +++++++++++++++++++++++++++------------ R/setup_computation.R | 5 ++-- 3 files changed, 40 insertions(+), 32 deletions(-) diff --git a/R/explain_linear.R b/R/explain_linear.R index dc76052e0..beede9e3d 100644 --- a/R/explain_linear.R +++ b/R/explain_linear.R @@ -90,22 +90,9 @@ explain_linear <- function(model, timing_list$setup_computation <- Sys.time() - - # Compute the v(S): - # Get the samples for the conditional distributions with the specified approach - # Predict with these samples - # Perform MC integration on these to estimate the conditional expectation (v(S)) - vS_list <- compute_vS(internal, model, predict_model) - - timing_list$compute_vS <- Sys.time() - - # Compute Shapley values based on conditional expectations (v(S)) # Organize function output - output <- finalize_explanation( - vS_list = vS_list, - internal = internal - ) + output <- comoute_shapley_linear_gaussian(internal = internal) # Not yet created timing_list$shapley_computation <- Sys.time() diff --git a/R/linear_gaussian_funcs.R b/R/linear_gaussian_funcs.R index 172f87ce2..03097c941 100644 --- a/R/linear_gaussian_funcs.R +++ b/R/linear_gaussian_funcs.R @@ -23,7 +23,7 @@ setup_linear_gaussian <- function(internal, X <- internal$objects$X X_perm <- internal$objects$X_perm - max_id_combinations <- X[,.N] + max_id_combination <- X[,.N] # For consistency defaults <- mget(c("gaussian.mu", "gaussian.cov_mat")) @@ -59,7 +59,8 @@ setup_linear_gaussian <- function(internal, # To get rid of the smallest and largest subset - S <- S[-c(1,max_id_combinations),] + #Sorg <- S # keeping the original one + #S <- S[-c(1,max_id_combination),] Sbar <- 1-S @@ -71,36 +72,55 @@ setup_linear_gaussian <- function(internal, # TODO: Should do this in rcpp for speed up later US_list <- QS_list <- QSbar_list <- list() - for (i in seq_len(nrow(S))){ - US_list[[i]] <- t(PSbar[[i]])%*%PSbar[[i]]%*%gaussian.cov_mat%*%t(PS[[i]])%*%solve(PS[[i]]%*%gaussian.cov_mat%*%t(PS[[i]])) + US_list[[1]] <- US_list[[nrow(S)]] <- QS_list[[1]] <- matrix(0,nrow = n_features,ncol = n_features) + QS_list[[nrow(S)]] <- diag(n_features) + for (i in seq(2,nrow(S)-1)){ + US_list[[i]] <- t(PSbar[[i]])%*%PSbar[[i]]%*%gaussian.cov_mat%*%t(PS[[i]])%*%solve(PS[[i]]%*%gaussian.cov_mat%*%t(PS[[i]]))%*%PS[[i]] QS_list[[i]] <- t(PS[[i]])%*%PS[[i]] - QSbar_list[[i]] <- t(PSbar[[i]])%*%PSbar[[i]] # Make this every time as well since it is fast to do, even though it could be extracted from QS_list if it is always present. Change this later +# QSbar_list[[i]] <- t(PSbar[[i]])%*%PSbar[[i]] # Make this every time as well since it is fast to do, even though it could be extracted from QS_list if it is always present. Change this later } ### Computing the Tmu and Tx objects - # Loop over each feature and create Tmu and Tx lists - # Need to do some mapping from the permutations to the SRbar, SRbar+i, SRbar+i, SRbar+i subsets, probably similar to - # That done in compute_shapley_permutation() + # A rewrite of eq (9) in the true to the model or data paper: https://arxiv.org/pdf/2006.16234.pdf shows that + # when we force paired sampling, the Tmu and Tx objects gets a simplified formula. + # Another trick is that since the full S matrix is constructed based on the permutations, + # any row in S that contains features j, will have a corresponding row that does not contain feature j. + # Thus, we can easily find all the rows that contain feature j, and those that don't to then compute Q and U differences + # without having to map the permutations to the subsets. Tmu_list <- Tx_list <- list() for(j in seq_len(n_features)){ - for(i in seq(n_permutations_used)){ - #### JUST SOME COPIED CODE BELOW. NEED TO CONTINUE FROM HERE, figuring out how to map the permutations to the subsets + includes_j <- S[,j]==1 # Find all subsets that include feature j - # Find id combinations that are permuted - these_id_combs <- c(1,X_perm[permute_id==i,id_combination],max_id_combination) + Qdiff <- Reduce("+",QS_list[includes_j])-Reduce("+",QS_list[!includes_j]) + Udiff <- Reduce("+",US_list[includes_j])-Reduce("+",US_list[!includes_j]) - # Find the feature to map the contributions to - mapping_mat <- apply(S[these_id_combs,],FUN=diff,MARGIN=2) - contributes_to <- apply(mapping_mat,FUN=function(x) which(x==1),MARGIN=1) - reorder_vec <- order(contributes_to) + Tmu_list[[j]] <- (Qdiff-Udiff) / nrow(S) + Tx_list[[j]] <- (Qdiff+Udiff) / nrow(S) } + internal$objects$US_list <- US_list + internal$objects$QS_list <- QS_list + internal$objects$Tmu_list <- Tmu_list + internal$objects$Tx_list <- Tx_list + internal$parameters$gaussian.mu <- gaussian.mu + internal$parameters$gaussian.cov_mat <- gaussian.cov_mat + return(internal) } +SRfun <- function(permute_id0=1,feature=3,max_id_combination=X[,.N]){ + perm0 <- perm_dt[permute_id==permute_id0,perm] + position <- which(perm[[1]]==feature) + + these_id_combinations <- c(1,X_perm[permute_id==permute_id0,id_combination],max_id_combination) + SR_and_SRi <- these_id_combinations[c(position,position+1)] + + return(SR_and_SRi) +} + #' @rdname prepare_data diff --git a/R/setup_computation.R b/R/setup_computation.R index d7e6b7274..f921292ca 100644 --- a/R/setup_computation.R +++ b/R/setup_computation.R @@ -27,8 +27,8 @@ setup_computation_linear_gaussian <- function(internal){ # setup the Shapley framework internal <- shapley_setup(internal) - # Setup for approach - internal <- setup_linear_gaussian(internal) # Not created yet + # Setup for the linear gaussian method: compute U, Q, Tx and Tmu + internal <- setup_linear_gaussian(internal) } @@ -162,6 +162,7 @@ shapley_setup <- function(internal) { ) X <- X_list$X internal$objects$X_perm <- X_list$X_perm + internal$objects$perm_dt <- X_list$perm_dt W <- NULL } else { From 6b05f700cd043ca4b4b2ae60657e746e57da0da7 Mon Sep 17 00:00:00 2001 From: Martin Date: Tue, 16 Jan 2024 11:34:02 +0100 Subject: [PATCH 14/26] complete, but not correct results so far --- R/explain_linear.R | 9 ++-- R/finalize_explanation.R | 59 +++++++++++++++++++++++ inst/scripts/devel/devel_explain_linear.R | 15 ++++-- 3 files changed, 72 insertions(+), 11 deletions(-) diff --git a/R/explain_linear.R b/R/explain_linear.R index beede9e3d..40b55bda9 100644 --- a/R/explain_linear.R +++ b/R/explain_linear.R @@ -83,16 +83,13 @@ explain_linear <- function(model, timing_list$test_prediction <- Sys.time() - # Sets up the Shapley (sampling) framework and prepares the - # conditional expectation computation for the chosen approach - # Note: model and predict_model are ONLY used by the AICc-methods of approach empirical to find optimal parameters + # Computes the necessary objects for the linear Gaussian approach internal <- setup_computation_linear_gaussian(internal) timing_list$setup_computation <- Sys.time() - # Compute Shapley values based on conditional expectations (v(S)) - # Organize function output - output <- comoute_shapley_linear_gaussian(internal = internal) # Not yet created + # Compute Shapley values with the linear Gaussian method + output <- compute_shapley_linear_gaussian(internal = internal) timing_list$shapley_computation <- Sys.time() diff --git a/R/finalize_explanation.R b/R/finalize_explanation.R index e91a01fb9..f4cc3ef04 100644 --- a/R/finalize_explanation.R +++ b/R/finalize_explanation.R @@ -320,3 +320,62 @@ compute_MSEv_eval_crit <- function(internal, MSEv_combination = MSEv_combination )) } + + +#' Computes the Shapley values for the linear Gaussian method +#' +#' @inherit explain +#' @inheritParams default_doc +#' @param vS_list List +#' Output from [compute_vS()] +#' +#' @export +compute_shapley_linear_gaussian <- function(internal) { + + # Inputs + mu <- internal$parameters$gaussian.mu + n_features <- internal$parameters$n_features + n_explain <- internal$parameters$n_explain + x_explain <- internal$data$x_explain + Tmu_list <- internal$objects$Tmu_list + Tx_list <- internal$objects$Tx_list + coefs <- internal$parameters$linear_model_coef + feature_names <- internal$parameters$feature_names + + # Convert inputs + beta <- coefs[-1] + x_explain_mat <- as.matrix(x_explain) + + # Get the prediction + + p <- as.numeric(coefs[1] + x_explain_mat%*%beta) + + # Compute phi0 + phi0 <- as.numeric(coefs[1]+t(beta)%*%mu) + shapley_mat <- matrix(0, nrow = n_explain, ncol = n_features) + colnames(shapley_mat) <- feature_names + + for(j in seq_len(n_features)) { + + # Consider moving the computation of the first and all but the multiplication of the second term to the pre-processing function + shapley_mat[,j] <- as.numeric(t(beta)%*%Tmu_list[[j]]%*%mu) + x_explain_mat%*%t(Tx_list[[j]])%*%beta + + } + + dt_shapley <- data.table( + none = phi0, + shapley_mat + ) + + output <- list( + shapley_values = dt_shapley, + internal = internal, + pred_explain = p + ) + attr(output, "class") <- c("shapr", "list") + + return(output) + +} + + diff --git a/inst/scripts/devel/devel_explain_linear.R b/inst/scripts/devel/devel_explain_linear.R index bc72c0989..0f8b8498c 100644 --- a/inst/scripts/devel/devel_explain_linear.R +++ b/inst/scripts/devel/devel_explain_linear.R @@ -27,17 +27,22 @@ test <-explain(model = model_lm_numeric, x_train = x_train_numeric, approach = "gaussian", shap_approach = "permutation", - prediction_zero = p0) + prediction_zero = p0,n_samples = 10^4) +test - -debugonce(explain_linear) +#debugonce(explain_linear) test2 <-explain_linear(model = model_lm_numeric, x_explain = x_explain_numeric, x_train = x_train_numeric, - prediction_zero = p0, - n_permutations = 3) + prediction_zero = p0) + +test2 + +### We don't get the same results... Investigating by looking at the v(S) we get: +test$internal$output$dt_vS +test2$internal$objects$ From 22445c7b961d0213bffdd57818b3ac4512039e5a Mon Sep 17 00:00:00 2001 From: Martin Date: Tue, 16 Jan 2024 15:02:13 +0100 Subject: [PATCH 15/26] Issue The issue seems to be incorrect weighting of the different S's. I should try to loop through the permutations within the loop instead, extract the relevant S, to then do the computation. Just to see how they are all weighted. --- R/explain_linear.R | 3 -- R/linear_gaussian_funcs.R | 15 ++++------ inst/scripts/devel/devel_explain_linear.R | 35 ++++++++++++++++++++--- 3 files changed, 37 insertions(+), 16 deletions(-) diff --git a/R/explain_linear.R b/R/explain_linear.R index 40b55bda9..2f4579997 100644 --- a/R/explain_linear.R +++ b/R/explain_linear.R @@ -68,9 +68,6 @@ explain_linear <- function(model, model = model ) - #TODO Make test for the linear model, checking whether predict() gives the - # same predictions as using the coefficients - # Checks that predict_model gives correct format test_predict_linear_model( x_test = head(internal$data$x_train, 2), diff --git a/R/linear_gaussian_funcs.R b/R/linear_gaussian_funcs.R index 03097c941..71ca6216f 100644 --- a/R/linear_gaussian_funcs.R +++ b/R/linear_gaussian_funcs.R @@ -24,6 +24,7 @@ setup_linear_gaussian <- function(internal, X_perm <- internal$objects$X_perm max_id_combination <- X[,.N] + n_permutations <- X_perm[,max(permute_id,na.rm=TRUE)] # For consistency defaults <- mget(c("gaussian.mu", "gaussian.cov_mat")) @@ -53,16 +54,11 @@ setup_linear_gaussian <- function(internal, gaussian.cov_mat <- get_cov_mat(x_train) } - # Computing the US and QS objects - - - - - # To get rid of the smallest and largest subset - #Sorg <- S # keeping the original one - #S <- S[-c(1,max_id_combination),] - Sbar <- 1-S + # Counting the number of repetitions of each row in S + id_combination_reps <- X_perm[,.N,by=id_combination] + id_combination_reps[c(1,.N),N:=n_permutations] + # Computing the US and QS objects # Now using the formulas form True to the model or true to the data paper: https://arxiv.org/pdf/2006.16234.pdf # Consider whether doing this with sparse matrices is faster @@ -92,6 +88,7 @@ setup_linear_gaussian <- function(internal, for(j in seq_len(n_features)){ includes_j <- S[,j]==1 # Find all subsets that include feature j + id_combination_reps[id_combination%in%which(includes_j),N] Qdiff <- Reduce("+",QS_list[includes_j])-Reduce("+",QS_list[!includes_j]) Udiff <- Reduce("+",US_list[includes_j])-Reduce("+",US_list[!includes_j]) diff --git a/inst/scripts/devel/devel_explain_linear.R b/inst/scripts/devel/devel_explain_linear.R index 0f8b8498c..497b6387c 100644 --- a/inst/scripts/devel/devel_explain_linear.R +++ b/inst/scripts/devel/devel_explain_linear.R @@ -7,7 +7,7 @@ data_complete <- data_complete[sample(seq_len(.N))] # Sh y_var_numeric <- "Ozone" -x_var_numeric <- c("Solar.R", "Wind", "Temp", "Month", "Day") +x_var_numeric <- c("Solar.R", "Wind", "Temp", "Month")#, "Day") data_train <- head(data_complete, -3) data_explain <- tail(data_complete, 3) @@ -27,7 +27,7 @@ test <-explain(model = model_lm_numeric, x_train = x_train_numeric, approach = "gaussian", shap_approach = "permutation", - prediction_zero = p0,n_samples = 10^4) + prediction_zero = p0)#n_samples = 10^5) test @@ -42,7 +42,34 @@ test2 ### We don't get the same results... Investigating by looking at the v(S) we get: -test$internal$output$dt_vS +test$internal$output$dt_vS[id_combination==2] + +# Look at the second id_combination (S={1}) + +test2$internal$objects$S[c(2,31),] + +QS <- test2$internal$objects$QS_list[[2]] +QSbar <- test2$internal$objects$QS_list[[31]] +US <- test2$internal$objects$US_list[[2]] +mu <- test2$internal$parameters$gaussian.mu +Sig <- test2$internal$parameters$gaussian.cov +x <- unlist(test2$internal$data$x_explain[1,]) +coefs <- test2$internal$parameters$linear_model_coef +b <- coefs[1] +beta <- coefs[-1] + +Ex <- (QSbar-US)%*%mu + (QS+US)%*%x +beta%*%Ex + b + +test$internal$output$dt_vS[id_combination==2] + +condmu_manual <- c(x[1],mu[-1] + Sig[-1,1]%*%solve(Sig[1,1])*(x[1]-mu[1])) +# Same as Ex + + +### TODO: Check also other combinations, but seems that the formula for E[x|xS] is correct, so error must be in the Tx/Tmu stuff. + + + -test2$internal$objects$ From 1d6bb940c9f46458a3aaa92daf6e6c91374a57e9 Mon Sep 17 00:00:00 2001 From: Martin Date: Tue, 16 Jan 2024 22:54:35 +0100 Subject: [PATCH 16/26] finally it works then need to find the weighting per row in S, to then make it more efficient --- R/linear_gaussian_funcs.R | 77 +++++++++++++++--- R/setup_computation.R | 4 +- inst/scripts/devel/devel_explain_linear.R | 17 +++- inst/scripts/devel/devel_explain_linear_2.R | 90 +++++++++++++++++++++ 4 files changed, 171 insertions(+), 17 deletions(-) create mode 100644 inst/scripts/devel/devel_explain_linear_2.R diff --git a/R/linear_gaussian_funcs.R b/R/linear_gaussian_funcs.R index 71ca6216f..17dd48340 100644 --- a/R/linear_gaussian_funcs.R +++ b/R/linear_gaussian_funcs.R @@ -24,7 +24,7 @@ setup_linear_gaussian <- function(internal, X_perm <- internal$objects$X_perm max_id_combination <- X[,.N] - n_permutations <- X_perm[,max(permute_id,na.rm=TRUE)] + n_permutations_used <- X_perm[,max(permute_id,na.rm=TRUE)] # For consistency defaults <- mget(c("gaussian.mu", "gaussian.cov_mat")) @@ -56,7 +56,7 @@ setup_linear_gaussian <- function(internal, # Counting the number of repetitions of each row in S id_combination_reps <- X_perm[,.N,by=id_combination] - id_combination_reps[c(1,.N),N:=n_permutations] + id_combination_reps[c(1,.N),N:=n_permutations_used] # Computing the US and QS objects @@ -84,17 +84,72 @@ setup_linear_gaussian <- function(internal, # Thus, we can easily find all the rows that contain feature j, and those that don't to then compute Q and U differences # without having to map the permutations to the subsets. + perm_dt <- internal$objects$perm_dt + Tmu_list <- Tx_list <- list() + these_id_combinations_mat <- list() for(j in seq_len(n_features)){ - - includes_j <- S[,j]==1 # Find all subsets that include feature j - id_combination_reps[id_combination%in%which(includes_j),N] - - Qdiff <- Reduce("+",QS_list[includes_j])-Reduce("+",QS_list[!includes_j]) - Udiff <- Reduce("+",US_list[includes_j])-Reduce("+",US_list[!includes_j]) - - Tmu_list[[j]] <- (Qdiff-Udiff) / nrow(S) - Tx_list[[j]] <- (Qdiff+Udiff) / nrow(S) + Tmu_list[[j]] <- Tx_list[[j]] <- matrix(0,nrow = n_features,ncol = n_features) + these_id_combinations_mat[[j]] <- numeric(0) + for(i in seq(n_permutations_used)){ + + perm0 <- perm_dt[permute_id==i,perm][[1]] + + position <- which(perm0==j) + if(position==1){ + this_S <- integer(0) + this_S_plus_j <- sort(perm0[seq_len(position)]) + this_Sbar <- sort(perm0) + this_Sbar_min_j <- sort(perm0[-seq_len(position)]) + } else { + this_S <- sort(perm0[seq_len(position-1)]) + this_S_plus_j <- sort(perm0[seq_len(position)]) + this_Sbar <- sort(perm0[-seq_len(position-1)]) + this_Sbar_min_j <- sort(perm0[-seq_len(position)]) + } + + vec <- c(paste0(this_S,collapse = " "), + paste0(this_S_plus_j,collapse = " "), + paste0(this_Sbar,collapse = " "), + paste0(this_Sbar_min_j,collapse = " ")) + + merge_dt <- data.table(features_tmp=vec,id=seq_along(vec)) + + comb_dt <- merge(merge_dt,X[,.(id_combination,features_tmp)],by="features_tmp",all.x=TRUE) + data.table::setorderv(comb_dt,"id") + + these_id_combinations <- comb_dt[,id_combination] + S_id_comb <- these_id_combinations[1] + S_plus_j_id_comb <- these_id_combinations[2] + Sbar_id_comb <- these_id_combinations[3] + Sbar_min_j_id_comb <- these_id_combinations[4] + + Qdiff1 <- QS_list[[Sbar_min_j_id_comb]]-QS_list[[Sbar_id_comb]] + Qdiff2 <- QS_list[[S_plus_j_id_comb]]-QS_list[[S_id_comb]] + Udiff <- US_list[[S_plus_j_id_comb]]-US_list[[S_id_comb]] + + Tmu_list[[j]] <- Tmu_list[[j]] + (Qdiff1-Udiff)/ n_permutations_used + Tx_list[[j]] <- Tx_list[[j]]+ (Qdiff2+Udiff)/ n_permutations_used + + #Tmu_list[[j]] <- Tmu_list[[j]] + (Qdiff1-Udiff)/ n_permutations_used + #Tx_list[[j]] <- Tx_list[[j]]+ (Qdiff2-Udiff)/ n_permutations_used + + # Udiff <- Reduce("+",US_list[includes_j])-Reduce("+",US_list[!includes_j]) + # + # Tmu_list[[j]] <- (Qdiff-Udiff) / n_permutations_used + # Tx_list[[j]] <- (Qdiff+Udiff) / n_permutations_used + + these_id_combinations_mat[[j]] <- rbind(these_id_combinations_mat[[j]],these_id_combinations) + } + + # includes_j <- S[,j]==1 # Find all subsets that include feature j + # id_combination_reps[id_combination%in%which(includes_j),N] + # + # Qdiff <- Reduce("+",QS_list[includes_j])-Reduce("+",QS_list[!includes_j]) + # Udiff <- Reduce("+",US_list[includes_j])-Reduce("+",US_list[!includes_j]) + # + # Tmu_list[[j]] <- (Qdiff-Udiff) / n_permutations_used + # Tx_list[[j]] <- (Qdiff+Udiff) / n_permutations_used } diff --git a/R/setup_computation.R b/R/setup_computation.R index f921292ca..1e065753b 100644 --- a/R/setup_computation.R +++ b/R/setup_computation.R @@ -346,8 +346,8 @@ X_from_perm_dt <- function(perm_dt) { X_perm[, rowid:=NULL] X[, rowid:=NULL] - X_perm[, features_tmp := NULL] - X[, features_tmp := NULL] + #X_perm[, features_tmp := NULL] + #X[, features_tmp := NULL] setcolorder(X,c("id_combination","permute_id", "features", "n_features")) diff --git a/inst/scripts/devel/devel_explain_linear.R b/inst/scripts/devel/devel_explain_linear.R index 497b6387c..841d22a35 100644 --- a/inst/scripts/devel/devel_explain_linear.R +++ b/inst/scripts/devel/devel_explain_linear.R @@ -22,24 +22,33 @@ model_lm_numeric <- lm(lm_formula_numeric, data = data_complete) p0 <- data_train[, mean(get(y_var_numeric))] +x_explain_numeric_new <- rbind(x_explain_numeric,t(rep(0,4)),t(rep(1,4)),use.names=FALSE) + + + + + test <-explain(model = model_lm_numeric, - x_explain = x_explain_numeric, + x_explain = x_explain_numeric_new, x_train = x_train_numeric, approach = "gaussian", shap_approach = "permutation", - prediction_zero = p0)#n_samples = 10^5) + prediction_zero = p0,n_samples = 10^5) test #debugonce(explain_linear) + test2 <-explain_linear(model = model_lm_numeric, - x_explain = x_explain_numeric, + x_explain = x_explain_numeric_new, x_train = x_train_numeric, prediction_zero = p0) - test2 +test2$internal$objects$US_list + + ### We don't get the same results... Investigating by looking at the v(S) we get: test$internal$output$dt_vS[id_combination==2] diff --git a/inst/scripts/devel/devel_explain_linear_2.R b/inst/scripts/devel/devel_explain_linear_2.R new file mode 100644 index 000000000..2ae559659 --- /dev/null +++ b/inst/scripts/devel/devel_explain_linear_2.R @@ -0,0 +1,90 @@ +set.seed(12345) + +p <- 4 +n_train <- 10^3 +beta <- rep(1,4) +b <- 1 +mu <- rep(1,4) +Sig <- diag(4) + +data <- as.data.table(mvtnorm::rmvnorm(n_train,mean=mu,sigma=Sig)) + +y <- as.vector(b + as.matrix(data)%*%beta + rnorm(n_train,sd=.01)) + +model <- lm(y~., data = cbind(y,data)) + +p0 <- as.numeric(b+mu%*%beta) + +x_explain <- rbind(head(data,2), + t(rep(0,4)), + t(rep(1,4)), + t(1:4), + use.names=FALSE) + + +test <-explain(model = model, + x_explain = x_explain, + x_train = data, + approach = "gaussian", + shap_approach = "permutation", + prediction_zero = p0, + gaussian.cov_mat=Sig, + gaussian.mu = mu, + n_samples = 10^5) +test + + +#debugonce(explain_linear) + + +test2 <-explain_linear(model = model, + x_explain = x_explain, + x_train = data, + prediction_zero = p0, # this is not used + gaussian.cov_mat=Sig, + gaussian.mu=mu, +) +test2 + +test2$internal$objects$US_list +test2$internal$objects$QS_list +test2$internal$objects$Tmu_list +test2$internal$objects$Tx_list + + +### something odd, here. the tx_list is not looking the same for differnet features, although there is no differnce between them... +# Look more closely into how that is created + +### We don't get the same results... Investigating by looking at the v(S) we get: + +test$internal$output$dt_vS[id_combination==2] + +# Look at the second id_combination (S={1}) + +test2$internal$objects$S[c(2,31),] + +QS <- test2$internal$objects$QS_list[[2]] +QSbar <- test2$internal$objects$QS_list[[31]] +US <- test2$internal$objects$US_list[[2]] +mu <- test2$internal$parameters$gaussian.mu +Sig <- test2$internal$parameters$gaussian.cov +x <- unlist(test2$internal$data$x_explain[1,]) +coefs <- test2$internal$parameters$linear_model_coef +b <- coefs[1] +beta <- coefs[-1] + +Ex <- (QSbar-US)%*%mu + (QS+US)%*%x +beta%*%Ex + b + +test$internal$output$dt_vS[id_combination==2] + +condmu_manual <- c(x[1],mu[-1] + Sig[-1,1]%*%solve(Sig[1,1])*(x[1]-mu[1])) +# Same as Ex + + +### TODO: Check also other combinations, but seems that the formula for E[x|xS] is correct, so error must be in the Tx/Tmu stuff. + + + + + From 67144e39b3c14be3bafdf48ca7f157de7e7bf6a2 Mon Sep 17 00:00:00 2001 From: Martin Date: Wed, 17 Jan 2024 08:42:06 +0100 Subject: [PATCH 17/26] more output --- R/linear_gaussian_funcs.R | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/R/linear_gaussian_funcs.R b/R/linear_gaussian_funcs.R index 17dd48340..17a1e04b6 100644 --- a/R/linear_gaussian_funcs.R +++ b/R/linear_gaussian_funcs.R @@ -88,9 +88,13 @@ setup_linear_gaussian <- function(internal, Tmu_list <- Tx_list <- list() these_id_combinations_mat <- list() + tmp_lists <- list() for(j in seq_len(n_features)){ Tmu_list[[j]] <- Tx_list[[j]] <- matrix(0,nrow = n_features,ncol = n_features) these_id_combinations_mat[[j]] <- numeric(0) + tmp_lists[[j]] <- list(Qdiff1=list(), + Qdiff2=list(), + Udiff=list()) for(i in seq(n_permutations_used)){ perm0 <- perm_dt[permute_id==i,perm][[1]] @@ -140,6 +144,9 @@ setup_linear_gaussian <- function(internal, # Tx_list[[j]] <- (Qdiff+Udiff) / n_permutations_used these_id_combinations_mat[[j]] <- rbind(these_id_combinations_mat[[j]],these_id_combinations) + tmp_lists[[j]]$Qdiff1[[i]] <- Qdiff1 + tmp_lists[[j]]$Qdiff2[[i]] <- Qdiff2 + tmp_lists[[j]]$Udiff[[i]] <- Udiff } # includes_j <- S[,j]==1 # Find all subsets that include feature j @@ -157,6 +164,9 @@ setup_linear_gaussian <- function(internal, internal$objects$QS_list <- QS_list internal$objects$Tmu_list <- Tmu_list internal$objects$Tx_list <- Tx_list + internal$objects$these_id_combinations_mat <- these_id_combinations_mat + internal$objects$tmp_lists <- tmp_lists + internal$parameters$gaussian.mu <- gaussian.mu internal$parameters$gaussian.cov_mat <- gaussian.cov_mat From 4433714fdc39ff55e2c1da3c094abc87a4d6e82a Mon Sep 17 00:00:00 2001 From: Martin Date: Wed, 17 Jan 2024 11:30:57 +0100 Subject: [PATCH 18/26] starting to build up separate X_from_perm_dt_linear_gaussian function --- R/setup_computation.R | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/R/setup_computation.R b/R/setup_computation.R index 1e065753b..698d2f2c6 100644 --- a/R/setup_computation.R +++ b/R/setup_computation.R @@ -303,6 +303,43 @@ all_permutations <- function(x) { } } +X_from_perm_dt_linear_gaussian <- function(perm_dt){ + m <- length(perm_dt[["perm"]][[1]]) + + perm_list<- list() + for (j in seq_len(m)){ + perm_list[[j]] <- list() + } + for(i in 1:nrow(perm_dt)){ + perm0 <- perm_dt[i, perm][[1]] + for (j in seq_len(m)){ + position <- which(perm0==j) + if(position==1){ + perm_list[[j]][[i]] <- numeric(0) + } else { + perm_list[[j]][[i]] <- sort(perm0[seq_len(position-1)]) + } + } + } + + X_perm_list <- list() + for (j in seq_len(m)){ + X_perm_list[[j]] <- data.table(n_features = sapply(perm_list[[j]], length)) + X_perm_list[[j]][, n_features := as.integer(n_features)] + X_perm_list[[j]][, features := perm_list[[j]]] + X_perm_list[[j]][, permute_id := seq_len(nrow(perm_dt))] + data.table::setkeyv(X_perm_list[[j]], c("n_features","permute_id")) + X_perm_list[[j]][!is.finite(permute_id), permute_id := NA] + + # Temporary create a character string to merge back on below (cannot merge on lists) + X_perm_list[[j]][, features_tmp := sapply(features, paste, collapse = " ")] + X_perm_list[[j]][, rowid:=.I] + X_perm_list[[j]][, feature_contrib:=j] + + } + +} + X_from_perm_dt <- function(perm_dt) { m <- length(perm_dt[["perm"]][[1]]) From c2661edaaa5d8fa97d8e7f54e7b6f31bfeabe320 Mon Sep 17 00:00:00 2001 From: Martin Date: Wed, 17 Jan 2024 19:49:32 +0100 Subject: [PATCH 19/26] more on separate X mapper for linear gaussian --- R/setup_computation.R | 42 +++++++++++++++++------ inst/scripts/devel/devel_explain_linear.R | 2 +- 2 files changed, 32 insertions(+), 12 deletions(-) diff --git a/R/setup_computation.R b/R/setup_computation.R index 698d2f2c6..a44c05ce0 100644 --- a/R/setup_computation.R +++ b/R/setup_computation.R @@ -234,7 +234,8 @@ feature_combinations_perm <- function(m, exact = TRUE, n_permutations = NULL,pai perm_dt <- feature_permute_samp(m, n_permutations,paired_shap_sampling) } - ret <- X_from_perm_dt(perm_dt) + #ret <- X_from_perm_dt(perm_dt) + ret <- X_from_perm_dt_linear_gaussian(perm_dt) # May use this also for the regular permutation approach, to simplify the computation in the end but need to check that first ret$perm_dt = perm_dt @@ -306,19 +307,15 @@ all_permutations <- function(x) { X_from_perm_dt_linear_gaussian <- function(perm_dt){ m <- length(perm_dt[["perm"]][[1]]) - perm_list<- list() + perm_list<- list() # The perm_list includes, for all features, the S+j sets that are mapped from the permutations. The first element is the empty set to also include that for (j in seq_len(m)){ - perm_list[[j]] <- list() + perm_list[[j]] <- list(numeric(0)) } - for(i in 1:nrow(perm_dt)){ + for(i in seq_len(nrow(perm_dt))){ perm0 <- perm_dt[i, perm][[1]] for (j in seq_len(m)){ position <- which(perm0==j) - if(position==1){ - perm_list[[j]][[i]] <- numeric(0) - } else { - perm_list[[j]][[i]] <- sort(perm0[seq_len(position-1)]) - } + perm_list[[j]][[i+1]] <- sort(perm0[seq_len(position)]) } } @@ -327,17 +324,40 @@ X_from_perm_dt_linear_gaussian <- function(perm_dt){ X_perm_list[[j]] <- data.table(n_features = sapply(perm_list[[j]], length)) X_perm_list[[j]][, n_features := as.integer(n_features)] X_perm_list[[j]][, features := perm_list[[j]]] - X_perm_list[[j]][, permute_id := seq_len(nrow(perm_dt))] + X_perm_list[[j]][, permute_id := c(NA,seq_len(nrow(perm_dt)))] data.table::setkeyv(X_perm_list[[j]], c("n_features","permute_id")) X_perm_list[[j]][!is.finite(permute_id), permute_id := NA] # Temporary create a character string to merge back on below (cannot merge on lists) X_perm_list[[j]][, features_tmp := sapply(features, paste, collapse = " ")] - X_perm_list[[j]][, rowid:=.I] X_perm_list[[j]][, feature_contrib:=j] } + + X_perm <- rbindlist(X_perm_list) + data.table::setkeyv(X_perm, c("n_features","permute_id")) + + X_perm[,is_duplicate:=duplicated(features_tmp)] + X_perm[, rowid:=.I] + X <- copy(X_perm[is_duplicate==FALSE]) + X[,id_combination:=.I] + X[, is_duplicate := NULL] + + # Merge back to get the id_combination mapping also in X_perm + X_perm <- merge(X_perm,X[,.(features_tmp,id_combination)],by="features_tmp",all.x=TRUE) + data.table::setorderv(X_perm, "rowid") + X_perm[, rowid:=NULL] + X[, rowid:=NULL] + + #X_perm[, features_tmp := NULL] + #X[, features_tmp := NULL] + + setcolorder(X,c("id_combination","permute_id", "features", "n_features")) + + + ret <- list(X=X[],X_perm=X_perm[]) + } diff --git a/inst/scripts/devel/devel_explain_linear.R b/inst/scripts/devel/devel_explain_linear.R index 841d22a35..cdc359bc6 100644 --- a/inst/scripts/devel/devel_explain_linear.R +++ b/inst/scripts/devel/devel_explain_linear.R @@ -43,7 +43,7 @@ test test2 <-explain_linear(model = model_lm_numeric, x_explain = x_explain_numeric_new, x_train = x_train_numeric, - prediction_zero = p0) + prediction_zero = p0,n_permutations=4) test2 test2$internal$objects$US_list From db68a521fd99728c4d5e9435de176d90c241952e Mon Sep 17 00:00:00 2001 From: Martin Date: Wed, 17 Jan 2024 22:06:21 +0100 Subject: [PATCH 20/26] removing the Q computation to simplify + revert X_for_lin_mod Will simplify it all creating a function which computes Udiffs for a list of perms (perm_dt) instead of pre-computing stuff and extracting them. --- R/linear_gaussian_funcs.R | 12 ++-- R/setup_computation.R | 4 +- inst/scripts/devel/devel_explain_linear.R | 27 +++++++-- inst/scripts/devel/devel_explain_linear_2.R | 65 ++++++++++++++------- 4 files changed, 77 insertions(+), 31 deletions(-) diff --git a/R/linear_gaussian_funcs.R b/R/linear_gaussian_funcs.R index 17a1e04b6..1c40a715a 100644 --- a/R/linear_gaussian_funcs.R +++ b/R/linear_gaussian_funcs.R @@ -91,10 +91,14 @@ setup_linear_gaussian <- function(internal, tmp_lists <- list() for(j in seq_len(n_features)){ Tmu_list[[j]] <- Tx_list[[j]] <- matrix(0,nrow = n_features,ncol = n_features) + Tmu_list[[j]][j,j] <- -1 + Tx_list[[j]][j,j] <- 1 + these_id_combinations_mat[[j]] <- numeric(0) tmp_lists[[j]] <- list(Qdiff1=list(), Qdiff2=list(), Udiff=list()) + for(i in seq(n_permutations_used)){ perm0 <- perm_dt[permute_id==i,perm][[1]] @@ -128,12 +132,12 @@ setup_linear_gaussian <- function(internal, Sbar_id_comb <- these_id_combinations[3] Sbar_min_j_id_comb <- these_id_combinations[4] - Qdiff1 <- QS_list[[Sbar_min_j_id_comb]]-QS_list[[Sbar_id_comb]] - Qdiff2 <- QS_list[[S_plus_j_id_comb]]-QS_list[[S_id_comb]] + #Qdiff1 <- QS_list[[Sbar_min_j_id_comb]]-QS_list[[Sbar_id_comb]] + #Qdiff2 <- QS_list[[S_plus_j_id_comb]]-QS_list[[S_id_comb]] Udiff <- US_list[[S_plus_j_id_comb]]-US_list[[S_id_comb]] - Tmu_list[[j]] <- Tmu_list[[j]] + (Qdiff1-Udiff)/ n_permutations_used - Tx_list[[j]] <- Tx_list[[j]]+ (Qdiff2+Udiff)/ n_permutations_used + Tmu_list[[j]] <- Tmu_list[[j]] -Udiff/ n_permutations_used + Tx_list[[j]] <- Tx_list[[j]] +Udiff/ n_permutations_used #Tmu_list[[j]] <- Tmu_list[[j]] + (Qdiff1-Udiff)/ n_permutations_used #Tx_list[[j]] <- Tx_list[[j]]+ (Qdiff2-Udiff)/ n_permutations_used diff --git a/R/setup_computation.R b/R/setup_computation.R index a44c05ce0..407851456 100644 --- a/R/setup_computation.R +++ b/R/setup_computation.R @@ -234,8 +234,8 @@ feature_combinations_perm <- function(m, exact = TRUE, n_permutations = NULL,pai perm_dt <- feature_permute_samp(m, n_permutations,paired_shap_sampling) } - #ret <- X_from_perm_dt(perm_dt) - ret <- X_from_perm_dt_linear_gaussian(perm_dt) # May use this also for the regular permutation approach, to simplify the computation in the end but need to check that first + ret <- X_from_perm_dt(perm_dt) + #ret <- X_from_perm_dt_linear_gaussian(perm_dt) # May use this also for the regular permutation approach, to simplify the computation in the end but need to check that first ret$perm_dt = perm_dt diff --git a/inst/scripts/devel/devel_explain_linear.R b/inst/scripts/devel/devel_explain_linear.R index cdc359bc6..2ee41a214 100644 --- a/inst/scripts/devel/devel_explain_linear.R +++ b/inst/scripts/devel/devel_explain_linear.R @@ -7,7 +7,7 @@ data_complete <- data_complete[sample(seq_len(.N))] # Sh y_var_numeric <- "Ozone" -x_var_numeric <- c("Solar.R", "Wind", "Temp", "Month")#, "Day") +x_var_numeric <- c("Solar.R", "Wind", "Temp", "Month", "Day") data_train <- head(data_complete, -3) data_explain <- tail(data_complete, 3) @@ -22,7 +22,7 @@ model_lm_numeric <- lm(lm_formula_numeric, data = data_complete) p0 <- data_train[, mean(get(y_var_numeric))] -x_explain_numeric_new <- rbind(x_explain_numeric,t(rep(0,4)),t(rep(1,4)),use.names=FALSE) +x_explain_numeric_new <- rbind(x_explain_numeric,t(rep(0,length(x_var_numeric))),t(rep(1,length(x_var_numeric))),use.names=FALSE) @@ -33,8 +33,15 @@ test <-explain(model = model_lm_numeric, x_train = x_train_numeric, approach = "gaussian", shap_approach = "permutation", - prediction_zero = p0,n_samples = 10^5) + prediction_zero = p0,n_samples = 10^4) test +#none Solar.R Wind Temp Month +#1: 42.44 -8.379 7.944 15.29533 0.6522 +#2: 42.44 4.919 -4.755 -11.00089 -1.0133 +#3: 42.44 7.302 -25.612 -0.08335 -0.5415 +#4: 42.44 -12.371 72.269 -154.31630 -6.0802 +#5: 42.44 -11.922 67.037 -153.72291 -6.2782 + #debugonce(explain_linear) @@ -43,10 +50,20 @@ test test2 <-explain_linear(model = model_lm_numeric, x_explain = x_explain_numeric_new, x_train = x_train_numeric, - prediction_zero = p0,n_permutations=4) + prediction_zero = p0, + n_permutations=4, + ) test2 -test2$internal$objects$US_list +test2 <-explain_linear(model = model_lm_numeric, + x_explain = x_explain_numeric_new, + x_train = x_train_numeric, + prediction_zero = p0, +) +test2 + + +unique(test2$internal$objects$US_list) ### We don't get the same results... Investigating by looking at the v(S) we get: diff --git a/inst/scripts/devel/devel_explain_linear_2.R b/inst/scripts/devel/devel_explain_linear_2.R index 2ae559659..13e08625e 100644 --- a/inst/scripts/devel/devel_explain_linear_2.R +++ b/inst/scripts/devel/devel_explain_linear_2.R @@ -2,10 +2,15 @@ set.seed(12345) p <- 4 n_train <- 10^3 -beta <- rep(1,4) -b <- 1 -mu <- rep(1,4) -Sig <- diag(4) +beta <- rnorm(p)#rep(1,p) +b <- rnorm(1) +mu <- rnorm(p)#rep(1,p) + +# Random covariance matrix +A <- matrix(runif(p^2,min = -1,max=1), ncol=p) +Sig <- t(A) %*% A +#Sig <- diag(4) + data <- as.data.table(mvtnorm::rmvnorm(n_train,mean=mu,sigma=Sig)) @@ -16,23 +21,22 @@ model <- lm(y~., data = cbind(y,data)) p0 <- as.numeric(b+mu%*%beta) x_explain <- rbind(head(data,2), - t(rep(0,4)), - t(rep(1,4)), - t(1:4), + t(rep(0,p)), + t(rep(1,p)), + t(1:p), use.names=FALSE) -test <-explain(model = model, - x_explain = x_explain, - x_train = data, - approach = "gaussian", - shap_approach = "permutation", - prediction_zero = p0, - gaussian.cov_mat=Sig, - gaussian.mu = mu, - n_samples = 10^5) -test - +# test <-explain(model = model, +# x_explain = x_explain, +# x_train = data, +# approach = "gaussian", +# shap_approach = "permutation", +# prediction_zero = p0, +# gaussian.cov_mat=Sig, +# gaussian.mu = mu, +# n_samples = 10^4) +# test #debugonce(explain_linear) @@ -42,9 +46,30 @@ test2 <-explain_linear(model = model, x_train = data, prediction_zero = p0, # this is not used gaussian.cov_mat=Sig, - gaussian.mu=mu, + gaussian.mu=mu,n_permutations = 6 ) -test2 +test2$internal$objects$perm_dt[,.N] + +all_lists <- NULL +for(i in 1:p){ + all_lists <- c(all_lists,test2$internal$objects$tmp_lists[[i]]$Udiff) +} +length(unique(all_lists)) + +test2$internal$objects$tmp_lists[[1]]$Udiff[[1]]+test2$internal$objects$tmp_lists[[1]]$Udiff[[2]] +test2$internal$objects$tmp_lists[[1]]$Udiff[[3]]+test2$internal$objects$tmp_lists[[1]]$Udiff[[4]] +test2$internal$objects$tmp_lists[[1]]$Udiff[[5]]+test2$internal$objects$tmp_lists[[1]]$Udiff[[6]] + +test2$internal$objects$tmp_lists[[2]]$Udiff[[1]]+test2$internal$objects$tmp_lists[[2]]$Udiff[[2]] +test2$internal$objects$tmp_lists[[2]]$Udiff[[3]]+test2$internal$objects$tmp_lists[[2]]$Udiff[[4]] +test2$internal$objects$tmp_lists[[2]]$Udiff[[5]]+test2$internal$objects$tmp_lists[[2]]$Udiff[[6]] + + + +length(unique(test2$internal$objects$tmp_lists[[1]]$Udiff)) +length(unique(test2$internal$objects$tmp_lists[[2]]$Udiff)) +length(unique(test2$internal$objects$tmp_lists[[3]]$Udiff)) +length(unique(test2$internal$objects$tmp_lists[[4]]$Udiff)) test2$internal$objects$US_list test2$internal$objects$QS_list From ac7c5b09db1ac1f3c6cc7aac926fc7c1857c5339 Mon Sep 17 00:00:00 2001 From: Martin Date: Thu, 18 Jan 2024 11:01:06 +0100 Subject: [PATCH 21/26] starting to write up direct Ucomputation function --- R/linear_gaussian_funcs.R | 215 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 215 insertions(+) diff --git a/R/linear_gaussian_funcs.R b/R/linear_gaussian_funcs.R index 1c40a715a..fc2733e7f 100644 --- a/R/linear_gaussian_funcs.R +++ b/R/linear_gaussian_funcs.R @@ -1,6 +1,221 @@ # Here I put both the setup functios and the compute_linear_gaussian functions +#' @rdname setup_approach +#' +#' @inheritParams default_doc_explain +#' @inheritParams setup_approach.gaussian +#' +#' @export +setup_linear_gaussian_new <- function(internal, + gaussian.mu = NULL, + gaussian.cov_mat = NULL, ...) { + + x_train <- internal$data$x_train + n_explain <- internal$parameters$n_explain + gaussian.cov_mat <- internal$parameters$gaussian.cov_mat + gaussian.mu <- internal$parameters$gaussian.mu + n_features <- internal$parameters$n_features + linear_model_coef <- internal$parameters$linear_model_coef + perm_dt <- internal$objects$perm_dt + + n_permutations_used <- perm_dt[,.N] + + # For consistency + defaults <- mget(c("gaussian.mu", "gaussian.cov_mat")) + internal <- insert_defaults(internal, defaults) + + x_train <- internal$data$x_train + feature_specs <- internal$objects$feature_specs + + # Checking if factor features are present + if (any(feature_specs$classes == "factor")) { + factor_features <- names(which(feature_specs$classes == "factor")) + factor_approaches <- get_factor_approaches() + stop(paste0( + "The following feature(s) are factor(s): ", factor_features, ".\n", + "approach = 'linear_gaussian' does not support factor features.\n", + "Please change approach to one of ", paste0(factor_approaches, collapse = ", "), "." + )) + } + + # If gaussian.mu is not provided directly in internal list, use mean of training data + if (is.null(gaussian.mu)) { + gaussian.mu <- get_mu_vec(x_train) + } + + # If gaussian.cov_mat is not provided directly in internal list, use sample covariance of training data + if (is.null(gaussian.cov_mat)) { + gaussian.cov_mat <- get_cov_mat(x_train) + } + + Tmu_list <- Tx_list <- list() + for(j in 1:n_features){ + Tmu_list[[j]] <- Tx_list[[j]] <- matrix(0,nrow = n_features,ncol = n_features) + Tmu_list[[j]][j,j] <- -1 + Tx_list[[j]][j,j] <- 1 + + for(i in seq_len(n_permutations_used)){ + perm0 <- perm_dt[,perm][[i]] + + position <- which(perm0==j) + PS <- PSj <- diag(n_features)*0 + + if(position==1){ + diag(PSj)[j] <- 1 + } else { + diag(PS)[perm0[seq_len(position-1)]] <- 1 + diag(PSj)[perm0[seq_len(position)]] <- 1 + } + + PS_bar <- diag(n_features)-PS + PSj_bar <- diag(n_features)-PSj + + + U_S <- t(PS_bar)%*%PS_bar%*%gaussian.cov_mat%*%t(PS)%*%solve(PS%*%gaussian.cov_mat%*%t(PS))%*%PS + U_Sj <- t(PSj_bar)%*%PSj_bar%*%gaussian.cov_mat%*%t(PSj)%*%solve(PSj%*%gaussian.cov_mat%*%t(PSj))%*%PSj + + # Could compute these here as well, and then DO not use paired sampling, but I do assume the U-computation is the most expensive part anyway, so will not do it here now. + #U_Sbar <- t(PS)%*%PS%*%gaussian.cov_mat%*%t(PS_bar)%*%solve(PS_bar%*%gaussian.cov_mat%*%t(PS_bar))%*%PS_bar + #U_Sj_bar <- t(PSj)%*%PSj%*%gaussian.cov_mat%*%t(PSj_bar)%*%solve(PSj_bar%*%gaussian.cov_mat%*%t(PSj_bar))%*%PSj_bar + + Udiff <- U_Sj-U_S + + Tmu_list[[j]] <- Tmu_list[[j]] -Udiff/ n_permutations_used + Tx_list[[j]] <- Tx_list[[j]] +Udiff/ n_permutations_used + } + + + } + + + + + + + + # Counting the number of repetitions of each row in S + id_combination_reps <- X_perm[,.N,by=id_combination] + id_combination_reps[c(1,.N),N:=n_permutations_used] + + # Computing the US and QS objects + + # Now using the formulas form True to the model or true to the data paper: https://arxiv.org/pdf/2006.16234.pdf + # Consider whether doing this with sparse matrices is faster + PS_full <- diag(n_features) + PS <- apply(S,FUN = function(x)PS_full[which(x==1),,drop = FALSE],MARGIN = 1) + PSbar <- apply(1-S,FUN = function(x)PS_full[which(x==1),,drop = FALSE],MARGIN = 1) + + # TODO: Should do this in rcpp for speed up later + US_list <- QS_list <- QSbar_list <- list() + US_list[[1]] <- US_list[[nrow(S)]] <- QS_list[[1]] <- matrix(0,nrow = n_features,ncol = n_features) + QS_list[[nrow(S)]] <- diag(n_features) + for (i in seq(2,nrow(S)-1)){ + US_list[[i]] <- t(PSbar[[i]])%*%PSbar[[i]]%*%gaussian.cov_mat%*%t(PS[[i]])%*%solve(PS[[i]]%*%gaussian.cov_mat%*%t(PS[[i]]))%*%PS[[i]] + QS_list[[i]] <- t(PS[[i]])%*%PS[[i]] + # QSbar_list[[i]] <- t(PSbar[[i]])%*%PSbar[[i]] # Make this every time as well since it is fast to do, even though it could be extracted from QS_list if it is always present. Change this later + } + + ### Computing the Tmu and Tx objects + # A rewrite of eq (9) in the true to the model or data paper: https://arxiv.org/pdf/2006.16234.pdf shows that + # when we force paired sampling, the Tmu and Tx objects gets a simplified formula. + # Another trick is that since the full S matrix is constructed based on the permutations, + # any row in S that contains features j, will have a corresponding row that does not contain feature j. + # Thus, we can easily find all the rows that contain feature j, and those that don't to then compute Q and U differences + # without having to map the permutations to the subsets. + + perm_dt <- internal$objects$perm_dt + + Tmu_list <- Tx_list <- list() + these_id_combinations_mat <- list() + tmp_lists <- list() + for(j in seq_len(n_features)){ + Tmu_list[[j]] <- Tx_list[[j]] <- matrix(0,nrow = n_features,ncol = n_features) + Tmu_list[[j]][j,j] <- -1 + Tx_list[[j]][j,j] <- 1 + + these_id_combinations_mat[[j]] <- numeric(0) + tmp_lists[[j]] <- list(Qdiff1=list(), + Qdiff2=list(), + Udiff=list()) + + for(i in seq(n_permutations_used)){ + + perm0 <- perm_dt[permute_id==i,perm][[1]] + + position <- which(perm0==j) + if(position==1){ + this_S <- integer(0) + this_S_plus_j <- sort(perm0[seq_len(position)]) + this_Sbar <- sort(perm0) + this_Sbar_min_j <- sort(perm0[-seq_len(position)]) + } else { + this_S <- sort(perm0[seq_len(position-1)]) + this_S_plus_j <- sort(perm0[seq_len(position)]) + this_Sbar <- sort(perm0[-seq_len(position-1)]) + this_Sbar_min_j <- sort(perm0[-seq_len(position)]) + } + + vec <- c(paste0(this_S,collapse = " "), + paste0(this_S_plus_j,collapse = " "), + paste0(this_Sbar,collapse = " "), + paste0(this_Sbar_min_j,collapse = " ")) + + merge_dt <- data.table(features_tmp=vec,id=seq_along(vec)) + + comb_dt <- merge(merge_dt,X[,.(id_combination,features_tmp)],by="features_tmp",all.x=TRUE) + data.table::setorderv(comb_dt,"id") + + these_id_combinations <- comb_dt[,id_combination] + S_id_comb <- these_id_combinations[1] + S_plus_j_id_comb <- these_id_combinations[2] + Sbar_id_comb <- these_id_combinations[3] + Sbar_min_j_id_comb <- these_id_combinations[4] + + #Qdiff1 <- QS_list[[Sbar_min_j_id_comb]]-QS_list[[Sbar_id_comb]] + #Qdiff2 <- QS_list[[S_plus_j_id_comb]]-QS_list[[S_id_comb]] + Udiff <- US_list[[S_plus_j_id_comb]]-US_list[[S_id_comb]] + + Tmu_list[[j]] <- Tmu_list[[j]] -Udiff/ n_permutations_used + Tx_list[[j]] <- Tx_list[[j]] +Udiff/ n_permutations_used + + #Tmu_list[[j]] <- Tmu_list[[j]] + (Qdiff1-Udiff)/ n_permutations_used + #Tx_list[[j]] <- Tx_list[[j]]+ (Qdiff2-Udiff)/ n_permutations_used + + # Udiff <- Reduce("+",US_list[includes_j])-Reduce("+",US_list[!includes_j]) + # + # Tmu_list[[j]] <- (Qdiff-Udiff) / n_permutations_used + # Tx_list[[j]] <- (Qdiff+Udiff) / n_permutations_used + + these_id_combinations_mat[[j]] <- rbind(these_id_combinations_mat[[j]],these_id_combinations) + tmp_lists[[j]]$Qdiff1[[i]] <- Qdiff1 + tmp_lists[[j]]$Qdiff2[[i]] <- Qdiff2 + tmp_lists[[j]]$Udiff[[i]] <- Udiff + } + + # includes_j <- S[,j]==1 # Find all subsets that include feature j + # id_combination_reps[id_combination%in%which(includes_j),N] + # + # Qdiff <- Reduce("+",QS_list[includes_j])-Reduce("+",QS_list[!includes_j]) + # Udiff <- Reduce("+",US_list[includes_j])-Reduce("+",US_list[!includes_j]) + # + # Tmu_list[[j]] <- (Qdiff-Udiff) / n_permutations_used + # Tx_list[[j]] <- (Qdiff+Udiff) / n_permutations_used + + } + + internal$objects$US_list <- US_list + internal$objects$QS_list <- QS_list + internal$objects$Tmu_list <- Tmu_list + internal$objects$Tx_list <- Tx_list + internal$objects$these_id_combinations_mat <- these_id_combinations_mat + internal$objects$tmp_lists <- tmp_lists + + internal$parameters$gaussian.mu <- gaussian.mu + internal$parameters$gaussian.cov_mat <- gaussian.cov_mat + + return(internal) +} #' @rdname setup_approach From 323fd6ebfe91c32c3b8611a74758d8a2a69b82e4 Mon Sep 17 00:00:00 2001 From: Martin Date: Thu, 18 Jan 2024 14:01:28 +0100 Subject: [PATCH 22/26] implemented working direct approach --- R/linear_gaussian_funcs.R | 163 ++++---------------- R/setup_computation.R | 4 +- inst/scripts/devel/devel_explain_linear.R | 2 +- inst/scripts/devel/devel_explain_linear_2.R | 4 +- 4 files changed, 34 insertions(+), 139 deletions(-) diff --git a/R/linear_gaussian_funcs.R b/R/linear_gaussian_funcs.R index fc2733e7f..e693ae0bb 100644 --- a/R/linear_gaussian_funcs.R +++ b/R/linear_gaussian_funcs.R @@ -59,157 +59,52 @@ setup_linear_gaussian_new <- function(internal, perm0 <- perm_dt[,perm][[i]] position <- which(perm0==j) - PS <- PSj <- diag(n_features)*0 + PSfull <- diag(n_features) + PS <- PSj <- diag(0,n_features) if(position==1){ - diag(PSj)[j] <- 1 + #U_S <- diag(0,n_features) + +# this_Sj <- perm0[1] +# this_Sj_bar <- perm0[-1] + + PSj <- PSfull[j,,drop=FALSE] + PSj_bar <- PSfull[-j,,drop=FALSE] + +# U_Sj <- t(PSj_bar)%*%PSj_bar%*%gaussian.cov_mat%*%t(PSj)%*%solve(PSj%*%gaussian.cov_mat%*%t(PSj))%*%PSj + + Udiff <- t(PSj_bar)%*%PSj_bar%*%gaussian.cov_mat%*%t(PSj)%*%solve(PSj%*%gaussian.cov_mat%*%t(PSj))%*%PSj + } else { - diag(PS)[perm0[seq_len(position-1)]] <- 1 - diag(PSj)[perm0[seq_len(position)]] <- 1 - } + this_S <- sort(perm0[seq_len(position-1)]) + this_Sj <- sort(perm0[seq_len(position)]) + this_S_bar <- sort(perm0[-seq_len(position-1)]) + this_Sj_bar <- sort(perm0[-seq_len(position)]) - PS_bar <- diag(n_features)-PS - PSj_bar <- diag(n_features)-PSj + PS <- PSfull[this_S,,drop=FALSE] + PSj <- PSfull[this_Sj,,drop=FALSE] + PS_bar <- PSfull[this_S_bar,,drop=FALSE] + PSj_bar <- PSfull[this_Sj_bar,,drop=FALSE] + U_S <- t(PS_bar)%*%PS_bar%*%gaussian.cov_mat%*%t(PS)%*%solve(PS%*%gaussian.cov_mat%*%t(PS))%*%PS + U_Sj <- t(PSj_bar)%*%PSj_bar%*%gaussian.cov_mat%*%t(PSj)%*%solve(PSj%*%gaussian.cov_mat%*%t(PSj))%*%PSj + Udiff <- U_Sj-U_S + + } - U_S <- t(PS_bar)%*%PS_bar%*%gaussian.cov_mat%*%t(PS)%*%solve(PS%*%gaussian.cov_mat%*%t(PS))%*%PS - U_Sj <- t(PSj_bar)%*%PSj_bar%*%gaussian.cov_mat%*%t(PSj)%*%solve(PSj%*%gaussian.cov_mat%*%t(PSj))%*%PSj # Could compute these here as well, and then DO not use paired sampling, but I do assume the U-computation is the most expensive part anyway, so will not do it here now. #U_Sbar <- t(PS)%*%PS%*%gaussian.cov_mat%*%t(PS_bar)%*%solve(PS_bar%*%gaussian.cov_mat%*%t(PS_bar))%*%PS_bar #U_Sj_bar <- t(PSj)%*%PSj%*%gaussian.cov_mat%*%t(PSj_bar)%*%solve(PSj_bar%*%gaussian.cov_mat%*%t(PSj_bar))%*%PSj_bar - Udiff <- U_Sj-U_S Tmu_list[[j]] <- Tmu_list[[j]] -Udiff/ n_permutations_used Tx_list[[j]] <- Tx_list[[j]] +Udiff/ n_permutations_used } - - - } - - - - - - - - # Counting the number of repetitions of each row in S - id_combination_reps <- X_perm[,.N,by=id_combination] - id_combination_reps[c(1,.N),N:=n_permutations_used] - - # Computing the US and QS objects - - # Now using the formulas form True to the model or true to the data paper: https://arxiv.org/pdf/2006.16234.pdf - # Consider whether doing this with sparse matrices is faster - PS_full <- diag(n_features) - PS <- apply(S,FUN = function(x)PS_full[which(x==1),,drop = FALSE],MARGIN = 1) - PSbar <- apply(1-S,FUN = function(x)PS_full[which(x==1),,drop = FALSE],MARGIN = 1) - - # TODO: Should do this in rcpp for speed up later - US_list <- QS_list <- QSbar_list <- list() - US_list[[1]] <- US_list[[nrow(S)]] <- QS_list[[1]] <- matrix(0,nrow = n_features,ncol = n_features) - QS_list[[nrow(S)]] <- diag(n_features) - for (i in seq(2,nrow(S)-1)){ - US_list[[i]] <- t(PSbar[[i]])%*%PSbar[[i]]%*%gaussian.cov_mat%*%t(PS[[i]])%*%solve(PS[[i]]%*%gaussian.cov_mat%*%t(PS[[i]]))%*%PS[[i]] - QS_list[[i]] <- t(PS[[i]])%*%PS[[i]] - # QSbar_list[[i]] <- t(PSbar[[i]])%*%PSbar[[i]] # Make this every time as well since it is fast to do, even though it could be extracted from QS_list if it is always present. Change this later } - ### Computing the Tmu and Tx objects - # A rewrite of eq (9) in the true to the model or data paper: https://arxiv.org/pdf/2006.16234.pdf shows that - # when we force paired sampling, the Tmu and Tx objects gets a simplified formula. - # Another trick is that since the full S matrix is constructed based on the permutations, - # any row in S that contains features j, will have a corresponding row that does not contain feature j. - # Thus, we can easily find all the rows that contain feature j, and those that don't to then compute Q and U differences - # without having to map the permutations to the subsets. - - perm_dt <- internal$objects$perm_dt - - Tmu_list <- Tx_list <- list() - these_id_combinations_mat <- list() - tmp_lists <- list() - for(j in seq_len(n_features)){ - Tmu_list[[j]] <- Tx_list[[j]] <- matrix(0,nrow = n_features,ncol = n_features) - Tmu_list[[j]][j,j] <- -1 - Tx_list[[j]][j,j] <- 1 - - these_id_combinations_mat[[j]] <- numeric(0) - tmp_lists[[j]] <- list(Qdiff1=list(), - Qdiff2=list(), - Udiff=list()) - - for(i in seq(n_permutations_used)){ - - perm0 <- perm_dt[permute_id==i,perm][[1]] - - position <- which(perm0==j) - if(position==1){ - this_S <- integer(0) - this_S_plus_j <- sort(perm0[seq_len(position)]) - this_Sbar <- sort(perm0) - this_Sbar_min_j <- sort(perm0[-seq_len(position)]) - } else { - this_S <- sort(perm0[seq_len(position-1)]) - this_S_plus_j <- sort(perm0[seq_len(position)]) - this_Sbar <- sort(perm0[-seq_len(position-1)]) - this_Sbar_min_j <- sort(perm0[-seq_len(position)]) - } - - vec <- c(paste0(this_S,collapse = " "), - paste0(this_S_plus_j,collapse = " "), - paste0(this_Sbar,collapse = " "), - paste0(this_Sbar_min_j,collapse = " ")) - - merge_dt <- data.table(features_tmp=vec,id=seq_along(vec)) - - comb_dt <- merge(merge_dt,X[,.(id_combination,features_tmp)],by="features_tmp",all.x=TRUE) - data.table::setorderv(comb_dt,"id") - - these_id_combinations <- comb_dt[,id_combination] - S_id_comb <- these_id_combinations[1] - S_plus_j_id_comb <- these_id_combinations[2] - Sbar_id_comb <- these_id_combinations[3] - Sbar_min_j_id_comb <- these_id_combinations[4] - - #Qdiff1 <- QS_list[[Sbar_min_j_id_comb]]-QS_list[[Sbar_id_comb]] - #Qdiff2 <- QS_list[[S_plus_j_id_comb]]-QS_list[[S_id_comb]] - Udiff <- US_list[[S_plus_j_id_comb]]-US_list[[S_id_comb]] - - Tmu_list[[j]] <- Tmu_list[[j]] -Udiff/ n_permutations_used - Tx_list[[j]] <- Tx_list[[j]] +Udiff/ n_permutations_used - - #Tmu_list[[j]] <- Tmu_list[[j]] + (Qdiff1-Udiff)/ n_permutations_used - #Tx_list[[j]] <- Tx_list[[j]]+ (Qdiff2-Udiff)/ n_permutations_used - - # Udiff <- Reduce("+",US_list[includes_j])-Reduce("+",US_list[!includes_j]) - # - # Tmu_list[[j]] <- (Qdiff-Udiff) / n_permutations_used - # Tx_list[[j]] <- (Qdiff+Udiff) / n_permutations_used - - these_id_combinations_mat[[j]] <- rbind(these_id_combinations_mat[[j]],these_id_combinations) - tmp_lists[[j]]$Qdiff1[[i]] <- Qdiff1 - tmp_lists[[j]]$Qdiff2[[i]] <- Qdiff2 - tmp_lists[[j]]$Udiff[[i]] <- Udiff - } - - # includes_j <- S[,j]==1 # Find all subsets that include feature j - # id_combination_reps[id_combination%in%which(includes_j),N] - # - # Qdiff <- Reduce("+",QS_list[includes_j])-Reduce("+",QS_list[!includes_j]) - # Udiff <- Reduce("+",US_list[includes_j])-Reduce("+",US_list[!includes_j]) - # - # Tmu_list[[j]] <- (Qdiff-Udiff) / n_permutations_used - # Tx_list[[j]] <- (Qdiff+Udiff) / n_permutations_used - - } - - internal$objects$US_list <- US_list - internal$objects$QS_list <- QS_list internal$objects$Tmu_list <- Tmu_list internal$objects$Tx_list <- Tx_list - internal$objects$these_id_combinations_mat <- these_id_combinations_mat - internal$objects$tmp_lists <- tmp_lists internal$parameters$gaussian.mu <- gaussian.mu internal$parameters$gaussian.cov_mat <- gaussian.cov_mat @@ -363,8 +258,8 @@ setup_linear_gaussian <- function(internal, # Tx_list[[j]] <- (Qdiff+Udiff) / n_permutations_used these_id_combinations_mat[[j]] <- rbind(these_id_combinations_mat[[j]],these_id_combinations) - tmp_lists[[j]]$Qdiff1[[i]] <- Qdiff1 - tmp_lists[[j]]$Qdiff2[[i]] <- Qdiff2 +# tmp_lists[[j]]$Qdiff1[[i]] <- Qdiff1 +# tmp_lists[[j]]$Qdiff2[[i]] <- Qdiff2 tmp_lists[[j]]$Udiff[[i]] <- Udiff } diff --git a/R/setup_computation.R b/R/setup_computation.R index 407851456..065acd826 100644 --- a/R/setup_computation.R +++ b/R/setup_computation.R @@ -28,8 +28,8 @@ setup_computation_linear_gaussian <- function(internal){ internal <- shapley_setup(internal) # Setup for the linear gaussian method: compute U, Q, Tx and Tmu - internal <- setup_linear_gaussian(internal) - + #internal <- setup_linear_gaussian(internal) + internal <- setup_linear_gaussian_new(internal) } diff --git a/inst/scripts/devel/devel_explain_linear.R b/inst/scripts/devel/devel_explain_linear.R index 2ee41a214..d7cf7bc6c 100644 --- a/inst/scripts/devel/devel_explain_linear.R +++ b/inst/scripts/devel/devel_explain_linear.R @@ -7,7 +7,7 @@ data_complete <- data_complete[sample(seq_len(.N))] # Sh y_var_numeric <- "Ozone" -x_var_numeric <- c("Solar.R", "Wind", "Temp", "Month", "Day") +x_var_numeric <- c("Solar.R", "Wind", "Temp", "Month")#, "Day") data_train <- head(data_complete, -3) data_explain <- tail(data_complete, 3) diff --git a/inst/scripts/devel/devel_explain_linear_2.R b/inst/scripts/devel/devel_explain_linear_2.R index 13e08625e..37ba54a28 100644 --- a/inst/scripts/devel/devel_explain_linear_2.R +++ b/inst/scripts/devel/devel_explain_linear_2.R @@ -1,6 +1,6 @@ set.seed(12345) -p <- 4 +p <- 8 n_train <- 10^3 beta <- rnorm(p)#rep(1,p) b <- rnorm(1) @@ -46,7 +46,7 @@ test2 <-explain_linear(model = model, x_train = data, prediction_zero = p0, # this is not used gaussian.cov_mat=Sig, - gaussian.mu=mu,n_permutations = 6 + gaussian.mu=mu,n_permutations = 500 ) test2$internal$objects$perm_dt[,.N] From aca5fe2642cfb98b74e68043312234aa26e2c3b7 Mon Sep 17 00:00:00 2001 From: Martin Date: Thu, 18 Jan 2024 16:00:02 +0100 Subject: [PATCH 23/26] move to perm_list for lineargaussian --- R/linear_gaussian_funcs.R | 6 +- R/setup_computation.R | 66 +++++++++++++-------- inst/scripts/devel/TODO permute branch | 14 +++++ inst/scripts/devel/devel_explain_linear.R | 2 +- inst/scripts/devel/devel_explain_linear_2.R | 31 ++++++---- 5 files changed, 76 insertions(+), 43 deletions(-) create mode 100644 inst/scripts/devel/TODO permute branch diff --git a/R/linear_gaussian_funcs.R b/R/linear_gaussian_funcs.R index e693ae0bb..d6077d55a 100644 --- a/R/linear_gaussian_funcs.R +++ b/R/linear_gaussian_funcs.R @@ -17,9 +17,9 @@ setup_linear_gaussian_new <- function(internal, gaussian.mu <- internal$parameters$gaussian.mu n_features <- internal$parameters$n_features linear_model_coef <- internal$parameters$linear_model_coef - perm_dt <- internal$objects$perm_dt + perm_list <- internal$objects$perm_list - n_permutations_used <- perm_dt[,.N] + n_permutations_used <- length(perm_list) # For consistency defaults <- mget(c("gaussian.mu", "gaussian.cov_mat")) @@ -56,7 +56,7 @@ setup_linear_gaussian_new <- function(internal, Tx_list[[j]][j,j] <- 1 for(i in seq_len(n_permutations_used)){ - perm0 <- perm_dt[,perm][[i]] + perm0 <- perm_list[[i]] position <- which(perm0==j) PSfull <- diag(n_features) diff --git a/R/setup_computation.R b/R/setup_computation.R index 065acd826..08637ae97 100644 --- a/R/setup_computation.R +++ b/R/setup_computation.R @@ -24,8 +24,8 @@ setup_computation <- function(internal, model, predict_model) { setup_computation_linear_gaussian <- function(internal){ - # setup the Shapley framework - internal <- shapley_setup(internal) + # setup the Shapley framework (only appending the perm_list) + internal <- shapley_setup_linear_gaussian(internal) # Setup for the linear gaussian method: compute U, Q, Tx and Tmu #internal <- setup_linear_gaussian(internal) @@ -33,6 +33,24 @@ setup_computation_linear_gaussian <- function(internal){ } +shapley_setup_linear_gaussian <- function(internal) { + exact <- internal$parameters$exact + n_features0 <- internal$parameters$n_features + n_permutations <- internal$parameters$n_permutations + + perm_list <- feature_combinations_perm( + m = n_features0, + exact = exact, + n_permutations = n_permutations, + paired_shap_sampling = TRUE, + return_perm_list_only = TRUE + ) + internal$objects$perm_list <- perm_list + + return(internal) +} + + #' @keywords internal shapley_setup_forecast <- function(internal) { exact <- internal$parameters$exact @@ -211,7 +229,8 @@ shapley_setup <- function(internal) { return(internal) } -feature_combinations_perm <- function(m, exact = TRUE, n_permutations = NULL,paired_shap_sampling=FALSE) { + +feature_combinations_perm <- function(m, exact = TRUE, n_permutations = NULL,paired_shap_sampling=FALSE,return_perm_list_only = FALSE) { if (!exact) { # Switch to exact for feature-wise method @@ -229,23 +248,32 @@ feature_combinations_perm <- function(m, exact = TRUE, n_permutations = NULL,pai } if (exact) { - perm_dt <- feature_permute_exact(m) + perm_list <- feature_permute_exact(m) } else { - perm_dt <- feature_permute_samp(m, n_permutations,paired_shap_sampling) + perm_list <- feature_permute_samp(m, n_permutations,paired_shap_sampling) } + if(return_perm_list_only == TRUE){ + ret <- perm_list + } else { + + perm_dt <- data.table( + permute_id = seq_along(perm_list), + perm = as.list(perm_list) + ) - ret <- X_from_perm_dt(perm_dt) - #ret <- X_from_perm_dt_linear_gaussian(perm_dt) # May use this also for the regular permutation approach, to simplify the computation in the end but need to check that first + ret <- X_from_perm_dt(perm_dt) + ret$perm_list = perm_list + ret$perm_dt = perm_dt # Here this is a data.table of the permutations - ret$perm_dt = perm_dt + } return(ret) } -feature_permute_samp <- function(m, n_permutations,paired_shap_sampling) { +feature_permute_samp <- function(m, n_permutations,paired_shap_sampling=TRUE) { x <- seq(m) - perms <- perms_paired <- vector("list", n_permutations) + perms <- vector("list", n_permutations) perms[[1]] <- sample(x) perms[[2]] <- rev(perms[[1]]) @@ -266,27 +294,13 @@ feature_permute_samp <- function(m, n_permutations,paired_shap_sampling) { } } - # Create data.table with id_combination and features columns - dt <- data.table( - permute_id = seq(n_permutations), - perm = perms - ) - - return(dt) + return(perms) } feature_permute_exact <- function(m) { # Generate all possible combinations - perms <- all_permutations(seq(m)) - - # Create data.table with id_combination and features columns - dt <- data.table( - permute_id = seq_along(perms), - perm = as.list(perms) - ) - - return(dt) + return(all_permutations(seq(m))) } all_permutations <- function(x) { diff --git a/inst/scripts/devel/TODO permute branch b/inst/scripts/devel/TODO permute branch new file mode 100644 index 000000000..a2c126de2 --- /dev/null +++ b/inst/scripts/devel/TODO permute branch @@ -0,0 +1,14 @@ +TODO: permute-branch + +- General: + - Make sampling permutations faster (Rcpp). This could very well also be a matrix. + - Clean up unused stuff + - Merge master branch in + - Make the creation of the S and X matrices faster + - Think about how to do the looping with + +- LinearGaussian: + - write the looping over Tmu/Tmx in Rcpp for speed + +- Permute: + - make finalization part faster diff --git a/inst/scripts/devel/devel_explain_linear.R b/inst/scripts/devel/devel_explain_linear.R index d7cf7bc6c..95224bc23 100644 --- a/inst/scripts/devel/devel_explain_linear.R +++ b/inst/scripts/devel/devel_explain_linear.R @@ -33,7 +33,7 @@ test <-explain(model = model_lm_numeric, x_train = x_train_numeric, approach = "gaussian", shap_approach = "permutation", - prediction_zero = p0,n_samples = 10^4) + prediction_zero = p0,n_samples = 10^5) test #none Solar.R Wind Temp Month #1: 42.44 -8.379 7.944 15.29533 0.6522 diff --git a/inst/scripts/devel/devel_explain_linear_2.R b/inst/scripts/devel/devel_explain_linear_2.R index 37ba54a28..4d07338cc 100644 --- a/inst/scripts/devel/devel_explain_linear_2.R +++ b/inst/scripts/devel/devel_explain_linear_2.R @@ -1,4 +1,5 @@ set.seed(12345) +library(data.table) p <- 8 n_train <- 10^3 @@ -26,28 +27,32 @@ x_explain <- rbind(head(data,2), t(1:p), use.names=FALSE) - -# test <-explain(model = model, -# x_explain = x_explain, -# x_train = data, -# approach = "gaussian", -# shap_approach = "permutation", -# prediction_zero = p0, -# gaussian.cov_mat=Sig, -# gaussian.mu = mu, -# n_samples = 10^4) -# test +profvis({ + test <-explain(model = model, + x_explain = x_explain, + x_train = data, + approach = "gaussian", + shap_approach = "permutation", + prediction_zero = p0, + gaussian.cov_mat=Sig, + gaussian.mu = mu, + n_samples = 10^4, + n_permutations=100) +}) + + test #debugonce(explain_linear) - +profvis({ test2 <-explain_linear(model = model, x_explain = x_explain, x_train = data, prediction_zero = p0, # this is not used gaussian.cov_mat=Sig, - gaussian.mu=mu,n_permutations = 500 + gaussian.mu=mu,n_permutations = 5000 ) +}) test2$internal$objects$perm_dt[,.N] all_lists <- NULL From 293a30bfa76a041dd978a6446307c265f1dbd273 Mon Sep 17 00:00:00 2001 From: Martin Date: Thu, 18 Jan 2024 16:05:30 +0100 Subject: [PATCH 24/26] alter GHA to run only when "ready for review" --- .github/workflows/R-CMD-check.yaml | 2 ++ .github/workflows/lint-changed-files.yaml | 1 + .github/workflows/lint.yaml | 2 ++ .github/workflows/pkgdown.yaml | 2 ++ 4 files changed, 7 insertions(+) diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index 39a2d8a60..4b4dfc783 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -22,6 +22,8 @@ on: branches: [main, master, cranversion, devel] pull_request: branches: [main, master, cranversion, devel] + types: [ready_for_review] + name: R-CMD-check diff --git a/.github/workflows/lint-changed-files.yaml b/.github/workflows/lint-changed-files.yaml index 7f71f45f0..1ff463b26 100644 --- a/.github/workflows/lint-changed-files.yaml +++ b/.github/workflows/lint-changed-files.yaml @@ -9,6 +9,7 @@ on: pull_request: branches: [main, master] + types: [ready_for_review] name: lint-changed-files diff --git a/.github/workflows/lint.yaml b/.github/workflows/lint.yaml index 2b2230e99..aede54aac 100644 --- a/.github/workflows/lint.yaml +++ b/.github/workflows/lint.yaml @@ -11,6 +11,8 @@ on: branches: [main, master] pull_request: branches: [main, master] + types: [ready_for_review] + name: lint diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml index 9fc1a221b..57247d4e5 100644 --- a/.github/workflows/pkgdown.yaml +++ b/.github/workflows/pkgdown.yaml @@ -11,6 +11,8 @@ on: branches: [main, master] pull_request: branches: [main, master] + types: [ready_for_review] + release: types: [published] workflow_dispatch: From cf59d1179810e4b5a2b7bc95d48a1dda3bf3deaf Mon Sep 17 00:00:00 2001 From: Martin Date: Fri, 19 Jan 2024 08:55:20 +0100 Subject: [PATCH 25/26] remove uncessary arguments --- R/explain_linear.R | 18 ++++-------------- inst/scripts/devel/TODO permute branch | 5 ++++- inst/scripts/devel/devel_explain_linear.R | 1 - 3 files changed, 8 insertions(+), 16 deletions(-) diff --git a/R/explain_linear.R b/R/explain_linear.R index 2f4579997..71a10b411 100644 --- a/R/explain_linear.R +++ b/R/explain_linear.R @@ -9,14 +9,10 @@ explain_linear <- function(model, x_explain, x_train, - prediction_zero, - n_combinations = NULL, n_permutations = NULL, group = NULL, - n_samples = 1e3, n_batches = NULL, seed = 1, - keep_samp_for_vS = FALSE, predict_model = NULL, get_model_specs = NULL, MSEv_uniform_comb_weights = TRUE, @@ -45,14 +41,14 @@ explain_linear <- function(model, approach = "gaussian", # always set to "gaussian" although we never really use this argument for linear_gaussian shap_approach = "permutation", # Always use the permute shap_approach paired_shap_sampling = TRUE, # Always use paired sampling since simplified computation of the required Q and U objects requires it - prediction_zero = prediction_zero, - n_combinations = n_combinations, # We always set the n_permutations instead + prediction_zero = 0, # Never used, we extract this from the model object instead. + n_combinations = NULL, # We always set the n_permutations instead n_permutations = n_permutations, group = group, - n_samples = n_samples, + n_samples = 1, # Not applicable for the linear_gaussian method as no sampling is done n_batches = n_batches, seed = seed, - keep_samp_for_vS = keep_samp_for_vS, + keep_samp_for_vS = FALSE, # Not applicable for the linear_gaussian method as no sampling is done feature_specs = feature_specs, MSEv_uniform_comb_weights = MSEv_uniform_comb_weights, timing = timing, @@ -79,7 +75,6 @@ explain_linear <- function(model, timing_list$test_prediction <- Sys.time() - # Computes the necessary objects for the linear Gaussian approach internal <- setup_computation_linear_gaussian(internal) @@ -94,11 +89,6 @@ explain_linear <- function(model, output$timing <- compute_time(timing_list) } - # Temporary to avoid failing tests - - output$internal$objects$id_combination_mapper_dt <- NULL - output$internal$objects$cols_per_horizon <- NULL - output$internal$objects$W_list <- NULL return(output) } diff --git a/inst/scripts/devel/TODO permute branch b/inst/scripts/devel/TODO permute branch index a2c126de2..53ff200b5 100644 --- a/inst/scripts/devel/TODO permute branch +++ b/inst/scripts/devel/TODO permute branch @@ -3,12 +3,15 @@ TODO: permute-branch - General: - Make sampling permutations faster (Rcpp). This could very well also be a matrix. - Clean up unused stuff - - Merge master branch in + - Merge master branch in here - Make the creation of the S and X matrices faster - Think about how to do the looping with + - Implement grouping + - Create a function which takes the explain object, and a new x_explain input and allows computing the shapley values for that super-fast using the Tmx&Tmu lists. - LinearGaussian: - write the looping over Tmu/Tmx in Rcpp for speed + - Add MSE computation - Permute: - make finalization part faster diff --git a/inst/scripts/devel/devel_explain_linear.R b/inst/scripts/devel/devel_explain_linear.R index 95224bc23..2a26f994f 100644 --- a/inst/scripts/devel/devel_explain_linear.R +++ b/inst/scripts/devel/devel_explain_linear.R @@ -50,7 +50,6 @@ test test2 <-explain_linear(model = model_lm_numeric, x_explain = x_explain_numeric_new, x_train = x_train_numeric, - prediction_zero = p0, n_permutations=4, ) test2 From bcddf2438eb7135c153777f5f9e973cbdc1f8fad Mon Sep 17 00:00:00 2001 From: Martin Date: Tue, 23 Jan 2024 12:50:31 +0100 Subject: [PATCH 26/26] implement faster permutation sampling with the ranking/unraking approach --- R/explain_linear.R | 7 +- R/linear_gaussian_funcs.R | 17 +- R/setup_computation.R | 110 +++++++----- inst/scripts/devel/TODO permute branch | 4 +- inst/scripts/devel/devel_explain_linear.R | 3 +- inst/scripts/devel/devel_explain_linear_2.R | 7 +- .../devel/devel_permutation_sampling.R | 168 ++++++++++++++++++ 7 files changed, 250 insertions(+), 66 deletions(-) create mode 100644 inst/scripts/devel/devel_permutation_sampling.R diff --git a/R/explain_linear.R b/R/explain_linear.R index 71a10b411..cbaae4a08 100644 --- a/R/explain_linear.R +++ b/R/explain_linear.R @@ -76,10 +76,15 @@ explain_linear <- function(model, timing_list$test_prediction <- Sys.time() # Computes the necessary objects for the linear Gaussian approach - internal <- setup_computation_linear_gaussian(internal) + internal <- shapley_setup_linear_gaussian(internal) timing_list$setup_computation <- Sys.time() + internal <- compute_linear_gaussian_Tmu_Tx(internal,...) + + timing_list$compute_Tmu_Tx <- Sys.time() + + # Compute Shapley values with the linear Gaussian method output <- compute_shapley_linear_gaussian(internal = internal) diff --git a/R/linear_gaussian_funcs.R b/R/linear_gaussian_funcs.R index d6077d55a..7f84aecd2 100644 --- a/R/linear_gaussian_funcs.R +++ b/R/linear_gaussian_funcs.R @@ -1,15 +1,13 @@ # Here I put both the setup functios and the compute_linear_gaussian functions -#' @rdname setup_approach -#' #' @inheritParams default_doc_explain #' @inheritParams setup_approach.gaussian #' #' @export -setup_linear_gaussian_new <- function(internal, - gaussian.mu = NULL, - gaussian.cov_mat = NULL, ...) { +compute_linear_gaussian_Tmu_Tx <- function(internal, + gaussian.mu = NULL, + gaussian.cov_mat = NULL, ...) { x_train <- internal$data$x_train n_explain <- internal$parameters$n_explain @@ -17,9 +15,9 @@ setup_linear_gaussian_new <- function(internal, gaussian.mu <- internal$parameters$gaussian.mu n_features <- internal$parameters$n_features linear_model_coef <- internal$parameters$linear_model_coef - perm_list <- internal$objects$perm_list + perms_mat <- internal$objects$perms_mat - n_permutations_used <- length(perm_list) + n_permutations_used <- nrow(perms_mat) # For consistency defaults <- mget(c("gaussian.mu", "gaussian.cov_mat")) @@ -56,7 +54,10 @@ setup_linear_gaussian_new <- function(internal, Tx_list[[j]][j,j] <- 1 for(i in seq_len(n_permutations_used)){ - perm0 <- perm_list[[i]] + ### TODO: Call which(perms_mat == j,arr.ind=TRUE) on the outside, then extract + # all which have the first and last position and handle those separately + + perm0 <- perms_mat[i,] position <- which(perm0==j) PSfull <- diag(n_features) diff --git a/R/setup_computation.R b/R/setup_computation.R index 08637ae97..8e278691c 100644 --- a/R/setup_computation.R +++ b/R/setup_computation.R @@ -22,30 +22,20 @@ setup_computation <- function(internal, model, predict_model) { return(internal) } -setup_computation_linear_gaussian <- function(internal){ - - # setup the Shapley framework (only appending the perm_list) - internal <- shapley_setup_linear_gaussian(internal) - - # Setup for the linear gaussian method: compute U, Q, Tx and Tmu - #internal <- setup_linear_gaussian(internal) - internal <- setup_linear_gaussian_new(internal) - -} shapley_setup_linear_gaussian <- function(internal) { exact <- internal$parameters$exact n_features0 <- internal$parameters$n_features n_permutations <- internal$parameters$n_permutations - perm_list <- feature_combinations_perm( + perms_mat <- feature_combinations_perm( m = n_features0, exact = exact, n_permutations = n_permutations, paired_shap_sampling = TRUE, return_perm_list_only = TRUE ) - internal$objects$perm_list <- perm_list + internal$objects$perms_mat <- perms_mat return(internal) } @@ -235,7 +225,8 @@ feature_combinations_perm <- function(m, exact = TRUE, n_permutations = NULL,pai if (!exact) { # Switch to exact for feature-wise method if (n_permutations >= factorial(m)) { - n_combinations <- factorial(m) + n_combinations <- 2^m + n_permutations <- factorial(m) exact <- TRUE message( paste0( @@ -248,14 +239,19 @@ feature_combinations_perm <- function(m, exact = TRUE, n_permutations = NULL,pai } if (exact) { - perm_list <- feature_permute_exact(m) + perms_mat <- feature_permute_exact(m) } else { - perm_list <- feature_permute_samp(m, n_permutations,paired_shap_sampling) + perms_mat <- feature_permute_samp(m, n_permutations,paired_shap_sampling) } if(return_perm_list_only == TRUE){ - ret <- perm_list + return(perms_mat) } else { + # Convert the matrix to a list of row vectors + perm_list <- lapply(seq_len(nrow(perms_mat)), function(i) perms_mat[i, ]) + # TODO: rewrite code using this to work direcrly with the perms_mat + + perm_dt <- data.table( permute_id = seq_along(perm_list), perm = as.list(perm_list) @@ -270,54 +266,70 @@ feature_combinations_perm <- function(m, exact = TRUE, n_permutations = NULL,pai return(ret) } +unrank_permutations_mat <- function(ranks, n) { + # Precompute factorials + factorials <- sapply(0:(n-1), factorial) -feature_permute_samp <- function(m, n_permutations,paired_shap_sampling=TRUE) { - x <- seq(m) - perms <- vector("list", n_permutations) - perms[[1]] <- sample(x) - perms[[2]] <- rev(perms[[1]]) + # Initialize the matrix of permutations + permutations <- matrix(nrow = length(ranks), ncol = n) - if(paired_shap_sampling==TRUE){ - for_seq <- seq(3,n_permutations,by=2) - } else { - for_seq <- seq(2,n_permutations,by=1) - } + for (r in seq_along(ranks)) { + rank <- ranks[r] + elements <- 1:n + permutation <- integer(n) - for (i in for_seq) { - perm <- sample(x) - while (any(sapply(perms[1:(i-1)], function(p) all(p == perm)))) { # not perfect as does not check the rev_perm, but OK for now - perm <- sample(x) # repeat until we get a unique sample - } - perms[[i]] <- perm - if(paired_shap_sampling==TRUE){ - perms[[i+1]] <- rev(perm) + for(i in seq_len(n)){ + factorial <- factorials[n - i + 1] + index <- rank %/% factorial + rank <- rank %% factorial + permutation[i] <- elements[index + 1] + elements <- elements[-(index + 1)] } + + permutations[r, ] <- permutation } - return(perms) + return(permutations) } -feature_permute_exact <- function(m) { +feature_permute_samp <- function(m, n_permutations,paired_shap_sampling=TRUE) { + # This function generates random permutations of the features using the + # ranking/unranking approach - # Generate all possible combinations - return(all_permutations(seq(m))) -} + tot_no_permutations <- factorial(m) -all_permutations <- function(x) { - if (length(x) <= 1) { - return(list(x)) + # Generate unique random permutation rankings that ought to be transformed to permutations + ranks = sample.int(tot_no_permutations, n_permutations, replace = FALSE)-1 + + # Generates the permutations + perms <- unrank_permutations_mat(ranks,m) + + if(paired_shap_sampling==TRUE){ + # Gets the reverse (paired) permutations + rev_perms <- perms[,seq(m,1)] + + # Since there is no guarantee that the reverse permutations are not sampled in perms, + # we combine the two one by one to ensure that the reverse permutations are always + # present + # Note that we sample 2*n_permutations permutations also in the case of paired_shap_sampling = TRUE since we need to guarantee that we at least have n_permutations unique permutations + comb_perms <- matrix(NA,nrow=2*n_permutations,ncol=m) + comb_perms[seq(1,2*n_permutations-1,by=2),] <- perms + comb_perms[seq(2,2*n_permutations,by=2),] <- rev_perms + + final_perms_mat <- unique(comb_perms)[seq_len(n_permutations),] + return(final_perms_mat) } else { - perms <- list() - for (i in seq_along(x)) { - sub_perms <- all_permutations(x[-i]) - for (j in seq_along(sub_perms)) { - perms[[length(perms) + 1]] <- c(x[i], sub_perms[[j]]) - } - } return(perms) } } +feature_permute_exact <- function(m) { + # Generate all possible combinations of the features using the ranking/unranking approach + + ranks <- seq(factorial(m))-1 + return(unrank_permutations_mat(ranks = ranks, n = m)) +} + X_from_perm_dt_linear_gaussian <- function(perm_dt){ m <- length(perm_dt[["perm"]][[1]]) diff --git a/inst/scripts/devel/TODO permute branch b/inst/scripts/devel/TODO permute branch index 53ff200b5..aaeaa3a1b 100644 --- a/inst/scripts/devel/TODO permute branch +++ b/inst/scripts/devel/TODO permute branch @@ -1,9 +1,9 @@ TODO: permute-branch - General: - - Make sampling permutations faster (Rcpp). This could very well also be a matrix. + - Make sampling permutations faster (Rcpp). This could very well also be a matrix. Moreover, look at what is the slow part of the sampling-code using Rprof(vis) - Clean up unused stuff - - Merge master branch in here + X- Merge master branch in here - Make the creation of the S and X matrices faster - Think about how to do the looping with - Implement grouping diff --git a/inst/scripts/devel/devel_explain_linear.R b/inst/scripts/devel/devel_explain_linear.R index 2a26f994f..a5bfb92e5 100644 --- a/inst/scripts/devel/devel_explain_linear.R +++ b/inst/scripts/devel/devel_explain_linear.R @@ -33,7 +33,7 @@ test <-explain(model = model_lm_numeric, x_train = x_train_numeric, approach = "gaussian", shap_approach = "permutation", - prediction_zero = p0,n_samples = 10^5) + prediction_zero = p0,n_samples = 10^4) test #none Solar.R Wind Temp Month #1: 42.44 -8.379 7.944 15.29533 0.6522 @@ -57,7 +57,6 @@ test2 test2 <-explain_linear(model = model_lm_numeric, x_explain = x_explain_numeric_new, x_train = x_train_numeric, - prediction_zero = p0, ) test2 diff --git a/inst/scripts/devel/devel_explain_linear_2.R b/inst/scripts/devel/devel_explain_linear_2.R index 4d07338cc..05c1fbc31 100644 --- a/inst/scripts/devel/devel_explain_linear_2.R +++ b/inst/scripts/devel/devel_explain_linear_2.R @@ -36,8 +36,8 @@ profvis({ prediction_zero = p0, gaussian.cov_mat=Sig, gaussian.mu = mu, - n_samples = 10^4, - n_permutations=100) + n_samples = 10, + n_permutations=10^4) }) test @@ -48,9 +48,8 @@ profvis({ test2 <-explain_linear(model = model, x_explain = x_explain, x_train = data, - prediction_zero = p0, # this is not used gaussian.cov_mat=Sig, - gaussian.mu=mu,n_permutations = 5000 + gaussian.mu=mu,n_permutations = 10^4 ) }) test2$internal$objects$perm_dt[,.N] diff --git a/inst/scripts/devel/devel_permutation_sampling.R b/inst/scripts/devel/devel_permutation_sampling.R new file mode 100644 index 000000000..1448729c7 --- /dev/null +++ b/inst/scripts/devel/devel_permutation_sampling.R @@ -0,0 +1,168 @@ + + +unrank_permutation <- function(rank, n) { + elements <- 1:n + permutation <- integer(n) + + for(i in 1:n){ + factorial <- factorial(n - i) + index <- rank %/% factorial + rank <- rank %% factorial + permutation[i] <- elements[index+1] + elements <- elements[-(index+1)] + } + + return(permutation) +} + +unrank_permutations <- function(ranks, n) { + # Precompute factorials + factorials <- sapply(0:(n-1), factorial) + + # Initialize the list of permutations + permutations <- vector("list", length(ranks)) + + for (r in seq_along(ranks)) { + rank <- ranks[r] + elements <- 1:n + permutation <- integer(n) + + for(i in seq_len(n)){ + factorial <- factorials[n - i + 1] + index <- rank %/% factorial + rank <- rank %% factorial + permutation[i] <- elements[index + 1] + elements <- elements[-(index + 1)] + } + + permutations[[r]] <- permutation + } + + return(permutations) +} + +unrank_permutations_mat <- function(ranks, n) { + # Precompute factorials + factorials <- sapply(0:(n-1), factorial) + + # Initialize the matrix of permutations + permutations <- matrix(nrow = length(ranks), ncol = n) + + for (r in seq_along(ranks)) { + rank <- ranks[r] + elements <- 1:n + permutation <- integer(n) + + for(i in seq_len(n)){ + factorial <- factorials[n - i + 1] + index <- rank %/% factorial + rank <- rank %% factorial + permutation[i] <- elements[index + 1] + elements <- elements[-(index + 1)] + } + + permutations[r, ] <- permutation + } + + return(permutations) +} + +aa=unrank_permutations_mat(seq_len(factorial(n))-1,n) + +n <- 8 +tot_no_permutations <- factorial(n) + +samples <- factorial(n)#5*10^5 + +ranks = sample.int(tot_no_permutations, samples, replace = FALSE) + +perms <- lapply(ranks, unrank_permutation, n = n) + +perms <- unrank_permutations(1:24,n) # This is the fastest + +#### My coding here #### + +#### THIS SEEMS TO WORK + +n <- 4 +tot_no_permutations <- factorial(n) + +samples <- 10#factorial(n)#5*10^5 + +ranks = sample.int(tot_no_permutations, samples, replace = FALSE)-1 + +perms <- unrank_permutations_mat(ranks,n) # This is the fastest + +rev_perms <- perms[,seq(n,1)] + +comb_perms <- matrix(NA,nrow=samples*2,ncol=n) +comb_perms[seq(1,2*samples-1,by=2),] <- perms +comb_perms[seq(2,2*samples,by=2),] <- rev_perms + +unique(comb_perms)[seq_len(samples),] + +#### higher dim #### + +n <- 15 +tot_no_permutations <- factorial(n) + +samples <- 10^5#factorial(n)#5*10^5 + +ranks = sample.int(tot_no_permutations, samples, replace = FALSE)-1 + +perms <- unrank_permutations_mat(ranks,n) # This is the fastest + +rev_perms <- perms[,seq(n,1)] + +comb_perms <- matrix(NA,nrow=samples*2,ncol=n) +comb_perms[seq(1,2*samples-1,by=2),] <- perms +comb_perms[seq(2,2*samples,by=2),] <- rev_perms + +final_perms <- unique(comb_perms)[seq_len(samples),] + + + +############### + +# Number of unique permutations desired +k <- 20 + +# Length of the vector +n <- 4 + +# Total number of permutations for a list of length n +total_perm <- factorial(n) + +# Generate k/2 random ranks +set.seed(123) # for reproducibility +ranks <- sample(0:(total_perm / 2 - 1), k / 2) + +# Calculate ranks for the reversed permutations +rev_ranks <- total_perm - 1 - ranks + +# Combine the ranks +all_ranks <- c(ranks, rev_ranks) + +# Generate the permutations +permutations <- unrank_permutations(all_ranks, n) + + + + + +############### + +lehmer_code <- function(permutation) { + n <- length(permutation) + code <- rep(0, n) + for (i in 1:n) { + for (j in (i+1):n) { + if (permutation[j] < permutation[i]) { + code[i] <- code[i] + 1 + } + } + } + return(code) +} + +print(lehmer_code(c(4,3,1,2)))