diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index 39a2d8a60..4b4dfc783 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -22,6 +22,8 @@ on: branches: [main, master, cranversion, devel] pull_request: branches: [main, master, cranversion, devel] + types: [ready_for_review] + name: R-CMD-check diff --git a/.github/workflows/lint-changed-files.yaml b/.github/workflows/lint-changed-files.yaml index 7f71f45f0..1ff463b26 100644 --- a/.github/workflows/lint-changed-files.yaml +++ b/.github/workflows/lint-changed-files.yaml @@ -9,6 +9,7 @@ on: pull_request: branches: [main, master] + types: [ready_for_review] name: lint-changed-files diff --git a/.github/workflows/lint.yaml b/.github/workflows/lint.yaml index 2b2230e99..aede54aac 100644 --- a/.github/workflows/lint.yaml +++ b/.github/workflows/lint.yaml @@ -11,6 +11,8 @@ on: branches: [main, master] pull_request: branches: [main, master] + types: [ready_for_review] + name: lint diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml index 9fc1a221b..57247d4e5 100644 --- a/.github/workflows/pkgdown.yaml +++ b/.github/workflows/pkgdown.yaml @@ -11,6 +11,8 @@ on: branches: [main, master] pull_request: branches: [main, master] + types: [ready_for_review] + release: types: [published] workflow_dispatch: diff --git a/NAMESPACE b/NAMESPACE index 6537b3bcb..366d80844 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -33,6 +33,7 @@ S3method(prepare_data,ctree) S3method(prepare_data,empirical) S3method(prepare_data,gaussian) S3method(prepare_data,independence) +S3method(prepare_data,linear_gaussian) S3method(prepare_data,timeseries) S3method(print,shapr) S3method(setup_approach,categorical) @@ -49,6 +50,7 @@ export(compute_vS) export(correction_matrix_cpp) export(explain) export(explain_forecast) +export(explain_linear) export(feature_combinations) export(feature_matrix_cpp) export(finalize_explanation) @@ -69,6 +71,7 @@ export(rss_cpp) export(setup) export(setup_approach) export(setup_computation) +export(setup_linear_gaussian) export(weight_matrix_cpp) importFrom(Rcpp,sourceCpp) importFrom(data.table,":=") diff --git a/R/explain.R b/R/explain.R index ca354684f..bc3f60c4f 100644 --- a/R/explain.R +++ b/R/explain.R @@ -255,8 +255,11 @@ explain <- function(model, x_explain, x_train, approach, + shap_approach = "kernel", + paired_shap_sampling = FALSE, prediction_zero, n_combinations = NULL, + n_permutations = NULL, group = NULL, n_samples = 1e3, n_batches = NULL, @@ -285,8 +288,11 @@ explain <- function(model, x_train = x_train, x_explain = x_explain, approach = approach, + shap_approach = shap_approach, + paired_shap_sampling = paired_shap_sampling, prediction_zero = prediction_zero, n_combinations = n_combinations, + n_permutations = n_permutations, group = group, n_samples = n_samples, n_batches = n_batches, diff --git a/R/explain_linear.R b/R/explain_linear.R new file mode 100644 index 000000000..cbaae4a08 --- /dev/null +++ b/R/explain_linear.R @@ -0,0 +1,99 @@ +#' Explain the output of a linear model with Shapley values +#' +#' @inheritParams explain +#' +#' @export +#' +#' @author Martin Jullum +#' +explain_linear <- function(model, + x_explain, + x_train, + n_permutations = NULL, + group = NULL, + n_batches = NULL, + seed = 1, + predict_model = NULL, + get_model_specs = NULL, + MSEv_uniform_comb_weights = TRUE, + timing = TRUE, + ...) { # ... 
contains further arguments passed to the specific approaches
+
+  timing_list <- list(
+    init_time = Sys.time()
+  )
+
+  set.seed(seed)
+
+  # Gets and checks feature specs from the model
+  feature_specs <- get_feature_specs(get_model_specs, model)
+
+  linear_model_coef <- get_linear_coeff(model)
+
+  null_object <- NULL
+  # Sets up and organizes input parameters
+  # Checks the input parameters and their compatibility
+  # Checks data/model compatibility
+  internal <- setup(
+    type = "linear_gaussian",
+    x_train = x_train,
+    x_explain = x_explain,
+    approach = "gaussian", # Always set to "gaussian", although this argument is never really used for linear_gaussian
+    shap_approach = "permutation", # Always use the permutation shap_approach
+    paired_shap_sampling = TRUE, # Always use paired sampling, since the simplified computation of the required Q and U objects requires it
+    prediction_zero = 0, # Never used, we extract this from the model object instead.
+    n_combinations = NULL, # We always set n_permutations instead
+    n_permutations = n_permutations,
+    group = group,
+    n_samples = 1, # Not applicable for the linear_gaussian method as no sampling is done
+    n_batches = n_batches,
+    seed = seed,
+    keep_samp_for_vS = FALSE, # Not applicable for the linear_gaussian method as no sampling is done
+    feature_specs = feature_specs,
+    MSEv_uniform_comb_weights = MSEv_uniform_comb_weights,
+    timing = timing,
+    linear_model_coef = linear_model_coef, # TODO: Make this a proper input argument in setup(). For now this is just included through ... so no checking is performed
+    ...
+  )
+
+  timing_list$setup <- Sys.time()
+
+  # Gets predict_model (if not passed to explain)
+  predict_model <- get_predict_model(
+    predict_model = predict_model,
+    model = model
+  )
+
+  # Checks that predict_model gives correct format
+  test_predict_linear_model(
+    x_test = head(internal$data$x_train, 2),
+    predict_model = predict_model,
+    model = model,
+    linear_model_coef = linear_model_coef,
+    internal = internal
+  )
+
+  timing_list$test_prediction <- Sys.time()
+
+  # Computes the necessary objects for the linear Gaussian approach
+  internal <- shapley_setup_linear_gaussian(internal)
+
+  timing_list$setup_computation <- Sys.time()
+
+  internal <- compute_linear_gaussian_Tmu_Tx(internal, ...)
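+
+  # Tmu_list and Tx_list computed above are consumed by compute_shapley_linear_gaussian() below,
+  # which evaluates the sampling-free closed-form expression
+  #   phi_j(x) = t(beta) %*% Tmu_list[[j]] %*% mu + t(beta) %*% Tx_list[[j]] %*% x
+  # for every feature j and every row x of x_explain.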
+ + timing_list$compute_Tmu_Tx <- Sys.time() + + + # Compute Shapley values with the linear Gaussian method + output <- compute_shapley_linear_gaussian(internal = internal) + + timing_list$shapley_computation <- Sys.time() + + if (timing == TRUE) { + output$timing <- compute_time(timing_list) + } + + + return(output) +} diff --git a/R/finalize_explanation.R b/R/finalize_explanation.R index 31ae74432..f4cc3ef04 100644 --- a/R/finalize_explanation.R +++ b/R/finalize_explanation.R @@ -9,6 +9,7 @@ finalize_explanation <- function(vS_list, internal) { keep_samp_for_vS <- internal$parameters$keep_samp_for_vS MSEv_uniform_comb_weights <- internal$parameters$MSEv_uniform_comb_weights + shap_approach <- internal$parameters$shap_approach processed_vS_list <- postprocess_vS_list( vS_list = vS_list, @@ -21,7 +22,11 @@ finalize_explanation <- function(vS_list, internal) { # internal$timing$postprocessing <- Sys.time() # Compute the Shapley values - dt_shapley <- compute_shapley_new(internal, processed_vS_list$dt_vS) + if(shap_approach == "permutation"){ + dt_shapley <- compute_shapley_permutation(internal, processed_vS_list$dt_vS) + } else { + dt_shapley <- compute_shapley_new(internal, processed_vS_list$dt_vS) + } # internal$timing$shapley_computation <- Sys.time() @@ -110,6 +115,47 @@ get_p <- function(dt_vS, internal) { return(p) } +compute_shapley_permutation <- function(internal,dt_vS){ + feature_names <- internal$parameters$feature_names + X_perm <- internal$objects$X_perm + n_features <- internal$parameters$n_features + n_explain <- internal$parameters$n_explain + max_id_combination <- internal$parameters$n_combinations + S <- internal$objects$S + phi0 <- internal$parameters$prediction_zero + + n_permutations_used <- X_perm[,max(permute_id,na.rm = TRUE)] + + + apply_cols <- names(dt_vS)[-1] + + kshap <- matrix(0,ncol=n_explain,nrow=n_features) + + for(i in seq(n_permutations_used)){ + # Find id combinations that are permuted + these_id_combs <- c(1,X_perm[permute_id==i,id_combination],max_id_combination) + + # Find the feature to map the contributions to + mapping_mat <- apply(S[these_id_combs,],FUN=diff,MARGIN=2) + contributes_to <- apply(mapping_mat,FUN=function(x) which(x==1),MARGIN=1) + reorder_vec <- order(contributes_to) + + # Find the corresponding rows in dt_vS and get the contribution + these_vS <- dt_vS[id_combination %in% these_id_combs] + these_contribs <- these_vS[,lapply(.SD,diff),.SDcols=apply_cols] + + reordered_contribs <- as.matrix(these_contribs[reorder_vec,]) + kshap <- kshap + reordered_contribs + } + kshap <- kshap/n_permutations_used + + + + dt_shapley <- data.table::data.table(cbind(none=phi0,t(kshap))) + names(dt_shapley)[-1] <- feature_names + return(dt_shapley) +} + #' Compute shapley values #' @param explainer An `explain` object. #' @param dt_vS The contribution matrix. 
@@ -274,3 +320,62 @@ compute_MSEv_eval_crit <- function(internal,
     MSEv_combination = MSEv_combination
   ))
 }
+
+
+#' Computes the Shapley values for the linear Gaussian method
+#'
+#' @inherit explain
+#' @inheritParams default_doc
+#'
+#' @export
+compute_shapley_linear_gaussian <- function(internal) {
+
+  # Inputs
+  mu <- internal$parameters$gaussian.mu
+  n_features <- internal$parameters$n_features
+  n_explain <- internal$parameters$n_explain
+  x_explain <- internal$data$x_explain
+  Tmu_list <- internal$objects$Tmu_list
+  Tx_list <- internal$objects$Tx_list
+  coefs <- internal$parameters$linear_model_coef
+  feature_names <- internal$parameters$feature_names
+
+  # Convert inputs
+  beta <- coefs[-1]
+  x_explain_mat <- as.matrix(x_explain)
+
+  # Get the prediction
+  p <- as.numeric(coefs[1] + x_explain_mat %*% beta)
+
+  # Compute phi0
+  phi0 <- as.numeric(coefs[1] + t(beta) %*% mu)
+  shapley_mat <- matrix(0, nrow = n_explain, ncol = n_features)
+  colnames(shapley_mat) <- feature_names
+
+  for (j in seq_len(n_features)) {
+    # Consider moving the computation of the first term, and everything but the final
+    # matrix multiplications of the second term, to the pre-processing function
+    shapley_mat[, j] <- as.numeric(t(beta) %*% Tmu_list[[j]] %*% mu) + x_explain_mat %*% t(Tx_list[[j]]) %*% beta
+  }
+
+  dt_shapley <- data.table(
+    none = phi0,
+    shapley_mat
+  )
+
+  output <- list(
+    shapley_values = dt_shapley,
+    internal = internal,
+    pred_explain = p
+  )
+  attr(output, "class") <- c("shapr", "list")
+
+  return(output)
+}
diff --git a/R/get_predict_model.R b/R/get_predict_model.R
index 93577e8b9..2df4eab87 100644
--- a/R/get_predict_model.R
+++ b/R/get_predict_model.R
@@ -80,3 +80,50 @@ test_predict_model <- function(x_test, predict_model, model, internal) {
     )
   }
 }
+
+#' Model testing function
+#'
+#' @inheritParams default_doc
+#' @keywords internal
+test_predict_linear_model <- function(x_test, predict_model, model, linear_model_coef, internal) {
+  # Tests prediction with some data
+
+  if (class(model)[1] != "lm") {
+    stop("explain_linear is only applicable with 'model' of class 'lm'.")
+  }
+
+  tmp <- tryCatch(predict_model(model, x_test), error = errorfun)
+  if (class(tmp)[1] == "error") {
+    stop(paste0(
+      "The predict_model function of class `", class(model), "` is invalid.\n",
+      "A basic function test threw the following error:\n", as.character(tmp[[1]])
+    ))
+  }
+
+  if (!((all(sapply(tmp, is.numeric))) &&
+    (length(tmp) == 2 || (!is.null(dim(tmp)) && nrow(tmp) == 2 && ncol(tmp) == internal$parameters$output_size)))) {
+    stop(
+      paste0(
+        "The predict_model function of class `", class(model),
+        "` does not return a numeric output of the desired length.\n",
+        "See the 'Advanced usage' section of the vignette:\n",
+        "vignette('understanding_shapr', package = 'shapr')\n\n",
+        "for more information on running shapr with custom models.\n"
+      )
+    )
+  }
+
+  manual_pred <- as.vector(cbind(1, as.matrix(x_test)) %*% linear_model_coef)
+
+  if (isFALSE(all.equal(manual_pred, tmp, check.attributes = FALSE))) {
+    stop(
+      "Prediction with the extracted model coefficients does not match the prediction with the predict_model function.\n",
+      "This suggests interactions, quadratic effects or other non-linearities in the model.\n",
+      "explain_linear is only applicable with pure linear models.\n"
+    )
+  }
+}
diff --git a/R/linear_gaussian_funcs.R b/R/linear_gaussian_funcs.R
new file mode 100644
index 000000000..7f84aecd2
--- /dev/null
+++ b/R/linear_gaussian_funcs.R
@@ -0,0 +1,360 @@
+# Here I put both the setup functions and the compute_linear_gaussian functions
+
+
+#' @inheritParams default_doc_explain
+#' @inheritParams setup_approach.gaussian
+#'
+#' @export
+compute_linear_gaussian_Tmu_Tx <- function(internal,
+                                           gaussian.mu = NULL,
+                                           gaussian.cov_mat = NULL, ...) {
+  x_train <- internal$data$x_train
+  n_explain <- internal$parameters$n_explain
+  gaussian.cov_mat <- internal$parameters$gaussian.cov_mat
+  gaussian.mu <- internal$parameters$gaussian.mu
+  n_features <- internal$parameters$n_features
+  linear_model_coef <- internal$parameters$linear_model_coef
+  perms_mat <- internal$objects$perms_mat
+
+  n_permutations_used <- nrow(perms_mat)
+
+  # For consistency
+  defaults <- mget(c("gaussian.mu", "gaussian.cov_mat"))
+  internal <- insert_defaults(internal, defaults)
+
+  x_train <- internal$data$x_train
+  feature_specs <- internal$objects$feature_specs
+
+  # Checking if factor features are present
+  if (any(feature_specs$classes == "factor")) {
+    factor_features <- names(which(feature_specs$classes == "factor"))
+    factor_approaches <- get_factor_approaches()
+    stop(paste0(
+      "The following feature(s) are factor(s): ", factor_features, ".\n",
+      "approach = 'linear_gaussian' does not support factor features.\n",
+      "Please change approach to one of ", paste0(factor_approaches, collapse = ", "), "."
+    ))
+  }
+
+  # If gaussian.mu is not provided directly in the internal list, use the mean of the training data
+  if (is.null(gaussian.mu)) {
+    gaussian.mu <- get_mu_vec(x_train)
+  }
+
+  # If gaussian.cov_mat is not provided directly in the internal list, use the sample covariance of the training data
+  if (is.null(gaussian.cov_mat)) {
+    gaussian.cov_mat <- get_cov_mat(x_train)
+  }
+
+  Tmu_list <- Tx_list <- list()
+  for (j in seq_len(n_features)) {
+    Tmu_list[[j]] <- Tx_list[[j]] <- matrix(0, nrow = n_features, ncol = n_features)
+    Tmu_list[[j]][j, j] <- -1
+    Tx_list[[j]][j, j] <- 1
+
+    for (i in seq_len(n_permutations_used)) {
+      ### TODO: Call which(perms_mat == j, arr.ind = TRUE) on the outside, then extract
+      # all which have the first and last position, and handle those separately
+
+      perm0 <- perms_mat[i, ]
+
+      position <- which(perm0 == j)
+      PSfull <- diag(n_features)
+      PS <- PSj <- diag(0, n_features)
+
+      if (position == 1) {
+        # U_S <- diag(0,n_features)
+
+        # this_Sj <- perm0[1]
+        # this_Sj_bar <- perm0[-1]
+
+        PSj <- PSfull[j, , drop = FALSE]
+        PSj_bar <- PSfull[-j, , drop = FALSE]
+
+        # U_Sj <- t(PSj_bar)%*%PSj_bar%*%gaussian.cov_mat%*%t(PSj)%*%solve(PSj%*%gaussian.cov_mat%*%t(PSj))%*%PSj
+
+        Udiff <- t(PSj_bar) %*% PSj_bar %*% gaussian.cov_mat %*% t(PSj) %*% solve(PSj %*% gaussian.cov_mat %*% t(PSj)) %*% PSj
+      } else {
+        this_S <- sort(perm0[seq_len(position - 1)])
+        this_Sj <- sort(perm0[seq_len(position)])
+        this_S_bar <- sort(perm0[-seq_len(position - 1)])
+        this_Sj_bar <- sort(perm0[-seq_len(position)])
+
+        PS <- PSfull[this_S, , drop = FALSE]
+        PSj <- PSfull[this_Sj, , drop = FALSE]
+        PS_bar <- PSfull[this_S_bar, , drop = FALSE]
+        PSj_bar <- PSfull[this_Sj_bar, , drop = FALSE]
+
+        U_S <- t(PS_bar) %*% PS_bar %*% gaussian.cov_mat %*% t(PS) %*% solve(PS %*% gaussian.cov_mat %*% t(PS)) %*% PS
+        U_Sj <- t(PSj_bar) %*% PSj_bar %*% gaussian.cov_mat %*% t(PSj) %*% solve(PSj %*% gaussian.cov_mat %*% t(PSj)) %*% PSj
+        Udiff <- U_Sj - U_S
+      }
+
+      # Could compute these here as well, and then not use paired sampling, but I assume the
+      # U-computation is the most expensive part anyway, so I will not do it here now.
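+
+      # Under the Gaussian assumption, the U matrices computed above are what map the observed x
+      # into the conditional mean: E[x | x_S] = (Q_Sbar - U_S) %*% mu + (Q_S + U_S) %*% x, with
+      # Q_S = t(PS) %*% PS (see setup_linear_gaussian() and the check in
+      # inst/scripts/devel/devel_explain_linear.R). Udiff thus accumulates the permutation-averaged
+      # change in E[x | x_S] from adding feature j to S.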
+      #U_Sbar <- t(PS)%*%PS%*%gaussian.cov_mat%*%t(PS_bar)%*%solve(PS_bar%*%gaussian.cov_mat%*%t(PS_bar))%*%PS_bar
+      #U_Sj_bar <- t(PSj)%*%PSj%*%gaussian.cov_mat%*%t(PSj_bar)%*%solve(PSj_bar%*%gaussian.cov_mat%*%t(PSj_bar))%*%PSj_bar
+
+      Tmu_list[[j]] <- Tmu_list[[j]] - Udiff / n_permutations_used
+      Tx_list[[j]] <- Tx_list[[j]] + Udiff / n_permutations_used
+    }
+  }
+
+  internal$objects$Tmu_list <- Tmu_list
+  internal$objects$Tx_list <- Tx_list
+
+  internal$parameters$gaussian.mu <- gaussian.mu
+  internal$parameters$gaussian.cov_mat <- gaussian.cov_mat
+
+  return(internal)
+}
+
+
+#' @rdname setup_approach
+#'
+#' @inheritParams default_doc_explain
+#' @inheritParams setup_approach.gaussian
+#'
+#' @export
+setup_linear_gaussian <- function(internal,
+                                  gaussian.mu = NULL,
+                                  gaussian.cov_mat = NULL, ...) {
+  x_train <- internal$data$x_train
+  n_explain <- internal$parameters$n_explain
+  gaussian.cov_mat <- internal$parameters$gaussian.cov_mat
+  gaussian.mu <- internal$parameters$gaussian.mu
+  n_features <- internal$parameters$n_features
+  linear_model_coef <- internal$parameters$linear_model_coef
+  S <- internal$objects$S
+  X <- internal$objects$X
+  X_perm <- internal$objects$X_perm
+
+  max_id_combination <- X[, .N]
+  n_permutations_used <- X_perm[, max(permute_id, na.rm = TRUE)]
+
+  # For consistency
+  defaults <- mget(c("gaussian.mu", "gaussian.cov_mat"))
+  internal <- insert_defaults(internal, defaults)
+
+  x_train <- internal$data$x_train
+  feature_specs <- internal$objects$feature_specs
+
+  # Checking if factor features are present
+  if (any(feature_specs$classes == "factor")) {
+    factor_features <- names(which(feature_specs$classes == "factor"))
+    factor_approaches <- get_factor_approaches()
+    stop(paste0(
+      "The following feature(s) are factor(s): ", factor_features, ".\n",
+      "approach = 'linear_gaussian' does not support factor features.\n",
+      "Please change approach to one of ", paste0(factor_approaches, collapse = ", "), "."
+    ))
+  }
+
+  # If gaussian.mu is not provided directly in the internal list, use the mean of the training data
+  if (is.null(gaussian.mu)) {
+    gaussian.mu <- get_mu_vec(x_train)
+  }
+
+  # If gaussian.cov_mat is not provided directly in the internal list, use the sample covariance of the training data
+  if (is.null(gaussian.cov_mat)) {
+    gaussian.cov_mat <- get_cov_mat(x_train)
+  }
+
+  # Counting the number of repetitions of each row in S
+  id_combination_reps <- X_perm[, .N, by = id_combination]
+  id_combination_reps[c(1, .N), N := n_permutations_used]
+
+  # Computing the US and QS objects
+
+  # Now using the formulas from the "true to the model or true to the data" paper: https://arxiv.org/pdf/2006.16234.pdf
+  # Consider whether doing this with sparse matrices is faster
+  PS_full <- diag(n_features)
+  PS <- apply(S, FUN = function(x) PS_full[which(x == 1), , drop = FALSE], MARGIN = 1)
+  PSbar <- apply(1 - S, FUN = function(x) PS_full[which(x == 1), , drop = FALSE], MARGIN = 1)
+
+  # TODO: Should do this in Rcpp for a speed-up later
+  US_list <- QS_list <- QSbar_list <- list()
+  US_list[[1]] <- US_list[[nrow(S)]] <- QS_list[[1]] <- matrix(0, nrow = n_features, ncol = n_features)
+  QS_list[[nrow(S)]] <- diag(n_features)
+  for (i in seq(2, nrow(S) - 1)) {
+    US_list[[i]] <- t(PSbar[[i]]) %*% PSbar[[i]] %*% gaussian.cov_mat %*% t(PS[[i]]) %*% solve(PS[[i]] %*% gaussian.cov_mat %*% t(PS[[i]])) %*% PS[[i]]
+    QS_list[[i]] <- t(PS[[i]]) %*% PS[[i]]
+#   QSbar_list[[i]] <- t(PSbar[[i]]) %*% PSbar[[i]] # Make this every time as well, since it is fast to do, even though it could be extracted from QS_list if it is always present.
Change this later + } + + ### Computing the Tmu and Tx objects + # A rewrite of eq (9) in the true to the model or data paper: https://arxiv.org/pdf/2006.16234.pdf shows that + # when we force paired sampling, the Tmu and Tx objects gets a simplified formula. + # Another trick is that since the full S matrix is constructed based on the permutations, + # any row in S that contains features j, will have a corresponding row that does not contain feature j. + # Thus, we can easily find all the rows that contain feature j, and those that don't to then compute Q and U differences + # without having to map the permutations to the subsets. + + perm_dt <- internal$objects$perm_dt + + Tmu_list <- Tx_list <- list() + these_id_combinations_mat <- list() + tmp_lists <- list() + for(j in seq_len(n_features)){ + Tmu_list[[j]] <- Tx_list[[j]] <- matrix(0,nrow = n_features,ncol = n_features) + Tmu_list[[j]][j,j] <- -1 + Tx_list[[j]][j,j] <- 1 + + these_id_combinations_mat[[j]] <- numeric(0) + tmp_lists[[j]] <- list(Qdiff1=list(), + Qdiff2=list(), + Udiff=list()) + + for(i in seq(n_permutations_used)){ + + perm0 <- perm_dt[permute_id==i,perm][[1]] + + position <- which(perm0==j) + if(position==1){ + this_S <- integer(0) + this_S_plus_j <- sort(perm0[seq_len(position)]) + this_Sbar <- sort(perm0) + this_Sbar_min_j <- sort(perm0[-seq_len(position)]) + } else { + this_S <- sort(perm0[seq_len(position-1)]) + this_S_plus_j <- sort(perm0[seq_len(position)]) + this_Sbar <- sort(perm0[-seq_len(position-1)]) + this_Sbar_min_j <- sort(perm0[-seq_len(position)]) + } + + vec <- c(paste0(this_S,collapse = " "), + paste0(this_S_plus_j,collapse = " "), + paste0(this_Sbar,collapse = " "), + paste0(this_Sbar_min_j,collapse = " ")) + + merge_dt <- data.table(features_tmp=vec,id=seq_along(vec)) + + comb_dt <- merge(merge_dt,X[,.(id_combination,features_tmp)],by="features_tmp",all.x=TRUE) + data.table::setorderv(comb_dt,"id") + + these_id_combinations <- comb_dt[,id_combination] + S_id_comb <- these_id_combinations[1] + S_plus_j_id_comb <- these_id_combinations[2] + Sbar_id_comb <- these_id_combinations[3] + Sbar_min_j_id_comb <- these_id_combinations[4] + + #Qdiff1 <- QS_list[[Sbar_min_j_id_comb]]-QS_list[[Sbar_id_comb]] + #Qdiff2 <- QS_list[[S_plus_j_id_comb]]-QS_list[[S_id_comb]] + Udiff <- US_list[[S_plus_j_id_comb]]-US_list[[S_id_comb]] + + Tmu_list[[j]] <- Tmu_list[[j]] -Udiff/ n_permutations_used + Tx_list[[j]] <- Tx_list[[j]] +Udiff/ n_permutations_used + + #Tmu_list[[j]] <- Tmu_list[[j]] + (Qdiff1-Udiff)/ n_permutations_used + #Tx_list[[j]] <- Tx_list[[j]]+ (Qdiff2-Udiff)/ n_permutations_used + + # Udiff <- Reduce("+",US_list[includes_j])-Reduce("+",US_list[!includes_j]) + # + # Tmu_list[[j]] <- (Qdiff-Udiff) / n_permutations_used + # Tx_list[[j]] <- (Qdiff+Udiff) / n_permutations_used + + these_id_combinations_mat[[j]] <- rbind(these_id_combinations_mat[[j]],these_id_combinations) +# tmp_lists[[j]]$Qdiff1[[i]] <- Qdiff1 +# tmp_lists[[j]]$Qdiff2[[i]] <- Qdiff2 + tmp_lists[[j]]$Udiff[[i]] <- Udiff + } + + # includes_j <- S[,j]==1 # Find all subsets that include feature j + # id_combination_reps[id_combination%in%which(includes_j),N] + # + # Qdiff <- Reduce("+",QS_list[includes_j])-Reduce("+",QS_list[!includes_j]) + # Udiff <- Reduce("+",US_list[includes_j])-Reduce("+",US_list[!includes_j]) + # + # Tmu_list[[j]] <- (Qdiff-Udiff) / n_permutations_used + # Tx_list[[j]] <- (Qdiff+Udiff) / n_permutations_used + + } + + internal$objects$US_list <- US_list + internal$objects$QS_list <- QS_list + internal$objects$Tmu_list <- 
Tmu_list
+  internal$objects$Tx_list <- Tx_list
+  internal$objects$these_id_combinations_mat <- these_id_combinations_mat
+  internal$objects$tmp_lists <- tmp_lists
+
+  internal$parameters$gaussian.mu <- gaussian.mu
+  internal$parameters$gaussian.cov_mat <- gaussian.cov_mat
+
+  return(internal)
+}
+
+SRfun <- function(permute_id0 = 1, feature = 3, max_id_combination = X[, .N]) {
+  perm0 <- perm_dt[permute_id == permute_id0, perm]
+  position <- which(perm0[[1]] == feature) # Note: perm0, not the (undefined) perm
+
+  these_id_combinations <- c(1, X_perm[permute_id == permute_id0, id_combination], max_id_combination)
+  SR_and_SRi <- these_id_combinations[c(position, position + 1)]
+
+  return(SR_and_SRi)
+}
+
+
+#' @rdname prepare_data
+#' @export
+prepare_data.linear_gaussian <- function(internal, index_features = NULL, ...) {
+  x_train <- internal$data$x_train
+  x_explain <- internal$data$x_explain
+  n_explain <- internal$parameters$n_explain
+  gaussian.cov_mat <- internal$parameters$gaussian.cov_mat
+  gaussian.mu <- internal$parameters$gaussian.mu
+  n_features <- internal$parameters$n_features
+  linear_model_coef <- internal$parameters$linear_model_coef
+  S <- internal$objects$S
+
+  X <- internal$objects$X
+
+  x_explain0 <- as.matrix(x_explain)
+  dt_l <- list()
+
+  # TODO: Move this test somewhere else.
+  # if (!is.null(index_features)) {
+  #   stop("approach = 'linear_gaussian' can only be applied with n_batches = 1.")
+  # }
+
+  Tmu_list <- Tx_list <- list()
+  for (i in seq_len(nrow(S))) {
+
+  }
+
+  # OK, I just stop here now, as I realize I should do this in 2 different ways:
+  # 1. A new explain function which uses permutations, and then computes the Tmu and Tx above in a new function (not prepare_data),
+  #    to then compute the Shapley values with formula (9) in the "true to the model or data" paper
+  # 2. Create a lingauss-approach which computes the v(S) with the formula in the appendix of our paper (sketched right below),
+  #    since to compare with kernelSHAP, I will be needing that v(S) formula directly
+  # 3. I can then compare the different approaches (the 2 permutation approaches with different sampling schemes using 1., and with kernelSHAP using 2.)
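+
+  # A rough, illustrative sketch of the direct v(S) computation from point 2 above, with QS_list/US_list
+  # as built in setup_linear_gaussian() and the E[x | x_S] formula verified in
+  # inst/scripts/devel/devel_explain_linear.R (S_id/Sbar_id are hypothetical placeholders for the
+  # id_combination of S and of its complement Sbar, and x is a single row of x_explain0):
+  #   beta <- linear_model_coef[-1]
+  #   Ex_given_xS <- (QS_list[[Sbar_id]] - US_list[[S_id]]) %*% gaussian.mu + (QS_list[[S_id]] + US_list[[S_id]]) %*% x
+  #   vS <- linear_model_coef[1] + sum(beta * Ex_given_xS)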
+ + + for (i in seq_len(n_explain)) { + l <- lapply( + X = features, + FUN = sample_gaussian, + n_samples = n_samples, + mu = gaussian.mu, + cov_mat = gaussian.cov_mat, + m = n_features, + x_explain = x_explain0[i, , drop = FALSE] + ) + + dt_l[[i]] <- data.table::rbindlist(l, idcol = "id_combination") + dt_l[[i]][, w := 1 / n_samples] + dt_l[[i]][, id := i] + if (!is.null(index_features)) dt_l[[i]][, id_combination := index_features[id_combination]] + } + + dt <- data.table::rbindlist(dt_l, use.names = TRUE, fill = TRUE) + return(dt) +} diff --git a/R/model_setup_R.R b/R/model_setup_R.R index 87446700a..6c8a21a37 100644 --- a/R/model_setup_R.R +++ b/R/model_setup_R.R @@ -66,3 +66,15 @@ get_feature_specs <- function(get_model_specs, model) { return(feature_specs) } + + + +get_linear_coeff <- function(model){ + # Checks that model is of class 'lm' + if(class(model)[1] != "lm"){ + stop("explain_linear only supports explaining models of class 'lm'") + } + + # Extracts the coefficients from a linear model + return(model$coefficients) +} diff --git a/R/setup.R b/R/setup.R index 018e03b30..b960deab0 100644 --- a/R/setup.R +++ b/R/setup.R @@ -20,9 +20,12 @@ setup <- function(x_train, x_explain, approach, + shap_approach, + paired_shap_sampling, prediction_zero, output_size = 1, n_combinations, + n_permutations, group, n_samples, n_batches, @@ -46,9 +49,12 @@ setup <- function(x_train, internal$parameters <- get_parameters( approach = approach, + shap_approach = shap_approach, + paired_shap_sampling = paired_shap_sampling, prediction_zero = prediction_zero, output_size = output_size, n_combinations = n_combinations, + n_permutations = n_permutations, group = group, n_samples = n_samples, n_batches = n_batches, @@ -160,6 +166,7 @@ check_n_combinations <- function(internal) { n_combinations <- internal$parameters$n_combinations n_features <- internal$parameters$n_features n_groups <- internal$parameters$n_groups + shap_approach <- internal$parameters$shap_approach type <- internal$parameters$type @@ -170,12 +177,13 @@ check_n_combinations <- function(internal) { xreg <- internal$data$xreg if (!is_groupwise) { - if (n_combinations <= n_features) { + if (shap_approach=="kernel" && n_combinations <= n_features) { stop(paste0( "`n_combinations` (", n_combinations, ") has to be greater than the number of components to decompose ", " the forecast onto:\n", "`horizon` (", horizon, ") + `explain_y_lags` (", explain_y_lags, ") ", - "+ sum(`explain_xreg_lags`) (", sum(explain_xreg_lags), ").\n" + "+ sum(`explain_xreg_lags`) (", sum(explain_xreg_lags), ")\n", + "for shap_approach = 'kernel'." 
))
     }
   } else {
@@ -189,8 +197,8 @@
     }
   } else {
     if (!is_groupwise) {
-      if (n_combinations <= n_features) {
-        stop("`n_combinations` has to be greater than the number of features.")
+      if (shap_approach == "kernel" && n_combinations <= n_features) {
+        stop("`n_combinations` has to be greater than the number of features for shap_approach = 'kernel'.")
       }
     } else {
       if (n_combinations <= n_groups) {
@@ -385,13 +393,22 @@ get_extra_parameters <- function(internal) {
 }
 
 #' @keywords internal
-get_parameters <- function(approach, prediction_zero, output_size = 1, n_combinations, group, n_samples,
+get_parameters <- function(approach, shap_approach, paired_shap_sampling, prediction_zero, output_size = 1, n_combinations, n_permutations, group, n_samples,
                            n_batches, seed, keep_samp_for_vS, type, horizon, train_idx, explain_idx, explain_y_lags, explain_xreg_lags, group_lags = NULL, MSEv_uniform_comb_weights, timing, is_python, ...) {
   # Check input type for approach
   # approach is checked more comprehensively later
 
+  # Check that shap_approach is a character equal to either "kernel" or "permutation"
+  if (!is.character(shap_approach) || !(shap_approach %in% c("kernel", "permutation"))) {
+    stop("`shap_approach` must be a character equal to either 'kernel' or 'permutation'.")
+  }
+
+  if (!is.logical(paired_shap_sampling)) {
+    stop("`paired_shap_sampling` must be a logical.")
+  }
+
   # n_combinations
   if (!is.null(n_combinations) &&
     !(is.wholenumber(n_combinations) &&
@@ -401,6 +418,16 @@ get_parameters <- function(approach, prediction_zero, output_size = 1, n_combina
     stop("`n_combinations` must be NULL or a single positive integer.")
   }
 
+  # n_permutations
+  if (!is.null(n_permutations) &&
+    !(is.wholenumber(n_permutations) &&
+      length(n_permutations) == 1 &&
+      !is.na(n_permutations) &&
+      n_permutations > 0)) {
+    stop("`n_permutations` must be NULL or a single positive integer.")
+  }
+
+
   # group (checked more thoroughly later)
   if (!is.null(group) &&
     !is.list(group)) {
@@ -437,8 +464,8 @@ get_parameters <- function(approach, prediction_zero, output_size = 1, n_combina
   }
 
   # type
-  if (!(type %in% c("normal", "forecast"))) {
-    stop("`type` must be either `normal` or `forecast`.\n")
+  if (!(type %in% c("normal", "forecast", "linear_gaussian"))) {
+    stop("`type` must be either `normal`, `forecast` or `linear_gaussian`.\n")
   }
 
   # parameters only used for type "forecast"
@@ -492,8 +519,11 @@ get_parameters <- function(approach, prediction_zero, output_size = 1, n_combina
   # Getting basic input parameters
   parameters <- list(
     approach = approach,
+    shap_approach = shap_approach,
+    paired_shap_sampling = paired_shap_sampling,
     prediction_zero = prediction_zero,
     n_combinations = n_combinations,
+    n_permutations = n_permutations,
     group = group,
     n_samples = n_samples,
     n_batches = n_batches,
@@ -512,7 +542,12 @@ get_parameters <- function(approach, prediction_zero, output_size = 1, n_combina
   parameters <- append(parameters, list(...))
 
   # Setting exact based on n_combinations (TRUE if NULL)
-  parameters$exact <- ifelse(is.null(parameters$n_combinations), TRUE, FALSE)
+  if (shap_approach == "permutation") {
+    parameters$exact <- ifelse(is.null(parameters$n_permutations), TRUE, FALSE)
+    # parameters$n_combinations <- 3 * parameters$n_permutations # TODO: Do this properly.
(Temporary setting this parameter to avoid errors in the code) + } else { + parameters$exact <- ifelse(is.null(parameters$n_combinations), TRUE, FALSE) + } return(parameters) } diff --git a/R/setup_computation.R b/R/setup_computation.R index 195e1931e..8e278691c 100644 --- a/R/setup_computation.R +++ b/R/setup_computation.R @@ -22,6 +22,25 @@ setup_computation <- function(internal, model, predict_model) { return(internal) } + +shapley_setup_linear_gaussian <- function(internal) { + exact <- internal$parameters$exact + n_features0 <- internal$parameters$n_features + n_permutations <- internal$parameters$n_permutations + + perms_mat <- feature_combinations_perm( + m = n_features0, + exact = exact, + n_permutations = n_permutations, + paired_shap_sampling = TRUE, + return_perm_list_only = TRUE + ) + internal$objects$perms_mat <- perms_mat + + return(internal) +} + + #' @keywords internal shapley_setup_forecast <- function(internal) { exact <- internal$parameters$exact @@ -133,16 +152,35 @@ shapley_setup <- function(internal) { exact <- internal$parameters$exact n_features0 <- internal$parameters$n_features n_combinations <- internal$parameters$n_combinations + n_permutations <- internal$parameters$n_permutations is_groupwise <- internal$parameters$is_groupwise + shap_approach <- internal$parameters$shap_approach + paired_shap_sampling = internal$parameters$paired_shap_sampling group_num <- internal$objects$group_num + if(shap_approach == "permutation"){ + +# TODO: Add grouping to the permutation approach later + X_list <- feature_combinations_perm( + m = n_features0, + exact = exact, + n_permutations = n_permutations, + paired_shap_sampling = paired_shap_sampling + ) + X <- X_list$X + internal$objects$X_perm <- X_list$X_perm + internal$objects$perm_dt <- X_list$perm_dt + W <- NULL + + } else { X <- feature_combinations( m = n_features0, exact = exact, n_combinations = n_combinations, weight_zero_m = 10^6, - group_num = group_num + group_num = group_num, + paired_shap_sampling = paired_shap_sampling ) # Get weighted matrix ---------------- @@ -152,6 +190,13 @@ shapley_setup <- function(internal) { is_groupwise = is_groupwise ) + # Updating parameters$exact as done in feature_combinations + if (!exact && n_combinations >= 2^n_features0) { + internal$parameters$exact <- TRUE + } + } + + ## Get feature matrix --------- S <- feature_matrix_cpp( features = X[["features"]], @@ -159,12 +204,6 @@ shapley_setup <- function(internal) { ) #### Updating parameters #### - - # Updating parameters$exact as done in feature_combinations - if (!exact && n_combinations >= 2^n_features0) { - internal$parameters$exact <- TRUE - } - internal$parameters$n_combinations <- nrow(S) # Updating this parameter in the end based on what is actually used. # This will be obsolete later @@ -180,6 +219,232 @@ shapley_setup <- function(internal) { return(internal) } + +feature_combinations_perm <- function(m, exact = TRUE, n_permutations = NULL,paired_shap_sampling=FALSE,return_perm_list_only = FALSE) { + + if (!exact) { + # Switch to exact for feature-wise method + if (n_permutations >= factorial(m)) { + n_combinations <- 2^m + n_permutations <- factorial(m) + exact <- TRUE + message( + paste0( + "Success with message:\n", + "n_permutations is larger than or equal to m! = ", factorial(m), ". 
\n", + "Using exact instead.\n" + ) + ) + } + } + + if (exact) { + perms_mat <- feature_permute_exact(m) + } else { + perms_mat <- feature_permute_samp(m, n_permutations,paired_shap_sampling) + } + if(return_perm_list_only == TRUE){ + return(perms_mat) + } else { + + # Convert the matrix to a list of row vectors + perm_list <- lapply(seq_len(nrow(perms_mat)), function(i) perms_mat[i, ]) + # TODO: rewrite code using this to work direcrly with the perms_mat + + + perm_dt <- data.table( + permute_id = seq_along(perm_list), + perm = as.list(perm_list) + ) + + ret <- X_from_perm_dt(perm_dt) + ret$perm_list = perm_list + ret$perm_dt = perm_dt # Here this is a data.table of the permutations + + } + + return(ret) +} + +unrank_permutations_mat <- function(ranks, n) { + # Precompute factorials + factorials <- sapply(0:(n-1), factorial) + + # Initialize the matrix of permutations + permutations <- matrix(nrow = length(ranks), ncol = n) + + for (r in seq_along(ranks)) { + rank <- ranks[r] + elements <- 1:n + permutation <- integer(n) + + for(i in seq_len(n)){ + factorial <- factorials[n - i + 1] + index <- rank %/% factorial + rank <- rank %% factorial + permutation[i] <- elements[index + 1] + elements <- elements[-(index + 1)] + } + + permutations[r, ] <- permutation + } + + return(permutations) +} + +feature_permute_samp <- function(m, n_permutations,paired_shap_sampling=TRUE) { + # This function generates random permutations of the features using the + # ranking/unranking approach + + tot_no_permutations <- factorial(m) + + # Generate unique random permutation rankings that ought to be transformed to permutations + ranks = sample.int(tot_no_permutations, n_permutations, replace = FALSE)-1 + + # Generates the permutations + perms <- unrank_permutations_mat(ranks,m) + + if(paired_shap_sampling==TRUE){ + # Gets the reverse (paired) permutations + rev_perms <- perms[,seq(m,1)] + + # Since there is no guarantee that the reverse permutations are not sampled in perms, + # we combine the two one by one to ensure that the reverse permutations are always + # present + # Note that we sample 2*n_permutations permutations also in the case of paired_shap_sampling = TRUE since we need to guarantee that we at least have n_permutations unique permutations + comb_perms <- matrix(NA,nrow=2*n_permutations,ncol=m) + comb_perms[seq(1,2*n_permutations-1,by=2),] <- perms + comb_perms[seq(2,2*n_permutations,by=2),] <- rev_perms + + final_perms_mat <- unique(comb_perms)[seq_len(n_permutations),] + return(final_perms_mat) + } else { + return(perms) + } +} + +feature_permute_exact <- function(m) { + # Generate all possible combinations of the features using the ranking/unranking approach + + ranks <- seq(factorial(m))-1 + return(unrank_permutations_mat(ranks = ranks, n = m)) +} + +X_from_perm_dt_linear_gaussian <- function(perm_dt){ + m <- length(perm_dt[["perm"]][[1]]) + + perm_list<- list() # The perm_list includes, for all features, the S+j sets that are mapped from the permutations. 
+
+X_from_perm_dt_linear_gaussian <- function(perm_dt) {
+  m <- length(perm_dt[["perm"]][[1]])
+
+  # The perm_list includes, for all features, the S+j sets that are mapped from the permutations.
+  # The first element is the empty set, to include that one as well.
+  perm_list <- list()
+  for (j in seq_len(m)) {
+    perm_list[[j]] <- list(numeric(0))
+  }
+  for (i in seq_len(nrow(perm_dt))) {
+    perm0 <- perm_dt[i, perm][[1]]
+    for (j in seq_len(m)) {
+      position <- which(perm0 == j)
+      perm_list[[j]][[i + 1]] <- sort(perm0[seq_len(position)])
+    }
+  }
+
+  X_perm_list <- list()
+  for (j in seq_len(m)) {
+    X_perm_list[[j]] <- data.table(n_features = sapply(perm_list[[j]], length))
+    X_perm_list[[j]][, n_features := as.integer(n_features)]
+    X_perm_list[[j]][, features := perm_list[[j]]]
+    X_perm_list[[j]][, permute_id := c(NA, seq_len(nrow(perm_dt)))]
+    data.table::setkeyv(X_perm_list[[j]], c("n_features", "permute_id"))
+    X_perm_list[[j]][!is.finite(permute_id), permute_id := NA]
+
+    # Temporarily create a character string to merge back on below (cannot merge on lists)
+    X_perm_list[[j]][, features_tmp := sapply(features, paste, collapse = " ")]
+    X_perm_list[[j]][, feature_contrib := j]
+  }
+
+  X_perm <- rbindlist(X_perm_list)
+  data.table::setkeyv(X_perm, c("n_features", "permute_id"))
+
+  X_perm[, is_duplicate := duplicated(features_tmp)]
+  X_perm[, rowid := .I]
+  X <- copy(X_perm[is_duplicate == FALSE])
+  X[, id_combination := .I]
+  X[, is_duplicate := NULL]
+
+  # Merge back to get the id_combination mapping also in X_perm
+  X_perm <- merge(X_perm, X[, .(features_tmp, id_combination)], by = "features_tmp", all.x = TRUE)
+  data.table::setorderv(X_perm, "rowid")
+  X_perm[, rowid := NULL]
+  X[, rowid := NULL]
+
+  # X_perm[, features_tmp := NULL]
+  # X[, features_tmp := NULL]
+
+  setcolorder(X, c("id_combination", "permute_id", "features", "n_features"))
+
+  ret <- list(X = X[], X_perm = X_perm[])
+  return(ret)
+}
+
+
+X_from_perm_dt <- function(perm_dt) {
+  m <- length(perm_dt[["perm"]][[1]])
+  tmp <- list()
+
+  for (i in 1:nrow(perm_dt)) {
+    perm0 <- perm_dt[i, perm][[1]]
+    for (j in seq_len(m - 1)) {
+      tmp[[length(tmp) + 1]] <- sort(perm0[1:j])
+    }
+  }
+
+  feature_sample_all <- c(list(integer(0)), tmp, list(c(1:m)))
+  X_perm <- data.table(n_features = sapply(feature_sample_all, length))
+  X_perm[, n_features := as.integer(n_features)]
+
+  # Get the number of occurrences and duplicated rows -------
+  is_duplicate <- NULL # due to NSE notes in R CMD check
+  r <- shapr:::helper_feature(m, feature_sample_all)
+  X_perm[, is_duplicate := r[["is_duplicate"]]]
+  X_perm[, features := feature_sample_all]
+  X_perm[, permute_id := c(NA, rep(seq_len(nrow(perm_dt)), each = m - 1), Inf)]
+  data.table::setkeyv(X_perm, c("n_features", "permute_id"))
+  X_perm[!is.finite(permute_id), permute_id := NA]
+
+  # Temporarily create a character string to merge back on below (cannot merge on lists)
+  X_perm[, features_tmp := sapply(features, paste, collapse = " ")]
+  X_perm[, rowid := .I]
+
+  if (any(X_perm[["is_duplicate"]])) {
+    X <- X_perm[is_duplicate == FALSE]
+  } else {
+    X <- copy(X_perm)
+  }
+  X[, id_combination := .I]
+  X[, is_duplicate := NULL]
+
+  # Merge back to get the id_combination mapping also in X_perm
+  X_perm <- merge(X_perm, X[, .(features_tmp, id_combination)], by = "features_tmp", all.x = TRUE)
+  data.table::setorderv(X_perm, "rowid")
+  X_perm[, rowid := NULL]
+  X[, rowid := NULL]
+
+  # X_perm[, features_tmp := NULL]
+  # X[, features_tmp := NULL]
+
+  setcolorder(X, c("id_combination", "permute_id", "features", "n_features"))
+
+  ret <- list(X = X[], X_perm = X_perm[])
+  return(ret)
+}
+
+
+
 #' Define feature combinations, and fetch additional information about each unique combination
 #'
 #' @param m Positive integer. Total number of features.
@@ -217,7 +482,7 @@ shapley_setup <- function(internal) { #' #' # Subsample of combinations #' x <- feature_combinations(exact = FALSE, m = 10, n_combinations = 1e2) -feature_combinations <- function(m, exact = TRUE, n_combinations = 200, weight_zero_m = 10^6, group_num = NULL) { +feature_combinations <- function(m, exact = TRUE, n_combinations = 200, weight_zero_m = 10^6, group_num = NULL, paired_shap_sampling = FALSE) { m_group <- length(group_num) # The number of groups # Force user to use a natural number for n_combinations if m > 13 @@ -285,7 +550,7 @@ feature_combinations <- function(m, exact = TRUE, n_combinations = 200, weight_z if (exact) { dt <- feature_exact(m, weight_zero_m) } else { - dt <- feature_not_exact(m, n_combinations, weight_zero_m) + dt <- feature_not_exact(m, n_combinations, weight_zero_m,unique_sampling = TRUE,paired_shap_sampling = paired_shap_sampling) stopifnot( data.table::is.data.table(dt), !is.null(dt[["p"]]) @@ -323,7 +588,7 @@ feature_exact <- function(m, weight_zero_m = 10^6) { } #' @keywords internal -feature_not_exact <- function(m, n_combinations = 200, weight_zero_m = 10^6, unique_sampling = TRUE) { +feature_not_exact <- function(m, n_combinations = 200, weight_zero_m = 10^6, unique_sampling = TRUE,paired_shap_sampling = FALSE) { # Find weights for given number of features ---------- n_features <- seq(m - 1) n <- sapply(n_features, choose, n = m) @@ -336,17 +601,28 @@ feature_not_exact <- function(m, n_combinations = 200, weight_zero_m = 10^6, uni if (unique_sampling) { while (unique_samples < n_combinations - 2) { + if(paired_shap_sampling==TRUE){ + n_samps <- ceiling((n_combinations - unique_samples - 2)/2) + } else { + n_samps <- n_combinations - unique_samples - 2 + } + # Sample number of chosen features ---------- n_features_sample <- sample( x = n_features, - size = n_combinations - unique_samples - 2, # Sample -2 as we add zero and m samples below + size = n_samps, # Sample -2 as we add zero and m samples below replace = TRUE, prob = p ) # Sample specific set of features ------- feature_sample <- sample_features_cpp(m, n_features_sample) - feature_sample_all <- c(feature_sample_all, feature_sample) + if(paired_shap_sampling==TRUE){ + feature_sample_paired <- lapply(feature_sample, function(x) seq(m)[-x]) + feature_sample_all <- c(feature_sample_all, feature_sample,feature_sample_paired) + } else { + feature_sample_all <- c(feature_sample_all, feature_sample) + } unique_samples <- length(unique(feature_sample_all)) } } else { @@ -619,6 +895,7 @@ create_S_batch_new <- function(internal, seed = NULL) { approach0 <- internal$parameters$approach n_combinations <- internal$parameters$n_combinations n_batches <- internal$parameters$n_batches + shap_approach <- internal$parameters$shap_approach X <- internal$objects$X @@ -665,7 +942,9 @@ create_S_batch_new <- function(internal, seed = NULL) { # with respect to shapley_weight X[, randomorder := sample(.N)] data.table::setorder(X, randomorder) # To avoid smaller id_combinations always proceeding large ones - data.table::setorder(X, shapley_weight) + if(shap_approach!="permutation") { + data.table::setorder(X, shapley_weight) + } batch_counter <- 0 for (i in seq_along(approach_vec)) { @@ -678,7 +957,9 @@ create_S_batch_new <- function(internal, seed = NULL) { # Spreading the batches X[, randomorder := sample(.N)] data.table::setorder(X, randomorder) - data.table::setorder(X, shapley_weight) + if(shap_approach!="permutation") { + data.table::setorder(X, shapley_weight) + } X[!(n_features %in% c(0, 
n_features0)), batch := ceiling(.I / .N * n_batches)]
 }
diff --git a/inst/scripts/devel/TODO permute branch b/inst/scripts/devel/TODO permute branch
new file mode 100644
index 000000000..aaeaa3a1b
--- /dev/null
+++ b/inst/scripts/devel/TODO permute branch
@@ -0,0 +1,17 @@
+TODO: permute-branch
+
+- General:
+  - Make sampling permutations faster (Rcpp). This could very well also be a matrix. Moreover, look at what the slow part of the sampling code is, using Rprof(vis)
+  - Clean up unused stuff
+  X- Merge master branch in here
+  - Make the creation of the S and X matrices faster
+  - Think about how to do the looping with
+  - Implement grouping
+  - Create a function which takes the explain object and a new x_explain input, and allows computing the Shapley values for that super-fast using the Tx & Tmu lists.
+
+- LinearGaussian:
+  - write the looping over Tmu/Tx in Rcpp for speed
+  - Add MSE computation
+
+- Permute:
+  - make the finalization part faster
diff --git a/inst/scripts/devel/devel_explain_linear.R b/inst/scripts/devel/devel_explain_linear.R
new file mode 100644
index 000000000..a5bfb92e5
--- /dev/null
+++ b/inst/scripts/devel/devel_explain_linear.R
@@ -0,0 +1,99 @@
+set.seed(12345)
+
+data <- data.table::as.data.table(airquality)
+
+data_complete <- data[complete.cases(airquality), ]
+data_complete <- data_complete[sample(seq_len(.N))] # Shuffle the rows
+
+y_var_numeric <- "Ozone"
+
+x_var_numeric <- c("Solar.R", "Wind", "Temp", "Month") #, "Day")
+
+data_train <- head(data_complete, -3)
+data_explain <- tail(data_complete, 3)
+
+x_train_numeric <- data_train[, ..x_var_numeric]
+
+x_explain_numeric <- data_explain[, ..x_var_numeric]
+
+lm_formula_numeric <- as.formula(paste0(y_var_numeric, " ~ ", paste0(x_var_numeric, collapse = " + ")))
+
+model_lm_numeric <- lm(lm_formula_numeric, data = data_complete)
+
+p0 <- data_train[, mean(get(y_var_numeric))]
+
+x_explain_numeric_new <- rbind(x_explain_numeric, t(rep(0, length(x_var_numeric))), t(rep(1, length(x_var_numeric))), use.names = FALSE)
+
+test <- explain(
+  model = model_lm_numeric,
+  x_explain = x_explain_numeric_new,
+  x_train = x_train_numeric,
+  approach = "gaussian",
+  shap_approach = "permutation",
+  prediction_zero = p0, n_samples = 10^4
+)
+test
+# none Solar.R    Wind       Temp   Month
+# 1: 42.44  -8.379   7.944   15.29533  0.6522
+# 2: 42.44   4.919  -4.755  -11.00089 -1.0133
+# 3: 42.44   7.302 -25.612   -0.08335 -0.5415
+# 4: 42.44 -12.371  72.269 -154.31630 -6.0802
+# 5: 42.44 -11.922  67.037 -153.72291 -6.2782
+
+# debugonce(explain_linear)
+
+test2 <- explain_linear(
+  model = model_lm_numeric,
+  x_explain = x_explain_numeric_new,
+  x_train = x_train_numeric,
+  n_permutations = 4
+)
+test2
+
+test2 <- explain_linear(
+  model = model_lm_numeric,
+  x_explain = x_explain_numeric_new,
+  x_train = x_train_numeric
+)
+test2
+
+unique(test2$internal$objects$US_list)
+
+### We don't get the same results...
### Investigating by looking at the v(S), we get:
+
+test$internal$output$dt_vS[id_combination == 2]
+
+# Look at the second id_combination (S = {1})
+
+test2$internal$objects$S[c(2, 31), ]
+
+QS <- test2$internal$objects$QS_list[[2]]
+QSbar <- test2$internal$objects$QS_list[[31]]
+US <- test2$internal$objects$US_list[[2]]
+mu <- test2$internal$parameters$gaussian.mu
+Sig <- test2$internal$parameters$gaussian.cov_mat
+x <- unlist(test2$internal$data$x_explain[1, ])
+coefs <- test2$internal$parameters$linear_model_coef
+b <- coefs[1]
+beta <- coefs[-1]
+
+Ex <- (QSbar - US) %*% mu + (QS + US) %*% x
+beta %*% Ex + b
+
+test$internal$output$dt_vS[id_combination == 2]
+
+condmu_manual <- c(x[1], mu[-1] + Sig[-1, 1] %*% solve(Sig[1, 1]) * (x[1] - mu[1]))
+# Same as Ex
+
+### TODO: Check also other combinations, but it seems that the formula for E[x|xS] is correct, so the error must be in the Tx/Tmu stuff.
diff --git a/inst/scripts/devel/devel_explain_linear_2.R b/inst/scripts/devel/devel_explain_linear_2.R
new file mode 100644
index 000000000..05c1fbc31
--- /dev/null
+++ b/inst/scripts/devel/devel_explain_linear_2.R
@@ -0,0 +1,119 @@
+set.seed(12345)
+library(data.table)
+
+p <- 8
+n_train <- 10^3
+beta <- rnorm(p) # rep(1,p)
+b <- rnorm(1)
+mu <- rnorm(p) # rep(1,p)
+
+# Random covariance matrix
+A <- matrix(runif(p^2, min = -1, max = 1), ncol = p)
+Sig <- t(A) %*% A
+# Sig <- diag(4)
+
+data <- as.data.table(mvtnorm::rmvnorm(n_train, mean = mu, sigma = Sig))
+
+y <- as.vector(b + as.matrix(data) %*% beta + rnorm(n_train, sd = .01))
+
+model <- lm(y ~ ., data = cbind(y, data))
+
+p0 <- as.numeric(b + mu %*% beta)
+
+x_explain <- rbind(head(data, 2),
+  t(rep(0, p)),
+  t(rep(1, p)),
+  t(1:p),
+  use.names = FALSE
+)
+
+profvis({
+  test <- explain(
+    model = model,
+    x_explain = x_explain,
+    x_train = data,
+    approach = "gaussian",
+    shap_approach = "permutation",
+    prediction_zero = p0,
+    gaussian.cov_mat = Sig,
+    gaussian.mu = mu,
+    n_samples = 10,
+    n_permutations = 10^4
+  )
+})
+test
+
+# debugonce(explain_linear)
+
+profvis({
+  test2 <- explain_linear(
+    model = model,
+    x_explain = x_explain,
+    x_train = data,
+    gaussian.cov_mat = Sig,
+    gaussian.mu = mu, n_permutations = 10^4
+  )
+})
+test2$internal$objects$perm_dt[, .N]
+
+all_lists <- NULL
+for (i in 1:p) {
+  all_lists <- c(all_lists, test2$internal$objects$tmp_lists[[i]]$Udiff)
+}
+length(unique(all_lists))
+
+test2$internal$objects$tmp_lists[[1]]$Udiff[[1]] + test2$internal$objects$tmp_lists[[1]]$Udiff[[2]]
+test2$internal$objects$tmp_lists[[1]]$Udiff[[3]] + test2$internal$objects$tmp_lists[[1]]$Udiff[[4]]
+test2$internal$objects$tmp_lists[[1]]$Udiff[[5]] + test2$internal$objects$tmp_lists[[1]]$Udiff[[6]]
+
+test2$internal$objects$tmp_lists[[2]]$Udiff[[1]] + test2$internal$objects$tmp_lists[[2]]$Udiff[[2]]
+test2$internal$objects$tmp_lists[[2]]$Udiff[[3]] + test2$internal$objects$tmp_lists[[2]]$Udiff[[4]]
+test2$internal$objects$tmp_lists[[2]]$Udiff[[5]] + test2$internal$objects$tmp_lists[[2]]$Udiff[[6]]
+
+length(unique(test2$internal$objects$tmp_lists[[1]]$Udiff))
+length(unique(test2$internal$objects$tmp_lists[[2]]$Udiff))
+length(unique(test2$internal$objects$tmp_lists[[3]]$Udiff))
+length(unique(test2$internal$objects$tmp_lists[[4]]$Udiff))
+
+test2$internal$objects$US_list
+test2$internal$objects$QS_list
+test2$internal$objects$Tmu_list
+test2$internal$objects$Tx_list
+
+### Something odd here: the Tx_list is not looking the same for different features, although there is no difference between them...
+# Look more closely into how it is created
+
+### We don't get the same results...
Investigating by looking at the v(S) we get: + +test$internal$output$dt_vS[id_combination==2] + +# Look at the second id_combination (S={1}) + +test2$internal$objects$S[c(2,31),] + +QS <- test2$internal$objects$QS_list[[2]] +QSbar <- test2$internal$objects$QS_list[[31]] +US <- test2$internal$objects$US_list[[2]] +mu <- test2$internal$parameters$gaussian.mu +Sig <- test2$internal$parameters$gaussian.cov +x <- unlist(test2$internal$data$x_explain[1,]) +coefs <- test2$internal$parameters$linear_model_coef +b <- coefs[1] +beta <- coefs[-1] + +Ex <- (QSbar-US)%*%mu + (QS+US)%*%x +beta%*%Ex + b + +test$internal$output$dt_vS[id_combination==2] + +condmu_manual <- c(x[1],mu[-1] + Sig[-1,1]%*%solve(Sig[1,1])*(x[1]-mu[1])) +# Same as Ex + + +### TODO: Check also other combinations, but seems that the formula for E[x|xS] is correct, so error must be in the Tx/Tmu stuff. + + + + + diff --git a/inst/scripts/devel/devel_permutation_sampling.R b/inst/scripts/devel/devel_permutation_sampling.R new file mode 100644 index 000000000..1448729c7 --- /dev/null +++ b/inst/scripts/devel/devel_permutation_sampling.R @@ -0,0 +1,168 @@ + + +unrank_permutation <- function(rank, n) { + elements <- 1:n + permutation <- integer(n) + + for(i in 1:n){ + factorial <- factorial(n - i) + index <- rank %/% factorial + rank <- rank %% factorial + permutation[i] <- elements[index+1] + elements <- elements[-(index+1)] + } + + return(permutation) +} + +unrank_permutations <- function(ranks, n) { + # Precompute factorials + factorials <- sapply(0:(n-1), factorial) + + # Initialize the list of permutations + permutations <- vector("list", length(ranks)) + + for (r in seq_along(ranks)) { + rank <- ranks[r] + elements <- 1:n + permutation <- integer(n) + + for(i in seq_len(n)){ + factorial <- factorials[n - i + 1] + index <- rank %/% factorial + rank <- rank %% factorial + permutation[i] <- elements[index + 1] + elements <- elements[-(index + 1)] + } + + permutations[[r]] <- permutation + } + + return(permutations) +} + +unrank_permutations_mat <- function(ranks, n) { + # Precompute factorials + factorials <- sapply(0:(n-1), factorial) + + # Initialize the matrix of permutations + permutations <- matrix(nrow = length(ranks), ncol = n) + + for (r in seq_along(ranks)) { + rank <- ranks[r] + elements <- 1:n + permutation <- integer(n) + + for(i in seq_len(n)){ + factorial <- factorials[n - i + 1] + index <- rank %/% factorial + rank <- rank %% factorial + permutation[i] <- elements[index + 1] + elements <- elements[-(index + 1)] + } + + permutations[r, ] <- permutation + } + + return(permutations) +} + +aa=unrank_permutations_mat(seq_len(factorial(n))-1,n) + +n <- 8 +tot_no_permutations <- factorial(n) + +samples <- factorial(n)#5*10^5 + +ranks = sample.int(tot_no_permutations, samples, replace = FALSE) + +perms <- lapply(ranks, unrank_permutation, n = n) + +perms <- unrank_permutations(1:24,n) # This is the fastest + +#### My coding here #### + +#### THIS SEEMS TO WORK + +n <- 4 +tot_no_permutations <- factorial(n) + +samples <- 10#factorial(n)#5*10^5 + +ranks = sample.int(tot_no_permutations, samples, replace = FALSE)-1 + +perms <- unrank_permutations_mat(ranks,n) # This is the fastest + +rev_perms <- perms[,seq(n,1)] + +comb_perms <- matrix(NA,nrow=samples*2,ncol=n) +comb_perms[seq(1,2*samples-1,by=2),] <- perms +comb_perms[seq(2,2*samples,by=2),] <- rev_perms + +unique(comb_perms)[seq_len(samples),] + +#### higher dim #### + +n <- 15 +tot_no_permutations <- factorial(n) + +samples <- 10^5#factorial(n)#5*10^5 + +ranks = 
sample.int(tot_no_permutations, samples, replace = FALSE)-1 + +perms <- unrank_permutations_mat(ranks,n) # This is the fastest + +rev_perms <- perms[,seq(n,1)] + +comb_perms <- matrix(NA,nrow=samples*2,ncol=n) +comb_perms[seq(1,2*samples-1,by=2),] <- perms +comb_perms[seq(2,2*samples,by=2),] <- rev_perms + +final_perms <- unique(comb_perms)[seq_len(samples),] + + + +############### + +# Number of unique permutations desired +k <- 20 + +# Length of the vector +n <- 4 + +# Total number of permutations for a list of length n +total_perm <- factorial(n) + +# Generate k/2 random ranks +set.seed(123) # for reproducibility +ranks <- sample(0:(total_perm / 2 - 1), k / 2) + +# Calculate ranks for the reversed permutations +rev_ranks <- total_perm - 1 - ranks + +# Combine the ranks +all_ranks <- c(ranks, rev_ranks) + +# Generate the permutations +permutations <- unrank_permutations(all_ranks, n) + + + + + +############### + +lehmer_code <- function(permutation) { + n <- length(permutation) + code <- rep(0, n) + for (i in 1:n) { + for (j in (i+1):n) { + if (permutation[j] < permutation[i]) { + code[i] <- code[i] + 1 + } + } + } + return(code) +} + +print(lehmer_code(c(4,3,1,2))) diff --git a/inst/scripts/devel/devel_permute.R b/inst/scripts/devel/devel_permute.R new file mode 100644 index 000000000..2331f5aaa --- /dev/null +++ b/inst/scripts/devel/devel_permute.R @@ -0,0 +1,84 @@ +set.seed(12345) + +data <- data.table::as.data.table(airquality) + +data_complete <- data[complete.cases(airquality), ] +data_complete <- data_complete[sample(seq_len(.N))] # Sh + +y_var_numeric <- "Ozone" + +x_var_numeric <- c("Solar.R", "Wind", "Temp", "Month", "Day") + +data_train <- head(data_complete, -3) +data_explain <- tail(data_complete, 3) + +x_train_numeric <- data_train[, ..x_var_numeric] + +x_explain_numeric <- data_explain[, ..x_var_numeric] + +lm_formula_numeric <- as.formula(paste0(y_var_numeric, " ~ ", paste0(x_var_numeric, collapse = " + "))) + +model_lm_numeric <- lm(lm_formula_numeric, data = data_complete) + +p0 <- data_train[, mean(get(y_var_numeric))] + +test_kernel <-explain(model = model_lm_numeric, + x_explain = x_explain_numeric, + x_train = x_train_numeric, + approach = "empirical", + shap_approach = "kernel", + prediction_zero = p0) + +test_permute <-explain(model = model_lm_numeric, + x_explain = x_explain_numeric, + x_train = x_train_numeric, + approach = "empirical", + shap_approach = "permutation", + prediction_zero = p0) + + +test_kernel$shapley_values-test_permute$shapley_values + + +permute_list <- kernel_list <- list() +for(i in 1:10){ + set.seed(1+i) + + test_kernel <-explain(model = model_lm_numeric, + x_explain = x_explain_numeric, + x_train = x_train_numeric, + approach = "empirical", + shap_approach = "kernel", + prediction_zero = p0,n_combinations = 2^5-3,seed = i) + + + test_permute <-explain(model = model_lm_numeric, + x_explain = x_explain_numeric, + x_train = x_train_numeric, + approach = "empirical", + shap_approach = "permutation", + prediction_zero = p0,n_permutations = factorial(5)-3,seed=i) + permute_list[[i]] <- test_permute$shapley_values + kernel_list[[i]] <- test_kernel$shapley_values + + print(i) + +} + +permute_list <- lapply(permute_list,as.matrix) +kernel_list <- lapply(kernel_list,as.matrix) + +# permute is way more precise than kernel when sampling + +Reduce(`+`,permute_list)/length(permute_list) + +Reduce(`+`,kernel_list)/length(kernel_list) + +explain(model = model_lm_numeric, + x_explain = x_explain_numeric, + x_train = x_train_numeric, + approach = 
"empirical", + shap_approach = "kernel", + prediction_zero = p0) + + diff --git a/inst/scripts/devel/testing_permutation.R b/inst/scripts/devel/testing_permutation.R new file mode 100644 index 000000000..bfbf2db32 --- /dev/null +++ b/inst/scripts/devel/testing_permutation.R @@ -0,0 +1,202 @@ +library(xgboost) +#library(shapr) + +data("airquality") +data <- data.table::as.data.table(airquality) +data <- data[complete.cases(data), ] + +set.seed(123) +data[,V1:=rnorm(.N)] +data[,V2:=rnorm(.N)] +data[,V3:=rnorm(.N)] +data[,V4:=rnorm(.N)] +data[,V5:=rnorm(.N)] + +x_var <- c("Solar.R", "Wind", "Temp", "Month","Day","V1","V2","V3","V4","V5") +y_var <- "Ozone" + +ind_x_explain <- 1:6 +x_train <- data[-ind_x_explain, ..x_var] +y_train <- data[-ind_x_explain, get(y_var)] +x_explain <- data[ind_x_explain, ..x_var] + +# Looking at the dependence between the features +cor(x_train) + +# Fitting a basic xgboost model to the training data +model <- xgboost( + data = as.matrix(x_train), + label = y_train, + nround = 20, + verbose = FALSE +) + +# Specifying the phi_0, i.e. the expected prediction without any features +p0 <- mean(y_train) + +# Computing the actual Shapley values with kernelSHAP accounting for feature dependence using +# the empirical (conditional) distribution approach with bandwidth parameter sigma = 0.1 (default) +n_permutations_vec <- c(4,8,16,32,64) + + +exp_full <- explain( + model = model, + x_explain = x_explain, + x_train = x_train, + approach = "gaussian",seed = 123,n_batches=4, + shap_approach = "kernel", + n_combinations = NULL, + prediction_zero = p0,n_samples = 10000 +) + +exp_full$timing + +reps <- 10 +exp_perm_list <- exp_perm_paired_list <- exp_kern_list <- exp_kern_paired_list <- list() +mse_perm_mat <- mse_perm_paired_mat <- mse_kern_mat <- mse_kern_paired_mat <- matrix(NA,ncol=reps,nrow=length(n_permutations_vec)) +for (i in seq_along(n_permutations_vec)) { + exp_perm_list[[i]] <- exp_perm_paired_list[[i]] <- exp_kern_list[[i]] <- exp_kern_paired_list[[i]] <- list() + for (j in 1:reps){ + exp_perm_list[[i]][[j]] <- explain( + model = model, + x_explain = x_explain, + x_train = x_train, + approach = "gaussian",seed = 123+j,n_batches=1, + shap_approach = "permutation", + n_permutations = n_permutations_vec[i], + prediction_zero = p0 + ) + + exp_perm_paired_list[[i]][[j]] <- explain( + model = model, + x_explain = x_explain, + x_train = x_train, + approach = "gaussian",seed = 123+j,n_batches=1, + shap_approach = "permutation",paired_shap_sampling = TRUE, + n_permutations = n_permutations_vec[i], + prediction_zero = p0 + ) + + + exp_kern_list[[i]][[j]] <- explain( + model = model, + x_explain = x_explain, + x_train = x_train, + approach = "gaussian",seed = 123+j,n_batches=1, + shap_approach = "kernel", + n_combinations = exp_perm_list[[i]][[j]]$internal$parameters$n_combinations, + prediction_zero = p0 + ) + + exp_kern_paired_list[[i]][[j]] <- explain( + model = model, + x_explain = x_explain, + x_train = x_train, + approach = "gaussian",seed = 123+j,n_batches=1, + shap_approach = "kernel",paired_shap_sampling = TRUE, + n_combinations = exp_perm_list[[i]][[j]]$internal$parameters$n_combinations, + prediction_zero = p0 + ) + + +# print(exp_list[[i]]$timing) +# print(exp_kern_list[[i]]$timing) + + mse_perm_mat[i,j] <- mean(unlist((exp_perm_list[[i]][[j]]$shapley_values[,..x_var]-exp_full$shapley_values[,..x_var])^2)) + mse_perm_paired_mat[i,j] <- mean(unlist((exp_perm_paired_list[[i]][[j]]$shapley_values[,..x_var]-exp_full$shapley_values[,..x_var])^2)) + mse_kern_mat[i,j] <- 
diff --git a/inst/scripts/devel/testing_permutation.R b/inst/scripts/devel/testing_permutation.R
new file mode 100644
index 000000000..bfbf2db32
--- /dev/null
+++ b/inst/scripts/devel/testing_permutation.R
@@ -0,0 +1,202 @@
+library(xgboost)
+#library(shapr)
+
+data("airquality")
+data <- data.table::as.data.table(airquality)
+data <- data[complete.cases(data), ]
+
+set.seed(123)
+data[,V1:=rnorm(.N)]
+data[,V2:=rnorm(.N)]
+data[,V3:=rnorm(.N)]
+data[,V4:=rnorm(.N)]
+data[,V5:=rnorm(.N)]
+
+x_var <- c("Solar.R", "Wind", "Temp", "Month", "Day", "V1", "V2", "V3", "V4", "V5")
+y_var <- "Ozone"
+
+ind_x_explain <- 1:6
+x_train <- data[-ind_x_explain, ..x_var]
+y_train <- data[-ind_x_explain, get(y_var)]
+x_explain <- data[ind_x_explain, ..x_var]
+
+# Looking at the dependence between the features
+cor(x_train)
+
+# Fitting a basic xgboost model to the training data
+model <- xgboost(
+  data = as.matrix(x_train),
+  label = y_train,
+  nround = 20,
+  verbose = FALSE
+)
+
+# Specifying the phi_0, i.e. the expected prediction without any features
+p0 <- mean(y_train)
+
+# Computing reference ("exact") Shapley values with the kernel shap_approach, accounting for
+# feature dependence using the Gaussian approach with all combinations and a large n_samples
+n_permutations_vec <- c(4, 8, 16, 32, 64)
+
+
+exp_full <- explain(
+  model = model,
+  x_explain = x_explain,
+  x_train = x_train,
+  approach = "gaussian", seed = 123, n_batches = 4,
+  shap_approach = "kernel",
+  n_combinations = NULL,
+  prediction_zero = p0, n_samples = 10000
+)
+
+exp_full$timing
+
+reps <- 10
+exp_perm_list <- exp_perm_paired_list <- exp_kern_list <- exp_kern_paired_list <- list()
+mse_perm_mat <- mse_perm_paired_mat <- mse_kern_mat <- mse_kern_paired_mat <- matrix(NA, ncol = reps, nrow = length(n_permutations_vec))
+for (i in seq_along(n_permutations_vec)) {
+  exp_perm_list[[i]] <- exp_perm_paired_list[[i]] <- exp_kern_list[[i]] <- exp_kern_paired_list[[i]] <- list()
+  for (j in 1:reps) {
+    exp_perm_list[[i]][[j]] <- explain(
+      model = model,
+      x_explain = x_explain,
+      x_train = x_train,
+      approach = "gaussian", seed = 123 + j, n_batches = 1,
+      shap_approach = "permutation",
+      n_permutations = n_permutations_vec[i],
+      prediction_zero = p0
+    )
+
+    exp_perm_paired_list[[i]][[j]] <- explain(
+      model = model,
+      x_explain = x_explain,
+      x_train = x_train,
+      approach = "gaussian", seed = 123 + j, n_batches = 1,
+      shap_approach = "permutation", paired_shap_sampling = TRUE,
+      n_permutations = n_permutations_vec[i],
+      prediction_zero = p0
+    )
+
+
+    exp_kern_list[[i]][[j]] <- explain(
+      model = model,
+      x_explain = x_explain,
+      x_train = x_train,
+      approach = "gaussian", seed = 123 + j, n_batches = 1,
+      shap_approach = "kernel",
+      n_combinations = exp_perm_list[[i]][[j]]$internal$parameters$n_combinations,
+      prediction_zero = p0
+    )
+
+    exp_kern_paired_list[[i]][[j]] <- explain(
+      model = model,
+      x_explain = x_explain,
+      x_train = x_train,
+      approach = "gaussian", seed = 123 + j, n_batches = 1,
+      shap_approach = "kernel", paired_shap_sampling = TRUE,
+      n_combinations = exp_perm_list[[i]][[j]]$internal$parameters$n_combinations,
+      prediction_zero = p0
+    )
+
+
+#    print(exp_perm_list[[i]]$timing)
+#    print(exp_kern_list[[i]]$timing)
+
+    mse_perm_mat[i, j] <- mean(unlist((exp_perm_list[[i]][[j]]$shapley_values[, ..x_var] - exp_full$shapley_values[, ..x_var])^2))
+    mse_perm_paired_mat[i, j] <- mean(unlist((exp_perm_paired_list[[i]][[j]]$shapley_values[, ..x_var] - exp_full$shapley_values[, ..x_var])^2))
+    mse_kern_mat[i, j] <- mean(unlist((exp_kern_list[[i]][[j]]$shapley_values[, ..x_var] - exp_full$shapley_values[, ..x_var])^2))
+    mse_kern_paired_mat[i, j] <- mean(unlist((exp_kern_paired_list[[i]][[j]]$shapley_values[, ..x_var] - exp_full$shapley_values[, ..x_var])^2))
+
+    print(rowMeans(mse_perm_mat, na.rm = TRUE))
+    print(rowMeans(mse_perm_paired_mat, na.rm = TRUE))
+    print(rowMeans(mse_kern_mat, na.rm = TRUE))
+    print(rowMeans(mse_kern_paired_mat, na.rm = TRUE))
+
+  }
+
+}
+
+print(rowMeans(mse_perm_mat, na.rm = TRUE))
+print(rowMeans(mse_perm_paired_mat, na.rm = TRUE))
+print(rowMeans(mse_kern_mat, na.rm = TRUE))
+print(rowMeans(mse_kern_paired_mat, na.rm = TRUE))
+#> print(rowMeans(mse_perm_mat,na.rm = TRUE))
+#[1] 3.83370 2.16886 1.03013 0.45260 0.21826
+#> print(rowMeans(mse_perm_paired_mat,na.rm = TRUE))
+#[1] 0.696851 0.327208 0.188237 0.094352 0.053114
+#> print(rowMeans(mse_kern_mat,na.rm = TRUE))
+#[1] 4.12236 2.08821 0.82681 0.36273 0.20536
+#> print(rowMeans(mse_kern_paired_mat,na.rm = TRUE))
+#[1] 0.622408 0.222473 0.088097 0.048416 0.024772
+
+# OK, so paired sampling improves accuracy significantly.
+# The permutation and kernel approaches seem to be about equally efficient.
+# A remaining question is whether the accuracy is the same across all features for the
+# kernel approach, or whether the permutation approach is better in this regard.
+
+
+explanation_perm <- explain(
+  model = model,
+  x_explain = x_explain,
+  x_train = x_train,
+  approach = "gaussian", seed = 123, n_batches = 1,
+  prediction_zero = p0, shap_approach = "permutation", n_permutations = 24
+)
+
+explanation_perm_paired <- explain(
+  model = model,
+  x_explain = x_explain,
+  x_train = x_train,
+  approach = "gaussian", seed = 123, n_batches = 1,
+  prediction_zero = p0, shap_approach = "permutation", paired_shap_sampling = TRUE, n_permutations = 24
+)
+
+
+explanation_kernel <- explain(
+  model = model,
+  x_explain = x_explain,
+  x_train = x_train,
+  approach = "gaussian", seed = 123, n_batches = 1,
+  n_combinations = explanation_perm$internal$parameters$n_combinations,
+  prediction_zero = p0
+)
+
+explanation_kernel_paired <- explain(
+  model = model,
+  x_explain = x_explain,
+  x_train = x_train,
+  approach = "gaussian", seed = 123, n_batches = 1, paired_shap_sampling = TRUE,
+  n_combinations = explanation_perm$internal$parameters$n_combinations,
+  prediction_zero = p0
+)
+
+
+explanation_perm$internal$objects$X
+explanation_perm_paired$internal$objects$X
+explanation_kernel$internal$objects$X
+explanation_kernel_paired$internal$objects$X
+
+
+explanation_perm$internal$output$dt_vS
+explanation_perm_paired$internal$output$dt_vS
+explanation_kernel$internal$output$dt_vS
+explanation_kernel_paired$internal$output$dt_vS
+
+explanation_perm$shapley_values
+explanation_perm_paired$shapley_values
+explanation_kernel$shapley_values
+explanation_kernel_paired$shapley_values
+
+
+explanation_perm$timing
+explanation_perm_paired$timing
+explanation_kernel$timing
+
+#### Some TODO notes:
+
+# Double check the results with a better simulation study to show improved convergence
+# using permutation with paired sampling vs the kernel method w/wo paired sampling (based on the same n_combinations).
+# If the tests show improved convergence, the setup and computation code needs to be made more efficient.
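+
+# Illustrative convergence plot for the study above (assumes the mse_*_mat objects exist):
+# average MSE against the number of permutations on a log-log scale, one line per scheme.
+mse_means <- cbind(
+  perm        = rowMeans(mse_perm_mat, na.rm = TRUE),
+  perm_paired = rowMeans(mse_perm_paired_mat, na.rm = TRUE),
+  kern        = rowMeans(mse_kern_mat, na.rm = TRUE),
+  kern_paired = rowMeans(mse_kern_paired_mat, na.rm = TRUE)
+)
+matplot(n_permutations_vec, mse_means, type = "b", log = "xy",
+        xlab = "n_permutations", ylab = "mean MSE vs full explanation",
+        pch = 1:4, lty = 1:4)
+legend("bottomleft", legend = colnames(mse_means), pch = 1:4, lty = 1:4)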
diff --git a/man/explain.Rd b/man/explain.Rd
index e7c9deb4d..6506bd804 100644
--- a/man/explain.Rd
+++ b/man/explain.Rd
@@ -9,8 +9,11 @@ explain(
   x_explain,
   x_train,
   approach,
+  shap_approach = "kernel",
+  paired_shap_sampling = FALSE,
   prediction_zero,
   n_combinations = NULL,
+  n_permutations = NULL,
   group = NULL,
   n_samples = 1000,
   n_batches = NULL,
diff --git a/man/explain_linear.Rd b/man/explain_linear.Rd
new file mode 100644
index 000000000..82936687c
--- /dev/null
+++ b/man/explain_linear.Rd
@@ -0,0 +1,116 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/explain_linear.R
+\name{explain_linear}
+\alias{explain_linear}
+\title{Explain the output of a linear model with Shapley values}
+\usage{
+explain_linear(
+  model,
+  x_explain,
+  x_train,
+  prediction_zero,
+  n_combinations = NULL,
+  n_permutations = NULL,
+  group = NULL,
+  n_samples = 1000,
+  n_batches = NULL,
+  seed = 1,
+  keep_samp_for_vS = FALSE,
+  predict_model = NULL,
+  get_model_specs = NULL,
+  MSEv_uniform_comb_weights = TRUE,
+  timing = TRUE,
+  ...
+)
+}
+\arguments{
+\item{model}{The model whose predictions we want to explain.
+Run \code{\link[=get_supported_models]{get_supported_models()}}
+for a table of which models \code{explain} supports natively. Unsupported models
+can still be explained by passing \code{predict_model} and (optionally) \code{get_model_specs},
+see details for more information.}
+
+\item{x_explain}{A matrix or data.frame/data.table.
+Contains the features whose predictions ought to be explained.}
+
+\item{x_train}{Matrix or data.frame/data.table.
+Contains the data used to estimate the (conditional) distributions for the features
+needed to properly estimate the conditional expectations in the Shapley formula.}
+
+\item{prediction_zero}{Numeric.
+The prediction value for unseen data, i.e. an estimate of the expected prediction without conditioning on any
+features.
+Typically we set this value equal to the mean of the response variable in our training data, but other choices
+such as the mean of the predictions in the training data are also reasonable.}
+
+\item{n_combinations}{Integer.
+If \code{group = NULL}, \code{n_combinations} represents the number of unique feature combinations to sample.
+If \code{group != NULL}, \code{n_combinations} represents the number of unique group combinations to sample.
+If \code{n_combinations = NULL}, the exact method is used and all combinations are considered.
+The maximum number of combinations equals \code{2^m}, where \code{m} is the number of features.}
+
+\item{n_permutations}{Integer.
+The number of unique feature permutations to sample when \code{shap_approach = "permutation"}.
+If \code{n_permutations = NULL}, the exact method is used and all permutations are considered.}
+
+\item{group}{List.
+If \code{NULL} regular feature wise Shapley values are computed.
+If provided, group wise Shapley values are computed. \code{group} then has length equal to
+the number of groups. The list element contains character vectors with the features included
+in each of the different groups.}
+
+\item{n_samples}{Positive integer.
+Indicating the maximum number of samples to use in the
+Monte Carlo integration for every conditional expectation. See also details.}
+
+\item{n_batches}{Positive integer (or NULL).
+Specifies how many batches the total number of feature combinations should be split into when calculating the
+contribution function for each test observation.
+The default value is NULL which uses a reasonable trade-off between RAM allocation and computation speed,
+which depends on \code{approach} and \code{n_combinations}.
+For models with many features, increasing the number of batches reduces the RAM allocation significantly.
+This typically comes with a small increase in computation time.}
+
+\item{seed}{Positive integer.
+Specifies the seed before any randomness based code is being run.
+If \code{NULL} the seed will be inherited from the calling environment.}
+
+\item{keep_samp_for_vS}{Logical.
+Indicates whether the samples used in the Monte Carlo estimation of v_S should be returned
+(in \code{internal$output}).}
+
+\item{predict_model}{Function.
+The prediction function used when \code{model} is not natively supported.
+(Run \code{\link[=get_supported_models]{get_supported_models()}} for a list of natively supported
+models.)
+The function must have two arguments, \code{model} and \code{newdata}, which specify, respectively, the model
+and a data.frame/data.table to compute predictions for. The function must give the prediction as a numeric vector.
+\code{NULL} (the default) uses functions specified internally.
+Can also be used to override the default function for natively supported model classes.}
+
+\item{get_model_specs}{Function.
+An optional function for checking model/data consistency when \code{model} is not natively supported.
+(Run \code{\link[=get_supported_models]{get_supported_models()}} for a list of natively supported
+models.)
+The function takes \code{model} as argument and provides a list with 3 elements:
+\describe{
+\item{labels}{Character vector with the names of each feature.}
+\item{classes}{Character vector with the classes of each feature.}
+\item{factor_levels}{Character vector with the levels for any categorical features.}
+}
+If \code{NULL} (the default) internal functions are used for natively supported model classes, and the checking is
+disabled for unsupported model classes.
+Can also be used to override the default function for natively supported model classes.}
+
+\item{MSEv_uniform_comb_weights}{Logical. If \code{TRUE} (default), then the function weights the combinations
+uniformly when computing the MSEv criterion. If \code{FALSE}, then the function uses the Shapley kernel weights to
+weight the combinations when computing the MSEv criterion. Note that the Shapley kernel weights are replaced by the
+sampling frequency when not all combinations are considered.}
+
+\item{timing}{Logical.
+Whether the timing of the different parts of \code{explain()} should be saved in the model object.}
+
+\item{...}{Further arguments passed to specific approaches}
+}
+\description{
+Explain the output of a linear model with Shapley values
+}
+\author{
+Martin Jullum
+}
diff --git a/man/feature_combinations.Rd b/man/feature_combinations.Rd
index f6b6c4220..e6635be33 100644
--- a/man/feature_combinations.Rd
+++ b/man/feature_combinations.Rd
@@ -9,7 +9,8 @@ feature_combinations(
   exact = TRUE,
   n_combinations = 200,
   weight_zero_m = 10^6,
-  group_num = NULL
+  group_num = NULL,
+  paired_shap_sampling = FALSE
 )
 }
 \arguments{
diff --git a/man/prepare_data.Rd b/man/prepare_data.Rd
index 23e57b18d..86875d018 100644
--- a/man/prepare_data.Rd
+++ b/man/prepare_data.Rd
@@ -1,7 +1,8 @@
 % Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/approach.R, R/approach_categorical.R,
 % R/approach_copula.R, R/approach_ctree.R, R/approach_empirical.R,
-% R/approach_gaussian.R, R/approach_independence.R, R/approach_timeseries.R
+% R/approach_gaussian.R, R/approach_independence.R, R/approach_timeseries.R,
+% R/linear_gaussian_funcs.R
 \name{prepare_data}
 \alias{prepare_data}
 \alias{prepare_data.categorical}
@@ -11,6 +12,7 @@
 \alias{prepare_data.gaussian}
 \alias{prepare_data.independence}
 \alias{prepare_data.timeseries}
+\alias{prepare_data.linear_gaussian}
 \title{Generate data used for predictions and Monte Carlo integration}
 \usage{
 prepare_data(internal, index_features = NULL, ...)
@@ -28,6 +30,8 @@ prepare_data(internal, index_features = NULL, ...)
 \method{prepare_data}{independence}(internal, index_features = NULL, ...)
 
 \method{prepare_data}{timeseries}(internal, index_features = NULL, ...)
+
+\method{prepare_data}{linear_gaussian}(internal, index_features = NULL, ...)
 }
 \arguments{
 \item{internal}{Not used.}
diff --git a/man/setup.Rd b/man/setup.Rd
index 45a4ef170..efbceec93 100644
--- a/man/setup.Rd
+++ b/man/setup.Rd
@@ -8,9 +8,12 @@ setup(
   x_train,
   x_explain,
   approach,
+  shap_approach,
+  paired_shap_sampling,
   prediction_zero,
   output_size = 1,
   n_combinations,
+  n_permutations,
   group,
   n_samples,
   n_batches,
diff --git a/man/setup_approach.Rd b/man/setup_approach.Rd
index a87fbd106..269d79a8f 100644
--- a/man/setup_approach.Rd
+++ b/man/setup_approach.Rd
@@ -1,7 +1,8 @@
 % Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/approach.R, R/approach_categorical.R,
 % R/approach_copula.R, R/approach_ctree.R, R/approach_empirical.R,
-% R/approach_gaussian.R, R/approach_independence.R, R/approach_timeseries.R
+% R/approach_gaussian.R, R/approach_independence.R, R/approach_timeseries.R,
+% R/linear_gaussian_funcs.R
 \name{setup_approach}
 \alias{setup_approach}
 \alias{setup_approach.categorical}
@@ -11,6 +12,7 @@
 \alias{setup_approach.gaussian}
 \alias{setup_approach.independence}
 \alias{setup_approach.timeseries}
+\alias{setup_linear_gaussian}
 \title{Set up the framework chosen approach}
 \usage{
 setup_approach(internal, ...)
@@ -57,6 +59,13 @@ setup_approach(internal, ...)
   timeseries.bounds = c(NULL, NULL),
   ...
 )
+
+setup_linear_gaussian(
+  internal,
+  gaussian.mu = NULL,
+  gaussian.cov_mat = NULL,
+  ...
+)
 }
 \arguments{
 \item{internal}{Not used.}
diff --git a/man/test_predict_linear_model.Rd b/man/test_predict_linear_model.Rd
new file mode 100644
index 000000000..7067426a0
--- /dev/null
+++ b/man/test_predict_linear_model.Rd
@@ -0,0 +1,31 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/get_predict_model.R
+\name{test_predict_linear_model}
+\alias{test_predict_linear_model}
+\title{Model testing function}
+\usage{
+test_predict_linear_model(
+  x_test,
+  predict_model,
+  model,
+  linear_model_coef,
+  internal
+)
+}
+\arguments{
+\item{x_test}{Matrix or data.frame/data.table.
+A few rows of data used to check that \code{predict_model} applied to \code{model} reproduces
+the predictions implied by \code{linear_model_coef}.}
+
+\item{predict_model}{Function.
+The prediction function used when \code{model} is not natively supported.
+See the documentation of \code{\link[=explain]{explain()}} for details.}
+
+\item{model}{Objects.
+The model object that ought to be explained.
+See the documentation of \code{\link[=explain]{explain()}} for details.}
+
+\item{linear_model_coef}{Numeric vector.
+The intercept and coefficients of the linear model, as extracted from \code{model}.}
+
+\item{internal}{List.
+Holds all parameters, data, functions and computed objects used within \code{\link[=explain]{explain()}}.
+The list contains one or more of the elements \code{parameters}, \code{data}, \code{objects}, \code{output}.}
+}
+\description{
+Model testing function
+}
+\keyword{internal}