diff --git a/R/approach_copula.R b/R/approach_copula.R index 403d88809..32087a010 100644 --- a/R/approach_copula.R +++ b/R/approach_copula.R @@ -49,20 +49,93 @@ setup_approach.copula <- function(internal, ...) { return(internal) } + + #' @inheritParams default_doc #' @rdname prepare_data #' @export -prepare_data.copula <- function(internal, index_features = NULL, ...) { +#' @author Lars Henry Berge Olsen +prepare_data.copula2 <- function(internal, index_features, ...) { + + # Extract used variables + X <- internal$objects$X x_train <- internal$data$x_train + x_train_mat <- as.matrix(internal$data$x_train) x_explain <- internal$data$x_explain + x_explain_mat <- as.matrix(internal$data$x_explain) n_explain <- internal$parameters$n_explain - copula.cov_mat <- internal$parameters$copula.cov_mat n_samples <- internal$parameters$n_samples - copula.mu <- internal$parameters$copula.mu + n_samples = 1000000 n_features <- internal$parameters$n_features + n_combinations <- internal$parameters$n_combinations + n_combinations_now <- length(index_features) + copula.mu <- internal$parameters$copula.mu + copula.cov_mat <- internal$parameters$copula.cov_mat + copula.x_explain_gaussian_mat <- as.matrix(internal$data$copula.x_explain_gaussian) # CAN SKIP as.matrix as it is a matrix allready + feature_names <- internal$parameters$feature_names - copula.x_explain_gaussian <- internal$data$copula.x_explain_gaussian + + + + + + + + + S <- internal$objects$S[index_features, , drop = FALSE] + + + # Generate the MC samples from N(0, 1) + MC_samples_mat <- matrix(rnorm(n_samples * n_features), nrow = n_samples, ncol = n_features) + + # Use Cpp to convert the MC samples to N(mu_{Sbar|S}, Sigma_{Sbar|S}) for all coalitions and explicands. + # The object `dt` is here a 3D array of dimension (n_samples, n_explain*n_coalitions, n_features). + # INCLUDE THE TRANSFORMATION + + + dt <- prepare_data_copula_cpp(MC_samples_mat = MC_samples_mat, + x_explain_mat = x_explain_mat, + x_explain_gaussian_mat = copula.x_explain_gaussian_mat, + x_train_mat = x_train_mat, + S = S, + mu = copula.mu, + cov_mat = copula.cov_mat) + + # Reshape `dt` to a 2D array of dimension (n_samples*n_explain*n_coalitions, n_features). + dim(dt) = c(n_combinations_now*n_explain*n_samples, n_features) + + # Convert to a data.table + dt = as.data.table(dt) + setnames(dt, feature_names) + dt[, "id_combination" := rep(seq(nrow(S)), each = n_samples * n_explain)] + dt[, "id" := rep(seq(n_explain), each = n_samples, times = nrow(S))] + dt[, "w" := 1 / n_samples] + data.table::setcolorder(dt, c("id_combination", "id", feature_names)) + dt[, id_combination := index_features[id_combination]] + dt + + dt[id_combination == 9 & id == 1,] + + dt_agr = dt[, lapply(.SD, mean), by = list(id, id_combination)] + data.table::setorderv(dt_agr, c("id", "id_combination")) + dt_agr + + return(dt) +} + +#' @inheritParams default_doc +#' @rdname prepare_data +#' @export +prepare_data.copula <- function(internal, index_features = NULL, ...) { X <- internal$objects$X + x_train <- internal$data$x_train + x_explain <- internal$data$x_explain + n_explain <- internal$parameters$n_explain + n_samples <- internal$parameters$n_samples + n_features <- internal$parameters$n_features + copula.mu <- internal$parameters$copula.mu + copula.cov_mat <- internal$parameters$copula.cov_mat + copula.x_explain_gaussian <- internal$data$copula.x_explain_gaussian x_explain0 <- as.matrix(x_explain) @@ -74,7 +147,9 @@ prepare_data.copula <- function(internal, index_features = NULL, ...) { } + n_samples = 1000000 for (i in seq_len(n_explain)) { + print(i) l <- lapply( X = features, FUN = sample_copula, @@ -86,6 +161,9 @@ prepare_data.copula <- function(internal, index_features = NULL, ...) { x_train = as.matrix(x_train), x_explain_gaussian = copula.x_explain_gaussian[i, , drop = FALSE] ) + # x_explain_gaussian2 = x_explain_gaussian + # x_train2 = x_train + # x_explain2 = x_explain dt_l[[i]] <- data.table::rbindlist(l, idcol = "id_combination") dt_l[[i]][, w := 1 / n_samples] @@ -93,7 +171,15 @@ prepare_data.copula <- function(internal, index_features = NULL, ...) { if (!is.null(index_features)) dt_l[[i]][, id_combination := index_features[id_combination]] } dt <- data.table::rbindlist(dt_l, use.names = TRUE, fill = TRUE) + dt + dt9 = dt + dt9_agr = dt9[, lapply(.SD, mean), by = list(id, id_combination)] + dt9_agr - dt_agr return(dt) + + + + } #' Sample conditional variables using the Gaussian copula approach @@ -119,6 +205,7 @@ sample_copula <- function(index_given, n_samples, mu, cov_mat, m, x_explain_gaus } else { dependent_ind <- (seq_len(length(mu)))[-index_given] + # Dette har jeg kode til tmp <- condMVNorm::condMVN( mean = mu, sigma = cov_mat, @@ -127,15 +214,24 @@ sample_copula <- function(index_given, n_samples, mu, cov_mat, m, x_explain_gaus X.given = x_explain_gaussian[index_given] ) + # Dette har jeg kode til. Bruker cholensky + simulert data fra før ret0_z <- mvnfast::rmvn(n = n_samples, mu = tmp$condMean, sigma = tmp$condVar) + # Dette må jeg skrive selv ret0_x <- apply( X = rbind(ret0_z, x_train[, dependent_ind, drop = FALSE]), MARGIN = 2, FUN = inv_gaussian_transform, - n_z = n_samples + n_z = n_samples, + type = 5 ) + ret0_x_cpp = inv_gaussian_transform_cpp(z = ret0_z, x = x_train[, dependent_ind, drop = FALSE]) + + ret0_x_cpp = inv_gaussian_transform_cpp_armamat(rbind(ret0_z, x_train[, dependent_ind, drop = FALSE]), n_samples = n_samples) + colnames(ret0_x_cpp) = feature_names[dependent_ind] + all.equal(ret0_x, ret0_x_cpp) + # Dette har jeg kode til ret <- matrix(NA, ncol = m, nrow = n_samples) ret[, index_given] <- rep(x_explain[index_given], each = n_samples) ret[, dependent_ind] <- ret0_x @@ -156,13 +252,13 @@ sample_copula <- function(index_given, n_samples, mu, cov_mat, m, x_explain_gaus #' @keywords internal #' #' @author Martin Jullum -inv_gaussian_transform <- function(zx, n_z) { +inv_gaussian_transform <- function(zx, n_z, type) { if (n_z >= length(zx)) stop("n_z should be less than length(zx)") ind <- 1:n_z z <- zx[ind] x <- zx[-ind] u <- stats::pnorm(z) - x_new <- stats::quantile(x, probs = u) + x_new <- stats::quantile(x, probs = u, type = type) return(as.double(x_new)) } diff --git a/src/Copula.cpp b/src/Copula.cpp new file mode 100644 index 000000000..5e7997922 --- /dev/null +++ b/src/Copula.cpp @@ -0,0 +1,198 @@ +#include +#include + +// [[Rcpp::depends(RcppArmadillo)]] + +//' Transforms new data to a standardized normal distribution + //' + //' @param zx Numeric vector. The first `n_samples` items are the Gaussian data, and the last part is + //' the data with the original transformation. + //' @param n_samples Positive integer. Number of elements of `zx` that belongs to new data. + //' + //' @return Numeric matrix of length `n_samples` + //' + //' @keywords internal + //' + //' @author Lars Henry Berge Olsen + // [[Rcpp::export]] + Rcpp::NumericVector inv_gaussian_transform_cpp_Rcpp(const Rcpp::NumericVector& zx, const int n_samples) { + + // Extract z and x + Rcpp::NumericVector z = zx[Rcpp::Range(0, n_samples - 1)]; + Rcpp::NumericVector x = zx[Rcpp::Range(n_samples, zx.size() - 1)]; + + // Calculate u + Rcpp::NumericVector u = Rcpp::pnorm(z); + + // Calculate x_new using Armadillo's quantile function + arma::vec x_arma = Rcpp::as(x); + arma::vec u_arma = Rcpp::as(u); + + arma::vec x_new_arma = arma::quantile(x_arma, u_arma); + + // Convert back to Rcpp::NumericMatrix + Rcpp::NumericVector x_new = Rcpp::wrap(x_new_arma); + + return x_new; + } + + +// [[Rcpp::export]] +arma::mat inv_gaussian_transform_cpp_mat(Rcpp::NumericMatrix zx, const int n_samples) { + + int n_features = zx.ncol(); + + // Extract z and x + Rcpp::NumericMatrix z = zx(Rcpp::Range(0, n_samples - 1), Rcpp::_ ); + Rcpp::NumericMatrix x = zx(Rcpp::Range(n_samples, zx.nrow() - 1), Rcpp::_ ); + + // Rcpp::NumericMatrix u = Rcpp::pnorm(z); + + // Convert Rcpp::NumericMatrix to arma::mat + arma::mat z_arma = Rcpp::as(z); + arma::mat x_arma = Rcpp::as(x); + + // Calculate u + arma::mat u_arma = arma::normcdf(z_arma); + + // Calculate x_new using Armadillo's quantile function + arma::mat x_new_arma(n_samples, n_features); + for (int feature_idx = 0; feature_idx < n_features; feature_idx++) { + x_new_arma.col(feature_idx) = arma::quantile(x_arma.col(feature_idx), u_arma.col(feature_idx)); + } + + // // Convert back to Rcpp::NumericVector + // Rcpp::NumericMatrix x_new = Rcpp::wrap(x_new_arma); + + return x_new_arma; +} + +// [[Rcpp::export]] +arma::mat inv_gaussian_transform_cpp_armamat(arma::mat zx, const int n_samples) { + + int n_features = zx.n_cols; + +// WHAT IS THE POINT TO FIRST ADD THEM TOGETHER AND THEN SPLIT THEM? + // Extract z and x + arma::mat z = zx.rows(0, n_samples - 1); + arma::mat x = zx.rows(n_samples, zx.n_rows - 1); + + // Calculate u + arma::mat u = arma::normcdf(z); + + // Calculate x_new using Armadillo's quantile function + arma::mat x_new(n_samples, n_features); + for (int feature_idx = 0; feature_idx < n_features; feature_idx++) { + x_new.col(feature_idx) = arma::quantile(x.col(feature_idx), u.col(feature_idx)); + } + + // // Convert back to Rcpp::NumericVector + // Rcpp::NumericMatrix x_new = Rcpp::wrap(x_new_arma); + + return x_new; +} + + + +//' Transforms new data to a standardized normal distribution +//' +//' @details The function uses `arma::quantile(...)` which corresponds to R's `stats::quantile(..., type = 5)`. +//' +//' @param z arma::mat. The data are the Gaussian Monte Carlos samples to transform. +//' @param x arma::mat. The data with the original transformation. Used to conduct the transformation of `z`. +//' +//' @return arma::mat of same dimension as `z` +//' +//' @keywords internal +//' @author Lars Henry Berge Olsen +// [[Rcpp::export]] +arma::mat inv_gaussian_transform_cpp(arma::mat z, arma::mat x) { + int n_features = z.n_cols; + int n_samples = z.n_rows; + arma::mat u = arma::normcdf(z); + arma::mat z_new(n_samples, n_features); + for (int feature_idx = 0; feature_idx < n_features; feature_idx++) { + z_new.col(feature_idx) = arma::quantile(x.col(feature_idx), u.col(feature_idx)); + } + return z_new; +} + + + +// [[Rcpp::export]] +arma::cube prepare_data_copula_cpp(arma::mat MC_samples_mat, + arma::mat x_explain_mat, + arma::mat x_explain_gaussian_mat, + arma::mat x_train_mat, + arma::mat S, + arma::vec mu, + arma::mat cov_mat) { + + int n_explain = x_explain_mat.n_rows; + int n_samples = MC_samples_mat.n_rows; + int n_features = MC_samples_mat.n_cols; + int n_coalitions = S.n_rows; + + // Initialize auxiliary matrix and result cube + arma::mat aux_mat(n_samples, n_features); + arma::cube result_cube(n_samples, n_explain*n_coalitions, n_features); + + // Iterate over the coalitions + for (int S_ind = 0; S_ind < n_coalitions; S_ind++) { + + // Get current coalition S and the indices of the features in coalition S and mask Sbar + arma::mat S_now = S.row(S_ind); + arma::uvec S_now_idx = arma::find(S_now > 0.5); + arma::uvec Sbar_now_idx = arma::find(S_now < 0.5); + + // Extract the features we condition on + arma::mat x_S_star = x_explain_mat.cols(S_now_idx); + arma::mat x_S_star_gaussian = x_explain_gaussian_mat.cols(S_now_idx); + + // // Does that we do not conditioning on + // arma::mat x_Sbar_star = x_train_mat.cols(Sbar_now_idx); + + // Extract the mean values for the features in the two sets + arma::vec mu_S = mu.elem(S_now_idx); + arma::vec mu_Sbar = mu.elem(Sbar_now_idx); + + std::cout << mu_S << std::endl; + + // Extract the relevant parts of the covariance matrix + arma::mat cov_mat_SS = cov_mat.submat(S_now_idx, S_now_idx); + arma::mat cov_mat_SSbar = cov_mat.submat(S_now_idx, Sbar_now_idx); + arma::mat cov_mat_SbarS = cov_mat.submat(Sbar_now_idx, S_now_idx); + arma::mat cov_mat_SbarSbar = cov_mat.submat(Sbar_now_idx, Sbar_now_idx); + + // Compute the covariance matrix multiplication factors/terms and the conditional covariance matrix + arma::mat cov_mat_SbarS_cov_mat_SS_inv = cov_mat_SbarS * inv(cov_mat_SS); + arma::mat cond_cov_mat_Sbar_given_S = cov_mat_SbarSbar - cov_mat_SbarS_cov_mat_SS_inv * cov_mat_SSbar; + + // Ensure that the conditional covariance matrix is symmetric + if (!cond_cov_mat_Sbar_given_S.is_symmetric()) { + cond_cov_mat_Sbar_given_S = arma::symmatl(cond_cov_mat_Sbar_given_S); + } + + // Compute the conditional mean of Xsbar given Xs = Xs_star + arma::mat x_Sbar_mean = cov_mat_SbarS_cov_mat_SS_inv * (x_S_star_gaussian.each_row() - mu_S.t()).t(); + x_Sbar_mean.each_col() += mu_Sbar; + + // Transform the samples to be from N(O, Sigma_Sbar|S) + arma::mat MC_samples_mat_now = MC_samples_mat.cols(Sbar_now_idx) * arma::chol(cond_cov_mat_Sbar_given_S); + + // Loop over the different test observations and combine the generated values with the values we conditioned on + for (int idx_now = 0; idx_now < n_explain; idx_now++) { + + arma::mat MC_samples_mat_now_now = MC_samples_mat_now + repmat(trans(x_Sbar_mean.col(idx_now)), n_samples, 1); + arma::mat MC_samples_mat_now_now_trans = + inv_gaussian_transform_cpp(MC_samples_mat_now_now, x_train_mat.cols(Sbar_now_idx)); + + aux_mat.cols(S_now_idx) = repmat(x_S_star.row(idx_now), n_samples, 1); + aux_mat.cols(Sbar_now_idx) = MC_samples_mat_now_now_trans; + + result_cube.col(S_ind*n_explain + idx_now) = aux_mat; + } + } + + return result_cube; +}