Need to clean code. Push in case something happens with my computer.

NorskRegnesentral · Jan 5, 2024 · 11819e8 · 11819e8
1 parent df9777d
commit 11819e8
Show file tree

Hide file tree

Showing 2 changed files with 301 additions and 7 deletions.
diff --git a/R/approach_copula.R b/R/approach_copula.R
@@ -49,20 +49,93 @@ setup_approach.copula <- function(internal, ...) {
   return(internal)
 }
 
+
+
 #' @inheritParams default_doc
 #' @rdname prepare_data
 #' @export
-prepare_data.copula <- function(internal, index_features = NULL, ...) {
+#' @author Lars Henry Berge Olsen
+prepare_data.copula2 <- function(internal, index_features, ...) {
+
+  # Extract used variables
+  X <- internal$objects$X
   x_train <- internal$data$x_train
+  x_train_mat <- as.matrix(internal$data$x_train)
   x_explain <- internal$data$x_explain
+  x_explain_mat <- as.matrix(internal$data$x_explain)
   n_explain <- internal$parameters$n_explain
-  copula.cov_mat <- internal$parameters$copula.cov_mat
   n_samples <- internal$parameters$n_samples
-  copula.mu <- internal$parameters$copula.mu
+  n_samples = 1000000
   n_features <- internal$parameters$n_features
+  n_combinations <- internal$parameters$n_combinations
+  n_combinations_now <- length(index_features)
+  copula.mu <- internal$parameters$copula.mu
+  copula.cov_mat <- internal$parameters$copula.cov_mat
+  copula.x_explain_gaussian_mat <- as.matrix(internal$data$copula.x_explain_gaussian) # CAN SKIP as.matrix as it is a matrix allready
+  feature_names <- internal$parameters$feature_names
 
-  copula.x_explain_gaussian <- internal$data$copula.x_explain_gaussian
+
+
+
+
+
+
+
+
+  S <- internal$objects$S[index_features, , drop = FALSE]
+
+
+  # Generate the MC samples from N(0, 1)
+  MC_samples_mat <- matrix(rnorm(n_samples * n_features), nrow = n_samples, ncol = n_features)
+
+  # Use Cpp to convert the MC samples to N(mu_{Sbar|S}, Sigma_{Sbar|S}) for all coalitions and explicands.
+  # The object `dt` is here a 3D array of dimension (n_samples, n_explain*n_coalitions, n_features).
+  # INCLUDE THE TRANSFORMATION
+
+
+  dt <- prepare_data_copula_cpp(MC_samples_mat = MC_samples_mat,
+                                  x_explain_mat = x_explain_mat,
+                                  x_explain_gaussian_mat = copula.x_explain_gaussian_mat,
+                                  x_train_mat = x_train_mat,
+                                  S = S,
+                                  mu = copula.mu,
+                                  cov_mat = copula.cov_mat)
+
+  # Reshape `dt` to a 2D array of dimension (n_samples*n_explain*n_coalitions, n_features).
+  dim(dt) = c(n_combinations_now*n_explain*n_samples, n_features)
+
+  # Convert to a data.table
+  dt = as.data.table(dt)
+  setnames(dt, feature_names)
+  dt[, "id_combination" := rep(seq(nrow(S)), each = n_samples * n_explain)]
+  dt[, "id" := rep(seq(n_explain), each = n_samples, times = nrow(S))]
+  dt[, "w" := 1 / n_samples]
+  data.table::setcolorder(dt, c("id_combination", "id", feature_names))
+  dt[, id_combination := index_features[id_combination]]
+  dt
+
+  dt[id_combination == 9 & id == 1,]
+
+  dt_agr = dt[, lapply(.SD, mean), by = list(id, id_combination)]
+  data.table::setorderv(dt_agr, c("id", "id_combination"))
+  dt_agr
+
+  return(dt)
+}
+
+#' @inheritParams default_doc
+#' @rdname prepare_data
+#' @export
+prepare_data.copula <- function(internal, index_features = NULL, ...) {
   X <- internal$objects$X
+  x_train <- internal$data$x_train
+  x_explain <- internal$data$x_explain
+  n_explain <- internal$parameters$n_explain
+  n_samples <- internal$parameters$n_samples
+  n_features <- internal$parameters$n_features
+  copula.mu <- internal$parameters$copula.mu
+  copula.cov_mat <- internal$parameters$copula.cov_mat
+  copula.x_explain_gaussian <- internal$data$copula.x_explain_gaussian
 
 
   x_explain0 <- as.matrix(x_explain)
@@ -74,7 +147,9 @@ prepare_data.copula <- function(internal, index_features = NULL, ...) {
   }
 
 
+  n_samples = 1000000
   for (i in seq_len(n_explain)) {
+    print(i)
     l <- lapply(
       X = features,
       FUN = sample_copula,
@@ -86,14 +161,25 @@ prepare_data.copula <- function(internal, index_features = NULL, ...) {
       x_train = as.matrix(x_train),
       x_explain_gaussian = copula.x_explain_gaussian[i, , drop = FALSE]
     )
+    # x_explain_gaussian2 = x_explain_gaussian
+    # x_train2 = x_train
+    # x_explain2 = x_explain
 
     dt_l[[i]] <- data.table::rbindlist(l, idcol = "id_combination")
     dt_l[[i]][, w := 1 / n_samples]
     dt_l[[i]][, id := i]
     if (!is.null(index_features)) dt_l[[i]][, id_combination := index_features[id_combination]]
   }
   dt <- data.table::rbindlist(dt_l, use.names = TRUE, fill = TRUE)
+  dt
+  dt9 = dt
+  dt9_agr = dt9[, lapply(.SD, mean), by = list(id, id_combination)]
+  dt9_agr - dt_agr
   return(dt)
+
+
+
+
 }
 
 #' Sample conditional variables using the Gaussian copula approach
@@ -119,6 +205,7 @@ sample_copula <- function(index_given, n_samples, mu, cov_mat, m, x_explain_gaus
   } else {
     dependent_ind <- (seq_len(length(mu)))[-index_given]
 
+    # Dette har jeg kode til
     tmp <- condMVNorm::condMVN(
       mean = mu,
       sigma = cov_mat,
@@ -127,15 +214,24 @@ sample_copula <- function(index_given, n_samples, mu, cov_mat, m, x_explain_gaus
       X.given = x_explain_gaussian[index_given]
     )
 
+    # Dette har jeg kode til. Bruker cholensky + simulert data fra før
     ret0_z <- mvnfast::rmvn(n = n_samples, mu = tmp$condMean, sigma = tmp$condVar)
 
+    # Dette må jeg skrive selv
     ret0_x <- apply(
       X = rbind(ret0_z, x_train[, dependent_ind, drop = FALSE]),
       MARGIN = 2,
       FUN = inv_gaussian_transform,
-      n_z = n_samples
+      n_z = n_samples,
+      type = 5
     )
+    ret0_x_cpp = inv_gaussian_transform_cpp(z = ret0_z, x = x_train[, dependent_ind, drop = FALSE])
+
+    ret0_x_cpp = inv_gaussian_transform_cpp_armamat(rbind(ret0_z, x_train[, dependent_ind, drop = FALSE]), n_samples = n_samples)
+    colnames(ret0_x_cpp) = feature_names[dependent_ind]
+    all.equal(ret0_x, ret0_x_cpp)
 
+    # Dette har jeg kode til
     ret <- matrix(NA, ncol = m, nrow = n_samples)
     ret[, index_given] <- rep(x_explain[index_given], each = n_samples)
     ret[, dependent_ind] <- ret0_x
@@ -156,13 +252,13 @@ sample_copula <- function(index_given, n_samples, mu, cov_mat, m, x_explain_gaus
 #' @keywords internal
 #'
 #' @author Martin Jullum
-inv_gaussian_transform <- function(zx, n_z) {
+inv_gaussian_transform <- function(zx, n_z, type) {
   if (n_z >= length(zx)) stop("n_z should be less than length(zx)")
   ind <- 1:n_z
   z <- zx[ind]
   x <- zx[-ind]
   u <- stats::pnorm(z)
-  x_new <- stats::quantile(x, probs = u)
+  x_new <- stats::quantile(x, probs = u, type = type)
   return(as.double(x_new))
 }
 

diff --git a/src/Copula.cpp b/src/Copula.cpp
@@ -0,0 +1,198 @@
+#include <RcppArmadillo.h>
+#include <iostream>
+
+// [[Rcpp::depends(RcppArmadillo)]]
+
+//' Transforms new data to a standardized normal distribution
+ //'
+ //' @param zx Numeric vector. The first `n_samples` items are the Gaussian data, and the last part is
+ //' the data with the original transformation.
+ //' @param n_samples Positive integer. Number of elements of `zx` that belongs to new data.
+ //'
+ //' @return Numeric matrix of length `n_samples`
+ //'
+ //' @keywords internal
+ //'
+ //' @author Lars Henry Berge Olsen
+ // [[Rcpp::export]]
+ Rcpp::NumericVector inv_gaussian_transform_cpp_Rcpp(const Rcpp::NumericVector& zx, const int n_samples) {
+
+   // Extract z and x
+   Rcpp::NumericVector z = zx[Rcpp::Range(0, n_samples - 1)];
+   Rcpp::NumericVector x = zx[Rcpp::Range(n_samples, zx.size() - 1)];
+
+   // Calculate u
+   Rcpp::NumericVector u = Rcpp::pnorm(z);
+
+   // Calculate x_new using Armadillo's quantile function
+   arma::vec x_arma = Rcpp::as<arma::vec>(x);
+   arma::vec u_arma = Rcpp::as<arma::vec>(u);
+
+   arma::vec x_new_arma = arma::quantile(x_arma, u_arma);
+
+   // Convert back to Rcpp::NumericMatrix
+   Rcpp::NumericVector x_new = Rcpp::wrap(x_new_arma);
+
+   return x_new;
+ }
+
+
+// [[Rcpp::export]]
+arma::mat inv_gaussian_transform_cpp_mat(Rcpp::NumericMatrix zx, const int n_samples) {
+
+  int n_features = zx.ncol();
+
+  // Extract z and x
+  Rcpp::NumericMatrix z = zx(Rcpp::Range(0, n_samples - 1), Rcpp::_ );
+  Rcpp::NumericMatrix x = zx(Rcpp::Range(n_samples, zx.nrow() - 1), Rcpp::_ );
+
+  // Rcpp::NumericMatrix u = Rcpp::pnorm(z);
+
+  // Convert Rcpp::NumericMatrix to arma::mat
+  arma::mat z_arma = Rcpp::as<arma::mat>(z);
+  arma::mat x_arma = Rcpp::as<arma::mat>(x);
+
+  // Calculate u
+  arma::mat u_arma = arma::normcdf(z_arma);
+
+  // Calculate x_new using Armadillo's quantile function
+  arma::mat x_new_arma(n_samples, n_features);
+  for (int feature_idx = 0; feature_idx < n_features; feature_idx++) {
+    x_new_arma.col(feature_idx) = arma::quantile(x_arma.col(feature_idx), u_arma.col(feature_idx));
+  }
+
+  // // Convert back to Rcpp::NumericVector
+  // Rcpp::NumericMatrix x_new = Rcpp::wrap(x_new_arma);
+
+  return x_new_arma;
+}
+
+// [[Rcpp::export]]
+arma::mat inv_gaussian_transform_cpp_armamat(arma::mat zx, const int n_samples) {
+
+  int n_features = zx.n_cols;
+
+// WHAT IS THE POINT TO FIRST ADD THEM TOGETHER AND THEN SPLIT THEM?
+  // Extract z and x
+  arma::mat z = zx.rows(0, n_samples - 1);
+  arma::mat x = zx.rows(n_samples, zx.n_rows - 1);
+
+  // Calculate u
+  arma::mat u = arma::normcdf(z);
+
+  // Calculate x_new using Armadillo's quantile function
+  arma::mat x_new(n_samples, n_features);
+  for (int feature_idx = 0; feature_idx < n_features; feature_idx++) {
+    x_new.col(feature_idx) = arma::quantile(x.col(feature_idx), u.col(feature_idx));
+  }
+
+  // // Convert back to Rcpp::NumericVector
+  // Rcpp::NumericMatrix x_new = Rcpp::wrap(x_new_arma);
+
+  return x_new;
+}
+
+
+
+//' Transforms new data to a standardized normal distribution
+//'
+//' @details The function uses `arma::quantile(...)` which corresponds to R's `stats::quantile(..., type = 5)`.
+//'
+//' @param z arma::mat. The data are the Gaussian Monte Carlos samples to transform.
+//' @param x arma::mat. The data with the original transformation. Used to conduct the transformation of `z`.
+//'
+//' @return arma::mat of same dimension as `z`
+//'
+//' @keywords internal
+//' @author Lars Henry Berge Olsen
+// [[Rcpp::export]]
+arma::mat inv_gaussian_transform_cpp(arma::mat z, arma::mat x) {
+  int n_features = z.n_cols;
+  int n_samples = z.n_rows;
+  arma::mat u = arma::normcdf(z);
+  arma::mat z_new(n_samples, n_features);
+  for (int feature_idx = 0; feature_idx < n_features; feature_idx++) {
+    z_new.col(feature_idx) = arma::quantile(x.col(feature_idx), u.col(feature_idx));
+  }
+  return z_new;
+}
+
+
+
+// [[Rcpp::export]]
+arma::cube prepare_data_copula_cpp(arma::mat MC_samples_mat,
+                                   arma::mat x_explain_mat,
+                                   arma::mat x_explain_gaussian_mat,
+                                   arma::mat x_train_mat,
+                                   arma::mat S,
+                                   arma::vec mu,
+                                   arma::mat cov_mat) {
+
+  int n_explain = x_explain_mat.n_rows;
+  int n_samples = MC_samples_mat.n_rows;
+  int n_features = MC_samples_mat.n_cols;
+  int n_coalitions = S.n_rows;
+
+  // Initialize auxiliary matrix and result cube
+  arma::mat aux_mat(n_samples, n_features);
+  arma::cube result_cube(n_samples, n_explain*n_coalitions, n_features);
+
+  // Iterate over the coalitions
+  for (int S_ind = 0; S_ind < n_coalitions; S_ind++) {
+
+    // Get current coalition S and the indices of the features in coalition S and mask Sbar
+    arma::mat S_now = S.row(S_ind);
+    arma::uvec S_now_idx = arma::find(S_now > 0.5);
+    arma::uvec Sbar_now_idx = arma::find(S_now < 0.5);
+
+    // Extract the features we condition on
+    arma::mat x_S_star = x_explain_mat.cols(S_now_idx);
+    arma::mat x_S_star_gaussian = x_explain_gaussian_mat.cols(S_now_idx);
+
+    // // Does that we do not conditioning on
+    // arma::mat x_Sbar_star = x_train_mat.cols(Sbar_now_idx);
+
+    // Extract the mean values for the features in the two sets
+    arma::vec mu_S = mu.elem(S_now_idx);
+    arma::vec mu_Sbar = mu.elem(Sbar_now_idx);
+
+    std::cout << mu_S << std::endl;
+
+    // Extract the relevant parts of the covariance matrix
+    arma::mat cov_mat_SS = cov_mat.submat(S_now_idx, S_now_idx);
+    arma::mat cov_mat_SSbar = cov_mat.submat(S_now_idx, Sbar_now_idx);
+    arma::mat cov_mat_SbarS = cov_mat.submat(Sbar_now_idx, S_now_idx);
+    arma::mat cov_mat_SbarSbar = cov_mat.submat(Sbar_now_idx, Sbar_now_idx);
+
+    // Compute the covariance matrix multiplication factors/terms and the conditional covariance matrix
+    arma::mat cov_mat_SbarS_cov_mat_SS_inv = cov_mat_SbarS * inv(cov_mat_SS);
+    arma::mat cond_cov_mat_Sbar_given_S = cov_mat_SbarSbar - cov_mat_SbarS_cov_mat_SS_inv * cov_mat_SSbar;
+
+    // Ensure that the conditional covariance matrix is symmetric
+    if (!cond_cov_mat_Sbar_given_S.is_symmetric()) {
+      cond_cov_mat_Sbar_given_S = arma::symmatl(cond_cov_mat_Sbar_given_S);
+    }
+
+    // Compute the conditional mean of Xsbar given Xs = Xs_star
+    arma::mat x_Sbar_mean = cov_mat_SbarS_cov_mat_SS_inv * (x_S_star_gaussian.each_row() - mu_S.t()).t();
+    x_Sbar_mean.each_col() += mu_Sbar;
+
+    // Transform the samples to be from N(O, Sigma_Sbar|S)
+    arma::mat MC_samples_mat_now = MC_samples_mat.cols(Sbar_now_idx) * arma::chol(cond_cov_mat_Sbar_given_S);
+
+    // Loop over the different test observations and combine the generated values with the values we conditioned on
+    for (int idx_now = 0; idx_now < n_explain; idx_now++) {
+
+      arma::mat MC_samples_mat_now_now = MC_samples_mat_now + repmat(trans(x_Sbar_mean.col(idx_now)), n_samples, 1);
+      arma::mat MC_samples_mat_now_now_trans =
+        inv_gaussian_transform_cpp(MC_samples_mat_now_now, x_train_mat.cols(Sbar_now_idx));
+
+      aux_mat.cols(S_now_idx) = repmat(x_S_star.row(idx_now), n_samples, 1);
+      aux_mat.cols(Sbar_now_idx) = MC_samples_mat_now_now_trans;
+
+      result_cube.col(S_ind*n_explain + idx_now) = aux_mat;
+    }
+  }
+
+  return result_cube;
+}