diff --git a/src/gemma.cpp b/src/gemma.cpp
index 2525075..da13dd8 100644
--- a/src/gemma.cpp
+++ b/src/gemma.cpp
@@ -337,6 +337,17 @@ void GEMMA::PrintHelp(size_t option) {
     cout << "                  variable for individual 2" << endl;
     cout << "                  ..." << endl;
     cout << "          missing value: NA" << endl;
+    cout << " -resid   [filename]     "
+         << " residual variance file contains a column of positive values to be used "
+         << "directly as for the residual variance---each value corresponds to an "
+         << "individual, in which each value is the empirical residual variance (sigmasq) based"
+         << "on a trait value calculated from multiple replicates for this individual" 
+         << "(similar in format to phenotype file)"
+         << endl;
+    cout << "          format: variable for individual 1" << endl;
+    cout << "                  variable for individual 2" << endl;
+    cout << "                  ..." << endl;
+    cout << "          missing value: NA" << endl;
     cout << " -k        [filename]     "
          << " specify input kinship/relatedness matrix file name" << endl;
     cout << " -mk       [filename]     "
@@ -824,6 +835,14 @@ void GEMMA::Assign(int argc, char **argv, PARAM &cPar) {
       str.clear();
       str.assign(argv[i]);
       cPar.file_weight = str;
+    } else if (strcmp(argv[i], "-resid") == 0) {
+      if (argv[i + 1] == NULL || argv[i + 1][0] == '-') {
+        continue;
+      }
+      ++i;
+      str.clear();
+      str.assign(argv[i]);
+      cPar.file_resid = str;
     } else if (strcmp(argv[i], "-wsnp") == 0) {
       if (argv[i + 1] == NULL || argv[i + 1][0] == '-') {
         continue;
@@ -2611,6 +2630,8 @@ void GEMMA::BatchRun(PARAM &cPar) {
           }
         }
       }
+      // Need some way of reading the residual variance file and then
+      // assigning it to what would have been the ..+ 1 part of the computation
 
       // eigen-decomposition and calculate trace_G - main track
       cout << "Start Eigen-Decomposition..." << endl;
diff --git a/src/gemma_io.cpp b/src/gemma_io.cpp
index bd687af..6bb2c68 100644
--- a/src/gemma_io.cpp
+++ b/src/gemma_io.cpp
@@ -150,7 +150,7 @@ std::istream &safeGetline(std::istream &is, std::string &t) {
 
 // Read SNP file. A single column of SNP names.
 bool ReadFile_snps(const string file_snps, set<string> &setSnps) {
-  debug_msg("enter ReadFile_snps");
+  debug_msg("entered");
   setSnps.clear();
 
   igzstream infile(file_snps.c_str(), igzstream::in);
@@ -329,9 +329,6 @@ bool ReadFile_anno(const string &file_anno, map<string, string> &mapRS2chr,
     mapRS2bp[rs] = b_pos;
     mapRS2cM[rs] = cM;
   }
-  // for (auto& [key, value] : mapRS2bp) {
-  //     cerr << key << endl;
-  //}
 
   infile.close();
   infile.clear();
@@ -509,6 +506,73 @@ bool ReadFile_cvt(const string &file_cvt, vector<int> &indicator_cvt,
   return true;
 }
 
+bool ReadFile_resid(const string &file_resid, vector<int> &indicator_resid,
+                  vector<vector<double>> &sigmasq, size_t &n_resid) {
+  debug_msg("entered");
+  indicator_resid.clear();
+
+  ifstream infile(file_resid.c_str(), ifstream::in);
+  if (!infile) {
+    cout << "error! fail to open residual variance file: " << file_resid << endl;
+    return false;
+  }
+
+  string line;
+  char *ch_ptr;
+  double d;
+
+  int flag_na = 0;
+
+  while (!safeGetline(infile, line).eof()) {
+    vector<double> v_d;
+    flag_na = 0;
+    ch_ptr = strtok((char *)line.c_str(), " ,\t");
+    while (ch_ptr != NULL) {
+      if (strcmp(ch_ptr, "NA") == 0) {
+        flag_na = 1;
+        d = -9;
+      } else {
+        d = atof(ch_ptr);
+      }
+
+      v_d.push_back(d);
+      ch_ptr = strtok(NULL, " ,\t");
+    }
+    if (flag_na == 0) {
+      indicator_resid.push_back(1);
+    } else {
+      indicator_resid.push_back(0);
+    }
+    sigmasq.push_back(v_d);
+  }
+
+  if (indicator_resid.empty()) {
+    n_resid = 0;
+  } else {
+    flag_na = 0;
+    for (vector<int>::size_type i = 0; i < indicator_resid.size(); ++i) {
+      if (indicator_resid[i] == 0) {
+        continue;
+      }
+
+      if (flag_na == 0) {
+        flag_na = 1;
+        n_resid = sigmasq[i].size();
+      }
+      if (flag_na != 0 && n_resid != sigmasq[i].size()) {
+        cout << "error! number of residuals in row " << i
+             << " do not match other rows." << endl;
+        return false;
+      }
+    }
+  }
+
+  infile.close();
+  infile.clear();
+
+  return true;
+}
+
 // Read .bim file.
 bool ReadFile_bim(const string &file_bim, vector<SNPINFO> &snpInfo) {
   debug_msg("entered");
@@ -684,9 +748,10 @@ bool ReadFile_geno(const string &file_geno, const set<string> &setSnps,
   double maf, geno, geno_old;
   size_t n_miss;
   size_t n_0, n_1, n_2;
+
   double min_g = std::numeric_limits<float>::max();
   double max_g = std::numeric_limits<float>::min();
-
+  
   int flag_poly;
 
   int ni_total = indicator_idv.size();
@@ -699,9 +764,6 @@ bool ReadFile_geno(const string &file_geno, const set<string> &setSnps,
   file_pos = 0;
   auto count_warnings = 0;
   auto infilen = file_geno.c_str();
-  // for (auto& [key, value] : mapRS2bp) {
-  //      cerr << key << endl;
-  // }
   while (!safe_get_line(infile, line).eof()) {
     ch_ptr = strtok_safe2((char *)line.c_str(), " ,\t",infilen);
     rs = ch_ptr;
@@ -1412,6 +1474,55 @@ void ReadFile_eigenD(const string &file_kd, bool &error, gsl_vector *eval) {
   return;
 }
 
+void ReadFile_resid(const string &file_resid, bool &error, gsl_matrix *sigmasq) {
+  debug_msg("entered");
+  igzstream infile(file_resid.c_str(), igzstream::in);
+  if (!infile) {
+    cout << "error! fail to open the residual variance file: " << file_resid << endl;
+    error = true;
+    return;
+  }
+
+  size_t n_row = sigmasq->size1, n_col = sigmasq->size2, i_row = 0, i_col = 0;
+
+  gsl_matrix_set_zero(sigmasq); //change this so that sigmasq = V_e somehow
+
+  string line;
+  char *ch_ptr;
+  double d;
+
+  while (getline(infile, line)) {
+    if (i_row == n_row) {
+      cout << "error! number of rows in the residual variance file is larger "
+           << "than expected." << endl;
+      error = true;
+    }
+
+    i_col = 0;
+    ch_ptr = strtok((char *)line.c_str(), " ,\t");
+    while (ch_ptr != NULL) {
+      if (i_col == n_col) {
+        cout << "error! number of columns in the residual variance file "
+             << "is larger than expected, for row = " << i_row << endl;
+        error = true;
+      }
+
+      d = atof(ch_ptr);
+      gsl_matrix_set(sigmasq, i_row, i_col, d);
+      i_col++;
+
+      ch_ptr = strtok(NULL, " ,\t");
+    }
+
+    i_row++;
+  }
+
+  infile.close();
+  infile.clear();
+
+  return;
+}
+
 // Read bimbam mean genotype file and calculate kinship matrix.
 bool BimbamKin(const string file_geno, const set<string> ksnps,
                vector<int> &indicator_snp, const int k_mode,
diff --git a/src/gemma_io.h b/src/gemma_io.h
index dd1d5c0..9586591 100644
--- a/src/gemma_io.h
+++ b/src/gemma_io.h
@@ -86,6 +86,8 @@ void ReadFile_mk(const string &file_mk, vector<int> &indicator_idv,
                  gsl_matrix *G);
 void ReadFile_eigenU(const string &file_u, bool &error, gsl_matrix *U);
 void ReadFile_eigenD(const string &file_d, bool &error, gsl_vector *eval);
+bool ReadFile_resid(const string &file_resid, vector<int> &indicator_resid,
+                  gsl_matrix *sigmasq, size_t &n_resid);
 
 bool BimbamKin(const string file_geno, const set<string> ksnps,
                vector<int> &indicator_snp, const int k_mode,
diff --git a/src/lmm.cpp b/src/lmm.cpp
index 091b3b4..ca9fbc2 100644
--- a/src/lmm.cpp
+++ b/src/lmm.cpp
@@ -49,8 +49,6 @@
 #include "lmm.h"
 #include "mathfunc.h"
 
-#define P_YY_MIN 0.00000001
-
 using namespace std;
 
 void LMM::CopyFromParam(PARAM &cPar) {
@@ -524,8 +522,7 @@ double LogL_f(double l, void *params) {
   index_yy = GetabIndex(n_cvt + 2, n_cvt + 2, n_cvt);
   double P_yy = gsl_matrix_safe_get(Pab, nc_total, index_yy);
 
-  if (P_yy >= 0.0 && (P_yy < P_YY_MIN)) P_yy = P_YY_MIN; // control potential round-off
-
+  if (P_yy == 0.0) P_yy = 0.00000001; // control potential round-off
   if (is_check_mode() || is_debug_mode()) {
     // cerr << "P_yy is" << P_yy << endl;
     assert(!is_nan(P_yy));
@@ -851,7 +848,7 @@ double LogRL_f(double l, void *params) {
   index_ww = GetabIndex(n_cvt + 2, n_cvt + 2, n_cvt);
   double P_yy = gsl_matrix_safe_get(Pab, nc_total, index_ww);
   // P_yy is positive and may get zeroed printf("P_yy=%f",P_yy);
-  if (P_yy >= 0.0 && (P_yy < P_YY_MIN)) P_yy = P_YY_MIN; // control potential round-off
+  if (P_yy == 0.0) P_yy = 0.00000001; // control potential round-off
 
   double c = 0.5 * df * (safe_log(df) - safe_log(2 * M_PI) - 1.0);
   f = c - 0.5 * logdet_h - 0.5 * logdet_hiw - 0.5 * df * safe_log(P_yy);
@@ -2027,14 +2024,13 @@ void CalcLambda(const char func_name, FUNC_PARAM &params, const double l_min,
     for (vector<double>::size_type i = 0; i < lambda_lh.size(); ++i) {
       lambda_l = lambda_lh[i].first;
       lambda_h = lambda_lh[i].second;
-      // printf("%f,%f\n",lambda_l,lambda_h);
-      auto handler = gsl_set_error_handler_off();
       gsl_root_fsolver_set((gsl_root_fsolver*)s_f, &F, lambda_l, lambda_h);
 
       int status = GSL_FAILURE;
       uint iter = 0;
       const auto max_iter = 100;
 
+      auto handler = gsl_set_error_handler_off();
       do {
         iter++;
         status = gsl_root_fsolver_iterate((gsl_root_fsolver*)s_f);
diff --git a/src/mvlmm.cpp b/src/mvlmm.cpp
index 1d05bf1..ceba2bb 100644
--- a/src/mvlmm.cpp
+++ b/src/mvlmm.cpp
@@ -54,6 +54,7 @@ void MVLMM::CopyFromParam(PARAM &cPar) {
 
   file_bfile = cPar.file_bfile;
   file_geno = cPar.file_geno;
+  file_resid = cPar.file_resid;
   file_out = cPar.file_out;
   path_out = cPar.path_out;
 
@@ -80,6 +81,7 @@ void MVLMM::CopyFromParam(PARAM &cPar) {
   ni_test = cPar.ni_test;
   ns_test = cPar.ns_test;
   n_cvt = cPar.n_cvt;
+  n_resid = cPar.n_resid;
 
   n_ph = cPar.n_ph;
 
@@ -210,7 +212,7 @@ void MVLMM::WriteFiles() {
 }
 
 // Below are functions for EM algorithm.
-double EigenProc(const gsl_matrix *V_g, const gsl_matrix *V_e, gsl_vector *D_l,
+tuple<gsl_matrix *, double> EigenProc(const gsl_matrix *V_g, const gsl_matrix *V_e, gsl_vector *D_l,
                  gsl_matrix *UltVeh, gsl_matrix *UltVehi) {
   size_t d_size = V_g->size1;
   double d, logdet_Ve = 0.0;
@@ -272,30 +274,42 @@ double EigenProc(const gsl_matrix *V_g, const gsl_matrix *V_e, gsl_vector *D_l,
 
   // free memory
   gsl_matrix_free(Lambda);
-  gsl_matrix_free(V_e_temp);
+  //gsl_matrix_free(V_e_temp); we want to continue using V_e_temp to scale epsilon
   gsl_matrix_free(V_e_h);
   gsl_matrix_free(V_e_hi);
   gsl_matrix_free(VgVehi);
   gsl_matrix_free(U_l);
 
-  return logdet_Ve;
+  return make_tuple(V_e_temp, logdet_Ve);
 }
 
-// Qi=(\sum_{k=1}^n x_kx_k^T\otimes(delta_k*Dl+I)^{-1} )^{-1}.
-double CalcQi(const gsl_vector *eval, const gsl_vector *D_l,
-              const gsl_matrix *X, gsl_matrix *Qi) {
+// Qi=(\sum_{k=1}^n x_kx_k^T\otimes(delta_k*Dl+epsilon_k)^{-1} )^{-1}.
+double CalcQi(const gsl_vector *eval, const gsl_matrix *U, const gsl_matrix *sigmasq, const gsl_vector *D_l,
+              const gsl_matrix *X, const gsl_matrix *V_e_temp, gsl_matrix *Qi) {
   size_t n_size = eval->size, d_size = D_l->size, dc_size = Qi->size1;
   size_t c_size = dc_size / d_size;
 
-  double delta, dl, d1, d2, d, logdet_Q;
+  double delta, ve, epsilon, dl, d1, d2, d, logdet_Q;
 
   gsl_matrix *Q = gsl_matrix_alloc(dc_size, dc_size);
   gsl_matrix_set_zero(Q);
 
+  gsl_matrix *U_T = gsl_matrix_alloc(n_size, n_size);
+ // Transpose U
+  gsl_matrix_transpose_memcpy(U_T, U);
+
+  gsl_matrix *Sigma = gsl_matrix_alloc(n_size, n_size);
+
   for (size_t i = 0; i < c_size; i++) {
     for (size_t j = 0; j < c_size; j++) {
       for (size_t l = 0; l < d_size; l++) {
         dl = gsl_vector_get(D_l, l);
+        ve = gsl_matrix_get(V_e_temp, l, l);
+        gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, U_T, U, 0.0, Sigma);
+
+       double sigmasq_value = gsl_matrix_get(sigmasq, 0, 0);  // Adjust if sigmasq has different structure
+       double scalar = sigmasq_value / ve;
+       gsl_matrix_scale(Sigma, scalar);
 
         if (j < i) {
           d = gsl_matrix_get(Q, j * d_size + l, i * d_size + l);
@@ -305,7 +319,9 @@ double CalcQi(const gsl_vector *eval, const gsl_vector *D_l,
             d1 = gsl_matrix_get(X, i, k);
             d2 = gsl_matrix_get(X, j, k);
             delta = gsl_vector_get(eval, k);
-            d += d1 * d2 / (dl * delta + 1.0); // @@
+            epsilon = gsl_matrix_get(Sigma, k, k);
+
+            d += d1 * d2 / (dl * delta + epsilon); // @@
           }
         }
 
@@ -328,27 +344,48 @@ double CalcQi(const gsl_vector *eval, const gsl_vector *D_l,
   return logdet_Q;
 }
 
-// xHiy=\sum_{k=1}^n x_k\otimes ((delta_k*Dl+I)^{-1}Ul^TVe^{-1/2}y.
+// xHiy=\sum_{k=1}^n x_k\otimes ((delta_k*Dl+epsilon_k)^{-1}Ul^TVe^{-1/2}y.
 //
 // FIXME: mvlmm spends a massive amount of time here
-void CalcXHiY(const gsl_vector *eval, const gsl_vector *D_l,
-              const gsl_matrix *X, const gsl_matrix *UltVehiY,
+void CalcXHiY(const gsl_vector *eval, const gsl_matrix *U, const gsl_matrix *sigmasq, const gsl_vector *D_l,
+              const gsl_matrix *X, const gsl_matrix *UltVehiY, const gsl_matrix *V_e_temp,
               gsl_vector *xHiy) {
   // debug_msg("enter");
-  size_t n_size = eval->size, c_size = X->size1, d_size = D_l->size;
+  size_t n_size = eval->size, 
+  c_size = X->size1, 
+  d_size = D_l->size;
 
   // gsl_vector_set_zero(xHiy);
+  std::vector<double> xHiy(d_size * c_size, 0.0);  // Initialize output vector xHiy
 
-  double x, delta, dl, y, d;
+  gsl_matrix *U_T = gsl_matrix_alloc(n_size, n_size);
+  gsl_matrix *Sigma = gsl_matrix_alloc(n_size, n_size);
+
+  // Transpose U
+  gsl_matrix_transpose_memcpy(U_T, U);
+
+  double x, delta, ve, epsilon, dl, y, d;
+ 
   for (size_t i = 0; i < d_size; i++) {
     dl = gsl_vector_get(D_l, i);
+    ve = gsl_matrix_get(V_e_temp, i, i);
+   
+   // Calculate Sigma = t(U) %*% (sigmasq / V_e_temp[i]) %*% U
+    gsl_matrix_set_zero(Sigma);
+    gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, U_T, U, 0.0, Sigma);
+    double scalar = sigmasq / ve;
+    gsl_matrix_scale(Sigma, scalar);
+   
     for (size_t j = 0; j < c_size; j++) {
       d = 0.0;
       for (size_t k = 0; k < n_size; k++) {
         x = gsl_matrix_get(X, j, k);
         y = gsl_matrix_get(UltVehiY, i, k);
         delta = gsl_vector_get(eval, k);
-        d += x * y / (delta * dl + 1.0);
+        epsilon = gsl_vector_get(Sigma, k, k);
+
+        epsilon_sc = epsilon / ve;
+        d += x * y / (delta * dl + epsilon);
       }
       gsl_vector_set(xHiy, j * d_size + i, d);
     }
@@ -358,19 +395,38 @@ void CalcXHiY(const gsl_vector *eval, const gsl_vector *D_l,
   return;
 }
 
-// OmegaU=D_l/(delta Dl+I)^{-1}
-// OmegaE=delta D_l/(delta Dl+I)^{-1}
-void CalcOmega(const gsl_vector *eval, const gsl_vector *D_l,
+// OmegaU=D_l/(delta Dl+epsilon)^{-1}
+// OmegaE=delta D_l/(delta Dl+epsilon)^{-1}
+void CalcOmega(const gsl_vector *eval, const gsl_matrix *U, const gsl_matrix *sigmasq, const gsl_vector *D_l, const gsl_matrix *V_e_temp,
                gsl_matrix *OmegaU, gsl_matrix *OmegaE) {
-  size_t n_size = eval->size, d_size = D_l->size;
-  double delta, dl, d_u, d_e;
+  size_t n_size = eval->size, 
+  d_size = D_l->size;
+  double delta, ve, epsilon, dl, d_u, d_e;
 
+  gsl_matrix *U_T = gsl_matrix_alloc(n_size, n_size);
+  gsl_matrix *Sigma = gsl_matrix_alloc(n_size, n_size);
+
+  // Transpose U
+  gsl_matrix_transpose_memcpy(U_T, U);
+ 
   for (size_t k = 0; k < n_size; k++) {
-    delta = gsl_vector_get(eval, k);
+    delta = gsl_vector_get(eval, k);;
+
     for (size_t i = 0; i < d_size; i++) {
       dl = gsl_vector_get(D_l, i);
+      ve = gsl_matrix_get(V_e_temp, i, i);
+
+      // Calculate Sigma = t(U) %*% (sigmasq / V_e_temp[i]) %*% U
+      gsl_matrix_set_zero(Sigma);
+      gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, U_T, U, 0.0, Sigma);
+      double scalar = sigmasq / ve;
+      gsl_matrix_scale(Sigma, scalar);
+
+      // Get epsilon from Sigma
+      epsilon = gsl_matrix_get(Sigma, k, k);
 
-      d_u = dl / (delta * dl + 1.0);  // @@
+     
+      d_u = dl / (delta * dl + epsilon);  // @@
       d_e = delta * d_u;
 
       gsl_matrix_set(OmegaU, i, k, d_u);
@@ -440,28 +496,47 @@ void UpdateRL_B(const gsl_vector *xHiy, const gsl_matrix *Qi,
   return;
 }
 
-void UpdateV(const gsl_vector *eval, const gsl_matrix *U, const gsl_matrix *E,
-             const gsl_matrix *Sigma_uu, const gsl_matrix *Sigma_ee,
+void UpdateV(const gsl_vector *eval, const gsl_matrix *sigmasq, const gsl_matrix *U, const gsl_matrix *E,
+             const gsl_matrix *Sigma_uu, const gsl_matrix *Sigma_ee, const gsl_matrix *V_e_temp,
              gsl_matrix *V_g, gsl_matrix *V_e) {
   size_t n_size = eval->size, d_size = U->size1;
 
   gsl_matrix_set_zero(V_g);
   gsl_matrix_set_zero(V_e);
 
-  double delta;
+  double delta, ve;
 
-  // Calculate the first part: UD^{-1}U^T and EE^T.
+  gsl_matrix *U_T = gsl_matrix_alloc(n_size, n_size);
+  gsl_matrix *Sigma = gsl_matrix_alloc(n_size, n_size);
+
+  // Transpose U
+  gsl_matrix_transpose_memcpy(U_T, U);
+  
+ // Calculate the first part: UD^{-1}U^T and EE^T.
+  for (size_t i = 0; i < d_size; i++) {
+    ve = gsl_matrix_get(V_e_temp, i, i);
+  
   for (size_t k = 0; k < n_size; k++) {
     delta = gsl_vector_get(eval, k);
+    
     if (delta == 0) {
       continue;
     }
+   // Calculate Sigma = t(U) %*% (sigmasq / V_e_temp[i]) %*% U
+    gsl_matrix_set_zero(Sigma);
+    gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, U_T, U, 0.0, Sigma);
+    double scalar = sigmasq / ve;
+    gsl_matrix_scale(Sigma, scalar);
 
+    double epsilon = gsl_matrix_get(Sigma, k, k);
+  
     gsl_vector_const_view U_col = gsl_matrix_const_column(U, k);
+    gsl_vector_const_view E_col = gsl_matrix_const_column(E, k);
     gsl_blas_dsyr(CblasUpper, 1.0 / delta, &U_col.vector, V_g);
+    gsl_blas_dsyr(CblasUpper, 1.0 / epsilon, &E_col.vector, V_e);
+  }
   }
 
-  gsl_blas_dsyrk(CblasUpper, CblasNoTrans, 1.0, E, 0.0, V_e);
 
   // Copy the upper part to lower part.
   for (size_t i = 0; i < d_size; i++) {
@@ -482,10 +557,10 @@ void UpdateV(const gsl_vector *eval, const gsl_matrix *U, const gsl_matrix *E,
   return;
 }
 
-void CalcSigma(const char func_name, const gsl_vector *eval,
-               const gsl_vector *D_l, const gsl_matrix *X,
+void CalcSigma(const char func_name, const gsl_vector *eval, const gsl_matrix *U,
+               const gsl_matrix *sigmasq, const gsl_vector *D_l, const gsl_matrix *X,
                const gsl_matrix *OmegaU, const gsl_matrix *OmegaE,
-               const gsl_matrix *UltVeh, const gsl_matrix *Qi,
+               const gsl_matrix *UltVeh, const gsl_matrix *Qi, const gsl_matrix *V_e_temp,
                gsl_matrix *Sigma_uu, gsl_matrix *Sigma_ee) {
   if (func_name != 'R' && func_name != 'L' && func_name != 'r' &&
       func_name != 'l') {
@@ -494,13 +569,15 @@ void CalcSigma(const char func_name, const gsl_vector *eval,
     return;
   }
 
-  size_t n_size = eval->size, c_size = X->size1;
-  size_t d_size = D_l->size, dc_size = Qi->size1;
+  size_t n_size = eval->size, 
+  c_size = X->size1;
+  size_t d_size = D_l->size, 
+  dc_size = Qi->size1;
 
   gsl_matrix_set_zero(Sigma_uu);
   gsl_matrix_set_zero(Sigma_ee);
 
-  double delta, dl, x, d;
+  double delta, ve, epsilon, dl, x, d;
 
   // Calculate the first diagonal term.
   gsl_vector_view Suu_diag = gsl_matrix_diagonal(Sigma_uu);
@@ -523,14 +600,28 @@ void CalcSigma(const char func_name, const gsl_vector *eval,
     gsl_matrix_set_zero(M_u);
     gsl_matrix_set_zero(M_e);
 
+    // Transpose U
+    gsl_matrix_transpose_memcpy(U_T, U);
+
     for (size_t k = 0; k < n_size; k++) {
       delta = gsl_vector_get(eval, k);
 
       for (size_t i = 0; i < d_size; i++) {
         dl = gsl_vector_get(D_l, i);
+        ve = gsl_matrix_get(V_e_temp, i, i);
+
+        // Calculate Sigma = t(U) %*% (sigmasq / V_e_temp[i]) %*% U
+        gsl_matrix_set_zero(Sigma);
+        gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, U_T, U, 0.0, Sigma);
+        gsl_matrix_scale(Sigma, sigmasq / ve);
+
+        double epsilon = gsl_matrix_get(Sigma, k, k);
+       
         for (size_t j = 0; j < c_size; j++) {
           x = gsl_matrix_get(X, j, k);
-          d = x / (delta * dl + 1.0);
+
+          epsilon_sc = epsilon / ve;
+          d = x / (delta * dl + epsilon);
           gsl_matrix_set(M_e, j * d_size + i, i, d);
           gsl_matrix_set(M_u, j * d_size + i, i, d * dl);
         }
@@ -539,7 +630,7 @@ void CalcSigma(const char func_name, const gsl_vector *eval,
       gsl_blas_dgemm(CblasTrans, CblasNoTrans, delta, M_u, QiM, 1.0, Sigma_uu);
 
       gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, Qi, M_e, 0.0, QiM);
-      gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, M_e, QiM, 1.0, Sigma_ee);
+      gsl_blas_dgemm(CblasTrans, CblasNoTrans, epsilon, M_e, QiM, 1.0, Sigma_ee);
     }
 
     gsl_matrix_free(M_u);
@@ -562,19 +653,36 @@ void CalcSigma(const char func_name, const gsl_vector *eval,
 // 'R' for restricted likelihood and 'L' for likelihood.
 // 'R' update B and 'L' don't.
 // only calculate -0.5*\sum_{k=1}^n|H_k|-0.5yPxy.
-double MphCalcLogL(const gsl_vector *eval, const gsl_vector *xHiy,
-                   const gsl_vector *D_l, const gsl_matrix *UltVehiY,
+double MphCalcLogL(const gsl_vector *eval, const gsl_matrix *U, const gsl_vector *xHiy, const gsl_matrix *sigmasq,
+                   const gsl_vector *D_l, const gsl_matrix *UltVehiY, const gsl_matrix *V_e_temp,
                    const gsl_matrix *Qi) {
   size_t n_size = eval->size, d_size = D_l->size, dc_size = Qi->size1;
-  double logl = 0.0, delta, dl, y, d;
+  double logl = 0.0, delta, ve, epsilon, epsilon_sc, dl, y, d;
+  // Create temporary matrix for Sigma
+  gsl_matrix *Sigma = gsl_matrix_alloc(n_size, n_size);
+  gsl_matrix *U_T = gsl_matrix_alloc(n_size, n_size);
+    
+  // Transpose U
+  gsl_matrix_transpose_memcpy(U_T, U);
 
   // Calculate yHiy+log|H_k|.
   for (size_t k = 0; k < n_size; k++) {
     delta = gsl_vector_get(eval, k);
+
     for (size_t i = 0; i < d_size; i++) {
       y = gsl_matrix_get(UltVehiY, i, k);
       dl = gsl_vector_get(D_l, i);
-      d = delta * dl + 1.0;
+      ve = gsl_matrix_get(V_e_temp, i, i);
+
+      // Calculate Sigma = t(U) %*% (sigmasq / V_e_temp[i]) %*% U
+      gsl_matrix_set_zero(Sigma);
+      gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, U_T, U, 0.0, Sigma);
+      gsl_matrix_scale(Sigma, sigmasq / ve);
+
+      double epsilon = gsl_matrix_get(Sigma, k, k);
+
+      // Calculate d = delta * dl + epsilon
+      double d = delta * dl + epsilon;
 
       logl += y * y / d + safe_log(d);
     }
@@ -597,7 +705,7 @@ double MphCalcLogL(const gsl_vector *eval, const gsl_vector *xHiy,
 // dxd matrix, V_e is a dxd matrix, eval is a size n vector
 //'R' for restricted likelihood and 'L' for likelihood.
 double MphEM(const char func_name, const size_t max_iter, const double max_prec,
-             const gsl_vector *eval, const gsl_matrix *X, const gsl_matrix *Y,
+             const gsl_vector *eval, const gsl_matrix *U, const gsl_matrix *sigmasq, const gsl_matrix *X, const gsl_matrix *Y,
              gsl_matrix *U_hat, gsl_matrix *E_hat, gsl_matrix *OmegaU,
              gsl_matrix *OmegaE, gsl_matrix *UltVehiY, gsl_matrix *UltVehiBX,
              gsl_matrix *UltVehiU, gsl_matrix *UltVehiE, gsl_matrix *V_g,
@@ -617,6 +725,7 @@ double MphEM(const char func_name, const size_t max_iter, const double max_prec,
   gsl_vector *D_l = gsl_vector_alloc(d_size);
   gsl_matrix *UltVeh = gsl_matrix_alloc(d_size, d_size);
   gsl_matrix *UltVehi = gsl_matrix_alloc(d_size, d_size);
+  gsl_matrix *V_e_temp = gsl_matrix_alloc(d_size, d_size);
   gsl_matrix *UltVehiB = gsl_matrix_alloc(d_size, c_size);
   gsl_matrix *Qi = gsl_matrix_alloc(dc_size, dc_size);
   gsl_matrix *Sigma_uu = gsl_matrix_alloc(d_size, d_size);
@@ -628,6 +737,48 @@ double MphEM(const char func_name, const size_t max_iter, const double max_prec,
   double logdet_Q, logdet_Ve;
   int sig;
 
+//print inputs
+  cout<<"X, as inputted to MphEM: "<<endl;
+  for (size_t i=0; i<c_size; i++) {
+    for (size_t j=0; j<n_size; j++) {
+      cout<<gsl_matrix_get(X, i, j)<<"\t";
+    }
+    cout<<endl;
+  }	
+  cout<<"size2 of X: "<<endl;
+  cout<<X->size2<<endl;
+  
+  cout<<"size1 of X: "<<endl;
+  cout<<X->size1<<endl;
+  cout<<"Y, as inputted to MphEM: "<<endl;
+  for (size_t i=0; i<d_size; i++) {
+    for (size_t j=0; j<n_size; j++) {
+      cout<<gsl_matrix_get(Y, i, j)<<"\t";
+    }
+    cout<<endl;
+  }	
+  cout<<"size2 of Y: "<<endl;
+  cout<<Y->size2<<endl;
+  
+  cout<<"size1 of Y: "<<endl;
+  cout<<Y->size1<<endl;
+  
+  cout<<"Vg, as inputted to MphEM: "<<endl;
+  for (size_t i=0; i<d_size; i++) {
+    for (size_t j=0; j<d_size; j++) {
+      cout<<gsl_matrix_get(V_g, i, j)<<"\t";
+    }
+    cout<<endl;
+  }	
+  
+  cout<<"Ve, as inputted to MphEM: "<<endl;
+  for (size_t i=0; i<d_size; i++) {
+    for (size_t j=0; j<d_size; j++) {
+      cout<<gsl_matrix_get(V_e, i, j)<<"\t";
+    }
+    cout<<endl;
+  }
+
   // Calculate |XXt| and (XXt)^{-1}.
   gsl_blas_dsyrk(CblasUpper, CblasNoTrans, 1.0, X, 0.0, XXt);
   for (size_t i = 0; i < c_size; ++i) {
@@ -650,16 +801,16 @@ double MphEM(const char func_name, const size_t max_iter, const double max_prec,
 
   // Start EM.
   for (size_t t = 0; t < max_iter; t++) {
-    logdet_Ve = EigenProc(V_g, V_e, D_l, UltVeh, UltVehi);
+    tie(V_e_temp, logdet_Ve) = EigenProc(V_g, V_e, D_l, UltVeh, UltVehi);
 
-    logdet_Q = CalcQi(eval, D_l, X, Qi);
+    logdet_Q = CalcQi(eval, U, sigmasq, D_l, X, V_e_temp, Qi);
 
     gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, UltVehi, Y, 0.0, UltVehiY);
-    CalcXHiY(eval, D_l, X, UltVehiY, xHiy);
+    CalcXHiY(eval, U, sigmasq, D_l, X, UltVehiY, V_e_temp, xHiy);
 
     // Calculate log likelihood/restricted likelihood value, and
     // terminate if change is small.
-    logl_new = logl_const + MphCalcLogL(eval, xHiy, D_l, UltVehiY, Qi) -
+    logl_new = logl_const + MphCalcLogL(eval, U, xHiy, sigmasq, D_l, UltVehiY, V_e_temp, Qi) -
                0.5 * (double)n_size * logdet_Ve;
     if (func_name == 'R' || func_name == 'r') {
       logl_new += -0.5 * (logdet_Q - (double)c_size * logdet_Ve);
@@ -669,7 +820,7 @@ double MphEM(const char func_name, const size_t max_iter, const double max_prec,
     }
     logl_old = logl_new;
 
-    CalcOmega(eval, D_l, OmegaU, OmegaE);
+    CalcOmega(eval, U, sigmasq, D_l, V_e_temp, OmegaU, OmegaE);
 
     // Update UltVehiB, UltVehiU.
     if (func_name == 'R' || func_name == 'r') {
@@ -701,12 +852,53 @@ double MphEM(const char func_name, const size_t max_iter, const double max_prec,
     gsl_blas_dgemm(CblasTrans, CblasNoTrans, 1.0, UltVeh, UltVehiB, 0.0, B);
 
     // Calculate Sigma_uu and Sigma_ee.
-    CalcSigma(func_name, eval, D_l, X, OmegaU, OmegaE, UltVeh, Qi, Sigma_uu,
+    CalcSigma(func_name, eval, U, sigmasq, D_l, X, OmegaU, OmegaE, UltVeh, Qi, V_e_temp, Sigma_uu,
               Sigma_ee);
 
     // Update V_g and V_e.
-    UpdateV(eval, U_hat, E_hat, Sigma_uu, Sigma_ee, V_g, V_e);
-  }
+    UpdateV(eval, U, sigmasq, U_hat, E_hat, Sigma_uu, Sigma_ee, V_e_temp, V_g, V_e);
+    // print statements
+    //print iteration number:
+    cout<< t<<endl;
+    //print logdet_Ve
+    cout<<"logdet_Ve"<<endl;
+    cout<< logdet_Ve <<endl;
+    //print D_l
+    cout<<"D_l"<<endl;
+    for (size_t i=0; i<d_size; i++) {
+      cout<<gsl_vector_get(D_l, i)<<"\t";
+    }
+    cout<<endl;
+    //print UltVeh	
+    cout<<"UltVeh"<<endl;
+    for (size_t i=0; i<d_size; i++) {
+      for (size_t j=0; j<d_size; j++) {
+        cout<<gsl_matrix_get(UltVeh, i, j)<<"\t";
+      }
+      cout<<endl;
+    }	
+    cout<<"UltVehi"<<endl;
+    for (size_t i=0; i<d_size; i++) {
+      for (size_t j=0; j<d_size; j++) {
+        cout<<gsl_matrix_get(UltVehi, i, j)<<"\t";
+      }
+      cout<<endl;
+    }	
+    cout<<"Vg"<<endl;
+    for (size_t i=0; i<d_size; i++) {
+      for (size_t j=0; j<d_size; j++) {
+        cout<<gsl_matrix_get(V_g, i, j)<<"\t";
+      }
+      cout<<endl;
+    }	
+    cout<<"Ve"<<endl;
+    for (size_t i=0; i<d_size; i++) {
+      for (size_t j=0; j<d_size; j++) {
+        cout<<gsl_matrix_get(V_e, i, j)<<"\t";
+      }
+      cout<<endl;
+    }
+  } //end EM iterations
 
   gsl_matrix_free(XXt);
   gsl_matrix_free(XXti);
@@ -724,17 +916,18 @@ double MphEM(const char func_name, const size_t max_iter, const double max_prec,
 }
 
 // Calculate p-value, beta (d by 1 vector) and V(beta).
-double MphCalcP(const gsl_vector *eval, const gsl_vector *x_vec,
+double MphCalcP(const gsl_vector *eval, const gsl_matrix *U, const gsl_matrix *sigmasq, const gsl_vector *x_vec,
                 const gsl_matrix *W, const gsl_matrix *Y, const gsl_matrix *V_g,
                 const gsl_matrix *V_e, gsl_matrix *UltVehiY, gsl_vector *beta,
                 gsl_matrix *Vbeta) {
   size_t n_size = eval->size, c_size = W->size1, d_size = V_g->size1;
   size_t dc_size = d_size * c_size;
-  double delta, dl, d, d1, d2, dy, dx, dw; //  logdet_Ve, logdet_Q, p_value;
+  double delta, ve, epsilon, epsilon_sc, dl, d, d1, d2, dy, dx, dw; //  logdet_Ve, logdet_Q, p_value;
 
   gsl_vector *D_l = gsl_vector_alloc(d_size);
   gsl_matrix *UltVeh = gsl_matrix_alloc(d_size, d_size);
   gsl_matrix *UltVehi = gsl_matrix_alloc(d_size, d_size);
+  gsl_matrix *V_e_temp = gsl_matrix_alloc(d_size, d_size);
   gsl_matrix *Qi = gsl_matrix_alloc(dc_size, dc_size);
   gsl_matrix *WHix = gsl_matrix_alloc(dc_size, d_size);
   gsl_matrix *QiWHix = gsl_matrix_alloc(dc_size, d_size);
@@ -754,24 +947,38 @@ double MphCalcP(const gsl_vector *eval, const gsl_vector *x_vec,
 
   // Calculate Qi and log|Q|.
   // double logdet_Q = CalcQi(eval, D_l, W, Qi);
-  CalcQi(eval, D_l, W, Qi);
+  CalcQi(eval, U, sigmasq, D_l, W, V_e_temp, Qi);
 
   // Calculate UltVehiY.
   gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, UltVehi, Y, 0.0, UltVehiY);
 
+  gsl_matrix *U_T = gsl_matrix_alloc(n_size, n_size);
+  gsl_matrix *Sigma = gsl_matrix_alloc(n_size, n_size);
+
+  // Transpose U
+  gsl_matrix_transpose_memcpy(U_T, U);
+
   // Calculate WHix, WHiy, xHiy, xHix.
   for (size_t i = 0; i < d_size; i++) {
     dl = gsl_vector_get(D_l, i);
-
+    ve = gsl_matrix_get(V_e_temp, i, i);
+
+    // Calculate Sigma = t(U) %*% (sigmasq / V_e_temp[i]) %*% U
+    gsl_matrix_set_zero(Sigma);
+    gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, U_T, U, 0.0, Sigma);
+    double scalar = sigmasq / ve;
+    gsl_matrix_scale(Sigma, scalar);
+   
     d1 = 0.0;
     d2 = 0.0;
     for (size_t k = 0; k < n_size; k++) {
       delta = gsl_vector_get(eval, k);
+      epsilon = = gsl_matrix_get(Sigma, k, k);
       dx = gsl_vector_get(x_vec, k);
       dy = gsl_matrix_get(UltVehiY, i, k);
 
-      d1 += dx * dy / (delta * dl + 1.0);
-      d2 += dx * dx / (delta * dl + 1.0);
+      d1 += dx * dy / (delta * dl + epsilon);
+      d2 += dx * dx / (delta * dl + epsilon);
     }
     gsl_vector_set(xPy, i, d1);
     gsl_matrix_set(xPx, i, i, d2);
@@ -781,12 +988,13 @@ double MphCalcP(const gsl_vector *eval, const gsl_vector *x_vec,
       d2 = 0.0;
       for (size_t k = 0; k < n_size; k++) {
         delta = gsl_vector_get(eval, k);
+        epsilon = = gsl_matrix_get(Sigma, k, k);
         dx = gsl_vector_get(x_vec, k);
         dw = gsl_matrix_get(W, j, k);
         dy = gsl_matrix_get(UltVehiY, i, k);
 
-        d1 += dx * dw / (delta * dl + 1.0);
-        d2 += dy * dw / (delta * dl + 1.0);
+        d1 += dx * dw / (delta * dl + epsilon);
+        d2 += dy * dw / (delta * dl + epsilon);
       }
       gsl_matrix_set(WHix, j * d_size + i, i, d1);
       gsl_vector_set(WHiy, j * d_size + i, d2);
@@ -832,33 +1040,39 @@ double MphCalcP(const gsl_vector *eval, const gsl_vector *x_vec,
 
 // Calculate B and its standard error (which is a matrix of the same
 // dimension as B).
-void MphCalcBeta(const gsl_vector *eval, const gsl_matrix *W,
+void MphCalcBeta(const gsl_vector *eval, const gsl_matrix *U, const gsl_matrix *sigmasq, const gsl_matrix *W,
                  const gsl_matrix *Y, const gsl_matrix *V_g,
                  const gsl_matrix *V_e, gsl_matrix *UltVehiY, gsl_matrix *B,
                  gsl_matrix *se_B) {
   size_t n_size = eval->size, c_size = W->size1, d_size = V_g->size1;
   size_t dc_size = d_size * c_size;
-  double delta, dl, d, dy, dw; // , logdet_Ve, logdet_Q;
+  double delta, ve, epsilon, epsilon_sc, dl, d, dy, dw; // , logdet_Ve, logdet_Q;
 
   gsl_vector *D_l = gsl_vector_alloc(d_size);
   gsl_matrix *UltVeh = gsl_matrix_alloc(d_size, d_size);
   gsl_matrix *UltVehi = gsl_matrix_alloc(d_size, d_size);
+  gsl_matrix *V_e_temp = gsl_matrix_alloc(d_size, d_size);
   gsl_matrix *Qi = gsl_matrix_alloc(dc_size, dc_size);
   gsl_matrix *Qi_temp = gsl_matrix_alloc(dc_size, dc_size);
   gsl_vector *WHiy = gsl_vector_alloc(dc_size);
   gsl_vector *QiWHiy = gsl_vector_alloc(dc_size);
   gsl_vector *beta = gsl_vector_alloc(dc_size);
   gsl_matrix *Vbeta = gsl_matrix_alloc(dc_size, dc_size);
+  gsl_matrix *U_T = gsl_matrix_alloc(n_size, n_size);
+  gsl_matrix *Sigma = gsl_matrix_alloc(n_size, n_size);
+
+  // Transpose U
+  gsl_matrix_transpose_memcpy(U_T, U);
 
   gsl_vector_set_zero(WHiy);
 
   // Eigen decomposition and calculate log|Ve|.
-  // double logdet_Ve = EigenProc(V_g, V_e, D_l, UltVeh, UltVehi);
+  // tuple(V_e_temp,logdet_Ve) = EigenProc(V_g, V_e, D_l, UltVeh, UltVehi);
   EigenProc(V_g, V_e, D_l, UltVeh, UltVehi);
 
   // Calculate Qi and log|Q|.
-  // double logdet_Q = CalcQi(eval, D_l, W, Qi);
-  CalcQi(eval, D_l, W, Qi);
+  // double logdet_Q = CalcQi(eval, D_l, W, V_e_temp, Qi);
+  CalcQi(eval, U, sigmasq, D_l, W, V_e_temp, Qi);
 
   // Calculate UltVehiY.
   gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, UltVehi, Y, 0.0, UltVehiY);
@@ -866,15 +1080,23 @@ void MphCalcBeta(const gsl_vector *eval, const gsl_matrix *W,
   // Calculate WHiy.
   for (size_t i = 0; i < d_size; i++) {
     dl = gsl_vector_get(D_l, i);
-
+    ve = gsl_matrix_get(V_e_temp, i, i);
+
+    // Calculate Sigma = t(U) %*% (sigmasq / V_e_temp[i]) %*% U
+    gsl_matrix_set_zero(Sigma);
+    gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, U_T, U, 0.0, Sigma);
+    double scalar = sigmasq / ve;
+    gsl_matrix_scale(Sigma, scalar);
+   
     for (size_t j = 0; j < c_size; j++) {
       d = 0.0;
       for (size_t k = 0; k < n_size; k++) {
         delta = gsl_vector_get(eval, k);
+        double epsilon = gsl_matrix_get(Sigma, k, k);
         dw = gsl_matrix_get(W, j, k);
         dy = gsl_matrix_get(UltVehiY, i, k);
 
-        d += dy * dw / (delta * dl + 1.0);
+        d += dy * dw / (delta * dl + epsilon);
       }
       gsl_vector_set(WHiy, j * d_size + i, d);
     }
@@ -925,6 +1147,7 @@ void MphCalcBeta(const gsl_vector *eval, const gsl_matrix *W,
   gsl_matrix_free(UltVeh);
   gsl_matrix_free(UltVehi);
   gsl_matrix_free(Qi);
+  //gsl_matrix_free(V_e_temp);
   gsl_matrix_free(Qi_temp);
   gsl_vector_free(WHiy);
   gsl_vector_free(QiWHiy);
@@ -939,7 +1162,7 @@ void MphCalcBeta(const gsl_vector *eval, const gsl_matrix *W,
 // Calculate all Hi and return logdet_H=\sum_{k=1}^{n}log|H_k|
 // and calculate Qi and return logdet_Q
 // and calculate yPy.
-void CalcHiQi(const gsl_vector *eval, const gsl_matrix *X,
+void CalcHiQi(const gsl_vector *eval, const gsl_matrix *U, const gsl_matrix *sigmasq, const gsl_matrix *X,
               const gsl_matrix *V_g, const gsl_matrix *V_e, gsl_matrix *Hi_all,
               gsl_matrix *Qi, double &logdet_H, double &logdet_Q) {
   gsl_matrix_set_zero(Hi_all);
@@ -948,15 +1171,21 @@ void CalcHiQi(const gsl_vector *eval, const gsl_matrix *X,
   logdet_Q = 0.0;
 
   size_t n_size = eval->size, c_size = X->size1, d_size = V_g->size1;
-  double logdet_Ve = 0.0, delta, dl, d;
+  double logdet_Ve = 0.0, delta, ve, epsilon, epsilon_sc, dl, d;
 
   gsl_matrix *mat_dd = gsl_matrix_alloc(d_size, d_size);
   gsl_matrix *UltVeh = gsl_matrix_alloc(d_size, d_size);
   gsl_matrix *UltVehi = gsl_matrix_alloc(d_size, d_size);
+  gsl_matrix *V_e_temp = gsl_matrix_alloc(d_size, d_size);
   gsl_vector *D_l = gsl_vector_alloc(d_size);
+  gsl_matrix *U_T = gsl_matrix_alloc(n_size, n_size);
+  gsl_matrix *Sigma = gsl_matrix_alloc(n_size, n_size);
+
+  // Transpose U
+  gsl_matrix_transpose_memcpy(U_T, U);
 
   // Calculate D_l, UltVeh and UltVehi.
-  logdet_Ve = EigenProc(V_g, V_e, D_l, UltVeh, UltVehi);
+  tie(V_e_temp, logdet_Ve) = EigenProc(V_g, V_e, D_l, UltVeh, UltVehi);
 
   // Calculate each Hi and log|H_k|.
   logdet_H = (double)n_size * logdet_Ve;
@@ -964,9 +1193,19 @@ void CalcHiQi(const gsl_vector *eval, const gsl_matrix *X,
     delta = gsl_vector_get(eval, k);
 
     gsl_matrix_memcpy(mat_dd, UltVehi);
-    for (size_t i = 0; i < d_size; i++) {
+    
+   for (size_t i = 0; i < d_size; i++) {
       dl = gsl_vector_get(D_l, i);
-      d = delta * dl + 1.0;
+      ve = gsl_matrix_get(V_e_temp, i, i);
+      // Calculate Sigma = t(U) %*% (sigmasq / V_e_temp[i]) %*% U
+      gsl_matrix_set_zero(Sigma);
+      gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, U_T, U, 0.0, Sigma);
+      double scalar = sigmasq / ve;
+      gsl_matrix_scale(Sigma, scalar);
+
+     double epsilon = gsl_matrix_get(Sigma, k, k);
+     
+      d = delta * dl + epsilon;
 
       gsl_vector_view mat_row = gsl_matrix_row(mat_dd, i);
       gsl_vector_scale(&mat_row.vector, 1.0 / d); // @@
@@ -983,7 +1222,7 @@ void CalcHiQi(const gsl_vector *eval, const gsl_matrix *X,
   // Calculate Qi, and multiply I\o times UtVeh on both side and
   // calculate logdet_Q, don't forget to substract
   // c_size*logdet_Ve.
-  logdet_Q = CalcQi(eval, D_l, X, Qi) - (double)c_size * logdet_Ve;
+  logdet_Q = CalcQi(eval, U, sigmasq, D_l, X, V_e_temp, Qi) - (double)c_size * logdet_Ve;
 
   for (size_t i = 0; i < c_size; i++) {
     for (size_t j = 0; j < c_size; j++) {
@@ -1108,66 +1347,99 @@ size_t GetIndex(const size_t i, const size_t j, const size_t d_size) {
   return (2 * d_size - s + 1) * s / 2 + l - s;
 }
 
-void Calc_yHiDHiy(const gsl_vector *eval, const gsl_matrix *Hiy, const size_t i,
+void Calc_yHiDHiy(const gsl_vector *eval, const gsl_matrix *U, const gsl_matrix *sigmasq, const gsl_matrix *Hiy,  const gsl_matrix *V_e_temp, const size_t i,
                   const size_t j, double &yHiDHiy_g, double &yHiDHiy_e) {
   yHiDHiy_g = 0.0;
   yHiDHiy_e = 0.0;
 
   size_t n_size = eval->size;
+  size_t d_size = V_e_temp->size1;
+
+  double delta, ve, epsilon, d1, d2;
+
+  gsl_matrix *U_T = gsl_matrix_alloc(n_size, n_size);
+  gsl_matrix *Sigma = gsl_matrix_alloc(n_size, n_size);
 
-  double delta, d1, d2;
+  // Transpose U
+  gsl_matrix_transpose_memcpy(U_T, U);
 
+for (size_t i = 0; i < d_size; i++) {
+  ve = gsl_matrix_get(V_e_temp, i, i);
+  // Calculate Sigma = t(U) %*% (sigmasq / V_e_temp[i]) %*% U
+  gsl_matrix_set_zero(Sigma);
+  gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, U_T, U, 0.0, Sigma);
+  double scalar = sigmasq / ve;
+  gsl_matrix_scale(Sigma, scalar);
+ 
   for (size_t k = 0; k < n_size; k++) {
     delta = gsl_vector_get(eval, k);
+    epsilon = gsl_matrix_get(Sigma,k, k);
     d1 = gsl_matrix_get(Hiy, i, k);
     d2 = gsl_matrix_get(Hiy, j, k);
 
     if (i == j) {
       yHiDHiy_g += delta * d1 * d2;
-      yHiDHiy_e += d1 * d2;
+      yHiDHiy_e += epsilon * d1 * d2;
     } else {
       yHiDHiy_g += delta * d1 * d2 * 2.0;
-      yHiDHiy_e += d1 * d2 * 2.0;
+      yHiDHiy_e += epsilon * d1 * d2 * 2.0;
     }
   }
+}
+          
 
   return;
 }
 
-void Calc_xHiDHiy(const gsl_vector *eval, const gsl_matrix *xHi,
-                  const gsl_matrix *Hiy, const size_t i, const size_t j,
+void Calc_xHiDHiy(const gsl_vector *eval, const gsl_matrix *U, const gsl_matrix *sigma, const gsl_matrix *xHi,
+                  const gsl_matrix *Hiy, const gsl_matrix *V_e_temp, const size_t i, const size_t j,
                   gsl_vector *xHiDHiy_g, gsl_vector *xHiDHiy_e) {
   gsl_vector_set_zero(xHiDHiy_g);
   gsl_vector_set_zero(xHiDHiy_e);
 
   size_t n_size = eval->size, d_size = Hiy->size1;
 
-  double delta, d;
-
-  for (size_t k = 0; k < n_size; k++) {
-    delta = gsl_vector_get(eval, k);
+  gsl_matrix *U_T = gsl_matrix_alloc(n_size, n_size);
+  gsl_matrix *Sigma = gsl_matrix_alloc(n_size, n_size);
+
+  // Transpose U
+  gsl_matrix_transpose_memcpy(U_T, U);
+ 
+  double delta, ve, epsilon, d;
+  for (size_t i = 0; i < d_size; d++) {
+    ve = gsl_matrix_get(V_e_temp, i, i);
+
+    // Calculate Sigma = t(U) %*% (sigmasq / V_e_temp[i]) %*% U
+    gsl_matrix_set_zero(Sigma);
+    gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, U_T, U, 0.0, Sigma);
+    double scalar = sigmasq / ve;
+    gsl_matrix_scale(Sigma, scalar);
+   
+    for (size_t k = 0; k < n_size; k++) {
+      delta = gsl_vector_get(eval, k);
+      epsilon = gsl_matrix_get(Sigma, k, k);
 
-    gsl_vector_const_view xHi_col_i =
-        gsl_matrix_const_column(xHi, k * d_size + i);
-    d = gsl_matrix_get(Hiy, j, k);
+      gsl_vector_const_view xHi_col_i = //Is there somethign missing here?
+      gsl_matrix_const_column(xHi, k * d_size + i);
+      d = gsl_matrix_get(Hiy, j, k);
 
-    gsl_blas_daxpy(d * delta, &xHi_col_i.vector, xHiDHiy_g);
-    gsl_blas_daxpy(d, &xHi_col_i.vector, xHiDHiy_e);
+      gsl_blas_daxpy(d * delta, &xHi_col_i.vector, xHiDHiy_g);
+      gsl_blas_daxpy(d * epsilon, &xHi_col_i.vector, xHiDHiy_e);
 
-    if (i != j) {
-      gsl_vector_const_view xHi_col_j =
+      if (i != j) {
+        gsl_vector_const_view xHi_col_j =
           gsl_matrix_const_column(xHi, k * d_size + j);
-      d = gsl_matrix_get(Hiy, i, k);
+        d = gsl_matrix_get(Hiy, i, k);
 
-      gsl_blas_daxpy(d * delta, &xHi_col_j.vector, xHiDHiy_g);
-      gsl_blas_daxpy(d, &xHi_col_j.vector, xHiDHiy_e);
+        gsl_blas_daxpy(d * delta, &xHi_col_j.vector, xHiDHiy_g);
+        gsl_blas_daxpy(d * epsilon, &xHi_col_j.vector, xHiDHiy_e);
     }
   }
-
+}
   return;
 }
 
-void Calc_xHiDHix(const gsl_vector *eval, const gsl_matrix *xHi, const size_t i,
+void Calc_xHiDHix(const gsl_vector *eval, const gsl_matrix *U, const gsl_matrix *sigma, const gsl_matrix *V_e_temp, const gsl_matrix *xHi, const size_t i,
                   const size_t j, gsl_matrix *xHiDHix_g,
                   gsl_matrix *xHiDHix_e) {
   gsl_matrix_set_zero(xHiDHix_g);
@@ -1176,36 +1448,54 @@ void Calc_xHiDHix(const gsl_vector *eval, const gsl_matrix *xHi, const size_t i,
   size_t n_size = eval->size, dc_size = xHi->size1;
   size_t d_size = xHi->size2 / n_size;
 
-  double delta;
+  gsl_matrix *U_T = gsl_matrix_alloc(n_size, n_size);
+  gsl_matrix *Sigma = gsl_matrix_alloc(n_size, n_size);
+
+  // Transpose U
+  gsl_matrix_transpose_memcpy(U_T, U);
+ 
+  double delta, ve, epsilon;
 
   gsl_matrix *mat_dcdc = gsl_matrix_alloc(dc_size, dc_size);
   gsl_matrix *mat_dcdc_t = gsl_matrix_alloc(dc_size, dc_size);
+  for (size_t i = 0; i < d_size; i++) {
+    ve = gsl_matrix_get(V_e_temp, i, i);
+
+    // Calculate Sigma = t(U) %*% (sigmasq / V_e_temp[i]) %*% U
+    gsl_matrix_set_zero(Sigma);
+    gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, U_T, U, 0.0, Sigma);
+    double scalar = sigmasq / ve;
+    gsl_matrix_scale(Sigma, scalar);
+   
+    for (size_t k = 0; k < n_size; k++) {
+      delta = gsl_vector_get(eval, k);
+      epsilon = gsl_matrix_get(Sigma, k, k);
 
-  for (size_t k = 0; k < n_size; k++) {
-    delta = gsl_vector_get(eval, k);
-
-    gsl_vector_const_view xHi_col_i =
+      gsl_vector_const_view xHi_col_i =
         gsl_matrix_const_column(xHi, k * d_size + i);
-    gsl_vector_const_view xHi_col_j =
+      gsl_vector_const_view xHi_col_j =
         gsl_matrix_const_column(xHi, k * d_size + j);
 
-    gsl_matrix_set_zero(mat_dcdc);
-    gsl_blas_dger(1.0, &xHi_col_i.vector, &xHi_col_j.vector, mat_dcdc);
+      gsl_matrix_set_zero(mat_dcdc);
+      gsl_blas_dger(1.0, &xHi_col_i.vector, &xHi_col_j.vector, mat_dcdc);
 
-    gsl_matrix_transpose_memcpy(mat_dcdc_t, mat_dcdc);
+      gsl_matrix_transpose_memcpy(mat_dcdc_t, mat_dcdc);
 
-    gsl_matrix_add(xHiDHix_e, mat_dcdc);
+      gsl_matrix_scale(mat_dcdc, epsilon);
+      gsl_matrix_add(xHiDHix_e, mat_dcdc);
 
-    gsl_matrix_scale(mat_dcdc, delta);
-    gsl_matrix_add(xHiDHix_g, mat_dcdc);
+      gsl_matrix_scale(mat_dcdc, delta);
+      gsl_matrix_add(xHiDHix_g, mat_dcdc);
 
-    if (i != j) {
+      if (i != j) {
+      gsl_matrix_scale(mat_dcdc_t, epsilon);
       gsl_matrix_add(xHiDHix_e, mat_dcdc_t);
 
       gsl_matrix_scale(mat_dcdc_t, delta);
       gsl_matrix_add(xHiDHix_g, mat_dcdc_t);
     }
   }
+}
 
   gsl_matrix_free(mat_dcdc);
   gsl_matrix_free(mat_dcdc_t);
@@ -1213,8 +1503,8 @@ void Calc_xHiDHix(const gsl_vector *eval, const gsl_matrix *xHi, const size_t i,
   return;
 }
 
-void Calc_yHiDHiDHiy(const gsl_vector *eval, const gsl_matrix *Hi,
-                     const gsl_matrix *Hiy, const size_t i1, const size_t j1,
+void Calc_yHiDHiDHiy(const gsl_vector *eval, const gsl_matrix *U, const gsl_matrix *sigmasq, const gsl_matrix *Hi,
+                     const gsl_matrix *Hiy, const gsl_matrix *V_e_temp, const size_t i1, const size_t j1,
                      const size_t i2, const size_t j2, double &yHiDHiDHiy_gg,
                      double &yHiDHiDHiy_ee, double &yHiDHiDHiy_ge) {
   yHiDHiDHiy_gg = 0.0;
@@ -1223,56 +1513,72 @@ void Calc_yHiDHiDHiy(const gsl_vector *eval, const gsl_matrix *Hi,
 
   size_t n_size = eval->size, d_size = Hiy->size1;
 
-  double delta, d_Hiy_i1, d_Hiy_j1, d_Hiy_i2, d_Hiy_j2;
-  double d_Hi_i1i2, d_Hi_i1j2, d_Hi_j1i2, d_Hi_j1j2;
+  gsl_matrix *U_T = gsl_matrix_alloc(n_size, n_size);
+  gsl_matrix *Sigma = gsl_matrix_alloc(n_size, n_size);
 
-  for (size_t k = 0; k < n_size; k++) {
-    delta = gsl_vector_get(eval, k);
+  // Transpose U
+  gsl_matrix_transpose_memcpy(U_T, U);
 
-    d_Hiy_i1 = gsl_matrix_get(Hiy, i1, k);
-    d_Hiy_j1 = gsl_matrix_get(Hiy, j1, k);
-    d_Hiy_i2 = gsl_matrix_get(Hiy, i2, k);
-    d_Hiy_j2 = gsl_matrix_get(Hiy, j2, k);
+  double delta, ve, epsilon, d_Hiy_i1, d_Hiy_j1, d_Hiy_i2, d_Hiy_j2;
+  double d_Hi_i1i2, d_Hi_i1j2, d_Hi_j1i2, d_Hi_j1j2;
+  for (size_t i = 0; i < d_size; i++) {
+    ve = gsl_matrix_get(V_e_temp, i, i);
+
+    // Calculate Sigma = t(U) %*% (sigmasq / V_e_temp[i]) %*% U
+    gsl_matrix_set_zero(Sigma);
+    gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, U_T, U, 0.0, Sigma);
+    double scalar = sigmasq / ve;
+    gsl_matrix_scale(Sigma, scalar);
+   
+    for (size_t k = 0; k < n_size; k++) {
+      delta = gsl_vector_get(eval, k);
+      epsilon = gsl_matrix_get(Sigma, k, k);
 
-    d_Hi_i1i2 = gsl_matrix_get(Hi, i1, k * d_size + i2);
-    d_Hi_i1j2 = gsl_matrix_get(Hi, i1, k * d_size + j2);
-    d_Hi_j1i2 = gsl_matrix_get(Hi, j1, k * d_size + i2);
-    d_Hi_j1j2 = gsl_matrix_get(Hi, j1, k * d_size + j2);
+      d_Hiy_i1 = gsl_matrix_get(Hiy, i1, k);
+      d_Hiy_j1 = gsl_matrix_get(Hiy, j1, k);
+      d_Hiy_i2 = gsl_matrix_get(Hiy, i2, k);
+      d_Hiy_j2 = gsl_matrix_get(Hiy, j2, k);
 
-    if (i1 == j1) {
-      yHiDHiDHiy_gg += delta * delta * (d_Hiy_i1 * d_Hi_j1i2 * d_Hiy_j2);
-      yHiDHiDHiy_ee += (d_Hiy_i1 * d_Hi_j1i2 * d_Hiy_j2);
-      yHiDHiDHiy_ge += delta * (d_Hiy_i1 * d_Hi_j1i2 * d_Hiy_j2);
+      d_Hi_i1i2 = gsl_matrix_get(Hi, i1, k * d_size + i2);
+      d_Hi_i1j2 = gsl_matrix_get(Hi, i1, k * d_size + j2);
+      d_Hi_j1i2 = gsl_matrix_get(Hi, j1, k * d_size + i2);
+      d_Hi_j1j2 = gsl_matrix_get(Hi, j1, k * d_size + j2);
 
-      if (i2 != j2) {
+      if (i1 == j1) {
+       yHiDHiDHiy_gg += delta * delta * (d_Hiy_i1 * d_Hi_j1i2 * d_Hiy_j2);
+       yHiDHiDHiy_ee += epsilon * epsilon * (d_Hiy_i1 * d_Hi_j1i2 * d_Hiy_j2);
+       yHiDHiDHiy_ge += delta * epsilon * (d_Hiy_i1 * d_Hi_j1i2 * d_Hiy_j2);
+
+       if (i2 != j2) {
         yHiDHiDHiy_gg += delta * delta * (d_Hiy_i1 * d_Hi_j1j2 * d_Hiy_i2);
-        yHiDHiDHiy_ee += (d_Hiy_i1 * d_Hi_j1j2 * d_Hiy_i2);
-        yHiDHiDHiy_ge += delta * (d_Hiy_i1 * d_Hi_j1j2 * d_Hiy_i2);
-      }
-    } else {
-      yHiDHiDHiy_gg += delta * delta * (d_Hiy_i1 * d_Hi_j1i2 * d_Hiy_j2 +
+        yHiDHiDHiy_ee += epsilon * epsilon * (d_Hiy_i1 * d_Hi_j1j2 * d_Hiy_i2);
+        yHiDHiDHiy_ge += delta * epsilon * (d_Hiy_i1 * d_Hi_j1j2 * d_Hiy_i2);
+       }
+      } else {
+       yHiDHiDHiy_gg += delta * delta * (d_Hiy_i1 * d_Hi_j1i2 * d_Hiy_j2 +
                                         d_Hiy_j1 * d_Hi_i1i2 * d_Hiy_j2);
-      yHiDHiDHiy_ee +=
-          (d_Hiy_i1 * d_Hi_j1i2 * d_Hiy_j2 + d_Hiy_j1 * d_Hi_i1i2 * d_Hiy_j2);
-      yHiDHiDHiy_ge += delta * (d_Hiy_i1 * d_Hi_j1i2 * d_Hiy_j2 +
+       yHiDHiDHiy_ee += epsilon * epsilon * (d_Hiy_i1 * d_Hi_j1i2 * d_Hiy_j2 + 
+                                d_Hiy_j1 * d_Hi_i1i2 * d_Hiy_j2);
+       yHiDHiDHiy_ge += delta * epsilon * (d_Hiy_i1 * d_Hi_j1i2 * d_Hiy_j2 +
                                 d_Hiy_j1 * d_Hi_i1i2 * d_Hiy_j2);
 
       if (i2 != j2) {
         yHiDHiDHiy_gg += delta * delta * (d_Hiy_i1 * d_Hi_j1j2 * d_Hiy_i2 +
                                           d_Hiy_j1 * d_Hi_i1j2 * d_Hiy_i2);
-        yHiDHiDHiy_ee +=
-            (d_Hiy_i1 * d_Hi_j1j2 * d_Hiy_i2 + d_Hiy_j1 * d_Hi_i1j2 * d_Hiy_i2);
-        yHiDHiDHiy_ge += delta * (d_Hiy_i1 * d_Hi_j1j2 * d_Hiy_i2 +
+        yHiDHiDHiy_ee += epsilon * epsilon * (d_Hiy_i1 * d_Hi_j1j2 * d_Hiy_i2 + 
+                                  d_Hiy_j1 * d_Hi_i1j2 * d_Hiy_i2);
+        yHiDHiDHiy_ge += delta * epsilon * (d_Hiy_i1 * d_Hi_j1j2 * d_Hiy_i2 +
                                   d_Hiy_j1 * d_Hi_i1j2 * d_Hiy_i2);
       }
     }
   }
+}
 
   return;
 }
 
-void Calc_xHiDHiDHiy(const gsl_vector *eval, const gsl_matrix *Hi,
-                     const gsl_matrix *xHi, const gsl_matrix *Hiy,
+void Calc_xHiDHiDHiy(const gsl_vector *eval, const gsl_matrix *U, const gsl_matrix *sigmasq, const gsl_matrix *Hi,
+                     const gsl_matrix *xHi, const gsl_matrix *Hiy, const gsl_matrix *V_e_temp,
                      const size_t i1, const size_t j1, const size_t i2,
                      const size_t j2, gsl_vector *xHiDHiDHiy_gg,
                      gsl_vector *xHiDHiDHiy_ee, gsl_vector *xHiDHiDHiy_ge) {
@@ -1282,11 +1588,27 @@ void Calc_xHiDHiDHiy(const gsl_vector *eval, const gsl_matrix *Hi,
 
   size_t n_size = eval->size, d_size = Hiy->size1;
 
-  double delta, d_Hiy_i, d_Hiy_j, d_Hi_i1i2, d_Hi_i1j2;
+  gsl_matrix *U_T = gsl_matrix_alloc(n_size, n_size);
+  gsl_matrix *Sigma = gsl_matrix_alloc(n_size, n_size);
+
+  // Transpose U
+  gsl_matrix_transpose_memcpy(U_T, U);
+
+  double delta, ve, epsilon, d_Hiy_i, d_Hiy_j, d_Hi_i1i2, d_Hi_i1j2;
   double d_Hi_j1i2, d_Hi_j1j2;
 
+for (size_t i = 0; i < d_size; i++) {
+  ve = gsl_matrix_get(V_e_temp, i, i);
+
+  // Calculate Sigma = t(U) %*% (sigmasq / V_e_temp[i]) %*% U
+  gsl_matrix_set_zero(Sigma);
+  gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, U_T, U, 0.0, Sigma);
+  double scalar = sigmasq / ve;
+  gsl_matrix_scale(Sigma, scalar);
+ 
   for (size_t k = 0; k < n_size; k++) {
     delta = gsl_vector_get(eval, k);
+    epsilon = gsl_matrix_get(Sigma, k, k);
 
     gsl_vector_const_view xHi_col_i =
         gsl_matrix_const_column(xHi, k * d_size + i1);
@@ -1304,50 +1626,55 @@ void Calc_xHiDHiDHiy(const gsl_vector *eval, const gsl_matrix *Hi,
     if (i1 == j1) {
       gsl_blas_daxpy(delta * delta * d_Hi_j1i2 * d_Hiy_j, &xHi_col_i.vector,
                      xHiDHiDHiy_gg);
-      gsl_blas_daxpy(d_Hi_j1i2 * d_Hiy_j, &xHi_col_i.vector, xHiDHiDHiy_ee);
-      gsl_blas_daxpy(delta * d_Hi_j1i2 * d_Hiy_j, &xHi_col_i.vector,
+      gsl_blas_daxpy(epsilon * epsilon * d_Hi_j1i2 * d_Hiy_j, &xHi_col_i.vector, 
+                      xHiDHiDHiy_ee);
+      gsl_blas_daxpy(delta * epsilon * d_Hi_j1i2 * d_Hiy_j, &xHi_col_i.vector,
                      xHiDHiDHiy_ge);
 
       if (i2 != j2) {
         gsl_blas_daxpy(delta * delta * d_Hi_j1j2 * d_Hiy_i, &xHi_col_i.vector,
                        xHiDHiDHiy_gg);
-        gsl_blas_daxpy(d_Hi_j1j2 * d_Hiy_i, &xHi_col_i.vector, xHiDHiDHiy_ee);
-        gsl_blas_daxpy(delta * d_Hi_j1j2 * d_Hiy_i, &xHi_col_i.vector,
+        gsl_blas_daxpy(epsilon * epsilon * d_Hi_j1j2 * d_Hiy_i, &xHi_col_i.vector, 
+                        xHiDHiDHiy_ee);
+        gsl_blas_daxpy(delta * epsilon * d_Hi_j1j2 * d_Hiy_i, &xHi_col_i.vector,
                        xHiDHiDHiy_ge);
       }
     } else {
       gsl_blas_daxpy(delta * delta * d_Hi_j1i2 * d_Hiy_j, &xHi_col_i.vector,
                      xHiDHiDHiy_gg);
-      gsl_blas_daxpy(d_Hi_j1i2 * d_Hiy_j, &xHi_col_i.vector, xHiDHiDHiy_ee);
-      gsl_blas_daxpy(delta * d_Hi_j1i2 * d_Hiy_j, &xHi_col_i.vector,
+      gsl_blas_daxpy(epsilon * epsilon * d_Hi_j1i2 * d_Hiy_j, &xHi_col_i.vector, xHiDHiDHiy_ee);
+      gsl_blas_daxpy(delta * epsilon * d_Hi_j1i2 * d_Hiy_j, &xHi_col_i.vector,
                      xHiDHiDHiy_ge);
 
       gsl_blas_daxpy(delta * delta * d_Hi_i1i2 * d_Hiy_j, &xHi_col_j.vector,
                      xHiDHiDHiy_gg);
-      gsl_blas_daxpy(d_Hi_i1i2 * d_Hiy_j, &xHi_col_j.vector, xHiDHiDHiy_ee);
-      gsl_blas_daxpy(delta * d_Hi_i1i2 * d_Hiy_j, &xHi_col_j.vector,
+      gsl_blas_daxpy(epsilon * epsilon * d_Hi_i1i2 * d_Hiy_j, &xHi_col_j.vector, 
+                      xHiDHiDHiy_ee);
+      gsl_blas_daxpy(delta * epsilon * d_Hi_i1i2 * d_Hiy_j, &xHi_col_j.vector,
                      xHiDHiDHiy_ge);
 
       if (i2 != j2) {
         gsl_blas_daxpy(delta * delta * d_Hi_j1j2 * d_Hiy_i, &xHi_col_i.vector,
                        xHiDHiDHiy_gg);
-        gsl_blas_daxpy(d_Hi_j1j2 * d_Hiy_i, &xHi_col_i.vector, xHiDHiDHiy_ee);
-        gsl_blas_daxpy(delta * d_Hi_j1j2 * d_Hiy_i, &xHi_col_i.vector,
+        gsl_blas_daxpy(epsilon * epsilon * d_Hi_j1j2 * d_Hiy_i, &xHi_col_i.vector, 
+                        xHiDHiDHiy_ee);
+        gsl_blas_daxpy(delta * epsilon * d_Hi_j1j2 * d_Hiy_i, &xHi_col_i.vector,
                        xHiDHiDHiy_ge);
 
         gsl_blas_daxpy(delta * delta * d_Hi_i1j2 * d_Hiy_i, &xHi_col_j.vector,
                        xHiDHiDHiy_gg);
-        gsl_blas_daxpy(d_Hi_i1j2 * d_Hiy_i, &xHi_col_j.vector, xHiDHiDHiy_ee);
-        gsl_blas_daxpy(delta * d_Hi_i1j2 * d_Hiy_i, &xHi_col_j.vector,
+        gsl_blas_daxpy(epsilon * epsilon * d_Hi_i1j2 * d_Hiy_i, &xHi_col_j.vector, 
+                        xHiDHiDHiy_ee);
+        gsl_blas_daxpy(delta * epsilon * d_Hi_i1j2 * d_Hiy_i, &xHi_col_j.vector,
                        xHiDHiDHiy_ge);
       }
     }
   }
-
+}
   return;
 }
 
-void Calc_xHiDHiDHix(const gsl_vector *eval, const gsl_matrix *Hi,
+void Calc_xHiDHiDHix(const gsl_vector *eval, const gsl_matrix *U, const gsl_matrix *sigmasq, const gsl_matrix *Hi, const gsl_matrix *V_e_temp,
                      const gsl_matrix *xHi, const size_t i1, const size_t j1,
                      const size_t i2, const size_t j2,
                      gsl_matrix *xHiDHiDHix_gg, gsl_matrix *xHiDHiDHix_ee,
@@ -1358,45 +1685,62 @@ void Calc_xHiDHiDHix(const gsl_vector *eval, const gsl_matrix *Hi,
 
   size_t n_size = eval->size, d_size = Hi->size1, dc_size = xHi->size1;
 
-  double delta, d_Hi_i1i2, d_Hi_i1j2, d_Hi_j1i2, d_Hi_j1j2;
+  double delta, ve, epsilon, d_Hi_i1i2, d_Hi_i1j2, d_Hi_j1i2, d_Hi_j1j2;
 
   gsl_matrix *mat_dcdc = gsl_matrix_alloc(dc_size, dc_size);
 
-  for (size_t k = 0; k < n_size; k++) {
-    delta = gsl_vector_get(eval, k);
+  gsl_matrix *U_T = gsl_matrix_alloc(n_size, n_size);
+  gsl_matrix *Sigma = gsl_matrix_alloc(n_size, n_size);
+
+  // Transpose U
+  gsl_matrix_transpose_memcpy(U_T, U);
+
+
+  for (size_t i = 0; i < d_size; i++) {
+    ve = gsl_matrix_get(V_e_temp, i, i);
+
+    // Calculate Sigma = t(U) %*% (sigmasq / V_e_temp[i]) %*% U
+    gsl_matrix_set_zero(Sigma);
+    gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, U_T, U, 0.0, Sigma);
+    double scalar = sigmasq / ve;
+    gsl_matrix_scale(Sigma, scalar);
+   
+    for (size_t k = 0; k < n_size; k++) {
+      delta = gsl_vector_get(eval, k);
+      epsilon = gsl_matrix_get(Sigma, k, k);
 
-    gsl_vector_const_view xHi_col_i1 =
+      gsl_vector_const_view xHi_col_i1 =
         gsl_matrix_const_column(xHi, k * d_size + i1);
-    gsl_vector_const_view xHi_col_j1 =
+      gsl_vector_const_view xHi_col_j1 =
         gsl_matrix_const_column(xHi, k * d_size + j1);
-    gsl_vector_const_view xHi_col_i2 =
+      gsl_vector_const_view xHi_col_i2 =
         gsl_matrix_const_column(xHi, k * d_size + i2);
-    gsl_vector_const_view xHi_col_j2 =
+      gsl_vector_const_view xHi_col_j2 =
         gsl_matrix_const_column(xHi, k * d_size + j2);
 
-    d_Hi_i1i2 = gsl_matrix_get(Hi, i1, k * d_size + i2);
-    d_Hi_i1j2 = gsl_matrix_get(Hi, i1, k * d_size + j2);
-    d_Hi_j1i2 = gsl_matrix_get(Hi, j1, k * d_size + i2);
-    d_Hi_j1j2 = gsl_matrix_get(Hi, j1, k * d_size + j2);
+      d_Hi_i1i2 = gsl_matrix_get(Hi, i1, k * d_size + i2);
+      d_Hi_i1j2 = gsl_matrix_get(Hi, i1, k * d_size + j2);
+      d_Hi_j1i2 = gsl_matrix_get(Hi, j1, k * d_size + i2);
+      d_Hi_j1j2 = gsl_matrix_get(Hi, j1, k * d_size + j2);
 
-    if (i1 == j1) {
-      gsl_matrix_set_zero(mat_dcdc);
-      gsl_blas_dger(d_Hi_j1i2, &xHi_col_i1.vector, &xHi_col_j2.vector,
+      if (i1 == j1) {
+       gsl_matrix_set_zero(mat_dcdc);
+       gsl_blas_dger(d_Hi_j1i2, &xHi_col_i1.vector, &xHi_col_j2.vector,
                     mat_dcdc);
 
-      gsl_matrix_add(xHiDHiDHix_ee, mat_dcdc);
-      gsl_matrix_scale(mat_dcdc, delta);
-      gsl_matrix_add(xHiDHiDHix_ge, mat_dcdc);
-      gsl_matrix_scale(mat_dcdc, delta);
-      gsl_matrix_add(xHiDHiDHix_gg, mat_dcdc);
+       gsl_matrix_add(xHiDHiDHix_ee, mat_dcdc);
+       gsl_matrix_scale(mat_dcdc, epsilon);
+       gsl_matrix_add(xHiDHiDHix_ge, mat_dcdc);
+       gsl_matrix_scale(mat_dcdc, delta);
+       gsl_matrix_add(xHiDHiDHix_gg, mat_dcdc);
 
-      if (i2 != j2) {
+       if (i2 != j2) {
         gsl_matrix_set_zero(mat_dcdc);
         gsl_blas_dger(d_Hi_j1j2, &xHi_col_i1.vector, &xHi_col_i2.vector,
                       mat_dcdc);
 
         gsl_matrix_add(xHiDHiDHix_ee, mat_dcdc);
-        gsl_matrix_scale(mat_dcdc, delta);
+        gsl_matrix_scale(mat_dcdc, epsilon);
         gsl_matrix_add(xHiDHiDHix_ge, mat_dcdc);
         gsl_matrix_scale(mat_dcdc, delta);
         gsl_matrix_add(xHiDHiDHix_gg, mat_dcdc);
@@ -1407,7 +1751,7 @@ void Calc_xHiDHiDHix(const gsl_vector *eval, const gsl_matrix *Hi,
                     mat_dcdc);
 
       gsl_matrix_add(xHiDHiDHix_ee, mat_dcdc);
-      gsl_matrix_scale(mat_dcdc, delta);
+      gsl_matrix_scale(mat_dcdc, epsilon);
       gsl_matrix_add(xHiDHiDHix_ge, mat_dcdc);
       gsl_matrix_scale(mat_dcdc, delta);
       gsl_matrix_add(xHiDHiDHix_gg, mat_dcdc);
@@ -1417,7 +1761,7 @@ void Calc_xHiDHiDHix(const gsl_vector *eval, const gsl_matrix *Hi,
                     mat_dcdc);
 
       gsl_matrix_add(xHiDHiDHix_ee, mat_dcdc);
-      gsl_matrix_scale(mat_dcdc, delta);
+      gsl_matrix_scale(mat_dcdc, epsilon);
       gsl_matrix_add(xHiDHiDHix_ge, mat_dcdc);
       gsl_matrix_scale(mat_dcdc, delta);
       gsl_matrix_add(xHiDHiDHix_gg, mat_dcdc);
@@ -1428,7 +1772,7 @@ void Calc_xHiDHiDHix(const gsl_vector *eval, const gsl_matrix *Hi,
                       mat_dcdc);
 
         gsl_matrix_add(xHiDHiDHix_ee, mat_dcdc);
-        gsl_matrix_scale(mat_dcdc, delta);
+        gsl_matrix_scale(mat_dcdc, epsilon);
         gsl_matrix_add(xHiDHiDHix_ge, mat_dcdc);
         gsl_matrix_scale(mat_dcdc, delta);
         gsl_matrix_add(xHiDHiDHix_gg, mat_dcdc);
@@ -1438,44 +1782,62 @@ void Calc_xHiDHiDHix(const gsl_vector *eval, const gsl_matrix *Hi,
                       mat_dcdc);
 
         gsl_matrix_add(xHiDHiDHix_ee, mat_dcdc);
-        gsl_matrix_scale(mat_dcdc, delta);
+        gsl_matrix_scale(mat_dcdc, epsilon);
         gsl_matrix_add(xHiDHiDHix_ge, mat_dcdc);
         gsl_matrix_scale(mat_dcdc, delta);
         gsl_matrix_add(xHiDHiDHix_gg, mat_dcdc);
       }
     }
   }
+}
 
   gsl_matrix_free(mat_dcdc);
 
   return;
 }
 
-void Calc_traceHiD(const gsl_vector *eval, const gsl_matrix *Hi, const size_t i,
+void Calc_traceHiD(const gsl_vector *eval, const gsl_matrix *U, const gsl_matrix *sigmasq, const gsl_matrix *Hi, const gsl_matrix *V_e_temp, const size_t i,
                    const size_t j, double &tHiD_g, double &tHiD_e) {
   tHiD_g = 0.0;
   tHiD_e = 0.0;
 
   size_t n_size = eval->size, d_size = Hi->size1;
-  double delta, d;
+  double delta, ve, epsilon, d;
 
-  for (size_t k = 0; k < n_size; k++) {
-    delta = gsl_vector_get(eval, k);
-    d = gsl_matrix_get(Hi, j, k * d_size + i);
+  gsl_matrix *U_T = gsl_matrix_alloc(n_size, n_size);
+  gsl_matrix *Sigma = gsl_matrix_alloc(n_size, n_size);
 
-    if (i == j) {
+  // Transpose U
+  gsl_matrix_transpose_memcpy(U_T, U);
+
+  for (size_t i = 0; i < d_size; i++) {
+    ve = gsl_matrix_get(V_e_temp, i, i);
+
+    // Calculate Sigma = t(U) %*% (sigmasq / V_e_temp[i]) %*% U
+    gsl_matrix_set_zero(Sigma);
+    gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, U_T, U, 0.0, Sigma);
+    double scalar = sigmasq / ve;
+    gsl_matrix_scale(Sigma, scalar);
+   
+    for (size_t k = 0; k < n_size; k++) {
+     delta = gsl_vector_get(eval, k);
+     epsilon = gsl_matrix_get(Sigma, k, k);
+     d = gsl_matrix_get(Hi, j, k * d_size + i);
+
+     if (i == j) {
       tHiD_g += delta * d;
-      tHiD_e += d;
-    } else {
+      tHiD_e += epsilon * d;
+     } else {
       tHiD_g += delta * d * 2.0;
-      tHiD_e += d * 2.0;
+      tHiD_e += epsilon * d * 2.0;
     }
   }
+}
 
   return;
 }
 
-void Calc_traceHiDHiD(const gsl_vector *eval, const gsl_matrix *Hi,
+void Calc_traceHiDHiD(const gsl_vector *eval, const gsl_matrix *U, const gsl_matrix *sigmasq, const gsl_matrix *Hi, const gsl_matrix *V_e_temp,
                       const size_t i1, const size_t j1, const size_t i2,
                       const size_t j2, double &tHiDHiD_gg, double &tHiDHiD_ee,
                       double &tHiDHiD_ge) {
@@ -1484,10 +1846,24 @@ void Calc_traceHiDHiD(const gsl_vector *eval, const gsl_matrix *Hi,
   tHiDHiD_ge = 0.0;
 
   size_t n_size = eval->size, d_size = Hi->size1;
-  double delta, d_Hi_i1i2, d_Hi_i1j2, d_Hi_j1i2, d_Hi_j1j2;
+  double delta, ve, epsilon, d_Hi_i1i2, d_Hi_i1j2, d_Hi_j1i2, d_Hi_j1j2;
 
-  for (size_t k = 0; k < n_size; k++) {
+  gsl_matrix *U_T = gsl_matrix_alloc(n_size, n_size);
+  gsl_matrix *Sigma = gsl_matrix_alloc(n_size, n_size);
+
+  // Transpose U
+  gsl_matrix_transpose_memcpy(U_T, U);
+
+  for (size_t i = 0; i < d_size; i++) {
+   ve = gsl_matrix_get(V_e_temp, i, i);
+   // Calculate Sigma = t(U) %*% (sigmasq / V_e_temp[i]) %*% U
+   gsl_matrix_set_zero(Sigma);
+   gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, U_T, U, 0.0, Sigma);
+   double scalar = sigmasq / ve;
+   gsl_matrix_scale(Sigma, scalar);
+   for (size_t k = 0; k < n_size; k++) {
     delta = gsl_vector_get(eval, k);
+    epsilon = gsl_matrix_get(Sigma, k, k);
 
     d_Hi_i1i2 = gsl_matrix_get(Hi, i1, k * d_size + i2);
     d_Hi_i1j2 = gsl_matrix_get(Hi, i1, k * d_size + j2);
@@ -1496,34 +1872,35 @@ void Calc_traceHiDHiD(const gsl_vector *eval, const gsl_matrix *Hi,
 
     if (i1 == j1) {
       tHiDHiD_gg += delta * delta * d_Hi_i1j2 * d_Hi_j1i2;
-      tHiDHiD_ee += d_Hi_i1j2 * d_Hi_j1i2;
-      tHiDHiD_ge += delta * d_Hi_i1j2 * d_Hi_j1i2;
+      tHiDHiD_ee += epsilon * epsilon * d_Hi_i1j2 * d_Hi_j1i2;
+      tHiDHiD_ge += delta * epsilon * d_Hi_i1j2 * d_Hi_j1i2;
 
       if (i2 != j2) {
         tHiDHiD_gg += delta * delta * d_Hi_i1i2 * d_Hi_j1j2;
-        tHiDHiD_ee += d_Hi_i1i2 * d_Hi_j1j2;
-        tHiDHiD_ge += delta * d_Hi_i1i2 * d_Hi_j1j2;
+        tHiDHiD_ee += epsilon * epsilon * d_Hi_i1i2 * d_Hi_j1j2;
+        tHiDHiD_ge += delta * epsilon * d_Hi_i1i2 * d_Hi_j1j2;
       }
     } else {
       tHiDHiD_gg +=
           delta * delta * (d_Hi_i1j2 * d_Hi_j1i2 + d_Hi_j1j2 * d_Hi_i1i2);
-      tHiDHiD_ee += (d_Hi_i1j2 * d_Hi_j1i2 + d_Hi_j1j2 * d_Hi_i1i2);
-      tHiDHiD_ge += delta * (d_Hi_i1j2 * d_Hi_j1i2 + d_Hi_j1j2 * d_Hi_i1i2);
+      tHiDHiD_ee += epsilon * epsilon * (d_Hi_i1j2 * d_Hi_j1i2 + d_Hi_j1j2 * d_Hi_i1i2);
+      tHiDHiD_ge += delta * epsilon * (d_Hi_i1j2 * d_Hi_j1i2 + d_Hi_j1j2 * d_Hi_i1i2);
 
       if (i2 != j2) {
         tHiDHiD_gg +=
             delta * delta * (d_Hi_i1i2 * d_Hi_j1j2 + d_Hi_j1i2 * d_Hi_i1j2);
-        tHiDHiD_ee += (d_Hi_i1i2 * d_Hi_j1j2 + d_Hi_j1i2 * d_Hi_i1j2);
-        tHiDHiD_ge += delta * (d_Hi_i1i2 * d_Hi_j1j2 + d_Hi_j1i2 * d_Hi_i1j2);
+        tHiDHiD_ee += epsilon * epsilon * (d_Hi_i1i2 * d_Hi_j1j2 + d_Hi_j1i2 * d_Hi_i1j2);
+        tHiDHiD_ge += delta * epsilon * (d_Hi_i1i2 * d_Hi_j1j2 + d_Hi_j1i2 * d_Hi_i1j2);
       }
     }
   }
+}
 
   return;
 }
 
 // trace(PD) = trace((Hi-HixQixHi)D)=trace(HiD) - trace(HixQixHiD)
-void Calc_tracePD(const gsl_vector *eval, const gsl_matrix *Qi,
+void Calc_tracePD(const gsl_vector *eval, const gsl_matrix *U, const gsl_matrix *sigmasq, const gsl_matrix *Qi, const gsl_matrix *V_e_temp, 
                   const gsl_matrix *Hi, const gsl_matrix *xHiDHix_all_g,
                   const gsl_matrix *xHiDHix_all_e, const size_t i,
                   const size_t j, double &tPD_g, double &tPD_e) {
@@ -1533,7 +1910,7 @@ void Calc_tracePD(const gsl_vector *eval, const gsl_matrix *Qi,
   double d;
 
   // Calculate the first part: trace(HiD).
-  Calc_traceHiD(eval, Hi, i, j, tPD_g, tPD_e);
+  Calc_traceHiD(eval, U, sigmasq, Hi, V_e_temp, i, j, tPD_g, tPD_e);
 
   // Calculate the second part: -trace(HixQixHiD).
   for (size_t k = 0; k < dc_size; k++) {
@@ -1555,7 +1932,7 @@ void Calc_tracePD(const gsl_vector *eval, const gsl_matrix *Qi,
 // trace(PDPD) = trace((Hi-HixQixHi)D(Hi-HixQixHi)D)
 //             = trace(HiDHiD) - trace(HixQixHiDHiD)
 //               - trace(HiDHixQixHiD) + trace(HixQixHiDHixQixHiD)
-void Calc_tracePDPD(const gsl_vector *eval, const gsl_matrix *Qi,
+void Calc_tracePDPD(const gsl_vector *eval, const gsl_matrix *U, const gsl_matrix *sigmasq, const gsl_matrix *Qi, const gsl_matrix *V_e_temp,
                     const gsl_matrix *Hi, const gsl_matrix *xHi,
                     const gsl_matrix *QixHiDHix_all_g,
                     const gsl_matrix *QixHiDHix_all_e,
@@ -1571,7 +1948,7 @@ void Calc_tracePDPD(const gsl_vector *eval, const gsl_matrix *Qi,
   double d;
 
   // Calculate the first part: trace(HiDHiD).
-  Calc_traceHiDHiD(eval, Hi, i1, j1, i2, j2, tPDPD_gg, tPDPD_ee, tPDPD_ge);
+  Calc_traceHiDHiD(eval, U, sigmasq, Hi, V_e_temp, i1, j1, i2, j2, tPDPD_gg, tPDPD_ee, tPDPD_ge);
 
   // Calculate the second and third parts:
   // -trace(HixQixHiDHiD) - trace(HiDHixQixHiD)
@@ -1621,7 +1998,7 @@ void Calc_tracePDPD(const gsl_vector *eval, const gsl_matrix *Qi,
 }
 
 // Calculate (xHiDHiy) for every pair (i,j).
-void Calc_xHiDHiy_all(const gsl_vector *eval, const gsl_matrix *xHi,
+void Calc_xHiDHiy_all(const gsl_vector *eval, const gsl_matrix *U, const gsl_matrix *sigmasq, const gsl_matrix *xHi, const gsl_matrix *V_e_temp,
                       const gsl_matrix *Hiy, gsl_matrix *xHiDHiy_all_g,
                       gsl_matrix *xHiDHiy_all_e) {
   gsl_matrix_set_zero(xHiDHiy_all_g);
@@ -1640,14 +2017,14 @@ void Calc_xHiDHiy_all(const gsl_vector *eval, const gsl_matrix *xHi,
       gsl_vector_view xHiDHiy_g = gsl_matrix_column(xHiDHiy_all_g, v);
       gsl_vector_view xHiDHiy_e = gsl_matrix_column(xHiDHiy_all_e, v);
 
-      Calc_xHiDHiy(eval, xHi, Hiy, i, j, &xHiDHiy_g.vector, &xHiDHiy_e.vector);
+      Calc_xHiDHiy(eval, U, sigmasq, xHi, Hiy, V_e_temp, i, j, &xHiDHiy_g.vector, &xHiDHiy_e.vector);
     }
   }
   return;
 }
 
 // Calculate (xHiDHix) for every pair (i,j).
-void Calc_xHiDHix_all(const gsl_vector *eval, const gsl_matrix *xHi,
+void Calc_xHiDHix_all(const gsl_vector *eval, const gsl_matrix *U, const gsl_matrix *sigmasq, const gsl_matrix *xHi, const gsl_matrix *V_e_temp, 
                       gsl_matrix *xHiDHix_all_g, gsl_matrix *xHiDHix_all_e) {
   gsl_matrix_set_zero(xHiDHix_all_g);
   gsl_matrix_set_zero(xHiDHix_all_e);
@@ -1667,15 +2044,15 @@ void Calc_xHiDHix_all(const gsl_vector *eval, const gsl_matrix *xHi,
       gsl_matrix_view xHiDHix_e =
           gsl_matrix_submatrix(xHiDHix_all_e, 0, v * dc_size, dc_size, dc_size);
 
-      Calc_xHiDHix(eval, xHi, i, j, &xHiDHix_g.matrix, &xHiDHix_e.matrix);
+      Calc_xHiDHix(eval, U, sigmasq, V_e_temp, xHi, i, j, &xHiDHix_g.matrix, &xHiDHix_e.matrix);
     }
   }
   return;
 }
 
 // Calculate (xHiDHiy) for every pair (i,j).
-void Calc_xHiDHiDHiy_all(const size_t v_size, const gsl_vector *eval,
-                         const gsl_matrix *Hi, const gsl_matrix *xHi,
+void Calc_xHiDHiDHiy_all(const size_t v_size, const gsl_vector *eval, const gsl_matrix *U, const gsl_matrix *sigmasq,
+                         const gsl_matrix *Hi, const gsl_matrix *xHi, const gsl_matrix *V_e_temp,
                          const gsl_matrix *Hiy, gsl_matrix *xHiDHiDHiy_all_gg,
                          gsl_matrix *xHiDHiDHiy_all_ee,
                          gsl_matrix *xHiDHiDHiy_all_ge) {
@@ -1707,7 +2084,7 @@ void Calc_xHiDHiDHiy_all(const size_t v_size, const gsl_vector *eval,
           gsl_vector_view xHiDHiDHiy_ge =
               gsl_matrix_column(xHiDHiDHiy_all_ge, v1 * v_size + v2);
 
-          Calc_xHiDHiDHiy(eval, Hi, xHi, Hiy, i1, j1, i2, j2,
+          Calc_xHiDHiDHiy(eval, U, sigmasq, Hi, xHi, Hiy, V_e_temp, i1, j1, i2, j2,
                           &xHiDHiDHiy_gg.vector, &xHiDHiDHiy_ee.vector,
                           &xHiDHiDHiy_ge.vector);
         }
@@ -1718,8 +2095,8 @@ void Calc_xHiDHiDHiy_all(const size_t v_size, const gsl_vector *eval,
 }
 
 // Calculate (xHiDHix) for every pair (i,j).
-void Calc_xHiDHiDHix_all(const size_t v_size, const gsl_vector *eval,
-                         const gsl_matrix *Hi, const gsl_matrix *xHi,
+void Calc_xHiDHiDHix_all(const size_t v_size, const gsl_vector *eval, const gsl_matrix *U, const gsl_matrix *sigmasq,
+                         const gsl_matrix *Hi, const gsl_matrix *xHi, const gsl_matrix *V_e_temp, 
                          gsl_matrix *xHiDHiDHix_all_gg,
                          gsl_matrix *xHiDHiDHix_all_ee,
                          gsl_matrix *xHiDHiDHix_all_ge) {
@@ -1758,7 +2135,7 @@ void Calc_xHiDHiDHix_all(const size_t v_size, const gsl_vector *eval,
               xHiDHiDHix_all_ge, 0, (v1 * v_size + v2) * dc_size, dc_size,
               dc_size);
 
-          Calc_xHiDHiDHix(eval, Hi, xHi, i1, j1, i2, j2, &xHiDHiDHix_gg1.matrix,
+          Calc_xHiDHiDHix(eval, U, sigmasq, Hi, V_e_temp, xHi, i1, j1, i2, j2, &xHiDHiDHix_gg1.matrix,
                           &xHiDHiDHix_ee1.matrix, &xHiDHiDHix_ge1.matrix);
 
           if (v2 != v1) {
@@ -1860,7 +2237,7 @@ void Calc_QiMat_all(const gsl_matrix *Qi, const gsl_matrix *mat_all_g,
 // yPDPy = y(Hi-HixQixHi)D(Hi-HixQixHi)y
 //       = ytHiDHiy - (yHix)Qi(xHiDHiy) - (yHiDHix)Qi(xHiy)
 //         + (yHix)Qi(xHiDHix)Qi(xtHiy)
-void Calc_yPDPy(const gsl_vector *eval, const gsl_matrix *Hiy,
+void Calc_yPDPy(const gsl_vector *eval, const gsl_matrix *U, const gsl_matrix *sigmasq, const gsl_matrix *Hiy, const gsl_matrix *V_e_temp,
                 const gsl_vector *QixHiy, const gsl_matrix *xHiDHiy_all_g,
                 const gsl_matrix *xHiDHiy_all_e,
                 const gsl_matrix *xHiDHixQixHiy_all_g,
@@ -1872,7 +2249,7 @@ void Calc_yPDPy(const gsl_vector *eval, const gsl_matrix *Hiy,
   double d;
 
   // First part: ytHiDHiy.
-  Calc_yHiDHiy(eval, Hiy, i, j, yPDPy_g, yPDPy_e);
+  Calc_yHiDHiy(eval, U, sigmasq, Hiy, V_e_temp, i, j, yPDPy_g, yPDPy_e);
 
   // Second and third parts: -(yHix)Qi(xHiDHiy)-(yHiDHix)Qi(xHiy)
   gsl_vector_const_view xHiDHiy_g = gsl_matrix_const_column(xHiDHiy_all_g, v);
@@ -1906,7 +2283,7 @@ void Calc_yPDPy(const gsl_vector *eval, const gsl_matrix *Hiy,
 //                     + (yHix)Qi(xHiDHiDHix)Qi(xHiy)
 //                     - (yHix)Qi(xHiDHix)Qi(xHiDHix)Qi(xHiy)
 void Calc_yPDPDPy(
-    const gsl_vector *eval, const gsl_matrix *Hi, const gsl_matrix *xHi,
+    const gsl_vector *eval, const gsl_matrix *U, const gsl_matrix *sigmasq, const gsl_matrix *Hi, const gsl_matrix *xHi, const gsl_matrix *V_e_temp,
     const gsl_matrix *Hiy, const gsl_vector *QixHiy,
     const gsl_matrix *xHiDHiy_all_g, const gsl_matrix *xHiDHiy_all_e,
     const gsl_matrix *QixHiDHiy_all_g, const gsl_matrix *QixHiDHiy_all_e,
@@ -1928,7 +2305,7 @@ void Calc_yPDPDPy(
   gsl_vector *xHiDHiDHixQixHiy = gsl_vector_alloc(dc_size);
 
   // First part: yHiDHiDHiy.
-  Calc_yHiDHiDHiy(eval, Hi, Hiy, i1, j1, i2, j2, yPDPDPy_gg, yPDPDPy_ee,
+  Calc_yHiDHiDHiy(eval, U, sigmasq, Hi, Hiy, V_e_temp, i1, j1, i2, j2, yPDPDPy_gg, yPDPDPy_ee,
                   yPDPDPy_ge);
 
   // Second and third parts:
@@ -2357,7 +2734,7 @@ void CalcCRT(const gsl_matrix *Hessian_inv, const gsl_matrix *Qi,
 }
 
 // Calculate first-order and second-order derivatives.
-void CalcDev(const char func_name, const gsl_vector *eval, const gsl_matrix *Qi,
+void CalcDev(const char func_name, const gsl_vector *eval, const gsl_matrix *U, const gsl_matrix *sigmasq, const gsl_matrix *Qi,
              const gsl_matrix *Hi, const gsl_matrix *xHi, const gsl_matrix *Hiy,
              const gsl_vector *QixHiy, gsl_vector *gradient,
              gsl_matrix *Hessian_inv, double &crt_a, double &crt_b,
@@ -2376,7 +2753,7 @@ void CalcDev(const char func_name, const gsl_vector *eval, const gsl_matrix *Qi,
   double dev1_g, dev1_e, dev2_gg, dev2_ee, dev2_ge;
 
   gsl_matrix *Hessian = gsl_matrix_alloc(v_size * 2, v_size * 2);
-
+  gsl_matrix *V_e_temp = gsl_matrix_alloc(d_size, d_size);
   gsl_matrix *xHiDHiy_all_g = gsl_matrix_alloc(dc_size, v_size);
   gsl_matrix *xHiDHiy_all_e = gsl_matrix_alloc(dc_size, v_size);
   gsl_matrix *xHiDHix_all_g = gsl_matrix_alloc(dc_size, v_size * dc_size);
@@ -2402,14 +2779,14 @@ void CalcDev(const char func_name, const gsl_vector *eval, const gsl_matrix *Qi,
       gsl_matrix_alloc(dc_size, v_size * v_size * dc_size);
 
   // Calculate xHiDHiy_all, xHiDHix_all and xHiDHixQixHiy_all.
-  Calc_xHiDHiy_all(eval, xHi, Hiy, xHiDHiy_all_g, xHiDHiy_all_e);
-  Calc_xHiDHix_all(eval, xHi, xHiDHix_all_g, xHiDHix_all_e);
+  Calc_xHiDHiy_all(eval, U, sigmasq, xHi, V_e_temp, Hiy, xHiDHiy_all_g, xHiDHiy_all_e);
+  Calc_xHiDHix_all(eval, U, sigmasq, xHi, V_e_temp, xHiDHix_all_g, xHiDHix_all_e);
   Calc_xHiDHixQixHiy_all(xHiDHix_all_g, xHiDHix_all_e, QixHiy,
                          xHiDHixQixHiy_all_g, xHiDHixQixHiy_all_e);
 
-  Calc_xHiDHiDHiy_all(v_size, eval, Hi, xHi, Hiy, xHiDHiDHiy_all_gg,
+  Calc_xHiDHiDHiy_all(v_size, eval, U, sigmasq, Hi, xHi, V_e_temp, Hiy, xHiDHiDHiy_all_gg,
                       xHiDHiDHiy_all_ee, xHiDHiDHiy_all_ge);
-  Calc_xHiDHiDHix_all(v_size, eval, Hi, xHi, xHiDHiDHix_all_gg,
+  Calc_xHiDHiDHix_all(v_size, eval, U, sigmasq, Hi, xHi, V_e_temp, xHiDHiDHix_all_gg,
                       xHiDHiDHix_all_ee, xHiDHiDHix_all_ge);
 
   // Calculate QixHiDHiy_all, QixHiDHix_all and QixHiDHixQixHiy_all.
@@ -2432,18 +2809,18 @@ void CalcDev(const char func_name, const gsl_vector *eval, const gsl_matrix *Qi,
       }
       v1 = GetIndex(i1, j1, d_size);
 
-      Calc_yPDPy(eval, Hiy, QixHiy, xHiDHiy_all_g, xHiDHiy_all_e,
+      Calc_yPDPy(eval, U, sigmasq, Hiy, V_e_temp, QixHiy, xHiDHiy_all_g, xHiDHiy_all_e,
                  xHiDHixQixHiy_all_g, xHiDHixQixHiy_all_e, i1, j1, yPDPy_g,
                  yPDPy_e);
 
       if (func_name == 'R' || func_name == 'r') {
-        Calc_tracePD(eval, Qi, Hi, xHiDHix_all_g, xHiDHix_all_e, i1, j1, tPD_g,
+        Calc_tracePD(eval, U, sigmasq, Qi, V_e_temp, Hi, xHiDHix_all_g, xHiDHix_all_e, i1, j1, tPD_g,
                      tPD_e);
 
         dev1_g = -0.5 * tPD_g + 0.5 * yPDPy_g;
         dev1_e = -0.5 * tPD_e + 0.5 * yPDPy_e;
       } else {
-        Calc_traceHiD(eval, Hi, i1, j1, tHiD_g, tHiD_e);
+        Calc_traceHiD(eval, U, sigmasq, Hi, V_e_temp, i1, j1, tHiD_g, tHiD_e);
 
         dev1_g = -0.5 * tHiD_g + 0.5 * yPDPy_g;
         dev1_e = -0.5 * tHiD_e + 0.5 * yPDPy_e;
@@ -2463,7 +2840,7 @@ void CalcDev(const char func_name, const gsl_vector *eval, const gsl_matrix *Qi,
             continue;
           }
 
-          Calc_yPDPDPy(eval, Hi, xHi, Hiy, QixHiy, xHiDHiy_all_g, xHiDHiy_all_e,
+          Calc_yPDPDPy(eval, U, sigmasq, Hi, xHi, V_e_temp, Hiy, QixHiy, xHiDHiy_all_g, xHiDHiy_all_e,
                        QixHiDHiy_all_g, QixHiDHiy_all_e, xHiDHixQixHiy_all_g,
                        xHiDHixQixHiy_all_e, QixHiDHixQixHiy_all_g,
                        QixHiDHixQixHiy_all_e, xHiDHiDHiy_all_gg,
@@ -2473,7 +2850,7 @@ void CalcDev(const char func_name, const gsl_vector *eval, const gsl_matrix *Qi,
 
           // AI for REML.
           if (func_name == 'R' || func_name == 'r') {
-            Calc_tracePDPD(eval, Qi, Hi, xHi, QixHiDHix_all_g, QixHiDHix_all_e,
+            Calc_tracePDPD(eval, U, sigmasq, Qi, V_e_temp, Hi, xHi, QixHiDHix_all_g, QixHiDHix_all_e,
                            xHiDHiDHix_all_gg, xHiDHiDHix_all_ee,
                            xHiDHiDHix_all_ge, i1, j1, i2, j2, tPDPD_gg,
                            tPDPD_ee, tPDPD_ge);
@@ -2482,7 +2859,7 @@ void CalcDev(const char func_name, const gsl_vector *eval, const gsl_matrix *Qi,
             dev2_ee = 0.5 * tPDPD_ee - yPDPDPy_ee;
             dev2_ge = 0.5 * tPDPD_ge - yPDPDPy_ge;
           } else {
-            Calc_traceHiDHiD(eval, Hi, i1, j1, i2, j2, tHiDHiD_gg, tHiDHiD_ee,
+            Calc_traceHiDHiD(eval, U, sigmasq, Hi, V_e_temp, i1, j1, i2, j2, tHiDHiD_gg, tHiDHiD_ee,
                              tHiDHiD_ge);
 
             dev2_gg = 0.5 * tHiDHiD_gg - yPDPDPy_gg;
@@ -2606,7 +2983,7 @@ void UpdateVgVe(const gsl_matrix *Hessian_inv, const gsl_vector *gradient,
 }
 
 double MphNR(const char func_name, const size_t max_iter, const double max_prec,
-             const gsl_vector *eval, const gsl_matrix *X, const gsl_matrix *Y,
+             const gsl_vector *eval, const gsl_matrix *U, const gsl_matrix *sigmasq, const gsl_matrix *X, const gsl_matrix *Y,
              gsl_matrix *Hi_all, gsl_matrix *xHi_all, gsl_matrix *Hiy_all,
              gsl_matrix *V_g, gsl_matrix *V_e, gsl_matrix *Hessian_inv,
              double &crt_a, double &crt_b, double &crt_c) {
@@ -2627,7 +3004,8 @@ double MphNR(const char func_name, const size_t max_iter, const double max_prec,
 
   gsl_matrix *Vg_save = gsl_matrix_alloc(d_size, d_size);
   gsl_matrix *Ve_save = gsl_matrix_alloc(d_size, d_size);
-  gsl_matrix *V_temp = gsl_matrix_alloc(d_size, d_size);
+  gsl_matrix *V_e_temp = gsl_matrix_alloc(d_size, d_size);
+  gsl_matrix *V_g_temp = gsl_matrix_alloc(d_size, d_size);
   gsl_matrix *U_temp = gsl_matrix_alloc(d_size, d_size);
   gsl_vector *D_temp = gsl_vector_alloc(d_size);
   gsl_vector *xHiy = gsl_vector_alloc(dc_size);
@@ -2676,15 +3054,15 @@ double MphNR(const char func_name, const size_t max_iter, const double max_prec,
 
       // Check if both Vg and Ve are positive definite.
       flag_pd = 1;
-      gsl_matrix_memcpy(V_temp, V_e);
-      EigenDecomp(V_temp, U_temp, D_temp, 0);
+      gsl_matrix_memcpy(V_e_temp, V_e);
+      EigenDecomp(V_e_temp, U_temp, D_temp, 0);
       for (size_t i = 0; i < d_size; i++) {
         if (gsl_vector_get(D_temp, i) <= 0) {
           flag_pd = 0;
         }
       }
-      gsl_matrix_memcpy(V_temp, V_g);
-      EigenDecomp(V_temp, U_temp, D_temp, 0);
+      gsl_matrix_memcpy(V_g_temp, V_g);
+      EigenDecomp(V_g_temp, U_temp, D_temp, 0);
       for (size_t i = 0; i < d_size; i++) {
         if (gsl_vector_get(D_temp, i) <= 0) {
           flag_pd = 0;
@@ -2694,7 +3072,7 @@ double MphNR(const char func_name, const size_t max_iter, const double max_prec,
       // If flag_pd==1, continue to calculate quantities
       // and logl.
       if (flag_pd == 1) {
-        CalcHiQi(eval, X, V_g, V_e, Hi_all, Qi, logdet_H, logdet_Q);
+        CalcHiQi(eval, U, sigmasq, X, V_g, V_e, Hi_all, Qi, logdet_H, logdet_Q);
         Calc_Hiy_all(Y, Hi_all, Hiy_all);
         Calc_xHi_all(X, Hi_all, xHi_all);
 
@@ -2735,7 +3113,7 @@ double MphNR(const char func_name, const size_t max_iter, const double max_prec,
 
     logl_old = logl_new;
 
-    CalcDev(func_name, eval, Qi, Hi_all, xHi_all, Hiy_all, QixHiy, gradient,
+    CalcDev(func_name, eval, U, sigmasq, Qi, Hi_all, xHi_all, Hiy_all, QixHiy, gradient,
             Hessian_inv, crt_a, crt_b, crt_c);
   }
 
@@ -2745,7 +3123,8 @@ double MphNR(const char func_name, const size_t max_iter, const double max_prec,
 
   gsl_matrix_free(Vg_save);
   gsl_matrix_free(Ve_save);
-  gsl_matrix_free(V_temp);
+  gsl_matrix_free(V_e_temp);
+  gsl_matrix_free(V_g_temp);
   gsl_matrix_free(U_temp);
   gsl_vector_free(D_temp);
   gsl_vector_free(xHiy);
@@ -2762,7 +3141,7 @@ double MphNR(const char func_name, const size_t max_iter, const double max_prec,
 // Initialize Vg, Ve and B.
 void MphInitial(const size_t em_iter, const double em_prec,
                 const size_t nr_iter, const double nr_prec,
-                const gsl_vector *eval, const gsl_matrix *X,
+                const gsl_vector *eval, const gsl_matrix *U, const gsl_matrix *sigmasq, const gsl_matrix *X,
                 const gsl_matrix *Y, const double l_min, const double l_max,
                 const size_t n_region, gsl_matrix *V_g, gsl_matrix *V_e,
                 gsl_matrix *B) {
@@ -2786,9 +3165,9 @@ void MphInitial(const size_t em_iter, const double em_prec,
 
   for (size_t i = 0; i < d_size; i++) {
     gsl_vector_const_view Y_row = gsl_matrix_const_row(Y, i);
-    CalcLambda('R', eval, Xt, &Y_row.vector, l_min, l_max, n_region, lambda,
+    CalcLambda('R', eval, U, sigmasq, Xt, &Y_row.vector, l_min, l_max, n_region, lambda,
                logl);
-    CalcLmmVgVeBeta(eval, Xt, &Y_row.vector, lambda, vg, ve, beta_temp,
+    CalcLmmVgVeBeta(eval, U, sigmasq, Xt, &Y_row.vector, lambda, vg, ve, beta_temp,
                     se_beta_temp);
 
     gsl_matrix_set(V_g, i, i, vg);
@@ -2847,10 +3226,10 @@ void MphInitial(const size_t em_iter, const double em_prec,
         gsl_matrix_set(Vg_sub, 1, 1, gsl_matrix_get(V_g, j, j));
         gsl_matrix_set(Ve_sub, 1, 1, gsl_matrix_get(V_e, j, j));
 
-        logl = MphEM('R', em_iter, em_prec, eval, X, Y_sub, U_hat, E_hat,
+        logl = MphEM('R', em_iter, em_prec, eval, U, sigmasq, X, Y_sub, U_hat, E_hat,
                      OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE,
                      Vg_sub, Ve_sub, B_sub);
-        logl = MphNR('R', nr_iter, nr_prec, eval, X, Y_sub, Hi_all, xHi_all,
+        logl = MphNR('R', nr_iter, nr_prec, eval, U, sigmasq, X, Y_sub, Hi_all, xHi_all,
                      Hiy_all, Vg_sub, Ve_sub, Hessian, a, b, c);
 
         gsl_matrix_set(V_g, i, j, gsl_matrix_get(Vg_sub, 0, 1));
@@ -2884,7 +3263,7 @@ void MphInitial(const size_t em_iter, const double em_prec,
 
   // Calculate B hat using GSL estimate.
   gsl_matrix *UltVehiY = gsl_matrix_alloc(d_size, n_size);
-
+  gsl_matrix *V_e_temp = gsl_matrix_alloc(d_size, d_size);
   gsl_vector *D_l = gsl_vector_alloc(d_size);
   gsl_matrix *UltVeh = gsl_matrix_alloc(d_size, d_size);
   gsl_matrix *UltVehi = gsl_matrix_alloc(d_size, d_size);
@@ -2894,15 +3273,21 @@ void MphInitial(const size_t em_iter, const double em_prec,
 
   gsl_vector_set_zero(XHiy);
 
-  double dl, d, delta, dx, dy;
+  gsl_matrix *U_T = gsl_matrix_alloc(n_size, n_size);
+  gsl_matrix *Sigma = gsl_matrix_alloc(n_size, n_size);
+
+  // Transpose U
+  gsl_matrix_transpose_memcpy(U_T, U);
+
+  double logdet_Ve, dl, d, delta, epsilon, dx, dy;
 
   // Eigen decomposition and calculate log|Ve|.
   // double logdet_Ve = EigenProc(V_g, V_e, D_l, UltVeh, UltVehi);
-  EigenProc(V_g, V_e, D_l, UltVeh, UltVehi);
+  tie(V_e_temp, logdet_Ve) = EigenProc(V_g, V_e, D_l, UltVeh, UltVehi);
 
   // Calculate Qi and log|Q|.
-  // double logdet_Q = CalcQi(eval, D_l, X, Qi);
-  CalcQi(eval, D_l, X, Qi);
+  // double logdet_Q = CalcQi(eval, D_l, X, V_e_temp, Qi);
+  CalcQi(eval, U, sigmasq, D_l, X, V_e_temp, Qi);
 
   // Calculate UltVehiY.
   gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, UltVehi, Y, 0.0, UltVehiY);
@@ -2910,14 +3295,23 @@ void MphInitial(const size_t em_iter, const double em_prec,
   // calculate XHiy
   for (size_t i = 0; i < d_size; i++) {
     dl = gsl_vector_get(D_l, i);
-
+    ve = gsl_matrix_get(V_e_temp, i, i);
+
+    // Calculate Sigma = t(U) %*% (sigmasq / V_e_temp[i]) %*% U
+    gsl_matrix_set_zero(Sigma);
+    gsl_blas_dgemm(CblasNoTrans, CblasNoTrans, 1.0, U_T, U, 0.0, Sigma);
+    double scalar = sigmasq / ve;
+    gsl_matrix_scale(Sigma, scalar);
+   
     for (size_t j = 0; j < c_size; j++) {
       d = 0.0;
       for (size_t k = 0; k < n_size; k++) {
         delta = gsl_vector_get(eval, k);
+        epsilon = gsl_matrix_get(Sigma, k, k);
         dx = gsl_matrix_get(X, j, k);
         dy = gsl_matrix_get(UltVehiY, i, k);
-        d += dy * dx / (delta * dl + 1.0);
+
+        d += dy * dx / (delta * dl + epsilon);
       }
       gsl_vector_set(XHiy, j * d_size + i, d);
     }
@@ -2968,7 +3362,7 @@ double PCRT(const size_t mode, const size_t d_size, const double p_value,
   return p_crt;
 }
 
-void MVLMM::AnalyzeBimbam(const gsl_matrix *U, const gsl_vector *eval,
+void MVLMM::AnalyzeBimbam(const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *U, const gsl_matrix *sigmasq,
                           const gsl_matrix *UtW, const gsl_matrix *UtY) {
   debug_msg("entering");
   igzstream infile(file_geno.c_str(), igzstream::in);
@@ -3051,15 +3445,15 @@ void MVLMM::AnalyzeBimbam(const gsl_matrix *U, const gsl_vector *eval,
   gsl_vector_view B_col = gsl_matrix_column(B, c_size);
   gsl_vector_set_zero(&B_col.vector);
 
-  MphInitial(em_iter, em_prec, nr_iter, nr_prec, eval, &X_sub.matrix, Y, l_min,
+  MphInitial(em_iter, em_prec, nr_iter, nr_prec, eval, U, sigmasq, &X_sub.matrix, Y, l_min,
              l_max, n_region, V_g, V_e, &B_sub.matrix);
-  logl_H0 = MphEM('R', em_iter, em_prec, eval, &X_sub.matrix, Y, U_hat, E_hat,
+  logl_H0 = MphEM('R', em_iter, em_prec, eval, U, sigmasq, &X_sub.matrix, Y, U_hat, E_hat,
                   OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g,
                   V_e, &B_sub.matrix);
-  logl_H0 = MphNR('R', nr_iter, nr_prec, eval, &X_sub.matrix, Y, Hi_all,
+  logl_H0 = MphNR('R', nr_iter, nr_prec, eval, U, sigmasq, &X_sub.matrix, Y, Hi_all,
                   &xHi_all_sub.matrix, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b,
                   crt_c);
-  MphCalcBeta(eval, &X_sub.matrix, Y, V_g, V_e, UltVehiY, &B_sub.matrix,
+  MphCalcBeta(eval, U, sigmasq, &X_sub.matrix, Y, V_g, V_e, UltVehiY, &B_sub.matrix,
               se_B_null);
 
   c = 0;
@@ -3119,13 +3513,13 @@ void MVLMM::AnalyzeBimbam(const gsl_matrix *U, const gsl_vector *eval,
   }
   cout << "REMLE likelihood = " << logl_H0 << endl;
 
-  logl_H0 = MphEM('L', em_iter, em_prec, eval, &X_sub.matrix, Y, U_hat, E_hat,
+  logl_H0 = MphEM('L', em_iter, em_prec, eval, U, sigmasq, &X_sub.matrix, Y, U_hat, E_hat,
                   OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g,
                   V_e, &B_sub.matrix);
-  logl_H0 = MphNR('L', nr_iter, nr_prec, eval, &X_sub.matrix, Y, Hi_all,
+  logl_H0 = MphNR('L', nr_iter, nr_prec, eval, U, sigmasq, &X_sub.matrix, Y, Hi_all,
                   &xHi_all_sub.matrix, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b,
                   crt_c);
-  MphCalcBeta(eval, &X_sub.matrix, Y, V_g, V_e, UltVehiY, &B_sub.matrix,
+  MphCalcBeta(eval, U, sigmasq, &X_sub.matrix, Y, V_g, V_e, UltVehiY, &B_sub.matrix,
               se_B_null);
 
   c = 0;
@@ -3288,32 +3682,32 @@ void MVLMM::AnalyzeBimbam(const gsl_matrix *U, const gsl_vector *eval,
 
         // 3 is before 1.
         if (a_mode == 3 || a_mode == 4) {
-          p_score = MphCalcP(eval, &X_row.vector, &X_sub.matrix, Y, V_g_null,
+          p_score = MphCalcP(eval, U, sigmasq, &X_row.vector, &X_sub.matrix, Y, V_g_null,
                              V_e_null, UltVehiY, beta, Vbeta);
           if (p_score < p_nr && crt == 1) {
-            logl_H1 = MphNR('R', 1, nr_prec * 10, eval, X, Y, Hi_all, xHi_all,
+            logl_H1 = MphNR('R', 1, nr_prec * 10, eval, U, sigmasq, X, Y, Hi_all, xHi_all,
                             Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c);
             p_score = PCRT(3, d_size, p_score, crt_a, crt_b, crt_c);
           }
         }
 
         if (a_mode == 2 || a_mode == 4) {
-          logl_H1 = MphEM('L', em_iter / 10, em_prec * 10, eval, X, Y, U_hat,
+          logl_H1 = MphEM('L', em_iter / 10, em_prec * 10, eval, U, sigmasq, X, Y, U_hat,
                           E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU,
                           UltVehiE, V_g, V_e, B);
 
           // Calculate beta and Vbeta.
-          p_lrt = MphCalcP(eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e,
+          p_lrt = MphCalcP(eval, U, sigmasq, &X_row.vector, &X_sub.matrix, Y, V_g, V_e,
                            UltVehiY, beta, Vbeta);
           p_lrt = gsl_cdf_chisq_Q(2.0 * (logl_H1 - logl_H0), (double)d_size);
 
           if (p_lrt < p_nr) {
             logl_H1 =
-                MphNR('L', nr_iter / 10, nr_prec * 10, eval, X, Y, Hi_all,
+                MphNR('L', nr_iter / 10, nr_prec * 10, eval, U, sigmasq, X, Y, Hi_all,
                       xHi_all, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c);
 
             // Calculate beta and Vbeta.
-            p_lrt = MphCalcP(eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e,
+            p_lrt = MphCalcP(eval, U, sigmasq, &X_row.vector, &X_sub.matrix, Y, V_g, V_e,
                              UltVehiY, beta, Vbeta);
             p_lrt = gsl_cdf_chisq_Q(2.0 * (logl_H1 - logl_H0), (double)d_size);
 
@@ -3324,17 +3718,17 @@ void MVLMM::AnalyzeBimbam(const gsl_matrix *U, const gsl_vector *eval,
         }
 
         if (a_mode == 1 || a_mode == 4) {
-          logl_H1 = MphEM('R', em_iter / 10, em_prec * 10, eval, X, Y, U_hat,
+          logl_H1 = MphEM('R', em_iter / 10, em_prec * 10, eval, U, sigmasq, X, Y, U_hat,
                           E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU,
                           UltVehiE, V_g, V_e, B);
-          p_wald = MphCalcP(eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e,
+          p_wald = MphCalcP(eval, U, sigmasq, &X_row.vector, &X_sub.matrix, Y, V_g, V_e,
                             UltVehiY, beta, Vbeta);
 
           if (p_wald < p_nr) {
             logl_H1 =
-                MphNR('R', nr_iter / 10, nr_prec * 10, eval, X, Y, Hi_all,
+                MphNR('R', nr_iter / 10, nr_prec * 10, eval, U, sigmasq, X, Y, Hi_all,
                       xHi_all, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c);
-            p_wald = MphCalcP(eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e,
+            p_wald = MphCalcP(eval, U, sigmasq, &X_row.vector, &X_sub.matrix, Y, V_g, V_e,
                               UltVehiY, beta, Vbeta);
 
             if (crt == 1) {
@@ -3406,7 +3800,7 @@ void MVLMM::AnalyzeBimbam(const gsl_matrix *U, const gsl_vector *eval,
   return;
 }
 
-void MVLMM::AnalyzePlink(const gsl_matrix *U, const gsl_vector *eval,
+void MVLMM::AnalyzePlink(const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *U, const gsl_matrix *sigmasq,
                          const gsl_matrix *UtW, const gsl_matrix *UtY) {
   debug_msg("entering");
   string file_bed = file_bfile + ".bed";
@@ -3488,18 +3882,18 @@ void MVLMM::AnalyzePlink(const gsl_matrix *U, const gsl_vector *eval,
   gsl_vector_view B_col = gsl_matrix_column(B, c_size);
   gsl_vector_set_zero(&B_col.vector);
 
-  MphInitial(em_iter, em_prec, nr_iter, nr_prec, eval, &X_sub.matrix, Y, l_min,
+  MphInitial(em_iter, em_prec, nr_iter, nr_prec, eval, U, sigmasq, &X_sub.matrix, Y, l_min,
              l_max, n_region, V_g, V_e, &B_sub.matrix);
 
   write(eval,"eval4");
 
-  logl_H0 = MphEM('R', em_iter, em_prec, eval, &X_sub.matrix, Y, U_hat, E_hat,
+  logl_H0 = MphEM('R', em_iter, em_prec, eval, U, sigmasq, &X_sub.matrix, Y, U_hat, E_hat,
                   OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g,
                   V_e, &B_sub.matrix);
-  logl_H0 = MphNR('R', nr_iter, nr_prec, eval, &X_sub.matrix, Y, Hi_all,
+  logl_H0 = MphNR('R', nr_iter, nr_prec, eval, U, sigmasq, &X_sub.matrix, Y, Hi_all,
                   &xHi_all_sub.matrix, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b,
                   crt_c);
-  MphCalcBeta(eval, &X_sub.matrix, Y, V_g, V_e, UltVehiY, &B_sub.matrix,
+  MphCalcBeta(eval, U, sigmasq, &X_sub.matrix, Y, V_g, V_e, UltVehiY, &B_sub.matrix,
               se_B_null);
 
   c = 0;
@@ -3562,13 +3956,13 @@ void MVLMM::AnalyzePlink(const gsl_matrix *U, const gsl_vector *eval,
   }
   cout << "REMLE likelihood = " << logl_H0 << endl;
 
-  logl_H0 = MphEM('L', em_iter, em_prec, eval, &X_sub.matrix, Y, U_hat, E_hat,
+  logl_H0 = MphEM('L', em_iter, em_prec, eval, U, sigmasq, &X_sub.matrix, Y, U_hat, E_hat,
                   OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g,
                   V_e, &B_sub.matrix);
-  logl_H0 = MphNR('L', nr_iter, nr_prec, eval, &X_sub.matrix, Y, Hi_all,
+  logl_H0 = MphNR('L', nr_iter, nr_prec, eval, U, sigmasq, &X_sub.matrix, Y, Hi_all,
                   &xHi_all_sub.matrix, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b,
                   crt_c);
-  MphCalcBeta(eval, &X_sub.matrix, Y, V_g, V_e, UltVehiY, &B_sub.matrix,
+  MphCalcBeta(eval, U, sigmasq, &X_sub.matrix, Y, V_g, V_e, UltVehiY, &B_sub.matrix,
               se_B_null);
 
   c = 0;
@@ -3770,33 +4164,33 @@ void MVLMM::AnalyzePlink(const gsl_matrix *U, const gsl_vector *eval,
 
         // 3 is before 1.
         if (a_mode == 3 || a_mode == 4) {
-          p_score = MphCalcP(eval, &X_row.vector, &X_sub.matrix, Y, V_g_null,
+          p_score = MphCalcP(eval, U, sigmasq, &X_row.vector, &X_sub.matrix, Y, V_g_null,
                              V_e_null, UltVehiY, beta, Vbeta);
 
           if (p_score < p_nr && crt == 1) {
-            logl_H1 = MphNR('R', 1, nr_prec * 10, eval, X, Y, Hi_all, xHi_all,
+            logl_H1 = MphNR('R', 1, nr_prec * 10, eval, U, sigmasq, X, Y, Hi_all, xHi_all,
                             Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c);
             p_score = PCRT(3, d_size, p_score, crt_a, crt_b, crt_c);
           }
         }
 
         if (a_mode == 2 || a_mode == 4) {
-          logl_H1 = MphEM('L', em_iter / 10, em_prec * 10, eval, X, Y, U_hat,
+          logl_H1 = MphEM('L', em_iter / 10, em_prec * 10, eval, U, sigmasq, X, Y, U_hat,
                           E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU,
                           UltVehiE, V_g, V_e, B);
 
           // Calculate beta and Vbeta.
-          p_lrt = MphCalcP(eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e,
+          p_lrt = MphCalcP(eval, U, sigmasq, &X_row.vector, &X_sub.matrix, Y, V_g, V_e,
                            UltVehiY, beta, Vbeta);
           p_lrt = gsl_cdf_chisq_Q(2.0 * (logl_H1 - logl_H0), (double)d_size);
 
           if (p_lrt < p_nr) {
             logl_H1 =
-                MphNR('L', nr_iter / 10, nr_prec * 10, eval, X, Y, Hi_all,
+                MphNR('L', nr_iter / 10, nr_prec * 10, eval, U, sigmasq, X, Y, Hi_all,
                       xHi_all, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c);
 
             // Calculate beta and Vbeta.
-            p_lrt = MphCalcP(eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e,
+            p_lrt = MphCalcP(eval, U, sigmasq, &X_row.vector, &X_sub.matrix, Y, V_g, V_e,
                              UltVehiY, beta, Vbeta);
             p_lrt = gsl_cdf_chisq_Q(2.0 * (logl_H1 - logl_H0), (double)d_size);
             if (crt == 1) {
@@ -3806,17 +4200,17 @@ void MVLMM::AnalyzePlink(const gsl_matrix *U, const gsl_vector *eval,
         }
 
         if (a_mode == 1 || a_mode == 4) {
-          logl_H1 = MphEM('R', em_iter / 10, em_prec * 10, eval, X, Y, U_hat,
+          logl_H1 = MphEM('R', em_iter / 10, em_prec * 10, eval, U, sigmasq, X, Y, U_hat,
                           E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU,
                           UltVehiE, V_g, V_e, B);
-          p_wald = MphCalcP(eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e,
+          p_wald = MphCalcP(eval, U, sigmasq, &X_row.vector, &X_sub.matrix, Y, V_g, V_e,
                             UltVehiY, beta, Vbeta);
 
           if (p_wald < p_nr) {
             logl_H1 =
-                MphNR('R', nr_iter / 10, nr_prec * 10, eval, X, Y, Hi_all,
+                MphNR('R', nr_iter / 10, nr_prec * 10, eval, U, sigmasq, X, Y, Hi_all,
                       xHi_all, Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c);
-            p_wald = MphCalcP(eval, &X_row.vector, &X_sub.matrix, Y, V_g, V_e,
+            p_wald = MphCalcP(eval, U, sigmasq, &X_row.vector, &X_sub.matrix, Y, V_g, V_e,
                               UltVehiY, beta, Vbeta);
 
             if (crt == 1) {
@@ -3889,7 +4283,7 @@ void MVLMM::AnalyzePlink(const gsl_matrix *U, const gsl_vector *eval,
 
 // Calculate Vg, Ve, B, se(B) in the null mvLMM model.
 // Both B and se_B are d by c matrices.
-void CalcMvLmmVgVeBeta(const gsl_vector *eval, const gsl_matrix *UtW,
+void CalcMvLmmVgVeBeta(const gsl_vector *eval, const gsl_matrix *U, const gsl_matrix *sigmasq, const gsl_matrix *UtW,
                        const gsl_matrix *UtY, const size_t em_iter,
                        const size_t nr_iter, const double em_prec,
                        const double nr_prec, const double l_min,
@@ -3929,13 +4323,13 @@ void CalcMvLmmVgVeBeta(const gsl_vector *eval, const gsl_matrix *UtW,
   gsl_matrix_transpose_memcpy(W, UtW);
 
   // Initial, EM, NR, and calculate B.
-  MphInitial(em_iter, em_prec, nr_iter, nr_prec, eval, W, Y, l_min, l_max,
+  MphInitial(em_iter, em_prec, nr_iter, nr_prec, eval, U, sigmasq, W, Y, l_min, l_max,
              n_region, V_g, V_e, B);
-  MphEM('R', em_iter, em_prec, eval, W, Y, U_hat, E_hat, OmegaU, OmegaE,
+  MphEM('R', em_iter, em_prec, eval, U, sigmasq, W, Y, U_hat, E_hat, OmegaU, OmegaE,
         UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g, V_e, B);
-  MphNR('R', nr_iter, nr_prec, eval, W, Y, Hi_all, xHi_all, Hiy_all, V_g,
+  MphNR('R', nr_iter, nr_prec, eval, U, sigmasq, W, Y, Hi_all, xHi_all, Hiy_all, V_g,
         V_e, Hessian, crt_a, crt_b, crt_c);
-  MphCalcBeta(eval, W, Y, V_g, V_e, UltVehiY, B, se_B);
+  MphCalcBeta(eval, U, sigmasq, W, Y, V_g, V_e, UltVehiY, B, se_B);
 
   // Free matrices.
   gsl_matrix_free(U_hat);
@@ -3958,7 +4352,7 @@ void CalcMvLmmVgVeBeta(const gsl_vector *eval, const gsl_matrix *UtW,
   return;
 }
 
-void MVLMM::AnalyzeBimbamGXE(const gsl_matrix *U, const gsl_vector *eval,
+void MVLMM::AnalyzeBimbamGXE(const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *U, const gsl_matrix *sigmasq,
                              const gsl_matrix *UtW, const gsl_matrix *UtY,
                              const gsl_vector *env) {
   debug_msg("entering");
@@ -4050,15 +4444,15 @@ void MVLMM::AnalyzeBimbamGXE(const gsl_matrix *U, const gsl_vector *eval,
   gsl_vector_view B_col2 = gsl_matrix_column(B, c_size);
   gsl_vector_set_zero(&B_col2.vector);
 
-  MphInitial(em_iter, em_prec, nr_iter, nr_prec, eval, &X_sub1.matrix, Y, l_min,
+  MphInitial(em_iter, em_prec, nr_iter, nr_prec, eval, U, sigmasq, &X_sub1.matrix, Y, l_min,
              l_max, n_region, V_g, V_e, &B_sub1.matrix);
-  logl_H0 = MphEM('R', em_iter, em_prec, eval, &X_sub1.matrix, Y, U_hat, E_hat,
+  logl_H0 = MphEM('R', em_iter, em_prec, eval, U, sigmasq, &X_sub1.matrix, Y, U_hat, E_hat,
                   OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g,
                   V_e, &B_sub1.matrix);
-  logl_H0 = MphNR('R', nr_iter, nr_prec, eval, &X_sub1.matrix, Y, Hi_all,
+  logl_H0 = MphNR('R', nr_iter, nr_prec, eval, U, sigmasq, &X_sub1.matrix, Y, Hi_all,
                   &xHi_all_sub1.matrix, Hiy_all, V_g, V_e, Hessian, crt_a,
                   crt_b, crt_c);
-  MphCalcBeta(eval, &X_sub1.matrix, Y, V_g, V_e, UltVehiY, &B_sub1.matrix,
+  MphCalcBeta(eval, U, sigmasq, &X_sub1.matrix, Y, V_g, V_e, UltVehiY, &B_sub1.matrix,
               se_B_null1);
 
   c = 0;
@@ -4118,13 +4512,13 @@ void MVLMM::AnalyzeBimbamGXE(const gsl_matrix *U, const gsl_vector *eval,
   }
   cout << "REMLE likelihood = " << logl_H0 << endl;
 
-  logl_H0 = MphEM('L', em_iter, em_prec, eval, &X_sub1.matrix, Y, U_hat, E_hat,
+  logl_H0 = MphEM('L', em_iter, em_prec, eval, U, sigmasq, &X_sub1.matrix, Y, U_hat, E_hat,
                   OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g,
                   V_e, &B_sub1.matrix);
-  logl_H0 = MphNR('L', nr_iter, nr_prec, eval, &X_sub1.matrix, Y, Hi_all,
+  logl_H0 = MphNR('L', nr_iter, nr_prec, eval, U, sigmasq, &X_sub1.matrix, Y, Hi_all,
                   &xHi_all_sub1.matrix, Hiy_all, V_g, V_e, Hessian, crt_a,
                   crt_b, crt_c);
-  MphCalcBeta(eval, &X_sub1.matrix, Y, V_g, V_e, UltVehiY, &B_sub1.matrix,
+  MphCalcBeta(eval, U, sigmasq, &X_sub1.matrix, Y, V_g, V_e, UltVehiY, &B_sub1.matrix,
               se_B_null1);
 
   c = 0;
@@ -4260,24 +4654,24 @@ void MVLMM::AnalyzeBimbamGXE(const gsl_matrix *U, const gsl_vector *eval,
 
     if (a_mode == 2 || a_mode == 3 || a_mode == 4) {
       if (a_mode == 3 || a_mode == 4) {
-        logl_H0 = MphEM('R', em_iter / 10, em_prec * 10, eval, &X_sub2.matrix,
+        logl_H0 = MphEM('R', em_iter / 10, em_prec * 10, eval, U, sigmasq, &X_sub2.matrix,
                         Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX,
                         UltVehiU, UltVehiE, V_g, V_e, &B_sub2.matrix);
-        logl_H0 = MphNR('R', nr_iter / 10, nr_prec * 10, eval, &X_sub2.matrix,
+        logl_H0 = MphNR('R', nr_iter / 10, nr_prec * 10, eval, U, sigmasq, &X_sub2.matrix,
                         Y, Hi_all, &xHi_all_sub2.matrix, Hiy_all, V_g, V_e,
                         Hessian, crt_a, crt_b, crt_c);
-        MphCalcBeta(eval, &X_sub2.matrix, Y, V_g, V_e, UltVehiY, &B_sub2.matrix,
+        MphCalcBeta(eval, U, sigmasq, &X_sub2.matrix, Y, V_g, V_e, UltVehiY, &B_sub2.matrix,
                     se_B_null2);
       }
 
       if (a_mode == 2 || a_mode == 4) {
-        logl_H0 = MphEM('L', em_iter / 10, em_prec * 10, eval, &X_sub2.matrix,
+        logl_H0 = MphEM('L', em_iter / 10, em_prec * 10, eval, U, sigmasq, &X_sub2.matrix,
                         Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX,
                         UltVehiU, UltVehiE, V_g, V_e, &B_sub2.matrix);
-        logl_H0 = MphNR('L', nr_iter / 10, nr_prec * 10, eval, &X_sub2.matrix,
+        logl_H0 = MphNR('L', nr_iter / 10, nr_prec * 10, eval, U, sigmasq, &X_sub2.matrix,
                         Y, Hi_all, &xHi_all_sub2.matrix, Hiy_all, V_g, V_e,
                         Hessian, crt_a, crt_b, crt_c);
-        MphCalcBeta(eval, &X_sub2.matrix, Y, V_g, V_e, UltVehiY, &B_sub2.matrix,
+        MphCalcBeta(eval, U, sigmasq, &X_sub2.matrix, Y, V_g, V_e, UltVehiY, &B_sub2.matrix,
                     se_B_null2);
       }
     }
@@ -4286,32 +4680,32 @@ void MVLMM::AnalyzeBimbamGXE(const gsl_matrix *U, const gsl_vector *eval,
 
     // 3 is before 1.
     if (a_mode == 3 || a_mode == 4) {
-      p_score = MphCalcP(eval, &X_row2.vector, &X_sub2.matrix, Y, V_g_null,
+      p_score = MphCalcP(eval, U, sigmasq, &X_row2.vector, &X_sub2.matrix, Y, V_g_null,
                          V_e_null, UltVehiY, beta, Vbeta);
       if (p_score < p_nr && crt == 1) {
-        logl_H1 = MphNR('R', 1, nr_prec * 10, eval, X, Y, Hi_all, xHi_all,
+        logl_H1 = MphNR('R', 1, nr_prec * 10, eval, U, sigmasq, X, Y, Hi_all, xHi_all,
                         Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c);
         p_score = PCRT(3, d_size, p_score, crt_a, crt_b, crt_c);
       }
     }
 
     if (a_mode == 2 || a_mode == 4) {
-      logl_H1 = MphEM('L', em_iter / 10, em_prec * 10, eval, X, Y, U_hat, E_hat,
+      logl_H1 = MphEM('L', em_iter / 10, em_prec * 10, eval, U, sigmasq, X, Y, U_hat, E_hat,
                       OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE,
                       V_g, V_e, B);
 
       // Calculate beta and Vbeta.
-      p_lrt = MphCalcP(eval, &X_row2.vector, &X_sub2.matrix, Y, V_g, V_e,
+      p_lrt = MphCalcP(eval, U, sigmasq, &X_row2.vector, &X_sub2.matrix, Y, V_g, V_e,
                        UltVehiY, beta, Vbeta);
       p_lrt = gsl_cdf_chisq_Q(2.0 * (logl_H1 - logl_H0), (double)d_size);
 
       if (p_lrt < p_nr) {
         logl_H1 =
-            MphNR('L', nr_iter / 10, nr_prec * 10, eval, X, Y, Hi_all, xHi_all,
+            MphNR('L', nr_iter / 10, nr_prec * 10, eval, U, sigmasq, X, Y, Hi_all, xHi_all,
                   Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c);
 
         // Calculate beta and Vbeta.
-        p_lrt = MphCalcP(eval, &X_row2.vector, &X_sub2.matrix, Y, V_g, V_e,
+        p_lrt = MphCalcP(eval, U, sigmasq, &X_row2.vector, &X_sub2.matrix, Y, V_g, V_e,
                          UltVehiY, beta, Vbeta);
         p_lrt = gsl_cdf_chisq_Q(2.0 * (logl_H1 - logl_H0), (double)d_size);
 
@@ -4322,17 +4716,17 @@ void MVLMM::AnalyzeBimbamGXE(const gsl_matrix *U, const gsl_vector *eval,
     }
 
     if (a_mode == 1 || a_mode == 4) {
-      logl_H1 = MphEM('R', em_iter / 10, em_prec * 10, eval, X, Y, U_hat, E_hat,
+      logl_H1 = MphEM('R', em_iter / 10, em_prec * 10, eval, U, sigmasq, X, Y, U_hat, E_hat,
                       OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE,
                       V_g, V_e, B);
-      p_wald = MphCalcP(eval, &X_row2.vector, &X_sub2.matrix, Y, V_g, V_e,
+      p_wald = MphCalcP(eval, U, sigmasq, &X_row2.vector, &X_sub2.matrix, Y, V_g, V_e,
                         UltVehiY, beta, Vbeta);
 
       if (p_wald < p_nr) {
         logl_H1 =
-            MphNR('R', nr_iter / 10, nr_prec * 10, eval, X, Y, Hi_all, xHi_all,
+            MphNR('R', nr_iter / 10, nr_prec * 10, eval, U, sigmasq, X, Y, Hi_all, xHi_all,
                   Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c);
-        p_wald = MphCalcP(eval, &X_row2.vector, &X_sub2.matrix, Y, V_g, V_e,
+        p_wald = MphCalcP(eval, U, sigmasq, &X_row2.vector, &X_sub2.matrix, Y, V_g, V_e,
                           UltVehiY, beta, Vbeta);
 
         if (crt == 1) {
@@ -4404,7 +4798,7 @@ void MVLMM::AnalyzeBimbamGXE(const gsl_matrix *U, const gsl_vector *eval,
   return;
 }
 
-void MVLMM::AnalyzePlinkGXE(const gsl_matrix *U, const gsl_vector *eval,
+void MVLMM::AnalyzePlinkGXE(const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *U, const gsl_matrix *sigmasq,
                             const gsl_matrix *UtW, const gsl_matrix *UtY,
                             const gsl_vector *env) {
   debug_msg("entering");
@@ -4495,16 +4889,16 @@ void MVLMM::AnalyzePlinkGXE(const gsl_matrix *U, const gsl_vector *eval,
   gsl_vector_view B_col2 = gsl_matrix_column(B, c_size);
   gsl_vector_set_zero(&B_col2.vector);
 
-  MphInitial(em_iter, em_prec, nr_iter, nr_prec, eval, &X_sub1.matrix, Y, l_min,
+  MphInitial(em_iter, em_prec, nr_iter, nr_prec, eval, U, sigmasq, &X_sub1.matrix, Y, l_min,
              l_max, n_region, V_g, V_e, &B_sub1.matrix);
 
-  logl_H0 = MphEM('R', em_iter, em_prec, eval, &X_sub1.matrix, Y, U_hat, E_hat,
+  logl_H0 = MphEM('R', em_iter, em_prec, eval, U, sigmasq, &X_sub1.matrix, Y, U_hat, E_hat,
                   OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g,
                   V_e, &B_sub1.matrix);
-  logl_H0 = MphNR('R', nr_iter, nr_prec, eval, &X_sub1.matrix, Y, Hi_all,
+  logl_H0 = MphNR('R', nr_iter, nr_prec, eval, U, sigmasq, &X_sub1.matrix, Y, Hi_all,
                   &xHi_all_sub1.matrix, Hiy_all, V_g, V_e, Hessian, crt_a,
                   crt_b, crt_c);
-  MphCalcBeta(eval, &X_sub1.matrix, Y, V_g, V_e, UltVehiY, &B_sub1.matrix,
+  MphCalcBeta(eval, U, sigmasq, &X_sub1.matrix, Y, V_g, V_e, UltVehiY, &B_sub1.matrix,
               se_B_null1);
 
   c = 0;
@@ -4563,13 +4957,13 @@ void MVLMM::AnalyzePlinkGXE(const gsl_matrix *U, const gsl_vector *eval,
   }
   cout << "REMLE likelihood = " << logl_H0 << endl;
 
-  logl_H0 = MphEM('L', em_iter, em_prec, eval, &X_sub1.matrix, Y, U_hat, E_hat,
+  logl_H0 = MphEM('L', em_iter, em_prec, eval, U, sigmasq, &X_sub1.matrix, Y, U_hat, E_hat,
                   OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE, V_g,
                   V_e, &B_sub1.matrix);
-  logl_H0 = MphNR('L', nr_iter, nr_prec, eval, &X_sub1.matrix, Y, Hi_all,
+  logl_H0 = MphNR('L', nr_iter, nr_prec, eval, U, sigmasq, &X_sub1.matrix, Y, Hi_all,
                   &xHi_all_sub1.matrix, Hiy_all, V_g, V_e, Hessian, crt_a,
                   crt_b, crt_c);
-  MphCalcBeta(eval, &X_sub1.matrix, Y, V_g, V_e, UltVehiY, &B_sub1.matrix,
+  MphCalcBeta(eval, U, sigmasq, &X_sub1.matrix, Y, V_g, V_e, UltVehiY, &B_sub1.matrix,
               se_B_null1);
 
   c = 0;
@@ -4736,24 +5130,24 @@ void MVLMM::AnalyzePlinkGXE(const gsl_matrix *U, const gsl_vector *eval,
 
     if (a_mode == 2 || a_mode == 3 || a_mode == 4) {
       if (a_mode == 3 || a_mode == 4) {
-        logl_H0 = MphEM('R', em_iter / 10, em_prec * 10, eval, &X_sub2.matrix,
+        logl_H0 = MphEM('R', em_iter / 10, em_prec * 10, eval, U, sigmasq, &X_sub2.matrix,
                         Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX,
                         UltVehiU, UltVehiE, V_g, V_e, &B_sub2.matrix);
-        logl_H0 = MphNR('R', nr_iter / 10, nr_prec * 10, eval, &X_sub2.matrix,
+        logl_H0 = MphNR('R', nr_iter / 10, nr_prec * 10, eval, U, sigmasq, &X_sub2.matrix,
                         Y, Hi_all, &xHi_all_sub2.matrix, Hiy_all, V_g, V_e,
                         Hessian, crt_a, crt_b, crt_c);
-        MphCalcBeta(eval, &X_sub2.matrix, Y, V_g, V_e, UltVehiY, &B_sub2.matrix,
+        MphCalcBeta(eval, U, sigmasq, &X_sub2.matrix, Y, V_g, V_e, UltVehiY, &B_sub2.matrix,
                     se_B_null2);
       }
 
       if (a_mode == 2 || a_mode == 4) {
-        logl_H0 = MphEM('L', em_iter / 10, em_prec * 10, eval, &X_sub2.matrix,
+        logl_H0 = MphEM('L', em_iter / 10, em_prec * 10, eval, U, sigmasq, &X_sub2.matrix,
                         Y, U_hat, E_hat, OmegaU, OmegaE, UltVehiY, UltVehiBX,
                         UltVehiU, UltVehiE, V_g, V_e, &B_sub2.matrix);
-        logl_H0 = MphNR('L', nr_iter / 10, nr_prec * 10, eval, &X_sub2.matrix,
+        logl_H0 = MphNR('L', nr_iter / 10, nr_prec * 10, eval, U, sigmasq, &X_sub2.matrix,
                         Y, Hi_all, &xHi_all_sub2.matrix, Hiy_all, V_g, V_e,
                         Hessian, crt_a, crt_b, crt_c);
-        MphCalcBeta(eval, &X_sub2.matrix, Y, V_g, V_e, UltVehiY, &B_sub2.matrix,
+        MphCalcBeta(eval, U, sigmasq, &X_sub2.matrix, Y, V_g, V_e, UltVehiY, &B_sub2.matrix,
                     se_B_null2);
       }
     }
@@ -4762,33 +5156,33 @@ void MVLMM::AnalyzePlinkGXE(const gsl_matrix *U, const gsl_vector *eval,
 
     // 3 is before 1.
     if (a_mode == 3 || a_mode == 4) {
-      p_score = MphCalcP(eval, &X_row2.vector, &X_sub2.matrix, Y, V_g_null,
+      p_score = MphCalcP(eval, U, sigmasq, &X_row2.vector, &X_sub2.matrix, Y, V_g_null,
                          V_e_null, UltVehiY, beta, Vbeta);
 
       if (p_score < p_nr && crt == 1) {
-        logl_H1 = MphNR('R', 1, nr_prec * 10, eval, X, Y, Hi_all, xHi_all,
+        logl_H1 = MphNR('R', 1, nr_prec * 10, eval, U, sigmasq, X, Y, Hi_all, xHi_all,
                         Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c);
         p_score = PCRT(3, d_size, p_score, crt_a, crt_b, crt_c);
       }
     }
 
     if (a_mode == 2 || a_mode == 4) {
-      logl_H1 = MphEM('L', em_iter / 10, em_prec * 10, eval, X, Y, U_hat, E_hat,
+      logl_H1 = MphEM('L', em_iter / 10, em_prec * 10, eval, U, sigmasq, X, Y, U_hat, E_hat,
                       OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE,
                       V_g, V_e, B);
 
       // Calculate beta and Vbeta.
-      p_lrt = MphCalcP(eval, &X_row2.vector, &X_sub2.matrix, Y, V_g, V_e,
+      p_lrt = MphCalcP(eval, U, sigmasq, &X_row2.vector, &X_sub2.matrix, Y, V_g, V_e,
                        UltVehiY, beta, Vbeta);
       p_lrt = gsl_cdf_chisq_Q(2.0 * (logl_H1 - logl_H0), (double)d_size);
 
       if (p_lrt < p_nr) {
         logl_H1 =
-            MphNR('L', nr_iter / 10, nr_prec * 10, eval, X, Y, Hi_all, xHi_all,
+            MphNR('L', nr_iter / 10, nr_prec * 10, eval, U, sigmasq, X, Y, Hi_all, xHi_all,
                   Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c);
 
         // Calculate beta and Vbeta.
-        p_lrt = MphCalcP(eval, &X_row2.vector, &X_sub2.matrix, Y, V_g, V_e,
+        p_lrt = MphCalcP(eval, U, sigmasq, &X_row2.vector, &X_sub2.matrix, Y, V_g, V_e,
                          UltVehiY, beta, Vbeta);
         p_lrt = gsl_cdf_chisq_Q(2.0 * (logl_H1 - logl_H0), (double)d_size);
         if (crt == 1) {
@@ -4798,17 +5192,17 @@ void MVLMM::AnalyzePlinkGXE(const gsl_matrix *U, const gsl_vector *eval,
     }
 
     if (a_mode == 1 || a_mode == 4) {
-      logl_H1 = MphEM('R', em_iter / 10, em_prec * 10, eval, X, Y, U_hat, E_hat,
+      logl_H1 = MphEM('R', em_iter / 10, em_prec * 10, eval, U, sigmasq, X, Y, U_hat, E_hat,
                       OmegaU, OmegaE, UltVehiY, UltVehiBX, UltVehiU, UltVehiE,
                       V_g, V_e, B);
-      p_wald = MphCalcP(eval, &X_row2.vector, &X_sub2.matrix, Y, V_g, V_e,
+      p_wald = MphCalcP(eval, U, sigmasq, &X_row2.vector, &X_sub2.matrix, Y, V_g, V_e,
                         UltVehiY, beta, Vbeta);
 
       if (p_wald < p_nr) {
         logl_H1 =
-            MphNR('R', nr_iter / 10, nr_prec * 10, eval, X, Y, Hi_all, xHi_all,
+            MphNR('R', nr_iter / 10, nr_prec * 10, eval, U, sigmasq, X, Y, Hi_all, xHi_all,
                   Hiy_all, V_g, V_e, Hessian, crt_a, crt_b, crt_c);
-        p_wald = MphCalcP(eval, &X_row2.vector, &X_sub2.matrix, Y, V_g, V_e,
+        p_wald = MphCalcP(eval, U, sigmasq, &X_row2.vector, &X_sub2.matrix, Y, V_g, V_e,
                           UltVehiY, beta, Vbeta);
 
         if (crt == 1) {
diff --git a/src/mvlmm.h b/src/mvlmm.h
index b92bd5e..7fa6d94 100644
--- a/src/mvlmm.h
+++ b/src/mvlmm.h
@@ -35,6 +35,7 @@ class MVLMM {
 
   string file_bfile;
   string file_geno;
+  string file_resid;
   string file_oxford;
   string file_out;
   string path_out;
@@ -58,6 +59,7 @@ class MVLMM {
   size_t ni_total, ni_test; // Number of individuals.
   size_t ns_total, ns_test; // Number of SNPs.
   size_t n_cvt;
+  size_t n_resid;
   size_t n_ph;
   double time_UtX; // Time spent on optimization iterations.
   double time_opt; // Time spent on optimization iterations.
@@ -93,7 +95,7 @@ class MVLMM {
   void WriteFiles();
 };
 
-void CalcMvLmmVgVeBeta(const gsl_vector *eval, const gsl_matrix *UtW,
+void CalcMvLmmVgVeBeta(const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW,
                        const gsl_matrix *UtY, const size_t em_iter,
                        const size_t nr_iter, const double em_prec,
                        const double nr_prec, const double l_min,
diff --git a/src/param.cpp b/src/param.cpp
index f96e9c3..683fc42 100644
--- a/src/param.cpp
+++ b/src/param.cpp
@@ -65,6 +65,22 @@ void LOCO_set_Snps(set<string> &ksnps, set<string> &gwasnps,
   }
 }
 
+// This code defines a function named LOCO_set_Snps that takes four arguments: 
+// two set<string> references (ksnps and gwasnps), a constant map<string, string> reference (mapchr), and a constant string (loco).
+
+// The function first checks that both ksnps and gwasnps are empty sets to ensure they are not initialized twice.
+
+// Next, the function loops through each key-value pair (kv) in the mapchr map using a range-based for loop. 
+//For each key-value pair, the function extracts the key (snp) and value (chr) using auto.
+
+// Then, the function checks if chr is equal to loco. If it is, then the snp is added to 
+// gwasnps using the insert() method of the set container. If chr is not equal to loco, then snp is added to ksnps.
+
+// Essentially, the function divides the set of SNPs (single nucleotide polymorphisms) into two groups: 
+// the SNPs that are associated with the trait of interest (gwasnps) and the SNPs that are not associated with the trait of interest (ksnps). 
+// The loco variable specifies the chromosome on which the trait of interest is located.
+
+
 // Trim #individuals to size which is used to write tests that run faster
 //
 // Note it actually trims the number of functional individuals
@@ -233,6 +249,13 @@ void PARAM::ReadFiles(void) {
   }
   trim_individuals(indicator_cvt, ni_max);
 
+    // Read residual variance files before the genotype files.
+  if (!file_resid.empty()) {
+    if (ReadFile_column(file_resid, indicator_resid, resid, 1) == false) {
+      error = true;
+    }
+  }
+
   if (!file_gxe.empty()) {
     if (ReadFile_column(file_gxe, indicator_gxe, gxe, 1) == false) {
       error = true;
@@ -244,6 +267,7 @@ void PARAM::ReadFiles(void) {
     }
   }
 
+
   trim_individuals(indicator_idv, ni_max);
 
   // Read genotype and phenotype file for PLINK format.
@@ -939,6 +963,7 @@ void PARAM::CheckParam(void) {
   enforce_fexists(file_weight, "open file");
   enforce_fexists(file_epm, "open file");
   enforce_fexists(file_ebv, "open file");
+  enforce_fexists(file_resid, "open file");
   enforce_fexists(file_read, "open file");
 
   // Check if files are compatible with analysis mode.
@@ -1020,6 +1045,14 @@ void PARAM::CheckData(void) {
     return;
   }
 
+  if ((indicator_resid).size() != 0 &&
+      (indicator_resid).size() != (indicator_idv).size()) {
+    error = true;
+    cout << "error! number of rows in the residual variance file do not match "
+         << "the number of individuals. " << endl;
+    return;
+  }
+
   if ((indicator_read).size() != 0 &&
       (indicator_read).size() != (indicator_idv).size()) {
     error = true;
@@ -1069,6 +1102,12 @@ void PARAM::CheckData(void) {
       }
     }
 
+    if (indicator_resid.size() != 0) {
+      if (indicator_resid[i] == 0) {
+        continue;
+      }
+    }
+
     for (size_t j = 0; j < indicator_pheno[i].size(); j++) {
       if (indicator_pheno[i][j] == 0) {
         np_miss++;
@@ -1114,6 +1153,7 @@ void PARAM::CheckData(void) {
       cout << "## number of analyzed individuals = " << ni_test << endl;
     }
     cout << "## number of covariates = " << n_cvt << endl;
+    cout << "## number of residual variances" << n_resid << endl;
     cout << "## number of phenotypes = " << n_ph << endl;
     if (a_mode == 43) {
       cout << "## number of observed data = " << np_obs << endl;
@@ -1934,6 +1974,52 @@ void PARAM::WriteVector(const gsl_vector *vector_D, const string suffix) {
   return;
 }
 
+void PARAM::Checkresid() {
+  if (indicator_resid.size() == 0) {
+    // No residual variance is specified, fill sigmasq with Ve_remle_null along the diagonal
+    gsl_matrix *sigmasq = gsl_matrix_alloc(n_resid, n_resid);
+
+    for (size_t i = 0; i < n_resid; ++i) {
+      gsl_matrix_set(sigmasq, i, i, Ve_remle_null[i]);
+    }
+
+    info_msg("No residual variances were specified: Ve_remle_null is placed along the diagonal.");
+    gsl_matrix_free(sigmasq);
+    return;
+  }
+
+  size_t ci_test = 0;
+  gsl_matrix *sigmasq = gsl_matrix_alloc(n_resid, n_resid);
+
+  // If residual variance is specified, use resid matrix
+  for (size_t i = 0; i < n_resid; ++i) {
+    gsl_matrix_set(sigmasq, i, i, resid[i][i]);
+  }
+
+  size_t flag_ipt = 0;
+  double v_min, v_max;
+  std::set<size_t> set_remove;
+
+  // If no residual variance is found, assign Ve_remle_null
+  if (n_resid == set_remove.size()) {
+    indicator_resid.clear();
+    n_resid = 1;
+  } else if (flag_ipt == 0) {
+    info_msg("No residual variances found in the resid file: a diagonal of Ve_remle_null is added.");
+    for (std::vector<int>::size_type i = 0; i < indicator_idv.size(); ++i) {
+      if (indicator_idv[i] == 0 || indicator_resid[i] == 0) {
+        continue;
+      }
+      gsl_matrix_set(resid, i, i, Ve_remle_null[i]);
+    }
+
+    n_resid++;
+  }
+
+  gsl_matrix_free(sigmasq);
+  return;
+}
+
 void PARAM::CheckCvt() {
   if (indicator_cvt.size() == 0) {
     return;
@@ -2026,6 +2112,13 @@ void PARAM::ProcessCvtPhen() {
     }
   }
 
+    // Remove individuals with missing residual variance.
+  if ((indicator_resid).size() != 0) {
+    for (vector<int>::size_type i = 0; i < (indicator_idv).size(); ++i) {
+      indicator_idv[i] *= indicator_resid[i];
+    }
+  }
+
   // Obtain ni_test.
   ni_test = 0;
   for (vector<int>::size_type i = 0; i < (indicator_idv).size(); ++i) {
@@ -2141,6 +2234,25 @@ void PARAM::CopyWeight(gsl_vector *w) {
   return;
 }
 
+void PARAM::CopyResid(gsl_matrix *sigmasq) {
+  size_t ci_test = 0;
+
+  // Loop through the individuals and residual variance indicators
+  for (std::vector<int>::size_type i = 0; i < indicator_idv.size(); ++i) {
+    if (indicator_idv[i] == 0 || indicator_resid[i] == 0) {
+      continue;  // Skip if either the individual or residual variance is not active
+    }
+    
+    // Set the diagonal element in sigmasq from resid
+    gsl_matrix_set(sigmasq, ci_test, ci_test, gsl_matrix_get(resid, i, i));
+
+    // Increment the index for the next valid individual/residual variance
+    ci_test++;
+  }
+
+  return;
+}
+
 // If flag=0, then use indicator_idv to load W and Y;
 // else, use indicator_cvt to load them.
 void PARAM::CopyCvtPhen(gsl_matrix *W, gsl_vector *y, size_t flag) {
diff --git a/src/param.h b/src/param.h
index e747182..89c7c81 100644
--- a/src/param.h
+++ b/src/param.h
@@ -136,6 +136,7 @@ class PARAM {
   string file_anno; // Optional.
   string file_gxe;  // Optional.
   string file_cvt;  // Optional.
+  string file_resid; // Optional
   string file_cat, file_mcat;
   string file_catc, file_mcatc;
   string file_var;
@@ -218,6 +219,7 @@ class PARAM {
 
   HYPBSLMM cHyp_initial;
 
+
   // VARCOV-related parameters.
   double window_cm;
   size_t window_bp;
@@ -243,6 +245,7 @@ class PARAM {
   size_t ni_control, ni_case; // Number of controls and number of cases.
   size_t ni_subsample;        // Number of subsampled individuals.
   size_t n_cvt;               // Number of covariates.
+  size_t n_resid;          // Number of residual variances.
   size_t n_cat;               // Number of continuous categories.
   size_t n_ph;                // Number of phenotypes.
   size_t n_vc;                // Number of variance components
@@ -267,6 +270,9 @@ class PARAM {
   // Vector recording all covariates (NA replaced with -9).
   vector<vector<double>> cvt;
 
+// Vector recording all residual variances (NA replaced with -9).
+  vector<double> resid;
+
   // Vector recording all covariates (NA replaced with -9).
   vector<double> gxe;
 
@@ -294,6 +300,10 @@ class PARAM {
   // analysis.
   vector<int> indicator_cvt;
 
+  // Indicator for residuals: 0 missing, 1 available for
+  // analysis.
+  vector<int> indicator_resid;
+
   // Indicator for gxe: 0 missing, 1 available for analysis.
   vector<int> indicator_gxe;
 
@@ -340,6 +350,7 @@ class PARAM {
                      const bool calc_K);
   void CheckCvt();
   void CopyCvt(gsl_matrix *W);
+  void CopyResid(gsl_matrix *sigmasq);
   void CopyA(size_t flag, gsl_matrix *A);
   void CopyGxe(gsl_vector *gxe);
   void CopyWeight(gsl_vector *w);