Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Modified files for strain-specific residual variance option #269

Open
wants to merge 32 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
7c70ec5
Modified files for strain-specific residual variance option
Mar 20, 2023
ca93fb2
Fixed 'cvt' variable and missing 0 < geno < 2 checks
Mar 21, 2023
763a0b5
Fixed errors except for MphInitial errors
Apr 6, 2023
18e15e3
Fixed parameter names, attempted to integrate residual variance as an…
Jul 18, 2023
72f5a24
More bug fixes
Jul 28, 2023
f76326b
Added PARAM 'CheckResidvar' so that if no vector is supplied, it is r…
Jul 28, 2023
58e370f
Update gemma.cpp
jb621-star Oct 7, 2024
924daa5
Update mvlmm.cpp
jb621-star Oct 7, 2024
db72327
Update mvlmm.cpp
jb621-star Oct 7, 2024
363ec00
Update mvlmm.cpp
jb621-star Oct 8, 2024
c40699b
Update mvlmm.cpp
jb621-star Oct 8, 2024
96d5f17
Update gemma.cpp
jb621-star Oct 8, 2024
112d499
Update gemma.cpp
jb621-star Oct 8, 2024
6f1d2cd
Update gemma_io.cpp
jb621-star Oct 8, 2024
057d27a
Update gemma_io.h
jb621-star Oct 8, 2024
a8352e8
Update lmm.cpp
jb621-star Oct 8, 2024
f858b30
Update mvlmm.h
jb621-star Oct 8, 2024
587c181
Update param.cpp
jb621-star Oct 8, 2024
e85449d
Update param.h
jb621-star Oct 8, 2024
ad54957
Update param.h
jb621-star Oct 8, 2024
b0da2e4
Update param.h
jb621-star Oct 22, 2024
9580e68
Update gemma.cpp
jb621-star Oct 22, 2024
1b9c986
Update gemma_io.cpp
jb621-star Oct 22, 2024
e193600
Update gemma_io.h
jb621-star Oct 22, 2024
8409e49
Update lmm.cpp
jb621-star Oct 22, 2024
06b5f62
Update param.h
jb621-star Oct 22, 2024
be69a55
Update param.cpp
jb621-star Oct 22, 2024
2b25992
Update gemma.cpp
jb621-star Oct 22, 2024
e1d92d0
Update mvlmm.h
jb621-star Oct 22, 2024
1caa0d1
Update gemma_io.cpp
jb621-star Oct 28, 2024
eb1e113
Update lmm.cpp
jb621-star Oct 28, 2024
c18db6d
Update mvlmm.h
jb621-star Oct 28, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 39 additions & 18 deletions src/gemma.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -337,6 +337,17 @@ void GEMMA::PrintHelp(size_t option) {
cout << " variable for individual 2" << endl;
cout << " ..." << endl;
cout << " missing value: NA" << endl;
cout << " -resid [filename] "
<< " residual variance file contains a column of positive values to be used "
<< "directly as for the residual variance---each value corresponds to an "
<< "individual, in which each value is the empirical residual variance (sigmasq) based"
<< "on a trait value calculated from multiple replicates for this individual"
<< "(similar in format to phenotype file)"
<< endl;
cout << " format: variable for individual 1" << endl;
cout << " variable for individual 2" << endl;
cout << " ..." << endl;
cout << " missing value: NA" << endl;
cout << " -k [filename] "
<< " specify input kinship/relatedness matrix file name" << endl;
cout << " -mk [filename] "
Expand Down Expand Up @@ -824,6 +835,14 @@ void GEMMA::Assign(int argc, char **argv, PARAM &cPar) {
str.clear();
str.assign(argv[i]);
cPar.file_weight = str;
} else if (strcmp(argv[i], "-resid") == 0) {
if (argv[i + 1] == NULL || argv[i + 1][0] == '-') {
continue;
}
++i;
str.clear();
str.assign(argv[i]);
cPar.file_resid = str;
jb621-star marked this conversation as resolved.
Show resolved Hide resolved
} else if (strcmp(argv[i], "-wsnp") == 0) {
if (argv[i + 1] == NULL || argv[i + 1][0] == '-') {
continue;
Expand Down Expand Up @@ -1791,9 +1810,9 @@ void GEMMA::BatchRun(PARAM &cPar) {
gsl_vector_view UtY_col = gsl_matrix_column(UtY, 0);

// obtain estimates
CalcLambda('R', eval, UtW, &UtY_col.vector, cPar.l_min, cPar.l_max,
CalcLambda('R', U, eval, sigmasq, UtW, &UtY_col.vector, cPar.l_min, cPar.l_max,
cPar.n_region, lambda, logl);
CalcLmmVgVeBeta(eval, UtW, &UtY_col.vector, lambda, vg, ve, beta,
CalcLmmVgVeBeta(U, eval, sigmasq, UtW, &UtY_col.vector, lambda, vg, ve, beta,
se_beta);

cout << "REMLE estimate for vg in the null model = " << vg << endl;
Expand Down Expand Up @@ -2611,6 +2630,8 @@ void GEMMA::BatchRun(PARAM &cPar) {
}
}
}
// Need some way of reading the residual variance file and then
// assigning it to what would have been the ..+ 1 part of the computation

// eigen-decomposition and calculate trace_G - main track
cout << "Start Eigen-Decomposition..." << endl;
Expand Down Expand Up @@ -2699,11 +2720,11 @@ void GEMMA::BatchRun(PARAM &cPar) {

assert_issue(is_issue(26), ROUND(UtY->data[0]) == -16.6143);

CalcLambda('L', eval, UtW, &UtY_col.vector, cPar.l_min, cPar.l_max,
CalcLambda('L', U, eval, sigmasq, UtW, &UtY_col.vector, cPar.l_min, cPar.l_max,
cPar.n_region, cPar.l_mle_null, cPar.logl_mle_H0);
assert(!isnan(UtY->data[0]));

CalcLmmVgVeBeta(eval, UtW, &UtY_col.vector, cPar.l_mle_null,
CalcLmmVgVeBeta(U, eval, sigmasq, UtW, &UtY_col.vector, cPar.l_mle_null,
cPar.vg_mle_null, cPar.ve_mle_null, &beta.vector,
&se_beta.vector);

Expand All @@ -2722,9 +2743,9 @@ void GEMMA::BatchRun(PARAM &cPar) {
assert(!isnan(cPar.se_beta_mle_null.front()));

// the following functions do not modify eval
CalcLambda('R', eval, UtW, &UtY_col.vector, cPar.l_min, cPar.l_max,
CalcLambda('R', U, eval, sigmasq, UtW, &UtY_col.vector, cPar.l_min, cPar.l_max,
cPar.n_region, cPar.l_remle_null, cPar.logl_remle_H0);
CalcLmmVgVeBeta(eval, UtW, &UtY_col.vector, cPar.l_remle_null,
CalcLmmVgVeBeta(U, eval, sigmasq, UtW, &UtY_col.vector, cPar.l_remle_null,
cPar.vg_remle_null, cPar.ve_remle_null, &beta.vector,
&se_beta.vector);

Expand Down Expand Up @@ -2795,22 +2816,22 @@ void GEMMA::BatchRun(PARAM &cPar) {
if (!cPar.file_bfile.empty()) {
// PLINK analysis
if (cPar.file_gxe.empty()) {
cLmm.AnalyzePlink(U, eval, UtW, &UtY_col.vector, W,
cLmm.AnalyzePlink(U, eval, sigmasq, UtW, &UtY_col.vector, W,
&Y_col.vector, cPar.setGWASnps);
}
else {
cLmm.AnalyzePlinkGXE(U, eval, UtW, &UtY_col.vector, W,
cLmm.AnalyzePlinkGXE(U, eval, sigmasq, UtW, &UtY_col.vector, W,
&Y_col.vector, env);
}
}
else {
// BIMBAM analysis

if (cPar.file_gxe.empty()) {
cLmm.AnalyzeBimbam(U, eval, UtW, &UtY_col.vector, W,
cLmm.AnalyzeBimbam(U, eval, sigmasq, UtW, &UtY_col.vector, W,
&Y_col.vector, cPar.setGWASnps);
} else {
cLmm.AnalyzeBimbamGXE(U, eval, UtW, &UtY_col.vector, W,
cLmm.AnalyzeBimbamGXE(U, eval, sigmasq, UtW, &UtY_col.vector, W,
&Y_col.vector, env);
}
}
Expand All @@ -2827,15 +2848,15 @@ void GEMMA::BatchRun(PARAM &cPar) {

if (!cPar.file_bfile.empty()) {
if (cPar.file_gxe.empty()) {
cMvlmm.AnalyzePlink(U, eval, UtW, UtY);
cMvlmm.AnalyzePlink(U, eval, sigmasq, UtW, UtY);
} else {
cMvlmm.AnalyzePlinkGXE(U, eval, UtW, UtY, env);
cMvlmm.AnalyzePlinkGXE(U, eval, sigmasq, UtW, UtY, env);
}
} else {
if (cPar.file_gxe.empty()) {
cMvlmm.AnalyzeBimbam(U, eval, UtW, UtY);
cMvlmm.AnalyzeBimbam(U, eval, sigmasq, UtW, UtY);
} else {
cMvlmm.AnalyzeBimbamGXE(U, eval, UtW, UtY, env);
cMvlmm.AnalyzeBimbamGXE(U, eval, sigmasq, UtW, UtY, env);
}
}

Expand Down Expand Up @@ -2924,9 +2945,9 @@ void GEMMA::BatchRun(PARAM &cPar) {
CalcUtX(U, y, Uty);

// calculate REMLE/MLE estimate and pve
CalcLambda('L', eval, UtW, Uty, cPar.l_min, cPar.l_max, cPar.n_region,
CalcLambda('L', U, eval, sigmasq, UtW, Uty, cPar.l_min, cPar.l_max, cPar.n_region,
cPar.l_mle_null, cPar.logl_mle_H0);
CalcLambda('R', eval, UtW, Uty, cPar.l_min, cPar.l_max, cPar.n_region,
CalcLambda('R', U, eval, sigmasq, UtW, Uty, cPar.l_min, cPar.l_max, cPar.n_region,
cPar.l_remle_null, cPar.logl_remle_H0);
CalcPve(eval, UtW, Uty, cPar.l_remle_null, cPar.trace_G, cPar.pve_null,
cPar.pve_se_null);
Expand Down Expand Up @@ -3034,9 +3055,9 @@ void GEMMA::BatchRun(PARAM &cPar) {
CalcUtX(U, y, Uty);

// calculate REMLE/MLE estimate and pve
CalcLambda('L', eval, UtW, Uty, cPar.l_min, cPar.l_max, cPar.n_region,
CalcLambda('L', U, eval, sigmasq, UtW, Uty, cPar.l_min, cPar.l_max, cPar.n_region,
cPar.l_mle_null, cPar.logl_mle_H0);
CalcLambda('R', eval, UtW, Uty, cPar.l_min, cPar.l_max, cPar.n_region,
CalcLambda('R', U, eval, sigmasq, UtW, Uty, cPar.l_min, cPar.l_max, cPar.n_region,
cPar.l_remle_null, cPar.logl_remle_H0);
CalcPve(eval, UtW, Uty, cPar.l_remle_null, cPar.trace_G, cPar.pve_null,
cPar.pve_se_null);
Expand Down
127 changes: 119 additions & 8 deletions src/gemma_io.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ std::istream &safeGetline(std::istream &is, std::string &t) {

// Read SNP file. A single column of SNP names.
bool ReadFile_snps(const string file_snps, set<string> &setSnps) {
debug_msg("enter ReadFile_snps");
debug_msg("entered");
setSnps.clear();

igzstream infile(file_snps.c_str(), igzstream::in);
Expand Down Expand Up @@ -329,9 +329,6 @@ bool ReadFile_anno(const string &file_anno, map<string, string> &mapRS2chr,
mapRS2bp[rs] = b_pos;
mapRS2cM[rs] = cM;
}
// for (auto& [key, value] : mapRS2bp) {
// cerr << key << endl;
//}

infile.close();
infile.clear();
Expand Down Expand Up @@ -509,6 +506,73 @@ bool ReadFile_cvt(const string &file_cvt, vector<int> &indicator_cvt,
return true;
}

bool ReadFile_resid(const string &file_resid, vector<int> &indicator_resid,
vector<vector<double>> &sigmasq, size_t &n_resid) {
debug_msg("entered");
indicator_resid.clear();

ifstream infile(file_resid.c_str(), ifstream::in);
if (!infile) {
cout << "error! fail to open residual variance file: " << file_resid << endl;
return false;
}

string line;
char *ch_ptr;
double d;

int flag_na = 0;

while (!safeGetline(infile, line).eof()) {
vector<double> v_d;
flag_na = 0;
ch_ptr = strtok((char *)line.c_str(), " ,\t");
while (ch_ptr != NULL) {
if (strcmp(ch_ptr, "NA") == 0) {
flag_na = 1;
d = -9;
} else {
d = atof(ch_ptr);
}

v_d.push_back(d);
ch_ptr = strtok(NULL, " ,\t");
}
if (flag_na == 0) {
indicator_resid.push_back(1);
} else {
indicator_resid.push_back(0);
}
sigmasq.push_back(v_d);
}

if (indicator_resid.empty()) {
n_resid = 0;
} else {
flag_na = 0;
for (vector<int>::size_type i = 0; i < indicator_resid.size(); ++i) {
if (indicator_resid[i] == 0) {
continue;
}

if (flag_na == 0) {
flag_na = 1;
n_resid = sigmasq[i].size();
}
if (flag_na != 0 && n_resid != sigmasq[i].size()) {
cout << "error! number of residuals in row " << i
<< " do not match other rows." << endl;
return false;
}
}
}

infile.close();
infile.clear();

return true;
}

// Read .bim file.
bool ReadFile_bim(const string &file_bim, vector<SNPINFO> &snpInfo) {
debug_msg("entered");
Expand Down Expand Up @@ -684,9 +748,10 @@ bool ReadFile_geno(const string &file_geno, const set<string> &setSnps,
double maf, geno, geno_old;
size_t n_miss;
size_t n_0, n_1, n_2;

double min_g = std::numeric_limits<float>::max();
double max_g = std::numeric_limits<float>::min();

int flag_poly;

int ni_total = indicator_idv.size();
Expand All @@ -699,9 +764,6 @@ bool ReadFile_geno(const string &file_geno, const set<string> &setSnps,
file_pos = 0;
auto count_warnings = 0;
auto infilen = file_geno.c_str();
// for (auto& [key, value] : mapRS2bp) {
// cerr << key << endl;
// }
while (!safe_get_line(infile, line).eof()) {
ch_ptr = strtok_safe2((char *)line.c_str(), " ,\t",infilen);
rs = ch_ptr;
Expand Down Expand Up @@ -1412,6 +1474,55 @@ void ReadFile_eigenD(const string &file_kd, bool &error, gsl_vector *eval) {
return;
}

void ReadFile_resid(const string &file_resid, bool &error, gsl_matrix *sigmasq) {
debug_msg("entered");
igzstream infile(file_resid.c_str(), igzstream::in);
if (!infile) {
cout << "error! fail to open the residual variance file: " << file_resid << endl;
error = true;
return;
}

size_t n_row = sigmasq->size1, n_col = sigmasq->size2, i_row = 0, i_col = 0;

gsl_matrix_set_zero(sigmasq); //change this so that sigmasq = V_e somehow

string line;
char *ch_ptr;
double d;

while (getline(infile, line)) {
if (i_row == n_row) {
cout << "error! number of rows in the residual variance file is larger "
<< "than expected." << endl;
error = true;
}

i_col = 0;
ch_ptr = strtok((char *)line.c_str(), " ,\t");
while (ch_ptr != NULL) {
if (i_col == n_col) {
cout << "error! number of columns in the residual variance file "
<< "is larger than expected, for row = " << i_row << endl;
error = true;
}

d = atof(ch_ptr);
gsl_matrix_set(sigmasq, i_row, i_col, d);
i_col++;

ch_ptr = strtok(NULL, " ,\t");
}

i_row++;
}

infile.close();
infile.clear();

return;
}

// Read bimbam mean genotype file and calculate kinship matrix.
bool BimbamKin(const string file_geno, const set<string> ksnps,
vector<int> &indicator_snp, const int k_mode,
Expand Down
2 changes: 2 additions & 0 deletions src/gemma_io.h
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,8 @@ void ReadFile_mk(const string &file_mk, vector<int> &indicator_idv,
gsl_matrix *G);
void ReadFile_eigenU(const string &file_u, bool &error, gsl_matrix *U);
void ReadFile_eigenD(const string &file_d, bool &error, gsl_vector *eval);
bool ReadFile_resid(const string &file_resid, vector<int> &indicator_resid,
gsl_matrix *sigmasq, size_t &n_resid);

bool BimbamKin(const string file_geno, const set<string> ksnps,
vector<int> &indicator_snp, const int k_mode,
Expand Down
14 changes: 5 additions & 9 deletions src/lmm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,6 @@
#include "lmm.h"
#include "mathfunc.h"

#define P_YY_MIN 0.00000001

using namespace std;

void LMM::CopyFromParam(PARAM &cPar) {
Expand Down Expand Up @@ -524,8 +522,7 @@ double LogL_f(double l, void *params) {
index_yy = GetabIndex(n_cvt + 2, n_cvt + 2, n_cvt);
double P_yy = gsl_matrix_safe_get(Pab, nc_total, index_yy);

if (P_yy >= 0.0 && (P_yy < P_YY_MIN)) P_yy = P_YY_MIN; // control potential round-off

if (P_yy == 0.0) P_yy = 0.00000001; // control potential round-off
if (is_check_mode() || is_debug_mode()) {
// cerr << "P_yy is" << P_yy << endl;
assert(!is_nan(P_yy));
Expand Down Expand Up @@ -851,7 +848,7 @@ double LogRL_f(double l, void *params) {
index_ww = GetabIndex(n_cvt + 2, n_cvt + 2, n_cvt);
double P_yy = gsl_matrix_safe_get(Pab, nc_total, index_ww);
// P_yy is positive and may get zeroed printf("P_yy=%f",P_yy);
if (P_yy >= 0.0 && (P_yy < P_YY_MIN)) P_yy = P_YY_MIN; // control potential round-off
if (P_yy == 0.0) P_yy = 0.00000001; // control potential round-off

double c = 0.5 * df * (safe_log(df) - safe_log(2 * M_PI) - 1.0);
f = c - 0.5 * logdet_h - 0.5 * logdet_hiw - 0.5 * df * safe_log(P_yy);
Expand Down Expand Up @@ -2027,14 +2024,13 @@ void CalcLambda(const char func_name, FUNC_PARAM &params, const double l_min,
for (vector<double>::size_type i = 0; i < lambda_lh.size(); ++i) {
lambda_l = lambda_lh[i].first;
lambda_h = lambda_lh[i].second;
// printf("%f,%f\n",lambda_l,lambda_h);
auto handler = gsl_set_error_handler_off();
gsl_root_fsolver_set((gsl_root_fsolver*)s_f, &F, lambda_l, lambda_h);

int status = GSL_FAILURE;
uint iter = 0;
const auto max_iter = 100;

auto handler = gsl_set_error_handler_off();
do {
iter++;
status = gsl_root_fsolver_iterate((gsl_root_fsolver*)s_f);
Expand Down Expand Up @@ -2138,7 +2134,7 @@ void CalcLambda(const char func_name, FUNC_PARAM &params, const double l_min,
}

// Calculate lambda in the null model.
void CalcLambda(const char func_name, const gsl_vector *eval,
void CalcLambda(const char func_name, const gsl_matrix *U, const gsl_vector *eval,
const gsl_matrix *UtW, const gsl_vector *Uty,
const double l_min, const double l_max, const size_t n_region,
double &lambda, double &logl_H0) {
Expand Down Expand Up @@ -2205,7 +2201,7 @@ void CalcPve(const gsl_vector *eval, const gsl_matrix *UtW,
// Obtain REML estimate for Vg and Ve using lambda_remle.
// Obtain beta and se(beta) for coefficients.
// ab is not used when e_mode==0.
void CalcLmmVgVeBeta(const gsl_vector *eval, const gsl_matrix *UtW,
void CalcLmmVgVeBeta(const gsl_matrix *U, const gsl_vector *eval, const gsl_matrix *UtW,
const gsl_vector *Uty, const double lambda, double &vg,
double &ve, gsl_vector *beta, gsl_vector *se_beta) {
size_t n_cvt = UtW->size2, ni_test = UtW->size1;
Expand Down
Loading