forked from selective-inference/compare-selection
-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathutils.py
130 lines (100 loc) · 3.6 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import numpy as np
import pandas as pd
# Rpy
import rpy2.robjects as rpy
from rpy2.robjects import numpy2ri
# Load the R packages this module relies on; messages are suppressed so that
# library startup chatter does not pollute stdout.
rpy.r('suppressMessages(library(selectiveInference)); suppressMessages(library(knockoff))') # R libraries we will use
# Define an R helper, estimate_sigma_data_splitting(X, y), that estimates the
# noise level sigma by repeated sample splitting: over 10 repetitions it fits
# cv.glmnet on a random half of the rows, refits OLS of the held-out half on
# the selected columns, and averages sigma(lm) over the repetitions in which
# at least one variable was selected.
# NOTE(review): if no repetition ever selects a variable, nest stays 0 and the
# function returns NaN (0/0) — presumably acceptable upstream; confirm.
rpy.r("""
estimate_sigma_data_splitting = function(X,y, verbose=FALSE){
nrep = 10
sigma_est = 0
nest = 0
for (i in 1:nrep){
n=nrow(X)
m=floor(n/2)
subsample = sample(1:n, m, replace=FALSE)
leftover = setdiff(1:n, subsample)
CV = cv.glmnet(X[subsample,], y[subsample], standardize=FALSE, intercept=FALSE, family="gaussian")
beta_hat = coef(CV, s="lambda.min")[-1]
selected = which(beta_hat!=0)
if (verbose){
print(c("nselected", length(selected)))
}
if (length(selected)>0){
LM = lm(y[leftover]~X[leftover,][,selected])
sigma_est = sigma_est+sigma(LM)
nest = nest+1
}
}
return(sigma_est/nest)
}
""")
def gaussian_setup(X, Y, run_CV=True):
    """
    Some calculations that can be reused by methods:
    lambda.min, lambda.1se, lambda.theory and Reid et al. estimate of noise.

    Parameters
    ----------
    X : ndarray of shape (n, p)
        Design matrix; also copied into the R global environment as ``X``.
    Y : ndarray of shape (n,)
        Response vector; also copied into R as ``Y``.
    run_CV : bool
        When True, run ``cv.glmnet`` in R to obtain lambda.min / lambda.1se
        and the Reid et al. noise estimate.

    Returns
    -------
    (lam_min, lam_1se, l_theory, sigma_reid)
        The first, second, and last entries are None when ``run_CV`` is False.
    """
    n, p = X.shape

    # Columns rescaled to unit Euclidean norm, used for the theoretical lambda.
    Xn = X / np.sqrt((X**2).sum(0))[None, :]

    # Push the data into R once and coerce to the R types the later snippets
    # expect; these globals are a deliberate side effect other code may use.
    numpy2ri.activate()
    rpy.r.assign('X', X)
    rpy.r.assign('Y', Y)
    numpy2ri.deactivate()
    rpy.r('X=as.matrix(X)')
    rpy.r('Y=as.numeric(Y)')

    # lambda.theory: average max absolute correlation with pure noise
    # over 500 Monte Carlo draws, replicated across all p coordinates.
    l_theory = np.fabs(Xn.T.dot(np.random.standard_normal((n, 500)))).max(1).mean() * np.ones(p)

    if run_CV:
        # X and Y were already assigned and coerced above, so they are not
        # re-assigned here (the previous version did so redundantly).
        numpy2ri.activate()
        rpy.r('G = cv.glmnet(X, Y, intercept=FALSE, standardize=FALSE)')
        rpy.r('sigma_reid = selectiveInference:::estimate_sigma(X, Y, coef(G, s="lambda.min")[-1]) # sigma via Reid et al.')
        rpy.r("L = G[['lambda.min']]")
        rpy.r("L1 = G[['lambda.1se']]")
        L = rpy.r('L')
        L1 = rpy.r('L1')
        sigma_reid = rpy.r('sigma_reid')[0]
        numpy2ri.deactivate()
        # Rescale by sqrt(n) (presumably to undo glmnet's 1/n lambda scaling
        # — TODO confirm) with a tiny upward nudge for numerical safety.
        return L * np.sqrt(X.shape[0]) * 1.0001, L1 * np.sqrt(X.shape[0]) * 1.0001, l_theory, sigma_reid
    else:
        return None, None, l_theory, None
def BHfilter(pval, q=0.2):
    """
    Benjamini-Hochberg selection.

    Computes BH-adjusted p-values exactly as R's ``p.adjust(pval, "BH")``
    and returns the 0-based indices whose adjusted p-value is strictly
    below ``q`` (mirroring the original ``which(Pval < q) - 1``).

    Implemented in pure NumPy — this removes an R round-trip and fixes the
    use of ``np.int``, an alias removed in NumPy 1.20+.

    Parameters
    ----------
    pval : array_like of shape (m,)
        Raw p-values.
    q : float
        BH threshold on the adjusted p-values.

    Returns
    -------
    ndarray of int
        Sorted 0-based indices of the selected hypotheses.
    """
    pval = np.asarray(pval, float)
    m = pval.shape[0]
    # p.adjust(..., "BH"): sort descending, take cumulative minima of
    # m * p / rank (capped at 1), then undo the sort.
    order = np.argsort(pval)[::-1]      # indices of p-values, descending
    ranks = np.arange(m, 0, -1)         # m, m-1, ..., 1 (rank of each sorted p)
    adjusted = np.minimum.accumulate(np.minimum(1.0, m * pval[order] / ranks))
    bh = np.empty(m)
    bh[order] = adjusted
    # Strict inequality matches the original R comparison `Pval < q`.
    return np.nonzero(bh < q)[0].astype(int)
def summarize(groupings,
              results_df,
              summary):
    """
    Apply ``summary`` to each group of ``results_df`` grouped by
    ``groupings`` and stack the per-group frames, using the group keys
    as the resulting index.

    Parameters
    ----------
    groupings : label or list of labels
        Column(s) to group by (passed to ``DataFrame.groupby``).
    results_df : pd.DataFrame
        Raw results to summarize.
    summary : callable
        Maps a group DataFrame to a summary DataFrame.
        NOTE(review): assumes each summary frame has one row so the stacked
        frame's length equals the number of groups — confirm with callers.

    Returns
    -------
    pd.DataFrame
        Concatenated summaries indexed by group key.
    """
    grouped = results_df.groupby(groupings, as_index=False)
    # (group key, summary frame) pairs, one per group.
    summaries = [(name, summary(group)) for name, group in grouped]
    summary_df = pd.concat([frame for _, frame in summaries])
    summary_df.index = [name for name, _ in summaries]
    return summary_df
def get_method_params(methods):
    """
    Build a parameter table for a list of method classes.

    Each class in ``methods`` is instantiated once on small random data;
    the union of each instance's ``trait_names()`` determines the output
    columns, and the corresponding attribute values fill one row per method.

    Parameters
    ----------
    methods : sequence of classes
        Each constructible as ``method(X, Y, 1., 1., 1., 1.)`` and
        exposing ``trait_names()`` and ``method_name``.

    Returns
    -------
    (method_params, class_names, method_names)
        A DataFrame of parameter values, the class ``__name__``s, and the
        instances' ``method_name`` attributes.
    """
    # One throwaway instance per class, reused for both column discovery
    # and value extraction (avoids constructing every method twice).
    instances = [method(np.random.standard_normal((10, 5)),
                        np.random.standard_normal(10),
                        1., 1., 1., 1.)
                 for method in methods]

    # Union of all trait names across methods, sorted for stable columns.
    colnames = []
    for M in instances:
        colnames += M.trait_names()
    colnames = sorted(np.unique(colnames))

    def get_col(M, colname):
        # None (implicitly) for traits a given method does not define.
        if colname in M.trait_names():
            return getattr(M, colname)

    method_params = pd.DataFrame([[get_col(M, colname) for colname in colnames]
                                  for M in instances],
                                 columns=colnames)
    method_names = [M.method_name for M in instances]
    return method_params, [m.__name__ for m in methods], method_names