#est_hubble.py: estimate the Hubble constant based on GW event volumes and corresponding skymaps
import matplotlib.pyplot as plt
import numpy as np
from scipy.interpolate import interp1d
import h5py
import os
import argparse
import configparser
#setup parallelization
import multiprocessing as mp
N_CORES = mp.cpu_count()
#np.seterr(all='raise')
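#galaxy catalogs are split into chunks of at most MAX_Z_PROC_SIZE redshifts before being handed to worker processes (see Posterior_PDF.add_event)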
MAX_Z_PROC_SIZE = 500
#read configuration
config = configparser.ConfigParser()
config.read('params.conf')
PRIOR_RANGE = [float(val) for val in config['physical']['H_0_prior'].split(sep=',')]
OMEGA_M = float(config['physical']['omega_matter'])
OMEGA_A = float(config['physical']['omega_lambda'])
C_L = float(config['physical']['light_speed'])
C_L_SQ = C_L*C_L
DIR_NAME = config['analysis']['type'].strip()
EVENT_LIST = sorted(['_'.join(s.split('_')[:2]) for s in os.listdir(config['analysis']['event_dir'])])
if len(PRIOR_RANGE) != 2:
    raise ValueError("H_0_prior must specify exactly two values (lower, upper).")
if len(EVENT_LIST) == 0:
    raise ValueError("At least one event must be supplied.")
N_H0 = 250
#override configuration file with command line arguments if supplied
parser = argparse.ArgumentParser(description='Estimate the Hubble constant based on a GW event volume and a corresponding skymap.')
parser.add_argument('--in-directory', type=str, nargs='?', help='Type of events to sample. Accepted values are GW_events for real events and sim_events for simulated events. Defaults to {}.'.format(DIR_NAME), default=DIR_NAME)
parser.add_argument('--n-events-use', type=int, default=len(EVENT_LIST), help='Number of events to use in the Hubble constant estimation. Must be <= the number of events available.')
parser.add_argument('--n-cores-max', type=int, default=N_CORES, help='Maximum number of cores to use. Otherwise the computer gets angry >:{')
parser.add_argument('--save-pdf', type=str, help='Location to save the per-event PDFs')
parser.add_argument('--save-final', action='store_true', default=False, help='Save a plot of the final posterior to hub.svg')
args = parser.parse_args()
DIR_NAME = args.in_directory
N_CORES = min(args.n_cores_max, N_CORES)
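#example invocation (hypothetical paths and event counts):
#  python est_hubble.py --in-directory sim_events --n-events-use 10 --n-cores-max 4 --save-pdf pdfs.txt --save-final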
def integrand_em(z_i, z_mu, z_err_sq, h_0):
    '''EM-only integrand for Bayes' theorem, used to normalize out the galaxy catalog
    z_i: the term being integrated over, the redshift of the ith galaxy
    z_mu: expected value of z based on EM observation
    z_err_sq: variance on z (taken to be Gaussian)
    h_0: Hubble's constant (unused; kept so the signature parallels integrand)
    '''
    marg_em = np.exp( -(z_i - z_mu)**2 / (2*z_err_sq) )
    return marg_em
def integrand(z_i, z_mu, z_err_sq, h_0, gw_dist_mu, gw_dist_var):
    '''integrand for Bayes' theorem
    z_i: the term being integrated over, the redshift of the ith galaxy
    z_mu: expected value of z based on EM observation
    z_err_sq: variance on z (taken to be Gaussian)
    h_0: Hubble's constant
    gw_dist_mu: expected value of GW distance
    gw_dist_var: variance of GW distance
    '''
    #we need to marginalize over dz_i and solid angles (see Soares-Santos et al)
    c_z = C_L*z_i
    try:
        dV = c_z*c_z / (h_0*h_0*h_0*np.sqrt(OMEGA_M*(1+z_i)**3 + OMEGA_A))
    except (ValueError, FloatingPointError):
        #FloatingPointError is only raised if np.seterr(all='raise') is enabled above
        dV = 1
    #compute marginal posteriors on the GW data and EM data
    marg_gw = np.exp( -(c_z - h_0*gw_dist_mu)**2 / (2*gw_dist_var*h_0*h_0) )
    marg_em = np.exp( -(z_i - z_mu)**2 / (2*z_err_sq) )
    return marg_gw*marg_em*dV
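#per galaxy, the posterior contribution is proportional to
#  int dz_i  dV/dz_i * exp(-(c*z_i - H_0*d_gw)^2 / (2*sig_d^2*H_0^2)) * exp(-(z_i - z_mu)^2 / (2*sig_z^2))
#p_part_gals below evaluates this on a (z, H_0, galaxy) grid, sums over galaxies, and normalizes by the matching EM-only integral (integrand_em)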
def p_part_gals(z, z_err, h_vals, z_min, z_max, gw_dist_mu, gw_dist_var):
    '''Helper function which computes the posterior contribution summed over a chunk of galaxies with specified redshifts
    z: iterable of galaxy redshifts
    z_err: iterable of redshift errors, must have the same length as z
    h_vals: list of H_0 values from the prior distribution
    z_min: lower bound for integration over z
    z_max: upper bound for integration over z
    gw_dist_mu: expected value of GW distance
    gw_dist_var: variance of GW distance
    returns: the probability distribution for H_0 considering only the current set of galaxies
    '''
    mu_z = np.array(z)
    var_z = np.array(z_err)**2
    #keep only galaxies with positive redshift errors
    mu_z = mu_z[var_z > 0.]
    var_z = var_z[var_z > 0.]
    h = np.array(h_vals)
    #grid of redshifts used for the marginalization
    z = np.linspace(z_min, z_max, 100)
    dz = z[1] - z[0]
    _, _, var_z = np.meshgrid(h, z, var_z)
    h, z, mu_z = np.meshgrid(h, z, mu_z)
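    #with numpy's default 'xy' indexing the meshgrid axes are: axis 0 = z grid, axis 1 = H_0 values, axis 2 = galaxies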
    p_em = np.sum(np.sum(np.sqrt(var_z) * integrand_em(z, mu_z, var_z, h), axis=2), axis=0) * dz
    p_gw = np.sum(np.sum(integrand(z, mu_z, var_z, h, gw_dist_mu, gw_dist_var), axis=2), axis=0) * dz
    return p_gw / p_em
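#a minimal sketch of how p_part_gals is called (hypothetical values):
#  h_grid = np.linspace(PRIOR_RANGE[0], PRIOR_RANGE[1], N_H0)
#  p = p_part_gals([0.01, 0.02], [0.001, 0.002], h_grid, 1e-3, 0.05, 40., 16.)
#the result is an array of len(h_grid) giving this chunk's unnormalized posterior over H_0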
class Posterior_PDF:
    '''A utility class which stores a posterior pdf for H_0 built up from a sequence of GW events. The prior distribution for H_0 is always assumed to be uniform.
    '''
    def __init__(self, h_range, num=N_H0):
        '''Generates an (initially uniform) posterior pdf for H_0
        h_range: a tuple specifying the range of values which H_0 may assume. We take this as a uniform prior
        num: number of values of H_0 to sample when generating the pdf
        '''
        self.h_list = np.linspace(h_range[0], h_range[1], num=num)
        #grid spacing matching np.linspace (num points span num-1 intervals)
        self.h_space = (h_range[1] - h_range[0]) / (num - 1)
        self.p_list = np.ones(len(self.h_list))
        self.p_list_current = None
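        #p_list accumulates the product of per-event pdfs starting from a flat prior; p_list_current holds only the most recent event's pdf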
    def guess_index(self, h):
        '''Helper function which finds the index in h_list with the value closest to h, assuming that h_list is increasing'''
        #offset by the start of the grid since h_list does not necessarily begin at zero
        h_ind_guess = int((h - self.h_list[0]) / self.h_space)
        list_len = len(self.h_list)
        #ensure that the index is between 0 and the length of the list
        if h_ind_guess < 1:
            h_ind_guess = 1
        if h_ind_guess >= list_len - 1:
            h_ind_guess = list_len - 2
        #look at adjacent points to find the best value
        err = abs(self.h_list[h_ind_guess] - h)
        best_ind = h_ind_guess
        if abs(self.h_list[h_ind_guess - 1] - h) < err:
            best_ind = h_ind_guess - 1
            err = abs(self.h_list[h_ind_guess - 1] - h)
        if abs(self.h_list[h_ind_guess + 1] - h) < err:
            best_ind = h_ind_guess + 1
        return best_ind
    def integrate(self, a=-1, b=-1):
        '''Finds the integral int_a^b p(H_0) dH_0
        a: lower bound or -1 to integrate from the lowest possible value for H_0
        b: upper bound or -1 to integrate up to the highest possible value for H_0
        returns: approximation of the integral over that range
        '''
        a_ind = self.guess_index(a) if (a >= 0) else 0
        b_ind = self.guess_index(b) if (b >= 0) else len(self.h_list)-1
        #use the trapezoid rule, including the final interval up to b_ind
        int_p = 0.0
        for i in range(a_ind+1, b_ind+1):
            int_p += (self.h_list[i] - self.h_list[i-1])*(self.p_list[i] + self.p_list[i-1])/2
        return int_p
    def ex_value(self):
        '''Find the expectation value for H_0'''
        int_p = 0.0
        for i in range(1, len(self.h_list)):
            int_p += (self.h_list[i] - self.h_list[i-1])*(self.h_list[i]*self.p_list[i] + self.h_list[i-1]*self.p_list[i-1])/2
        return int_p
    def find_confidence_central(self, conf):
        '''Use a greedy algorithm to find the central conf confidence interval. The pdf should be normalized first.
        conf: a number between 0 and 1 specifying the desired confidence interval
        returns: a tuple with three values. The first two values give lower and upper bounds respectively, the last value gives the actual probability contained
        '''
        int_p_low = 0.0
        found_lower = False
        lower = 0.0
        #use the trapezoid rule
        int_p = 0.0
        for i in range(1, len(self.h_list)):
            int_p += (self.h_list[i] - self.h_list[i-1])*(self.p_list[i] + self.p_list[i-1])/2
            #once we've accumulated the lower tail probability (1-conf)/2, set the lower bound
            if not found_lower and int_p >= (1.0-conf)/2:
                lower = self.h_list[i]
                int_p_low = int_p
                found_lower = True
            #the upper bound is reached at cumulative probability (1+conf)/2
            if int_p >= (1.0+conf)/2:
                return lower, self.h_list[i], int_p-int_p_low
        return lower, self.h_list[-1], int_p-int_p_low
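        #e.g. pdf.normalize(); lo, hi, p = pdf.find_confidence_central(0.68) gives a central 68% interval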
    def find_confidence(self, conf, tolerance=0.01, max_iters=10):
        '''find the minimum width conf confidence interval for the approximate probability distribution function specified by h_list and p_list
        conf: confidence interval to estimate
        tolerance: we stop trying to refine our interval once the contained probability is within tolerance of conf. A default 1% tolerance is used
        max_iters: maximum number of iterations to perform, even if we didn't achieve the desired tolerance
        returns: a tuple (lower bound, upper bound, contained probability)
        '''
        self.normalize()
        center = self.ex_value()
        #this stores the maximum probability height which find_interval_iter is allowed to explore
        center_val = self.p_list[self.guess_index(center)]
        #we recursively guess a "height" on our pdf to consider. The pdf is searched to find values of H_0 such that p(H_0)=height. Call these two values h_lo and h_hi. We then integrate to find the probability between h_lo and h_hi. If this value is greater (less) than conf then we take a higher (lower) height until we are within the specified tolerance or max_iters iterations have been performed
        def find_interval_iter(height, height_lower, height_upper, n_visited_heights):
            h_lo = self.h_list[0]
            h_hi = self.h_list[-1]
            #scan to find h_lo and h_hi
            for i, p in enumerate(self.p_list):
                if p > height:
                    #we are just being careful not to index out of bounds
                    if i == 0:
                        #the pdf already exceeds this height at the lower edge of the support
                        h_lo = self.h_list[0]
                    elif self.p_list[i-1] < height:
                        #linear interpolation
                        h_lo = self.h_list[i-1] + (height - self.p_list[i-1])*(self.h_list[i] - self.h_list[i-1]) / (p - self.p_list[i-1])
                if i > 0 and p < height and self.p_list[i-1] > height:
                    #linear interpolation
                    h_hi = self.h_list[i-1] + (height - self.p_list[i-1])*(self.h_list[i] - self.h_list[i-1]) / (p - self.p_list[i-1])
            int_val = self.integrate(h_lo, h_hi)
            #check for convergence
            if abs(int_val - conf) < tolerance or n_visited_heights > max_iters:
                return (h_lo, h_hi, int_val)
            #explore a larger window of values by looking at a lower height halfway between the current height and the best lower height already visited
            if int_val < conf:
                return find_interval_iter( (height + height_lower)/2, height_lower, height, n_visited_heights+1)
            #similar but for a smaller window
            else:
                return find_interval_iter( (height + height_upper)/2, height, height_upper, n_visited_heights+1)
        return find_interval_iter(center_val/2, 0, center_val, 0)
    def find_confidence_alt(self, conf, tolerance=0.01, max_iters=20):
        '''Alternative confidence interval search which interpolates the pdf on either side of its peak. Assumes the pdf is unimodal.
        conf: confidence interval to estimate
        tolerance: stop once the contained probability is within tolerance of conf
        max_iters: maximum number of bisection iterations
        returns: a tuple (lower bound, upper bound)
        '''
        self.normalize()
        index_peak = np.argmax(self.p_list)
        f_left = interp1d(self.p_list[:index_peak], self.h_list[:index_peak])
        f_right = interp1d(self.p_list[index_peak:], self.h_list[index_peak:])
        #bisect on the height p at which the pdf is cut
        p_upper = self.p_list[index_peak]
        p_lower = 0.
        p = 0.5 * (p_upper + p_lower)
        iters = 0
        while True:
            iters += 1
            diff = self.integrate(f_left(p), f_right(p)) - conf
            if np.abs(diff) < tolerance or iters > max_iters:
                break
            if diff < 0:
                p_upper = p
                p = 0.5 * (p + p_lower)
            else:
                p_lower = p
                p = 0.5 * (p + p_upper)
        return f_left(p), f_right(p)
    def normalize(self):
        '''normalize the pdf in place so that it integrates to one over its support. The support is unaltered.
        '''
        #use the trapezoid rule
        int_p = 0.0
        for i in range( 1, len(self.h_list) ):
            int_p += (self.h_list[i] - self.h_list[i-1])*(self.p_list[i] + self.p_list[i-1])/2
        self.p_list /= int_p
    def add_event(self, rshift_fname):
        '''Multiply the running posterior by the pdf inferred from a single GW event
        rshift_fname: name of an hdf5 file containing the distance posterior and matching galaxy redshifts for the event
        '''
        galaxies = h5py.File(rshift_fname, 'r')
        #figure out the range for our z-cut
        ev_d_range = galaxies['distance_posterior']['conf_interval']
        z_min = ev_d_range[0]*PRIOR_RANGE[0] / C_L
        z_max = ev_d_range[1]*PRIOR_RANGE[1] / C_L
        gw_dist_mu = galaxies['distance_posterior']['expectation'][0]
        gw_dist_var = (galaxies['distance_posterior']['std_error'][0])**2
        #posterior from Nair et al
        rshifts = galaxies['redshifts']
        #make sure that we have redshifts and errors for each galaxy
        assert len(rshifts['z']) == len(rshifts['z_err'])
        print("Found %d matching galaxies" % len(rshifts['z']))
        #split the z and z_err arrays into chunks of at most MAX_Z_PROC_SIZE galaxies and build the argument tuples passed to p_part_gals
        num_sections = rshifts['z'].size // MAX_Z_PROC_SIZE + 1
        zs = np.array_split(rshifts['z'], num_sections)
        zs_err = np.array_split(rshifts['z_err'], num_sections)
        arg_list = [(zs[i], zs_err[i], self.h_list, z_min, z_max, gw_dist_mu, gw_dist_var) for i in range(len(zs))]
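        #each chunk yields an array over self.h_list; the chunk results are summed below before multiplying into the running posterior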
        #create a temporary pdf for this event. We need to take the product over multiple events to get the final result
        tmp_pdf = np.zeros(len(self.p_list))
        if N_CORES > 1:
            with mp.Pool(processes=N_CORES) as pool:
                res = pool.starmap(p_part_gals, arg_list)
        else:
            #process every chunk serially, not just the first one
            res = [p_part_gals(*args) for args in arg_list]
        #sum together each result from the pool
        for r in res:
            tmp_pdf += r
        self.p_list_current = tmp_pdf
        #move the new data into the pdf
        self.p_list = tmp_pdf*self.p_list
        galaxies.close()
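#main driver: accumulate the posterior over all requested events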
pdf = Posterior_PDF(PRIOR_RANGE)
pdf_array = np.empty((args.n_events_use, N_H0))
events = EVENT_LIST[:args.n_events_use]
for i, ev in enumerate(events):
    #load the list of potential galaxies
    rshift_fname = DIR_NAME + '/' + ev + '_rshifts.h5'
    print( "Detection %d / %d:" % (i+1, len(events)), end=' ' )
    pdf.add_event(rshift_fname)
    pdf.normalize()
    pdf_array[i] = pdf.p_list_current
if args.save_pdf is not None:
    np.savetxt(args.save_pdf, pdf_array)
if args.save_final:
    plt.plot(pdf.h_list, pdf.p_list)
    plt.xlabel(r'$H_0$ (km s$^{-1}$ Mpc$^{-1}$)')
    plt.ylabel(r'$p(H_0 | D)$')
    plt.title(r'Posterior distribution for $H_0$')
    #plt.show()
    plt.savefig('hub.svg')