-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathld_score.py
181 lines (129 loc) · 6.13 KB
/
ld_score.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
"""
Re-implementation of LD score regression (Bulik-Sullivan et al., Nat Genet 2015).
1. Implement LD calculation in the 1K genomes
- With and without adjustment for population Structure.
- Store local LD structure.
2. Implement LD score regression for a given set of SNPs.
3. Implement heritability estimation.
4. Implement genetic correlation estimation.
"""
import ld
import h5py
import kgenome
import h5py_util as hu
import pandas
try:
import scipy as sp
except Exception:
import numpy as sp
print 'Using Numpy instead of Scipy.'
ok_nts = sp.array(['A', 'T', 'G', 'C'])
def get_bulik_sullivan_15_sids(sids_dir='/project/HeritPartition/faststorage/ld_scores_bulik_sullivan_2015/eur_w_ld_chr'):
chrom_ok_sids_dict = {}
for chrom in range(1, 23):
chrom_str = 'chr%d' % chrom
filename = '%s/%d.l2.ldscore' % (sids_dir, chrom)
ld_score_table = pandas.read_table(filename)
sids = sp.array(ld_score_table['SNP'])
chrom_ok_sids_dict[chrom_str] = sids
return chrom_ok_sids_dict
def generate_1k_LD_scores(input_genotype_file, chrom_snp_trans_mats,
maf_thres=0.01, ld_radius=200, debug_filter_frac=0.01,
indiv_filter=None , snp_filter=None):
"""
Generates 1k genomes LD scores and stores in the given file
"""
chrom_ld_scores_dict = {}
ld_score_sum = 0
struct_adj_ld_score_sum = 0
num_snps = 0
print 'Calculating LD information w. radius %d' % ld_radius
in_h5f = h5py.File(input_genotype_file)
print 'Calculating local LD'
for chrom in range(1, 23):
print 'Working on Chromosome %d' % chrom
chrom_str = 'chr%d' % chrom
print 'Loading SNPs'
g_dict = kgenome.get_genotype_data(in_h5f, chrom, maf_thres, indiv_filter=indiv_filter,
snp_filter=snp_filter, randomize_sign=False, snps_signs=None,
return_snps_info=True, debug_filter_frac=debug_filter_frac)
norm_snps = g_dict['norm_snps']
ret_dict = ld.get_ld_scores(norm_snps, ld_radius=ld_radius)
avg_ld_score = sp.mean(ret_dict['ld_scores'])
g_dict['ld_scores'] = ret_dict['ld_scores']
g_dict['avg_ld_score'] = avg_ld_score
ld_score_sum += sp.sum(ret_dict['ld_scores'])
print 'Un-adjusted average LD score was: %0.3f' % avg_ld_score
if chrom_snp_trans_mats is not None:
snp_trans_mat = chrom_snp_trans_mats[chrom_str]
norm_snps = sp.dot(norm_snps, snp_trans_mat.T)
# Need to re-normalize?
snp_means = sp.mean(norm_snps, 1)
snp_means.shape = (len(snp_means), 1)
snp_stds = sp.std(norm_snps, 1)
snp_stds.shape = (len(snp_stds), 1)
norm_snps = sp.array((norm_snps - snp_means) / snp_stds)
ret_dict = ld.get_ld_scores(norm_snps, ld_radius=ld_radius)
avg_ld_score = sp.mean(ret_dict['ld_scores'])
print 'Pop-structure adjusted average LD score was: %0.3f' % avg_ld_score
g_dict['struct_adj_ld_scores'] = ret_dict['ld_scores']
g_dict['avg_struct_adj_ld_score'] = avg_ld_score
struct_adj_ld_score_sum += sp.sum(ret_dict['ld_scores'])
del g_dict['norm_snps']
del g_dict['snp_means']
del g_dict['snp_stds']
chrom_ld_scores_dict[chrom_str] = g_dict
num_snps += len(norm_snps)
avg_gw_ld_score = ld_score_sum / float(num_snps)
avg_gw_struct_adj_ld_score = ld_score_sum / float(num_snps)
ld_scores_dict = {'avg_gw_ld_score': avg_gw_ld_score,
'avg_gw_struct_adj_ld_score':avg_gw_struct_adj_ld_score,
'chrom_dict':chrom_ld_scores_dict}
print 'Done calculating the LD table and LD scores.'
return ld_scores_dict
def calculate(input_genotype_file, input_ld_pruned_genotype_file,
ld_score_file, kinship_pca_file, ld_radius=200, maf_thres=0.01, snp_filter_frac=0.05, debug_filter_frac=1):
"""
Generates population structure adjusted 1k genomes LD scores and stores in the given file.
"""
# Kinship
kinship_pca_dict = kgenome.get_kinship_pca_dict(input_ld_pruned_genotype_file, kinship_pca_file,
maf_thres=maf_thres, snp_filter_frac=snp_filter_frac)
chrom_snp_trans_mats = {}
for chrom in range(1, 23):
print 'Working on Chromosome %d' % chrom
chrom_str = 'chr%d' % chrom
chrom_snp_trans_mats[chrom_str] = kinship_pca_dict[chrom_str]['cholesky_decomp_inv_snp_cov']
# bla
lds_dict = generate_1k_LD_scores(input_genotype_file, chrom_snp_trans_mats,
maf_thres=maf_thres, ld_radius=ld_radius,
debug_filter_frac=debug_filter_frac)
# Store LD scores
lds_h5f = h5py.File(ld_score_file)
hu.dict_to_hdf5(lds_dict, lds_h5f)
lds_h5f.close()
def estimate_heritability(ld_score_file='/home/bjarni/PCMA/faststorage/1_DATA/1k_genomes/ld_scores_ldr2500.hdf5'):
pass
def estimate_genetic_correlation():
pass
def generate_genetic_corr_figure():
pass
def generate_herit_figure():
pass
def get_1K_genomes_LD_scores(snps, ld_radius=200):
"""
Calculates LD tables, and the LD scores in one go...
"""
# First estimate the genotype covariance matrix with and
pass
# return ret_dict
def get_genotype_cov_mat():
"""
Estimate genotype covariance matrix from the provided SNPs.
"""
if __name__ == '__main__':
calculate('/home/bjarni/PCMA/faststorage/1_DATA/1k_genomes/1K_genomes_phase3_EUR_unrelated.hdf5',
'/home/bjarni/PCMA/faststorage/1_DATA/1k_genomes/1K_genomes_phase3_EUR_unrelated_ld_pruned.hdf5',
'/home/bjarni/PCMA/faststorage/1_DATA/1k_genomes/ld_scores_ldr3000.hdf5',
'/home/bjarni/PCMA/faststorage/1_DATA/1k_genomes/1kgenomes_kinship_pca_f0.95.hdf5',
ld_radius=3000, maf_thres=0.01, snp_filter_frac=0.95, debug_filter_frac=1)