Skip to content

Commit

Permalink
Merge pull request #109 from bvilhjal/develop
Browse files Browse the repository at this point in the history
Develop
  • Loading branch information
bvilhjal authored Oct 11, 2019
2 parents 737d69c + 69a7631 commit 3db18bb
Show file tree
Hide file tree
Showing 71 changed files with 138,680 additions and 130,286 deletions.
43 changes: 35 additions & 8 deletions LDpred.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,17 @@

import sys
import textwrap
import random
import scipy

__version__ = '1.0.6'
__date__ = '18 March 2019'
__version__ = '1.0.7'
__date__ = '11 October 2019'

#Set a random seed to avoid varying results.
random_seed=42
random.seed(random_seed)
scipy.random.seed(random_seed)

title = '\033[96mLDpred v. %s\033[0m'%__version__
title_len= len(title)-9
num_dashes = reporting.window_length-title_len-2
Expand Down Expand Up @@ -103,7 +110,7 @@
parser_coord.add_argument('--max-freq-discrep', type=float, default=0.1,
help='Max frequency discrepancy allowed between reported sum stats frequency and frequency '
'in the LD reference data. To disable check, set to 1.')
parser_coord.add_argument('--ssf-format', type=str, default="CUSTOM", choices={'CUSTOM','STANDARD','GIANT', 'PGC'},
parser_coord.add_argument('--ssf-format', type=str, default="CUSTOM", choices={'LDPRED','CUSTOM','STANDARD','GIANT', 'PGC'},
help='This is the format type of the summary statistics file. '
'By default the CUSTOM format requires the user to specify the file format using additional '
'arguments.')
Expand All @@ -128,6 +135,9 @@
help="Column header containing the P-value information")
parser_coord.add_argument('--eff', type=str, default="OR",
help="Column header containing effect size information")
parser_coord.add_argument('--se', type=str, default="SE",
help='Column header containing standard error. Note: these are only used if corresponding p-values'
' are smaller than approximately 1E-325, or if --z-from-se is set.')
parser_coord.add_argument('--ncol', type=str, default="N",
help="Column header containing sample size information")
parser_coord.add_argument('--case-freq', type=str, default=None,
Expand All @@ -138,6 +148,9 @@
help="Column header containing case sample size information")
parser_coord.add_argument('--control-n', type=str, default=None,
help="Column header containing control sample size information")
parser_coord.add_argument('--z-from-se', default=False, action='store_true',
help='If this option is set, then the effects use by LDpred are derived using the effect estimates '
'and their standard errors. (Only possible if standard errors are available in the summary stats)')
# parser_coord.add_argument('--gmdir', type=str,
# help='The directory of genetic map.', default=None)

Expand All @@ -160,15 +173,22 @@
parser_gibbs.add_argument('--f', default=[1,0.3,0.1,0.03,0.01,0.003,0.001], nargs='+', type=float,
help="Fraction of causal variants used in the Gibbs sampler")

parser_gibbs.add_argument('--N', type=int, default=100000, required=True,
parser_gibbs.add_argument('--N', type=int, default=None,
help='Number of individuals on which the summary statistics are assumed to be based on.')
parser_gibbs.add_argument('--n-iter', type=int, default=60,
help='The number of iterations used by the Gibbs sampler. The default is 60.')
parser_gibbs.add_argument('--n-burn-in', type=int, default=5,
help='The number of burn-in iterations used by the Gibbs sampler. The default is 5.')
parser_gibbs.add_argument('--h2', type=float, default=None,
help='The heritability assumed by LDpred. By default it estimates the heritability from'
' the GWAS summary statistics using LD score regression (Bulik-Sullivan et al., Nat Genet 2015).')
help='The genome-wide heritability assumed by LDpred, which is then partitioned '
'proportional to the number of SNPs on each chromosome. By default it estimates the '
'heritability for each chromosome from the GWAS summary statistics using LD score '
'regression (Bulik-Sullivan et al., Nat Genet 2015).')
parser_gibbs.add_argument('--use-gw-h2', default=False, action='store_true',
help='Estimate heritability genome-wide and partition it proportional to the number of SNPs on each chromsome '
'instead of estimating it for each chromosome separately. This is like a likely good choice if '
'the summary statistics used are based on small sample sizes (approx <50K), or if the trait is '
'not very heritable.')
parser_gibbs.add_argument('--no-ld-compression', default=False, action='store_true',
help='Do not compress LD information. Saves storing and loading time of LD information, '
'but takes more space on disk.')
Expand Down Expand Up @@ -199,8 +219,15 @@
parser_inf.add_argument('--N', type=int, default=100000, required=True,
help='Number of individuals on which the summary statistics are assumed to be based on.')
parser_inf.add_argument('--h2', type=float, default=None,
help='The heritability assumed by LDpred. By default it estimates the heritability from'
' the GWAS summary statistics using LD score regression (Bulik-Sullivan et al., Nat Genet 2015).')
help='The genome-wide heritability assumed by LDpred, which is then partitioned '
'proportional to the number of SNPs on each chromosome. By default it estimates the '
'heritability for each chromosome from the GWAS summary statistics using LD score '
'regression (Bulik-Sullivan et al., Nat Genet 2015).')
parser_inf.add_argument('--use-gw-h2', default=False, action='store_true',
help='Estimate heritability genome-wide and partition it proportional to the number of SNPs on each chromsome '
'instead of estimating it for each chromosome separately. This is like a likely good choice if '
'the summary statistics used are based on small sample sizes (approx <50K), or if the trait is '
'not very heritable.')
parser_inf.add_argument('--no-ld-compression', default=False, action='store_true',
help='Do not compress LD information. Saves storing and loading time of LD information, '
'but takes more space on disk.')
Expand Down
16 changes: 12 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,19 +6,27 @@ LDpred is a Python based software package that adjusts GWAS summary statistics
for the effects of linkage disequilibrium (LD). The details of the method are
described in Vilhjalmsson et al. (AJHG 2015) [http://www.cell.com/ajhg/abstract/S0002-9297(15)00365-1]

* The current version is 1.0.6
* The current version is 1.0.7

## Getting Started ##
### News ###

Mar 18th, 2019: Version 1.0.6 released and is available on pip using
Oct 11th, 2019: Version 1.0.7 released and is available on pip using

`pip install ldpred`

Feb 28th, 2019: Added a wiki with Q and A. Please add questions (and answers)
https://github.com/bvilhjal/ldpred/wiki/LDpred-Wiki
Oct 11th, 2019:

  * Improved handling of variants with p-values rounded down to 0.

  * Added options to enable effect estimate inference from standard errors (--z-from-se) in the coordination step.

  * Fixed a serious bug that caused sample sizes in the summary stats file to not always be used correctly when provided.

  * LDpred gibbs can now handle differing sample sizes per variant effect, if they are parsed in the summary stats. This can improve convergence and accuracy substantially, but slows the Gibbs sampler (approx. 5x). It can be avoided by setting the --N flag.

  * If the heritability is not provided when running LDpred gibbs and inf, it now estimates it separately for each chromosome, which can improve accuracy if the summary stats sample size is large. To avoid this behavior, either set the --h2 flag, or use the --use-gw-h2 flag.

  * Improved reporting and other things.

### Requirements ###
LDpred currently requires three Python packages to be installed and in path. These
Expand Down
74 changes: 56 additions & 18 deletions ldpred/LD_pruning_thres.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,21 @@
import h5py
import scipy as sp
import time
import sys
from ldpred import ld
from ldpred import reporting
from ldpred import util

def smart_ld_pruning(beta_hats, ld_table, pvalues=None, max_ld=0.2, verbose=False):
def ld_clumping(beta_hats, ld_table, pruning_stat =None, pvalues=None, max_ld=0.2, verbose=False):
"""
Smart LD pruning.
Smart LD pruning. (Also known as LD clumping)
"""
if verbose:
print('Doing smart LD pruning')
print('LD clumping')
t0 = time.time()
if pvalues is not None:
if pruning_stat is not None:
pruning_vector = ld.smart_ld_pruning(pruning_stat, ld_table, max_ld=max_ld, verbose=verbose, reverse=True)
elif pvalues is not None:
pruning_vector = ld.smart_ld_pruning(pvalues, ld_table, max_ld=max_ld, verbose=verbose)
else:
pruning_vector = ld.smart_ld_pruning(beta_hats ** 2, ld_table, max_ld=max_ld, verbose=verbose, reverse=True)
Expand Down Expand Up @@ -46,14 +50,6 @@ def ld_pruning(data_file=None, ld_radius = None, out_file_prefix=None, p_thres=N
if has_phenotypes:
risk_scores = sp.zeros(num_individs)

print('')
if max_r2<1:
print('Applying LD-pruning + P-value thresholding with p-value threshold of %0.2e, a LD radius of %d SNPs, and a max r2 of %0.2f' %(p_thres, ld_radius, max_r2))
else:
if p_thres<1:
print('Applying P-value thresholding with p-value threshold of %0.2e' %(p_thres))
else:
print('Calculating polygenic risk score using all SNPs')
results_dict = {}
num_snps = 0
cord_data_g = df['cord_data']
Expand Down Expand Up @@ -121,7 +117,8 @@ def ld_pruning(data_file=None, ld_radius = None, out_file_prefix=None, p_thres=N
norm_ref_snps = sp.array((raw_snps - snp_means)/snp_stds,dtype='float32')
ld_table = ld.calc_ld_table(norm_ref_snps, max_ld_dist=ld_radius, min_r2=max_r2, verbose=verbose)

updated_raw_betas, pruning_vector = smart_ld_pruning(raw_betas, ld_table, pvalues=pvalues, max_ld=max_r2, verbose=verbose)
updated_raw_betas, pruning_vector = ld_clumping(raw_betas, ld_table, pruning_stat=pval_derived_betas**2,
max_ld=max_r2, verbose=verbose)
updated_pval_derived_betas = pval_derived_betas * pruning_vector
num_snps_used += sp.sum(pruning_vector)
else:
Expand All @@ -144,9 +141,10 @@ def ld_pruning(data_file=None, ld_radius = None, out_file_prefix=None, p_thres=N
r2 = corr ** 2
print('The R2 prediction accuracy of PRS using %s was: %0.4f' %(chrom_str, r2))


print('There were %d (SNP) effects after p-value thresholding' % tot_num_snps)
print('After LD-pruning %d SNPs had non-zero effects'%num_snps_used)
if verbose:
print('There were %d (SNP) effects after p-value thresholding' % tot_num_snps)
print('After LD-pruning %d SNPs had non-zero effects'%num_snps_used)

if has_phenotypes:
results_dict[p_str]['y']=y
results_dict[p_str]['risk_scores']=risk_scores
Expand Down Expand Up @@ -177,9 +175,49 @@ def ld_pruning(data_file=None, ld_radius = None, out_file_prefix=None, p_thres=N
f.write('%s %d %s %s %s %0.4e %0.4e %0.4e %0.4e\n'%(chrom, pos, sid, nt1, nt2, raw_beta, raw_pval_beta, upd_beta, upd_pval_beta))
df.close()

def main(p_dict):
def run_pt(p_dict, summary_dict=None):
    """Run LD-pruning + P-value thresholding (P+T) over all requested thresholds.

    Parameters
    ----------
    p_dict : dict
        Parsed argument dictionary.  Reads 'p' (list of p-value thresholds),
        'r2' (list of max r2 LD thresholds), 'ldr' (LD radius in SNPs),
        'cf' (coordinated data file), 'out' (output file prefix), and 'debug'.
    summary_dict : dict, optional
        Accumulator for run-summary entries; mutated in place and also
        readable by the caller afterwards.
    """
    # NOTE(review): the original signature used a mutable default
    # (summary_dict={}), which is shared across calls; a fresh dict per
    # call preserves the intended behavior without that pitfall.
    if summary_dict is None:
        summary_dict = {}
    # Loop-invariant: the same r2 thresholds apply to every p-value threshold.
    max_r2s = p_dict['r2']
    pi = 0
    for p_thres in reversed(p_dict['p']):
        if p_dict['debug']:
            print('')
            if len(max_r2s) > 1 or max_r2s[0] < 1:
                r2s_string = ', '.join(['%0.2f' % r2 for r2 in max_r2s])
                print('Applying LD-pruning + P-value thresholding with p-value threshold of %0.2e, a LD radius of %d SNPs, and a max r2(s) of %s' % (p_thres, p_dict['ldr'], r2s_string))
            elif p_thres < 1:
                print('Applying P-value thresholding with p-value threshold of %0.2e' % (p_thres))
            else:
                print('Calculating polygenic risk score using all SNPs')
        else:
            # In-place textual progress indicator (backspaces overwrite the
            # previously printed percentage).
            sys.stdout.write('\b\b\b\b\b\b\b%0.2f%%' % (100.0 * (float(pi) / len(p_dict['p']))))
            sys.stdout.flush()

        ld_pruning(data_file=p_dict['cf'], out_file_prefix=p_dict['out'], p_thres=p_thres,
                   ld_radius=p_dict['ldr'], max_r2s=max_r2s)
        pi += 1
    if not p_dict['debug']:
        sys.stdout.write('\b\b\b\b\b\b\b%0.2f%%\n' % (100.0))
        sys.stdout.flush()
    summary_dict[1.2] = {'name': 'P-value thresholds used', 'value': str(p_dict['p'])}
    summary_dict[1.3] = {'name': 'Maximum r2 LD thresholds used', 'value': str(max_r2s)}


def main(p_dict):
    """Entry point for the P+T (LD-pruning + thresholding) sub-command.

    Builds the run-summary dictionary, runs P+T via run_pt, times the run,
    and prints the summary report.
    """
    summary_dict = {}
    summary_dict[0] = {'name': 'Coordinated data filename', 'value': p_dict['cf']}
    summary_dict[0.1] = {'name': 'SNP weights output file (prefix)', 'value': p_dict['out']}
    summary_dict[1] = {'name': 'LD radius used', 'value': str(p_dict['ldr'])}
    t0 = time.time()
    # 'dash' entries render as section separators in the summary report.
    summary_dict[1.1] = {'name': 'dash', 'value': 'LD-pruning + Thresholding'}
    run_pt(p_dict, summary_dict)
    t1 = time.time()
    t = t1 - t0
    summary_dict[2] = {'name': 'Running time for calculating P+T:',
                       'value': '%d min and %0.2f secs' % (t / 60, t % 60)}

    # Typo fixed in the printed title: 'Tresholding' -> 'Thresholding'.
    reporting.print_summary(summary_dict, 'Summary of LD-pruning + Thresholding')






Loading

0 comments on commit 3db18bb

Please sign in to comment.