Skip to content

Commit

Permalink
Merge pull request #109 from bvilhjal/develop
Browse files Browse the repository at this point in the history
Develop
  • Loading branch information
bvilhjal authored Oct 11, 2019
2 parents 737d69c + 69a7631 commit 3db18bb
Show file tree
Hide file tree
Showing 71 changed files with 138,680 additions and 130,286 deletions.
43 changes: 35 additions & 8 deletions LDpred.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,17 @@

import sys
import textwrap
import random
import scipy

__version__ = '1.0.6'
__date__ = '18 March 2019'
__version__ = '1.0.7'
__date__ = '11 October 2019'

#Set a random seed to avoid varying results.
random_seed=42
random.seed(random_seed)
scipy.random.seed(random_seed)

title = '\033[96mLDpred v. %s\033[0m'%__version__
title_len= len(title)-9
num_dashes = reporting.window_length-title_len-2
Expand Down Expand Up @@ -103,7 +110,7 @@
parser_coord.add_argument('--max-freq-discrep', type=float, default=0.1,
help='Max frequency discrepancy allowed between reported sum stats frequency and frequency '
'in the LD reference data. To disable check, set to 1.')
parser_coord.add_argument('--ssf-format', type=str, default="CUSTOM", choices={'CUSTOM','STANDARD','GIANT', 'PGC'},
parser_coord.add_argument('--ssf-format', type=str, default="CUSTOM", choices={'LDPRED','CUSTOM','STANDARD','GIANT', 'PGC'},
help='This is the format type of the summary statistics file. '
'By default the CUSTOM format requires the user to specify the file format using additional '
'arguments.')
Expand All @@ -128,6 +135,9 @@
help="Column header containing the P-value information")
parser_coord.add_argument('--eff', type=str, default="OR",
help="Column header containing effect size information")
parser_coord.add_argument('--se', type=str, default="SE",
help='Column header containing standard error. Note: these are only used if corresponding p-values'
' are smaller than approximately 1E-325, or if --z-from-se is set.')
parser_coord.add_argument('--ncol', type=str, default="N",
help="Column header containing sample size information")
parser_coord.add_argument('--case-freq', type=str, default=None,
Expand All @@ -138,6 +148,9 @@
help="Column header containing case sample size information")
parser_coord.add_argument('--control-n', type=str, default=None,
help="Column header containing control sample size information")
parser_coord.add_argument('--z-from-se', default=False, action='store_true',
help='If this option is set, then the effects use by LDpred are derived using the effect estimates '
'and their standard errors. (Only possible if standard errors are available in the summary stats)')
# parser_coord.add_argument('--gmdir', type=str,
# help='The directory of genetic map.', default=None)

Expand All @@ -160,15 +173,22 @@
parser_gibbs.add_argument('--f', default=[1,0.3,0.1,0.03,0.01,0.003,0.001], nargs='+', type=float,
help="Fraction of causal variants used in the Gibbs sampler")

parser_gibbs.add_argument('--N', type=int, default=100000, required=True,
parser_gibbs.add_argument('--N', type=int, default=None,
help='Number of individuals on which the summary statistics are assumed to be based on.')
parser_gibbs.add_argument('--n-iter', type=int, default=60,
help='The number of iterations used by the Gibbs sampler. The default is 60.')
parser_gibbs.add_argument('--n-burn-in', type=int, default=5,
help='The number of burn-in iterations used by the Gibbs sampler. The default is 5.')
parser_gibbs.add_argument('--h2', type=float, default=None,
help='The heritability assumed by LDpred. By default it estimates the heritability from'
' the GWAS summary statistics using LD score regression (Bulik-Sullivan et al., Nat Genet 2015).')
help='The genome-wide heritability assumed by LDpred, which is then partitioned '
'proportional to the number of SNPs on each chromosome. By default it estimates the '
'heritability for each chromosome from the GWAS summary statistics using LD score '
'regression (Bulik-Sullivan et al., Nat Genet 2015).')
parser_gibbs.add_argument('--use-gw-h2', default=False, action='store_true',
help='Estimate heritability genome-wide and partition it proportional to the number of SNPs on each chromsome '
'instead of estimating it for each chromosome separately. This is like a likely good choice if '
'the summary statistics used are based on small sample sizes (approx <50K), or if the trait is '
'not very heritable.')
parser_gibbs.add_argument('--no-ld-compression', default=False, action='store_true',
help='Do not compress LD information. Saves storing and loading time of LD information, '
'but takes more space on disk.')
Expand Down Expand Up @@ -199,8 +219,15 @@
parser_inf.add_argument('--N', type=int, default=100000, required=True,
help='Number of individuals on which the summary statistics are assumed to be based on.')
parser_inf.add_argument('--h2', type=float, default=None,
help='The heritability assumed by LDpred. By default it estimates the heritability from'
' the GWAS summary statistics using LD score regression (Bulik-Sullivan et al., Nat Genet 2015).')
help='The genome-wide heritability assumed by LDpred, which is then partitioned '
'proportional to the number of SNPs on each chromosome. By default it estimates the '
'heritability for each chromosome from the GWAS summary statistics using LD score '
'regression (Bulik-Sullivan et al., Nat Genet 2015).')
parser_inf.add_argument('--use-gw-h2', default=False, action='store_true',
help='Estimate heritability genome-wide and partition it proportional to the number of SNPs on each chromsome '
'instead of estimating it for each chromosome separately. This is like a likely good choice if '
'the summary statistics used are based on small sample sizes (approx <50K), or if the trait is '
'not very heritable.')
parser_inf.add_argument('--no-ld-compression', default=False, action='store_true',
help='Do not compress LD information. Saves storing and loading time of LD information, '
'but takes more space on disk.')
Expand Down
16 changes: 12 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,19 +6,27 @@ LDpred is a Python based software package that adjusts GWAS summary statistics
for the effects of linkage disequilibrium (LD). The details of the method are
described in Vilhjalmsson et al. (AJHG 2015) [http://www.cell.com/ajhg/abstract/S0002-9297(15)00365-1]

* The current version is 1.0.6
* The current version is 1.0.7

## Getting Started ##
### News ###

Mar 18th, 2019: Version 1.0.6 released and is available on pip using
Oct 11th, 2019: Version 1.0.7 released and is available on pip using

`pip install ldpred`

Feb 28th, 2019: Added a wiki with Q and A. Please add questions (and answers)
https://github.com/bvilhjal/ldpred/wiki/LDpred-Wiki
Oct 11th, 2019:

  * Improved handling of variants with p-values rounded down to 0.

  * Added options to enable effect estimate inference from standard errors (--z-from-se) in the coordination step.

  * Fixed a serious bug that caused sample sizes in the summary stats file to not always be used correctly when provided.

  * LDpred gibbs can now handle differing sample sizes per variant effect, if they are parsed in the summary stats. This can improve convergence and accuracy substantially, but slows the Gibbs sampler (approx. 5x). It can be avoided by setting the --N flag.

  * If the heritability is not provided when running LDpred gibbs and inf, it now estimates it separately for each chromosome, which can improve accuracy if the summary stats sample size is large. To avoid this behavior, either set the --h2 flag, or use the --use-gw-h2 flag.

  * Improved reporting and other things.

### Requirements ###
LDpred currently requires three Python packages to be installed and in path. These
Expand Down
74 changes: 56 additions & 18 deletions ldpred/LD_pruning_thres.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,21 @@
import h5py
import scipy as sp
import time
import sys
from ldpred import ld
from ldpred import reporting
from ldpred import util

def smart_ld_pruning(beta_hats, ld_table, pvalues=None, max_ld=0.2, verbose=False):
def ld_clumping(beta_hats, ld_table, pruning_stat =None, pvalues=None, max_ld=0.2, verbose=False):
"""
Smart LD pruning.
Smart LD pruning. (Also known as LD clumping)
"""
if verbose:
print('Doing smart LD pruning')
print('LD clumping')
t0 = time.time()
if pvalues is not None:
if pruning_stat is not None:
pruning_vector = ld.smart_ld_pruning(pruning_stat, ld_table, max_ld=max_ld, verbose=verbose, reverse=True)
elif pvalues is not None:
pruning_vector = ld.smart_ld_pruning(pvalues, ld_table, max_ld=max_ld, verbose=verbose)
else:
pruning_vector = ld.smart_ld_pruning(beta_hats ** 2, ld_table, max_ld=max_ld, verbose=verbose, reverse=True)
Expand Down Expand Up @@ -46,14 +50,6 @@ def ld_pruning(data_file=None, ld_radius = None, out_file_prefix=None, p_thres=N
if has_phenotypes:
risk_scores = sp.zeros(num_individs)

print('')
if max_r2<1:
print('Applying LD-pruning + P-value thresholding with p-value threshold of %0.2e, a LD radius of %d SNPs, and a max r2 of %0.2f' %(p_thres, ld_radius, max_r2))
else:
if p_thres<1:
print('Applying P-value thresholding with p-value threshold of %0.2e' %(p_thres))
else:
print('Calculating polygenic risk score using all SNPs')
results_dict = {}
num_snps = 0
cord_data_g = df['cord_data']
Expand Down Expand Up @@ -121,7 +117,8 @@ def ld_pruning(data_file=None, ld_radius = None, out_file_prefix=None, p_thres=N
norm_ref_snps = sp.array((raw_snps - snp_means)/snp_stds,dtype='float32')
ld_table = ld.calc_ld_table(norm_ref_snps, max_ld_dist=ld_radius, min_r2=max_r2, verbose=verbose)

updated_raw_betas, pruning_vector = smart_ld_pruning(raw_betas, ld_table, pvalues=pvalues, max_ld=max_r2, verbose=verbose)
updated_raw_betas, pruning_vector = ld_clumping(raw_betas, ld_table, pruning_stat=pval_derived_betas**2,
max_ld=max_r2, verbose=verbose)
updated_pval_derived_betas = pval_derived_betas * pruning_vector
num_snps_used += sp.sum(pruning_vector)
else:
Expand All @@ -144,9 +141,10 @@ def ld_pruning(data_file=None, ld_radius = None, out_file_prefix=None, p_thres=N
r2 = corr ** 2
print('The R2 prediction accuracy of PRS using %s was: %0.4f' %(chrom_str, r2))


print('There were %d (SNP) effects after p-value thresholding' % tot_num_snps)
print('After LD-pruning %d SNPs had non-zero effects'%num_snps_used)
if verbose:
print('There were %d (SNP) effects after p-value thresholding' % tot_num_snps)
print('After LD-pruning %d SNPs had non-zero effects'%num_snps_used)

if has_phenotypes:
results_dict[p_str]['y']=y
results_dict[p_str]['risk_scores']=risk_scores
Expand Down Expand Up @@ -177,9 +175,49 @@ def ld_pruning(data_file=None, ld_radius = None, out_file_prefix=None, p_thres=N
f.write('%s %d %s %s %s %0.4e %0.4e %0.4e %0.4e\n'%(chrom, pos, sid, nt1, nt2, raw_beta, raw_pval_beta, upd_beta, upd_pval_beta))
df.close()

def main(p_dict):
def run_pt(p_dict, summary_dict=None):
    """Run LD-pruning + P-value thresholding (P+T) over all requested thresholds.

    Parameters
    ----------
    p_dict : dict
        Parsed argument dictionary.  Reads 'p' (list of p-value thresholds),
        'r2' (list of max r2 LD thresholds), 'ldr' (LD radius in SNPs),
        'cf' (coordinated data file), 'out' (output file prefix), and 'debug'.
    summary_dict : dict, optional
        Accumulator for run-summary entries; mutated in place and also
        readable by the caller afterwards.
    """
    # NOTE(review): the original signature used a mutable default
    # (summary_dict={}), which is shared across calls; a fresh dict per
    # call preserves the intended behavior without that pitfall.
    if summary_dict is None:
        summary_dict = {}
    # Loop-invariant: the same r2 thresholds apply to every p-value threshold.
    max_r2s = p_dict['r2']
    pi = 0
    for p_thres in reversed(p_dict['p']):
        if p_dict['debug']:
            print('')
            if len(max_r2s) > 1 or max_r2s[0] < 1:
                r2s_string = ', '.join(['%0.2f' % r2 for r2 in max_r2s])
                print('Applying LD-pruning + P-value thresholding with p-value threshold of %0.2e, a LD radius of %d SNPs, and a max r2(s) of %s' % (p_thres, p_dict['ldr'], r2s_string))
            elif p_thres < 1:
                print('Applying P-value thresholding with p-value threshold of %0.2e' % (p_thres))
            else:
                print('Calculating polygenic risk score using all SNPs')
        else:
            # In-place textual progress indicator (backspaces overwrite the
            # previously printed percentage).
            sys.stdout.write('\b\b\b\b\b\b\b%0.2f%%' % (100.0 * (float(pi) / len(p_dict['p']))))
            sys.stdout.flush()

        ld_pruning(data_file=p_dict['cf'], out_file_prefix=p_dict['out'], p_thres=p_thres,
                   ld_radius=p_dict['ldr'], max_r2s=max_r2s)
        pi += 1
    if not p_dict['debug']:
        sys.stdout.write('\b\b\b\b\b\b\b%0.2f%%\n' % (100.0))
        sys.stdout.flush()
    summary_dict[1.2] = {'name': 'P-value thresholds used', 'value': str(p_dict['p'])}
    summary_dict[1.3] = {'name': 'Maximum r2 LD thresholds used', 'value': str(max_r2s)}


def main(p_dict):
    """Entry point for the P+T (LD-pruning + thresholding) sub-command.

    Builds the run-summary dictionary, runs P+T via run_pt, times the run,
    and prints the summary report.
    """
    summary_dict = {}
    summary_dict[0] = {'name': 'Coordinated data filename', 'value': p_dict['cf']}
    summary_dict[0.1] = {'name': 'SNP weights output file (prefix)', 'value': p_dict['out']}
    summary_dict[1] = {'name': 'LD radius used', 'value': str(p_dict['ldr'])}
    t0 = time.time()
    # 'dash' entries render as section separators in the summary report.
    summary_dict[1.1] = {'name': 'dash', 'value': 'LD-pruning + Thresholding'}
    run_pt(p_dict, summary_dict)
    t1 = time.time()
    t = t1 - t0
    summary_dict[2] = {'name': 'Running time for calculating P+T:',
                       'value': '%d min and %0.2f secs' % (t / 60, t % 60)}

    # Typo fixed in the printed title: 'Tresholding' -> 'Thresholding'.
    reporting.print_summary(summary_dict, 'Summary of LD-pruning + Thresholding')






Loading

0 comments on commit 3db18bb

Please sign in to comment.