From 0584337a61dba42637192633adfde74cb9ba4120 Mon Sep 17 00:00:00 2001
From: Yavin4 <pspahn@eng.ucsd.edu>
Date: Thu, 24 Aug 2017 14:00:16 -0700
Subject: [PATCH] v2.7.5

---
 Scripts/CheckLibrary.py | 65 ++++++++++++++++++++++++++++++++++++++++
 Scripts/FindHits.py     | 66 +----------------------------------------
 Scripts/PinAPL.py       |  9 +++---
 Scripts/PrintStatus.py  |  4 +--
 configuration.yaml      |  3 +-
 5 files changed, 75 insertions(+), 72 deletions(-)
 create mode 100755 Scripts/CheckLibrary.py

diff --git a/Scripts/CheckLibrary.py b/Scripts/CheckLibrary.py
new file mode 100755
index 0000000..1063791
--- /dev/null
+++ b/Scripts/CheckLibrary.py
@@ -0,0 +1,65 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Oct 10 10:22:56 2016
+
+@author: philipp
+"""
+# Library sanity check
+# =======================================================================
+# Imports
+import yaml
+import sys
+import os
+import pandas
+
+def LibrarySanityCheck():
+    # ------------------------------------------------
+    # Get parameters
+    # ------------------------------------------------
+    configFile = open('configuration.yaml','r')
+    config = yaml.load(configFile)
+    configFile.close()
+    LibDir = config['LibDir']
+    LibFilename = config['LibFilename']
+    LibFormat = LibFilename[-3:]
+    if LibFormat == 'tsv':
+        libsep = '\t'
+    elif LibFormat == 'csv':
+        libsep = ','
+
+    # ----------------------------------
+    # Read library
+    # ----------------------------------
+    os.chdir(LibDir)
+    LibCols = ['gene','ID','seq']
+    LibFile = pandas.read_table(LibFilename, sep = libsep, skiprows = 1, names = LibCols)
+    GeneNames = list(LibFile['gene'])
+    ID = list(LibFile['ID'])
+    seq = list(LibFile['seq'])
+
+    # ----------------------------------
+    # Replace non-printable characters (...these cause problems in PlotCount.py)
+    # ----------------------------------    
+    GeneNames0 = []
+    for gene in GeneNames:
+        gene = gene.replace('|','_')
+        gene = gene.replace('(','_')
+        gene = gene.replace(')','_')
+        gene = gene.replace(';','_')
+        gene = gene.replace('"','')
+        gene = gene.replace('/','_')
+        gene = gene.replace('\\','_')
+        GeneNames0.append(gene)
+    if GeneNames != GeneNames0:
+            LibFile0 = pandas.DataFrame(data = {'gene': [gene for gene in GeneNames0],
+                                     'ID': [id for id in ID],
+                                     'seq': [s for s in seq]},
+                            columns = ['gene','ID','seq'])
+            LibFile0.to_csv(LibFilename, sep = libsep, index = False)
+            print("WARNING: Found non-printable characters in library file. Replaced by '_' ")
+
+
+   
+if __name__ == "__main__":
+    LibrarySanityCheck()
diff --git a/Scripts/FindHits.py b/Scripts/FindHits.py
index c46c7e8..c924578 100755
--- a/Scripts/FindHits.py
+++ b/Scripts/FindHits.py
@@ -146,70 +146,7 @@ def PrepareHitList(sample):
         QQPlot(NBpval,significant,pvalDir,sample,res,svg,alpha)
         zScorePlot(fc,significant,pvalDir,ScreenType,sample,res,svg,alpha)
 
-    
-#    # Compute fold change (compared to control)
-#    print('Computing fold changes ...')
-#    fc = list()
-#    for k in range(L):
-#        if x[k]==0 or mu[k]==0:
-#            fc.append((x[k]+delta)/(mu[k]+delta))
-#        else:
-#            fc.append(x[k]/mu[k])
-#    # Compute negative binomial p-values    
-#    if max(sigma2) > 0: 
-#        print('Computing p-values ...')
-#        # Neg. Binom. Parameters  n: number of failures, p: probability of failure       
-#        n = list(); p = list()
-#        for i in range(L):
-#            if mu[i]==0 or sigma2[i]==0:
-#                n.append(((mu[i]+delta)**2/(sigma2[i]+2*delta))/(1-(mu[i]+delta)/(sigma2[i]+2*delta)))
-#                p.append((mu[i]+delta)/(sigma2[i]+2*delta))
-#            else:
-#                n.append((mu[i]**2/sigma2[i])/(1-mu[i]/sigma2[i]))
-#                p.append(mu[i]/sigma2[i])
-#        NBpval = list(); 
-#        if ScreenType == 'enrichment':
-#            for i in range(L):
-#                if mu[i]==0 and x[i]==0:
-#                    NBpval.append(1)
-#                elif x[i]<=mu[i]:
-#                    NBpval.append(1)
-#                else: 
-#                    NBpval.append(1 - scipy.stats.nbinom.cdf(x[i],n[i],p[i]))
-#        elif ScreenType == 'depletion':
-#            for i in range(L):
-#                if mu[i]==0 and x[i]==0:
-#                    NBpval.append(1)
-#                elif x[i]>=mu[i]:
-#                    NBpval.append(1)                                                           
-#                else:
-#                    NBpval.append(scipy.stats.nbinom.cdf(x[i],n[i],p[i]))
-#        else:
-#            print('ERROR: Check spelling of ScreenType in configuration file!')
-#        # Compute two-sided pvalues (for volcano plot only!)
-#        NBpval2 = list()
-#        for i in range(L):
-#            if x[i]<=mu[i]:
-#                NBpval2.append(scipy.stats.nbinom.cdf(x[i],n[i],p[i]))
-#            else:
-#                NBpval2.append(1 - scipy.stats.nbinom.cdf(x[i],n[i],p[i]))                        
-#        # p-value correction for multiple tests
-#        print('p-value correction ...')
-#        multTest = multipletests(NBpval,alpha,padj)
-#        significant = multTest[0]
-#        NBpval_0 = multTest[1]
-#        # Plots
-#        print('Plotting p-values ...')
-#        pvalHist(NBpval,NBpval_0,pvalDir,sample,res,svg)
-#        VolcanoPlot(fc,NBpval2,significant,pvalDir,ScreenType,sample,res,svg,alpha)
-#        QQPlot(NBpval,significant,pvalDir,sample,res,svg,alpha)
-#        zScorePlot(fc,significant,pvalDir,ScreenType,sample,res,svg,alpha)
-#    else:         # no control replicates
-#        print('WARNING: No control replicates! No p-values computed...')
-#        NBpval = [1 for k in range(L)]
-#        NBpval_0 = [1 for k in range(L)]
-#        significant = [False for k in range(L)]
-            
+               
     # -----------------------------------------------
     # Save sgRNA dataframe
     # -----------------------------------------------
@@ -235,7 +172,6 @@ def PrepareHitList(sample):
         Results_df_0 = Results_df.sort_values(['significant','p-value','fold change','sgRNA'],ascending=[0,1,1,1])
     ListFilename = sample+'_'+str(alpha)+'_'+padj+'_sgRNAList.tsv'
     Results_df_0.to_csv(ListFilename, sep = '\t', index = False)
-    #Results_df_0.to_hdf(ListFilename+'.hdf','df')
     if SheetFormat == 'xlsx':
         print('Converting to xlsx ...')
         ListFilename = sample+'_'+str(alpha)+'_'+padj+'_sgRNAList.xlsx'
diff --git a/Scripts/PinAPL.py b/Scripts/PinAPL.py
index 7eba807..b113b57 100755
--- a/Scripts/PinAPL.py
+++ b/Scripts/PinAPL.py
@@ -32,6 +32,7 @@
 AlnQCDir = config['AlnQCDir']
 SeqQCDir = config['SeqQCDir']
 LogFileDir = config['LogFileDir']
+SanityScript = config['SanityScript']
 IndexScript = config['IndexScript']
 LoaderScript = config['LoaderScript']
 ReadDepthScript = config['ReadDepthScript']
@@ -55,6 +56,9 @@
 os.system('python -u PrintStatus.py Header blank 2>&1 | tee PinAPL-Py.log')
 start = time.time()
 
+# Library sanity check
+os.system('python -u '+SanityScript+'.py 2>&1 | tee -a PinAPL-Py.log')
+
 # Generate index if not present
 if not os.path.exists(IndexDir):
     StatMsg = 'Building library index ...'
@@ -196,7 +200,4 @@
 os.system('cp configuration.yaml '+LogFileDir)
 os.system('cp PinAPL-Py.log '+LogFileDir)
 os.chdir(WorkingDir)
-os.system('cp DataSheet.xlsx '+LogFileDir)
-
-# Status message
-print('LOADING RESULTS PAGE. PLEASE REFRESH PERIODICALLY...')
+os.system('cp DataSheet.xlsx '+LogFileDir)
\ No newline at end of file
diff --git a/Scripts/PrintStatus.py b/Scripts/PrintStatus.py
index 646ced3..01801d0 100755
--- a/Scripts/PrintStatus.py
+++ b/Scripts/PrintStatus.py
@@ -12,8 +12,8 @@
 
 def PrintStatus_Header():
     print('**************************************************')
-    print('Launching PinAPL-Py v2.7.4..')
-    print('P. Spahn et al., UC San Diego (07/2017)')
+    print('Launching PinAPL-Py v2.7.5..')
+    print('P. Spahn et al., UC San Diego (08/2017)')
     print('**************************************************')
     
 def PrintStatus_SubHeader(msg):
diff --git a/configuration.yaml b/configuration.yaml
index bd888f5..76f93a5 100644
--- a/configuration.yaml
+++ b/configuration.yaml
@@ -31,7 +31,7 @@ alpha: 0.001           # critical false-discovery rate for sign. sgRNAs/genes
 P_0: 0.00005           # maximum p-value for sgRNA to be taken into account for aRRA
 Np: 1000               # number of permutations for gene ranking analysis
 TopN: 25               # number of top sgRNAs to take into account for clustering
-thr_STARS: 20          # threshold percentage for STARS analysis
+thr_STARS: 10          # threshold percentage for STARS analysis
 CutErrorTol: 0.1       # cutadapt error tolerance
 R_min: 20             # minimal required read length after cutadapt trimming
 L_bw: 11               # Bowtie2 -L parameter (seed length)
@@ -83,6 +83,7 @@ CutAdaptDir: '/root/.local/bin/'
 STARSDir: '/opt/PinAPL-Py/Scripts/STARS_mod/'
 
 # SCRIPT FILENAMES
+SanityScript: 'CheckLibrary'
 IndexScript: 'BuildLibraryIndex'
 LoaderScript: 'LoadDataSheet'
 ReadDepthScript: 'PlotNumReads'