From 0584337a61dba42637192633adfde74cb9ba4120 Mon Sep 17 00:00:00 2001 From: Yavin4 Date: Thu, 24 Aug 2017 14:00:16 -0700 Subject: [PATCH] v2.7.5 --- Scripts/CheckLibrary.py | 65 ++++++++++++++++++++++++++++++++++++++++ Scripts/FindHits.py | 66 +---------------------------------------- Scripts/PinAPL.py | 9 +++--- Scripts/PrintStatus.py | 4 +-- configuration.yaml | 3 +- 5 files changed, 75 insertions(+), 72 deletions(-) create mode 100755 Scripts/CheckLibrary.py diff --git a/Scripts/CheckLibrary.py b/Scripts/CheckLibrary.py new file mode 100755 index 0000000..1063791 --- /dev/null +++ b/Scripts/CheckLibrary.py @@ -0,0 +1,65 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- +""" +Created on Mon Oct 10 10:22:56 2016 + +@author: philipp +""" +# Library sanity check +# ======================================================================= +# Imports +import yaml +import sys +import os +import pandas + +def LibrarySanityCheck(): + # ------------------------------------------------ + # Get parameters + # ------------------------------------------------ + configFile = open('configuration.yaml','r') + config = yaml.load(configFile) + configFile.close() + LibDir = config['LibDir'] + LibFilename = config['LibFilename'] + LibFormat = LibFilename[-3:] + if LibFormat == 'tsv': + libsep = '\t' + elif LibFormat == 'csv': + libsep = ',' + + # ---------------------------------- + # Read library + # ---------------------------------- + os.chdir(LibDir) + LibCols = ['gene','ID','seq'] + LibFile = pandas.read_table(LibFilename, sep = libsep, skiprows = 1, names = LibCols) + GeneNames = list(LibFile['gene']) + ID = list(LibFile['ID']) + seq = list(LibFile['seq']) + + # ---------------------------------- + # Replace non-printable characters (...these cause problems in PlotCount.py) + # ---------------------------------- + GeneNames0 = [] + for gene in GeneNames: + gene = gene.replace('|','_') + gene = gene.replace('(','_') + gene = gene.replace(')','_') + gene = gene.replace(';','_') + gene = gene.replace('"','') + gene = gene.replace('/','_') + gene = gene.replace('\\','_') + GeneNames0.append(gene) + if GeneNames != GeneNames0: + LibFile0 = pandas.DataFrame(data = {'gene': [gene for gene in GeneNames0], + 'ID': [id for id in ID], + 'seq': [s for s in seq]}, + columns = ['gene','ID','seq']) + LibFile0.to_csv(LibFilename, sep = libsep, index = False) + print("WARNING: Found non-printable characters in library file. Replaced by '_' ") + + + +if __name__ == "__main__": + LibrarySanityCheck() diff --git a/Scripts/FindHits.py b/Scripts/FindHits.py index c46c7e8..c924578 100755 --- a/Scripts/FindHits.py +++ b/Scripts/FindHits.py @@ -146,70 +146,7 @@ def PrepareHitList(sample): QQPlot(NBpval,significant,pvalDir,sample,res,svg,alpha) zScorePlot(fc,significant,pvalDir,ScreenType,sample,res,svg,alpha) - -# # Compute fold change (compared to control) -# print('Computing fold changes ...') -# fc = list() -# for k in range(L): -# if x[k]==0 or mu[k]==0: -# fc.append((x[k]+delta)/(mu[k]+delta)) -# else: -# fc.append(x[k]/mu[k]) -# # Compute negative binomial p-values -# if max(sigma2) > 0: -# print('Computing p-values ...') -# # Neg. Binom. Parameters n: number of failures, p: probability of failure -# n = list(); p = list() -# for i in range(L): -# if mu[i]==0 or sigma2[i]==0: -# n.append(((mu[i]+delta)**2/(sigma2[i]+2*delta))/(1-(mu[i]+delta)/(sigma2[i]+2*delta))) -# p.append((mu[i]+delta)/(sigma2[i]+2*delta)) -# else: -# n.append((mu[i]**2/sigma2[i])/(1-mu[i]/sigma2[i])) -# p.append(mu[i]/sigma2[i]) -# NBpval = list(); -# if ScreenType == 'enrichment': -# for i in range(L): -# if mu[i]==0 and x[i]==0: -# NBpval.append(1) -# elif x[i]<=mu[i]: -# NBpval.append(1) -# else: -# NBpval.append(1 - scipy.stats.nbinom.cdf(x[i],n[i],p[i])) -# elif ScreenType == 'depletion': -# for i in range(L): -# if mu[i]==0 and x[i]==0: -# NBpval.append(1) -# elif x[i]>=mu[i]: -# NBpval.append(1) -# else: -# NBpval.append(scipy.stats.nbinom.cdf(x[i],n[i],p[i])) -# else: -# print('ERROR: Check spelling of ScreenType in configuration file!') -# # Compute two-sided pvalues (for volcano plot only!) -# NBpval2 = list() -# for i in range(L): -# if x[i]<=mu[i]: -# NBpval2.append(scipy.stats.nbinom.cdf(x[i],n[i],p[i])) -# else: -# NBpval2.append(1 - scipy.stats.nbinom.cdf(x[i],n[i],p[i])) -# # p-value correction for multiple tests -# print('p-value correction ...') -# multTest = multipletests(NBpval,alpha,padj) -# significant = multTest[0] -# NBpval_0 = multTest[1] -# # Plots -# print('Plotting p-values ...') -# pvalHist(NBpval,NBpval_0,pvalDir,sample,res,svg) -# VolcanoPlot(fc,NBpval2,significant,pvalDir,ScreenType,sample,res,svg,alpha) -# QQPlot(NBpval,significant,pvalDir,sample,res,svg,alpha) -# zScorePlot(fc,significant,pvalDir,ScreenType,sample,res,svg,alpha) -# else: # no control replicates -# print('WARNING: No control replicates! No p-values computed...') -# NBpval = [1 for k in range(L)] -# NBpval_0 = [1 for k in range(L)] -# significant = [False for k in range(L)] - + # ----------------------------------------------- # Save sgRNA dataframe # ----------------------------------------------- @@ -235,7 +172,6 @@ def PrepareHitList(sample): Results_df_0 = Results_df.sort_values(['significant','p-value','fold change','sgRNA'],ascending=[0,1,1,1]) ListFilename = sample+'_'+str(alpha)+'_'+padj+'_sgRNAList.tsv' Results_df_0.to_csv(ListFilename, sep = '\t', index = False) - #Results_df_0.to_hdf(ListFilename+'.hdf','df') if SheetFormat == 'xlsx': print('Converting to xlsx ...') ListFilename = sample+'_'+str(alpha)+'_'+padj+'_sgRNAList.xlsx' diff --git a/Scripts/PinAPL.py b/Scripts/PinAPL.py index 7eba807..b113b57 100755 --- a/Scripts/PinAPL.py +++ b/Scripts/PinAPL.py @@ -32,6 +32,7 @@ AlnQCDir = config['AlnQCDir'] SeqQCDir = config['SeqQCDir'] LogFileDir = config['LogFileDir'] +SanityScript = config['SanityScript'] IndexScript = config['IndexScript'] LoaderScript = config['LoaderScript'] ReadDepthScript = config['ReadDepthScript'] @@ -55,6 +56,9 @@ os.system('python -u PrintStatus.py Header blank 2>&1 | tee PinAPL-Py.log') start = time.time() +# Library sanity check +os.system('python -u '+SanityScript+'.py 2>&1 | tee -a PinAPL-Py.log') + # Generate index if not present if not os.path.exists(IndexDir): StatMsg = 'Building library index ...' @@ -196,7 +200,4 @@ os.system('cp configuration.yaml '+LogFileDir) os.system('cp PinAPL-Py.log '+LogFileDir) os.chdir(WorkingDir) -os.system('cp DataSheet.xlsx '+LogFileDir) - -# Status message -print('LOADING RESULTS PAGE. PLEASE REFRESH PERIODICALLY...') +os.system('cp DataSheet.xlsx '+LogFileDir) \ No newline at end of file diff --git a/Scripts/PrintStatus.py b/Scripts/PrintStatus.py index 646ced3..01801d0 100755 --- a/Scripts/PrintStatus.py +++ b/Scripts/PrintStatus.py @@ -12,8 +12,8 @@ def PrintStatus_Header(): print('**************************************************') - print('Launching PinAPL-Py v2.7.4..') - print('P. Spahn et al., UC San Diego (07/2017)') + print('Launching PinAPL-Py v2.7.5..') + print('P. Spahn et al., UC San Diego (08/2017)') print('**************************************************') def PrintStatus_SubHeader(msg): diff --git a/configuration.yaml b/configuration.yaml index bd888f5..76f93a5 100644 --- a/configuration.yaml +++ b/configuration.yaml @@ -31,7 +31,7 @@ alpha: 0.001 # critical false-discovery rate for sign. sgRNAs/genes P_0: 0.00005 # maximum p-value for sgRNA to be taken into account for aRRA Np: 1000 # number of permutations for gene ranking analysis TopN: 25 # number of top sgRNAs to take into account for clustering -thr_STARS: 20 # threshold percentage for STARS analysis +thr_STARS: 10 # threshold percentage for STARS analysis CutErrorTol: 0.1 # cutadapt error tolerance R_min: 20 # minimal required read length after cutadapt trimming L_bw: 11 # Bowtie2 -L parameter (seed length) @@ -83,6 +83,7 @@ CutAdaptDir: '/root/.local/bin/' STARSDir: '/opt/PinAPL-Py/Scripts/STARS_mod/' # SCRIPT FILENAMES +SanityScript: 'CheckLibrary' IndexScript: 'BuildLibraryIndex' LoaderScript: 'LoadDataSheet' ReadDepthScript: 'PlotNumReads'