-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdimsum-latest.py
182 lines (127 loc) · 8.38 KB
/
dimsum-latest.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
#!/usr/bin/env python
# coding: utf-8
# # DiMSum script
# In[11]:
import pandas as pd
import numpy as np
import os
# In[12]:
fastq_dir = '/home/rodur28/novaseq_run2/reads/'
outpath = '/home/rodur28/novaseq_run2_dimsum-latest/'
path_to_layout = '/home/rodur28/novaseq_run2_analysis/20221205_NovaSeq_DMS_sample_metadata.xlsx'
path_to_nbgen = '/home/rodur28/novaseq_run2_analysis/nbgen.xlsx'
lc = ['strain','locus','pool_type','compound'] # list of attributes to group samples together
sample_label = 'nanuq_id' # column header corresponding to sample id in layout file
# ## Generate DimSum experimental design file
# Script v1:
# 1. User provides a sample id
# 2. A common set of conditions is extracted from the sample layout
# 3. Samples sharing the set of conditions, including precultures (input) and the single corresponding selection condition (output) are extracted
# 4. Experimental design file is created in DiMSum format
#
# **Script v2**:
# 1. All samples in sample layout are grouped by set of conditions
# 2. One sample id per set is extracted
# 3. Script v1 steps 2-4 loop over for each sample id obtained in Script v2 step 2.
# In[13]:
layoutdf = pd.read_excel(path_to_layout, index_col=0)
layoutdf
# In[14]:
nbgendf = pd.read_excel(path_to_nbgen, index_col=0)
nbgendf.rename(columns={'id_number':'id'}, inplace=True)
nbgendf
# In[15]:
df = pd.merge(layoutdf, nbgendf, how='left')
df
# In[16]:
gby = df.groupby(lc).first().reset_index()[lc+[sample_label]] # excludes precultures
gby
# In[17]:
def get_condition_set(df, sample_col_label, s, list_conditions):
condition_set = []
for x in list_conditions:
condition_set.append(df.loc[df[sample_col_label] == s, x].values[0])
return condition_set
# In[18]:
def get_replicates_df(df, list_conditions, condition_set):
subdf = df[eval('&'.join(['(df.{}==\'{}\')'.format(v, condition_set[i]) for i,v in enumerate(list_conditions[:-1])]))] # every sample meeting criteria except not restricting to last one = compound
groupdf = subdf[subdf[list_conditions[-1]].isin([condition_set[-1],np.nan])]
return groupdf.sort_values(by=list_conditions[-1]).reset_index(drop=True)
# In[19]:
for x in gby[sample_label]:
c_set = get_condition_set(df, sample_label, x, lc)
print(f"{x:15} {'Strain':8}{c_set[0]:10} {'Locus':8}{c_set[1]:12} {'Pool type':10}{c_set[2]:15} {'Selection compound':20}{c_set[3]}")
rep_df = get_replicates_df(df, lc, c_set)
design_df = pd.DataFrame(columns=['sname','compound','experiment_replicate','selection_id','selection_replicate','technical_replicate','pair1','pair2','cutadapt5First','cutadapt5Second','generations','selection_time'])
pre_cultures = [x for x in rep_df[sample_label] if 'PC' in x]
nb_precultures = len(pre_cultures)
nb_samples = len(rep_df[sample_label])-nb_precultures
design_df['sname'] = [x for x in rep_df[sample_label] if 'PC' not in x]*nb_precultures
design_df['compound'] = design_df.sname.apply(lambda x: rep_df.loc[rep_df[sample_label]==x, 'compound'].values[0])
nb_selection = len(design_df.compound.unique())
exp_r = []
for pc in range(nb_precultures*nb_selection):
for s in range(nb_samples//nb_selection):
exp_r.append(pc+1)
design_df['experiment_replicate'] = exp_r
design_df['selection_id'] = 1
design_df['selection_replicate'] = design_df.sname.apply(lambda x: rep_df.loc[rep_df[sample_label]==x, 'replicate'].values[0])
design_df['technical_replicate'] = ''
design_df['generations'] = design_df.sname.apply(lambda x: rep_df.loc[rep_df[sample_label]==x, 'total_nb_gen'].values[0])
design_df['selection_time'] = design_df.sname.apply(lambda x: rep_df.loc[rep_df[sample_label]==x, 'selection_time'].values[0])
# Precultures are added separately
# Replicates in the exp design are : up to 3 precultures, then inoculated 3 times per selection condition
# I decided to launch dimsum analysis on 1 preculture + all replicates of all selection conditions, for each preculture
# ..which means post-selection samples are duplicated every time for each new preculture / "experimental replicate"
# This is because dimsum does not allow for replicates before selection
# NOTE - Later on we'll have to decide how we want to calculate selection coefficient, particularly because of missing t0
# In either of the chosen method, we'll probably need to edit expdesign_sampleid file to include precultures
# If all corresponding precultures (input) are present, selection coefficient will most likely be calculated by pairing outputs with the appropriate input
# In the case of missing t0, one might decide to discard the corresponding outputs
# Alternatively, one can use the error model generated by DiMSum to determine if variability between inputs is low enough that we can infer the missing t0
prec_d = {'sname': pre_cultures*nb_selection,
'compound':['']*nb_precultures*nb_selection,
'experiment_replicate': range(1,nb_precultures*nb_selection+1),
'selection_id': [0]*nb_precultures*nb_selection,
'selection_replicate':['']*nb_precultures*nb_selection,
'technical_replicate':['']*nb_precultures*nb_selection, # Technical replicates should only be specified for samples which have been re-sequenced on different lanes
'generations':['']*nb_precultures*nb_selection,
'selection_time':['']*nb_precultures*nb_selection
}
fulldesign_df = pd.concat([design_df,pd.DataFrame(prec_d)]).reset_index(drop=True)
fulldesign_df['sample_name'] = fulldesign_df.apply(lambda row: 'output'+str(row.experiment_replicate)+row.compound+str(row.selection_replicate) if row.selection_id==1 else 'input'+str(row.experiment_replicate)+row.compound+str(row.selection_replicate), axis=1)
fulldesign_df['pair1'] = fulldesign_df.sname.apply(lambda x: rep_df.loc[rep_df[sample_label]==x, 'file_basename'].values[0] + '_R1.fastq.gz')
fulldesign_df['pair2'] = fulldesign_df.sname.apply(lambda x: rep_df.loc[rep_df[sample_label]==x, 'file_basename'].values[0] + '_R2.fastq.gz')
fulldesign_df['cutadapt5First'] = fulldesign_df.sname.apply(lambda x: rep_df.loc[rep_df[sample_label]==x, 'cutadapt5First'].values[0])
fulldesign_df['cutadapt5Second'] = fulldesign_df.sname.apply(lambda x: rep_df.loc[rep_df[sample_label]==x, 'cutadapt5Second'].values[0])
sample_name_col = fulldesign_df.pop('sample_name')
fulldesign_df.insert(0, 'sample_name', sample_name_col)
cset_name = '_'.join(c_set)
expdesignpath = outpath+cset_name
if not os.path.exists(expdesignpath):
os.makedirs(expdesignpath)
fulldesign_df.to_csv(expdesignpath+'/expdesign_sampleid.txt', sep='\t', index=False) # Dataframe is saved with sample ids and compound just in case, for downstream analysis
expdesign_df = fulldesign_df.drop(columns=['sname','compound'])
expdesign_df.to_csv(expdesignpath+'/expdesign.txt', sep='\t', index=False) # Experimental design file corresponds to the same dataframe without sample ids nor compound
# Note - sample_name must contain alphanumeric characters only and typically won't work well is the name is complicated
# Indicating input or output helps with downstream analyses
# Including experimental and selection replicates is necessary to prevent duplicate names that are not technical replicates
# DiMSum command
if len(rep_df.wildtypeSequence.unique()) != 1:
print('Error: multiple WT sequences in extracted samples')
else:
wtseq = rep_df.wildtypeSequence.unique()[0]
cutoff_length = round(0.75*len(wtseq))
wtseq
cmdline = f'DiMSum --projectName {cset_name} --outputPath {outpath} --experimentDesignPath {expdesignpath}/expdesign.txt --startStage 1 --numCores 10 --cutadaptMinLength {cutoff_length} --wildtypeSequence {wtseq} --indels all --maxSubstitutions 9 --mixedSubstitutions T --fitnessMinInputCountAll 10 --fastqFileDir {fastq_dir} --experimentDesignPairDuplicates T'
print(cmdline)
# What follows allows to send directly the command to bash from the notebook (or the python script)
import subprocess
p = subprocess.Popen(cmdline.split(),stdout=subprocess.PIPE)
while True:
output = p.stdout.readline()
if p.poll() is not None and output == b'':
break
if output:
print (output.decode('utf-8').strip())
retval = p.poll()