dimsum_scripts/dimsum-latest.py

#!/usr/bin/env python
# coding: utf-8

# # DiMSum script

# In[11]:


import pandas as pd
import numpy as np
import os


# In[12]:


fastq_dir = '/home/rodur28/novaseq_run2/reads/'
outpath = '/home/rodur28/novaseq_run2_dimsum-latest/'
path_to_layout = '/home/rodur28/novaseq_run2_analysis/20221205_NovaSeq_DMS_sample_metadata.xlsx'
path_to_nbgen = '/home/rodur28/novaseq_run2_analysis/nbgen.xlsx'
lc = ['strain','locus','pool_type','compound'] # list of attributes to group samples together
sample_label = 'nanuq_id' # column header corresponding to sample id in layout file


# ## Generate DimSum experimental design file

# Script v1:
# 1. User provides a sample id
# 2. A common set of conditions is extracted from the sample layout
# 3. Samples sharing the set of conditions, including precultures (input) and the single corresponding selection condition (output) are extracted
# 4. Experimental design file is created in DiMSum format
# 
# **Script v2**:
# 1. All samples in sample layout are grouped by set of conditions
# 2. One sample id per set is extracted
# 3. Script v1 steps 2-4 loop over for each sample id obtained in Script v2 step 2.

# In[13]:


layoutdf = pd.read_excel(path_to_layout, index_col=0)
layoutdf


# In[14]:


nbgendf = pd.read_excel(path_to_nbgen, index_col=0)
nbgendf.rename(columns={'id_number':'id'}, inplace=True)
nbgendf


# In[15]:


df = pd.merge(layoutdf, nbgendf, how='left')
df


# In[16]:


gby = df.groupby(lc).first().reset_index()[lc+[sample_label]] # excludes precultures
gby


# In[17]:


def get_condition_set(df, sample_col_label, s, list_conditions):
    condition_set = [] 
    for x in list_conditions:
            condition_set.append(df.loc[df[sample_col_label] == s, x].values[0])
    return condition_set


# In[18]:


def get_replicates_df(df, list_conditions, condition_set):
    subdf = df[eval('&'.join(['(df.{}==\'{}\')'.format(v, condition_set[i]) for i,v in enumerate(list_conditions[:-1])]))] # every sample meeting criteria except not restricting to last one = compound
    groupdf = subdf[subdf[list_conditions[-1]].isin([condition_set[-1],np.nan])]
    return groupdf.sort_values(by=list_conditions[-1]).reset_index(drop=True)


# In[19]:


for x in gby[sample_label]:
    c_set = get_condition_set(df, sample_label, x, lc)
    print(f"{x:15} {'Strain':8}{c_set[0]:10} {'Locus':8}{c_set[1]:12} {'Pool type':10}{c_set[2]:15} {'Selection compound':20}{c_set[3]}")
    
    rep_df = get_replicates_df(df, lc, c_set)

    design_df = pd.DataFrame(columns=['sname','compound','experiment_replicate','selection_id','selection_replicate','technical_replicate','pair1','pair2','cutadapt5First','cutadapt5Second','generations','selection_time'])
    pre_cultures = [x for x in rep_df[sample_label] if 'PC' in x]
    nb_precultures = len(pre_cultures)
    nb_samples = len(rep_df[sample_label])-nb_precultures
    
    design_df['sname'] = [x for x in rep_df[sample_label] if 'PC' not in x]*nb_precultures
    design_df['compound'] = design_df.sname.apply(lambda x: rep_df.loc[rep_df[sample_label]==x, 'compound'].values[0])
    nb_selection = len(design_df.compound.unique())
    
    exp_r = []
    for pc in range(nb_precultures*nb_selection):
        for s in range(nb_samples//nb_selection):
            exp_r.append(pc+1)
    design_df['experiment_replicate'] = exp_r
            
    design_df['selection_id'] = 1
    design_df['selection_replicate'] = design_df.sname.apply(lambda x: rep_df.loc[rep_df[sample_label]==x, 'replicate'].values[0])
    design_df['technical_replicate'] = ''
    design_df['generations'] = design_df.sname.apply(lambda x: rep_df.loc[rep_df[sample_label]==x, 'total_nb_gen'].values[0])
    design_df['selection_time'] = design_df.sname.apply(lambda x: rep_df.loc[rep_df[sample_label]==x, 'selection_time'].values[0])
    
    # Precultures are added separately
    # Replicates in the exp design are : up to 3 precultures, then inoculated 3 times per selection condition
    # I decided to launch dimsum analysis on 1 preculture + all replicates of all selection conditions, for each preculture
    # ..which means post-selection samples are duplicated every time for each new preculture / "experimental replicate"
    # This is because dimsum does not allow for replicates before selection
    
    # NOTE - Later on we'll have to decide how we want to calculate selection coefficient, particularly because of missing t0
    # In either of the chosen method, we'll probably need to edit expdesign_sampleid file to include precultures
    # If all corresponding precultures (input) are present, selection coefficient will most likely be calculated by pairing outputs with the appropriate input
    # In the case of missing t0, one might decide to discard the corresponding outputs
    # Alternatively, one can use the error model generated by DiMSum to determine if variability between inputs is low enough that we can infer the missing t0
    
    prec_d = {'sname': pre_cultures*nb_selection,
              'compound':['']*nb_precultures*nb_selection, 
              'experiment_replicate': range(1,nb_precultures*nb_selection+1), 
              'selection_id': [0]*nb_precultures*nb_selection, 
              'selection_replicate':['']*nb_precultures*nb_selection, 
              'technical_replicate':['']*nb_precultures*nb_selection, # Technical replicates should only be specified for samples which have been re-sequenced on different lanes
              'generations':['']*nb_precultures*nb_selection,
              'selection_time':['']*nb_precultures*nb_selection
             }
    
    fulldesign_df = pd.concat([design_df,pd.DataFrame(prec_d)]).reset_index(drop=True)
    fulldesign_df['sample_name'] = fulldesign_df.apply(lambda row: 'output'+str(row.experiment_replicate)+row.compound+str(row.selection_replicate) if row.selection_id==1 else 'input'+str(row.experiment_replicate)+row.compound+str(row.selection_replicate), axis=1)
    fulldesign_df['pair1'] = fulldesign_df.sname.apply(lambda x: rep_df.loc[rep_df[sample_label]==x, 'file_basename'].values[0] + '_R1.fastq.gz')
    fulldesign_df['pair2'] = fulldesign_df.sname.apply(lambda x: rep_df.loc[rep_df[sample_label]==x, 'file_basename'].values[0] + '_R2.fastq.gz')
    fulldesign_df['cutadapt5First'] = fulldesign_df.sname.apply(lambda x: rep_df.loc[rep_df[sample_label]==x, 'cutadapt5First'].values[0])
    fulldesign_df['cutadapt5Second'] = fulldesign_df.sname.apply(lambda x: rep_df.loc[rep_df[sample_label]==x, 'cutadapt5Second'].values[0])
    sample_name_col = fulldesign_df.pop('sample_name')
    fulldesign_df.insert(0, 'sample_name', sample_name_col)
    
    cset_name = '_'.join(c_set)
    expdesignpath = outpath+cset_name
    
    if not os.path.exists(expdesignpath):
        os.makedirs(expdesignpath)
    
    fulldesign_df.to_csv(expdesignpath+'/expdesign_sampleid.txt', sep='\t', index=False) # Dataframe is saved with sample ids and compound just in case, for downstream analysis
    expdesign_df = fulldesign_df.drop(columns=['sname','compound'])
    expdesign_df.to_csv(expdesignpath+'/expdesign.txt', sep='\t', index=False) # Experimental design file corresponds to the same dataframe without sample ids nor compound
    # Note - sample_name must contain alphanumeric characters only and typically won't work well is the name is complicated
    # Indicating input or output helps with downstream analyses
    # Including experimental and selection replicates is necessary to prevent duplicate names that are not technical replicates

    # DiMSum command
    
    if len(rep_df.wildtypeSequence.unique()) != 1:
        print('Error: multiple WT sequences in extracted samples')
    else:
        wtseq = rep_df.wildtypeSequence.unique()[0]
    
    cutoff_length = round(0.75*len(wtseq))
    wtseq
    
    cmdline = f'DiMSum --projectName {cset_name} --outputPath {outpath} --experimentDesignPath {expdesignpath}/expdesign.txt --startStage 1 --numCores 10 --cutadaptMinLength {cutoff_length} --wildtypeSequence {wtseq} --indels all --maxSubstitutions 9 --mixedSubstitutions T --fitnessMinInputCountAll 10 --fastqFileDir {fastq_dir} --experimentDesignPairDuplicates T'
    print(cmdline)
    
    # What follows allows to send directly the command to bash from the notebook (or the python script)
    import subprocess
    p = subprocess.Popen(cmdline.split(),stdout=subprocess.PIPE)
    while True:
        output = p.stdout.readline()
        if p.poll() is not None and output == b'':
            break
        if output:
            print (output.decode('utf-8').strip())
    retval = p.poll()