Bjorn1 sitrep #15 (Open)

wants to merge 135 commits into base: master

Changes from all commits (135 commits)
832fc8d
script for downloading GISAID API feed, generating fasta file and pro…
AlaaALatif Feb 21, 2021
caa3064
separate env for api data processing, due to issues installing geopan…
AlaaALatif Feb 21, 2021
bfa480b
fix strain processing in metadata, require username as input arg
AlaaALatif Feb 22, 2021
9c3d6d0
modify aggregate_replacements() and aggregate_deletions() to avoid pa…
AlaaALatif Feb 22, 2021
713703e
always add ref seq to input fasta to avoid downstream failures, teste…
AlaaALatif Feb 22, 2021
a61afa2
fix aggregate funcs to convert date to str to prevent NaN issues
AlaaALatif Feb 22, 2021
d7b2a70
minor formatting fixes
AlaaALatif Feb 22, 2021
a91940d
dev nb, transfer to local machine
AlaaALatif Feb 22, 2021
2a0df49
generate lowercased versions of location info
AlaaALatif Feb 22, 2021
75a1bdf
handle exceptions with mutation aggregation
AlaaALatif Feb 22, 2021
fd5365a
fix mutation aggregation funcs
AlaaALatif Feb 22, 2021
d085e0a
fix field names of normalized locations
AlaaALatif Feb 22, 2021
7fe5cdf
migrate to local machine
AlaaALatif Feb 23, 2021
29ca63c
fix bjorn.py to adapt to breaking changes made for outbreak, allowing…
AlaaALatif Feb 25, 2021
09e0605
updated script to tabulate mutations seen in A-lab sequences, excludi…
AlaaALatif Feb 27, 2021
1d0bb65
bash scripts used for sequencing and submission pipelines
AlaaALatif Feb 27, 2021
6bea875
not now...
AlaaALatif Feb 27, 2021
b8bd0fe
added batch_iterator() function for chunking big fasta files into sma…
AlaaALatif Feb 28, 2021
5da440b
script for splitting input fasta into smaller chunks with a user-spec…
AlaaALatif Feb 28, 2021
82aa1c3
baseline rules, still need to add chunk_fasta rule and run tests
AlaaALatif Feb 28, 2021
30fea4b
tested successfully, takes ~5mins to split file containing ~600K seqs…
AlaaALatif Feb 28, 2021
65f5b68
move current codebase to separate module
AlaaALatif Mar 1, 2021
100eaaf
tested successfully, still need to add API pull (rule_0) and gcloud u…
AlaaALatif Mar 1, 2021
f486717
initial readme for pipeline
AlaaALatif Mar 1, 2021
09cb203
replace hard-coded numbers with dynamic ones, which was used for rapi…
AlaaALatif Mar 1, 2021
fa171cd
fixed chunk_fasta to use single core only, fixed msa_2_mutations to a…
AlaaALatif Mar 1, 2021
8dcafcf
python scripts for pulling API data and merging final results
AlaaALatif Mar 1, 2021
9a83c7c
added rule_0 and rule_n for pulling API data and generating json for …
AlaaALatif Mar 1, 2021
0bc1894
changed json2fasta to function def to run inline within pipeline (fas…
AlaaALatif Mar 3, 2021
1321ada
add minimap2 to the env, which was the source of most cron errors...
AlaaALatif Mar 5, 2021
82badc7
rename bjorn_original to archive, containing legacy code
AlaaALatif Mar 6, 2021
c7d943e
added scripts for releasing alab sequences to new pipeline
AlaaALatif Mar 6, 2021
18ac2ed
run_alab_release.sh
AlaaALatif Mar 6, 2021
075776b
added pyarrow for efficient IO ops, thx 2 @parrt
AlaaALatif Mar 6, 2021
7b8df06
use abs path of gsutil to allow auto upload via cron job
AlaaALatif Mar 8, 2021
3d13d53
rectify location IDs, US counties represented as fips, deletions sepa…
AlaaALatif Mar 9, 2021
989c346
move ID getters toward end of logical statements
AlaaALatif Mar 9, 2021
d111e08
add DEL tag to deletion names
AlaaALatif Mar 9, 2021
e0b9ecd
prelim code for automated assignment of characteristic mutations for …
AlaaALatif Mar 10, 2021
f11cac8
sync local updates with online repo
AlaaALatif Mar 10, 2021
d56fe38
does not belong here, yet
AlaaALatif Mar 12, 2021
4d94603
bash script for running bjorn for outbreak.info
AlaaALatif Mar 12, 2021
3c62eaa
update bash
AlaaALatif Mar 13, 2021
af9d26b
_
AlaaALatif Mar 13, 2021
b0d68f6
minimum coverage can be user-specified as CLI argument
AlaaALatif Mar 17, 2021
0283bff
update logging to report user-specified min coverage as QC filter
AlaaALatif Mar 17, 2021
9a8db4a
update bash script to reflect metrics used in Alab sequence release
AlaaALatif Mar 17, 2021
cb98e25
consolidate col names for coverage to prevent redundant code
AlaaALatif Mar 17, 2021
4d5ada8
add documentation, include mutations in non-coding regions, refactor …
AlaaALatif Mar 19, 2021
304789e
ignore mutations in non-coding regions for outbreak sitrep
AlaaALatif Mar 19, 2021
584d533
add bash command for dev run, testing changes on internal alab releas…
AlaaALatif Mar 19, 2021
2eb9320
Merge pull request #3 from andersen-lab/dev
AlaaALatif Mar 19, 2021
0f33060
Incorporate align_fasta_viralMSA function definition, allowing sequen…
AlaaALatif Mar 19, 2021
c58a54c
playground notebooks for testing sitrep updates and alab releases
AlaaALatif Mar 19, 2021
7c1201f
_
AlaaALatif Mar 19, 2021
1adfff9
basic script to find mutations from fasta files, for Issa
AlaaALatif Mar 21, 2021
9da2339
added consistent behavior when metadata is missing
AlaaALatif Mar 21, 2021
b8b2b34
added function to identify IDs of samples with suspicious mutations
AlaaALatif Mar 23, 2021
6120b4f
added function to separate MSA into two: one for whitelisted samples …
AlaaALatif Mar 23, 2021
8c94630
whitelisted sample files are automatically separated from samples req…
AlaaALatif Mar 23, 2021
bba890e
contains source code
AlaaALatif Mar 23, 2021
67fd308
generalised code refactor
AlaaALatif Mar 23, 2021
41a6600
user-specified non-concerning mutations
AlaaALatif Mar 23, 2021
a0677ce
formatting
AlaaALatif Mar 23, 2021
35f3e87
generalised fetching filepaths to be adaptable to different file stru…
AlaaALatif Mar 23, 2021
be4d280
updated readme instructions
AlaaALatif Mar 23, 2021
e4a01aa
minor formatting
AlaaALatif Mar 23, 2021
2b9bc8b
remove test code, passes successfully
AlaaALatif Mar 23, 2021
5611f26
add demo example for ccbb call
AlaaALatif Mar 23, 2021
2589b48
added exception handling when sequences do not have any type of mutat…
AlaaALatif Mar 25, 2021
88a861a
added run with test inputs and outputs
AlaaALatif Mar 25, 2021
0a0df93
Merge pull request #4 from andersen-lab/bjorn1
AlaaALatif Mar 25, 2021
c8106f9
update readme
AlaaALatif Mar 25, 2021
ce95228
update readme
AlaaALatif Mar 25, 2021
b119ce6
populate suspicious mutations even if one or more mutation type is mi…
AlaaALatif Mar 25, 2021
b5d96fc
turn sitrep scripts into executables, appears to fix excess mem usage
AlaaALatif Mar 26, 2021
de01bdc
corrected issue with county names in Texas (Houston city vs Harris co…
AlaaALatif Mar 27, 2021
c61d5a4
turn json2fasta step into executable, runs inline to optimize memory …
AlaaALatif Mar 30, 2021
13d5ba5
inline download step carried through bash call, significantly reducin…
AlaaALatif Mar 30, 2021
21932b8
added params for json2fasta
AlaaALatif Mar 30, 2021
b3f560a
fixed normalization to differentiate between Democratic Republic of t…
AlaaALatif Mar 30, 2021
5885c31
demultiplex script now takes num cpus as 2nd arg
AlaaALatif Mar 30, 2021
68dace9
fixed absolute coord computation for insertions, more intuitive colum…
AlaaALatif Apr 2, 2021
3636c0d
Merge branch 'bjorn1' of https://github.com/andersen-lab/bjorn into b…
AlaaALatif Apr 2, 2021
37f2452
dynamic datetime specification for filenaming and data updates
AlaaALatif Apr 4, 2021
3c98959
updated sitrep scripts to accept new datetime information as user args
AlaaALatif Apr 4, 2021
b258de5
add default config for nonconcerning (whitelisted) mutations
AlaaALatif Apr 5, 2021
a7ccc24
successfully tested with nonconcerning substitutions, deletions, and …
AlaaALatif Apr 5, 2021
f820bf8
revert back to run-mode, from test-mode
AlaaALatif Apr 5, 2021
52a4ffe
case-insensitive normalization of USA, potential fix for outbreak-inf…
AlaaALatif Apr 6, 2021
058ca3a
Merge pull request #5 from andersen-lab/bjorn1
AlaaALatif Apr 6, 2021
3c4949f
Merge branch 'master' of https://github.com/andersen-lab/bjorn into b…
AlaaALatif Apr 7, 2021
b139c9e
added function to separate input sample sheet into sub-sheets prior t…
AlaaALatif Apr 7, 2021
97883f1
tests separate_samples() added to bjorn_support
AlaaALatif Apr 7, 2021
70385dc
added more user args for running demultiplex script
AlaaALatif Apr 7, 2021
6a70ef5
remove umlaut from bjorn name
AlaaALatif Apr 8, 2021
95447eb
normalize northern cape and natal division names for south africa
AlaaALatif Apr 18, 2021
f0e10fd
normalize country names for Republic of Congo, Bonaire/Sint Eustatius…
AlaaALatif Apr 18, 2021
4cf089b
changes to params for alab analysis and release
AlaaALatif Apr 27, 2021
23b8f10
define get_variant_counts() to fetch minor variant present in analysi…
AlaaALatif Apr 27, 2021
3b032c8
added depth-based QC filter step, threshold can be user-specified
AlaaALatif Apr 27, 2021
e8afc06
initial script for demultiplexing using Picard, allowing user-specifi…
AlaaALatif Apr 27, 2021
8175782
nb for supporting ancestral anomaly investigation
AlaaALatif Apr 27, 2021
1f49e25
Merge pull request #6 from andersen-lab/bjorn1
AlaaALatif Apr 27, 2021
6a5c0b7
added the analysis file structure expected for bjorn release
AlaaALatif Apr 27, 2021
9baf1ff
add fig illustrating the folder structure expected for bjorn release
AlaaALatif Apr 27, 2021
f02d92e
fix loc norm for sachsen-anhalt germany and tirol austria
AlaaALatif Apr 27, 2021
26e32f5
loc norm umlauted divisions in sweden, fixes outbreak-info/outbreak.i…
AlaaALatif Apr 27, 2021
43ea6e0
loc norm umlauted divisions in sweden, fixes outbreak-info/outbreak.i…
AlaaALatif Apr 27, 2021
00393be
fix codec error
AlaaALatif Apr 27, 2021
0bb2cfa
fix issue with del name assignment for short frame-shifting dels, e.g…
AlaaALatif May 1, 2021
4233a16
Merge pull request #9 from andersen-lab/bjorn1_sitrep
AlaaALatif May 3, 2021
c7c80b8
Fixed Swiss normalisation for GADM
corneliusroemer May 4, 2021
a1b2fab
Merge pull request #11 from corneliusroemer/swiss_test
gkarthik May 4, 2021
0cc453f
Further Swiss Normalization fixes
corneliusroemer May 5, 2021
b578e7c
Merge pull request #12 from corneliusroemer/patch-1
gkarthik May 5, 2021
699f476
fix puerto rico location normalization
AlaaALatif May 5, 2021
59c553d
Merge branch 'master' into bjorn1_sitrep
AlaaALatif May 5, 2021
a528dd4
Merge pull request #13 from andersen-lab/bjorn1_sitrep
AlaaALatif May 5, 2021
5c56b7f
loc norm testing lab
AlaaALatif May 5, 2021
3b93153
Merge branch 'bjorn1_sitrep' of https://github.com/andersen-lab/bjorn…
AlaaALatif May 5, 2021
2ee4941
code cleanup, removing old commented statements
AlaaALatif May 5, 2021
c9b994e
Merge branch 'bjorn1_sitrep' of https://github.com/andersen-lab/bjorn…
AlaaALatif May 16, 2021
ea83812
specify chunk data files as temp to prevent excess disk usage
AlaaALatif May 16, 2021
3871c33
incorporate test mode with correct filenaming and behavior
AlaaALatif May 17, 2021
69ff4ff
change to gadm_mini.tsv for quicker runtime, incorporate test mode wi…
AlaaALatif May 17, 2021
dae5518
incorporate test mode with correct behavior, does NOT upload to outbr…
AlaaALatif May 17, 2021
aedbaa7
skip api metadata file generation during test mode
AlaaALatif May 17, 2021
5db6df9
qc filters are now applied separately within a single function, seqs …
AlaaALatif May 23, 2021
056911b
apply qc filtering before identifying mutations
AlaaALatif May 23, 2021
5174b72
consolidate qc filters into a single function, correctly filter seqs …
AlaaALatif May 23, 2021
455d16f
apply qc filters prior to identifying mutations
AlaaALatif May 23, 2021
5dd5a1b
fixes outbreak-info/outbreak.info-issues-362
AlaaALatif May 23, 2021
412d7dc
fix small error in comment
AlaaALatif May 23, 2021
ced266d
separate dels into single-aa del, fixes outbreak-info/outbreak.info-i…
AlaaALatif Jun 8, 2021
52 changes: 44 additions & 8 deletions README.md
@@ -1,25 +1,61 @@
# `björn`
This is the code repository for `bjorn` - a suite of miscellaneous tools that can be used to:

* generate information for large-scale genomic surveillance of SARS-CoV-2 sequences. This functionality relies on external tools such as `datafunk`, `minimap2`, and `pandas`.

* prepare results and data files from SARS-CoV-2 sequencing analysis for release to public databases such as GISAID, Google Cloud, and GitHub

## Installation
* Install Anaconda: [instructions can be found here](https://docs.anaconda.com/anaconda/install/)
* Create the `bjorn` environment
```bash
conda env create -f envs/macos.yml -n bjorn
conda env create -f env/linux.yml -n bjorn
```
* Activate environment
```bash
conda activate bjorn
```
* Install datafunk inside the activated `bjorn` environment: [instructions](https://github.com/cov-ert/datafunk)

## Usage
### Information for Surveillance of SARS-CoV-2 Genomic Mutations
* Activate `bjorn` environment
```bash
conda activate bjorn
```
* Open `config.json` to specify your parameters (a minimal sketch is shown below), such as
  * current date
  * date appended to each filepath
  * output directory where results are saved
  * number of CPU cores available for use
* Run the `run_sitrep.sh` script to initiate the Snakemake pipeline
```bash
bash run_sitrep.sh
```
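For orientation, a minimal `config.json` sketch follows. The keys are the ones read by the Snakefile included in this PR; every value is an illustrative placeholder, not a working credential or path.
```json
{
    "username": "your-gisaid-username",
    "password": "your-gisaid-password",
    "out_dir": "/path/to/results",
    "feed_test": true,
    "gisaid_fasta": "sequences",
    "gisaid_meta": "metadata",
    "chunk_info": "chunk_info.csv",
    "chunk_size": 10000,
    "num_cpus": 8,
    "ref_fasta": "/path/to/reference.fasta",
    "patient_zero": "NC_045512.2"
}
```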

### Post-processing of SARS-CoV-2 Sequencing Results for Release to Public Databases
* Activate `bjorn` environment
```bash
conda activate bjorn
```
* Open `config.json` to specify your parameters, then save the file
  * NB: the config will run a test by default. Once it's tested, make sure to change the `is_test` value to `false` in `config.json`
* Open `run_alab_release.sh` to specify your parameters such as
  * filepath to sample sheet containing sample metadata (input)
  * filepath to updated metadata of samples that have already been uploaded
  * output directory where results are saved
  * number of CPU cores available for use
  * minimum coverage required for each sample (QC filter)
  * minimum average depth required for each sample (QC filter)
  * DEFAULT: test parameters
* Open `config.json` to specify your parameters (a sketch follows this list), such as
  * list of SARS-CoV-2 genes that are considered non-concerning
    * i.e. the occurrence of open reading frame (ORF)-altering mutations in these genes can be accepted
    * e.g. ['ORF8', 'ORF10']
  * list of SARS-CoV-2 mutations that are considered non-concerning
    * i.e. the occurrence of `ORF8:Q27_` can be accepted (it occurs in the B.1.1.7 lineage)
    * e.g. ['ORF8:Q27_']
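The README does not pin down the exact key names for these whitelists, so the sketch below uses hypothetical keys (`nonconcerning_genes`, `nonconcerning_mutations`) with the example values listed above:
```json
{
    "nonconcerning_genes": ["ORF8", "ORF10"],
    "nonconcerning_mutations": ["ORF8:Q27_"]
}
```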
* Run the `run_alab_release.sh` script to initiate the data release pipeline
```bash
bash run_alab_release.sh
```
* `bjorn` assumes the following file structure for the input sequencing data
![Release Structure](figs/alab_release_filestructure.png)
130 changes: 130 additions & 0 deletions Snakefile
@@ -0,0 +1,130 @@
import sys
sys.path.append('src/')
from path import Path
from datetime import datetime
import pandas as pd
import json
import argparse
import bjorn_support as bs
import json2fasta as bj
import chunk_fasta as bf
import msa_2_mutations as bm


# load user parameters
configfile: "config.json"

username = config['username']
password = config['password']
out_dir = config['out_dir']
is_test = config['feed_test']
if is_test:
    current_datetime = 'test'
else:
    current_datetime = datetime.now().strftime("%Y-%m-%d-%H-%M")
gisaid_sequences_filepath = out_dir + '/' + config['gisaid_fasta'] + '_' + current_datetime + '.fasta'
meta_filepath = out_dir + '/' + config['gisaid_meta'] + '_' + current_datetime + '.tsv.gz'
info_filepath = out_dir + '/' + config['chunk_info']
chunks_dir = out_dir + '/chunks'
fasta_dir = chunks_dir + '/fasta/' + current_datetime
# sam_dir = chunks_dir + '/sam/' + current_date
# msa_dir = chunks_dir + '/msa/' + current_date
# muts_dir = chunks_dir + '/muts/' + current_date
logs_dir = out_dir + '/logs'
chunk_size = int(config['chunk_size'])
num_cpus = int(config['num_cpus'])
reference_filepath = config['ref_fasta']
patient_zero = config['patient_zero']

# Download and pre-process GISAID data
download_cmd = f"src/json2fasta.py -u {username} -p {password} -s {chunk_size} -t {current_datetime}"
bs.run_command(download_cmd)
info_df = pd.read_csv(info_filepath)
# info_df = bj.download_process_data(username, password, chunk_size)


rule all:
    input:
        "{out_dir}/mutations_{current_datetime}.csv".format(out_dir = out_dir, current_datetime = current_datetime), # output data (signal)
        # expand("{chunks_dir}/muts/{current_datetime}/{sample}.mutations.csv", chunks_dir = chunks_dir, current_datetime = current_datetime, sample = info_df['chunk_names']), # bjorn
        # expand("{chunks_dir}/msa/{current_datetime}/{sample}.aligned.fasta", chunks_dir = chunks_dir, current_datetime = current_datetime, sample = info_df['chunk_names']), # data2funk -> gofasta
        # expand("{chunks_dir}/sam/{current_datetime}/{sample}.sam", chunks_dir = chunks_dir, current_datetime = current_datetime, sample = info_df['chunk_names']), # minimap2 -> mafft
        # expand("{chunks_dir}/fasta/{current_datetime}/{sample}.fasta", chunks_dir = chunks_dir, current_datetime = current_datetime, sample = info_df['chunk_names']), # chunk_fasta
        gisaid_sequences_filepath, # input data (signal)
        info_filepath
        # reference_filepath, # input data (patient zero)


# TODO: create merge_mutations.py
rule merge_results:
    input:
        expand("{chunks_dir}/muts/{current_datetime}/{sample}.mutations.csv", chunks_dir = chunks_dir, sample = info_df['chunk_names'], current_datetime = current_datetime),
        meta_filepath=meta_filepath
    threads: 1
    params:
        current_datetime=current_datetime,
    output:
        "{out_dir}/mutations_{current_datetime}.csv"
    shell:
        """
        src/merge_results.py -i {chunks_dir}/muts/{current_datetime}/ -m {input.meta_filepath} -o {output} -t {params.current_datetime}
        """


# TODO: test msa_2_mutations.py
rule run_bjorn:
    input:
        "{chunks_dir}/msa/{current_datetime}/{sample}.aligned.fasta"
    params:
        patient_zero=patient_zero,
    output:
        temp("{chunks_dir}/muts/{current_datetime}/{sample}.mutations.csv")
    shell:
        """
        src/msa_2_mutations.py -i {input} -r {params.patient_zero} -o {output}
        """
# for i, o in zip(input, output):
#     _ = bm.msa_2_mutations(i, params.patient_zero, o, config)

rule run_data2funk:
    input:
        "{chunks_dir}/sam/{current_datetime}/{sample}.sam",
    params:
        reference_filepath=reference_filepath,
    output:
        temp("{chunks_dir}/msa/{current_datetime}/{sample}.aligned.fasta"),
    shell:
        """
        datafunk sam_2_fasta -s {input} -r {params.reference_filepath} -o {output} --pad --log-inserts
        """

rule run_minimap2:
    input:
        "{chunks_dir}/fasta/{current_datetime}/{sample}.fasta"
    params:
        num_cpus=num_cpus,
        reference_filepath=reference_filepath
    output:
        temp("{chunks_dir}/sam/{current_datetime}/{sample}.sam"),
    shell:
        """
        minimap2 -a -x asm5 -t {params.num_cpus} {params.reference_filepath} {input} -o {output}
        """

rule chunk_fasta:
    input:
        gisaid_sequences_filepath,
    params:
        reference_filepath=reference_filepath,
        chunk_size=chunk_size,
        out_dir=fasta_dir
        # out_dir=lambda wildcards, output: Path(output).parent
    threads: 1
    output:
        temp(expand("{chunks_dir}/fasta/{current_datetime}/{sample}.fasta", chunks_dir = chunks_dir, current_datetime = current_datetime, sample = info_df['chunk_names']))
    shell:
        """
        src/chunk_fasta.py -f {input} -r {params.reference_filepath} -s {params.chunk_size} -o {chunks_dir}/fasta/{current_datetime}
        """
File renamed without changes.
51 changes: 51 additions & 0 deletions archive/alab_mutations.py
@@ -0,0 +1,51 @@
import subprocess
import shlex
import json
from path import Path
import pandas as pd
import bjorn_support as bs
import mutations as bm
import data as bd


with open('config.json', 'r') as f:
    config = json.load(f)

date = config['date']
out_dir = Path(config['alab_out_dir'])
ref_fp = Path(config['ref_fasta'])
patient_zero = config['patient_zero']
num_cpus = config['num_cpus']
in_alab_seqs = Path(config['alab_sequences'])
in_alab_meta = Path(config['alab_meta'])
if not Path.isdir(out_dir):
    Path.mkdir(out_dir)
    print(f"Created results directory: {out_dir}")
else:
    print(f"Results directory {out_dir} already exists...Continuing...")
# concatenate all consensus sequences
fa_fp = out_dir/'alab_seqs.fa'
if not Path.isfile(fa_fp):
    fa_fp = bs.concat_fasta(in_alab_seqs, out_dir/'alab_seqs')
    print(f"Concatenated all sequences and wrote to {fa_fp}")
# align consensus sequences
msa_fp = Path(fa_fp.split('.')[0] + '_aligned.fa')
if not Path.isfile(msa_fp):
    print(f"Aligning sequences with reference...")
    msa_fp = bs.align_fasta_reference(fa_fp, msa_fp, ref_fp=ref_fp, num_cpus=num_cpus)
    print(f"Multiple sequence alignment of A-lab samples with reference saved in {msa_fp}")
# msa2_fp = Path(fa_fp.split('.')[0] + '_aligned_absolute.fa')
# if not Path.isfile(msa2_fp):
# print(f"Aligning sequences without reference...")
# msa2_fp = bs.align_fasta(fa_fp, msa2_fp, num_cpus=num_cpus)
# print(f"Multiple sequence alignment of A-lab samples without reference saved in {msa2_fp}")
# Identify substitutions and deletions
msa_data = bs.load_fasta(msa_fp, is_aligned=True)
subs_wide = bm.identify_replacements(msa_data, in_alab_meta, data_src='alab')
subs_wide_fp = out_dir/f'alab_substitutions_wide_{date}.csv'
subs_wide.sort_values('num_samples', ascending=False).to_csv(subs_wide_fp, index=False)
print(f"Substitution-based mutations of A-lab samples saved in {subs_wide_fp}")
dels_wide = bm.identify_deletions(msa_data, in_alab_meta, data_src='alab')
dels_wide_fp = out_dir/f'alab_deletions_wide_{date}.csv'
dels_wide.sort_values('num_samples', ascending=False).to_csv(dels_wide_fp, index=False)
print(f"Deletion-based mutations of A-lab samples saved in {dels_wide_fp}")
38 changes: 23 additions & 15 deletions bjorn.py → archive/bjorn.py
@@ -13,7 +13,7 @@
from itertools import repeat
import os
from datetime import datetime as dt
from bjorn_support import concat_fasta, align_fasta, compute_tree, map_gene_to_pos, load_fasta
from mutations import identify_replacements, identify_deletions, identify_insertions
from onion_trees import load_tree, visualize_tree, get_indel2color, get_sample2color
import data as bd
@@ -392,20 +392,33 @@ def process_id(x):
colors = list(mcolors.TABLEAU_COLORS.keys())
# path to new github metadata
meta_fp = out_dir/'metadata.csv'
# load multiple sequence alignment
msa_data = load_fasta(msa_fp, is_aligned=True)
# identify insertions
insertions = identify_insertions(msa_data,
                                 meta_fp=meta_fp,
                                 patient_zero=patient_zero,
                                 min_ins_len=1,
                                 data_src='alab')
# save insertion results to file
insertions.to_csv(out_dir/'insertions.csv', index=False)
# identify substitution mutations
subs = identify_replacements(msa_data,
                             meta_fp=meta_fp,
                             data_src='alab',
                             patient_zero=patient_zero)
# save substitution results to file
subs.to_csv(out_dir/'replacements.csv', index=False)
# identify deletions
deletions = identify_deletions(msa_data,
                               meta_fp=meta_fp,
                               data_src='alab',
                               patient_zero=patient_zero,
                               min_del_len=1)
# save deletion results to file
deletions.to_csv(out_dir/'deletions.csv', index=False)
# plot Phylogenetic tree with top consensus deletions annotated
deletions = deletions.nlargest(len(colors), 'num_samples')
# del2color = get_indel2color(deletions, colors)
# sample_colors = get_sample2color(deletions, colors)
# fig2 = visualize_tree(tree, sample_colors,
@@ -415,10 +428,6 @@ def process_id(x):
# indels=deletions, colors=colors,
# isnv_info=True);
# fig3.savefig(tree_dir/'deletion_isnv_tree.pdf', dpi=300)
# plot Phylogenetic tree with top consensus deletions annotated
insertions = insertions.nlargest(len(colors), 'num_samples')
# del2color = get_indel2color(insertions, colors)
Expand All @@ -434,10 +443,9 @@ def process_id(x):
Path.mkdir(out_dir);
# Data logging
with open("{}/data_release.log".format(out_dir), 'w') as f:
    f.write(f"Prepared {final_result.shape[0]} samples for release\n")
    f.write(f'{num_samples_missing_coverage} samples are missing coverage information\n')
    f.write(f'{low_coverage_samples.shape[0]} samples were found to have coverage below 90%\n')
    f.write(f'{num_samples_missing_cons} samples were ignored because they were missing consensus sequence files\n')
    f.write(f'{num_samples_missing_bams} samples were ignored because they were missing BAM sequence files\n')
print(f"Transfer Complete. All results saved in {out_dir}")
30 changes: 30 additions & 0 deletions bjorn_support.py → archive/bjorn_support.py
@@ -9,6 +9,35 @@
from Bio import Seq, SeqIO, AlignIO, Phylo, Align


def batch_iterator(iterator, chunk_size):
    """Returns lists of length chunk_size.

    This can be used on any iterator, for example to batch up
    SeqRecord objects from Bio.SeqIO.parse(...), or to batch
    Alignment objects from Bio.AlignIO.parse(...), or simply
    lines from a file handle.

    This is a generator function, and it returns lists of the
    entries from the supplied iterator. Each list will have
    chunk_size entries, although the final list may be shorter.
    Citation: https://biopython.org/wiki/Split_large_file
    """
    record = True
    while record:
        chunk = []
        while len(chunk) < chunk_size:
            try:
                record = next(iterator)
            except StopIteration:
                record = None
            if record is None:
                # End of file
                break
            chunk.append(record)
        if chunk:
            yield chunk


def dict2fasta(seqs: dict, fasta_fp: str, wrap=80):
    with open(fasta_fp, 'w') as f:
        for gid, gseq in seqs.items():
@@ -148,6 +177,7 @@ def concat_fasta_2(in_filepaths: list, out_filepath):
"""Concatenate fasta sequences into single fasta file.
Takes a list of fasta filepaths and an output filename for saving"""
cat_cmd = f"cat {' '.join(in_filepaths)} > {out_filepath}"
print(cat_cmd)
run_command(cat_cmd)
return out_filepath

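As a usage note for the `batch_iterator()` helper added in the `bjorn_support.py` diff above, here is a minimal sketch of how it can chunk a large FASTA file with Biopython. File names, the output directory, and the chunk size are illustrative:
```python
from Bio import SeqIO

# stream records rather than loading the whole file into memory
records = SeqIO.parse("gisaid.fasta", "fasta")

# write each 10,000-record chunk to its own fasta file (chunks/ must exist)
for i, chunk in enumerate(batch_iterator(records, 10000)):
    out_fp = f"chunks/chunk_{i}.fasta"
    n = SeqIO.write(chunk, out_fp, "fasta")
    print(f"wrote {n} records to {out_fp}")
```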
File renamed without changes.