Gadi config + use temp dir #1

Open · wants to merge 2 commits into master
4 changes: 4 additions & 0 deletions environment.yml
@@ -22,6 +22,10 @@ dependencies:
- ucsc-faFilter
- ucsc-faSomeRecords
- ucsc-faSplit
- openssl=="1.0.*" # Because UCSC tools are broken with openssl 1.1.1*
- r-base
- r-ape
- r-optparse
- pip:
- pandas==1.0.1
- pytools==2020.1
28 changes: 28 additions & 0 deletions full-pipeline-gadi.pbs
@@ -0,0 +1,28 @@
#!/bin/bash
#PBS -q hugemem
#PBS -l ncpus=48
#PBS -l walltime=12:00:00
#PBS -l mem=1500G
#PBS -l jobfs=1400G
#PBS -l wd
#PBS -l storage=scratch/xe2+gdata/xe2
#PBS -j oe
#PBS -o gadi/log/
#PBS -M [email protected]
#PBS -m abe
#PBS -P xe2
#PBS -W umask=002
#PBS -N sarscov2

source /g/data/xe2/gadi/profile.sh
module purge
useconda
conda activate sarscov2phylo

set -xeuo pipefail

#bash scripts/global_tree_gisaid.sh -i ../data/act/act.fasta -o ../data/act/global_act.fa -t ${PBS_NCPUS:-2} -k 100
#bash scripts/global_tree_gisaid.sh -i ../data/gisaid/gisaid_hcov-19_2020_05_08_06.fasta -o ../data/gisaid/global.fa -t ${PBS_NCPUS:-2} -k 100
#bash scripts/global_tree_gisaid.sh -i ../data/act_2020-05-20/gisaid_hcov-19_2020_05_20_01.fasta -o ../data/tiny-gisaid/global.fa -t ${PBS_NCPUS:-2} -k 100
#bash scripts/global_tree_gisaid_plus.sh -i ../data/tiny-gisaid/tiny-gisaid.fa -a ../data/act/act.fasta -d 5 -o ../data/tiny-gisaid/global_plus_act.fa -t ${PBS_NCPUS:-2} -k 100
bash scripts/global_tree_gisaid_plus.sh -i ../data/act_2020-05-20/gisaid_hcov-19_2020_05_20_01.fasta -a ../data/act/act.fasta -d 5 -o ../data/act/global_plus_act.fa -t ${PBS_NCPUS:-2} -k 100
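
Reviewer note: the commented-out invocations above record earlier test runs; only the final command is live. ${PBS_NCPUS:-2} expands to the CPU count PBS allocates to the job, falling back to 2 when the script is run outside PBS. A minimal sketch of submitting this on Gadi; the mkdir -p safeguard for the log directory named by the -o directive is a suggestion, not part of this diff:

    # make sure the directory named by '#PBS -o gadi/log/' exists, then submit
    mkdir -p gadi/log
    qsub full-pipeline-gadi.pbs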
75 changes: 26 additions & 49 deletions scripts/align_k_dissimilar.sh
@@ -29,62 +29,61 @@ then
helpFunction
fi

set -xeuo pipefail

export TMP="${TMP:-$(mktemp -d)}"

n=$(grep '>' $inputfasta | wc -l)
echo ""
echo ""
echo "Filtering the $n initial unaligned input sequences to remove any with > 10 ambiguous bases"
echo ""
echo ""
seqmagick quality-filter --max-ambiguous 10 --max-length 100000 --min-length 100 $inputfasta $inputfasta"_filtered.fa"
inputfasta=$inputfasta"_filtered.fa"
seqmagick quality-filter --max-ambiguous 10 --max-length 100000 --min-length 100 "$inputfasta" "${TMP}/filtered.fa"

# how many sequences in the inputfasta
n=$(grep '>' $inputfasta | wc -l)
echo "$n sequences remain after filtering"
seqs="${TMP}/filtered.fa"
n=$(grep '>' "$seqs" | wc -l)
echo "$n sequences remain after filtering in '$seqs'"

# define a cutoff to switch to the quick & dirty method of choosing the seqs
cutoff=50000

inputdir=$(dirname $inputfasta)

echo "Using MAFFT to get a guide tree"

if (( $n > $cutoff )); then

echo "Using parttree distance method in MAFFT"

# use this method for maximum speed on v large datasets
mafft --thread $threads --retree 0 --treeout --parttree $inputfasta > /dev/null
mafft --thread $threads --retree 0 --treeout --parttree "${seqs}" > /dev/null

# replace all newlines
tr -d '\n' < "$inputfasta.tree" > "$inputdir/guide1.tree"

# replace all newlines then
# add branchlengths because this method doesn't give any.
# assuming equal branchlengths is obviously not correct, but nevertheless
# still gives a pragmatic way of selecting the k divergent sequences
sed 's/),/)0.1,/g' "$inputdir/guide1.tree" > "$inputdir/guide2.tree"
sed 's/))/)0.1)/g' "$inputdir/guide2.tree" > "$inputdir/guide3.tree"

tr -d '\n' < "${seqs}.tree" | sed -e 's/),/)0.1,/g' -e 's/))/)0.1)/g' > "$inputdir/guide.tree"
else

echo "Using 6mer distance method in MAFFT"

# use this method when it's feasible; it calculates 6mer distances
mafft --thread $threads --retree 0 --treeout $inputfasta > /dev/null
mafft --thread $threads --retree 0 --treeout "${seqs}" > /dev/null
# replace all newlines
tr -d '\n' < $inputfasta.tree > "$inputdir/guide3.tree"
tr -d '\n' < "${seqs}.tree" > "$TMP/guide.tree"

fi

# now continue for all methods

# add a semicolon to the tree so iq-tree can read it
echo ";" >> "$inputdir/guide3.tree"
echo ";" >> "$TMP/guide.tree"

echo "Using IQ-TREE to select the K most dissimilar sequences"

# get the k most dissimilar sequences using iq-tree
iqtree -te "$inputdir/guide3.tree" -k $k
iqtree -te "$TMP/guide.tree" -k $k

echo "Extracting K most dissimilar sequences"

@@ -94,54 +93,32 @@ if (( $n > $cutoff )); then
# extract those from the fasta file to make a new fasta
# https://www.biostars.org/p/2822/
# get the numbers of the sequences in the sorted file
awk '/^The optimal PD set has/{flag=1;next}/^Corresponding sub-tree/{flag=0}flag' "$inputdir/guide3.tree.pda" > "$inputdir/numbers.txt"
awk '/^The optimal PD set has/{flag=1;next}/^Corresponding sub-tree/{flag=0}flag' "$TMP/guide.tree.pda" > "$TMP/numbers.txt"
# get the names
grep '>' $inputfasta > "$inputdir/all_names.txt"
# get the names given the line numbers
awk 'FNR==NR{a[$1];next}(FNR in a){print}' "$inputdir/numbers.txt" "$inputdir/all_names.txt" > "$inputdir/k_names_raw.txt"
# remove the ">"
tr -d \> < "$inputdir/k_names_raw.txt" > "$inputdir/k_names.txt"
grep '>' "$seqs" > "$TMP/all_names.txt"
# get the names given the line numbers, and tr to remove the ">"
awk 'FNR==NR{a[$1];next}(FNR in a){print}' "$TMP/numbers.txt" "$TMP/all_names.txt" | tr -d \> > "$TMP/k_names.txt"

else
# extract those from the fasta file to make a new fasta
awk '/^The optimal PD set has/{flag=1;next}/^Corresponding sub-tree/{flag=0}flag' "$inputdir/guide3.tree.pda" > "$inputdir/k_names_long.txt"
awk '/^The optimal PD set has/{flag=1;next}/^Corresponding sub-tree/{flag=0}flag' "$TMP/guide.tree.pda" > "$TMP/k_names_long.txt"

# these names are like 123_name. For safety we only use the number, since mafft edits special characters in names
cut -d _ -f 1 "$inputdir/k_names_long.txt" > "$inputdir/numbers.txt"
cut -d _ -f 1 "$TMP/k_names_long.txt" > "$TMP/numbers.txt"

# get the names
grep '>' $inputfasta > "$inputdir/all_names.txt"

# get the names given the line numbers
awk 'FNR==NR{a[$1];next}(FNR in a){print}' "$inputdir/numbers.txt" "$inputdir/all_names.txt" > "$inputdir/k_names_raw.txt"

# remove the ">"
tr -d \> < "$inputdir/k_names_raw.txt" > "$inputdir/k_names.txt"
grep '>' $seqs > "$TMP/all_names.txt"

# get the names given the line numbers, then remove the ">"
awk 'FNR==NR{a[$1];next}(FNR in a){print}' "$TMP/numbers.txt" "$TMP/all_names.txt" | tr -d '>' > "$TMP/k_names.txt"
fi


rm "$inputdir/k_names_long.txt"
rm "$inputdir/guide1.tree"
rm "$inputdir/guide2.tree"
rm "$inputdir/guide3.tree"
rm "$inputdir/guide3.tree.log"
rm "$inputfasta.tree"
rm "$inputdir/k_names_raw.txt"
rm "$inputdir/numbers.txt"
rm "$inputdir/all_names.txt"
rm "$inputdir/guide3.tree.pda"


# extract the fasta records we want
#seqtk subseq $inputfasta "$inputdir/k_names.txt" > "$inputdir/k_unaligned.fa"
faSomeRecords $inputfasta "$inputdir/k_names.txt" "$inputdir/k_unaligned.fa"
faSomeRecords $seqs "$TMP/k_names.txt" "$TMP/k_unaligned.fa"

echo "Aligning K most dissimilar sequences"

# align the k most dissimilar sequences in MAFFT
mafft --thread $threads --auto "$inputdir/k_unaligned.fa" > $outputfasta

rm "$inputdir/k_unaligned.fa"
rm $inputfasta
rm "$inputdir/k_names.txt"
mafft --thread $threads --auto "$TMP/k_unaligned.fa" > "$outputfasta"
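
Reviewer note: the key idiom introduced in this script is that every intermediate file now lives under $TMP, which is inherited from the environment when set (e.g. a PBS jobfs path) or created fresh with mktemp -d. A minimal sketch of the pattern, including the cleanup trap that global_tree_gisaid.sh still keeps commented out (the trap is a suggestion, not part of this diff):

    # reuse the caller's $TMP if set (e.g. PBS jobfs), otherwise make a private one
    export TMP="${TMP:-$(mktemp -d)}"
    # optionally remove the scratch directory on exit, even after a failure
    trap 'rm -rf "$TMP"' EXIT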
3 changes: 2 additions & 1 deletion scripts/assign_lineages.sh
@@ -24,6 +24,7 @@ do
done


set -xeuo pipefail
# First we get the pre-computed lineages from hcov lineages

wget -O lineages.csv $lineageurl # wget saves to a URL-derived filename by default; -O writes to lineages.csv
@@ -42,4 +43,4 @@ pangolin seqs_todo.fa -o $inputdir -t $threads

# TODO combine lineage information into single file
# i.e. add pangolin output to the lineages.csv, being aware they have different columns...
# and noting that lineages.csv is based on epiIDs
4 changes: 3 additions & 1 deletion scripts/filter_aln.sh
@@ -24,6 +24,8 @@ then
helpFunction
fi

set -xeuo pipefail

echo ""
echo "Filtering strains shorter than 29100 and/or with more than 200 ambiguities (that's ~1%)"
echo "Filtering sites with more than 1% gaps"
@@ -32,4 +34,4 @@ echo ""
esl-alimask --gapthresh 0.01 --informat afa --outformat afa --dna -o $inputfasta"_alimask.fa" -g $inputfasta
esl-alimanip --lmin 29100 --xambig 200 --informat afa --outformat afa --dna -o $outputfasta $inputfasta"_alimask.fa"

rm $inputfasta"_alimask.fa"
20 changes: 7 additions & 13 deletions scripts/global_profile_alignment.sh
@@ -29,6 +29,7 @@ then
helpFunction
fi

set -uxeo pipefail
# so we can get at it as a global variable
export REFERENCE_ALN=$refaln

@@ -37,7 +38,7 @@ echo ""
echo "Splitting unaligned input sequences into individual files"
N=$(grep ">" $inputfasta | wc -l)
N=$(( N*2 )) # doubled to ensure each record goes to its own file
faSplit sequence $inputfasta $N individual_seq
faSplit sequence $inputfasta $N $TMP/profile_align_individual_seq


echo ""
@@ -50,8 +51,7 @@ profile_align()

seqfile=$1
alfile=$seqfile"_profile_aligned.fa"
stofile=$alfile".sto"
final=$seqfile"_ind_aligned.fa"

mafft --thread 1 --quiet --keeplength --add $seqfile "$REFERENCE_ALN" > $alfile

@@ -61,22 +61,16 @@

rm $seqfile
rm $alfile

}

export -f profile_align

inputdir=$(dirname $inputfasta)
ls $inputdir | grep individual_seq | parallel -j $threads --bar "profile_align {}" > /dev/null
find $TMP -type f -name '*profile_align_individual_seq*' | parallel -j $threads --bar "profile_align {}" > /dev/null

# join it all together and clean up
# note that here we *APPEND* to the global alignment, which allows us to add small batches of new stuff whenever we like
find $inputdir -name \*.fa_ind_aligned.fa -exec cat {} \; >> $outputfasta
find $inputdir -maxdepth 1 -name "individual_seq*" -delete
find $TMP -name \*.fa_ind_aligned.fa -exec cat {} \; >> "$TMP/aligned_withdups.fa"
find $TMP -maxdepth 1 -name "*profile_align_individual_seq*" -delete

# finally, remove duplicates by name; these can occur if a seq. exists in the reference alignment
faFilter -uniq $outputfasta $outputfasta"_deduped.fa"

# overwite the output with the deduplicated file
rm $outputfasta
mv $outputfasta"_deduped.fa" $outputfasta
faFilter -uniq "$TMP/aligned_withdups.fa" $outputfasta
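
Reviewer note: GNU parallel runs every job in a subshell, so shell functions must be exported with export -f before parallel can call them, exactly as this script does with profile_align. A minimal sketch of the pattern with a hypothetical my_task function:

    my_task() { echo "processing $1"; }   # any shell function
    export -f my_task                     # without this, parallel's subshells cannot see it
    find "$TMP" -type f -name '*individual_seq*' | parallel -j 4 --bar my_task {}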
25 changes: 14 additions & 11 deletions scripts/global_tree_gisaid.sh
@@ -29,22 +29,24 @@ then
helpFunction
fi

set -uxeo pipefail

DIR="$(cd "$(dirname "$0")" && pwd)"
export TMP="${TMP:-$(mktemp -d)}"
#trap "rm -rf '$TMP'" EXIT

# first we trim the sequences
inputdir=$(dirname $inputfasta)

cd $inputdir

allseqs=$inputdir"allseqs_unaligned.fasta"
cat $inputfasta > $allseqs
allseqs="$TMP/allseqs_unaligned.fasta"
cp $inputfasta $allseqs


echo ""
echo "Processing raw data and trimming UTRs "
echo ""

trimmed_gisaid="$inputdir/trimmed.fa"
trimmed_gisaid="$TMP/trimmed.fa"
bash $DIR/trim_seqs.sh -i $allseqs -o $trimmed_gisaid -t $threads

#### BUILD THE GLOBAL ALIGNMENT ######
@@ -54,20 +56,21 @@
echo ""
echo "Making alignment of $k most dissimilar sequences"
echo ""
aln_k="$inputdir/aln_k.fa"
aln_k="$TMP/aln_k.fa"
bash $DIR/align_k_dissimilar.sh -i $trimmed_gisaid -k $k -o $aln_k -t $threads

echo ""
echo "Filtering sites with >10% gaps from k most dissimilar sequence alignment"
echo ""
aln_k_filtered="$inputdir/aln_k_filtered.fa"
aln_k_filtered="$TMP/aln_k_filtered.fa"
esl-alimask --gapthresh 0.1 --informat afa --outformat afa --dna -o $aln_k_filtered -g $aln_k



echo ""
echo "Making global profile alignment"
echo ""
aln_global="$inputdir/aln_global_unfiltered.fa"
aln_global="$TMP/aln_global_unfiltered.fa"
bash $DIR/global_profile_alignment.sh -i $trimmed_gisaid -o $aln_global -t $threads -r $aln_k_filtered


@@ -80,11 +83,11 @@ esl-alistat $aln_global

echo ""
echo "Filtering sites with >1% gaps"
esl-alimask --gapthresh 0.01 --informat afa --outformat afa --dna -o $aln_global"_alimask.fa" -g $aln_global
esl-alimask --gapthresh 0.01 --informat afa --outformat afa --dna -o "$TMP/aln_global_alimask.fa" -g $aln_global
echo "Filtering sequences that are shorter than 29100 bp and/or have >200 ambiguities"
esl-alimanip --lmin 29100 --xambig 200 --informat afa --outformat afa --dna -o $aln_global"alimanip.fa" $aln_global"_alimask.fa"
esl-alimanip --lmin 29100 --xambig 200 --informat afa --outformat afa --dna -o "$TMP/aln_global_alimanip.fa" "$TMP/aln_global_alimask.fa"

mv $aln_global $outputfasta
mv "$TMP/aln_global_alimanip.fa" "$outputfasta"

echo ""
echo "alignment stats after filtering"