Gadi config + use temp dir #1

Open · wants to merge 2 commits into master
4 changes: 4 additions & 0 deletions environment.yml
@@ -22,6 +22,10 @@ dependencies:
- ucsc-faFilter
- ucsc-faSomeRecords
- ucsc-faSplit
- openssl=="1.0.*" # Because UCSC tools are broken with openssl 1.1.1*
- r-base
- r-ape
- r-optparse
- pip:
- pandas==1.0.1
- pytools==2020.1
28 changes: 28 additions & 0 deletions full-pipeline-gadi.pbs
@@ -0,0 +1,28 @@
#!/bin/bash
#PBS -q hugemem
#PBS -l ncpus=48
#PBS -l walltime=12:00:00
#PBS -l mem=1500G
#PBS -l jobfs=1400G
#PBS -l wd
#PBS -l storage=scratch/xe2+gdata/xe2
#PBS -j oe
#PBS -o gadi/log/
#PBS -M [email protected]
#PBS -m abe
#PBS -P xe2
#PBS -W umask=002
#PBS -N sarscov2

source /g/data/xe2/gadi/profile.sh
module purge
useconda
conda activate sarscov2phylo

set -xeuo pipefail

#bash scripts/global_tree_gisaid.sh -i ../data/act/act.fasta -o ../data/act/global_act.fa -t ${PBS_NCPUS:-2} -k 100
#bash scripts/global_tree_gisaid.sh -i ../data/gisaid/gisaid_hcov-19_2020_05_08_06.fasta -o ../data/gisaid/global.fa -t ${PBS_NCPUS:-2} -k 100
#bash scripts/global_tree_gisaid.sh -i ../data/act_2020-05-20/gisaid_hcov-19_2020_05_20_01.fasta -o ../data/tiny-gisaid/global.fa -t ${PBS_NCPUS:-2} -k 100
#bash scripts/global_tree_gisaid_plus.sh -i ../data/tiny-gisaid/tiny-gisaid.fa -a ../data/act/act.fasta -d 5 -o ../data/tiny-gisaid/global_plus_act.fa -t ${PBS_NCPUS:-2} -k 100
bash scripts/global_tree_gisaid_plus.sh -i ../data/act_2020-05-20/gisaid_hcov-19_2020_05_20_01.fasta -a ../data/act/act.fasta -d 5 -o ../data/act/global_plus_act.fa -t ${PBS_NCPUS:-2} -k 100
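
Reviewer note: the commented-out invocations above record earlier test runs; only the final command is live. ${PBS_NCPUS:-2} expands to the CPU count PBS allocates to the job, falling back to 2 when the script is run outside PBS. A minimal sketch of submitting this on Gadi; the mkdir -p safeguard for the log directory named by the -o directive is a suggestion, not part of this diff:

    # make sure the directory named by '#PBS -o gadi/log/' exists, then submit
    mkdir -p gadi/log
    qsub full-pipeline-gadi.pbs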
75 changes: 26 additions & 49 deletions scripts/align_k_dissimilar.sh
@@ -29,62 +29,61 @@ then
helpFunction
fi

set -xeuo pipefail

export TMP="${TMP:-$(mktemp -d)}"

n=$(grep '>' $inputfasta | wc -l)
echo ""
echo ""
echo "Filtering the $n initial unaligned input sequences to remove any with > 10 ambiguous bases"
echo ""
echo ""
seqmagick quality-filter --max-ambiguous 10 --max-length 100000 --min-length 100 $inputfasta $inputfasta"_filtered.fa"
inputfasta=$inputfasta"_filtered.fa"
seqmagick quality-filter --max-ambiguous 10 --max-length 100000 --min-length 100 "$inputfasta" "${TMP}/filtered.fa"

# how many sequences in the inputfasta
n=$(grep '>' $inputfasta | wc -l)
echo "$n sequences remain after filtering"
seqs="${TMP}/filtered.fa"
n=$(grep '>' "$seqs" | wc -l)
echo "$n sequences remain after filtering in '$seqs'"

# define a cutoff to switch to the quick & dirty method of choosing the seqs
cutoff=50000

inputdir=$(dirname $inputfasta)

echo "Using MAFFT to get a guide tree"

if (( $n > $cutoff )); then

echo "Using parttree distance method in MAFFT"

# use this method for maximum speed on v large datasets
mafft --thread $threads --retree 0 --treeout --parttree $inputfasta > /dev/null
mafft --thread $threads --retree 0 --treeout --parttree "${seqs}" > /dev/null

# replace all newlines
tr -d '\n' < "$inputfasta.tree" > "$inputdir/guide1.tree"

# replace all newlines then
# add branchlengths because this method doesn't give any.
# assuming equal branchlengths is obviously not correct, but nevertheless
# still gives a pragmatic way of selecting the k divergent sequences
sed 's/),/)0.1,/g' "$inputdir/guide1.tree" > "$inputdir/guide2.tree"
sed 's/))/)0.1)/g' "$inputdir/guide2.tree" > "$inputdir/guide3.tree"

tr -d '\n' < "${seqs}.tree" | sed -e 's/),/)0.1,/g' -e 's/))/)0.1)/g' > "$inputdir/guide.tree"
else

echo "Using 6mer distance method in MAFFT"

# use this method when it's feasible; it calculates 6mer distances
mafft --thread $threads --retree 0 --treeout $inputfasta > /dev/null
mafft --thread $threads --retree 0 --treeout "${seqs}" > /dev/null
# replace all newlines
tr -d '\n' < $inputfasta.tree > "$inputdir/guide3.tree"
tr -d '\n' < "${seqs}.tree" > "$TMP/guide.tree"

fi

# now continue for all methods

# add a semicolon to the tree so iq-tree can read it
echo ";" >> "$inputdir/guide3.tree"
echo ";" >> "$TMP/guide.tree"

echo "Using IQ-TREE to select the K most dissimilar sequences"

# get the k most dissimilar sequences using iq-tree
iqtree -te "$inputdir/guide3.tree" -k $k
iqtree -te "$TMP/guide.tree" -k $k

echo "Extracting K most dissimilar sequences"

@@ -94,54 +93,32 @@ if (( $n > $cutoff )); then
# extract those from the fasta file to make a new fasta
# https://www.biostars.org/p/2822/
# get the numbers of the sequences in the sorted file
awk '/^The optimal PD set has/{flag=1;next}/^Corresponding sub-tree/{flag=0}flag' "$inputdir/guide3.tree.pda" > "$inputdir/numbers.txt"
awk '/^The optimal PD set has/{flag=1;next}/^Corresponding sub-tree/{flag=0}flag' "$TMP/guide.tree.pda" > "$TMP/numbers.txt"
# get the names
grep '>' $inputfasta > "$inputdir/all_names.txt"
# get the names given the line numbers
awk 'FNR==NR{a[$1];next}(FNR in a){print}' "$inputdir/numbers.txt" "$inputdir/all_names.txt" > "$inputdir/k_names_raw.txt"
# remove the ">"
tr -d \> < "$inputdir/k_names_raw.txt" > "$inputdir/k_names.txt"
grep '>' "$seqs" > "$TMP/all_names.txt"
# get the names given the line numbers, and tr to remove the ">"
awk 'FNR==NR{a[$1];next}(FNR in a){print}' "$TMP/numbers.txt" "$TMP/all_names.txt" | tr -d \> > "$TMP/k_names.txt"

else
# extract those from the fasta file to make a new fasta
awk '/^The optimal PD set has/{flag=1;next}/^Corresponding sub-tree/{flag=0}flag' "$inputdir/guide3.tree.pda" > "$inputdir/k_names_long.txt"
awk '/^The optimal PD set has/{flag=1;next}/^Corresponding sub-tree/{flag=0}flag' "$TMP/guide.tree.pda" > "$TMP/k_names_long.txt"

# these names are like 123_name. For safety we only use the number, since mafft edits special characters in names
cut -d _ -f 1 "$inputdir/k_names_long.txt" > "$inputdir/numbers.txt"
cut -d _ -f 1 "$TMP/k_names_long.txt" > "$TMP/numbers.txt"

# get the names
grep '>' $inputfasta > "$inputdir/all_names.txt"

# get the names given the line numbers
awk 'FNR==NR{a[$1];next}(FNR in a){print}' "$inputdir/numbers.txt" "$inputdir/all_names.txt" > "$inputdir/k_names_raw.txt"

# remove the ">"
tr -d \> < "$inputdir/k_names_raw.txt" > "$inputdir/k_names.txt"
grep '>' $seqs > "$TMP/all_names.txt"

# get the names given the line numbers, then remove the ">"
awk 'FNR==NR{a[$1];next}(FNR in a){print}' "$TMP/numbers.txt" "$TMP/all_names.txt" | tr -d '>' > "$TMP/k_names.txt"
fi


rm "$inputdir/k_names_long.txt"
rm "$inputdir/guide1.tree"
rm "$inputdir/guide2.tree"
rm "$inputdir/guide3.tree"
rm "$inputdir/guide3.tree.log"
rm "$inputfasta.tree"
rm "$inputdir/k_names_raw.txt"
rm "$inputdir/numbers.txt"
rm "$inputdir/all_names.txt"
rm "$inputdir/guide3.tree.pda"


# extract the fasta records we want
#seqtk subseq $inputfasta "$inputdir/k_names.txt" > "$inputdir/k_unaligned.fa"
faSomeRecords $inputfasta "$inputdir/k_names.txt" "$inputdir/k_unaligned.fa"
faSomeRecords $seqs "$TMP/k_names.txt" "$TMP/k_unaligned.fa"

echo "Aligning K most dissimilar sequences"

# align the k most dissimilar sequences in MAFFT
mafft --thread $threads --auto "$inputdir/k_unaligned.fa" > $outputfasta

rm "$inputdir/k_unaligned.fa"
rm $inputfasta
rm "$inputdir/k_names.txt"
mafft --thread $threads --auto "$TMP/k_unaligned.fa" > "$outputfasta"
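
Reviewer note: the key idiom introduced in this script is that every intermediate file now lives under $TMP, which is inherited from the environment when set (e.g. a PBS jobfs path) or created fresh with mktemp -d. A minimal sketch of the pattern, including the cleanup trap that global_tree_gisaid.sh still keeps commented out (the trap is a suggestion, not part of this diff):

    # reuse the caller's $TMP if set (e.g. PBS jobfs), otherwise make a private one
    export TMP="${TMP:-$(mktemp -d)}"
    # optionally remove the scratch directory on exit, even after a failure
    trap 'rm -rf "$TMP"' EXIT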
3 changes: 2 additions & 1 deletion scripts/assign_lineages.sh
@@ -24,6 +24,7 @@ do
done


set -xeuo pipefail
# First we get the pre-computed lineages from hcov lineages

wget -O lineages.csv $lineageurl # wget saves to a URL-derived filename by default; -O writes to lineages.csv
@@ -42,4 +43,4 @@ pangolin seqs_todo.fa -o $inputdir -t $threads

# TODO combine lineage information into single file
# i.e. add pangolin output to the lineages.csv, being aware they have different columns...
# and noting that lineages.csv is based on epiIDs
4 changes: 3 additions & 1 deletion scripts/filter_aln.sh
@@ -24,6 +24,8 @@ then
helpFunction
fi

set -xeuo pipefail

echo ""
echo "Filtering strains shorter than 29100 and/or with more than 200 ambiguities (that's ~1%)"
echo "Filtering sites with more than 1% gaps"
@@ -32,4 +34,4 @@ echo ""
esl-alimask --gapthresh 0.01 --informat afa --outformat afa --dna -o $inputfasta"_alimask.fa" -g $inputfasta
esl-alimanip --lmin 29100 --xambig 200 --informat afa --outformat afa --dna -o $outputfasta $inputfasta"_alimask.fa"

rm $inputfasta"_alimask.fa"
20 changes: 7 additions & 13 deletions scripts/global_profile_alignment.sh
@@ -29,6 +29,7 @@ then
helpFunction
fi

set -uxeo pipefail
# so we can get at it as a global variable
export REFERENCE_ALN=$refaln

@@ -37,7 +38,7 @@ echo ""
echo "Splitting unaligned input sequences into individual files"
N=$(grep ">" $inputfasta | wc -l)
N=$(( N*2 )) # doubled to ensure each record goes to its own file
faSplit sequence $inputfasta $N individual_seq
faSplit sequence $inputfasta $N $TMP/profile_align_individual_seq


echo ""
@@ -50,8 +51,7 @@ profile_align()

seqfile=$1
alfile=$seqfile"_profile_aligned.fa"
stofile=$alfile".sto"
final=$seqfile"_ind_aligned.fa"

mafft --thread 1 --quiet --keeplength --add $seqfile "$REFERENCE_ALN" > $alfile

@@ -61,22 +61,16 @@

rm $seqfile
rm $alfile

}

export -f profile_align

inputdir=$(dirname $inputfasta)
ls $inputdir | grep individual_seq | parallel -j $threads --bar "profile_align {}" > /dev/null
find $TMP -type f -name '*profile_align_individual_seq*' | parallel -j $threads --bar "profile_align {}" > /dev/null

# join it all together and clean up
# note that here we *APPEND* to the global alignment, which allows us to add small batches of new stuff whenever we like
find $inputdir -name \*.fa_ind_aligned.fa -exec cat {} \; >> $outputfasta
find $inputdir -maxdepth 1 -name "individual_seq*" -delete
find $TMP -name \*.fa_ind_aligned.fa -exec cat {} \; >> "$TMP/aligned_withdups.fa"
find $TMP -maxdepth 1 -name "*profile_align_individual_seq*" -delete

# finally, remove duplicates by name; these can occur if a seq. exists in the reference alignment
faFilter -uniq $outputfasta $outputfasta"_deduped.fa"

# overwite the output with the deduplicated file
rm $outputfasta
mv $outputfasta"_deduped.fa" $outputfasta
faFilter -uniq "$TMP/aligned_withdups.fa" $outputfasta
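
Reviewer note: GNU parallel runs every job in a subshell, so shell functions must be exported with export -f before parallel can call them, exactly as this script does with profile_align. A minimal sketch of the pattern with a hypothetical my_task function:

    my_task() { echo "processing $1"; }   # any shell function
    export -f my_task                     # without this, parallel's subshells cannot see it
    find "$TMP" -type f -name '*individual_seq*' | parallel -j 4 --bar my_task {}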
25 changes: 14 additions & 11 deletions scripts/global_tree_gisaid.sh
@@ -29,22 +29,24 @@ then
helpFunction
fi

set -uxeo pipefail

DIR="$(cd "$(dirname "$0")" && pwd)"
export TMP="${TMP:-$(mktemp -d)}"
#trap "rm -rf '$TMP'" EXIT

# first we trim the sequences
inputdir=$(dirname $inputfasta)

cd $inputdir

allseqs=$inputdir"allseqs_unaligned.fasta"
cat $inputfasta > $allseqs
allseqs="$TMP/allseqs_unaligned.fasta"
cp $inputfasta $allseqs


echo ""
echo "Processing raw data and trimming UTRs "
echo ""

trimmed_gisaid="$inputdir/trimmed.fa"
trimmed_gisaid="$TMP/trimmed.fa"
bash $DIR/trim_seqs.sh -i $allseqs -o $trimmed_gisaid -t $threads

#### BUILD THE GLOBAL ALIGNMENT ######
@@ -54,20 +56,21 @@
echo ""
echo "Making alignment of $k most dissimilar sequences"
echo ""
aln_k="$inputdir/aln_k.fa"
aln_k="$TMP/aln_k.fa"
bash $DIR/align_k_dissimilar.sh -i $trimmed_gisaid -k $k -o $aln_k -t $threads

echo ""
echo "Filtering sites with >10% gaps from k most dissimilar sequence alignment"
echo ""
aln_k_filtered="$inputdir/aln_k_filtered.fa"
aln_k_filtered="$TMP/aln_k_filtered.fa"
esl-alimask --gapthresh 0.1 --informat afa --outformat afa --dna -o $aln_k_filtered -g $aln_k



echo ""
echo "Making global profile alignment"
echo ""
aln_global="$inputdir/aln_global_unfiltered.fa"
aln_global="$TMP/aln_global_unfiltered.fa"
bash $DIR/global_profile_alignment.sh -i $trimmed_gisaid -o $aln_global -t $threads -r $aln_k_filtered


@@ -80,11 +83,11 @@ esl-alistat $aln_global

echo ""
echo "Filtering sites with >1% gaps"
esl-alimask --gapthresh 0.01 --informat afa --outformat afa --dna -o $aln_global"_alimask.fa" -g $aln_global
esl-alimask --gapthresh 0.01 --informat afa --outformat afa --dna -o "$TMP/aln_global_alimask.fa" -g $aln_global
echo "Filtering sequences that are shorter than 29100 bp and/or have >200 ambiguities"
esl-alimanip --lmin 29100 --xambig 200 --informat afa --outformat afa --dna -o $aln_global"alimanip.fa" $aln_global"_alimask.fa"
esl-alimanip --lmin 29100 --xambig 200 --informat afa --outformat afa --dna -o "$TMP/aln_global_alimanip.fa" "$TMP/aln_global_alimask.fa"

mv $aln_global $outputfasta
mv "$TMP/aln_global_alimanip.fa" "$outputfasta"

echo ""
echo "alignment stats after filtering"