-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy path2_extract_mapped_reads.sh
141 lines (93 loc) · 3.7 KB
/
2_extract_mapped_reads.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
#!/bin/bash
############
set +e
set -u
set -o pipefail
BASEDIR="./"
OUTDIR="./mapped_reads/"
DEDUPSEQ="FALSE"
NAMELIST="not-a-file"
while getopts 'b:o:sn:' OPTION; do
case "$OPTION" in
b)
BASEDIR=$OPTARG
;;
o)
OUTDIR=$OPTARG
;;
s)
DEDUPSEQ="TRUE"
;;
n)
NAMELIST=$OPTARG
;;
?)
echo "script usage: $(basename $0) [-n file with samples names (nameslist.txt)] [-b HybPiper base folder] [-o output folder]"
echo "
Usage: 2_extract_mapped_reads.sh [options]
Options:
-b <PATH> Base folder of either HybPhaser (contains '01_data/'), HybPiper (contains sample directories), or HybPiper-RBGV (contains '04_processed_gene_directories/'). Default is './'
-o <PATH> Outpout folder to write read files into. Default is './mapped_reads/'
-n <PATH> Namelist. optional, if not set, all folders in samples directory are used as samples.
-s Select if duplicated sequences should be removed. "
exit 1
;;
esac
done
shift "$(($OPTIND -1))"
mkdir -p $OUTDIR
if [[ -d $BASEDIR/01_data ]]; then
SAMPLESDIR=$BASEDIR/01_data/
HYB="phaser"
echo $HYB
elif [[ -d $BASEDIR/04_processed_gene_directories/ ]]; then
SAMPLESDIR=$BASEDIR/04_processed_gene_directories/
HYB="piper"
else
SAMPLESDIR=$BASEDIR
HYB="piper"
fi
if [[ -f $NAMELIST ]]; then
SAMPLES=$(<$NAMELIST)
else
SAMPLES=$(ls -d -1 $SAMPLESDIR/* | rev | cut -f 1 -d "/" | rev)
fi
for SAMPLE in $SAMPLES
do
echo Generating on-target reads files for $SAMPLE
# concatenate reads (mapped per gene) per sample from either HybPhaser or HybPiper folders
if [[ $HYB == "phaser" ]]; then
cat $SAMPLESDIR/$SAMPLE/reads/*_unpaired.fasta $SAMPLESDIR/$SAMPLE/reads/*_combined.fasta > $OUTDIR/$SAMPLE"_mapped_reads.fasta" 2>/dev/null
else
cat $SAMPLESDIR/$SAMPLE/*/*_unpaired.fasta $SAMPLESDIR/$SAMPLE/*/*_interleaved.fasta > $OUTDIR/$SAMPLE"_mapped_reads.fasta" 2>/dev/null
fi
# The following pipline is based on Pierre Lindenbaum's script: https://www.biostars.org/p/3003/#3008 and composes of the following actions:
# in every row that starts with '>' put '@' at the end of the row
# replace '>' with '#'
# remove the break lines, replace '#' with a breakline, replace '@' with a tab
# sort the file according to the second column (sequences). the -u option keeps only one copy of each unique string.
# add '>' at the begining of each line
# sustitute tab (\t) with a breakline (\n)
# remove the first line (it's a blank line) and save the result into $output
if [[ $DEDUPSEQ == "TRUE" ]]; then
echo Deduplicating reads files for $SAMPLE
sed -e '/^>/s/$/@/' -e 's/^>/#/' $OUTDIR/$SAMPLE"_mapped_reads.fasta" |\
tr -d '\n' | tr "#" "\n" | tr "@" "\t" |\
sort -u -t $'\t' -f -k 2,2 |\
sed -e 's/^/>/' -e 's/\t/\n/' |\
tail -n +2 > $OUTDIR/$SAMPLE"_mapped_reads_dedup-seq.fasta"
rm $OUTDIR/$SAMPLE"_mapped_reads.fasta" 2>/dev/null
else
echo Keeping all reads for $SAMPLE
# NOTE: if the following commented code is run on paired-end interleaved data,
# which is the normal output from HybPhaser, one of the paired reads
# will be dropped (they have identical names), which seems like non-ideal
# behaviour (and different from what happens in removing duplicates above)
#sed -e '/^>/s/$/@/' -e 's/^>/#/' $OUTDIR/$SAMPLE"_mapped_reads.fasta" |\
#tr -d '\n' | tr "#" "\n" | tr "@" "\t" |\
#sort -u -t $'\t' -f -k 1,1 |\
#sed -e 's/^/>/' -e 's/\t/\n/' |\
#tail -n +2 > $OUTDIR/$SAMPLE"_mapped_reads_nodupnames.fasta"
#mv $OUTDIR/$SAMPLE"_mapped_reads_nodupnames.fasta" $OUTDIR/$SAMPLE"_mapped_reads.fasta"
fi
done