forked from weinstockj/telomere-length-pipeline
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Snakefile
135 lines (106 loc) · 3.97 KB
/
Snakefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
import os
import math
#import pdb
#pdb.set_trace()
configfile: "config.yml"
singularity: "telseq.overflow-fix2.withR.sif"
mem_step_size = 3400
def get_ids_for_batch(b):
batch_size = config["batch_size"]
beg = batch_size * int(b)
end = beg + batch_size
return config["sample_ids"][beg:end]
rule telseq_results:
params:
input = config["input_expression"]
output:
"per-sample/{sample_id}.telseq.out"
threads: 2
resources:
mem_mb = lambda wc, attempt: mem_step_size * attempt
shell:
"""
set -uo pipefail
tmp_dir=`mktemp -d`
tmp_out=$tmp_dir/$(basename {output})
tmp_in=$tmp_dir/$(basename {params.input})
set +e
gsutil -q -u {config[billing_project]} cp {params.input} $tmp_in
rc=$?
echo "download rc: $rc"
if [[ $rc == 0 ]]; then
read_length=$(~/samtools/samtools view -T ~/freeze11-calling/resources/ref/hs38DH.fa $tmp_in | awk '{{print length($10)}}' | head -n 1000 | sort -nr | head -n1)
echo "sample id: {wildcards.sample_id}"
echo "read length: $read_length"
if [[ $read_length > 0 ]]; then
~/samtools/samtools view -T ~/freeze11-calling/resources/ref/hs38DH.fa -u $tmp_in | singularity exec telseq.sif telseq -k 12 -u -o $tmp_out -r $read_length /dev/stdin
rc=$?
else
rc=1
fi
fi
if [[ $rc == 0 ]]; then
cp $tmp_out {output}
rc=$?
fi
rm -r $tmp_dir
exit $rc
"""
rule combined_telseq_results_batch:
input:
lambda wc: [rules.telseq_results.output[0].format(sample_id=id) for id in get_ids_for_batch(wc.batch)]
output:
"combined-topmed-frz11/batches/b{batch}.combined.csv"
threads: 2
resources:
mem_mb = lambda wc, attempt: mem_step_size * attempt
shell:
"""
set -euo pipefail
tmp_dir=`mktemp -d`
tmp_out=$tmp_dir/$(basename {output})
# There is a trailing column in telseq ouput that we remove here
(head -n1 {input[0]} | awk -F'\t' -vOFS=, '{{NF-=1; print "sample_id",$0}}'; for f in {input}; do sample_id=$(basename $f .telseq.out); tail -n+2 $f | awk -F'\t' -vOFS=, '{{NF-=1; print "'$sample_id'",$0}}'; done) > $tmp_out
mv $tmp_out {output}
"""
rule combined_telseq_results_batch_fix:
params:
input = lambda wc: [rules.telseq_results.output[0].format(sample_id=id) for id in get_ids_for_batch(wc.batch)]
output:
"combined-topmed-frz11-fix/batches/b{batch}.combined.csv"
threads: 2
resources:
mem_mb = lambda wc, attempt: mem_step_size * attempt
shell:
"""
set -euo pipefail
tmp_dir=`mktemp -d`
tmp_out=$tmp_dir/$(basename {output})
tmp_files=""
for f in {params.input}; do
sample_id=$(basename $f .telseq.out)
# There is a trailing column in telseq ouput that we remove here
(head -n1 $f | awk -F'\t' -vOFS=, '{{NF-=1; print "sample_id",$0}}'; tail -n+2 $f | awk -F'\t' -vOFS=, '{{NF-=1; print "'$sample_id'",$0}}') > $tmp_dir/$sample_id.csv
tmp_files="$tmp_files $tmp_dir/$sample_id.csv"
done
singularity exec telseq.sif ./combine-telseq-results.R $tmp_files > $tmp_out
mv $tmp_out {output}
rm -r $tmp_dir/
"""
rule combined_telseq_results:
input:
["combined-topmed-frz11-fix/batches/b" + str(b) + ".combined.csv" for b in range(0, math.ceil(len(config["sample_ids"]) / config["batch_size"]))]
output:
"combined-topmed-frz11-fix/all.combined.csv"
threads: 2
resources:
mem_mb = lambda wc, attempt: mem_step_size * attempt
shell:
"""
set -euo pipefail
tmp_dir=`mktemp -d`
tmp_out=$tmp_dir/$(basename {output})
singularity exec telseq.sif ./combine-telseq-results.R {input} > $tmp_out
mv $tmp_out {output}
rm -r $tmp_dir/
"""