-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathparse_star_junctions.smk
115 lines (100 loc) · 3.48 KB
/
parse_star_junctions.smk
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import os
####Folders and all the other stuff
####humans
project_dir = "/SAN/vyplab/alb_projects/data/sinai_splice_junctions/"
out_spot = "parsed_splice_junctions/"
bam_spot = "/SAN/vyplab/alb_projects/data/sinai_splice_junctions/sinai_als_sj/"
bam_suffix = "SJ.out.tab"
sj_suffix = "SJ.out.tab"
####humans - as PSI
# project_dir = "/SAN/vyplab/alb_projects/data/sinai_splice_junctions/sinai_all_samples_renamed_sj_tabs/"
# out_spot = "normalized_annotated/beds/"
# bam_spot = "normalized_annotated/"
# bam_suffix = ".csv"
# sj_suffix = "_normalized_annotated.csv"
####cell lines
# a top level folder where the bams reside
# project_dir = "/SAN/vyplab/alb_projects/data/sinai_splice_junctions/"
# out_spot = "parsed_splice_junctions_cellines/"
# bam_spot = "all_bams_kds_linked/"
# bam_suffix = ".Aligned.sorted.out.bam"
# sj_suffix = ".SJ.out.tab"
####Which bed file, and what you want to name it
bed_file = "/SAN/vyplab/alb_projects/data/sinai_splice_junctions/beds/stmn2_and_unc13a_atxn1.sorted.bed"
final_output_name = "atxn1_and_aars"
#### Where do bedtools and bedops live on your system? ####
bedops_path = "/SAN/vyplab/alb_projects/tools/bedops/bin/"
bedtools_path = "/SAN/vyplab/alb_projects/tools/bedtools"
# =-------DON"T TOUCH ANYTHING PAST THIS POINT ----------------------------
output_dir = os.path.join(project_dir,out_spot)
bam_dir = os.path.join(project_dir,bam_spot)
# print(bam_dir)
SAMPLES, = glob_wildcards(bam_dir + "{sample}" + bam_suffix)
print(output_dir)
print("Number of Input Samples")
print(len(SAMPLES))
rule all_output:
input:
expand(output_dir + "{sample}.sorted.bed", sample = SAMPLES),
output_dir + final_output_name + "aggregated.clean.annotated.bed"
rule sj_to_bed:
input:
bam_dir + "{sample}" + sj_suffix
output:
temp(output_dir + "{sample}.bed")
shell:
"""
python3 splicejunction2bed.py --name --input {input} --output {output}
"""
rule sort_beds:
input:
output_dir + "{sample}.bed"
output:
output_dir + "{sample}.sorted.bed"
shell:
"""
{bedops_path}sort-bed {input} > {output}
"""
rule call_element:
input:
output_dir + "{sample}.sorted.bed"
output:
temp(output_dir + final_output_name + ".{sample}.bedops.element")
params:
bedtools = bedtools_path
shell:
"""
{params.bedtools} intersect -b {bed_file} -a {input} -wa > {output}
"""
# an aggregation over all produced clusters
rule aggregate:
input:
expand(output_dir + final_output_name + ".{sample}.bedops.element", sample = SAMPLES)
output:
output_dir + final_output_name + "aggregated.bed"
params:
cat_call = output_dir + final_output_name + "*.bedops.element"
shell:
"""
cat {params.cat_call} > {output}
"""
rule clean_aggregate:
input:
output_dir + final_output_name + "aggregated.bed"
output:
temp(output_dir + final_output_name + "aggregated.clean.bed")
shell:
"""
bedtools intersect -f 1 -wa -r -a {input} -b {bed_file} > {output}
"""
rule annotate_clean:
input:
output_dir + final_output_name + "aggregated.clean.bed"
output:
output_dir + final_output_name + "aggregated.clean.annotated.bed"
shell:
"""
bedtools intersect -f 1 -r -a {input} -b {bed_file} -wb | awk -v OFS="\t" '{{print $1,$2,$3,$4,$5,$6,$10}}' > {output}.tmp
cat {output}.tmp | uniq > {output}
rm {output}.tmp
"""