-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathaddmethlists.py
executable file
·124 lines (106 loc) · 4.14 KB
/
addmethlists.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
#!/usr/bin/env python
#
# Sum methylation counts from two independent files.
# E.g. to combine HBS-seq and BS-seq datasets after processing.
#
# Copyright (c) 2017 Graham Gower <[email protected]>
#
# Permission to use, copy, modify, and distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
from __future__ import print_function
from gzopen import gzopen
import sys
import itertools
def parse_col0(fn):
with gzopen(fn) as f:
for line in f:
yield line.split(None, 1)[0]
def parse_tsv(fn, ichrom, ipos, chrmap, skip=1):
with gzopen(fn) as f:
while skip:
skip -= 1
next(f)
for line in f:
fields = line.split()
key = (chrmap[fields[ichrom]], int(fields[ipos]))
yield key, fields
def parse_args():
import argparse
parser = argparse.ArgumentParser(description="Sum methylation counts from two files")
parser.add_argument("-m", "--methylkit", action="store_true", default=False)
parser.add_argument("-p", "--pileOmeth", action="store_true", default=False)
parser.add_argument("-z", "--gzip", action="store_true", default=False, help="output is gzipped")
parser.add_argument("-l", "--chr-list", required=True, help="chromosome list for sort order (e.g. faidx file)")
parser.add_argument("in1")
parser.add_argument("in2")
args = parser.parse_args()
if not args.methylkit and not args.pileOmeth:
print("Must set one of --methylkit or --pileOmeth",
file=sys.stderr)
exit(1)
if args.methylkit and args.pileOmeth:
print("Parameters --methylkit and --pileOmeth are mutually exclusive",
file=sys.stderr)
exit(1)
return args
if __name__ == "__main__":
args = parse_args()
if args.methylkit:
ichrom = 1
ipos = 2
elif args.pileOmeth:
ichrom = 0
ipos = 1
else:
raise Exception("Are we doing methylkit or pileOmeth?")
chrmap = {s:i for i,s in enumerate(parse_col0(args.chr_list))}
f1 = parse_tsv(args.in1, ichrom, ipos, chrmap)
f2 = parse_tsv(args.in2, ichrom, ipos, chrmap)
l1 = l2 = oline = None
with gzopen("/dev/stdout", "w", args.gzip) as fout:
try:
k1, l1 = next(f1)
k2, l2 = next(f2)
while True:
if k1<k2:
oline = l1
l1 = None
k1, l1 = next(f1)
elif k2<k1:
oline = l2
l2 = None
k2, l2 = next(f2)
else:
if args.methylkit:
oline = l1[:4]
C = int(l1[5]) + int(l2[5])
mC = int(l1[6]) + int(l2[6])
oline.extend([C+mC, C, mC])
elif args.pileOmeth:
oline = l1[:3]
mC = int(l1[4]) + int(l2[4])
C = int(l1[5]) + int(l2[5])
oline.extend([100*mC/(C+mC), mC, C])
l1 = l2 = None
k1, l1 = next(f1)
k2, l2 = next(f2)
print(*oline, file=fout, sep="\t")
except StopIteration:
pass
if oline is not None:
print(*oline, file=fout, sep="\t")
if l1 is not None:
print(*l1, file=fout, sep="\t")
if l2 is not None:
print(*l2, file=fout, sep="\t")
for _, lx in itertools.chain(f1, f2):
print(*lx, file=fout, sep="\t")