-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathget_RPF_profiles.py
138 lines (114 loc) · 5.37 KB
/
get_RPF_profiles.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
import os
import sys
import numpy as np
import bisect
import pysam
from optparse import OptionParser
parser=OptionParser()
parser.add_option('-b','--bed',dest='bed',help="12-column bed file with ORF definitions, should be sorted (with ORFs on the same transcript in subsequent entries)")
parser.add_option('-B','--bam',dest='bam',help="comma-separated list of BAM files with mapped reads, should have indices")
parser.add_option('-n','--names',dest='names',default=None,help="header names for bam files (comma-separated)")
parser.add_option('-L','--L',dest='L',default='30',help="read lengths to use (comma-separated; separated by | for each bam file) [30]")
parser.add_option('-o','--offsets',dest='offsets',default='12',help="offsets of P-sites from 5' end of read (corresponding to read lengths) [12]")
parser.add_option('-s','--stranded',dest='stranded',default='yes',help="strand information (yes/no/reverse) [yes]")
parser.add_option('-N','--N',dest='N',default=150,type=int,help="nucleotide windows around start and stop codon [150]")
options,args=parser.parse_args()
print >> sys.stderr, 'using bam files',options.bam
try:
bam_files=[pysam.Samfile(bam.strip(),'rb') for bam in options.bam.split(',')]
nmapped=np.array([bam.mapped for bam in bam_files])
nB=len(bam_files)
except:
raise Exception ("couldn't open bam files")
LL=[map(int,options.L.split('|')[n].split(',')) for n in range(nB)]
Lmax=max(map(max,LL))
Lmin=min(map(min,LL))
offset=[dict(zip(LL[n],map(int,options.offsets.split('|')[n].split(',')))) for n in range(nB)]
if options.names is not None:
names=dict((n,x.strip()) for n,x in enumerate(options.names.split(',')))
if len(names)!=nB:
raise Exception("number of header names doesn't match number of bam files")
else:
names=dict(zip(range(nB),range(1,nB+1)))
print >> sys.stderr, 'using bed file',options.bed
sys.stdout.write('# bed file: '+options.bed+'\n# bam files:\n')
for n in range(nB):
sys.stdout.write('# {0}: {1} ({2} reads)\n'.format(names[n],options.bam.split(',')[n],nmapped[n]))
sys.stdout.write('# gene')
for n in range(nB):
sys.stdout.write('\tcov_start_{0}\tcov_stop_{0}'.format(names[n]))
sys.stdout.write('\n')
with open(options.bed) as inf:
for line in inf:
ls=line.split()
chrom,tstart,tend,name,_,strand,cstart,cend,_,nexons,exon_size,exon_start=ls
tstart,tend,cstart,cend,nexons=map(int,[tstart,tend,cstart,cend,nexons])
txlen=tend-tstart
exon_size=map(int,exon_size.strip(',').split(','))
exon_start=map(int,exon_start.strip(',').split(','))
fe=bisect.bisect(exon_start,cstart-tstart)-1
le=bisect.bisect(exon_start,cend-tstart)-1
rel_start=sum(exon_size[:fe])+(cstart-tstart-exon_start[fe])
rel_end=sum(exon_size[:le])+(cend-tstart-exon_start[le])
try:
cov=np.zeros((nB,txlen),dtype=int)
except MemoryError:
print >> sys.stderr, 'not enough memory for {0}; skipping this transcript'.format(name)
continue
for n,bam in enumerate(bam_files):
if chrom not in bam.references:
chrom=chrom.strip('chr')
if chrom not in bam.references:
continue
for read in bam.fetch(chrom,max(0,tstart-Lmax),tend+Lmax):
if read.is_unmapped or read.is_duplicate or read.is_qcfail:
continue
try:
is_multimapper=sum(k > 1 for t,k in read.tags if t=='NH')
except OverflowError:
is_multimapper=False
pass
pos=np.array(read.positions)-tstart
L=len(pos)
if L not in offset[n]:
continue
if strand=='-' and (options.stranded=='yes' and read.is_reverse or\
options.stranded=='reverse' and not read.is_reverse or\
options.stranded=='no'):
psite=pos[-offset[n][L]-1]
elif strand=='+' and (options.stranded=='yes' and not read.is_reverse or\
options.stranded=='reverse' and read.is_reverse or\
options.stranded=='no'):
psite=pos[offset[n][L]]
else:
continue
if psite >= 0 and psite < txlen:
cov[n,psite]+=1
cov=np.concatenate([cov[:,est:est+esi] for est,esi in zip(exon_start,exon_size)],axis=1)
txlen=cov.shape[1]
# normalize to coverage in enterior of CDS
# mean_cov=np.mean(cov[:,rel_start+options.N/2:rel_end-options.N/2],axis=1)
# normalize to RPKM of ribo reads over transcript
# mean_cov=np.sum(cov,axis=1)/(nmapped/1.e6)/(txlen/1.e3)
# normalize to mean number of reads per transcript
mean_cov=np.mean(cov,axis=1)
mean_cov[mean_cov==0]=np.nan
if np.sum(np.isfinite(mean_cov))==0:
continue
# get 2*N nt normalized coverage around start & stop, fill with NaN in intergenic space
cov_start=np.concatenate([np.nan*np.ones((nB,max(0,options.N-rel_start))),\
cov[:,max(rel_start-options.N,0):min(txlen,rel_start+options.N)],\
np.nan*np.ones((nB,max(0,rel_start+options.N-txlen)))],axis=1)/mean_cov[:,None]
cov_stop=np.concatenate([np.nan*np.ones((nB,max(0,options.N-rel_end))),\
cov[:,max(rel_end-options.N,0):min(txlen,rel_end+options.N)],\
np.nan*np.ones((nB,max(0,rel_end+options.N-txlen)))],axis=1)/mean_cov[:,None]
sys.stdout.write("{0}".format(name))
for n in range(nB):
if strand=='+':
sys.stdout.write('\t{0}\t{1}'.format(','.join('{0:.4e}'.format(c) for c in cov_start[n]),
','.join('{0:.4e}'.format(c) for c in cov_stop[n])))
else:
sys.stdout.write('\t{0}\t{1}'.format(','.join('{0:.4e}'.format(c) for c in cov_stop[n][::-1]),
','.join('{0:.4e}'.format(c) for c in cov_start[n][::-1])))
sys.stdout.write("\n")
print >> sys.stderr, 'done'