-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathhit_module.py
193 lines (155 loc) · 5.57 KB
/
hit_module.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
"""
:platform: Unix
:synopsis: Create the class HIT which represents a TFBS hit.
"""
class HIT:
    """
    Define the representation of a TFBS hit.

    A hit stores the sequence record it was found in, its 1-based inclusive
    coordinates, its strand, its score and (optionally) the TFFM and the TFFM
    state that produced it. Hits are ordered and compared by score only.

    """

    # Strand symbols accepted by the constructor. A false value (None, "")
    # is accepted as well since :meth:`sequence` handles it like '+'.
    VALID_STRANDS = ("+", "-")

    def __init__(self, seq_record, start, end, strand, score, tffm=None,
                 tffm_matched_state=-1):
        """
        Create an instance of the :class:`HIT`.

        :arg seq_record: Sequence containing the hit.
        :type seq_record: :class:`Bio.SeqRecord`
        :arg start: Start position of the hit.
        :type start: int
        :arg end: End position of the hit.
        :type end: int
        :arg strand: Strand of the hit on the sequence. It should be either
            '+' or '-'. A false value (e.g. None) is tolerated and handled
            like '+' by :meth:`sequence`.
        :type strand: str
        :arg score: TFFM score of the hit.
        :type score: float
        :arg tffm: TFFM used to predict the hit (default: None).
        :type tffm: :class:`TFFM`
        :arg tffm_matched_state: Matching state in the TFFM to predict the hit
            (default: -1, i.e. no TFFM).
        :type tffm_matched_state: int

        :raises ValueError: If the strand is set but is neither '+' nor '-'.

        :warning: start and end are 1-based.
        :warning: The seq_record attribute is not the actual sequence of the
            hit but the whole sequence containing the hit.

        """

        # Reject unknown strand symbols early: sequence() would otherwise
        # silently reverse-complement anything that is not '+'.
        if strand and strand not in self.VALID_STRANDS:
            raise ValueError(
                "Wrong strand %r: it should be either '+' or '-'" % (strand,))
        self.seq_record = seq_record
        self.start = start
        self.end = end
        self.strand = strand
        self.score = score
        self.tffm_matched_state = tffm_matched_state
        self.tffm = tffm

    def __len__(self):
        """
        Give the length of the TFBS hit.

        :returns: The length of the TFBS hit (start and end are inclusive).
        :rtype: int

        """

        return self.end - self.start + 1

    def __str__(self):
        """
        Give the string representation of the TFBS hit.

        :returns: The string representing the hit as tab-separated fields, in
            this order: sequence id, start, end, strand, sequence, tffm-name,
            tffm-state, score. The tffm-name is "NoName" when the hit has no
            TFFM and the score is printed with :func:`repr`.
        :rtype: str

        """

        if self.tffm:
            name = self.tffm.name
        else:
            name = "NoName"
        string = "%s\t%d\t%d\t%s\t%s\t%s\t%d\t%s" % (self.seq_record.id,
                                                     self.start, self.end,
                                                     self.strand,
                                                     self.sequence(),
                                                     name,
                                                     self.tffm_matched_state,
                                                     repr(self.score))
        return string

    @staticmethod
    def _is_missing(other):
        """
        Tell whether *other* stands for "no hit" in a comparison.

        A :class:`HIT` is never missing, whatever its length. The instance
        check comes first so that the truth value of a hit (which would go
        through :meth:`__len__`) is never evaluated. Any other operand is
        missing when it is false (typically None).

        :returns: True if *other* has to be considered as the absence of hit.
        :rtype: bool

        """

        return not isinstance(other, HIT) and not other

    def __lt__(self, other):
        """
        Implement the **<** operator. The comparison looks at the score.

        :returns: False when *other* is missing (e.g. None).
        :rtype: bool

        """

        if self._is_missing(other):
            return False
        return self.score < other.score

    def __le__(self, other):
        """
        Implement the **<=** operator. The comparison looks at the score.

        :returns: False when *other* is missing (e.g. None).
        :rtype: bool

        """

        if self._is_missing(other):
            return False
        return self.score <= other.score

    def __eq__(self, other):
        """
        Implement the **==** operator. The comparison looks at the score.

        :returns: False when *other* is missing (e.g. None).
        :rtype: bool

        """

        if self._is_missing(other):
            return False
        return self.score == other.score

    # Equality is score-based and the score attribute is mutable, so hits are
    # explicitly declared unhashable (what defining __eq__ alone implies).
    __hash__ = None

    def __ne__(self, other):
        """
        Implement the **!=** operator. The comparison looks at the score.

        :returns: True when *other* is missing (e.g. None).
        :rtype: bool

        """

        if self._is_missing(other):
            return True
        return self.score != other.score

    def __gt__(self, other):
        """
        Implement the **>** operator. The comparison looks at the score.

        :returns: True when *other* is missing (e.g. None).
        :rtype: bool

        """

        if self._is_missing(other):
            return True
        return self.score > other.score

    def __ge__(self, other):
        """
        Implement the **>=** operator. The comparison looks at the score.

        :returns: True when *other* is missing (e.g. None).
        :rtype: bool

        """

        if self._is_missing(other):
            return True
        return self.score >= other.score

    def sequence(self):
        """
        Give the sequence of the TFBS hit.

        :returns: The slice of the sequence record covering the hit. It is
            returned as is when the strand is '+' or not set and reverse
            complemented otherwise.
        :rtype: same type as a slice of ``seq_record.seq`` (presumably
            :class:`Bio.Seq`, to be confirmed against the callers)

        """

        # start is 1-based and end is inclusive, hence the shift by one.
        seq = self.seq_record.seq[self.start - 1:self.end]
        if self.strand == "+" or not self.strand:
            return seq
        return seq.reverse_complement()
def get_start_end_strand(position, seq_record, tffm, negative):
    """
    Convert the end position of a TFBS hit into its start, end and strand.

    :arg position: End position of the TFBS hit on the positive strand of the
        sequence.
    :type position: int
    :arg seq_record: The actual sequence.
    :type seq_record: :class:`Bio.SeqRecord`
    :arg tffm: The TFFM used to predict the TFBS hit.
    :type tffm: :class:`TFFM`
    :arg negative: Boolean set to True if the TFBS hit is on the negative
        strand of the sequence and False otherwise.

    :returns: The start and end positions and the strand.
    :rtype: tuple(int, int, str)

    :note: The strand is '+' if the hit is on the positive strand and '-'
        otherwise.
    :warning: The input *position* is 0-based, as extracted from the TFFM
        computations, whereas the returned start and end are 1-based, which
        is the more conventional way to print TFBS hit positions.

    """

    if not negative:
        # Positive strand: shift the 0-based end by one and go back by the
        # length of the TFFM to find the 1-based start.
        last = position + 1
        first = position - len(tffm) + 2
        return first, last, "+"

    # Negative strand: the position is mirrored with respect to the sequence
    # length and the hit extends over the length of the TFFM from there.
    first = len(seq_record) - position
    last = first + len(tffm) - 1
    return first, last, "-"