-
Notifications
You must be signed in to change notification settings - Fork 20
/
plotHeat.py
253 lines (220 loc) · 11.6 KB
/
plotHeat.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
"""
This code was adapted from the following recipe:
* http://altanalyze.blogspot.se/2012/06/hierarchical-clustering-heatmaps-in.html
* http://code.activestate.com/recipes/578175/
Which was in turn inspired by many other posts:
* http://stackoverflow.com/questions/7664826
* http://stackoverflow.com/questions/2982929
* http://stackoverflow.com/questions/2455761
Running this with cosine or other distance metrics can often produce negative Z scores during clustering, so adjustments to the clustering may be required. Information about distance measures can be found here:
* http://docs.scipy.org/doc/scipy/reference/cluster.hierarchy.html
* http://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.cdist.html
The documentation about the custom color gradients can be found here:
* http://matplotlib.sourceforge.net/examples/pylab_examples/custom_cmap.html
"""
# Built-in modules #
import random
# Third party modules #
import numpy, scipy, matplotlib, pandas
from matplotlib import pyplot
import scipy.cluster.hierarchy as sch
import scipy.spatial.distance as dist
import names
###############################################################################
# Create Custom Color Gradients #
red_black_sky = {'red': ((0.0, 0.0, 0.0), (0.5, 0.0, 0.1), (1.0, 1.0, 1.0)),
'green': ((0.0, 0.0, 0.9), (0.5, 0.1, 0.0), (1.0, 0.0, 0.0)),
'blue': ((0.0, 0.0, 1.0), (0.5, 0.1, 0.0), (1.0, 0.0, 0.0))}
red_black_blue = {'red': ((0.0, 0.0, 0.0), (0.5, 0.0, 0.1), (1.0, 1.0, 1.0)),
'green': ((0.0, 0.0, 0.0), (1.0, 0.0, 0.0)),
'blue': ((0.0, 0.0, 1.0), (0.5, 0.1, 0.0), (1.0, 0.0, 0.0))}
red_black_green = {'red': ((0.0, 0.0, 0.0), (0.5, 0.0, 0.1), (1.0, 1.0, 1.0)),
'blue': ((0.0, 0.0, 0.0), (1.0, 0.0, 0.0)),
'green': ((0.0, 0.0, 1.0), (0.5, 0.1, 0.0), (1.0, 0.0, 0.0))}
yellow_black_blue = {'red': ((0.0, 0.0, 0.0), (0.5, 0.0, 0.1), (1.0, 1.0, 1.0)),
'green': ((0.0, 0.0, 0.8), (0.5, 0.1, 0.0), (1.0, 1.0, 1.0)),
'blue': ((0.0, 0.0, 1.0), (0.5, 0.1, 0.0), (1.0, 0.0, 0.0))}
make_cmap = lambda x: matplotlib.colors.LinearSegmentedColormap('my_colormap', x, 256)
color_gradients = {'red_black_sky' : make_cmap(red_black_sky),
'red_black_blue' : make_cmap(red_black_blue),
'red_black_green' : make_cmap(red_black_green),
'yellow_black_blue' : make_cmap(yellow_black_blue),
'red_white_blue' : pyplot.cm.bwr,
'seismic' : pyplot.cm.seismic,
'green_white_purple' : pyplot.cm.PiYG_r,
'coolwarm' : pyplot.cm.coolwarm,
'blues' : pyplot.cm.Blues,
'hot' : pyplot.cm.hot}
###############################################################################
class HiearchicalHeatmap(object):
"""A common use case for biologists analyzing their gene expression data is to cluster and visualize patterns of expression in the form of a heatmap and associated dendrogram."""
row_method = 'single' # Can be: linkage, single, complete, average, weighted, centroid, median, ward
column_method = 'single' # Can be: linkage, single, complete, average, weighted, centroid, median, ward
row_metric = 'braycurtis' # Can be: see scipy documentation
column_metric = 'braycurtis' # Can be: see scipy documentation
gradient_span = 'min_to_max' # Can be: min_to_max, min_to_max_centered, only_max, only_min
#color_gradient = 'yellow_black_blue' # Can be: see color_gradients dictionary
color_gradient = 'hot' # Can be: see color_gradients dictionary
fig_weight = 12
fig_height = 8.5
def plot(self):
# Names #
row_header = self.frame.index
column_header = self.frame.columns
# What color to use #
cmap = color_gradients[self.color_gradient]
# Scale the max and min colors #
value_min = self.frame.min().min()
value_max = self.frame.max().max()
if self.gradient_span == 'min_to_max_centered':
value_max = max([value_max, abs(value_min)])
value_min = value_max * -1
if self.gradient_span == 'only_max': value_min = 0
if self.gradient_span == 'only_min': value_max = 0
print "min", value_min, "max", value_max
norm = matplotlib.colors.Normalize(value_min, value_max)
# Scale the figure window size #
fig = pyplot.figure(figsize=(self.fig_weight, self.fig_height))
# Calculate positions for all elements #
# ax1, placement of dendrogram 1, on the left of the heatmap
### The second value controls the position of the matrix relative to the bottom of the view
[ax1_x, ax1_y, ax1_w, ax1_h] = [0.05, 0.22, 0.2, 0.6]
width_between_ax1_axr = 0.004
### distance between the top color bar axis and the matrix
height_between_ax1_axc = 0.004
### Sufficient size to show
color_bar_w = 0.015
# axr, placement of row side colorbar #
### second to last controls the width of the side color bar - 0.015 when showing
[axr_x, axr_y, axr_w, axr_h] = [0.31, 0.1, color_bar_w, 0.6]
axr_x = ax1_x + ax1_w + width_between_ax1_axr
axr_y = ax1_y; axr_h = ax1_h
width_between_axr_axm = 0.004
# axc, placement of column side colorbar #
### last one controls the hight of the top color bar - 0.015 when showing
[axc_x, axc_y, axc_w, axc_h] = [0.4, 0.63, 0.5, color_bar_w]
axc_x = axr_x + axr_w + width_between_axr_axm
axc_y = ax1_y + ax1_h + height_between_ax1_axc
height_between_axc_ax2 = 0.004
# axm, placement of heatmap for the data matrix #
[axm_x, axm_y, axm_w, axm_h] = [0.4, 0.9, 2.5, 0.5]
axm_x = axr_x + axr_w + width_between_axr_axm
axm_y = ax1_y; axm_h = ax1_h
axm_w = axc_w
# ax2, placement of dendrogram 2, on the top of the heatmap #
### last one controls hight of the dendrogram
[ax2_x, ax2_y, ax2_w, ax2_h] = [0.3, 0.72, 0.6, 0.15]
ax2_x = axr_x + axr_w + width_between_axr_axm
ax2_y = ax1_y + ax1_h + height_between_ax1_axc + axc_h + height_between_axc_ax2
ax2_w = axc_w
# axcb - placement of the color legend #
[axcb_x, axcb_y, axcb_w, axcb_h] = [0.07, 0.88, 0.18, 0.09]
# Compute and plot top dendrogram #
if self.column_method:
d2 = dist.pdist(self.frame.transpose())
D2 = dist.squareform(d2)
ax2 = fig.add_axes([ax2_x, ax2_y, ax2_w, ax2_h], frame_on=True)
Y2 = sch.linkage(D2, method=self.column_method, metric=self.column_metric)
Z2 = sch.dendrogram(Y2)
ind2 = sch.fcluster(Y2, 0.7*max(Y2[:,2]), 'distance')
ax2.set_xticks([])
ax2.set_yticks([])
### apply the clustering for the array-dendrograms to the actual matrix data
idx2 = Z2['leaves']
self.frame = self.frame.iloc[:,idx2]
### reorder the flat cluster to match the order of the leaves the dendrogram
ind2 = ind2[idx2]
else: idx2 = range(self.frame.shape[1])
# Compute and plot left dendrogram #
if self.row_method:
d1 = dist.pdist(self.frame)
D1 = dist.squareform(d1)
ax1 = fig.add_axes([ax1_x, ax1_y, ax1_w, ax1_h], frame_on=True)
Y1 = sch.linkage(D1, method=self.row_method, metric=self.row_metric)
Z1 = sch.dendrogram(Y1, orientation='right')
ind1 = sch.fcluster(Y1, 0.7*max(Y1[:,2]), 'distance')
ax1.set_xticks([])
ax1.set_yticks([])
### apply the clustering for the array-dendrograms to the actual matrix data
idx1 = Z1['leaves']
self.frame = self.frame.iloc[idx1,:]
### reorder the flat cluster to match the order of the leaves the dendrogram
ind1 = ind1[idx1]
else: idx1 = range(self.frame.shape[0])
# Plot distance matrix #
axm = fig.add_axes([axm_x, axm_y, axm_w, axm_h])
axm.matshow(self.frame, aspect='auto', origin='lower', cmap=cmap, norm=norm)
axm.set_xticks([])
axm.set_yticks([])
# Add text #
new_row_header = []
new_column_header = []
for i in range(self.frame.shape[0]):
axm.text(self.frame.shape[1]-0.5, i, ' ' + row_header[idx1[i]], verticalalignment="center")
new_row_header.append(row_header[idx1[i]] if self.row_method else row_header[i])
for i in range(self.frame.shape[1]):
axm.text(i, -0.9, ' '+column_header[idx2[i]], rotation=90, verticalalignment="top", horizontalalignment="center")
new_column_header.append(column_header[idx2[i]] if self.column_method else column_header[i])
# Plot column side colorbar #
if self.column_method:
axc = fig.add_axes([axc_x, axc_y, axc_w, axc_h])
cmap_c = matplotlib.colors.ListedColormap(['r', 'g', 'b', 'y', 'w', 'k', 'm'])
dc = numpy.array(ind2, dtype=int)
dc.shape = (1,len(ind2))
axc.matshow(dc, aspect='auto', origin='lower', cmap=cmap_c)
axc.set_xticks([])
axc.set_yticks([])
# Plot column side colorbar #
if self.row_method:
axr = fig.add_axes([axr_x, axr_y, axr_w, axr_h])
dr = numpy.array(ind1, dtype=int)
dr.shape = (len(ind1),1)
cmap_r = matplotlib.colors.ListedColormap(['r', 'g', 'b', 'y', 'w', 'k', 'm'])
axr.matshow(dr, aspect='auto', origin='lower', cmap=cmap_r)
axr.set_xticks([])
axr.set_yticks([])
# Plot color legend #
### axes for colorbar
axcb = fig.add_axes([axcb_x, axcb_y, axcb_w, axcb_h], frame_on=False)
cb = matplotlib.colorbar.ColorbarBase(axcb, cmap=cmap, norm=norm, orientation='horizontal')
axcb.set_title("colorkey")
max_cb_ticks = 5
axcb.xaxis.set_major_locator(pyplot.MaxNLocator(max_cb_ticks))
# Render the graphic #
if len(row_header)>50 or len(column_header)>50: pyplot.rcParams['font.size'] = 5
else: pyplot.rcParams['font.size'] = 8
# Return figure #
return fig, axm, axcb, cb
###############################################################################
class MyHeatmap(HiearchicalHeatmap):
short_name = 'test_heatmap'
def data(self, size=20):
"""Create some fake data in a dataframe"""
#numpy.random.seed(0)
#random.seed(0)
#x = scipy.rand(size)
#M = scipy.zeros([size,size])
#for i in range(size):
#for j in range(size): M[i,j] = abs(x[i] - x[j])
#df = pandas.DataFrame(M, index=[names.get_last_name() for _ in range(size)],
#columns=[names.get_first_name() for _ in range(size)])
df = pandas.read_table("out/effScoreComp.tsv", index_col=0)
for row in df:
print repr(row)
#df['Mary']['Day'] = 1.5
#df['Issac']['Day'] = 1.0
return df
def plot(self):
self.frame = self.data()
self.path = 'out/' + self. short_name + '.pdf'
fig, axm, axcb, cb = HiearchicalHeatmap.plot(self)
cb.set_label("Pearson Correlation")
pyplot.savefig(self.path)
pyplot.savefig(self.path.replace(".pdf", ".png"))
print "saved to %s and .png" % self.path
###############################################################################
def main():
graph = MyHeatmap()
graph.plot()
return graph
main()