-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathset_report.R
executable file
·200 lines (158 loc) · 5.8 KB
/
set_report.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
---
Title: "MicroArray gene expression network analysis of Mycobacterium tuberculosis (Mtb)"
Authers: "Ibrahim ElZahaby & Luka Lmelias"
Team: "Group_5"
---
# Libraries required
library(impute)
library(dendextend)
library(minet)
library(RColorBrewer)
# Read in the file
data <- read.csv('set_5.csv', row.names = 1)
# explore the data
# count the number of genes and contions
dim(data)
# get the condition names
colnames(data)
# Check the size of missing data
missing_data = which(is.na(data), arr.ind = TRUE) # get row and col with NA
# Impute missing data
data_impute = impute.knn(as.matrix(data))
data = data_impute$data
after_impute = which(is.na(data), arr.ind = TRUE)
# Check for data normality.
boxplot(data) # turns out to be too large to interpret, we try QQplot
qqnorm(data)
## Explore the PCA spaces
pca_gene = prcomp(as.matrix(data), center = TRUE, scale. = TRUE)
pca_summary = summary(pca_gene) # we need the importance of each PC
plot(abs(pca_gene$rotation[,1]),abs(pca_gene$rotation[,2]))
barplot(abs(pca_gene$x)) # check the distribution of PCs
# check the % variance explained by each PC
var_expl = (pca_gene$sdev^2)/sum(pca_gene$sdev^2)*100
barplot(var_expl, main = 'Variance explained',
xlab = 'componet',
ylab = '% variance explained')
dim(pca_gene$x)
scores = pca_gene$x[,1:4] # we added 4 PCs cause each PC explains only little % variance
# The fisrt and second PCA
plot(scores[,1:2], main = 'PCA scores of the 1st and 2nd PCs',
xlab = paste0("PC1 (",format(100* pca_summary$importance[2,1],digits=2),"%)"),
ylab = paste0("PC2 (",format(100* pca_summary$importance[2,2],digits=2),"%)"),
pch=21
)
# The third and fourth PCA
plot(scores[,3:4], main = 'PCA scores of The 3rd and 4th PCs',
xlab = paste0("PC3 (",format(100* pca_summary$importance[2,3],digits=2),"%)"),
ylab = paste0("PC4 (",format(100* pca_summary$importance[2,4],digits=2),"%)"),
pch=21
)
# Heatmap of gene correlation
colors<-brewer.pal(7,"RdBu")
heatmap(cor(t(data)),symm=TRUE, col=colors,
main = "Heatmap of gene correlation")
legend(x = "topright", legend = c("negative", "medium", "high"), fill = colors,
cex = 0.7) # we could not exactly get the legend colors to show blue when there is positive correlation
# Hierarchical clustering
distance_gene = dist(data, method = 'euclidean')
hclust_gene = hclust(as.dist(distance_gene), method = 'average') # We could also compare 'average' vs 'complete' if time allows
# Plot the dendrogram
plot(hclust_gene, main = 'clustering in data space',
xlab = '', sub = 'average hclust distance') # data space
# Network Inference
data = t(as.matrix(data))
nbins <- sqrt(nrow(data)) # compute mutual information
# method 1 clr
clr.net <- minet(data, method="clr",
estimator="mi.empirical",
disc="equalfreq", nbins=nbins)
clr.net[1:5,1:5]
# creating network table
clr.th <- clr.net
diag(clr.th) <- 0
clr.th[lower.tri(clr.th )] <- 0
clr.th[which(clr.th>0)] <- 1
xx <- which(clr.th==1, arr.ind=TRUE)
write.table(cbind(rownames(clr.th)[xx[,1]],
rownames(clr.th)[xx[,2]]),
file="clr.csv", col.names=FALSE,
row.names=FALSE, sep=",", quote=FALSE)
length(xx)
# set fixed threshold
thrsh <- 0.25
clr.th <- clr.net
clr.th[which(clr.net<= thrsh)] <- 0
clr.th[which(clr.net> thrsh)] <- 1
diag(clr.th) <- 0
length(clr.th)
#to write to a file (that can be imported in cytoscape)
clr.th[lower.tri(clr.th )] <- 0
xx <- which(clr.th==1, arr.ind=TRUE)
write.table(cbind(rownames(clr.th)[xx[,1]],
rownames(clr.th)[xx[,2]]),
file="clr_th.csv", col.names=FALSE,
row.names=FALSE, sep="\t", quote=FALSE)
length(xx)
# method 2 mrnet
mrnet.net <- minet(data, method="mrnet",
estimator="mi.empirical",
disc="equalfreq", nbins=nbins)
mrnet.net[1:5,1:5]#have a look at the adjacency matrix
# create the table
mrnet.th <- mrnet.net
diag(mrnet.th) <- 0
mrnet.th[lower.tri(mrnet.th )] <- 0
mrnet.th[which(mrnet.th>0)] <- 1
xx <- which(mrnet.th==1, arr.ind=TRUE)
write.table(cbind(rownames(mrnet.th)[xx[,1]],
rownames(mrnet.th)[xx[,2]]),
file="mrnet.csv", col.names=FALSE,
row.names=FALSE, sep=",", quote=FALSE)
length(xx)
# fixing the the threshold
thrsh <- 0.1
mrnet.th <- mrnet.net
mrnet.th[which(mrnet.net<= thrsh)] <- 0
mrnet.th[which(mrnet.net> thrsh)] <- 1
diag(mrnet.th) <- 0
length(mrnet.th)
#to write to a file (that can be imported in cytoscape)
mrnet.th[lower.tri(mrnet.th )] <- 0
xx <- which(mrnet.th==1, arr.ind=TRUE)
write.table(cbind(rownames(mrnet.th)[xx[,1]],
rownames(mrnet.th)[xx[,2]]),
file="mrnet_th.csv", col.names=FALSE,
row.names=FALSE, sep=",", quote=FALSE)
length(xx)
# method 3 Aracne
aracne.net <- minet(data, method="aracne",
estimator="mi.empirical",
disc="equalfreq", nbins=nbins)
aracne.net[1:5,1:5]
# write table without threshold
aracne.th <- aracne.net
diag(aracne.th) <- 0
aracne.th[lower.tri(aracne.th )] <- 0
aracne.th[which(aracne.th>0)] <- 1
xx <- which(aracne.th==1, arr.ind=TRUE)
write.table(cbind(rownames(aracne.th)[xx[,1]],
rownames(aracne.th)[xx[,2]]),
file="aracne.csv", col.names=FALSE,
row.names=FALSE, sep=",", quote=FALSE)
length(xx)
# fix threshold
thrsh <- 0.1
aracne.th <- aracne.net
aracne.th[which(aracne.net<= thrsh)] <- 0
aracne.th[which(aracne.net> thrsh)] <- 1
diag(aracne.th) <- 0
length(aracne.th)
#to write to a file (that can be imported in cytoscape)
aracne.th[lower.tri(aracne.th )] <- 0
xx <- which(aracne.th==1, arr.ind=TRUE)
write.table(cbind(rownames(aracne.th)[xx[,1]],
rownames(aracne.th)[xx[,2]]),
file="aracne_th.csv", col.names=FALSE,
row.names=FALSE, sep=",", quote=FALSE)
length(xx)