-
Notifications
You must be signed in to change notification settings - Fork 93
/
Copy pathpca_demo.R
142 lines (118 loc) · 4.67 KB
/
pca_demo.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
## In this example, the data is in a matrix called
## data.matrix
## columns are individual samples (i.e. cells)
## rows are measurements taken for all the samples (i.e. genes)
## Just for the sake of the example, here's some made up data...
data.matrix <- matrix(nrow=100, ncol=10)
colnames(data.matrix) <- c(
paste("wt", 1:5, sep=""),
paste("ko", 1:5, sep=""))
rownames(data.matrix) <- paste("gene", 1:100, sep="")
for (i in 1:100) {
wt.values <- rpois(5, lambda=sample(x=10:1000, size=1))
ko.values <- rpois(5, lambda=sample(x=10:1000, size=1))
data.matrix[i,] <- c(wt.values, ko.values)
}
head(data.matrix)
dim(data.matrix)
pca <- prcomp(t(data.matrix), scale=TRUE)
## plot pc1 and pc2
plot(pca$x[,1], pca$x[,2])
## make a scree plot
pca.var <- pca$sdev^2
pca.var.per <- round(pca.var/sum(pca.var)*100, 1)
barplot(pca.var.per, main="Scree Plot", xlab="Principal Component", ylab="Percent Variation")
## now make a fancy looking plot that shows the PCs and the variation:
library(ggplot2)
pca.data <- data.frame(Sample=rownames(pca$x),
X=pca$x[,1],
Y=pca$x[,2])
pca.data
ggplot(data=pca.data, aes(x=X, y=Y, label=Sample)) +
geom_text() +
xlab(paste("PC1 - ", pca.var.per[1], "%", sep="")) +
ylab(paste("PC2 - ", pca.var.per[2], "%", sep="")) +
theme_bw() +
ggtitle("My PCA Graph")
## get the name of the top 10 measurements (genes) that contribute
## most to pc1.
loading_scores <- pca$rotation[,1]
gene_scores <- abs(loading_scores) ## get the magnitudes
gene_score_ranked <- sort(gene_scores, decreasing=TRUE)
top_10_genes <- names(gene_score_ranked[1:10])
top_10_genes ## show the names of the top 10 genes
pca$rotation[top_10_genes,1] ## show the scores (and +/- sign)
#######
##
## NOTE: Everything that follow is just bonus stuff.
## It simply demonstrates how to get the same
## results using "svd()" (Singular Value Decomposition) or using "eigen()"
## (Eigen Decomposition).
##
#######
############################################
##
## Now let's do the same thing with svd()
##
## svd() returns three things
## v = the "rotation" that prcomp() returns, this is a matrix of eigenvectors
## in other words, a matrix of loading scores
## u = this is similar to the "x" that prcomp() returns. In other words,
## sum(the rotation * the original data), but compressed to the unit vector
## You can spread it out by multiplying by "d"
## d = this is similar to the "sdev" value that prcomp() returns (and thus
## related to the eigen values), but not
## scaled by sample size in an unbiased way (ie. 1/(n-1)).
## For prcomp(), sdev = sqrt(var) = sqrt(ss(fit)/(n-1))
## For svd(), d = sqrt(ss(fit))
##
############################################
svd.stuff <- svd(scale(t(data.matrix), center=TRUE))
## calculate the PCs
svd.data <- data.frame(Sample=colnames(data.matrix),
X=(svd.stuff$u[,1] * svd.stuff$d[1]),
Y=(svd.stuff$u[,2] * svd.stuff$d[2]))
svd.data
## alternatively, we could compute the PCs with the eigen vectors and the
## original data
svd.pcs <- t(t(svd.stuff$v) %*% t(scale(t(data.matrix), center=TRUE)))
svd.pcs[,1:2] ## the first to principal components
svd.df <- ncol(data.matrix) - 1
svd.var <- svd.stuff$d^2 / svd.df
svd.var.per <- round(svd.var/sum(svd.var)*100, 1)
ggplot(data=svd.data, aes(x=X, y=Y, label=Sample)) +
geom_text() +
xlab(paste("PC1 - ", svd.var.per[1], "%", sep="")) +
ylab(paste("PC2 - ", svd.var.per[2], "%", sep="")) +
theme_bw() +
ggtitle("svd(scale(t(data.matrix), center=TRUE)")
############################################
##
## Now let's do the same thing with eigen()
##
## eigen() returns two things...
## vectors = eigen vectors (vectors of loading scores)
## NOTE: pcs = sum(loading scores * values for sample)
## values = eigen values
##
############################################
cov.mat <- cov(scale(t(data.matrix), center=TRUE))
dim(cov.mat)
## since the covariance matrix is symmetric, we can tell eigen() to just
## work on the lower triangle with "symmetric=TRUE"
eigen.stuff <- eigen(cov.mat, symmetric=TRUE)
dim(eigen.stuff$vectors)
head(eigen.stuff$vectors[,1:2])
eigen.pcs <- t(t(eigen.stuff$vectors) %*% t(scale(t(data.matrix), center=TRUE)))
eigen.pcs[,1:2]
eigen.data <- data.frame(Sample=rownames(eigen.pcs),
X=(-1 * eigen.pcs[,1]), ## eigen() flips the X-axis in this case, so we flip it back
Y=eigen.pcs[,2]) ## X axis will be PC1, Y axis will be PC2
eigen.data
eigen.var.per <- round(eigen.stuff$values/sum(eigen.stuff$values)*100, 1)
ggplot(data=eigen.data, aes(x=X, y=Y, label=Sample)) +
geom_text() +
xlab(paste("PC1 - ", eigen.var.per[1], "%", sep="")) +
ylab(paste("PC2 - ", eigen.var.per[2], "%", sep="")) +
theme_bw() +
ggtitle("eigen on cov(t(data.matrix))")