---
title: "Analysis of arrays for the knockdown of various export compoenets"
output: html_notebook
---
The arrays are single-colour Agilent arrays, and are stored on Dropbox. First it is necessary to load in the data and do background correction. We also want to remove the FLAG_LUZP4, SSRP1 and FLP_IN_293 samples.
```{r,results='hide'}
library(limma)
path = "/mnt/winhome/Dropbox/Microarray data collated/Gene Expression Lists/1- Raw data files/"
targets <- readTargets(path=path, row.names="SlideName")
targets <- subset(targets, !(knockdown %in% c("FLAG_LUZP4", "SSRP1", "FLP_IN_293")))
raw_slides = read.maimages(targets$FileName, source="agilent", path=path, green.only=TRUE, names = targets$SlideName)
bc_data <- backgroundCorrect(raw_slides, method="normexp", offset=10)
```
# Background correction #
Let's take a look at the background:
```{r}
library(ggplot2)
library(reshape2)
ggplot(melt(data.frame(raw_slides$Eb), variable.name = "Array", value.name="background")) +
aes(x=Array, y=background) +
geom_boxplot(outlier.shape = NA) +
scale_y_log10() +
theme_bw(base_size = 9) +
coord_cartesian(ylim = c(10,100)) +
theme(axis.text.x=element_text(angle=-90, hjust = 0))
```
and the foreground:
```{r}
library(ggplot2)
library(reshape2)
ggplot(melt(data.frame(raw_slides$E), variable.name = "Array", value.name="foreground")) +
aes(x=Array, y=foreground) +
geom_boxplot(outlier.shape = NA) +
scale_y_log10() +
theme_bw(base_size = 9) +
coord_cartesian(ylim = c(10,10000)) +
theme(axis.text.x=element_text(angle=-90, hjust = 0))
```
There is clear variation in the levels of background, but the variation in the foreground is much lower. Does this mean we will be introducing extra variation by doing background correction?
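To put a rough number on that, here is a minimal check (a sketch, not part of the original pipeline): compare the between-array spread of the per-array median background with that of the median foreground.
```{r, eval=FALSE}
# Sketch: sd across arrays of the per-array medians, on the log2 scale.
# A larger value for the background than the foreground would confirm the
# impression from the boxplots.
sd(apply(log2(raw_slides$Eb), 2, median))
sd(apply(log2(raw_slides$E), 2, median))
```
Let's look at the control spots: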
```{r, fig.width=8}
library(dplyr)
data.frame(raw_slides$E, ControlType = raw_slides$genes$ControlType, ProbeName = raw_slides$genes$ProbeName) %>%
subset(ControlType != 0) %>%
melt(id.vars=c("ControlType","ProbeName"),
variable.name = "Array",
value.name = "Intensity") %>%
ggplot(aes(Array, Intensity, fill=as.factor(ControlType))) +
geom_boxplot(outlier.shape = NA, position="dodge", lwd=0.5) +
coord_cartesian(ylim=c(10,1000)) +
theme_bw(base_size=9) +
scale_y_log10() +
guides(color=FALSE, shape=FALSE, fill=FALSE) +
theme(axis.text.x=element_text(angle=-90, hjust = 0, vjust=0.5))
```
The control spots look remarkably like the background. The materials and methods list that spike-ins were used, but I'm a bit worried here. Let's look at the spike-in data. The spike-ins are a set of probes named (+)E1A... and have concentrations defined in the Agilent manual. Still, there is clear variation, so perhaps it does make sense to just use the background-corrected values. Let's take a look at those:
```{r}
spikeins = data.frame(probe = c("(+)E1A_r60_3", "(+)E1A_r60_a97", "(+)E1A_r60_a20", "(+)E1A_r60_a135", "(+)E1A_r60_a22", "(+)E1A_r60_a104", "(+)E1A_r60_a107",
"(+)E1A_r60_1", "(+)E1A_r60_n9", "(+)E1A_r60_n11"),
conc = c(0.3, 4.82, 3.83, 3.3, 4.3, 1.3, 2.3, 6.3, 5.82, 5.3))
row.names(spikeins) <- spikeins$probe
print(spikeins)
```
```{r,fig.height=6, fig.width=8}
spikein_data = raw_slides[raw_slides$genes$ProbeName %in% spikeins$probe,]
data.frame(spikein_data$E, conc=spikeins[spikein_data$genes$ProbeName,"conc"]) %>%
melt(id.vars="conc", variable.name="SlideName", value.name="E") %>%
merge(targets) %>%
ggplot(aes(x=conc, y=E, col=fraction, group=paste(fraction,replicate))) +
geom_smooth(lwd=0.5, alpha=0.5) +
facet_wrap(~knockdown) +
theme_bw(base_size = 9) +
scale_y_log10()
```
What do we note from this? Well, on the slides where the background is high, there is still a good relationship between spike-in concentration and signal. Second, we only get a linear response above about 30-100 depending on the slide, BUT it's not the slides with the high background that have the poorer linear response. In addition, in some samples there is a hint that the cytoplasmic samples have a higher intensity for the same amount of RNA. This suggests that the gain has been turned up, perhaps to compensate for there being less RNA?
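One way to check the gain idea (a sketch, assuming a simple per-slide linear fit of log-signal on spike-in concentration is adequate): compare intercepts between fractions; a higher intercept at a similar slope would be consistent with a higher scanner gain on the cytoplasmic arrays.
```{r, eval=FALSE}
# Sketch: per-slide linear fits of log2(signal) on spike-in concentration,
# joined back to the targets file to compare fractions.
library(dplyr)
data.frame(spikein_data$E, conc = spikeins[spikein_data$genes$ProbeName, "conc"]) %>%
  melt(id.vars = "conc", variable.name = "SlideName", value.name = "E") %>%
  group_by(SlideName) %>%
  summarise(intercept = coef(lm(log2(E) ~ conc))[1],
            slope = coef(lm(log2(E) ~ conc))[2]) %>%
  merge(targets)
```
What does normexp background correction do to things?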
## NormExp background correction ##
```{r}
library(ggplot2)
library(reshape2)
ggplot(melt(data.frame(bc_data$E), variable.name = "Array", value.name="intensity")) +
aes(x=Array, y=intensity) +
geom_boxplot(outlier.shape = NA) +
scale_y_log10() +
theme_bw(base_size = 9) +
theme(axis.text.x=element_text(angle=-90, hjust = 0, vjust=0.5))
```
Okay, so that definitely looks worse in some cases than the uncorrected data, but that doesn't seem to correlate with the arrays that had different background; or rather, the outliers do look like ones that had high background, but not all those with high background are outliers. Let's have a look at the control spots again:
```{r}
data.frame(bc_data$E, ControlType = bc_data$genes$ControlType, ProbeName = bc_data$genes$ProbeName) %>%
subset(ControlType != 0) %>%
melt(id.vars=c("ControlType","ProbeName"),
variable.name = "Array",
value.name = "Intensity") %>%
ggplot(aes(Array, Intensity, fill=as.factor(ControlType))) +
geom_boxplot(outlier.shape = NA, position="dodge", lwd=0.5) +
coord_cartesian(ylim=c(10,1000)) +
theme_bw(base_size=9) +
scale_y_log10() +
guides(color=FALSE, shape=FALSE, fill=FALSE) +
theme(axis.text.x=element_text(angle=-90, hjust = 0, vjust=0.5))
```
Okay, so the outlier arrays look off in this as well... well off. What do the spike-ins look like?
```{r, fig.height=6, fig.width=8}
bc_spikein_data = bc_data[bc_data$genes$ProbeName %in% spikeins$probe,]
data.frame(bc_spikein_data$E, conc=spikeins[bc_spikein_data$genes$ProbeName,"conc"]) %>%
melt(id.vars="conc", variable.name="SlideName", value.name="E") %>%
merge(targets) %>%
ggplot(aes(x=conc, y=E, col=fraction, group=paste(fraction,replicate))) +
geom_smooth(lwd=0.5, alpha=0.5) +
facet_wrap(~knockdown) +
theme_bw(base_size = 9) +
scale_y_log10()
```
## Minimum background correction ##
So this is clearly no good. The arrays with the funny correction now have no linear response until about 1000. Is "minimum" correction any better?
```{r}
bc_data = backgroundCorrect(raw_slides, method="minimum")
bc_spikein_data = bc_data[bc_data$genes$ProbeName %in% spikeins$probe,]
data.frame(bc_spikein_data$E, conc=spikeins[bc_spikein_data$genes$ProbeName,"conc"]) %>%
melt(id.vars="conc", variable.name="SlideName", value.name="E") %>%
merge(targets) %>%
ggplot(aes(x=conc, y=E, col=fraction, group=paste(fraction,replicate))) +
geom_smooth(lwd=0.5, alpha=0.5) +
facet_wrap(~knockdown) +
theme_bw(base_size = 9) +
scale_y_log10()
```
That's much better (if not much different from the uncorrected). But we still have poor responsiveness at the low end from arrays with high background (i.e. the correction hasn't done much). Alternatively we could remove the bad arrays, as normexp does do a better job of correcting for higher background in all but these cases. The trouble with this is that one of them is a control, and the loss of power from dropping one of the controls would hurt.
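For reference, dropping arrays is mechanically easy, since limma objects subset by column. A minimal sketch (the slide names here are placeholders, not real slides):
```{r, eval=FALSE}
# Sketch: remove flagged arrays from both the intensity object and the targets.
bad <- c("SLIDE_A", "SLIDE_B")  # placeholder slide names
raw_slides_kept <- raw_slides[, !(colnames(raw_slides) %in% bad)]
targets_kept <- subset(targets, !(SlideName %in% bad))
```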
What do the boxplots look like now?
```{r}
ggplot(melt(data.frame(bc_data$E), variable.name = "Array", value.name="foreground")) +
aes(x=Array, y=foreground) +
geom_boxplot(outlier.shape = NA) +
scale_y_log10() +
theme_bw(base_size = 9) +
# coord_cartesian(ylim = c(10,100000)) +
theme(axis.text.x=element_text(angle=-90, hjust = 0))
```
Interesting. The standout samples here are now the UAP56-DDX39 knockdowns, even though their spike-ins look fine. This should be considered a good thing, I suppose.
What about the distribution of the negative controls?
```{r}
ggplot(melt(data.frame(bc_data$E[bc_data$genes$ControlType==-1,]), variable.name = "Array", value.name="foreground")) +
aes(x=Array, y=foreground) +
geom_boxplot(outlier.shape = NA) +
geom_hline(yintercept = 15) +
scale_y_log10() +
theme_bw(base_size = 9) +
# coord_cartesian(ylim = c(10,100000)) +
theme(axis.text.x=element_text(angle=-90, hjust = 0))
```
So some of the arrays have higher intensity in the negative controls, which suggests that the normalisation hasn't succeeded.
Do they cluster with the others? Let's say the outliers are those with control-spot averages above 100:
```{r}
subset(bc_data$E, bc_data$genes$ControlType == -1) %>% log() %>% colMeans() %>% exp() -> neg_means
hist(neg_means, breaks=30)
```
It seems there are two natural break points in the negative-control distribution: around 12 and around 20.
```{r, fig.height=8, fig.width=8}
outliers = neg_means > 20
row.colours = rep("white", times = length(colnames(bc_data)))
row.colours[outliers] <- "blue"
library(gplots)
cormat <- cor(log2(bc_data$E))
distmat = as.dist(1-cormat)
clust = hclust(distmat)
dend = as.dendrogram(clust)
heatmap.2(cormat, trace="none", Rowv = dend, Colv = dend, margins = c(10,10), cexRow=0.6, cexCol=0.6, lhei=c(1,6), lwid=c(1,6), RowSideColors = row.colours)
```
The two clusters correlate pretty well with whether the negative controls have a geometric mean intensity greater than 20, which almost certainly suggests that the background subtraction is failing. One thing we have not tried is normexp using the negative controls. Let's try that.
## Negative control based NormExp background correction ##
```{r, fig.height=6, fig.width=8}
raw_slides$genes$Status <- ifelse(raw_slides$genes$ControlType == -1, "negative", "regular")
bc_data <- nec(raw_slides)
bc_spikein_data = bc_data[bc_data$genes$ProbeName %in% spikeins$probe,]
data.frame(bc_spikein_data$E, conc=spikeins[bc_spikein_data$genes$ProbeName,"conc"]) %>%
melt(id.vars="conc", variable.name="SlideName", value.name="E") %>%
merge(targets) %>%
ggplot(aes(x=conc, y=E, col=fraction, group=paste(fraction,replicate))) +
geom_smooth(lwd=0.5, alpha=0.5) +
facet_wrap(~knockdown) +
theme_bw(base_size = 9) +
scale_y_log10() +
ggtitle("SpikeIn distribution after NEC backgruond correction")
```
Not perfect: we still have outlier arrays - one of the SPT16 cytoplasmic samples and one of the THOC5-ALYREF total samples. Are these also outliers in the boxplots?
```{r}
ggplot(melt(data.frame(bc_data$E), variable.name = "Array", value.name="foreground")) +
aes(x=Array, y=foreground) +
geom_boxplot(outlier.shape = NA) +
scale_y_log10() +
theme_bw(base_size = 9) +
coord_cartesian(ylim = c(10,100000)) +
theme(axis.text.x=element_text(angle=-90, hjust = 0)) +
ggtitle("Probe Intensity distributions after NEC background correction")
```
So those arrays definitely do stand out: not by their mean, but by a truncated low-intensity tail. And the negative control probes:
```{r}
ggplot(melt(data.frame(bc_data$E[bc_data$genes$ControlType==-1,]), variable.name = "Array", value.name="foreground")) +
aes(x=Array, y=foreground) +
geom_boxplot() +
# geom_hline(yintercept = 15) +
scale_y_log10() +
theme_bw(base_size = 9) +
# coord_cartesian(ylim = c(10,150)) +
theme(axis.text.x=element_text(angle=-90, hjust = 0))
```
Aha! The SPT16 array has a big fat outlier on it. NEC includes a robust mode. Let's have a look at that:
## Robust Negative control based NormExp ##
```{r, fig.height=6, fig.width=8}
raw_slides$genes$Status <- ifelse(raw_slides$genes$ControlType == -1, "negative", "regular")
bc_data <- nec(raw_slides, robust = T)
bc_spikein_data = bc_data[bc_data$genes$ProbeName %in% spikeins$probe,]
data.frame(bc_spikein_data$E, conc=spikeins[bc_spikein_data$genes$ProbeName,"conc"]) %>%
melt(id.vars="conc", variable.name="SlideName", value.name="E") %>%
merge(targets) %>%
ggplot(aes(x=conc, y=E, col=fraction, group=paste(fraction,replicate))) +
geom_smooth(lwd=0.5, alpha=0.5) +
facet_wrap(~knockdown) +
theme_bw(base_size = 9) +
scale_y_log10() +
ggtitle("SpikeIn distribution after NEC backgruond correction")
```
```{r}
ggplot(melt(data.frame(bc_data$E), variable.name = "Array", value.name="foreground")) +
aes(x=Array, y=foreground) +
geom_boxplot(outlier.shape = NA) +
scale_y_log10() +
theme_bw(base_size = 9) +
coord_cartesian(ylim = c(10,100000)) +
theme(axis.text.x=element_text(angle=-90, hjust = 0)) +
ggtitle("Probe Intensity distributions after robust NEC background correction")
```
```{r}
ggplot(melt(data.frame(bc_data$E[bc_data$genes$ControlType==-1,]), variable.name = "Array", value.name="foreground")) +
aes(x=Array, y=foreground) +
geom_boxplot() +
# geom_hline(yintercept = 15) +
scale_y_log10() +
theme_bw(base_size = 9) +
coord_cartesian(ylim = c(10,75)) +
theme(axis.text.x=element_text(angle=-90, hjust = 0))+
ggtitle("Negative control probe intensity after robust NEC background correction")
```
Much better. There are still a couple of weird slides - C.UIF.96.2 and T.THOC5-ALYREF.1 amongst them - but it is much better than it was. How is the clustering looking?
```{r, fig.height=8, fig.width=8}
outliers = neg_means > 20
row.colours = rep("white", times = length(colnames(bc_data)))
row.colours[outliers] <- "blue"
library(gplots)
cormat <- cor(log2(bc_data$E))
distmat = as.dist(1-cormat)
clust = hclust(distmat)
dend = as.dendrogram(clust)
heatmap.2(cormat, trace="none", Rowv = dend, Colv = dend, margins = c(10,10), cexRow=0.6, cexCol=0.6, lhei=c(1,6), lwid=c(1,6), RowSideColors = row.colours)
```
There is still some weirdness in the clustering, with the samples that had weird backgrounds before still more common in one cluster than the others.
Indeed, these arrays cluster together and away from the others. They may well be trouble and we might have to remove them at some point, but let's go through with the analysis and then think about removing them.
## Non-expressed probes ##
There is a large non-linear section to many of the spike-in response curves. We should probably think about removing the affected probes, particularly for the clustering. To do this we will use the "10% brighter than the negatives" rule. Let's define a per-slide cutoff:
```{r}
neg95 <- apply(bc_data$E[bc_data$genes$ControlType == -1,], 2, function(x) quantile(x,p=0.95))
data.frame(bc_spikein_data$E, conc=spikeins[bc_spikein_data$genes$ProbeName,"conc"]) %>%
melt(id.vars="conc", variable.name="SlideName", value.name="E") %>%
merge(targets) %>%
ggplot(aes(x=conc, y=E, col=fraction, group=paste(fraction,replicate))) +
geom_smooth(lwd=0.5, alpha=0.5) +
facet_wrap(~knockdown) +
theme_bw(base_size = 9) +
scale_y_log10() +
geom_hline(yintercept = mean(neg95)*1.1, lty=2, alpha=0.5) +
ggtitle("Negative expression cutoff vs spikeins")
```
That looks like it might be okay.
We are going to do the normalisation using an external program called "IRON", as this is supposed to cope well with experiments where a large fraction of the probes change. Before we export the data, we remove the control spots and the genes that are not expressed.
```{r}
neg95 <- apply(bc_data$E[bc_data$genes$ControlType == -1,], 2, function(x) quantile(x,p=0.95))
cutoff = matrix(1.2*neg95, nrow(bc_data), ncol(bc_data), byrow=TRUE)
isexpr <- rowSums(bc_data$E > cutoff) >=3
print(table(isexpr))
filtered_data <- bc_data[isexpr,]
print (paste("Spots before control removal", dim(bc_data$E)[1]))
print (paste("Spots after control removal", dim(filtered_data$E)[1]))
```
Now we export the expression matrix, identify the median sample, run iron_generic to normalise, and then reload the normalised values:
```{r}
fn <- paste(path, "background_corrected.txt", sep="")
median_file = paste(path, "findmedian.txt", sep="")
median_log = paste(path, "findmedian.log", sep="")
normed_out = paste(path, "normed_out.txt", sep="")
norm_log_file = paste(path, "norm.log", sep="")
print(fn)
fnq = shQuote(fn)
print(fnq)
write.table(filtered_data$E, file = fn, quote = F, sep = "\t", col.names= NA, row.names = T)
system2(command = "/home/mb1ims/bin/bio/libaffy/bin/findmedian",
args = c("--spreadsheet ", fnq),
stdout = median_file,
stderr = median_log,)
median_sample <- system(command = paste("cat", shQuote(median_file),
"| grep '^Median CEL:'",
"| cut -f 4", sep=" "),
intern=TRUE)
print(median_sample)
system2(command = "/home/mb1ims/bin/bio/libaffy/bin/iron_generic",
args = c("--bg-none",
paste("--norm-iron", median_sample, sep="="),
paste("-o", shQuote(normed_out)),
fnq),
stdout = norm_log_file,
stderr = norm_log_file)
normed_exp <- read.delim(normed_out, header=T, row.names=1)
normed_data <- filtered_data
normed_data$E <- normed_exp
normed_data <- new("EList", unclass(normed_data))
```
Okay, let's have a look at the boxplots:
```{r}
ggplot(melt(data.frame(normed_data$E), variable.name = "Array", value.name="intensity")) +
aes(x=Array, y=intensity) +
geom_boxplot(outlier.shape = NA) +
theme_bw(base_size = 9) +
theme(axis.text.x=element_text(angle=-90, hjust = 0, vjust=0.5))
```
Let's look at the spike-in controls:
```{r}
normed_spikein_data = normed_data[normed_data$genes$ProbeName %in% spikeins$probe,]
data.frame(normed_spikein_data$E, conc=spikeins[normed_spikein_data$genes$ProbeName,"conc"]) %>%
melt(id.vars="conc", variable.name="SlideName", value.name="E") %>%
merge(targets) %>%
ggplot(aes(x=conc, y=E, col=fraction, group=paste(fraction,replicate))) +
geom_smooth(lwd=0.5, alpha=0.5) +
facet_wrap(~knockdown) +
theme_bw(base_size = 9) +
ggtitle("Post-normalisation spike in curves")
```
This is clearly wrong. We don't expect to see cytoplasmic samples matching total samples in the spike-ins, because concentrations were normalised before adding the spike-ins. But that the cytoplasmic samples are HIGHER rings alarm bells.
There are several possibilities for where to go now. The real decision to be made is whether we should use the spike-ins to do the normalisation, or just use them to check the results. In theory, if the spike-ins were added after concentration normalisation, then they do not serve as good standards. On the other hand, if most of the RNA is ribosomal RNA, which is not dependent on the usual export mechanisms, then this will not be such an issue. So things to consider are:
1. VSN normalisation using the spike-ins
2. Global Rank-invariant Set Normalisation (GRSN) - as opposed to IRON, which is iterative rank-invariant normalisation
3. Quantile normalisation (either using the whole spectrum or just the spikeins)
GRSN recommends applying a normalisation method before use, but it talks about RMA, which is more of a background subtraction than a normalisation. We could do VSN, which claims to deal with background, but GRSN might undo its work, so perhaps we should do VSN after GRSN - get the distribution right and then shift it using the spikes. We could also effectively do what GRSN is doing, but pre-selecting the spike-ins as the invariant set.
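For the record, option 1 might look something like the sketch below, assuming the Bioconductor vsn package: fit the per-array transformations on the spike-in rows only, then apply them to all probes.
```{r, eval=FALSE}
# Sketch only: VSN calibrated on the spike-ins, then applied to every probe.
library(vsn)
spike_rows <- raw_slides$genes$ProbeName %in% spikeins$probe
vsn_fit <- vsn2(raw_slides$E[spike_rows, ])
vsn_all <- predict(vsn_fit, newdata = raw_slides$E)
```
Let's try GRSN on its own to start with (well, after normexp) and see what happens there.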
First we need to define the functions. Let's start with code to select the invariant set:
```{r}
# Function to get an invariant set of probes. Returns a vector of indexes.
getInvariant <- function(eset,        # Main data as a limma EList
                         count=5000,  # Number of invariant probes to select
                         exclude=NA)  # Logical vector marking probes to exclude from selection
{
  # Data to normalize.
  rawData <- eset$E
  adjust <- max(0, (0.25 - min(rawData)))
  M1 <- log2(rawData + adjust)
  total <- dim(M1)[[1]]
  idx <- 1:total
  subSet <- 1:total
  # Exclude negative controls, spike-ins etc.
  if (!any(is.na(exclude)))
  {
    total <- total - sum(exclude)
    idx <- idx[!exclude]
    subSet <- subSet[!exclude]
  }
  # Calculate number of probe sets to exclude at each iteration.
  discardNumber <- (total - count) / 4
  ### Main iteration loop to get approximate GRiS. ###
  while (TRUE)
  {
    total <- floor(max(total - discardNumber, count))
    M2 <- cbind(apply(M1[idx, ], 2, rank))
    V2 <- apply(M2, 1, var)
    subSet <- order(V2, decreasing=FALSE)[1:total]
    idx <- idx[subSet]
    if (total == count) break
  }
  invariantIdx <- idx
  return (invariantIdx)
}
#
# Function to normalize arrays to a set of standards.
# Standards could be selected via the above global invariant selection,
# or could be known controls.
#
normalizeToInvariants <- function(data,          # limma EList
                                  invariantIdx,  # indexes of probes to use as invariants
                                  f)             # smoothness parameter for lowess
{
  rawData <- data$E
  adjust <- max(0, (0.25 - min(rawData)))
  M1 <- log2(rawData + adjust)
  # Get the average of the reference set.
  # Do a trimmed mean to be robust, but eliminate the "artifact" that
  # shows up when doing median on an odd number of samples.
  Mavg <- apply(M1[, ], 1, mean, trim=0.25)
  Mnew <- NULL
  x <- Mavg
  for (b in 1:dim(M1)[[2]])
  {
    y <- M1[,b]
    ### M vs. A transformed data. ###
    M <- y-x
    A <- (y+x)/2
    ### Lowess curve based on M vs. A transformed data. ###
    curve <- lowess(x=A[invariantIdx], y=M[invariantIdx], f=f)
    ### Create evenly spaced lookup from calibration curve. ###
    aCurve <- curve[[1]]
    mCurve <- curve[[2]]
    steps <- 1000
    sampleMin <- min(A)
    sampleMax <- max(A)
    step <- (sampleMax - sampleMin) / steps
    position <- seq(sampleMin, sampleMax, length=steps + 1)
    adjust <- array(0,c(steps+1))
    count <- length(aCurve)
    idxL <- 1
    idxR <- 2
    for (i in 1:(steps + 1))
    {
      while (idxR < count && position[i] > aCurve[idxR])
      {
        idxR <- idxR + 1
      }
      while ((idxL + 1) < idxR && position[i] > aCurve[idxL + 1])
      {
        idxL <- idxL + 1
      }
      while (idxR < count && aCurve[idxL] >= aCurve[idxR])
      {
        idxR <- idxR + 1
      }
      if (aCurve[idxL] < aCurve[idxR])
      {
        adjust[i] <- (((mCurve[idxR] - mCurve[idxL])/(aCurve[idxR] - aCurve[idxL]))
                      *(position[i] - aCurve[idxL]) + mCurve[idxL])
      }
    }
    ### Apply lookup to data. Can be applied to transformed or untransformed data. ###
    yPrime <- y - adjust[(A - sampleMin) / step + 1.5]
    mPrime <- yPrime - x
    Mnew <- cbind(Mnew, yPrime)
  }
  colnames(Mnew) <- colnames(data$E)
  data$E <- Mnew
  data <- new("EList", unclass(data))
  return(data)
}
```
Okay, this should probably be applied to the background-corrected data. Let's try the invariant selection:
```{r}
invariants <- getInvariant(filtered_data, count=1000, exclude = filtered_data$genes$ControlType != 0)
GRSN_norm_data <- normalizeToInvariants(filtered_data, invariants, 0.25)
```
```{r}
GRSN_normed_spikein_data = GRSN_norm_data[GRSN_norm_data$genes$ProbeName %in% spikeins$probe,]
data.frame(GRSN_normed_spikein_data$E, conc=spikeins[GRSN_normed_spikein_data$genes$ProbeName,"conc"]) %>%
melt(id.vars="conc", variable.name="SlideName", value.name="E") %>%
merge(targets) %>%
ggplot(aes(x=conc, y=E, col=fraction, group=paste(fraction,replicate))) +
geom_smooth(lwd=0.5, alpha=0.5) +
facet_wrap(~knockdown) +
theme_bw(base_size = 9) +
ggtitle("Post-normalisation spike in curves")
```
So this looks no different from using IRON (much quicker, though). Is it using lowly expressed genes as the invariants - that is, are we not filtering aggressively enough?
```{r}
data.frame(filtered_data$E, invariant = c(1:dim(filtered_data$E)[1]) %in% invariants) %>%
melt(id.vars = "invariant", variable.name="SlideName", value.name="E") %>%
merge(targets) %>%
subset(replicate == 2) %>%
ggplot(aes(x=E, col=invariant)) +
geom_density() +
scale_x_log10() +
facet_wrap(~knockdown+fraction)
```
The invariants are all at the top and the bottom? Interesting. I wonder if those at the top are selected because they are beyond the linear range of the array? It might be worth excluding genes in the top 5% - see the sketch below. Of course it could be real: highly expressed genes escape the effects of the knockdown. It also looks like we could use a slightly harsher low-expression cutoff.
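A quantile version of that exclusion might look like this (a sketch; the chunk below instead uses a fixed ceiling of 100000 on the row means, together with the harsher low-expression cutoff):
```{r, eval=FALSE}
# Sketch: flag the brightest 5% of probes by mean intensity for exclusion
# from invariant selection.
avg_int <- rowMeans(filtered_data$E)
too_bright <- avg_int > quantile(avg_int, 0.95)
```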
```{r}
neg95 <- apply(bc_data$E[bc_data$genes$ControlType == -1,], 2, function(x) quantile(x,p=0.95))
cutoff = matrix(1.5*neg95, nrow(bc_data), ncol(bc_data), byrow=TRUE)
isexpr <- rowSums(bc_data$E > cutoff) >=3
print(table(isexpr))
filtered_data <- bc_data[isexpr,]
print (paste("Spots before control removal", dim(bc_data$E)[1]))
print (paste("Spots after control removal", dim(filtered_data$E)[1]))
invariants <- getInvariant(filtered_data, count=500,
exclude = filtered_data$genes$ControlType != 0 | rowMeans(filtered_data$E) > 100000)
data.frame(filtered_data$E, invariant = c(1:dim(filtered_data$E)[1]) %in% invariants) %>%
melt(id.vars = "invariant", variable.name="SlideName", value.name="E") %>%
merge(targets) %>%
subset(replicate == 2) %>%
ggplot(aes(x=E, col=invariant)) +
geom_density() +
scale_x_log10() +
facet_wrap(~knockdown+fraction)
```
Now we normalise:
```{r}
GRSN_norm_data <- normalizeToInvariants(filtered_data, invariants, 0.25)
GRSN_normed_spikein_data = GRSN_norm_data[GRSN_norm_data$genes$ProbeName %in% spikeins$probe,]
data.frame(GRSN_normed_spikein_data$E, conc=spikeins[GRSN_normed_spikein_data$genes$ProbeName,"conc"]) %>%
melt(id.vars="conc", variable.name="SlideName", value.name="E") %>%
merge(targets) %>%
ggplot(aes(x=conc, y=E, col=fraction, group=paste(fraction,replicate))) +
geom_smooth(lwd=0.5, alpha=0.5) +
facet_wrap(~knockdown) +
theme_bw(base_size = 9) +
ggtitle("Post-normalisation spike in curves")
```
What happens if we use the spike-ins as the invariant set?
```{r}
spikein_normed <- normalizeToInvariants(filtered_data, which(filtered_data$genes$ControlType==1), f=0.25)
sin_normed_spikein_data = spikein_normed[spikein_normed$genes$ProbeName %in% spikeins$probe,]
data.frame(sin_normed_spikein_data$E, conc=spikeins[sin_normed_spikein_data$genes$ProbeName,"conc"]) %>%
melt(id.vars="conc", variable.name="SlideName", value.name="E") %>%
merge(targets) %>%
ggplot(aes(x=conc, y=E, col=fraction, group=paste(fraction,replicate))) +
geom_smooth(lwd=0.5, alpha=0.5) +
facet_wrap(~knockdown) +
theme_bw(base_size = 9) +
ggtitle("Post-normalisation spike in curves")
```
Well, obviously the spike-ins look perfect!
This is definitely more homogeneous than it was before. Let's hope that we haven't lost important information.
Let's see what the clustering looks like:
```{r, fig.width=8, fig.height=8}
row.colours = rep("white", times = length(colnames(bc_data)))
row.colours[outliers] <- "blue"
library(gplots)
cormat <- cor(spikein_normed$E)
distmat = as.dist(1-cormat)
clust = hclust(distmat)
dend = as.dendrogram(clust)
heatmap.2(cormat, trace="none", Rowv = dend, Colv = dend, margins = c(10,10), cexRow=0.6, cexCol=0.6, lhei=c(1,6), lwid=c(1,6), RowSideColors = row.colours)
```
So, looking at this we see: for the most part, the total and cytoplasmic samples now cluster into two groups. The exception is the UAP56_DDX39 samples, which cluster off on their own. Within each main cluster the replicates generally, but not always, cluster together. Unfortunately the three control samples do not cluster together. The samples that were outliers before are now fully integrated with the rest of the samples, and for the most part with their cognate replicates. On balance, I think we should probably leave them in for now.
Now to create the design matrix. We only want to deal with the single knockdowns to start with. All of the knockdowns are:
```{r}
unique(targets$knockdown)
```
So we want ALYREF, CHTOP, CIRP_37, CONTROL, DDX39, SPT16, SSRP1, THOC5, UAP56, UIF and FLAG_LUZP4. First, though, a quick look at what plain quantile normalisation does to the spike-ins:
```{r}
neqc_data <- normalizeBetweenArrays(filtered_data, method="quantile")
neqc_spikein_data = neqc_data[neqc_data$genes$ProbeName %in% spikeins$probe,]
data.frame(neqc_spikein_data$E, conc=spikeins[neqc_spikein_data$genes$ProbeName,"conc"]) %>%
melt(id.vars="conc", variable.name="SlideName", value.name="E") %>%
merge(targets) %>%
ggplot(aes(x=conc, y=E, col=fraction, group=paste(fraction,replicate))) +
geom_smooth(lwd=0.5, alpha=0.5) +
facet_wrap(~knockdown) +
theme_bw(base_size = 9) +
ggtitle("Post-normalisation spike in curves")
```
For comparison, and for the model fitting below, we also keep a version that is just log2-transformed, with no between-array normalisation:
```{r}
nonorm <- normalizeBetweenArrays(filtered_data, method="none")
```
```{r}
targets$fraction <- factor(targets$fraction)
targets$fraction <- relevel(targets$fraction, ref = "TOTAL")
targets$knockdown <- factor(targets$knockdown)
targets$knockdown <- relevel(targets$knockdown, ref = "CONTROL")
single_knockdown = c("ALYREF", "CHTOP", "CIRP_37", "CONTROL", "DDX39", "SPT16", "SSRP1", "THOC5", "UAP56", "UIF", "FLAG_LUZP4")
single_targets <- subset(targets, knockdown %in% single_knockdown, drop=T)
single_targets <- droplevels(single_targets)
single_design <- model.matrix(~fraction*knockdown, data=single_targets)
single_data = nonorm[,rownames(single_targets)]
```
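Before fitting, it is worth being explicit about what the interaction terms in this design estimate (a note on interpretation, not new analysis): with ~fraction*knockdown and TOTAL/CONTROL as the reference levels, each fraction:knockdown coefficient is a difference of differences.
```{r, eval=FALSE}
# Each interaction column estimates
#   (other fraction - TOTAL in that knockdown) - (other fraction - TOTAL in CONTROL),
# i.e. how the fractionation ratio shifts relative to the control knockdown.
# These are the coefficients extracted with contrasts.fit() below.
colnames(single_design)
```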
Okay, we are ready to fit the model:
```{r}
ave_single_data = avereps(single_data, ID=single_data$genes$SystematicName)
single_fit <- lmFit(single_data, single_design)
```
So let's start looking at this by making a heat map of the fold changes for genes that are significant in the F-test across all the interaction terms:
```{r}
colnames(single_design)
single_fit <- contrasts.fit(single_fit, coefficients = c(11:18))
single_fit <- eBayes(single_fit, trend = T)
heatmatrix = single_fit$coefficients[p.adjust(single_fit$F.p.value, method="BH") < 0.001,]
ccormat <- cor(heatmatrix)
cdistmat = as.dist(1-ccormat)
cclust = hclust(cdistmat)
cdend <- as.dendrogram(cclust)
rcormat <- cor(t(heatmatrix))
rdistmat = as.dist(1-rcormat)
rclust <- hclust(rdistmat)
rdend <- as.dendrogram(rclust)
heatmatrix_cropped <- heatmatrix
heatmatrix_cropped[heatmatrix_cropped < -3] <- -3
heatmatrix_cropped[heatmatrix_cropped > 3] <- 3
heatmap.2(heatmatrix_cropped,
trace = "none",
col = colorRampPalette(c("red", "white", "blue")),
Rowv = rdend)
```
That doesn't look very informative; let's look at the genes that are changed in at least one sample:
```{r, fig.height=6, fig.width=4}
tests <- decideTests(single_fit, lfc=1)
changed_genes = rowSums(tests !=0 ) > 0
print (table(changed_genes))
heatmat <- single_fit$coefficients[changed_genes,]
ccormat <- cor(heatmat)
cdistmat = as.dist(1-ccormat)
cclust = hclust(cdistmat)
cdend <- as.dendrogram(cclust)
rcormat <- cor(t(heatmat))
rdistmat = as.dist(1-rcormat)
rclust <- hclust(rdistmat)
rdend <- as.dendrogram(rclust)
heatmap.2(heatmat,
trace = "none",
col = colorRampPalette(c("red", "white", "blue")),
Colv= cdend,
Rowv= rdend,
dendrogram = "column",
labRow = NA)
```
Let's just check that we have all of our directions correct.
```{r}
most_down <- which.min(single_fit$coefficients[tests[,1] == -1,1])
# Use the data the model was fitted on, so that rows and columns line up with the test results
print(single_data$E[tests[,1] == -1,][most_down, single_targets$knockdown %in% c("ALYREF", "CONTROL")])
```
Yep, that looks right: things are down in the cytoplasm in the knockdown compared to control, but up in total compared to control.
We can plot a heatmap of the test results:
```{r, fig.height=6, fig.width=4}
heatmap.2(tests[changed_genes,],
trace="none",
col=colorRampPalette(c("red","white","blue")),
dendrogram = "column",
labRow = NA)
```