-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path16S_microbiome_analysis.html
223 lines (211 loc) · 10.2 KB
/
16S_microbiome_analysis.html
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
---
title: "16S Microbiome Data Visualization"
author: "Long Tian"
date: "2/15/2019"
output: html_notebook
---
```{r setup, include=FALSE}
knitr::opts_knit$set(root.dir = "/Users/longtian/Desktop/VinatzerLab/Dianthus")
knitr::opts_chunk$set(echo = TRUE)
```
## Introduction to this script
This script is for the Vinatzer lab's need of 16S microbiome data analysis.
Taking the output BIOM file and a couple of intermediate files from QIIME as well as the mapping file, this script aims to analyze and visualize the data with the R package of `phyloseq` from `BioConductor` and `ggplot2`, analyses will include but not limitted to *abundance analysis*, *alpha diversity*, *beta diversity*, and *hierarchical clustering* of the samples.
## Set working directory to where the input files are/ will be
```{r}
setwd("/Users/longtian/Desktop/VinatzerLab/Dianthus")
```
### Library/package needed
```{r}
source('http://bioconductor.org/biocLite.R')
biocLite('phyloseq')
library("phyloseq")
library("scales")
library("ggplot2")
library("Rcpp")
library("dplyr")
```
### Import files
The main files needed are the BIOM file and the the mapping file. Only to make sure there should be 8 columns in the mapping file, not sure why is that
```{r}
map <- import_qiime_sample_data("Mapping_all_Dianthus_112718.txt")
otu <- import_biom(BIOMfilename = "from_txt.biom",'rep_set.tre')
# Merge map file and OTU table and let it be a phyloseq object
run <- merge_phyloseq(otu,map)
# Use the official taxonomic ranks names
colnames(tax_table(run)) <- c("Kingdom", "Phylum", "Class",
"Order", "Family", "Genus","Species","Rank8","Rank9",
"Rank10","Rank11","Rank12","Rank13","Rank14","Rank15")
```
### Abundance analysis
#### Filtering the data
There would be a lot of taxon whose percentage is very low, and it's not really necessary to show them all. So the first step is to keep only those taxa which each contributes to at least 2% or 5% of the whole microbial community of the sample.
```{r}
run_genus <- run %>%
tax_glom(taxrank = "Genus") %>%
transform_sample_counts(function(x){x/sum(x)}) %>%
psmelt() %>%
filter(Abundance > 0.02) %>%
arrange(Genus)
```
#### Prepare palette
```{r}
library(RColorBrewer)
n <- dim(run_genus)[1]
qual_col_pals = brewer.pal.info[brewer.pal.info$category == 'qual',]
col_vector = unlist(mapply(brewer.pal, qual_col_pals$maxcolors, rownames(qual_col_pals)))
```
#### Barplot
A normal abundance barplot could be easily generated in this way
```{r}
run_genus$Genus <- factor(run_genus$Genus, levels = rev(levels(run_genus$Genus)))
ggplot(run_genus,aes(x=Sample,y=Abundance,fill=Genus)) +
geom_bar(position="fill",stat="identity") +
scale_fill_manual(values = col_vector) +
guides(fill=guide_legend(reverse=T,keywidth = 1,keyheight = 1)) +
ylab("Relative Abundance (Genera > 2%) \n") +
xlab("Sample") +
# ggtitle("Genus Composition")+
scale_y_continuous(labels=percent_format(),expand=c(0,0))+
theme(axis.text.x = element_text(angle=90,hjust=1), panel.background = element_blank())
```
If there the experiment consists of different groups/treatments, the barplot can be divided into groups of choice
```{r}
# run_genus$Genus <- factor(run_genus$Genus, levels = rev(levels(run_genus$Genus)))
# Add grouping info to the table
run_genus$Condition_rev <- factor(run_genus$Condition,level=rev(levels(map$Condition)))
levels(run_genus$Condition_rev) <- c('Non-inoculated','Inoculated')
library(plyr)
# run_genus$Condition_rev <- factor(run_genus$Condition,level=rev(levels(run_genus$Condition)))
# run_genus$RealSampleID <- factor(run_genus$RealSampleID,levels = map$RealSampleID)
ggplot(run_genus,aes(x=RealSampleID,y=Abundance,fill=Genus)) +
geom_bar(position="fill",stat="identity") +
scale_fill_manual(values = col_vector) +
guides(fill=guide_legend(reverse=T,keywidth = 1,keyheight = 1)) +
ylab("Relative Abundance (Genera > 2%) \n") +
xlab("Sample") +
# ggtitle("Genus Composition")+
scale_y_continuous(labels=percent_format(),expand=c(0,0))+
theme(axis.text.x = element_text(angle=90,hjust=1), panel.background = element_blank()) +
# scale_x_discrete(limit=map_total$RealSampleID) +
# scale_x_discrete(labels=c(map_total[map_total$Condition=='Non-inoculated',]$RealSampleID,map_total[map_total$Condition=='Inoculated',]$RealSampleID)) +
facet_wrap(~Condition_rev,scales="free")
```
### Alpha diversity
#### Calculate rarefaction
First we create a function that can iterate the BIOM file, i.e. rarefaction
```{r}
calculate_rarefaction_curves <- function(psdata, measures, depths) {
require('plyr') # ldply
require('reshape2') # melt
estimate_rarified_richness <- function(psdata, measures, depth) {
if(max(sample_sums(psdata)) < depth) return()
psdata <- prune_samples(sample_sums(psdata) >= depth, psdata)
rarified_psdata <- rarefy_even_depth(psdata, depth, verbose = FALSE)
alpha_diversity <- estimate_richness(rarified_psdata, measures = measures) # as.matrix forces the use of melt.array, which includes the Sample names (rownames)
molten_alpha_diversity <- melt(as.matrix(alpha_diversity), varnames = c('Sample', 'Measure'), value.name = 'Alpha_diversity')
molten_alpha_diversity
}
names(depths) <- depths # this enables automatic addition of the Depth to the output by ldply
rarefaction_curve_data <- ldply(depths, estimate_rarified_richness, psdata = psdata, measures = measures, .id = 'Depth', .progress = ifelse(interactive(), 'text', 'none')) # convert Depth from factor to numeric
rarefaction_curve_data$Depth <- as.numeric(levels(rarefaction_curve_data$Depth))[rarefaction_curve_data$Depth]
rarefaction_curve_data
}
```
One thing worth to note is the choice of depth. For the most of the time, rarefaction curve is used to compare the richness of each sample, so the depth is the minimal number of OTU of the samples. However, if you want to see the raw sample size, use the maximum.
```{r}
depth = 7000
step = 700
rarefaction_curve_data <- calculate_rarefaction_curves(run, c('Observed'), rep(seq(1,depth,by=step), each = 10))
rarefaction_curve_data_summary <- ddply(rarefaction_curve_data, c('Depth', 'Sample', 'Measure'), summarise, Alpha_diversity_mean = mean(Alpha_diversity), Alpha_diversity_sd = sd(Alpha_diversity))
rarefaction_curve_data_summary_verbose <- merge(rarefaction_curve_data_summary, data.frame(sample_data(run)), by.x = 'Sample', by.y = 'row.names')
rarefaction_curve_data_summary_verbose$RealSampleID <- factor(rarefaction_curve_data_summary_verbose$RealSampleID,levels = map$RealSampleID)
n <- dim(rarefaction_curve_data_summary_verbose)[1]
qual_col_pals = brewer.pal.info[brewer.pal.info$category == 'qual',]
col_vector = unlist(mapply(brewer.pal, qual_col_pals$maxcolors, rownames(qual_col_pals)))
ggplot( data = rarefaction_curve_data_summary_verbose,
mapping = aes( x = Depth, y = Alpha_diversity_mean,
ymin = Alpha_diversity_mean - Alpha_diversity_sd,
ymax = Alpha_diversity_mean + Alpha_diversity_sd,
colour = RealSampleID,
group = RealSampleID)
) +
scale_fill_manual(values=col_vector) +
geom_line( ) +
geom_point(size=0.5 ) +
guides(fill=guide_legend(title="Sample")) +
xlab("Sequences per sample") +
ylab("Observed OTU")
```
Also, boxplot with other alpha diversity indices can be generated. If there is such a column in the mapping file representing the desired grouping condition, say `FigureTwo` here.
```{r}
plot_richness(run,x="FigureTwo",measures=c("Observed","Shannon","Simpson")) +
geom_boxplot() +
ylab("Alpha Diversity") +
theme(axis.text.x=element_text(angle=90,hjust=0.5,size=8,face='bold'),
axis.title = element_text(size=16),
axis.title.x = element_blank(),
strip.text.x = element_text(size=14,face="bold"))
```
### Beta diversity
Beta diviersity indicates the relatedness between samples.
First, pairwise distances between samples are calculated. The two measurement used are weighted-Unifrac and unweighted-Unifrac here.
```{r}
dm_weighted_unifrac <- distance(run, method = "wUniFrac")
dm_unweighted_unifrac <- distance(run, method='Unifrac')
Treatment <- map$Treatment
Symptom <- map$Description
```
```{r}
ordWU <- ordinate(run,method='PCoA',distance=dm_weighted_unifrac)
plot_ordination(run, ordWU,,color = 'Description',shape='Treatment') +
geom_point(size=3) +
# geom_text(aes(label=map$RealSampleID),hjust=0,vjust=0) +
# theme(axis.title = element_text(size=20),
# axis.text = element_text(size=16),
# legend.text = element_text(size=16)) +
ggtitle("PCoA: Weithed Unifrac")
ordUU <- ordinate(run,method='PCoA',distance=dm_unweighted_unifrac)
plot_ordination(run, ordUU,,color = 'Description',shape='Treatment') +
geom_point(size=3) +
# geom_text(aes(label=map$RealSampleID),hjust=0,vjust=0) +
# theme(axis.title = element_text(size=20),
# axis.text = element_text(size=16),
# legend.text = element_text(size=16)) +
ggtitle("PCoA: Unweithed Unifrac")
```
### Phylogenetics tree of the samples
Bootstrap values can be generated from `pvclust`, and the topology is the same with what calculated from regular `hclust` function
```{r}
library(pvclust)
hc_pv <- pvclust(as.matrix(dm_weighted_unifrac),method.hclust = "complete")
plot(hc_pv)
hc <- hclust(as.dist(1-cor(as.matrix(dm_weighted_unifrac))),method="complete")
plot(hc)
```
The dendrogram can be plotted through `ggplot2` with the help from `ggdendro`.
```{r}
library(ggdendro)
ggdendrogram(hc,rotate=F, size=4, theme_dendro = F)
```
```{r}
ddata <- dendro_data(as.dendrogram(hc))
mapping <- map
label <- as.character(ddata$labels$label)
rownames(mapping) <- mapping$RealSampleID
Treatment <- mapping$Condition
labeled_data <- cbind(label(ddata),Treatment)
ggplot(segment(ddata)) +
geom_segment(aes(x=x,y=y,xend=xend,yend=yend)) +
coord_flip() + scale_y_reverse(expand=c(0.5,0)) +
geom_text(data = labeled_data, aes(x=x,y=y,label=label,hjust=0,color=Treatment),size=3) +
ylab("Weighted Unifrac distance") +
theme(axis.line.y=element_blank(),
axis.ticks.y = element_blank(),
axis.text.y = element_blank(),
axis.title.y=element_blank(),
plot.title = element_text(hjust=0.5),
legend.text = element_text(size=16),
axis.text.x = element_text(size=10),
axis.title.x = element_text(size=20))
```