-
Notifications
You must be signed in to change notification settings - Fork 0
/
DEVITS_css_normalize_final.R
190 lines (158 loc) · 5.47 KB
/
DEVITS_css_normalize_final.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
###########################
#DEV ITS CSS normalization
#Cassandra Wattenburger
#NOTES:
#This code normalizes the count data to avoid sequencing depth bias
#CSS normalization code was kindly provided by Allison Thompson ([email protected]
#Clear workspace and load necessary packages
rm(list=ls())
#load in necessary packages
library("vegan")
library("labdsv")
library("ggplot2")
#set working directory to wherever works for you
setwd("Y:/Cassi/Sequencing Data/Argonne 2017/01 DEV Project/02 Github code")
#######################
#Read in raw count data
otu <- read.csv("DEVITS_otu.csv", row.names=1)
str(otu)
head(otu[,1:10])
head(otu[,210:216])
#remove taxonomy column
otu <- otu[,-216]
str(otu)
head(otu[210:215])
#Remove blanks
#Likely still included due to contamination or well-hoping during Illumina sequencing
blanks <- c("B1.A02", "B3.A01", "B3.A02", "B3.A03")
otu <- otu[,!(names(otu) %in% blanks)]
str(otu)
###################################
#Quality check data for bad samples
#abundance x richness graph for each sample
#transpose data frame
otu.trans <- t(otu)
head(otu.trans[,1:10])
#calculate richness per sample
richness <- specnumber(otu.trans)
head(richness)
min(richness)
max(richness)
hist(richness)
#calculate total abundance per sample
abund <- rowSums(otu.trans)
head(abund)
min(abund)
max(abund)
hist(abund)
#format for graphing
rich.abund <- rbind(richness, abund)
head(rich.abund[,1:5])
rich.abund.trans <- t(rich.abund)
head(rich.abund.trans)
str(rich.abund.trans)
rich.abund.trans.data <- data.frame(rich.abund.trans)
rich.abund.trans.data$Sample <- row.names(rich.abund.trans.data)
str(rich.abund.trans.data)
#graph
ggplot(rich.abund.trans.data, aes(x=richness, y=abund)) +
geom_point() +
geom_vline(xintercept = 10, color="red") +
geom_hline(yintercept = 1000, color="red") +
geom_text(aes(label=Sample),hjust=0, vjust=0) +
ggtitle("ITS Raw Counts Sample Abundance vs Richness")
#MDP71, MDP66, MDP25 fall below hard cut-off of 10 richness and/or 1000 abundance
#Mock communities (MF1-3) have low richness, as expected
#remove MDP71, MDP66, MDP25
remove <- c("MDP71", "MDP66", "MDP25")
otu.filter <- otu[,!names(otu) %in% remove]
str(otu.filter)
head(otu.filter[,1:5])
##############
#CSS normalize
#remove mock communities before CSS normalization
#see "DEV16S_css_normalization_clean.R" to see how mock communities weirdly affect normalization
mock <- c("MF1", "MF2", "MF3")
otu.filter <- otu.filter[,!names(otu.filter) %in% mock]
str(otu.filter)
#Remove singletons and absent taxa
otu.filter.trans <- t(otu.filter)
head(otu.filter.trans[,1:5])
otu.nosingles <- dropspc(otu.filter.trans, 1)
str(otu.nosingles)
str(otu.filter.trans)
#removed 445 taxa that only occurred in one or fewer samples
#reformat
head(otu.nosingles[,1:10])
otu.nosingles.trans <- t(otu.nosingles)
head(otu.nosingles.trans[,1:10])
otu.nosingles.trans <- data.frame(OTU.ID = row.names(otu.nosingles.trans), otu.nosingles.trans)
head(otu.nosingles.trans[,1:10])
e_data <- otu.nosingles.trans
edata_id <- "OTU.ID"
#CSS normalization function
#code from Allison Thompson ([email protected])
CSS_Norm <- function(e_data, edata_id, q=0.75, qg="median"){
e_data_norm <- e_data[,-which(colnames(e_data)==edata_id)]
e_data_norm[e_data_norm==0] <- NA
# calculate the q quantile of data, per sample and globally #
col.q = apply(e_data_norm, 2, function(x) sum(x[x<=quantile(x, probs = q, na.rm=TRUE)], na.rm=TRUE))
if(qg == 1000){
g.q = 1000
}else if(qg == "median"){
g.q = median(col.q, na.rm=TRUE)
}else{
stop("Invalid value for qg")
}
#g.q = 1000
#g.q = median(col.q, na.rm=TRUE)
# normalize omics_data data by q quantile and transform back to count data #
for(i in 1:ncol(e_data_norm)){
e_data_norm[,i] = (e_data_norm[,i] / col.q[i]) * g.q
}
e_data <- data.frame(e_data[,which(colnames(e_data)==edata_id)],e_data_norm)
colnames(e_data)[1] <- edata_id
e_data[is.na(e_data)] <- 0
return(e_data)
}
#Normalize the data
otu_cssnorm <- CSS_Norm(e_data,edata_id)
head(otu_cssnorm[,1:5])
dim(otu_cssnorm)
#calculate richness per sample
otu_cssnorm <- otu_cssnorm[,-1]
head(otu_cssnorm[,1:5])
otu_cssnorm.trans <- t(otu_cssnorm)
head(otu_cssnorm.trans[,1:5])
richness.norm <- specnumber(otu_cssnorm.trans)
head(richness.norm)
min(richness.norm)
max(richness.norm)
hist(richness.norm)
#not changed from raw, as expected
#calculate total abundance per sample
abund.norm <- rowSums(otu_cssnorm.trans)
head(abund.norm)
min(abund.norm)
max(abund.norm)
hist(abund.norm)
#very right tailed...
#make dataframe for graphing
rich.abund.norm <- rbind(richness.norm, abund.norm)
head(rich.abund.norm[,1:5])
rich.abund.norm.trans <- t(rich.abund.norm)
head(rich.abund.norm.trans)
rich.abund.norm.trans.data <- data.frame(rich.abund.norm.trans)
head(rich.abund.norm.trans.data)
rich.abund.norm.trans.data$Sample <- row.names(rich.abund.norm.trans.data)
head(rich.abund.norm.trans.data)
#graph
ggplot(rich.abund.norm.trans.data, aes(x=richness.norm, y=abund.norm)) +
geom_point() +
geom_vline(xintercept = 10, color="red") +
geom_hline(yintercept = 1000, color="red") +
geom_text(aes(label=Sample),hjust=0, vjust=0) +
ggtitle("ITS CSS Normalized Counts Sample Abundance vs Richness")
#seems to have removed some positive correlation skew
#save file with normalized data
write.csv(otu_cssnorm, "DEVITS_cssnorm.csv")