-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrun_deseq2_for_mirca.R
172 lines (154 loc) · 7.59 KB
/
run_deseq2_for_mirca.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
suppressPackageStartupMessages({
require(optparse)
require(DESeq2)
require(data.table)
})
option_list <- list(
make_option(c("-i","--input"),type="character",help="input file (from get_mirca_read_counts.py)",metavar="file"),
make_option(c("-o","--outf"),type='character',default="mirca_results.csv.gz",help="output file [%default]",metavar="file"),
make_option(c("-c","--conditions"),type="character",help="comma-separated list of conditions (corresponding to the columns in input file)",metavar="list"),
make_option("--reference",type='character',help="reference condition (default: alphabetically first)",metavar="string"),
make_option("--num_covariate",type="character",help="comma-separated list of numerical covariates",metavar="list"),
make_option("--cat_covariate",type="character",help="comma-separated list of categorical covariates",metavar="list"),
make_option("--alpha",type="double",default=0.1,help="significance cutoff alpha [%default]",metavar="double"),
make_option("--test",type="character",default='LRT',help="statistical test: Wald (with betaPrior) or LRT [%default]",metavar="string"),
make_option("--min_genes",type="integer",default=10,help="min number of genes per motif [%default]",metavar="string")
)
opt_parser <- OptionParser(option_list = option_list)
options <- parse_args(opt_parser)
if (is.null(options$input)) {
print_help(opt_parser)
stop("please supply input file",call.=FALSE)
}
if (is.null(options$conditions)) {
print_help(opt_parser)
stop("please supply comma-separated list of conditions",call.=FALSE)
}
conditions <- strsplit(options$conditions,',',fixed=TRUE)[[1]]
print(paste("reading from",options$input))
counts.all <- read.csv(options$input,sep='\t',comment='#',header=T)
sampleName <- colnames(counts.all)[4:length(colnames(counts.all))]
if (length(conditions) != length(sampleName)) {
stop("number of conditions (",length(conditions),") doesn't match number of columns (",length(sampleName),") in ",options$input,call.=FALSE)
}
if (options$test=='Wald') {
full_design <- ~group
} else {
full_design <- ~condition + motif + condition:motif
reduced_design <- ~condition + motif
}
# ADD COVARIATES FOR MORE COMPLEX DESIGN IF AVAILABLE
has_covariate <- FALSE
if (!is.null(options$num_covariate)) {
has_covariate <- TRUE
print('using numerical covariate')
covariate <- as.numeric(strsplit(options$num_covariate,',',fixed=TRUE)[[1]])
} else if (!is.null(options$cat_covariate)) {
has_covariate <- TRUE
print('using categorical covariate')
covariate <- strsplit(options$cat_covariate,',',fixed=TRUE)[[1]]
}
if (has_covariate) {
if (length(covariate) != length(sampleName)) {
stop("number of covariates (",length(covariate),") doesn't match number of columns (",length(sampleName),") in ",options$input,call.=FALSE)
}
if (options$test=="Wald") {
full_design <- ~group + covariate
} else{
full_design <- ~condition + motif + covariate + condition:motif
reduced_design <- ~ condition + covariate + motif
}
}
# take only named conditions, drop empty ones or those called "_"
take <- (conditions!='_') & (conditions!='')
colData <- data.frame(condition=conditions[take],row.names=sampleName[take])
if (has_covariate) {
colData['covariate'] <- covariate[take]
}
if (is.null(options$reference)) {
reference.condition <- sort(unique(conditions[take]))[1]
} else {
reference.condition <- options$reference
}
columns <- c('gene',sampleName[take])
if (has_covariate) {
print(paste('using columns',paste(columns[2:length(columns)],collapse=','),'for conditions',paste(conditions[take],collapse=','),'and covariates',paste(covariate[take],collapse=',')))
} else {
print(paste('using columns',paste(columns[2:length(columns)],collapse=','),'for conditions',paste(conditions[take],collapse=',')))
}
all.motifs <- setdiff(unique(counts.all$motif),'tot')
all.conds <- setdiff(unique(conditions[take]),reference.condition)
counts.tot <- data.table(na.omit(counts.all[counts.all$motif=='tot',columns]))
counts.tot.df <- as.data.frame.matrix(counts.tot[,2:length(counts.tot),with=FALSE],row.names=as.vector(counts.tot$gene))
dds.tot <- DESeqDataSetFromMatrix(counts.tot.df,colData,~condition)
sf.tot <- estimateSizeFactorsForMatrix(counts(dds.tot))
res <- list()
for (mot in all.motifs) {
print (paste0('deseq2 (',options$test,') for ',mot))
counts.motif <- data.table(na.omit(counts.all[counts.all$motif==mot,columns]))
if (sum(rowSums(counts.motif[,2:length(counts.motif)]) > 0) < options$min_genes) {
print(paste0('skipping ',mot))
next
}
counts.here <- merge(counts.tot,counts.motif,by='gene',all.y=TRUE,suffixes=c('_tot',paste0('_',mot)))
counts.here.df <- as.data.frame.matrix(counts.here[,2:length(counts.here),with=FALSE],row.names=as.vector(counts.here$gene))
sampleName <- colnames(counts.here)[2:length(colnames(counts.here))]
motifs <- c(rep("tot",sum(take)),rep(mot,sum(take)))
# EDIT COLDATA AND DESIGN IF MORE COVARIATES ARE AVAILABLE
colData <- data.frame(motif=motifs,condition=c(conditions[take],conditions[take]),
group=paste0(c(conditions[take],conditions[take]),motifs),
row.names=sampleName)
if (has_covariate) {
colData['covariate'] <- c(covariate[take],covariate[take])
}
dds <- DESeqDataSetFromMatrix(counts.here.df, colData, full_design)
dds$motif <- factor(dds$motif, levels=c('tot',mot))
dds$condition <- factor(dds$condition, levels=c(reference.condition,all.conds))
# filter out genes with zero counts at this motif
dds <- dds[rowSums(counts(dds)[,dds$motif==mot]) > 0,]
sizeFactors(dds) <- c(sf.tot,sf.tot)
tryCatch({
if (options$test=="Wald") {
dds <- DESeq(dds, test='Wald')
} else {
dds <- DESeq(dds, test='LRT', reduced=reduced_design)
}
for (cond in all.conds) {
print (paste0('results for ',cond,' vs ',reference.condition))
if (options$test=="Wald") {
# use contrast: compare count ratio motif/tot at cond to ratio motif/tot at reference.condition
# this works for DESeq2 v1.14.1
# contrast <- list(c(paste0('group',cond,gsub('+','.',mot,fixed=TRUE)),
# paste0('group',reference.condition,'tot')),
# c(paste0('group',reference.condition,gsub('+','.',mot,fixed=TRUE)),
# paste0('group',cond,'tot')))
# this works for DESeq2 v1.16.1
contrast <- list(c(paste0('group_',reference.condition,'tot_vs_',reference.condition,gsub('+','.',mot,fixed=TRUE)),
paste0('group_',cond,gsub('+','.',mot,fixed=TRUE),'_vs_',reference.condition,gsub('+','.',mot,fixed=TRUE))),
c(paste0('group_',cond,'tot_vs_',reference.condition,gsub('+','.',mot,fixed=TRUE))))
tmp <- as.data.frame(results(dds,contrast=contrast,alpha=options$alpha))
} else {
# use interaction term for motif and cond
tmp <- as.data.frame(results(dds,name=paste0('condition',cond,'.motif',gsub('+','.',mot,fixed=TRUE)),alpha=options$alpha))
}
tmp <- data.table(gene=row.names(tmp),tmp)
setkey(tmp,'gene')
colnames(tmp) <- c('gene',paste(mot,cond,colnames(tmp)[2:length(tmp)],sep='.'))
res[[paste0(mot,'.',cond)]] <- tmp
}
}, error=function(err) {
print(paste0('skipping ',mot,': ',err))
})
}
print('combining results')
res <- as.data.frame(Reduce(function(...) merge(...,all=T),res))
print('fixing row names')
row.names(res) <- res$gene
print('selecting columns')
res <- res[,2:length(res)]
print(paste('writing combined results to',options$outf))
if (grep("*.gz$",options$outf)) {
write.csv(res,file=gzfile(options$outf))
} else {
write.csv(res,file=options$outf)
}