Multiple pairwise comparisons with batch effect

Create a sample annotation table similar to this one (see also sampleTab, note that the row names must correspond to the column names in the count table):

	CONDITION	BATCH
liver_A	liver	B1
liver_B	liver	B2
liver_C	liver	B3
spleen_A	spleen	B1
spleen_B	spleen	B2
spleen_C	spleen	B3
blood_A	blood	B1
blood_B	blood	B2
blood_C	blood	B3
kidney_A	kidney	B1
kidney_B	kidney	B2
kidney_C	kidney	B3

# Excel is quite RAM hungry - on low RAM machines use CSV files instead or set ensemble <- NA
options(java.parameters = "-Xmx8g")
library("biomaRt")
library("edgeR")
library("DESeq2")
library("limma")
library("XLConnect")
library("RNAseqWrapper")

# choose a working directory
rDir <- "/path/to/your/working/directory"

# if possible, use biomaRt to add the gene description in the DE-tables
# see the biomart manual for details. At the time I wrote this, biomart.org
# was not available and I therefore used the DB hosted at ensembl.org.

# Choose a biomart database (only if you were using
# the corresponding reference genome and annotation).
# To display available marts and datasets for animals
# and plants (there may also be other hosts):
listMarts(host = "www.ensembl.org")     # animals
listMarts(host = "plants.ensembl.org")  # plants

# Connect to a database and check whether there
# is a dataset for -e.g.- Arabidopsis available:
ensembl <- useMart("plants_mart", host = "plants.ensembl.org")
ensemblDatasets <- listDatasets(ensembl)
ensemblDatasets[grep("Arabidopsis", ensemblDatasets$description),]

# Finally connect to the database for A. thaliana:
ensembl <- useDataset("athaliana_eg_gene", mart = ensembl)

# Set the class of the biomart DB to "ensembl".
# Note that this is a simple error-workaround
# which is only necessary for some hosts.
# I'm not sure if it is still necessary.
biomaRt:::martBM(ensembl) <- "ensembl"

# if not, set the ensembl variable to NA
ensembl <- NA

# load the table with the sample annotation
sampleTab <- read.csv(file.path(rDir, "sampleTab.csv"), row.names = 1, stringsAsFactors = FALSE)

# read the different data sets (either .csv or .txt)
myData <- read.csv(file.path(rDir, "myCountTable.csv"), row.names = 1)
myData <- read.table(file.path(rDir, "myCountTable.txt"), sep = '\t')

# remove entries with very low values (e.g. <5 in all samples)
myData <- f.strip.data(myData, minVal = 5, minTimes = 1)

# some overview plots - set skipScatters to TRUE if there are more than 16 samples
f.do.some.overview(log2(myData+1), rDir, "MTGWB", skipScatters = FALSE)

# test for differential expression between all possible 
# combinations of CONDITION. To run it with all methods
# at once - there are two ways to nest the DE-methods
# and all the comparisons:
# To get an Excel workbook containing for each DE-method all
# possible pairwise combinations, use:
deResults <- f.multiple.all.pairwise.comparisons(myData, 
                                                 rownames(sampleTab), 
                                                 sampleTab$CONDITION,
                                                 sampleTab$BATCH)
# note that this can result in huge tables and too few RAM for 
# writing them to the Excel workbook.

# To get an Excel workbook containing for each pair all the
# different DE-methods, use:
deResults <- f.multiple.two.group.comparisons.pairwise(myData, 
                                                 rownames(sampleTab), 
                                                 sampleTab$CONDITION,
                                                 sampleTab$BATCH)

# To write the tables:
for (DEMorPC in names(deResults)) {
  f.write.DEGtabs.to.workbook(deResults[[DEMorPC]], rDir, paste0("MTGWB_", DEMorPC), ensembl)
  gc() # this is only necessary if you write many tables
}

# Alternatively write csv tables for each comparison and method
for (DEMorPC in names(deResults)) {
  for (PCorDEM in names(deResults[[DEMorPC]])) {
    outTab <- deResults[[DEMorPC]][[PCorDEM]]$get_table()
    write.csv(outTab, file.path(rDir, paste0(DEMorPC, "_", PCorDEM, "_MTGWB.csv")))
  }
}

#########################################################################################
# optional: get normalized data
# there are three different packages: DESeq2, edgeR or limma
# for the limma voom transformation one can in addition choose to
# first normalize the data with the method from edgeR (TMM) or with a
# quantile normalization method. There are three separate functions:
# f.normalize.counts.DESeq
# f.normalize.counts.edgeR
# f.normalize.counts.limma
# Alternatively you may run all at once:
formulaString <- "~BATCH+CONDITION"
design <- model.matrix(formula(formulaString), data = sampleTab, contrasts.arg = NULL)
myNormData <- f.all.normalizations(myData, sampleTab, formulaString,  design)
# myNormData is a list with five data frames, the names are according to the
# normalization method:
# DESeq_default
# edgeR
# limma_none
# limma_TMM
# limma_quantile

# optional: calculate the mean/median/sum/sd within each condition
byTab <- data.frame(sample = rownames(sampleTab),
                    group = sampleTab$CONDITION,
                    stringsAsFactors = FALSE)
meanTab <- f.summarize.columns(myData, byTab, mean)

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

MTGWB.md

MTGWB.md

Multiple pairwise comparisons with batch effect

Files

MTGWB.md

Latest commit

History

MTGWB.md

File metadata and controls

Multiple pairwise comparisons with batch effect