diff --git a/.gitignore b/.gitignore index 607f2ff..086b3a0 100644 --- a/.gitignore +++ b/.gitignore @@ -4,10 +4,12 @@ .Ruserdata *.Rproj test/* +testthat/*/*/*/*/*/*html +testthat/*/*/*/*/*/*csv inst/*/*/*/*/*/*html inst/*/*/*/*/*/*csv inst/doc docs /doc/ /Meta/ -.DS* \ No newline at end of file +.DS* diff --git a/DESCRIPTION b/DESCRIPTION index cbc3225..40012fd 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: bcbioR Type: Package Title: Templates and functions to guide downstream analysis and data interpretation -Version: 0.1.2 +Version: 0.1.3 Authors@R: person("Pantano", "Lorena", , "lorena.pantano@gmail.com", role = c("aut", "cre")) Description: Collaborative code repository at the Harvard Chan Bioinformatics Core. diff --git a/NEWS.md b/NEWS.md index c49ffc1..a45deb7 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,6 @@ -# bcbioR 0.1.2 +# bcbioR 0.1.3 +* Add combatseq * Add draft templates for TEAseq, COSMX * Adapt templates to nf-core rnaseq * Fix when sample start by number diff --git a/inst/rmarkdown/templates/common/skeleton/README.md b/inst/rmarkdown/templates/common/skeleton/README.md index 3f00ca3..6218884 100644 --- a/inst/rmarkdown/templates/common/skeleton/README.md +++ b/inst/rmarkdown/templates/common/skeleton/README.md @@ -2,10 +2,10 @@ ## Set up work-space -- [ ] Replace the Title in this file matching the title projects +- [ ] Replace the title in this file to match the project's title - [ ] Modify `information.R` with the right text for this project, it can be used to source in other `Rmd` files. The main `Rmd` file in this directory can be used to show general information of the project if needed. -- [ ] Use the same parent folder name to create a folder in *Dropbox*, and *GitHub* repo -- [ ] use the function `bcbio_templates` to create templates inside `reports` for each type of analysis. For instance, for *RNAseq*: +- [ ] Use the same project name to create a folder in *Dropbox* and a repo in *GitHub* +- [ ] Use the function `bcbio_templates` to create templates inside `reports` for each type of analysis. For instance, for *RNAseq*: - `bcbio_templates(type="rnaseq", outpath="reports")` or - `bcbio_templates(type="rnaseq", outpath="reports/experiment1")` - Then go to that folder and read the `README.md` @@ -30,9 +30,10 @@ ## GitHub -- [ ] Track in *GitHub* this `README` file -- [ ] Track in *GitHub* files in `scripts`, `meta`, and `reports` that belongs to these type: +- [ ] Track in *Git* this `README` file +- [ ] Track in *Git* files in `scripts`, `meta`, and `reports` that belongs to these type: - **Note** Git add `*.Rmd *.R *ipynb *.sh *.yaml`. (feel free use `.gitignore` if you use a GUI for non-tracked files). *DO NOT* use `git add *`. *DO NOT* track `html/csv/figures` +- [ ] Commit files and push to *Github* as necessary throughout the project, but especially when work is complete ## Dropbox diff --git a/inst/rmarkdown/templates/rnaseq/skeleton/DE/DEG.Rmd b/inst/rmarkdown/templates/rnaseq/skeleton/DE/DEG.Rmd index 8139da5..107198c 100644 --- a/inst/rmarkdown/templates/rnaseq/skeleton/DE/DEG.Rmd +++ b/inst/rmarkdown/templates/rnaseq/skeleton/DE/DEG.Rmd @@ -17,18 +17,21 @@ output: editor_options: chunk_output_type: console params: + # Put hg38, mm10, mm39, or other + + ## Combatseq and ruv can both be false or ONLY ONE can be true + ## Both cannot be true numerator: tumor denominator: normal column: sample_type subset_column: null subset_value: null - # Put hg38, mm10, mm39, or other genome: hg38 ruv: false - params_file: params_de.R + combatseq: false + params_file: params_de-example.R project_file: ../information.R functions_file: load_data.R - --- ```{r load_params, cache = FALSE, message = FALSE, warning=FALSE} @@ -38,7 +41,7 @@ source(params$params_file) source(params$project_file) # 3. Load custom functions to load data from coldata/metrics/counts source(params$functions_file) -# IMPORTANT set these values if you are not using the parameters at the top +# IMPORTANT set these values if you are not using the parameters in the header (lines 22-31) genome=params$genome column=params$column numerator=params$numerator @@ -46,6 +49,8 @@ denominator=params$denominator subset_column=params$subset_column subset_value=params$subset_value run_ruv=params$ruv +run_combatseq=params$combatseq +factor_of_interest <- column ``` @@ -54,7 +59,6 @@ library(rtracklayer) library(DESeq2) library(tidyverse) library(stringr) -library(RUVSeq) library(DEGreport) library(ggpubr) library(msigdbr) @@ -67,6 +71,9 @@ library(ggprism) library(viridis) library(pheatmap) library(janitor) +library(ggforce) +library(vegan) + colors=cb_friendly_cols(1:15) ggplot2::theme_set(theme_prism(base_size = 14)) opts_chunk[["set"]]( @@ -86,6 +93,15 @@ opts_chunk[["set"]]( set.seed(1234567890L) ``` +```{r sanitize_datatable} +sanitize_datatable = function(df, ...) { + # remove dashes which cause wrapping + DT::datatable(df, ..., rownames=gsub("-", "_", rownames(df)), + colnames=gsub("-", "_", colnames(df))) +} +``` + + ```{r load_data, message=F, warning=F} # This code will load from bcbio or nf-core folder @@ -108,13 +124,7 @@ coldata = coldata[rownames(metrics),] stopifnot(all(names(counts) == rownames(metrics))) ``` -```{r sanitize_datatable} -sanitize_datatable = function(df, ...) { - # remove dashes which cause wrapping - DT::datatable(df, ..., rownames=gsub("-", "_", rownames(df)), - colnames=gsub("-", "_", colnames(df))) -} -``` + # Overview @@ -163,8 +173,59 @@ vsd_before <- vst(dds_to_use) norm_matrix = assay(vsd_before) ``` + +# PCA and group level variance. + +**Principal Component Analysis (PCA) is a statistical technique used to simplify high-dimensional data by identifying patterns and reducing the number of variables. In the context of gene expression, PCA helps analyze large datasets containing information about the expression levels of thousands of genes across different samples (e.g., tissues, cells).** + +Dispersion estimates are a key part of the DESEQ2 analysis. DESEQ2 uses data from all samples and all genes to generate a relationship between level expression and variance and then shrinks per gene dispersions to match this distribution. If one group has higher variance than all others this will affect the dispersion estimates. Here we visually check that the variance per group is similar using a PCA. The ellipses are minimal volume enclosing ellipses using the Khachiyan algorithm. + +**It is best practice NOT to subset your data unless one group has significantly higher variance than the others. The best dispersion estimates are obtained with more data.** + +**This code automatically uses the column value from the header. You can also manually add a factor of interest to define the groups. One can be created by combining multiple metadata columns using the paste0 function.** + +```{r set group, eval=FALSE, echo=FALSE} +## Example of creating a group covariate + +meta$group <- paste0(meta$sex,"_", meta$age,"_",meta$treatment) + +factor_of_interest <- "insert column name for covariate of interest" +``` + + +```{r PCA} +pca <- degPCA(norm_matrix, metrics, + condition = factor_of_interest, name = "sample", data = T) + +pca$plot + ggtitle(paste0("All samples", "\nPCA using ", nrow(vsd_before), " genes")) + + theme(plot.title=element_text(hjust=0.5)) + + geom_mark_ellipse(aes(color = sample_type)) + scale_color_cb_friendly() +``` + +## PERMDISP + +Groups in a univariate analysis can also differ with regard to their mean values, variation around those means, or both. In univariate analyses, dispersion can be examined using Levene’s test. PERMDISP is a multivariate extension of Levene’s test to examine whether groups differ in variability. In essence, PERMDISP involves calculating the distance from each data point to its group centroid and then testing whether those distances differ among the groups. [Source](https://uw.pressbooks.pub/appliedmultivariatestatistics/chapter/permdisp/) + +Here we apply this test to our variance stabilized data. We calculate distances between samples and then use the `betadisper()` function from the popular vegan package. We get two overall p-values where significant means that the dispersions are different between groups. The first p-value comes from the `anova()` function and the second from the `permutest()` function. We also get pairwise p-values for every group-group comparison. + +```{r PERMDISP} +vare.disa <- vegdist(t(assay(vsd_before))) + +mod = betadisper(vare.disa, metrics[[factor_of_interest]]) +anova(mod) +permutest(mod, pairwise = TRUE) + +``` + + + # Covariate analysis +Multiple factors related to the experimental design or quality of sequencing may influence the outcomes of a given RNA-seq experiment. To further determine whether any confounding covariate risks affecting the results of our differential expression analyses, it is useful to assess the correlation between covariates and principal component (PC) values. + +Here, we are using `DEGreport::degCovariates()` to explore potential correlations between variables provided in the metadata and all PCs that account for at least 5% of the variability in the data. If applicable, significant correlations (FDR < 0.1) are circled. **This diagnostic plot helps us determine which variables we may need to add to our DE model.** + + ```{r covariates, fig.height = 6, fig.width = 10} degCovariates( norm_matrix, @@ -172,15 +233,7 @@ degCovariates( ) ``` -# PCA analysis -```{r before_RUV} - -pca1 <- degPCA(norm_matrix, colData(dds_to_use), - condition = column) + ggtitle('PCA') -pca1 + scale_color_cb_friendly() - -``` ```{r init_DESEQ} formula <- as.formula(paste0("~ ", " + ", column)) @@ -194,18 +247,31 @@ norm_matrix = assay(vsd_before) new_cdata <- coldata ``` -```{r, eval=run_ruv, results='asis', echo=FALSE} + +```{r, eval=F, echo=FALSE} +#### IF YOU ARE RUNNING RUV OR COMBATSEQ RUN THE CHUNKS BELOW OTHERWISE SKIP TO Differential Expression SECTION + +### RUV - LINES 261-296 +### COMBATSEQ - LINES 303-369 +``` + + + +```{r, eval=run_ruv, results='asis', echo=run_ruv} cat("# Remove Unwanted Variability When performing differential expression analysis, it is important to ensure that any detected differences are truly a result of the experimental comparison being made and not any additional variability in the data.") - ``` -```{r do_RUV, eval=run_ruv} +```{r do_RUV, eval=run_ruv, echo=run_ruv} +library(RUVSeq) + # If you want to skip the code, just set up formula to be your model in the next chunk of code design <- coldata[[column]] diffs <- makeGroups(design) dat <- norm_matrix +# by default is running one variable, +# change K parameter to other number to find more unknown covariates ruvset <- RUVs(dat, cIdx=rownames(dat), k=1, diffs, isLog = T, round = F) vars <- ruvset$W @@ -231,6 +297,73 @@ vsd_to_use<- vst(dds_to_use, blind=FALSE) ``` +```{r combat-text , eval=run_combatseq, results='asis', echo=run_combatseq} +library(sva) + +cat("# Remove Batch Effects + +Here we apply Combat-seq (https://github.com/zhangyuqing/ComBat-seq) to try to remove batch effects so we can better tease out the effects of interest. + +Combat-seq uses a negative binomial regression to model batch effects, providing adjusted data by mapping the original data to an expected distribution if there were no batch effects. The adjusted data preserves the integer nature of counts, so that it is compatible with the assumptions of state-of-the-art differential expression software (e.g. edgeR, DESeq2, which specifically request untransformed count data).") + +``` + + +```{r set_variable_combatseq, eval=run_combatseq, echo=run_combatseq} + +## FILL OUT THIS CHUNK OF CODE IF YOU WANT TO RUN COMBATSEQ + +## Set your batch effect variable here this is the variable that combatseq will try to remove + +## Column name of your batch variable +to_remove = "batch" + +## Column name of of your variable(s) of interest + +to_keep = "sample_type" + + +coldata[[to_remove]] <- as.factor(coldata[[to_remove]]) +coldata[[to_keep]] <- as.factor(coldata[[to_keep]]) + + +batch = coldata[[to_remove]] +treatment = coldata[[to_keep]] + +## If you have multiple variables of interest you will need to cbind them into one variable + +#treatment1 = metrics[[to_keep]] +#treatment2 = metrics[[to_keep]] +#treatment3 = metrics[[to_keep]] + + +# imp = cbind(as.numeric(as.character(treatment1)),as.numeric(as.character(treatment2)), as.numeric(as.character(treatment3))) + +``` + + +```{r do_combatseq, eval=run_combatseq} +adjusted_counts <- ComBat_seq(as.matrix(counts), batch=batch, group = treatment) + +## For multiple variables of interest + +# adjusted_counts <- ComBat_seq(as.matrix(counts2), batch=batch, covar_mod = imp) + +``` + +```{r after_combatseq, eval=run_combatseq} +# NOTE: Make sure the formula doens't contain the covariates used in combatseq above +dds_to_use <- DESeqDataSetFromMatrix(adjusted_counts, coldata, design = formula) +vsd_combat<- vst(dds_to_use, blind=FALSE) + +combat_matrix = assay(vsd_combat) + +pca_combat <- degPCA(combat_matrix, coldata, + condition = column) + ggtitle('After Combatseq') +pca_combat + scale_color_cb_friendly() + +``` + # Differential Expression @@ -282,7 +415,7 @@ This volcano plot shows the genes that are significantly up- and down-regulated # degVolcano(res_mod[,c('lfc', 'padj')], plot_text = show) EnhancedVolcano(res_mod, lab= res_mod$gene_name, - pCutoff = 1.345719e-03, + pCutoff = 0.05, selectLab = c(res_sig$gene_name[1:15]), FCcutoff = 0.5, x = 'lfc', @@ -290,8 +423,7 @@ EnhancedVolcano(res_mod, title="Volcano Tumor vs. Normal", col=as.vector(colors[c("dark_grey", "light_blue", "purple", "purple")]), - subtitle = "", xlim=c(-5,5)) - + subtitle = "", drawConnectors = T, max.overlaps = Inf) ``` ## Heatmap @@ -428,6 +560,7 @@ write_csv(counts_norm, name_expression_fn) write_csv(res_for_writing, name_deg_fn) write_csv(pathways_for_writing, name_pathways_fn) ``` + # R session List and version of tools used for the DE report generation. diff --git a/inst/rmarkdown/templates/rnaseq/skeleton/QC/params_qc_nf-core.R b/inst/rmarkdown/templates/rnaseq/skeleton/QC/params_qc_nf-core.R index fd75d18..08b3ec0 100644 --- a/inst/rmarkdown/templates/rnaseq/skeleton/QC/params_qc_nf-core.R +++ b/inst/rmarkdown/templates/rnaseq/skeleton/QC/params_qc_nf-core.R @@ -6,6 +6,6 @@ metadata_fn='/Path/to/metadata/meta.csv' # This file is inside star_salmon/ folder se_object='/path/to/nf-core/output/star_salmon/salmon.merged.gene_counts.rds' # This folder called "multiqc_report_data" is inside the output directory star_salmon inside multiqc folder -multiqc_data_dir='/path/to/nf-core/output/star_salmon/multiqc_report_data' +multiqc_data_dir='/path/to/nf-core/output/multiqc/star_salmon/multiqc_report_data' # This file is inside the genome folder in the output directory, use this only for non-model organism # gtf_fn='/path/to/nf-core/output/genome/hg38.filtered.gtf' diff --git a/tests/testthat/rnaseq.R b/tests/testthat/rnaseq.R index 9a683ac..e30ebea 100644 --- a/tests/testthat/rnaseq.R +++ b/tests/testthat/rnaseq.R @@ -21,7 +21,7 @@ test_that("rnaseq testing", { subset_value = subset_value, numerator = numerator, denominator = denominator, - params_file = file.path(path,'DE/params_de.R'), + params_file = file.path(path,'DE/params_de-example.R'), project_file = file.path(path,'information.R'), functions_file = file.path(path,'DE/load_data.R') )