Merge pull request #31 from bcbio/hot_fix_release

demo ready fixes
bcbio · Jun 18, 2024 · d9df054 · d9df054
2 parents 298e52f + d946b0c
commit d9df054
Show file tree

Hide file tree

Showing 7 changed files with 173 additions and 36 deletions.
diff --git a/.gitignore b/.gitignore
@@ -4,10 +4,12 @@
 .Ruserdata
 *.Rproj
 test/*
+testthat/*/*/*/*/*/*html
+testthat/*/*/*/*/*/*csv
 inst/*/*/*/*/*/*html
 inst/*/*/*/*/*/*csv
 inst/doc
 docs
 /doc/
 /Meta/
-.DS*
+.DS*
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: bcbioR
 Type: Package
 Title: Templates and functions to guide downstream analysis and data interpretation
-Version: 0.1.2
+Version: 0.1.3
 Authors@R: person("Pantano", "Lorena", , "[email protected]",
             role = c("aut", "cre"))
 Description: Collaborative code repository at the Harvard Chan Bioinformatics Core.

diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,6 @@
-# bcbioR 0.1.2
+# bcbioR 0.1.3
 
+* Add combatseq
 * Add draft templates for TEAseq, COSMX
 * Adapt templates to nf-core rnaseq
 * Fix when sample start by number

diff --git a/inst/rmarkdown/templates/common/skeleton/README.md b/inst/rmarkdown/templates/common/skeleton/README.md
@@ -2,10 +2,10 @@
 
 ## Set up work-space
 
--   [ ] Replace the Title in this file matching the title projects
+-   [ ] Replace the title in this file to match the project's title
 -   [ ] Modify `information.R` with the right text for this project, it can be used to source in other `Rmd` files. The main `Rmd` file in this directory can be used to show general information of the project if needed.
--   [ ] Use the same parent folder name to create a folder in *Dropbox*, and *GitHub* repo
--   [ ] use the function `bcbio_templates` to create templates inside `reports` for each type of analysis. For instance, for *RNAseq*:
+-   [ ] Use the same project name to create a folder in *Dropbox* and a repo in *GitHub*
+-   [ ] Use the function `bcbio_templates` to create templates inside `reports` for each type of analysis. For instance, for *RNAseq*:
     -   `bcbio_templates(type="rnaseq", outpath="reports")` or
     -   `bcbio_templates(type="rnaseq", outpath="reports/experiment1")`
     -   Then go to that folder and read the `README.md`
@@ -30,9 +30,10 @@
 
 ## GitHub
 
--   [ ] Track in *GitHub* this `README` file
--   [ ] Track in *GitHub* files in `scripts`, `meta`, and `reports` that belongs to these type:
+-   [ ] Track in *Git* this `README` file
+-   [ ] Track in *Git* files in `scripts`, `meta`, and `reports` that belongs to these type:
     -   **Note** Git add `*.Rmd *.R *ipynb *.sh *.yaml`. (feel free use `.gitignore` if you use a GUI for non-tracked files). *DO NOT* use `git add *`. *DO NOT* track `html/csv/figures`
+-   [ ] Commit files and push to *Github* as necessary throughout the project, but especially when work is complete
 
 ## Dropbox
 

diff --git a/inst/rmarkdown/templates/rnaseq/skeleton/DE/DEG.Rmd b/inst/rmarkdown/templates/rnaseq/skeleton/DE/DEG.Rmd
@@ -17,18 +17,21 @@ output:
 editor_options: 
   chunk_output_type: console
 params:
+  # Put hg38, mm10, mm39, or other
+
+  ## Combatseq and ruv can both be false or ONLY ONE can be true
+  ## Both cannot be true
   numerator: tumor
   denominator: normal
   column: sample_type
   subset_column: null
   subset_value: null
-  # Put hg38, mm10, mm39, or other
   genome: hg38
   ruv: false
-  params_file: params_de.R
+  combatseq: false
+  params_file: params_de-example.R
   project_file: ../information.R
   functions_file: load_data.R
-
 ---
 
 ```{r load_params, cache = FALSE, message = FALSE, warning=FALSE}
@@ -38,14 +41,16 @@ source(params$params_file)
 source(params$project_file)
 # 3. Load custom functions to load data from coldata/metrics/counts
 source(params$functions_file)
-# IMPORTANT set these values if you are not using the parameters at the top
+# IMPORTANT set these values if you are not using the parameters in the header (lines 22-31)
 genome=params$genome
 column=params$column
 numerator=params$numerator
 denominator=params$denominator
 subset_column=params$subset_column
 subset_value=params$subset_value
 run_ruv=params$ruv
+run_combatseq=params$combatseq
+factor_of_interest <- column
 ```
 
 
@@ -54,7 +59,6 @@ library(rtracklayer)
 library(DESeq2)
 library(tidyverse)
 library(stringr)
-library(RUVSeq)
 library(DEGreport)
 library(ggpubr)
 library(msigdbr)
@@ -67,6 +71,9 @@ library(ggprism)
 library(viridis)
 library(pheatmap)
 library(janitor)
+library(ggforce)
+library(vegan)
+
 colors=cb_friendly_cols(1:15)
 ggplot2::theme_set(theme_prism(base_size = 14))
 opts_chunk[["set"]](
@@ -86,6 +93,15 @@ opts_chunk[["set"]](
 set.seed(1234567890L)
 ```
 
+```{r sanitize_datatable}
+sanitize_datatable = function(df, ...) {
+ # remove dashes which cause wrapping
+ DT::datatable(df, ..., rownames=gsub("-", "_", rownames(df)),
+                   colnames=gsub("-", "_", colnames(df)))
+}
+```
+
+
 
 ```{r load_data, message=F, warning=F}
 # This code will load from bcbio or nf-core folder
@@ -108,13 +124,7 @@ coldata = coldata[rownames(metrics),]
 stopifnot(all(names(counts) == rownames(metrics)))
 ```
 
-```{r sanitize_datatable}
-sanitize_datatable = function(df, ...) {
- # remove dashes which cause wrapping
- DT::datatable(df, ..., rownames=gsub("-", "_", rownames(df)),
-                   colnames=gsub("-", "_", colnames(df)))
-}
-```
+
 
 # Overview
 
@@ -163,24 +173,67 @@ vsd_before <- vst(dds_to_use)
 norm_matrix = assay(vsd_before)
 ```
 
+
+# PCA and group level variance.
+
+**Principal Component Analysis (PCA) is a statistical technique used to simplify high-dimensional data by identifying patterns and reducing the number of variables. In the context of gene expression, PCA helps analyze large datasets containing information about the expression levels of thousands of genes across different samples (e.g., tissues, cells).**
+
+Dispersion estimates are a key part of the DESEQ2 analysis. DESEQ2 uses data from all samples and all genes to generate a relationship between level expression and variance and then shrinks per gene dispersions to match this distribution. If one group has higher variance than all others this will affect the dispersion estimates. Here we visually check that the variance per group is similar using a PCA. The ellipses are minimal volume enclosing ellipses using the Khachiyan algorithm.
+
+**It is best practice NOT to subset your data unless one group has significantly higher variance than the others. The best dispersion estimates are obtained with more data.**
+
+**This code automatically uses the column value from the header. You can also manually add a factor of interest to define the groups. One can be created by combining multiple metadata columns using the paste0 function.**
+
+```{r set group, eval=FALSE, echo=FALSE}
+## Example of creating a group covariate
+
+meta$group <- paste0(meta$sex,"_", meta$age,"_",meta$treatment)
+
+factor_of_interest <- "insert column name for covariate of interest"
+```
+
+
+```{r PCA}
+pca <- degPCA(norm_matrix, metrics,
+              condition = factor_of_interest, name = "sample", data = T)
+
+pca$plot + ggtitle(paste0("All samples", "\nPCA using ", nrow(vsd_before), " genes")) +
+  theme(plot.title=element_text(hjust=0.5)) +
+  geom_mark_ellipse(aes(color = sample_type)) +  scale_color_cb_friendly()
+```
+
+## PERMDISP
+
+Groups in a univariate analysis can also differ with regard to their mean values, variation around those means, or both. In univariate analyses, dispersion can be examined using Levene’s test.  PERMDISP is a multivariate extension of Levene’s test to examine whether groups differ in variability. In essence, PERMDISP involves calculating the distance from each data point to its group centroid and then testing whether those distances differ among the groups. [Source](https://uw.pressbooks.pub/appliedmultivariatestatistics/chapter/permdisp/)
+
+Here we apply this test to our variance stabilized data. We calculate distances between samples and then use the `betadisper()` function from the popular vegan package. We get two overall p-values where significant means that the dispersions are different between groups. The first p-value comes from the `anova()` function and the second from the `permutest()` function. We also get pairwise p-values for every group-group comparison.
+
+```{r PERMDISP}
+vare.disa <- vegdist(t(assay(vsd_before)))
+
+mod = betadisper(vare.disa, metrics[[factor_of_interest]])
+anova(mod)
+permutest(mod, pairwise = TRUE)
+
+```
+
+
+
 # Covariate analysis
 
+Multiple factors related to the experimental design or quality of sequencing may influence the outcomes of a given RNA-seq experiment. To further determine whether any confounding covariate risks affecting the results of our differential expression analyses, it is useful to assess the correlation between covariates and principal component (PC) values.
+
+Here, we are using `DEGreport::degCovariates()` to explore potential correlations between variables provided in the metadata and all PCs that account for at least 5% of the variability in the data. If applicable, significant correlations (FDR < 0.1) are circled. **This diagnostic plot helps us determine which variables we may need to add to our DE model.**
+
+
 ```{r covariates, fig.height = 6, fig.width = 10}
 degCovariates(
   norm_matrix,
   metrics,
 )
 ```
 
-# PCA analysis
 
-```{r before_RUV} 
-
-pca1 <- degPCA(norm_matrix, colData(dds_to_use), 
-       condition = column) + ggtitle('PCA')
-pca1 + scale_color_cb_friendly()
-
-```
 
 ```{r init_DESEQ}
 formula <- as.formula(paste0("~ ", " + ", column))
@@ -194,18 +247,31 @@ norm_matrix = assay(vsd_before)
 new_cdata <- coldata
 ```
 
-```{r, eval=run_ruv, results='asis', echo=FALSE}
+
+```{r, eval=F, echo=FALSE}
+#### IF YOU ARE RUNNING RUV OR COMBATSEQ RUN THE CHUNKS BELOW OTHERWISE SKIP TO Differential Expression SECTION
+
+### RUV - LINES 261-296
+### COMBATSEQ - LINES 303-369
+```
+
+
+
+```{r, eval=run_ruv, results='asis', echo=run_ruv}
 cat("# Remove Unwanted Variability
 
 When performing differential expression analysis, it is important to ensure that any detected differences are truly a result of the experimental comparison being made and not any additional variability in the data.")
-
 ```
 
-```{r do_RUV, eval=run_ruv}
+```{r do_RUV, eval=run_ruv, echo=run_ruv}
+library(RUVSeq)
+
 # If you want to skip the code, just set up formula to be your model in the next chunk of code
 design <- coldata[[column]]
 diffs <- makeGroups(design)
 dat <- norm_matrix
+# by default is running one variable, 
+# change K parameter to other number to find more unknown covariates
 ruvset <- RUVs(dat, cIdx=rownames(dat), k=1, diffs, isLog = T, round = F)
 vars <- ruvset$W
 
@@ -231,6 +297,73 @@ vsd_to_use<- vst(dds_to_use, blind=FALSE)
 
 ```
 
+```{r combat-text , eval=run_combatseq, results='asis', echo=run_combatseq}
+library(sva)
+
+cat("# Remove Batch Effects
+
+Here we apply Combat-seq (https://github.com/zhangyuqing/ComBat-seq) to try to remove batch effects so we can better tease out the effects of interest. 
+
+Combat-seq uses a negative binomial regression to model batch effects, providing adjusted data by mapping the original data to an expected distribution if there were no batch effects. The adjusted data preserves the integer nature of counts, so that it is compatible with the assumptions of state-of-the-art differential expression software (e.g. edgeR, DESeq2, which specifically request untransformed count data).")
+
+```
+
+
+```{r set_variable_combatseq, eval=run_combatseq, echo=run_combatseq}
+
+## FILL OUT THIS CHUNK OF CODE IF YOU WANT TO RUN COMBATSEQ
+
+## Set your batch effect variable here this is the variable that combatseq will try to remove
+
+## Column name of your batch variable
+to_remove = "batch"
+
+## Column name of of your variable(s) of interest
+
+to_keep = "sample_type"
+
+
+coldata[[to_remove]] <- as.factor(coldata[[to_remove]])
+coldata[[to_keep]] <- as.factor(coldata[[to_keep]])
+
+
+batch = coldata[[to_remove]]
+treatment = coldata[[to_keep]]
+
+## If you have multiple variables of interest you will need to cbind them into one variable
+
+#treatment1 = metrics[[to_keep]]
+#treatment2 = metrics[[to_keep]]
+#treatment3 = metrics[[to_keep]]
+
+
+# imp = cbind(as.numeric(as.character(treatment1)),as.numeric(as.character(treatment2)), as.numeric(as.character(treatment3)))
+
+```
+
+
+```{r do_combatseq, eval=run_combatseq}
+adjusted_counts <- ComBat_seq(as.matrix(counts), batch=batch, group = treatment)
+
+## For multiple variables of interest
+
+# adjusted_counts <- ComBat_seq(as.matrix(counts2), batch=batch, covar_mod = imp)
+
+```
+
+```{r after_combatseq, eval=run_combatseq}
+# NOTE: Make sure the formula doens't contain the covariates used in combatseq above
+dds_to_use <- DESeqDataSetFromMatrix(adjusted_counts, coldata, design = formula)
+vsd_combat<- vst(dds_to_use, blind=FALSE)
+
+combat_matrix = assay(vsd_combat)
+
+pca_combat <- degPCA(combat_matrix, coldata, 
+       condition = column) + ggtitle('After Combatseq')
+pca_combat + scale_color_cb_friendly()
+
+```
+
 
 # Differential Expression
 
@@ -282,16 +415,15 @@ This volcano plot shows the genes that are significantly up- and down-regulated
 # degVolcano(res_mod[,c('lfc', 'padj')], plot_text = show) 
 EnhancedVolcano(res_mod,
                 lab= res_mod$gene_name, 
-                pCutoff = 1.345719e-03, 
+                pCutoff = 0.05, 
                 selectLab = c(res_sig$gene_name[1:15]),
                 FCcutoff = 0.5,
                 x = 'lfc',
                 y = 'padj', 
                 title="Volcano Tumor vs. Normal",
                 col=as.vector(colors[c("dark_grey", "light_blue",
                                          "purple", "purple")]),
-                subtitle = "", xlim=c(-5,5)) 
-
+                subtitle = "", drawConnectors = T,  max.overlaps = Inf) 
 ```
 
 ## Heatmap
@@ -428,6 +560,7 @@ write_csv(counts_norm, name_expression_fn)
 write_csv(res_for_writing, name_deg_fn)
 write_csv(pathways_for_writing, name_pathways_fn)
 ```
+
 # R session
 
 List and version of tools used for the DE report generation.

diff --git a/inst/rmarkdown/templates/rnaseq/skeleton/QC/params_qc_nf-core.R b/inst/rmarkdown/templates/rnaseq/skeleton/QC/params_qc_nf-core.R
@@ -6,6 +6,6 @@ metadata_fn='/Path/to/metadata/meta.csv'
 # This file is inside star_salmon/ folder
 se_object='/path/to/nf-core/output/star_salmon/salmon.merged.gene_counts.rds'
 # This folder called "multiqc_report_data" is inside the output directory star_salmon inside multiqc folder
-multiqc_data_dir='/path/to/nf-core/output/star_salmon/multiqc_report_data'
+multiqc_data_dir='/path/to/nf-core/output/multiqc/star_salmon/multiqc_report_data'
 # This file is inside the genome folder in the output directory, use this only for non-model organism
 # gtf_fn='/path/to/nf-core/output/genome/hg38.filtered.gtf'
diff --git a/tests/testthat/rnaseq.R b/tests/testthat/rnaseq.R
@@ -21,7 +21,7 @@ test_that("rnaseq testing", {
                       subset_value = subset_value,
                       numerator = numerator,
                       denominator = denominator,
-                      params_file = file.path(path,'DE/params_de.R'),
+                      params_file = file.path(path,'DE/params_de-example.R'),
                       project_file = file.path(path,'information.R'),
                       functions_file = file.path(path,'DE/load_data.R')
                     )