diff --git a/.Rbuildignore b/.Rbuildignore index 6e71100..73a7c9a 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -11,3 +11,4 @@ test/* inst/*/*/*/*/*/*html inst/*/*/*/*/*/*csv +^data-raw$ diff --git a/.gitignore b/.gitignore index 80e3c2f..607f2ff 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,4 @@ inst/doc docs /doc/ /Meta/ +.DS* \ No newline at end of file diff --git a/DESCRIPTION b/DESCRIPTION index ef9d990..cbc3225 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: bcbioR Type: Package Title: Templates and functions to guide downstream analysis and data interpretation -Version: 0.1.1 +Version: 0.1.2 Authors@R: person("Pantano", "Lorena", , "lorena.pantano@gmail.com", role = c("aut", "cre")) Description: Collaborative code repository at the Harvard Chan Bioinformatics Core. @@ -11,15 +11,23 @@ Encoding: UTF-8 Roxygen: list(markdown = TRUE) RoxygenNote: 7.3.1 Imports: + DESeq2, stringr, ggplot2, magrittr, hues, ggprism, grDevices, - R.utils + R.utils, + readr, + fs, + withr Suggests: knitr, - rmarkdown + rmarkdown, + testthat (>= 3.0.0) VignetteBuilder: knitr URL: http://bcb.io/bcbioR/ +Config/testthat/edition: 3 +Depends: + R (>= 2.10) diff --git a/NAMESPACE b/NAMESPACE index 4cf6271..3cc700a 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,5 +1,6 @@ # Generated by roxygen2: do not edit by hand +export(bcbio_nfcore_check) export(bcbio_set_project) export(bcbio_templates) export(cb_friendly_cols) @@ -7,10 +8,12 @@ export(cb_friendly_pal) export(list_cb_friendly_cols) export(scale_color_cb_friendly) export(scale_fill_cb_friendly) +import(DESeq2) import(R.utils) import(ggplot2) import(ggprism) import(hues) importFrom(grDevices,colorRampPalette) importFrom(magrittr,"%>%") +importFrom(readr,read_csv) importFrom(stringr,str_replace_all) diff --git a/NEWS.md b/NEWS.md index 7c10e9e..c49ffc1 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,15 @@ +# bcbioR 0.1.2 + +* Add draft templates for TEAseq, COSMX +* Adapt templates to nf-core rnaseq +* Fix when sample start by number +* Fix when rRNA rate is missing +* Add by sample plots in QC +* Add function to check nfcore samplesheet +* Add PCA with variance analysis +* Minor details fixed in QC RNAseq report +* Correct metric for rRNA and tRNA + # bcbioR 0.1.1 * Add page to github diff --git a/R/bcbioR-package.R b/R/bcbioR-package.R index 3f87d74..64459df 100644 --- a/R/bcbioR-package.R +++ b/R/bcbioR-package.R @@ -2,10 +2,12 @@ "_PACKAGE" ## usethis namespace: start +#' @importFrom grDevices colorRampPalette #' @importFrom magrittr %>% +#' @importFrom readr read_csv #' @importFrom stringr str_replace_all -#' @importFrom grDevices colorRampPalette ## usethis namespace: end +#' @import DESeq2 #' @import ggplot2 #' @import hues #' @import ggprism diff --git a/R/data.R b/R/data.R new file mode 100644 index 0000000..5982116 --- /dev/null +++ b/R/data.R @@ -0,0 +1,8 @@ +#' varianceStabilizingTransformation Object from Test Data +#' +#' An example of the output of varianceStabilizingTransformation to +#' show different visualization code +#' +#' @format ## `bcbio_vsd_data` +#' VSD object from DESeq2 data +"bcbio_vsd_data" diff --git a/R/hello.R b/R/hello.R index 0c8188e..aee6389 100644 --- a/R/hello.R +++ b/R/hello.R @@ -1,8 +1,34 @@ .fix <- function(x){ x <- tolower(x) %>% str_replace_all(., "[[:punct:]]", "_") + x <- str_replace_all(x, " ", "_") return(x) } + +#' Function to check samplesheet for nf-core +#' +#' @param file path to CSV file for nf-core +#' @examples +#' +#' bcbio_nfcore_check(system.file("extdata", "rnaseq_good.csv", package = "bcbioR") ) +#' +#' @export +bcbio_nfcore_check <- function(file){ + required=c("sample","fastq_1","fastq_2","strandedness") + samplesheet=read_csv(file) + + if (!(all(required %in% colnames(samplesheet)))){ + print(colnames(samplesheet)) + stop("Missing required columns ", paste(required, collapse = " ")) + }else if (any(grepl("^[1-9]", samplesheet[["sample"]]))){ + stop("Avoid samples starting with numbers ") + }else if (any(is.na(samplesheet))){ + warning("Columns with missing values") + }else{ + message("All good.") + } +} + #' Function to help deploy analysis folder inside a project folder #' #' This function contains Rmd, R, md, files that help to structure @@ -13,7 +39,11 @@ #' Normally these helper files are inside a report folder inside a #' project folder. #' -#' @param type string indicating the type of analysis, supported: rnaseq. +#' @param type string indicating the type of analysis, supported: +#' - base +#' - rnaseq, scrnaseq, +#' - teaseq +#' - cosmx #' #' @param outpath string path indicating where to copy all the files to #' @examples @@ -23,20 +53,30 @@ #' @export bcbio_templates <- function(type="rnaseq", outpath){ switch(type, + base={ + fpath <- system.file("rmarkdown/templates/common", "skeleton", package="bcbioR") + copyDirectory(fpath, outpath) + }, rnaseq={ - fpath <- system.file("rmarkdown/templates/rnaseq", "skeleton", package="bcbioR") - #file.copy(fpath, outpath, recursive = T) copyDirectory(fpath, outpath) }, scrnaseq={ - fpath <- system.file("rmarkdown/templates/singlecell", "skeleton", package="bcbioR") - #file.copy(fpath, outpath, recursive = T) + copyDirectory(fpath, outpath) + }, + teaseq={ + fpath <- system.file("rmarkdown/templates/teaseq", "skeleton", package="bcbioR") + copyDirectory(fpath, outpath) + }, + cosmx={ + fpath <- system.file("rmarkdown/templates/cosmx", "skeleton", package="bcbioR") copyDirectory(fpath, outpath) }, { - stop('project type not recognize, please choose: ', 'rnaseq', 'scrnaseq') + stop('project type not recognize, please choose: ', 'base', + 'rnaseq', 'scrnaseq', + 'teaseq', 'cosmx') } ) } diff --git a/README.Rmd b/README.Rmd deleted file mode 100644 index 8029c88..0000000 --- a/README.Rmd +++ /dev/null @@ -1,67 +0,0 @@ ---- -output: github_document ---- - -# bcbioR - - - -[![R-CMD-check](https://github.com/bcbio/bcbioR/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/bcbio/bcbioR/actions/workflows/R-CMD-check.yaml) - -The goal of `bcbioR` is to create guidelines for NGS data interpretation based on the experience of the Harvard Chan Bioinformatics Core and everybody who contributes to this package. - -## Installation - -You can install the development version of bcbioR from [GitHub](https://github.com/) with: - -``` r -# install.packages("devtools") -devtools::install_github("bcbio/bcbioR") -``` - -## Quick start - -``` r -library(bcbioR) -## basic example code -# will help you to build a folder name following HCBC naming rules -bcbio_set_project() -``` - -### Set base project - -The following code will pop up a Rmd template and then clicking 'save' will populate that folder with HCBC data structure guidelines - -``` r -rmarkdown::draft("project_folder",template="common",package="bcbioR") -``` - -As well, You can get this by going to File -\> New File -\> R markdown: - then `From Template`, and choose `bcbio base` - choose the folder to deploy files, it could be a new folder or an existing one - -### Set RNAseq report folder - -This code will populate the folder with HCBC data structure guidelines and Rmd code: - -``` r -bcbio_templates(type="rnaseq", outpath="test_folder/reports") -``` - -### Discover more... - -Go to the vignette to know more `vignette("bcbioR_quick_start,package="bcbioR")` - -## How to Contribute - -You'll still need to render `README.Rmd` regularly, to keep `README.md` up-to-date. `devtools::build_readme()` is handy for this. - -Use `usethis::use_import_from("stringr","str_replace_all")` to add a function you are using in the code. - -Don't forget to commit and push the resulting figure files, so they display on GitHub and CRAN. - -### Contributors - -- Lorena Pantano -- Alex Bartlett -- Emma Berdan -- Heather Wick -- James Billingsley diff --git a/README.md b/README.md index 0578665..35161bb 100644 --- a/README.md +++ b/README.md @@ -1,28 +1,28 @@ - # bcbioR [![R-CMD-check](https://github.com/bcbio/bcbioR/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/bcbio/bcbioR/actions/workflows/R-CMD-check.yaml) + -The goal of `bcbioR` is to create guidelines for NGS data interpretation -based on the experience of the Harvard Chan Bioinformatics Core and -everybody who contributes to this package. +The goal of `bcbioR` is to create guidelines for NGS data interpretation based on the experience of the Harvard Chan Bioinformatics Core and everybody who contributes to this package. ## Installation -You can install the development version of bcbioR from -[GitHub](https://github.com/) with: +You can install the development version of bcbioR from [GitHub](https://github.com/) with: -``` r +``` # install.packages("devtools") devtools::install_github("bcbio/bcbioR") +devtools::install_github("bcbio/bcbioR",ref = "devel") ``` ## Quick start -``` r +Use this code to generate a standard project name for all of your folders. **This code will not create any folders or files.** + +``` library(bcbioR) ## basic example code # will help you to build a folder name following HCBC naming rules @@ -31,46 +31,59 @@ bcbio_set_project() ### Set base project -The following code will pop up a Rmd template and then clicking ‘save’ -will populate that folder with HCBC data structure guidelines +use `setwd()` to set your current directory to the place where you want to work. The bcbioR functions will automatically write to whatever directory you have set. -``` r -rmarkdown::draft("project_folder",template="common",package="bcbioR") +``` +setwd("/path/to/analysis/folder") ``` -As well, You can get this by going to File -\> New File -\> R -markdown: - then `From Template`, and choose `bcbio base` - choose the -folder to deploy files, it could be a new folder or an existing one +The following code will pop up a Rmd template will populate that folder with HCBC data structure guidelines + +``` +bcbio_templates(type="base", outpath="/path/to/analysis/folder") +``` ### Set RNAseq report folder -This code will populate the folder with HCBC data structure guidelines -and Rmd code: +This code will populate the folder with HCBC data structure guidelines and Rmd code: **You do not need to create a reports folder prior to running this code. This will create and populate the reports folder.** -``` r -bcbio_templates(type="rnaseq", outpath="test_folder/reports") +``` +bcbio_templates(type="rnaseq", outpath="/path/to/analysis/folder/reports") ``` ### Discover more… -Go to the vignette to know more -`vignette("bcbioR_quick_start,package="bcbioR")` +Go to the vignette to know more `vignette("bcbioR_quick_start",package="bcbioR")` ## How to Contribute -You’ll still need to render `README.Rmd` regularly, to keep `README.md` -up-to-date. `devtools::build_readme()` is handy for this. +### Open an issue + +- If you find a bug +- If you want a new feature +- If you want to add code to the templates + +### Modify the code + +- Clone the repository +- Make sure you are in the `devel` branch +- Create a new branch `git checkout -b feature1` +- Modify you code, add and commit +- Push to GitHub the new branch +- Create a PR from your branch to `devel` +- Assignt the PR to me or Alex -Use `usethis::use_import_from("stringr","str_replace_all")` to add a -function you are using in the code. +Some best practices when developing: -Don’t forget to commit and push the resulting figure files, so they -display on GitHub and CRAN. +- install `devtools` +- Use `usethis::use_import_from("stringr","str_replace_all")` to add a new function you are using in the code. ### Contributors -- Lorena Pantano -- Alex Bartlett -- Emma Berdan -- Heather Wick -- James Billingsley +- Lorena Pantano +- Alex Bartlett +- Emma Berdan +- Heather Wick +- James Billingsley +- Zhu Zhuo +- Elizabeth Partan diff --git a/data-raw/DATASET.R b/data-raw/DATASET.R new file mode 100644 index 0000000..c530f42 --- /dev/null +++ b/data-raw/DATASET.R @@ -0,0 +1,29 @@ +## code to prepare `DATASET` dataset goes here +library(DESeq2) +library(tidyverse) +coldata_fn = 'https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/nf-core/coldata.csv' +counts_fn = 'https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/nf-core/salmon.merged.gene_counts.tsv' +multiqc_data_dir = 'https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/nf-core/multiqc-report-data/' +se_object = NA +column = "sample_type" +numerator = "tumor" +denominator = "normal" +subset_column = NULL +subset_value = NULL + +coldata <- load_coldata(coldata_fn, column, + numerator, denominator, + subset_column, subset_value) +metrics <- load_metrics(se_object, multiqc_data_dir) %>% + left_join(coldata, by = c('sample' = 'description')) %>% + as.data.frame() +rownames(metrics) <- metrics$sample +counts <- load_counts(counts_fn) +counts <- counts[,colnames(counts) %in% coldata$description] +# if the names don't match in order or string check files names and coldata information +counts = counts[rownames(coldata)] +stopifnot(all(names(counts) == rownames(coldata))) + +dds_to_use <- DESeqDataSetFromMatrix(counts, coldata, design = ~sample_type) +bcbio_vsd_data <- vst(dds_to_use) +usethis::use_data(bcbio_vsd_data, overwrite = TRUE) diff --git a/data/bcbio_vsd_data.rda b/data/bcbio_vsd_data.rda new file mode 100644 index 0000000..c1ccf33 Binary files /dev/null and b/data/bcbio_vsd_data.rda differ diff --git a/inst/extdata/annotation/hg38.rna.gtf.gz b/inst/extdata/annotation/hg38.rna.gtf.gz new file mode 100644 index 0000000..26bf2d9 Binary files /dev/null and b/inst/extdata/annotation/hg38.rna.gtf.gz differ diff --git a/inst/extdata/annotation/mm10.rna.gtf.gz b/inst/extdata/annotation/mm10.rna.gtf.gz new file mode 100644 index 0000000..7abda7b Binary files /dev/null and b/inst/extdata/annotation/mm10.rna.gtf.gz differ diff --git a/inst/extdata/rnaseq_good.csv b/inst/extdata/rnaseq_good.csv new file mode 100644 index 0000000..dfa8b78 --- /dev/null +++ b/inst/extdata/rnaseq_good.csv @@ -0,0 +1,7 @@ +sample,fastq_1,fastq_2,strandedness,tissue,sex,4_weeks,bone_tissue +Fourwkbone1,s3://hcbc-test/input/rawdata/P496_EMoor18780_E02v1_4wkbone1_S13_L001_R1_001.fastq.gz,s3://hcbc-test/input/rawdata/P496_EMoor18780_E02v1_4wkbone1_S13_L001_R2_001.fastq.gz,auto,4 week old bone tissue,Female,4_weeks,bone_tissue +Fourwkbone1,s3://hcbc-test/input/rawdata/P496_EMoor18780_E02v1_4wkbone1_S13_L002_R1_001.fastq.gz,s3://hcbc-test/input/rawdata/P496_EMoor18780_E02v1_4wkbone1_S13_L002_R2_001.fastq.gz,auto,4 week old bone tissue,Female,4_weeks,bone_tissue +Fourwkbone2,s3://hcbc-test/input/rawdata/P496_EMoor18780_F02v1_4wkbone2_S14_L001_R1_001.fastq.gz,s3://hcbc-test/input/rawdata/P496_EMoor18780_F02v1_4wkbone2_S14_L001_R2_001.fastq.gz,auto,4 week old bone tissue,Female,4_weeks,bone_tissue +Fourwkbone2,s3://hcbc-test/input/rawdata/P496_EMoor18780_F02v1_4wkbone2_S14_L002_R1_001.fastq.gz,s3://hcbc-test/input/rawdata/P496_EMoor18780_F02v1_4wkbone2_S14_L002_R2_001.fastq.gz,auto,4 week old bone tissue,Female,4_weeks,bone_tissue +Fourwkbone3,s3://hcbc-test/input/rawdata/P496_EMoor18780_G02v1_4wkbone3_S15_L001_R1_001.fastq.gz,s3://hcbc-test/input/rawdata/P496_EMoor18780_G02v1_4wkbone3_S15_L001_R2_001.fastq.gz,auto,4 week old bone tissue,Male,4_weeks,bone_tissue +Fourwkbone3,s3://hcbc-test/input/rawdata/P496_EMoor18780_G02v1_4wkbone3_S15_L002_R1_001.fastq.gz,s3://hcbc-test/input/rawdata/P496_EMoor18780_G02v1_4wkbone3_S15_L002_R2_001.fastq.gz,auto,4 week old bone tissue,Male,4_weeks,bone_tissue diff --git a/inst/extdata/rnaseq_missing_col.csv b/inst/extdata/rnaseq_missing_col.csv new file mode 100644 index 0000000..f588040 --- /dev/null +++ b/inst/extdata/rnaseq_missing_col.csv @@ -0,0 +1,7 @@ +sample,fastq_1,strandedness,tissue,sex,4_weeks,bone_tissue +Fourwkbone1,s3://hcbc-test/input/rawdata/P496_EMoor18780_E02v1_4wkbone1_S13_L001_R2_001.fastq.gz,auto,4 week old bone tissue,Female,4_weeks,bone_tissue +Fourwkbone1,s3://hcbc-test/input/rawdata/P496_EMoor18780_E02v1_4wkbone1_S13_L002_R2_001.fastq.gz,auto,4 week old bone tissue,Female,4_weeks,bone_tissue +Fourwkbone2,s3://hcbc-test/input/rawdata/P496_EMoor18780_F02v1_4wkbone2_S14_L001_R2_001.fastq.gz,auto,4 week old bone tissue,Female,4_weeks,bone_tissue +Fourwkbone2,s3://hcbc-test/input/rawdata/P496_EMoor18780_F02v1_4wkbone2_S14_L002_R2_001.fastq.gz,auto,4 week old bone tissue,Female,4_weeks,bone_tissue +Fourwkbone3,s3://hcbc-test/input/rawdata/P496_EMoor18780_G02v1_4wkbone3_S15_L001_R2_001.fastq.gz,auto,4 week old bone tissue,Male,4_weeks,bone_tissue +Fourwkbone3,s3://hcbc-test/input/rawdata/P496_EMoor18780_G02v1_4wkbone3_S15_L002_R2_001.fastq.gz,auto,4 week old bone tissue,Male,4_weeks,bone_tissue diff --git a/inst/extdata/rnaseq_na.csv b/inst/extdata/rnaseq_na.csv new file mode 100644 index 0000000..a685b99 --- /dev/null +++ b/inst/extdata/rnaseq_na.csv @@ -0,0 +1,7 @@ +sample,fastq_1,fastq_2,strandedness,tissue,sex,4_weeks,bone_tissue +Fourwkbone1,s3://hcbc-test/input/rawdata/P496_EMoor18780_E02v1_4wkbone1_S13_L001_R1_001.fastq.gz,s3://hcbc-test/input/rawdata/P496_EMoor18780_E02v1_4wkbone1_S13_L001_R2_001.fastq.gz,auto,4 week old bone tissue,Female,4_weeks,bone_tissue +Fourwkbone1,s3://hcbc-test/input/rawdata/P496_EMoor18780_E02v1_4wkbone1_S13_L002_R1_001.fastq.gz,s3://hcbc-test/input/rawdata/P496_EMoor18780_E02v1_4wkbone1_S13_L002_R2_001.fastq.gz,auto,4 week old bone tissue,Female,4_weeks,bone_tissue +Fourwkbone2,s3://hcbc-test/input/rawdata/P496_EMoor18780_F02v1_4wkbone2_S14_L001_R1_001.fastq.gz,s3://hcbc-test/input/rawdata/P496_EMoor18780_F02v1_4wkbone2_S14_L001_R2_001.fastq.gz,auto,4 week old bone tissue,Female,4_weeks,bone_tissue +Fourwkbone2,s3://hcbc-test/input/rawdata/P496_EMoor18780_F02v1_4wkbone2_S14_L002_R1_001.fastq.gz,s3://hcbc-test/input/rawdata/P496_EMoor18780_F02v1_4wkbone2_S14_L002_R2_001.fastq.gz,auto,4 week old bone tissue,Female,4_weeks,bone_tissue +Fourwkbone3,s3://hcbc-test/input/rawdata/P496_EMoor18780_G02v1_4wkbone3_S15_L001_R1_001.fastq.gz,s3://hcbc-test/input/rawdata/P496_EMoor18780_G02v1_4wkbone3_S15_L001_R2_001.fastq.gz,auto,4 week old bone tissue,Male,4_weeks,bone_tissue +Fourwkbone3,s3://hcbc-test/input/rawdata/P496_EMoor18780_G02v1_4wkbone3_S15_L002_R1_001.fastq.gz,s3://hcbc-test/input/rawdata/P496_EMoor18780_G02v1_4wkbone3_S15_L002_R2_001.fastq.gz,auto,4 week old bone tissue,Male,,bone_tissue diff --git a/inst/extdata/rnaseq_wnumbers.csv b/inst/extdata/rnaseq_wnumbers.csv new file mode 100644 index 0000000..75bf6d1 --- /dev/null +++ b/inst/extdata/rnaseq_wnumbers.csv @@ -0,0 +1,7 @@ +sample,fastq_1,fastq_2,strandedness,tissue,sex,4_weeks,bone_tissue +4wkbone1,s3://hcbc-test/input/rawdata/P496_EMoor18780_E02v1_4wkbone1_S13_L001_R1_001.fastq.gz,s3://hcbc-test/input/rawdata/P496_EMoor18780_E02v1_4wkbone1_S13_L001_R2_001.fastq.gz,auto,4 week old bone tissue,Female,4_weeks,bone_tissue +4wkbone1,s3://hcbc-test/input/rawdata/P496_EMoor18780_E02v1_4wkbone1_S13_L002_R1_001.fastq.gz,s3://hcbc-test/input/rawdata/P496_EMoor18780_E02v1_4wkbone1_S13_L002_R2_001.fastq.gz,auto,4 week old bone tissue,Female,4_weeks,bone_tissue +4wkbone2,s3://hcbc-test/input/rawdata/P496_EMoor18780_F02v1_4wkbone2_S14_L001_R1_001.fastq.gz,s3://hcbc-test/input/rawdata/P496_EMoor18780_F02v1_4wkbone2_S14_L001_R2_001.fastq.gz,auto,4 week old bone tissue,Female,4_weeks,bone_tissue +4wkbone2,s3://hcbc-test/input/rawdata/P496_EMoor18780_F02v1_4wkbone2_S14_L002_R1_001.fastq.gz,s3://hcbc-test/input/rawdata/P496_EMoor18780_F02v1_4wkbone2_S14_L002_R2_001.fastq.gz,auto,4 week old bone tissue,Female,4_weeks,bone_tissue +4wkbone3,s3://hcbc-test/input/rawdata/P496_EMoor18780_G02v1_4wkbone3_S15_L001_R1_001.fastq.gz,s3://hcbc-test/input/rawdata/P496_EMoor18780_G02v1_4wkbone3_S15_L001_R2_001.fastq.gz,auto,4 week old bone tissue,Male,4_weeks,bone_tissue +4wkbone3,s3://hcbc-test/input/rawdata/P496_EMoor18780_G02v1_4wkbone3_S15_L002_R1_001.fastq.gz,s3://hcbc-test/input/rawdata/P496_EMoor18780_G02v1_4wkbone3_S15_L002_R2_001.fastq.gz,auto,4 week old bone tissue,Male,,bone_tissue diff --git a/inst/rmarkdown/templates/common/skeleton/.gitignore b/inst/rmarkdown/templates/common/skeleton/.gitignore new file mode 100644 index 0000000..4cdfa50 --- /dev/null +++ b/inst/rmarkdown/templates/common/skeleton/.gitignore @@ -0,0 +1,9 @@ +*.Rproj* +.Rhistory +.Rproj.user +.Rhistory +.DS* +*._* +*placeholder* +data/* +final/* \ No newline at end of file diff --git a/inst/rmarkdown/templates/common/skeleton/README.md b/inst/rmarkdown/templates/common/skeleton/README.md index 1cb86f1..3f00ca3 100644 --- a/inst/rmarkdown/templates/common/skeleton/README.md +++ b/inst/rmarkdown/templates/common/skeleton/README.md @@ -14,10 +14,11 @@ - `meta` should contain the CSV/YAML files used by *bcbio* or *nextflow* - `scripts` should contain `sbatch` scripts or any custom scripts used in this project -- `data` contains raw data +- `data` contains raw data, it can contains big data objects - `reports` contains `Rmd` and `html` together with their files that will be added to *DropBox*. Each type of project have different guidelines. - `final` contains the output of *nextflow/bcbio* -- For any relevant files or papers use the `docs` folder on *DropBox* +- `code` contains any other files that support custom analysis and don't generate a report +- For any relevant client files or papers use the `docs` folder on *DropBox* ## Download diff --git a/inst/rmarkdown/templates/cosmx/skeleton/QC/QC.Rmd b/inst/rmarkdown/templates/cosmx/skeleton/QC/QC.Rmd new file mode 100644 index 0000000..4271822 --- /dev/null +++ b/inst/rmarkdown/templates/cosmx/skeleton/QC/QC.Rmd @@ -0,0 +1,317 @@ +--- +title: "QC" +author: "Harvard Chan Bioinformatics Core" +date: "`r Sys.Date()`" +output: + html_document: + code_folding: hide + df_print: paged + highlights: pygments + number_sections: false + self_contained: true + theme: default + toc: true + toc_float: + collapsed: true + smooth_scroll: true +editor_options: + chunk_output_type: console +params: + project_file: ./reports/information.R + seurat_fn: ./data/CTCL_PAT8_DEV3_LVL7.RDS + results_dir: ./results + min_transcripts: 100 + min_genes: 20 + min_novelty: .7 + umap_dim: approximateumap_8c6f278e.b9f4.4535.aeca.8955c1dff614_1 +--- + +```{r load_params, echo = F} +source(params$project_file) +``` + +```{r load_libraries, cache = FALSE, message = FALSE, warning=FALSE, echo=FALSE,} + +library(tidyverse) +library(Seurat) +library(bcbioR) +library(ggprism) +library(knitr) +library(tools) +library(qs) + +colors=cb_friendly_cols(1:15) +ggplot2::theme_set(theme_prism(base_size = 14)) +opts_chunk[["set"]]( + cache = F, + cache.lazy = FALSE, + dev = c("png", "pdf"), + error = TRUE, + highlight = TRUE, + message = FALSE, + prompt = FALSE, + tidy = FALSE, + warning = FALSE, + echo = T, + fig.height = 4) + +# set seed for reproducibility +set.seed(1234567890L) +``` + +```{r sanitize_datatable} +sanitize_datatable = function(df, ...) { + # remove dashes which cause wrapping + DT::datatable(df, ..., rownames=gsub("-", "_", rownames(df)), + colnames=gsub("-", "_", colnames(df)), + filter = 'top') +} +``` + +# Overview + +- Project: `r project` +- PI: `r PI` +- Analyst: `r analyst` +- Experiment: `r experiment` +- Aim: `r aim` +- Sample: `r params$seurat_fn` + +```{r read rds} + +# resave RDS as QS for faster reading/writing +seurat_qs_fn <- paste0(file_path_sans_ext(params$seurat_fn), '.qs') + +if (!file.exists(seurat_qs_fn)){ + seurat <- readRDS(params$seurat_fn) + qsave(seurat, seurat_qs_fn, preset = 'fast') +} else { + seurat <- qread(seurat_qs_fn) +} + +centroids <- data.frame(x = seurat$x_slide_mm, y = seurat$y_slide_mm, cell = seurat$cell_id) +cents <- CreateCentroids(centroids) +coords <- CreateFOV(coords = list(centroids = cents), type = "centroids") + +``` + +```{r plot tissue} + +ggplot(seurat@meta.data, aes(y = x_slide_mm, x = y_slide_mm)) + + geom_point(alpha = 0.05, size = 0.01) + facet_wrap(~Run_Tissue_name) + coord_equal() + + labs(title = "Cell coordinates in XY space") + + +``` + +# QC Plots {.tabset} + +## nGenes + +```{r plot n_genes hist} +ggplot(data = seurat[[]], aes(x = nFeature_RNA)) + geom_histogram(binwidth = 50) + + geom_vline(xintercept = params$min_genes, col = "red", linetype = "dashed") + + xlab("nGenes") +``` + +## nUMIs + +```{r plot n_umis hist} +ggplot(data = seurat[[]], aes(x = nCount_RNA)) + geom_histogram(binwidth = 50) + + geom_vline(xintercept = params$min_transcripts, col = "red", linetype = "dashed") + + xlab("nUMIs") +``` + +## nUMIs ranked +```{r plot n_umis ranked} +nUMIs_df <- seurat[[]] %>% + as.data.frame() %>% + rownames_to_column(var = "barcode") %>% + dplyr::select(c(barcode, nCount_RNA)) %>% + dplyr::arrange(-nCount_RNA) + +nUMIs_df$ranked_bc <- as.integer(rownames(nUMIs_df)) + +``` + +## novelty +```{r plot novelty} +seurat$novelty <- log10(seurat@meta.data$nFeature_RNA) / log10(seurat@meta.data$nCount_RNA) + +novelty_df <- seurat[[]] %>% + as.data.frame() %>% + rownames_to_column(var = "barcode") %>% + dplyr::select(c(barcode, novelty)) %>% + dplyr::arrange(-novelty) + +novelty_df$ranked_bc <- as.integer(rownames(novelty_df)) + +ggplot() + geom_line(data = novelty_df, aes(x = ranked_bc, y = novelty), col = "red") + + scale_y_continuous(trans = "log10") + + geom_hline(yintercept = params$min_novelty, linetype = "dashed") + + ylab("novelty") + xlab("barcode") + +``` + +# Filtering Low-Quality Cells + +We discard cells that have less than `r params$min_genes` features and genes present in less than 3 cells. Additionally, we apply the following AtoMx QC flags to select/filter cells: + +qcFlagsCellComplex = Pass. Filtering for complexity (nCount_RNA / nFeature_RNA) qcFlagsCellArea = Pass. Cell areas flagged as outliers by Grubbs test + + +```{r qc filtering} +counts <- LayerData(seurat, layer = "counts", assay = "RNA") + +seurat_filtered <- CreateSeuratObject(counts = counts, meta.data = seurat@meta.data, min.cells = 3, + min.features = params$min_genes) + +selected_cells <- seurat_filtered[[]] %>% + as.data.frame() %>% + dplyr::filter(qcFlagsCellComplex == "Pass", + qcFlagsCellArea == "Pass", + novelty > params$min_novelty + # qcFlagsFOV == "Pass" + ) %>% + pull(cell_id) + +seurat_filtered <- subset(seurat_filtered, cells = selected_cells) + +n_before <- nrow(seurat[[]]) +n_after <- nrow(seurat_filtered[[]]) +``` + +There were `r n_before` cells before filtering and `r n_after` afterwards, for a total of +`r n_after/n_before * 100`% remaining + +```{r plot pre/post qc} + +pre <- ImageDimPlot(seurat) + NoLegend() + ggtitle("Pre-Filtering") +seurat_filtered[["FOV"]] <- subset(coords, cell = Cells(seurat_filtered)) +post <- ImageDimPlot(seurat_filtered) + NoLegend() + ggtitle("Post-Filtering") + +discarded_cells <- colnames(seurat)[!colnames(seurat) %in% colnames(seurat_filtered)] +seurat$selected <- seurat[[]] %>% + as.data.frame() %>% + dplyr::mutate( + selected = case_when( + cell_id %in% discarded_cells ~ "discarded", + TRUE ~ "selected") + ) %>% + pull(selected) + +seurat$selected <- factor(seurat$selected, levels = c("selected", "discarded")) +discarded <- ImageDimPlot(seurat, group.by = "selected") + NoLegend() + ggtitle("Blue - discarded") +print(pre + post + discarded) + +``` + +# Processing + +```{r processing} + +# perform processing (one time). if already done previously, load from file +processed_seurat_fn <- paste0(file_path_sans_ext(params$seurat_fn), '_processed.qs') +if (!file.exists(processed_seurat_fn)) { + seurat_filtered <- SCTransform(seurat_filtered, assay = "RNA", clip.range = c(-10,10), verbose = FALSE) + seurat_filtered <- NormalizeData(seurat_filtered, assay = "RNA") + seurat_filtered <- RunPCA(seurat_filtered) + seurat_filtered <- FindNeighbors(seurat_filtered, dims = 1:30) + seurat_filtered <- RunUMAP(seurat_filtered, dims = 1:30) + seurat_filtered <- FindClusters(seurat_filtered, resolution = 0.1, verbose = FALSE) + qsave(seurat_filtered, processed_seurat_fn, preset = 'fast') +} else { + seurat_filtered <- qread(processed_seurat_fn) +} + +``` + +```{r plot umap before} + +# TODO find colname of pre-filtering umap data in seurat object, use as params$umap_dim at top of file +DimPlot(seurat, reduction = paste(params$umap_dim), + pt.size = 0.6) + labs(x = "umap_1", y = "umap_2", title = "Pre-Filtering") + +``` + +```{r plot umap after} +DimPlot(seurat_filtered, reduction = "umap", pt.size = 0.6) + + ggtitle("Post-Filtering") + +``` + +```{r plot image clusters} +ImageDimPlot(seurat_filtered, axes = TRUE, crop = TRUE, combine = TRUE) + +``` + +# Markers + +## Cell Type Markers {.tabset} + +```{r markers of interest, results = 'asis'} + +## TODO replace with markers relevant to your project +markers_of_interest <- c('CD4', 'CD8A', 'CD8B', 'CD63', 'CD69', 'HBB') + +for (marker in markers_of_interest) { + cat("### ", marker, "\n") + + FeaturePlot(seurat_filtered, features = marker, max.cutoff = "q95", min.cutoff = "q05", + reduction = "umap", pt.size = 0.6, order = T) %>% + print() + + p <- ImageFeaturePlot(seurat_filtered, features = marker, max.cutoff = "q95", + size = 1, crop = TRUE, combine = FALSE) + print(p) + cat('\n') +} +``` + +# Cell Type Identification + +## Azimuth + +```{r} +# perform Azimuth cell type identification (once). if already done previously, load from file +azimuth_seurat_fn <- paste0(file_path_sans_ext(params$seurat_fn), '_azimuth.qs') +if (!file.exists(azimuth_seurat_fn)) { + seurat_filtered_ann_pbmc <- RunAzimuth(seurat_filtered, assay = "RNA", reference = "pbmcref") + qsave(seurat_filtered_ann_pbmc, azimuth_seurat_fn, preset = 'fast') + +} else { + seurat_filtered_ann_pbmc <- qread(azimuth_seurat_fn) +} +DimPlot(seurat_filtered_ann_pbmc, group.by = 'predicted.celltype.l1') +ImageDimPlot(seurat_filtered_ann_pbmc, axes = TRUE, crop = TRUE, combine = TRUE, group.by = 'predicted.celltype.l1') + +``` + +## Seurat and GPT4 +```{r} +markers <- FindAllMarkers(seurat_filtered) + +# TODO: uncomment this chunk to get markers for pasting into GPT4 +# markers %>% +# dplyr::filter(avg_log2FC > 0) %>% +# select(c("cluster", "gene")) %>% +# group_by(cluster) %>% +# slice(1:20) %>% +# summarise(gene = paste(gene, collapse = ", ")) + +markers %>% sanitize_datatable() + +## TODO replace with cell types identified by GPT4 for your markers +cluster_ids <- data.frame(cluster = as.factor(c(0:6)), + cell_type = as.factor(c('B-cells', 'Fibroblasts', 'Monocytes', + 'Cytotoxic T-cells', 'Endothelial Cells', + 'T-cells', 'Keratinocytes'))) + + +seurat_filtered@meta.data <- left_join(seurat_filtered@meta.data, cluster_ids, by = c('seurat_clusters' = 'cluster')) +rownames(seurat_filtered@meta.data) <- seurat_filtered@meta.data$cell_id + +ImageDimPlot(seurat_filtered, group.by = 'cell_type', axes = TRUE, crop = TRUE) + + +``` \ No newline at end of file diff --git a/inst/rmarkdown/templates/cosmx/skeleton/QC/run_markdown.R b/inst/rmarkdown/templates/cosmx/skeleton/QC/run_markdown.R new file mode 100644 index 0000000..d454e45 --- /dev/null +++ b/inst/rmarkdown/templates/cosmx/skeleton/QC/run_markdown.R @@ -0,0 +1,13 @@ +library(rmarkdown) + +rmarkdown::render("reports/QC.Rmd", + output_dir = "reports", + clean = TRUE, + output_format = "html_document", + output_file = "QC.html", + params = list( + seurat_fn = '../data/CTCL_PAT8_DEV3_LVL7.RDS', + project_file = './information.R', + umap_dim = 'approximateumap_8c6f278e.b9f4.4535.aeca.8955c1dff614_1' + ) +) \ No newline at end of file diff --git a/inst/rmarkdown/templates/cosmx/skeleton/information.R b/inst/rmarkdown/templates/cosmx/skeleton/information.R new file mode 100644 index 0000000..6e15eef --- /dev/null +++ b/inst/rmarkdown/templates/cosmx/skeleton/information.R @@ -0,0 +1,6 @@ +# info params +project = "name_hbcXXXXX" +PI = 'person name' +experiment = 'short description' +aim = 'short description' +analyst = 'person in the core' diff --git a/inst/rmarkdown/templates/cosmx/template.yml b/inst/rmarkdown/templates/cosmx/template.yml new file mode 100644 index 0000000..12712ff --- /dev/null +++ b/inst/rmarkdown/templates/cosmx/template.yml @@ -0,0 +1,3 @@ +name: bcbio CosMx +description: Standard CoxMx down-stream analyses +create_dir: false diff --git a/inst/rmarkdown/templates/rnaseq/skeleton/DE/DEG.Rmd b/inst/rmarkdown/templates/rnaseq/skeleton/DE/DEG.Rmd index f390ad4..8139da5 100644 --- a/inst/rmarkdown/templates/rnaseq/skeleton/DE/DEG.Rmd +++ b/inst/rmarkdown/templates/rnaseq/skeleton/DE/DEG.Rmd @@ -19,18 +19,37 @@ editor_options: params: numerator: tumor denominator: normal - subset_value: NA - ruv: true + column: sample_type + subset_column: null + subset_value: null + # Put hg38, mm10, mm39, or other + genome: hg38 + ruv: false params_file: params_de.R project_file: ../information.R + functions_file: load_data.R + --- -```{r load_params, echo = F} +```{r load_params, cache = FALSE, message = FALSE, warning=FALSE} +# 1. Set up input files in this R file (params_de.R) source(params$params_file) +# 2. Set up project file (already done from QC probably) source(params$project_file) +# 3. Load custom functions to load data from coldata/metrics/counts +source(params$functions_file) +# IMPORTANT set these values if you are not using the parameters at the top +genome=params$genome +column=params$column +numerator=params$numerator +denominator=params$denominator +subset_column=params$subset_column +subset_value=params$subset_value +run_ruv=params$ruv ``` -```{r load_libraries, cache = FALSE, message = FALSE, warning=FALSE, echo=FALSE,} + +```{r load_libraries, cache = FALSE, message = FALSE, warning=FALSE} library(rtracklayer) library(DESeq2) library(tidyverse) @@ -47,6 +66,7 @@ library(bcbioR) library(ggprism) library(viridis) library(pheatmap) +library(janitor) colors=cb_friendly_cols(1:15) ggplot2::theme_set(theme_prism(base_size = 14)) opts_chunk[["set"]]( @@ -66,6 +86,28 @@ opts_chunk[["set"]]( set.seed(1234567890L) ``` + +```{r load_data, message=F, warning=F} +# This code will load from bcbio or nf-core folder +# NOTE make sure to set numerator and denominator +coldata <- load_coldata(coldata_fn, column, + numerator, denominator, + subset_column, subset_value) +coldata$sample=row.names(coldata) + +counts <- load_counts(counts_fn) +counts <- counts[,colnames(counts) %in% coldata$sample] + +metrics <- load_metrics(se_object, multiqc_data_dir, gtf_fn, counts) %>% + left_join(coldata, by = c('sample')) %>% + as.data.frame() +rownames(metrics) <- metrics$sample +# if the names don't match in order or string check files names and coldata information +counts = counts[,rownames(metrics)] +coldata = coldata[rownames(metrics),] +stopifnot(all(names(counts) == rownames(metrics))) +``` + ```{r sanitize_datatable} sanitize_datatable = function(df, ...) { # remove dashes which cause wrapping @@ -81,18 +123,18 @@ sanitize_datatable = function(df, ...) { - Analyst: `r analyst` - Experiment: `r experiment` - Aim: `r aim` -- Comparison: `r paste0(params$subset_value, ': ', params$numerator, ' vs. ', params$denominator)` +- Comparison: `r ifelse(is.null(subset_value), paste0(numerator, ' vs. ', denominator), paste0(subset_value, ': ', numerator, ' vs. ', denominator))` ```{r create_filenames} -if (!is.na(params$subset_value)){ - filenames = str_interp("${params$subset_value}_${params$numerator}_vs_${params$denominator}") +if (!is.null(subset_value) & !is.null(subset_value)){ + filenames = str_interp("${subset_value}_${numerator}_vs_${denominator}") } else { - filenames = str_interp("${params$numerator}_vs_${params$denominator}") + filenames = str_interp("${numerator}_vs_${denominator}") } -contrasts = c(column,params$numerator,params$denominator) -coef=paste0(column,"_",params$numerator,"_vs_",params$denominator) +contrasts = c(column,numerator,denominator) +coef=paste0(column,"_",numerator,"_vs_",denominator) name_expression_fn=file.path( basedir, @@ -106,25 +148,7 @@ name_pathways_fn=file.path( ``` -```{r read_counts_data} -coldata=read.csv(coldata_fn) -stopifnot(column %in% names(coldata)) - -# use only some samples, by default use all -if (!is.na(subset_column)){ - coldata <- coldata[coldata[[paste(subset_column)]] == params$subset_value, ] -} -coldata <- coldata[coldata[[paste(column)]] %in% c(params$numerator, params$denominator), ] -rownames(coldata) <- coldata$description -coldata[[column]] = relevel(as.factor(coldata[[column]]), params$denominator) - -counts <- read_csv(counts_fn) %>% column_to_rownames('gene') -counts <- counts[,colnames(counts) %in% coldata$description] - - -# if the names don't match in order or string check files names and coldata information -counts = counts[rownames(coldata)] -stopifnot(all(names(counts) == rownames(coldata))) +```{r load_counts_data} rdata = AnnotationDbi::select(org.Hs.eg.db, rownames(counts), 'SYMBOL', 'ENSEMBL') %>% dplyr::select(gene_id = ENSEMBL, gene_name = SYMBOL) @@ -142,14 +166,9 @@ norm_matrix = assay(vsd_before) # Covariate analysis ```{r covariates, fig.height = 6, fig.width = 10} - -se <- readRDS(se_object) #local -metrics <- metadata(se)$metrics %>% mutate(sample = toupper(sample)) %>% - left_join(coldata %>% rownames_to_column('sample')) %>% column_to_rownames('sample') - degCovariates( norm_matrix, - metrics + metrics, ) ``` @@ -172,16 +191,17 @@ dds_to_use <- DESeqDataSetFromMatrix(counts, coldata, design = formula) vsd_before <- vst(dds_to_use) norm_matrix = assay(vsd_before) +new_cdata <- coldata ``` -```{r, eval=params$ruv, results='asis', echo=FALSE} +```{r, eval=run_ruv, results='asis', echo=FALSE} cat("# Remove Unwanted Variability When performing differential expression analysis, it is important to ensure that any detected differences are truly a result of the experimental comparison being made and not any additional variability in the data.") ``` -```{r do_RUV, eval=params$ruv} +```{r do_RUV, eval=run_ruv} # If you want to skip the code, just set up formula to be your model in the next chunk of code design <- coldata[[column]] diffs <- makeGroups(design) @@ -204,7 +224,7 @@ pca2 + scale_color_cb_friendly() ``` -```{r after_RUV, eval=params$ruv} +```{r after_RUV, eval=run_ruv} dds_to_use <- DESeqDataSetFromMatrix(counts, new_cdata, design = formula) vsd_to_use<- vst(dds_to_use, blind=FALSE) @@ -279,7 +299,7 @@ EnhancedVolcano(res_mod, ```{r heapmap} ### Run pheatmap using the metadata data frame for the annotation ma=norm_matrix[res_sig$gene_id,] -colma=coldata[,c("sample_type"), drop=FALSE] +colma=coldata[,c(column), drop=FALSE] colors=lapply(colnames(colma), function(c){ l.col=colors[1:length(unique(colma[[c]]))] names(l.col)=unique(colma[[c]]) @@ -317,13 +337,14 @@ top_n_exp <- norm_matrix %>% as.data.frame() %>% # dplyr::select(-group, -group_name) %>% pivot_longer(!gene_id, names_to = 'sample', values_to = 'log2_expression') %>% right_join(top_n, relationship = "many-to-many") %>% - left_join(coldata, by = c('sample' = 'description')) + left_join(coldata, by = 'sample') ggplot(top_n_exp, aes_string(x = column, y = 'log2_expression')) + geom_boxplot(outlier.shape = NA, linewidth=0.5, color="grey") + geom_point() + facet_wrap(~gene_name) + - ggtitle(str_interp('Expression of Top ${n} DEGs')) + ggtitle(str_interp('Expression of Top ${n} DEGs')) + + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) ``` @@ -386,21 +407,21 @@ pathways_ora_all %>% sanitize_datatable() ```{r write-files} counts_norm=norm_matrix %>% as.data.frame() %>% rownames_to_column("gene_id") %>% - mutate(comparison = str_interp("${params$numerator}_vs_${params$denominator}")) + mutate(comparison = str_interp("${numerator}_vs_${denominator}")) res_for_writing <- res %>% - mutate(comparison = str_interp("${params$numerator}_vs_${params$denominator}")) + mutate(comparison = str_interp("${numerator}_vs_${denominator}")) pathways_for_writing <- pathways_long %>% - mutate(comparison = str_interp("${params$numerator}_vs_${params$denominator}")) + mutate(comparison = str_interp("${numerator}_vs_${denominator}")) -if (!is.na(params$subset_value)){ +if (!is.null(subset_value)){ counts_norm <- counts_norm %>% - mutate(subset = params$subset_value) + mutate(subset = subset_value) res_for_writing <- res_for_writing %>% - mutate(subset = params$subset_value) + mutate(subset = subset_value) pathways_for_writing <- pathways_for_writing %>% - mutate(subset = params$subset_value) + mutate(subset = subset_value) } write_csv(counts_norm, name_expression_fn) diff --git a/inst/rmarkdown/templates/rnaseq/skeleton/DE/PCA_variance_analysis.Rmd b/inst/rmarkdown/templates/rnaseq/skeleton/DE/PCA_variance_analysis.Rmd new file mode 100644 index 0000000..4074693 --- /dev/null +++ b/inst/rmarkdown/templates/rnaseq/skeleton/DE/PCA_variance_analysis.Rmd @@ -0,0 +1,54 @@ +--- +title: "PCA with variance analysis" +author: "Harvard Chan Bioinformatics Core" +--- + +Dispersion estimates are a key part of the DESEQ2 analysis. DESEQ2 uses data from all samples and all genes to generate a relationship between level expression and variance and then shrinks per gene dispersions to match this distribution. If one group has higher variance than all others this will affect the dispersion estimates. Here we visually check that the variance per group is similar using a PCA. The ellipses are minimal volume enclosing ellipses using the Khachiyan algorithm. + + +**Manually add in your covariate of interest to define the groups. One can be created by combining multiple metadata columns using the paste0 function.** + +```{r } +## Example of creating a group covariate + +# meta$group <- paste0(meta$sex,"_", meta$age,"_",meta$treatment) + +factor_of_interest <- "insert column name for covariate of interest" +``` + + +```{r } +library(DEGreport) +library(ggplot2) +library(ggforce) + +data("bcbio_vsd_data") + +colors=cb_friendly_cols(1:15) +ggplot2::theme_set(theme_prism(base_size = 14)) + +pca <- degPCA(assay(bcbio_vsd_data), colData(bcbio_vsd_data), + condition = factor_of_interest, name = "sample", data = T) + +pca$plot + ggtitle(paste0("All samples", "\nPCA using ", nrow(vst), " genes")) + + theme(plot.title=element_text(hjust=0.5)) + + geom_mark_ellipse(aes(color = sample_type)) +``` + +## PERMDISP + +Groups in a univariate analysis can also differ with regard to their mean values, variation around those means, or both. In univariate analyses, dispersion can be examined using Levene’s test. PERMDISP is a multivariate extension of Levene’s test to examine whether groups differ in variability. In essence, PERMDISP involves calculating the distance from each data point to its group centroid and then testing whether those distances differ among the groups. [Source](https://uw.pressbooks.pub/appliedmultivariatestatistics/chapter/permdisp/) + +Here we apply this test to our variance stabilized data. We calculate distances betwen samples and then use the betadisper() function from the popular vegan package. We get two overall p-values where significant means that the dispersions are different between groups. The first p-value comes from the anova() function and the second from the permutest() function. We also get pairwise p-values for every group-group comparison. + +```{r} +library(vegan) +vare.disa <- vegdist(t(assay(bcbio_vsd_data))) + +mod = betadisper(vare.disa, colData(bcbio_vsd_data)[['sample_type']]) +anova(mod) +permutest(mod, pairwise = TRUE) + +``` + + diff --git a/inst/rmarkdown/templates/rnaseq/skeleton/DE/load_data.R b/inst/rmarkdown/templates/rnaseq/skeleton/DE/load_data.R new file mode 100644 index 0000000..8a1d297 --- /dev/null +++ b/inst/rmarkdown/templates/rnaseq/skeleton/DE/load_data.R @@ -0,0 +1,146 @@ +library(tidyverse) +library(SummarizedExperiment) +library(janitor) +load_metrics <- function(se_object, multiqc_data_dir, gtf_fn, counts){ + + # bcbio input + if (!is.na(se_object)){ + + se <- readRDS(se_object) + metrics <- metadata(se)$metrics %>% as.data.frame() + # left_join(coldata %>% rownames_to_column('sample')) %>% column_to_rownames('sample') + } else { #nf-core input + + # Get metrics from nf-core into bcbio like table + # many metrics are already in the Genereal Table of MultiQC, this reads the file + metrics <- read_tsv(file.path(multiqc_data_dir, 'multiqc_general_stats.txt')) + + # we get some more metrics from Qualimap and rename columns + metrics_qualimap <- read_tsv(file.path(multiqc_data_dir, 'mqc_qualimap_genomic_origin_1.txt')) + metrics <- metrics %>% full_join(metrics_qualimap) + metrics <- metrics %>% + clean_names() %>% + dplyr::rename_with(~gsub('.*mqc_generalstats_', '', .)) + + # This uses the fastqc metrics to get total reads + total_reads <- metrics %>% + dplyr::filter(!is.na(fastqc_raw_total_sequences)) %>% + remove_empty(which = 'cols') %>% + dplyr::rename(single_sample = sample) %>% + mutate(sample = gsub('_[12]+$', '', single_sample)) %>% + group_by(sample) %>% + summarize(total_reads = sum(fastqc_raw_total_sequences)) + + # This renames to user-friendly names the metrics columns + metrics <- metrics %>% + dplyr::filter(is.na(fastqc_raw_total_sequences)) %>% + remove_empty(which = 'cols') %>% + full_join(total_reads) %>% + mutate(mapped_reads = samtools_reads_mapped) %>% + mutate(exonic_rate = exonic/(star_uniquely_mapped * 2)) %>% + mutate(intronic_rate = intronic/(star_uniquely_mapped * 2)) %>% + mutate(intergenic_rate = intergenic/(star_uniquely_mapped * 2)) %>% + mutate(x5_3_bias = qualimap_5_3_bias) + + # Sometimes we don't have rRNA due to mismatch annotation, We skip this if is the case + gtf <- NULL + if (genome =="other"){ + gtf <- gtf_fn + }else{ + if (genome == "hg38") { + gtf <- "hg38.rna.gtf.gz" + } else if (genome == "mm10") { + gtf <- "mm10.rna.gtf.gz" + } else if (genome == "mm39") { + gtf <- "mm39.rna.gtf.gz" + } + gtf <- system.file("extdata", "annotation", + gtf, + package="bcbioR") + } + if (is.null(gtf)) { + print("No genome provided! Please add it at the top of this Rmd") + } + + gtf=rtracklayer::import(gtf) + + + one=grep("gene_type", colnames(as.data.frame(gtf)), value = TRUE) + another=grep("gene_biotype", colnames(as.data.frame(gtf)), value = TRUE) + biotype=NULL + if(length(one)==1){ + biotype=one + }else if(length(another)==1){ + biotype=another + }else{ + warning("No gene biotype founded") + } + + if (!is.null(biotype)){ + annotation=as.data.frame(gtf) %>% .[,c("gene_id", biotype)] + rRNA=grepl("rRNA|tRNA",annotation[[biotype]]) + genes=intersect(annotation[rRNA,"gene_id"],row.names(counts)) + ratio=data.frame(sample=colnames(counts), + r_and_t_rna_rate=colSums(counts[genes,])/colSums(counts)) + metrics = left_join(metrics, ratio, by="sample") + }else{ + metrics[["r_and_t_rna_rate"]] <- NA + } + + # if ("custom_content_biotype_counts_percent_r_rna" %in% colnames(metrics)){ + # metrics <- mutate(metrics, r_rna_rate = custom_content_biotype_counts_percent_r_rna) + # }else{ + # metrics[["r_rna_rate"]] <- NA + # } + metrics=metrics[,c("sample","mapped_reads","exonic_rate","intronic_rate", + "total_reads", + "x5_3_bias", "r_and_t_rna_rate","intergenic_rate")] + } + metrics$sample <- make.names(metrics$sample) + rownames(metrics) <- metrics$sample + return(metrics) +} + +load_coldata <- function(coldata_fn, column, numerator, denominator, subset_column = NULL, subset_value = NULL){ + coldata=read.csv(coldata_fn) %>% + dplyr::select(!matches("fastq") & !matches("strandness")) %>% + distinct() + if('description' %in% names(coldata)){ + coldata$sample <- tolower(coldata$description) + } + coldata <- coldata %>% distinct(sample, .keep_all = T) + stopifnot(column %in% names(coldata)) + + # use only some samples, by default use all + if (!is.null(subset_column)){ + coldata <- coldata[coldata[[paste(subset_column)]] == subset_value, ] + } + #coldata <- coldata[coldata[[paste(column)]] %in% c(numerator, denominator), ] + #browser() + coldata$sample <- make.names(coldata$sample) + rownames(coldata) <- coldata$sample + coldata$description <- coldata$sample + + coldata[[column]] = relevel(as.factor(coldata[[column]]), denominator) + + return(coldata) +} + +load_counts <- function(counts_fn){ + + # bcbio input + if(grepl('csv', counts_fn)){ + counts <- read_csv(counts_fn) %>% + mutate(gene = str_replace(gene, pattern = "\\.[0-9]+$", "")) %>% + column_to_rownames('gene') + colnames(counts) = tolower(colnames(counts)) + return(counts) + } else { # nf-core input + counts <- read_tsv(counts_fn) %>% dplyr::select(-gene_name) %>% + mutate(gene_id = str_replace(gene_id, pattern = "\\.[0-9]+$", "")) %>% + column_to_rownames('gene_id') %>% round + + return(counts) + } + +} diff --git a/inst/rmarkdown/templates/rnaseq/skeleton/DE/params_de-example.R b/inst/rmarkdown/templates/rnaseq/skeleton/DE/params_de-example.R new file mode 100644 index 0000000..cc75ad2 --- /dev/null +++ b/inst/rmarkdown/templates/rnaseq/skeleton/DE/params_de-example.R @@ -0,0 +1,18 @@ +# project params +date = "YYYYMMDD" +basedir <- './' # where to write down output files + +# params for bcbio +# coldata_fn = "https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/bcbio/coldata.csv" +# counts_fn = 'https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/bcbio/tximport-counts.csv' +# se_object=url("https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/bcbio/bcbio-se.rds") +# + +# Example data +coldata_fn='https://raw.githubusercontent.com/bcbio/bcbioR-test-data/devel/rnaseq/nf-core/coldata.csv' +counts_fn=url('https://raw.githubusercontent.com/bcbio/bcbioR-test-data/devel/rnaseq/nf-core/star_salmon/salmon.merged.gene_counts.tsv') +# This folder is in the output directory inside multiqc folder +multiqc_data_dir='https://raw.githubusercontent.com/bcbio/bcbioR-test-data/devel/rnaseq/nf-core/multiqc/star_salmon/multiqc-report-data/' +# This file is inside the genome folder in the output directory +gtf_fn='https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/devel/nf-core/genome/genome.filtered.gtf.gz' +se_object = NA diff --git a/inst/rmarkdown/templates/rnaseq/skeleton/DE/params_de.R b/inst/rmarkdown/templates/rnaseq/skeleton/DE/params_de.R index c1212fc..8426428 100644 --- a/inst/rmarkdown/templates/rnaseq/skeleton/DE/params_de.R +++ b/inst/rmarkdown/templates/rnaseq/skeleton/DE/params_de.R @@ -1,8 +1,22 @@ # project params date = "YYYYMMDD" -column = "sample_type" -subset_column = NA -coldata_fn = "https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/bcbio/coldata.csv" -counts_fn = 'https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/bcbio/tximport-counts.csv' -se_object=url("https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/bcbio/bcbio-se.rds") -basedir <- './' +basedir <- './' # where to write down output files + +# params for bcbio +# coldata_fn = "https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/bcbio/coldata.csv" +# counts_fn = 'https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/bcbio/tximport-counts.csv' +# se_object=url("https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/bcbio/bcbio-se.rds") +# + +# params for nfcore +# Your data +# This is the file used to run nf-core or compatible to that +coldata_fn='/Path/to/metadata/meta.csv' +# This file is inside star_salmon/ folder +counts_fn='/path/to/nf-core/output/star_salmon/salmon.merged.gene_counts.tsv' +# This folder called "multiqc_report_data" is inside the output directory star_salmon inside multiqc folder +multiqc_data_dir='/path/to/nf-core/output/star_salmon/multiqc_report_data' +# This file is inside the genome folder in the output directory, use this only non-model organism +# gtf_fn='/path/to/nf-core/output/genome/hg38.filtered.gtf' +se_object = NA + diff --git a/inst/rmarkdown/templates/rnaseq/skeleton/DE/run_markdown.R b/inst/rmarkdown/templates/rnaseq/skeleton/DE/run_markdown.R index 44e430b..79e15a0 100644 --- a/inst/rmarkdown/templates/rnaseq/skeleton/DE/run_markdown.R +++ b/inst/rmarkdown/templates/rnaseq/skeleton/DE/run_markdown.R @@ -1,25 +1,32 @@ library(rmarkdown) +# set working directory to this file before using the function -render_de <- function(numerator, denominator, subset_value = NA, - params_file = 'params_de.R'){ +# set directory to this file folder +setwd(dirname(rstudioapi::getActiveDocumentContext()$path)) +# example running with test data +render_de <- function(column, numerator, denominator, subset_value = NULL, + params_file = 'params_de-testdata.R'){ + rmarkdown::render(input = "DEG.Rmd", - output_dir = "./", + output_dir = ".", output_format = "html_document", - output_file = ifelse(!is.na(subset_value), + output_file = ifelse(!is.null(subset_value), paste0('DE_', subset_value, '_', numerator, '_vs_', denominator, '.html'), paste0('DE_', numerator, '_vs_', denominator, '.html') ), clean = TRUE, envir = new.env(), params = list( + column = column, subset_value = subset_value, numerator = numerator, denominator = denominator, params_file = params_file, - project_file = '../information.R' + project_file = '../information.R', + functions_file = 'load_data.R' ) ) } - -render_de("tumor", "normal") +#Example data +render_de("sample_type","tumor", "normal") diff --git a/inst/rmarkdown/templates/rnaseq/skeleton/QC/QC.Rmd b/inst/rmarkdown/templates/rnaseq/skeleton/QC/QC.Rmd index 14a5006..48afe76 100644 --- a/inst/rmarkdown/templates/rnaseq/skeleton/QC/QC.Rmd +++ b/inst/rmarkdown/templates/rnaseq/skeleton/QC/QC.Rmd @@ -17,8 +17,8 @@ output: editor_options: chunk_output_type: console params: - params_file: ./inst/rmarkdown/templates/rnaseq/skeleton/QC/params_qc.R - project_file: ./inst/rmarkdown/templates/rnaseq/skeleton/information.R + params_file: params_qc.R + project_file: ../information.R --- diff --git a/inst/rmarkdown/templates/rnaseq/skeleton/QC/QC_nf-core.Rmd b/inst/rmarkdown/templates/rnaseq/skeleton/QC/QC_nf-core.Rmd index 89b2a84..f1b035e 100644 --- a/inst/rmarkdown/templates/rnaseq/skeleton/QC/QC_nf-core.Rmd +++ b/inst/rmarkdown/templates/rnaseq/skeleton/QC/QC_nf-core.Rmd @@ -17,27 +17,46 @@ output: editor_options: chunk_output_type: console params: - params_file: ./inst/rmarkdown/templates/rnaseq/skeleton/QC/params_qc_nf-core.R - project_file: ./inst/rmarkdown/templates/rnaseq/skeleton/information.R + params_file: params_qc_nf-core.R + # Put hg38, mm10, mm39, or other + genome: hg38 + project_file: ../information.R + factor_of_interest: sample_type --- ```{r source_params, echo = F} +# 1. set up factor_of_interest parameter from parameter above or manually +# this is used to color plots, it needs to be part of the metadata +factor_of_interest=params$factor_of_interest +genome=params$genome +# 2. Set input files in this file source(params$params_file) +# 3. If you set up this file, project information will be printed below and +#. it can be reused for other Rmd files. source(params$project_file) ``` +# Overview + +- Project: `r project` +- PI: `r PI` +- Analyst: `r analyst` +- Experiment: `r experiment` + + ```{r load_libraries, cache = FALSE, message = FALSE, warning=FALSE} library(tidyverse) library(knitr) +library(rtracklayer) library(DESeq2) library(DEGreport) library(ggrepel) -library(pheatmap) # library(RColorBrewer) library(DT) library(pheatmap) library(bcbioR) +library(janitor) ggplot2::theme_set(theme_light(base_size = 14)) opts_chunk[["set"]]( cache = FALSE, @@ -105,62 +124,126 @@ sanitize_datatable = function(df, ...) { } ``` -# Overview - -- Project: `r project` -- PI: `r PI` -- Analyst: `r analyst` -- Experiment: `r experiment` -- Aim: `r aim` - # Samples and metadata -```{r load_metadata} -meta_df=read_csv(metadata_fn) %>% mutate(sample = description) %>% - dplyr::select(-description) +```{r load_metadata} -ggplot(meta_df, aes(sample_type, fill = sample_type)) + - geom_bar() + ylab("") + xlab("") + - scale_fill_cb_friendly() +meta_df=read_csv(metadata_fn) %>% + arrange(.data[[factor_of_interest]]) %>% + distinct(sample, .keep_all = T) %>% + dplyr::select(!matches("fastq"), !matches("strandness")) +meta_df$sample <- make.names(meta_df$sample) +order <- meta_df$sample + +ggplot(meta_df, aes(.data[[factor_of_interest]], + fill = .data[[factor_of_interest]])) + + geom_bar() + ylab("") + xlab("") + ylab("# of samples") + + scale_fill_cb_friendly() + theme(axis.text.x=element_text(angle = 90, vjust = 0.5), legend.position = "none") ``` -```{r prepare metrics} +```{r load_data} +# read counts from SE object +se <- readRDS(se_object) +raw_counts <- assays(se)[["counts"]] %>% round() %>% + as.matrix() +raw_counts=raw_counts[rowSums(raw_counts)!=0,] +``` -metrics <- read_tsv(paste0(multiqc_data_dir, 'multiqc_general_stats.txt')) -metrics_qualimap <- read_tsv(paste0(multiqc_data_dir, 'mqc_qualimap_genomic_origin_1.txt')) +```{r prepare_metrics} +# Get metrics from nf-core into bcbio like table +# many metrics are already in the General Table of MultiQC, this reads the file +metrics <- read_tsv(file.path(multiqc_data_dir, 'multiqc_general_stats.txt')) +# we get some more metrics from Qualimap and rename columns +metrics_qualimap <- read_tsv(file.path(multiqc_data_dir, 'mqc_qualimap_genomic_origin_1.txt')) metrics <- metrics %>% full_join(metrics_qualimap) metrics <- metrics %>% clean_names() %>% dplyr::rename_with(~gsub('.*mqc_generalstats_', '', .)) +# This uses the fastqc metrics to get total reads total_reads <- metrics %>% - filter(grepl('_', sample)) %>% + dplyr::filter(!is.na(fastqc_raw_total_sequences)) %>% remove_empty(which = 'cols') %>% dplyr::rename(single_sample = sample) %>% mutate(sample = gsub('_[12]+$', '', single_sample)) %>% group_by(sample) %>% summarize(total_reads = sum(fastqc_raw_total_sequences)) +# This renames to user-friendly names the metrics columns metrics <- metrics %>% - filter(!grepl('_', sample)) %>% + dplyr::filter(is.na(fastqc_raw_total_sequences)) %>% remove_empty(which = 'cols') %>% full_join(total_reads) %>% mutate(mapped_reads = samtools_reads_mapped) %>% mutate(exonic_rate = exonic/(star_uniquely_mapped * 2)) %>% mutate(intronic_rate = intronic/(star_uniquely_mapped * 2)) %>% mutate(intergenic_rate = intergenic/(star_uniquely_mapped * 2)) %>% - mutate(r_rna_rate = custom_content_biotype_counts_percent_r_rna) %>% mutate(x5_3_bias = qualimap_5_3_bias) +# Sometimes we don't have rRNA due to mismatch annotation, We skip this if is the case +gtf <- NULL +if (genome =="other"){ + gtf <- gtf_fn +}else{ + if (genome == "hg38") { + gtf <- "hg38.rna.gtf.gz" + } else if (genome == "mm10") { + gtf <- "mm10.rna.gtf.gz" + } else if (genome == "mm39") { + gtf <- "mm39.rna.gtf.gz" + } + gtf <- system.file("extdata", "annotation", + gtf, + package="bcbioR") +} +if (is.null(gtf)) { + print("No genome provided! Please add it at the top of this Rmd") +} + +gtf=rtracklayer::import(gtf) + +one=grep("gene_type", colnames(as.data.frame(gtf)), value = TRUE) +another=grep("gene_biotype", colnames(as.data.frame(gtf)), value = TRUE) +biotype=NULL +if(length(one)==1){ + biotype=one +}else if(length(another)==1){ + biotype=another +}else{ + warning("No gene biotype founded") +} + +if (!is.null(biotype)){ + annotation=as.data.frame(gtf) %>% .[,c("gene_id", biotype)] + rRNA=grepl("rRNA|tRNA",annotation[[biotype]]) + genes=intersect(annotation[rRNA,"gene_id"],row.names(raw_counts)) + ratio=data.frame(sample=colnames(raw_counts), + r_and_t_rna_rate=colSums(raw_counts[genes,])/colSums(raw_counts)) + metrics = left_join(metrics, ratio, by="sample") +}else{ + metrics[["r_and_t_rna_rate"]] <- NA +} + +# if ("custom_content_biotype_counts_percent_r_rna" %in% colnames(metrics)){ +# metrics <- mutate(metrics, r_rna_rate = custom_content_biotype_counts_percent_r_rna) +# }else{ +# metrics[["r_rna_rate"]] <- NA +# } +metrics=metrics[,c("sample","mapped_reads","exonic_rate","intronic_rate", + "total_reads", + "x5_3_bias", "r_and_t_rna_rate","intergenic_rate")] +metrics$sample <- make.names(metrics$sample) metrics <- metrics %>% - full_join(meta_df , by = c("sample" = "sample")) + full_join(meta_df , by = c("sample" = "sample")) %>% + dplyr::select(where(~!all(is.na(.)))) + ``` -```{r show-metadata} +```{r show_metadata} meta_sm <- meta_df %>% as.data.frame() %>% column_to_rownames("sample") @@ -173,19 +256,29 @@ meta_sm %>% sanitize_datatable() ## Total reads -Here, we want to see consistency and a minimum of 20 million reads. +Here, we want to see consistency and a minimum of 20 million reads (the grey line). ```{r plot_total_reads} metrics %>% - ggplot(aes(x = sample_type, + ggplot(aes(x = factor(sample, level = order), y = total_reads, - color = sample_type)) + - geom_point(alpha=0.5) + + fill = .data[[factor_of_interest]])) + + geom_bar(stat = "identity") + coord_flip() + scale_y_continuous(name = "million reads") + - scale_color_cb_friendly() + - ggtitle("Total reads") + scale_fill_cb_friendly() + xlab("") + + ggtitle("Total reads") + + geom_hline(yintercept=20000000, color = "grey", size=2) +metrics %>% + ggplot(aes(x = .data[[factor_of_interest]], + y = total_reads, + color = .data[[factor_of_interest]])) + + geom_point(alpha = 0.5, size=4) + + coord_flip() + + scale_y_continuous(name = "million reads") + + scale_color_cb_friendly() + xlab("") + + ggtitle("Total reads") ``` ```{r calc_min_max_pct_mapped} @@ -196,29 +289,28 @@ max_pct_mapped <- round(max(metrics$mapped_reads/metrics$total_reads)*100,1) ## Mapping rate -The genomic mapping rate represents the percentage of reads mapping to the reference genome. We want to see consistent mapping rates between samples and over 70% mapping. These samples have mapping rates (`r min_pct_mapped` - `r max_pct_mapped`%). +The genomic mapping rate represents the percentage of reads mapping to the reference genome. We want to see consistent mapping rates between samples and over 70% mapping (the grey line). These samples have mapping rates: `r min_pct_mapped` - `r max_pct_mapped`%. ```{r plot_mapping_rate} metrics$mapped_reads_pct <- round(metrics$mapped_reads/metrics$total_reads*100,1) metrics %>% - ggplot(aes(x = sample_type, + ggplot(aes(x = factor(sample, level = order), y = mapped_reads_pct, - color = sample_type)) + - geom_point() + + color = .data[[factor_of_interest]])) + + geom_point(alpha = 0.5, size=4) + coord_flip() + scale_color_cb_friendly() + ylim(0, 100) + - ggtitle("Mapping rate") + - geom_hline(yintercept=70, color = cb_friendly_cols('blue')) + ggtitle("Mapping rate") + xlab("") + + geom_hline(yintercept=70, color = "grey", size=2) ``` ## Number of genes detected -The number of genes represented in every sample is expected to be consistent and over 20K (blue line). +The number of genes represented in every sample is expected to be consistent and over 20K (grey line). -```{r plot_genes_detected} -se <- readRDS(se_object) +```{r calc_genes_detected} genes_detected <- colSums(assays(se)[["counts"]] > 0) %>% enframe() sample_names <- metrics[,c("sample"), drop=F] genes_detected <- left_join(genes_detected, sample_names, by = c("name" = "sample")) @@ -228,18 +320,35 @@ genes_detected <- summarise(genes_detected, metrics <- metrics %>% left_join(genes_detected, by = c("sample" = "name")) -ggplot(metrics,aes(x = sample_type, - y = n_genes, color = sample_type)) + - geom_point() + + +``` + + +```{r plot_genes_detected} +ggplot(metrics,aes(x = factor(sample, level = order), + y = n_genes, fill = .data[[factor_of_interest]])) + + geom_bar(stat = "identity") + coord_flip() + - scale_color_cb_friendly() + + scale_fill_cb_friendly() + ggtitle("Number of genes") + ylab("Number of genes") + xlab("") + - geom_hline(yintercept=20000, color = cb_friendly_cols('blue')) + geom_hline(yintercept=20000, color = "grey", size=2) + +metrics %>% + ggplot(aes(x = .data[[factor_of_interest]], + y = n_genes, + color = .data[[factor_of_interest]])) + + geom_point(alpha = 0.5, size=4) + + coord_flip() + + scale_y_continuous(name = "million reads") + + scale_color_cb_friendly() + xlab("") + + ggtitle("Number of Genes") + ``` + ## Gene detection saturation This plot shows how complex the samples are. We expect samples with more reads to detect more genes. @@ -248,8 +357,8 @@ This plot shows how complex the samples are. We expect samples with more reads t metrics %>% ggplot(aes(x = total_reads, y = n_genes, - color = sample_type)) + - geom_point()+ + color = .data[[factor_of_interest]])) + + geom_point(alpha = 0.5, size=4) + scale_x_log10() + scale_color_cb_friendly() + ggtitle("Gene saturation") + @@ -258,78 +367,79 @@ metrics %>% ## Exonic mapping rate -Here we are looking for consistency, and exonic mapping rates around 70% or 75% (blue and red lines, respectively). +Here we are looking for consistency, and exonic mapping rates around or above 70% (grey line). ```{r plot_exonic_mapping_rate} metrics %>% - ggplot(aes(x = sample_type, + ggplot(aes(x = factor(sample, level = order), y = exonic_rate * 100, - color = sample_type)) + - geom_point() + + color = .data[[factor_of_interest]])) + + geom_point(alpha = 0.5, size=4) + ylab("Exonic rate %") + ggtitle("Exonic mapping rate") + scale_color_cb_friendly() + coord_flip() + xlab("") + ylim(c(0,100)) + - geom_hline(yintercept=70, color = cb_friendly_cols('blue')) + - geom_hline(yintercept=75, color = cb_friendly_cols('brown')) + geom_hline(yintercept=70, color = "grey", size=2) ``` ## Intronic mapping rate -Here, we expect a low intronic mapping rate (≤ 15% - 20%) +Here, we expect a low intronic mapping rate (≤ 15% - 20%). The grey line indicates 20%. ```{r plot_intronic_mapping_rate} metrics %>% - ggplot(aes(x = sample_type, + ggplot(aes(x = factor(sample, level = order), y = intronic_rate * 100, - color = sample_type)) + - geom_point() + + color = .data[[factor_of_interest]])) + + geom_point(alpha = 0.5, size=4) + ylab("Intronic rate %") + ggtitle("Intronic mapping rate") + scale_color_cb_friendly() + coord_flip() + xlab("") + ylim(c(0,100)) + - geom_hline(yintercept=20, color = cb_friendly_cols('blue')) + - geom_hline(yintercept=15, color = cb_friendly_cols('brown')) + geom_hline(yintercept=20, color = "grey", size=2) ``` ## Intergenic mapping rate -Here, we expect a low intergenic mapping rate, which is true for all samples. +Here, we expect a low intergenic mapping rate, which is true for all samples. The grey line indicates 15% ```{r plot_intergenic_mapping_rate} metrics %>% - ggplot(aes(x = sample_type, + ggplot(aes(x = factor(sample, level = order), y = intergenic_rate * 100, - color = sample_type)) + - geom_point() + + color = .data[[factor_of_interest]])) + + geom_point(alpha = 0.5, size=4) + ylab("Intergenic rate %") + ggtitle("Intergenic mapping rate") + - coord_flip() + + coord_flip() + xlab("") + scale_color_cb_friendly() + - ylim(c(0, 100)) + ylim(c(0, 100)) + + geom_hline(yintercept=15, color = "grey", size=2) ``` -## rRNA mapping rate +## tRNA/rRNA mapping rate -Samples should have a ribosomal RNA (rRNA) "contamination" rate below 10% +Samples should have a ribosomal RNA (rRNA) "contamination" rate below 10% (the grey line). ```{r plot_rrna_mapping_rate} -# for some bad samples it could be > 50% -rrna_ylim <- max(round(metrics$r_rna_rate*100, 2)) + 10 + +rrna_ylim <- max(round(metrics$r_and_t_rna_rate*100, 2)) + 10 metrics %>% - ggplot(aes(x = sample_type, - y = r_rna_rate * 100, - color = sample_type)) + - geom_point() + - ylab("rRNA rate, %")+ - ylim(0, rrna_ylim) + - ggtitle("rRNA mapping rate") + - coord_flip() + - scale_color_cb_friendly() + ggplot(aes(x = factor(sample, level = order), + y = r_and_t_rna_rate * 100, + color = .data[[factor_of_interest]])) + + geom_point(alpha = 0.5, size=4) + + ylab("tRNA/rRNA rate, %")+ + ylim(0, rrna_ylim) + + ggtitle("tRNA/rRNA mapping rate") + + coord_flip() + + scale_color_cb_friendly() + + ylim(c(0, 100)) + xlab("") + + geom_hline(yintercept=10, color = "grey", size=2) ``` ## 5'->3' bias @@ -338,15 +448,15 @@ There should be little bias, i.e. the values should be close to 1, or at least c ```{r plot_53_bias} metrics %>% - ggplot(aes(x = sample_type, + ggplot(aes(x = factor(sample, level = order), y = x5_3_bias, - color = sample_type)) + - geom_point() + + color = .data[[factor_of_interest]])) + + geom_point(alpha = 0.5, size=4) + ggtitle("5'-3' bias") + coord_flip() + - ylim(c(0.5,1.5)) + + ylim(c(0.5,1.5)) + xlab("") + ylab("5'-3' bias") + scale_color_cb_friendly()+ - geom_hline(yintercept=1, color = cb_friendly_cols('blue')) + geom_hline(yintercept=1, color = "grey", size=2) ``` ## Counts per gene - all genes @@ -354,7 +464,7 @@ metrics %>% We expect consistency in the box plots here between the samples, i.e. the distribution of counts across the genes is similar ```{r plot_counts_per_gene} -metrics_small <- metrics %>% dplyr::select(sample, sample_type) +metrics_small <- metrics %>% dplyr::select(sample, .data[[factor_of_interest]]) metrics_small <- left_join(sample_names, metrics_small) counts <- @@ -363,13 +473,14 @@ counts <- filter(rowSums(.)!=0) %>% gather(name, counts) -counts <- left_join(counts, metrics, by = c("name" = "sample")) +counts <- left_join(counts, metrics_small, by = c("name" = "sample")) -ggplot(counts, aes(sample_type, +ggplot(counts, aes(factor(name, level = order), log2(counts+1), - fill = sample_type)) + + fill = .data[[factor_of_interest]])) + geom_boxplot() + scale_fill_cb_friendly() + + coord_flip() + xlab("") + ggtitle("Counts per gene, all non-zero genes") + scale_color_cb_friendly() ``` @@ -379,55 +490,72 @@ ggplot(counts, aes(sample_type, In this section, we look at how well the different groups in the dataset cluster with each other. Samples from the same group should ideally be clustering together. We use Principal Component Analysis (PCA). -## Principal component analysis (PCA) {.tabset} +## Principal component analysis (PCA) Principal Component Analysis (PCA) is a statistical technique used to simplify high-dimensional data by identifying patterns and reducing the number of variables. In the context of gene expression, PCA helps analyze large datasets containing information about the expression levels of thousands of genes across different samples (e.g., tissues, cells). - + ```{r PCA1:5 summary, all, unlabeled, fig.width= 7, fig.height = 5} -raw_counts <- assays(se)[["counts"]] %>% round() %>% - as_tibble() %>% - filter(rowSums(.)!=0) %>% - as.matrix() vst <- vst(raw_counts) -#fix samples names coldat_for_pca <- as.data.frame(metrics) rownames(coldat_for_pca) <- coldat_for_pca$sample coldat_for_pca <- coldat_for_pca[colnames(raw_counts),] pca1 <- degPCA(vst, coldat_for_pca, - condition = "sample_type", data = T)[["plot"]] + condition = factor_of_interest, data = T)[["plot"]] pca2 <- degPCA(vst, coldat_for_pca, - condition = "sample_type", data = T, pc1="PC3", pc2="PC4")[["plot"]] + condition = factor_of_interest, data = T, pc1="PC3", pc2="PC4")[["plot"]] + + pca1 + scale_color_cb_friendly() pca2 + scale_color_cb_friendly() + ``` +# Covariates analysis + +When there are multiple factors that can influence the results of a given experiment, it is useful to assess which of them is responsible for the most variance as determined by PCA. This method adapts the method described by Daily et al. for which they integrated a method to correlate covariates with principal components values to determine the importance of each factor. + +```{r covariate-plot,fig.height=12, fig.width=10} +## Remove non-useful columns output by nf-core +coldat_2 <- data.frame(coldat_for_pca[,!(colnames(coldat_for_pca) %in% c("fastq_1", "fastq_2", "salmon_library_types", "salmon_compatible_fragment_ratio", "samtools_reads_mapped_percent", "samtools_reads_properly_paired_percent", "samtools_mapped_passed_pct", "strandedness", "qualimap_5_3_bias"))]) -```{r, eval=FALSE} -variables=degCovariates(vst, coldat_for_pca) +# Remove missing data +coldat_2 <- na.omit(coldat_2) +degCovariates(vst, metadata = coldat_2) ``` +## Hierarchical clustering + +Inter-correlation analysis (ICA) is another way to look at how well samples +cluster by plotting the correlation between the expression profiles of the +samples. ```{r clustering fig, fig.width = 10, fig.asp = .62} -## Hierarchical clustering vst_cor <- cor(vst) -annotation_cols <- cb_friendly_pal('grey')(length(unique(coldat_for_pca$sample_type))) -names(annotation_cols) <- unique(coldat_for_pca$sample_type) +colma=meta_df %>% as.data.frame() +rownames(colma) <- colma$sample +colma <- colma[rownames(vst_cor), ] +colma <- colma %>% dplyr::select(.data[[factor_of_interest]]) +anno_colors=lapply(colnames(colma), function(c){ + l.col=cb_friendly_pal('grey')(length(unique(colma[[c]]))) + names(l.col)=unique(colma[[c]]) + l.col +}) +names(anno_colors)=colnames(colma) p <- pheatmap(vst_cor, - annotation = coldat_for_pca %>% select(sample_type) %>% mutate(sample_type = as.factor(sample_type)), - show_rownames = T, - show_colnames = T, - color = cb_friendly_pal('heatmap')(15), - annotation_colors = list(sample_type = annotation_cols) -) + annotation = colma, + annotation_colors = anno_colors, + show_rownames = T, + show_colnames = T, + color = cb_friendly_pal('heatmap')(15) + ) p - ``` # R session diff --git a/inst/rmarkdown/templates/rnaseq/skeleton/QC/params_qc_nf-core-example.R b/inst/rmarkdown/templates/rnaseq/skeleton/QC/params_qc_nf-core-example.R new file mode 100644 index 0000000..dae62ce --- /dev/null +++ b/inst/rmarkdown/templates/rnaseq/skeleton/QC/params_qc_nf-core-example.R @@ -0,0 +1,9 @@ +# info params + +# Example data: COMMENT THESE LINE IF YOU ARE USING YOUR DATA +metadata_fn='https://raw.githubusercontent.com/bcbio/bcbioR-test-data/devel/rnaseq/nf-core/coldata.csv' +se_object=url('https://raw.githubusercontent.com/bcbio/bcbioR-test-data/devel/rnaseq/nf-core/star_salmon/salmon.merged.gene_counts.rds') +# This folder is in the output directory inside multiqc folder +multiqc_data_dir='https://raw.githubusercontent.com/bcbio/bcbioR-test-data/devel/rnaseq/nf-core/multiqc/star_salmon/multiqc-report-data/' +# This file is inside the genome folder in the output directory +gtf_fn='https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/devel/nf-core/genome/genome.filtered.gtf.gz' diff --git a/inst/rmarkdown/templates/rnaseq/skeleton/QC/params_qc_nf-core.R b/inst/rmarkdown/templates/rnaseq/skeleton/QC/params_qc_nf-core.R index e9eb536..fd75d18 100644 --- a/inst/rmarkdown/templates/rnaseq/skeleton/QC/params_qc_nf-core.R +++ b/inst/rmarkdown/templates/rnaseq/skeleton/QC/params_qc_nf-core.R @@ -1,5 +1,11 @@ # info params -metadata_fn='https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/nf-core/coldata.csv' -se_object=url('https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/nf-core/salmon.merged.gene_counts.rds') -multiqc_data_dir='https://raw.githubusercontent.com/bcbio/bcbioR-test-data/main/rnaseq/nf-core/multiqc-report-data/' +# Your data +# This is the file used to run nf-core or compatible to that +metadata_fn='/Path/to/metadata/meta.csv' +# This file is inside star_salmon/ folder +se_object='/path/to/nf-core/output/star_salmon/salmon.merged.gene_counts.rds' +# This folder called "multiqc_report_data" is inside the output directory star_salmon inside multiqc folder +multiqc_data_dir='/path/to/nf-core/output/star_salmon/multiqc_report_data' +# This file is inside the genome folder in the output directory, use this only for non-model organism +# gtf_fn='/path/to/nf-core/output/genome/hg38.filtered.gtf' diff --git a/inst/rmarkdown/templates/rnaseq/skeleton/QC/run_markdown.R b/inst/rmarkdown/templates/rnaseq/skeleton/QC/run_markdown.R index a2a2ec5..51acbef 100644 --- a/inst/rmarkdown/templates/rnaseq/skeleton/QC/run_markdown.R +++ b/inst/rmarkdown/templates/rnaseq/skeleton/QC/run_markdown.R @@ -1,10 +1,13 @@ library(rmarkdown) -rmarkdown::render("./inst/rmarkdown/templates/rnaseq/skeleton/QC/QC_nf-core.Rmd", - output_dir = "./inst/rmarkdown/templates/rnaseq/skeleton/QC/", +# set directory to this file folder +setwd(dirname(rstudioapi::getActiveDocumentContext()$path)) +# example running with test data +rmarkdown::render("QC_nf-core.Rmd", + output_dir = ".", clean = TRUE, output_format = "html_document", params = list( - params_file = 'params_qc_nf-core.R', + params_file = 'params_qc_nf-core-testdata.R', project_file = '../information.R') ) diff --git a/inst/rmarkdown/templates/rnaseq/skeleton/README.md b/inst/rmarkdown/templates/rnaseq/skeleton/README.md index 9df2e43..b04a5d5 100644 --- a/inst/rmarkdown/templates/rnaseq/skeleton/README.md +++ b/inst/rmarkdown/templates/rnaseq/skeleton/README.md @@ -1,6 +1,54 @@ # Guideline for RNAseq downstream analysis -# DropBox +Make sure there is a project name for this. + +## Run data with nf-core rnaseq + +- Make sure you have access to our [Seqera WorkSpace](https://cloud.seqera.io/orgs/HBC/workspaces/core_production/launchpad) +- Transfer data to HCBC S3: Ask Alex/Lorena. Files will be at our S3 bucket `input/rawdata` folder +- Prepare the CSV file according this [instructions](https://nf-co.re/rnaseq/3.14.0/docs/usage#multiple-runs-of-the-same-sample). File should look like this: + +```csv +sample,fastq_1,fastq_2,strandedness +CONTROL_REP1,s3path/AEG588A1_S1_L002_R1_001.fastq.gz,s3path/AEG588A1_S1_L002_R2_001.fastq.gz,auto +CONTROL_REP1,s3path/AEG588A1_S1_L003_R1_001.fastq.gz,s3path/AEG588A1_S1_L003_R2_001.fastq.gz,auto +CONTROL_REP1,s3path/AEG588A1_S1_L004_R1_001.fastq.gz,s3path/AEG588A1_S1_L004_R2_001.fastq.gz,auto +``` + +Use `bcbio_nfcore_check(csv_file)` to check the file is correct. + +You can add more columns to this file with more metadata, and use this file as the `coldata` file in the templates. + +- Upload file to our `Datasets` in Seqera using the name of the project but starting with `rnaseq-pi_lastname-hbc_code` +- Go to `Launchpad`, select `nf-core_rnaseq` pipeline, and select the previous created `Datasets` in the `input` parameter after clicking in `Browser` + - Select an output directory with the same name used for the `Dataset` inside the `results` folder in S3 +- When pipeline is down, data will be copied to our on-premise HPC in the scratch system under `scratch/groups/hsph/hbc/bcbio/` folder + +## Downstream analysis + +Please, modify `information.R` with the right information. You can use this file with any other Rmd to include the project/analysis information. + +### QC + +`QC/QC.Rmd` is a template for QC metrics. Use `params_qc.R` for `bcbio` + or `QC/QC_nf-core.Rmd` `params_qc_nf-core.R` for `nf-core/rnaseq` outputs. + +Read instruction in the R and Rmd scripts to render it. + +### DE + +`DE/DEG.Rmd` is a template for two groups comparison. `params_de.R` has the information of the input files to load. You can point to `bcbio` or `nf-core/rnaseq` output files. + +On the `YAML` header file of the `Rmd` you can specify some parameters or just set them up in the first chunk of code of the template. This template has examples of: + +- sub-setting data +- two groups comparison +- volcano plot +- MA plot +- Pathway analysis +- Tables + +## DropBox - In `reports/QC` - [ ] copy `bcbio-se.rds` and `tximport-counts.csv` @@ -16,7 +64,7 @@ - [ ] Significant genes results file as described above, but additionally append columns containing normalized count values for each sample. - Make sure to append the gene symbols to these tables so the researcher can interpret the results. -# GitHub +## GitHub - [ ] Push all `*Rmd` `*R` files used for the *QC* and *DE* analysis respecting folder structure. diff --git a/inst/rmarkdown/templates/teaseq/skeleton/QC/QC-01-load_data.R b/inst/rmarkdown/templates/teaseq/skeleton/QC/QC-01-load_data.R new file mode 100644 index 0000000..ba68ffb --- /dev/null +++ b/inst/rmarkdown/templates/teaseq/skeleton/QC/QC-01-load_data.R @@ -0,0 +1,58 @@ +library(Seurat) +library(tidyverse) +library(Matrix) +library(data.table) +library(magrittr) +library(Signac) +library(EnsDb.Hsapiens.v86) +library(qs) +library(bcbioR) +# prepare seurat object with all 3 modes + +# read cellranger-arc outs +#arc_h5 <- '../aspera_download_sep23/SN0288408/KW11249_atsibris/230628_10X_KW11252_bcl/cellranger-arc-2.0.0/GRCh38/BRI-2286/outs/filtered_feature_bc_matrix.h5' +arc_h5 <- '../final/batch3_atac/cellranger-arc_output/outs/filtered_feature_bc_matrix.h5' + +arc_data <- Read10X_h5(arc_h5) + +# read cellranger outs +#h5path <- '../aspera_download_sep23/SN0288408/KW11249_atsibris/230711_10X_KW11249_bcl/cellranger-7.1.0/GRCh38/BRI-2285_hashing/outs/filtered_feature_bc_matrix.h5' +h5path <- '../final/batch3_adt/AtheTeaSeqMulti/outs/multi/count/raw_feature_bc_matrix.h5' +counts <- Read10X_h5(h5path) + +# select cell barcodes detected by both ATAC and ADT +joint.bcs <- intersect(colnames(arc_data$Peaks), colnames(counts$`Antibody Capture`)) + +so <- CreateSeuratObject(counts = arc_data$`Gene Expression`[, joint.bcs]) + +# add cellranger ATAC +# fragpath <- '../aspera_download_sep23/SN0288408/KW11249_atsibris/230628_10X_KW11252_bcl/cellranger-arc-2.0.0/GRCh38/BRI-2286/outs/atac_fragments.tsv.gz' +fragpath <- '../final/batch3_atac/cellranger-arc_output/outs/atac_fragments.tsv.gz' +annotation <- GetGRangesFromEnsDb(ensdb = EnsDb.Hsapiens.v86) +seqlevels(annotation) <- paste0('chr', seqlevels(annotation)) + +so[["CellRangerPeaks"]] <- CreateChromatinAssay( + counts = arc_data$Peaks[, joint.bcs], + sep = c(":", "-"), + fragments = fragpath, + annotation = annotation +) + +# add HTO +so[["HTO"]] <- CreateAssayObject(counts = counts$`Multiplexing Capture`[c('Hashtag1', 'Hashtag2','Hashtag3', 'Hashtag4', 'Hashtag5','Hashtag6','Hashtag7'), joint.bcs]) +# Normalize HTO data, here we use centered log-ratio (CLR) transformation +so <- NormalizeData(so, assay = "HTO", normalization.method = "CLR") +so <- HTODemux(so, assay = "HTO", positive.quantile = 0.99) + +# table(so@meta.data$orig.ident, so@meta.data$hash.ID) %>% as.matrix() %>% kable() %>% kable_styling() + +# add ADT data +so[["ADT"]] <- CreateAssayObject(counts = counts$`Antibody Capture`[, joint.bcs]) +so <- NormalizeData(so, assay = "ADT", normalization.method = "CLR", margin = 2) + +qsave(so, "data/tea_seurat.qs") + +hashtag_counts <- table(so@meta.data$hash.ID) %>% as.data.frame() %>% + set_colnames(c('Hashtag', 'Number of cells')) %>% dplyr::filter(grepl("Hash", Hashtag)) +write.csv(hashtag_counts, 'data/hashtag_counts.csv', row.names = FALSE) + diff --git a/inst/rmarkdown/templates/teaseq/skeleton/QC/QC-02-run_analysis.R b/inst/rmarkdown/templates/teaseq/skeleton/QC/QC-02-run_analysis.R new file mode 100644 index 0000000..6c2e493 --- /dev/null +++ b/inst/rmarkdown/templates/teaseq/skeleton/QC/QC-02-run_analysis.R @@ -0,0 +1,109 @@ +library(Seurat) +library(tidyverse) +library(Matrix) +library(data.table) +library(magrittr) +library(Signac) +library(EnsDb.Hsapiens.v86) +library(qs) +library(bcbioR) +# Run analysis on multi-omic sc Data +seurat <- qread("data/tea_seurat.qs") + +# keep singlet only +seurat <- subset(x = seurat, subset = HTO_classification.global == "Singlet") +# table(seurat@meta.data$hash.ID) + +# add sampleinfo to metadata +seurat@meta.data <- seurat@meta.data %>% + rownames_to_column("cellID") %>% + left_join(sampleinfo, by = "hash.ID") %>% + column_to_rownames("cellID") + +#Compute percent mito ratio +seurat$percent_mito <- PercentageFeatureSet(seurat, pattern = "^MT") +# Compute total reads +seurat$total_reads <- Matrix::colSums(GetAssayData(seurat, slot = "counts")) +# Compute novelty score +seurat$novelty <- log10(seurat@meta.data$nFeature_RNA)/log10(seurat@meta.data$nCount_RNA) + +seurat <- SCTransform(seurat, conserve.memory=TRUE) + +# cell cycle +cc_genes <- cc.genes.updated.2019 + +# Extract phase specific genes +s.genes <- cc_genes$s.genes +g2m.genes <- cc_genes$g2m.genes + +# Perform cell cycle scoring +seurat <- CellCycleScoring(object = seurat, + s.features = s.genes, + g2m.features = g2m.genes, + set.ident = TRUE) + +# clustering +seurat <- RunPCA(object = seurat, verbose = T) +pct <- seurat@reductions$pca@stdev/sum(seurat@reductions$pca@stdev) * 100 +co1 <- which(cumsum(pct) > 90 & pct < 5)[1] +co2 <- sort(which((pct[1:length(pct) - 1] - pct[2:length(pct)]) > 0.1), decreasing = T)[1] + 1 +# co <- min(co1, co2) +co <- 30 +seurat <- RunPCA(object = seurat, npcs = co, verbose = TRUE) +seurat <- FindNeighbors(object = seurat, reduction = "pca", dims = 1:co) +seurat <- RunUMAP(object = seurat, dims=1:co ) + +# clustering by ADT data # Error: didn't do CLR norm because copied the code from 1st dataset +seurat <- ScaleData(seurat, assay = "ADT") +seurat <- RunPCA(seurat, features = rownames(seurat@assays$ADT@counts), verbose = TRUE, assay= "ADT", reduction.name = "pcaADT", reduction.key = "pcaADT_") +seurat <- RunUMAP(seurat, dims = 1:20, reduction = "pcaADT", assay = "ADT", reduction.name = "umapADT", reduction.key = "umapADT_") + +# calling peaks using MACS2, following James's code +DefaultAssay(seurat)<-"CellRangerPeaks" +peaks <- CallPeaks(seurat) # took 16 min +peaks <- keepStandardChromosomes(peaks, pruning.mode = "coarse") +# peaks <- subsetByOverlaps(x = peaks, ranges = blacklist_hg38_unified, invert = TRUE) + + +# quantify counts in each peak +macs2_counts <- FeatureMatrix( + fragments = Fragments(seurat), + features = peaks, + cells = colnames(seurat) +) + +#qsave(seurat,"data/tmp.qs") +# ATAC data normalization, dimension reduction, and clustering +DefaultAssay(seurat)<-"MACS2peaks" +seurat <- RunTFIDF(seurat, assay = "MACS2peaks") +seurat <- FindTopFeatures(seurat, min.cutoff = 'q0', assay = "MACS2peaks") +seurat <- RunSVD(seurat, assay = "MACS2peaks") +seurat <- RunUMAP(object = seurat, dims = 2:30, reduction = 'lsi', assay = "MACS2peaks", reduction.name = "umapATAC", reduction.key = "umapATAC_" ) +seurat <- FindNeighbors(object = seurat, reduction = 'lsi', dims = 2:30) +seurat <- FindClusters(object = seurat, verbose = FALSE, algorithm = 3) + +seurat$blacklist_fraction <- FractionCountsInRegion( + object = seurat, + assay = 'MACS2peaks', + regions = blacklist_hg38_unified +) + +## add ATAC per barcode metrics +pbm <- read.csv( + file = "cellranger-arc_output/outs/per_barcode_metrics.csv", + header = TRUE, + row.names = 1 +) + +seurat@meta.data <- seurat@meta.data %>% + rownames_to_column("cellID") %>% + left_join(pbm %>% rownames_to_column('cellID'), by = "cellID") %>% + column_to_rownames("cellID") + +seurat$pct_reads_in_peaks <- seurat$atac_peak_region_fragments / seurat$atac_fragments * 100 + +# compute nucleosome signal score per cell +seurat <- NucleosomeSignal(object = seurat) # added "nucleosome_signal", "nucleosome_percentile" to metadata +# compute TSS enrichment score per cell +seurat <- TSSEnrichment(object = seurat, fast = FALSE) # added "TSS.enrichment", "TSS.percentile" to metadata +qsave(seurat, 'data/tea_seurat_unfiltered_clustered.qs') diff --git a/inst/rmarkdown/templates/teaseq/skeleton/QC/QC.Rmd b/inst/rmarkdown/templates/teaseq/skeleton/QC/QC.Rmd new file mode 100644 index 0000000..9f1a302 --- /dev/null +++ b/inst/rmarkdown/templates/teaseq/skeleton/QC/QC.Rmd @@ -0,0 +1,725 @@ +--- +title: "Quality Control TEAseq" +title: "Differential Expression" +author: "Harvard Chan Bioinformatics Core" +date: "`r Sys.Date()`" +output: + html_document: + code_folding: hide + df_print: paged + highlights: pygments + number_sections: true + self_contained: true + theme: default + toc: true + toc_float: + collapsed: true + smooth_scroll: true +editor_options: + chunk_output_type: console +params: + maxGenes: !r Inf + minGenes: 750 + maxUMIs: !r Inf + minUMIs: 1000 + maxMitoPer: 20 + min_novelty: 0.85 + min_cells_per_gene: 10 + min_reads_per_cell: 1000 + max_nCount_ADT: !r Inf + min_nCount_ADT: 400 + max_nFeature_ADT: 135 + min_nFeature_ADT: 75 + max_PRF: !r Inf + min_PRF: 500 + max_FRiP: !r Inf + min_FRiP: 25 + min_TSS: 2 + max_NS: 2 + max_blacklistratio: 0.02 + data_dir: !r file.path("data") + results_dir: !r file.path("results") +output: + html_document: + code_folding: hide + df_print: paged + highlight: tango + number_sections: false + self_contained: true + theme: paper + toc: true + toc_float: + collapsed: true + smooth_scroll: false +--- + +```{r render} +# Set seed for reproducibility +set.seed(1454944673L) +library(knitr) +opts_chunk[["set"]]( + audodep = TRUE, + cache = FALSE, + cache.lazy = FALSE, + error = TRUE, + fig.height = 6L, + fig.width = 6L, + message = FALSE, + tidy = TRUE, + warning = FALSE +) +``` + +```{r setup, cache=FALSE, message=FALSE} +library(Seurat) +library(tidyverse) +library(ggplot2) +library(ggrepel) +library(cowplot) +library(Matrix) +library(data.table) +library(magrittr) +library(kableExtra) +library(Signac) +library(EnsDb.Hsapiens.v86) +library(qs) +library(bcbioR) +invisible(mapply( + FUN = dir.create, + path = c(params$data_dir, params$results_dir), + MoreArgs = list(showWarnings = FALSE, recursive = TRUE) +)) + +ggplot2::theme_set(theme_light(base_size = 14)) +``` + + +```{r functions, cache=FALSE, message=FALSE} +# Load functions +`%!in%` = Negate(`%in%`) +``` + +# Overview + +```{r} +# Run QC-01-load_data.R and then +# Run QC-02-run_analysis.R +stopifnot(file.exists('data/tea_seurat_unfiltered_clustered.qs')) +seurat <- qread('data/tea_seurat_unfiltered_clustered.qs') + +stopifnot(file.exists('data/hashtag_counts.csv')) +hashtag_counts <- read.csv('data/hashtag_counts.csv') +``` + +- Principal Investigator: +- Experiment: TEA-Seq of .... + +The downloaded data are cellranger-arc (for gene expression and ATAC-Seq) and cellranger (for gene expression and ADT) outputs. Alignment was done using reference genome GRCh38. We use the shared cell barcodes from both results to create a Seurat object that has all 3 modalities. + +First of all, we demultiplex the data so we know which cell belongs to which sample. Below shows the number of cells for each hashtag after demultiplexing. Doublets and Negatives are removed. Overall we have good number of cells per sample. + +```{r} +hashtag_counts <- hashtag_counts %>% plyr::rename(c('Hashtag'='hash.ID')) +# hashtag_counts %>% kable() %>% kable_styling() +``` + + +```{r} +# sample info +sampleinfo = c("Hashtag1: no drug (DMSO)=Control +Hashtag2: Romidepsin=RMD +Hashtag3: Bryostatin=BRY +Hashtag4: AZD=AZD +Hashtag5: RB=RMD+BRY +Hashtag6: RA=RMD+AZD +Hashtag7: PMAi=PMAi") + +sampleinfo <- as.data.frame(matrix(unlist(str_split(sampleinfo[1], '\n|: |=')), + ncol = 3, byrow = T)) %>% + setNames(c('hash.ID', 'Description', 'Sample')) + +sampleinfo %>% full_join(hashtag_counts, by = 'hash.ID') %>% kable() %>% kable_styling() +``` + +Then we normalize and cluster the data for each modality separately for QC purpose. + +# RNA {.tabset} + +Let's first check the gene expression data. + +## Number of cells per sample + + +```{r} +# Visualize the number of counts per sample +seurat@meta.data %>% + ggplot() + + geom_bar(aes(x=sample)) + + theme(axis.text.x = element_text(angle = 90, vjust = 1, hjust=1), + plot.title = element_text(hjust=0.5)) + + ggtitle("Number of Cells per Sample") +``` + +## Reads per sample + + +```{r warning=F, message=F} +seurat@meta.data %>% + group_by(sample) %>% + summarise(Reads = sum(nCount_RNA)) %>% + ggplot(aes(x=sample, y=Reads)) + + geom_bar(stat = "identity") + + theme(axis.text.x =element_text(angle = 90,vjust = 1, hjust=1)) + + ggtitle("Number of Reads per Sample") +``` + +## UMI counts per cell {.tabset} + +Now let's assess the distribution of unique molecular identifier (UMI)-deconvoluted counts per cell. The UMI counts per cell should be generally above 500. The cutoff is at `r params$minUMIs`. + +```{r warning=FALSE, message=FALSE, fig.height=8, fig.width=12, results='asis'} +cat('### Histogram \n\n') +seurat@meta.data %>% + ggplot(aes(x = nCount_RNA)) + + geom_histogram(binwidth = 50) + + xlab("nUMI") + + facet_wrap(~sample) + + geom_vline(xintercept = params$minUMIs) + +cat('\n### Violin plot \n\n') +VlnPlot(seurat, features="nCount_RNA", pt.size = 0) + + scale_fill_cb_friendly() + + geom_hline(yintercept = params$minUMIs) +``` + +## Genes detected per cell {.tabset} + +Here by "detected", we mean genes with a non-zero count measurement per cell. Seeing gene detection in the range of `500`-`5000` is normal for most single-cell experiments. +The cutoff is at `r params$minGenes`. + +```{r warning=FALSE, message=FALSE, fig.height=8, fig.width=12, results='asis'} +cat('### Histogram \n\n') +seurat@meta.data %>% + ggplot(aes(x = nFeature_RNA)) + + geom_histogram(binwidth = 50) + + xlab("nGene") + + facet_wrap(~sample) + + geom_vline(xintercept = params$minGenes) + +cat('\n### Violin plot \n\n') +VlnPlot(seurat, features="nFeature_RNA", pt.size = 0) + + scale_fill_cb_friendly() + + geom_hline(yintercept = params$minGenes) +``` + + +## Mitochondrial abundance {.tabset} + +We evaluate overall mitochondrial gene expression as a biomarker of cellular stress during sample preparation. The mitochondrial percentages are mostly below 20%. + + +```{r warning=F, message=F, fig.height=8, fig.width=12, results='asis'} +cat('### Histogram \n\n') +seurat@meta.data %>% + ggplot(aes(x = percent_mito)) + + geom_histogram() + + facet_wrap(~sample) + + geom_vline(xintercept = params$maxMitoPer) + +cat('\n### Violin plot \n\n') +VlnPlot(seurat, features="percent_mito", pt.size = 0) + + scale_fill_cb_friendly() + + geom_hline(yintercept = params$maxMitoPer) +``` + +## Novelty {.tabset} + +Another way to QC the data is to look for less novelty, that is cells that have less genes detected per count than other cells. We can see the samples where we sequenced each cell less have a higher overall novelty, that is because we have not started saturated the sequencing for any given gene for these samples. Outlier cells in these samples might be cells that we have a less complex RNA species than other cells. Sometimes we can detect contamination with low complexity cell types like red blood cells via this metric. + +All cells have high novelty scores (> 0.85). + +```{r novelty, fig.height=8, fig.width=12, results='asis'} +cat('### Histogram \n\n') +# Visualize the overall novelty of the gene expression by visualizing the genes detected per UMI +seurat@meta.data %>% + ggplot(aes(x=novelty)) + + geom_histogram(bins=50) + + facet_wrap(~sample) + +cat('\n### Violin plot \n\n') +VlnPlot(seurat, features="novelty", pt.size = 0) + + scale_fill_cb_friendly() +``` + +## UMIs vs. genes detected {.tabset} + +If we graph out the total number of UMI counts per cell vs. the genes detected per cell, we can assess whether there is a large population of low quality cells with low counts and/or gene detection. For some of the samples there is a cluster of cells that have increasing UMIs yet gene detection is low. We will want to filter out these low complexity cells. The data points are colored by different metrics to see if there is some correlation. + +### Data points colored by Percent mitochondrial genes + +```{r umis_vs_genes_mito, warning=F, message=F, fig.height=8, fig.width=12} +# Visualize the correlation between genes detected and number of UMIs and determine whether strong presence of cells with low numbers of genes/UMIs +seurat@meta.data %>% + ggplot(aes(x=nCount_RNA, y=nFeature_RNA, color=percent_mito)) + + geom_point() + + stat_smooth(method=lm) + + scale_x_log10() + + scale_y_log10() + + geom_vline(xintercept = params$minUMIs) + + geom_hline(yintercept = params$minGenes) + + xlab("nUMI") + ylab("nGene") + + facet_wrap(~sample) +``` + +### Data points colored by Novelty + +```{r umis_vs_genes_novelty, warning=F, message=F, fig.height=8, fig.width=12} +# Visualize the correlation between genes detected and number of UMIs and determine whether strong presence of cells with low numbers of genes/UMIs +seurat@meta.data %>% + ggplot(aes(x=nCount_RNA, y=nFeature_RNA, color=novelty)) + + geom_point() + + stat_smooth(method=lm) + + scale_x_log10() + + scale_y_log10() + + geom_vline(xintercept = params$minUMIs) + + geom_hline(yintercept = params$minGenes) + + xlab("nUMI") + ylab("nGene") + + facet_wrap(~sample) +``` + + +## UMAP plots {.tabset} + +### Color by sample + +Clustering of the cells is similar to the first dataset. + +```{r results='asis'} +DimPlot(object = seurat, reduction = "umap", group.by = "sample", order = T) + + scale_color_cb_friendly() + +cat('\n\n') +``` + +```{r results='asis', fig.width=12} +DimPlot(object = seurat, reduction = "umap", split.by = 'sample')+ + scale_color_cb_friendly() +cat('\n\n') +``` + +### Color by cell cycle + +```{r results='asis'} +DimPlot(object = seurat, reduction = "umap", group.by = "Phase", order = T) + + scale_color_cb_friendly() +cat('\n\n') +``` + +```{r results='asis', fig.width=12} +DimPlot(object = seurat, reduction = "umap", group.by = "Phase", split.by = 'sample') + + scale_color_cb_friendly() +cat('\n\n') +``` + +### Color by UMI counts + +```{r results='asis', fig.width=12} +FeaturePlot(seurat, reduction = "umap", features = "nCount_RNA", split.by = "sample", cols = c("light blue", "red")) & theme(legend.position = c(0.2,0.2)) +cat('\n\n') +``` +### Color by the number of detected genes + +```{r results='asis', fig.width=12} +FeaturePlot(seurat, reduction = "umap", features = "nFeature_RNA", split.by = "sample", cols = c("light blue", "red")) & theme(legend.position = c(0.2,0.2)) +cat('\n\n') +``` +### Color by mito percentage + +```{r results='asis', fig.width=12} +FeaturePlot(seurat, reduction = "umap", features = "percent_mito", split.by = "sample", cols = c("light blue", "red")) & theme(legend.position = c(0.2,0.2)) +cat('\n') +``` +### Color by novelty + +```{r results='asis', fig.width=12} +FeaturePlot(seurat, reduction = "umap", features = "novelty", split.by = "sample", cols = c("light blue", "red")) & theme(legend.position = c(0.2,0.2)) +cat('\n') +``` + +# ADT {.tabset} + +The numbers of detected proteins are more than the the first experiment. The cells are clustered slightly differently from gene expression and ATAC-Seq results on the UMAP and the first dataset. + +## UMI counts per cell + +```{r warning=FALSE, message=FALSE, results='asis'} +VlnPlot(seurat, features = "nCount_ADT", ncol = 1, pt.size = 0) + + scale_fill_cb_friendly() + + xlab("") + + ylab("UMI") +cat('\n\n') +``` + +**Zoom in** (cutoff line is at `r params$min_nCount_ADT`): + +```{r warning=FALSE, message=FALSE, results='asis'} +VlnPlot(seurat, features = "nCount_ADT", ncol = 1, pt.size = 0) + + scale_fill_cb_friendly() + + xlab("") + + ylab("UMI") + + ylim(0,3000) + + geom_hline(yintercept = params$min_nCount_ADT) +cat('\n\n') +``` + +## Detected proteins per cell + +Lines are drawn at `r params$min_nFeature_ADT` and `r params$max_nFeature_ADT`. + +```{r warning=FALSE, message=FALSE, results='asis'} +VlnPlot(seurat, features = "nFeature_ADT", ncol = 1, pt.size = 0) + + scale_fill_cb_friendly()+ + xlab("") + + geom_hline(yintercept = params$max_nFeature_ADT) + + geom_hline(yintercept = params$min_nFeature_ADT) + + ylab("Number of detected proteins") +cat('\n\n') +``` + +## UMAP plots {.tabset} + +### Color by sample + +We can still see cells are clustered by sample, but not as clear as the first dataset. + +```{r results='asis'} +DimPlot(object = seurat, reduction = "umapADT", group.by = "sample", order = T) + + scale_color_cb_friendly() +cat('\n\n') +``` +```{r results='asis', fig.width=12} +DimPlot(object = seurat, reduction = "umapADT", split.by = 'sample') + + scale_color_cb_friendly() +cat('\n\n') +``` + + +### Color by UMI counts + +```{r results='asis', fig.width=12} +FeaturePlot(seurat, reduction = "umapADT", features = "nCount_ADT", split.by = "sample", cols = c("light blue", "red")) & theme(legend.position = c(0.2,0.2)) +cat('\n\n') +``` +### Color by the number of detected proteins + +```{r results='asis', fig.width=12} +FeaturePlot(seurat, reduction = "umapADT", features = "nFeature_ADT", split.by = "sample", cols = c("light blue", "red")) & theme(legend.position = c(0.2,0.2)) +cat('\n\n') +``` + + + +# ATAC-Seq {.tabset} + +We use some of the results from cellranger outputs and the peaks called using MACS2 for QCing the scATAC-Seq data. + +## UMI counts per cell + +```{r warning=FALSE, message=FALSE, results='asis'} +VlnPlot(seurat, features = "nCount_MACS2peaks", ncol = 1, pt.size = 0) + + scale_fill_cb_friendly() + + xlab("") + + ylab("UMI") + +cat('\n\n') +``` +## Detected peaks per cell + +```{r warning=FALSE, message=FALSE} +VlnPlot(seurat, features = "nFeature_MACS2peaks", ncol = 1, pt.size = 0) + + scale_fill_cb_friendly() + + xlab("") + + ylab("UMI") + +VlnPlot(seurat, features = "nFeature_CellRangerPeaks", ncol = 1, pt.size = 0) + + scale_fill_cb_friendly() + + xlab("") + + ylab("UMI") +``` + +## QC metrics {.tabset} + +### Total number of fragments in peaks {.tabset} + +This metric represents the total number of fragments (= reads) mapping within a region of the genome that is predicted to be accessible (= a peak). It's a measure of cellular sequencing depth / complexity. Cells with very few reads may need to be excluded due to low sequencing depth. Cells with extremely high levels may represent doublets, nuclei clumps, or other artefacts. + + +```{r results='asis', warning=FALSE, message=FALSE} +DefaultAssay(seurat) <- "MACS2peaks" + +cat('#### Histogram \n\n') +seurat@meta.data %>% + ggplot(aes(x = atac_peak_region_fragments)) + + geom_histogram() + + facet_wrap(~sample) + + geom_vline(xintercept = params$min_PRF) + + +cat('\n#### Violin plot\n\n') +VlnPlot( + object = seurat, + features = 'atac_peak_region_fragments', + pt.size = 0 +) + ylab('Total number of fragments in peaks') + + NoLegend() + + geom_hline(yintercept = params$min_PRF) + xlab("") + + scale_fill_cb_friendly() + +cat('\n\n') +``` + +### Fraction of fragments in peaks + +It represents the fraction of all fragments that fall within ATAC-seq peaks. Cells with low values (i.e. <15-20%) often represent low-quality cells or technical artifacts that should be removed. Note that this value can be sensitive to the set of peaks used. + +```{r results='asis'} +VlnPlot( + object = seurat, + features = 'pct_reads_in_peaks', + pt.size = 0 +) + NoLegend() + xlab("") + + scale_fill_cb_friendly() + + geom_hline(yintercept = params$min_FRiP) + +cat('\n\n') +``` + + +### Transcriptional start site (TSS) enrichment score {.tabset} + +The ENCODE project has defined an ATAC-seq targeting score based on the ratio of fragments centered at the TSS to fragments in TSS-flanking regions (see https://www.encodeproject.org/data-standards/terms/). Poor ATAC-seq experiments typically will have a low TSS enrichment score. We can compute this metric for each cell with the TSSEnrichment() function, and the results are stored in metadata under the column name TSS.enrichment. + + +```{r results='asis'} +VlnPlot( + object = seurat, + features = 'TSS.enrichment', + pt.size = 0 +) + scale_fill_cb_friendly() + +NoLegend() + xlab("") +cat('\n\n') +``` +The following tabs show the TSS enrichment score distribution for each sample. Cells with high-quality ATAC-seq data should show a clear peak in reads at the TSS, with a decay the further we get from it. + +Each plot is split between cells with a high or low global TSS enrichment score (cuffoff at `r params$min_TSS`), to double-check whether cells with lowest enrichment scores still follow the expected pattern or rather need to be excluded. + +```{r results='asis'} +seurat$TSS.group <- ifelse(seurat$TSS.enrichment > params$min_TSS, "High", "Low") + +for (sample in sort(unique(seurat$sample))){ + cat('####', sample, '\n\n') + p <- TSSPlot(subset(x = seurat, idents = sample), group.by = "TSS.group") + NoLegend() + print(p) + cat('\n\n') +} +``` + +### Nucleosome signal + +The histogram of DNA fragment sizes (determined from the paired-end sequencing reads) should exhibit a strong nucleosome banding pattern corresponding to the length of DNA wrapped around a single nucleosome, i.e peaks at approximately 100bp (nucleosome-free), and mono, di and tri nucleosome-bound peaks at 200, 400 and 600bp. We calculate this per single cell, and quantify the approximate ratio of mononucleosomal to nucleosome-free fragments (stored as nucleosome_signal). Cells with lower nucleosome signal have a higher ratio of nucleosome-free fragments. + +```{r warning = FALSE} +seurat$nucleosome_group <- ifelse(seurat$nucleosome_signal > 1, "high NS", "low NS") +FragmentHistogram(seurat, group.by = 'nucleosome_group', cells = colnames(seurat[, seurat$sample == 'Control'])) +``` + +```{r } +VlnPlot( + object = seurat, + features = 'nucleosome_signal', + pt.size = 0 +) + scale_fill_cb_friendly() + +NoLegend() + xlab("") +``` + + + + +### Blacklist ratio + +tIt's he ratio of reads in genomic blacklist regions. The [ENCODE project](https://www.encodeproject.org/) has provided a list of [blacklist regions](https://github.com/Boyle-Lab/Blacklist), representing reads which are often associated with artefactual signal. Cells with a high proportion of reads mapping to these areas (compared to reads mapping to peaks) often represent technical artifacts and should be removed. ENCODE blacklist regions for human (hg19 and GRCh38), mouse (mm10), Drosophila (dm3), and C. elegans (ce10) are included in the Signac package. **Peaks overlapping with the balcklist regions were removed in the analysis, so we don't show blacklist fraction here**. + +Line is drawn at `r params$min_blacklistratio`. + +```{r} +VlnPlot( + object = seurat, + features = 'blacklist_fraction', + pt.size = 0 +) + scale_fill_cb_friendly() + + NoLegend() + + geom_hline(yintercept = params$min_blacklistratio) + +``` + + +## Normalization, linear dimensional reduction, and clustering + +* Normalization: Signac performs term frequency-inverse document frequency (TF-IDF) normalization. This is a two-step normalization procedure, that both normalizes across cells to correct for differences in cellular sequencing depth, and across peaks to give higher values to more rare peaks. + +* Feature selection: The low dynamic range of scATAC-seq data makes it challenging to perform variable feature selection, as we do for scRNA-seq. Instead, we can choose to use only the top n% of features (peaks) for dimensional reduction, or remove features present in less than n cells with the FindTopFeatures() function. Here, we will use all features, though we note that we see very similar results when using only a subset of features (try setting min.cutoff to ‘q75’ to use the top 25% all peaks), with faster runtimes. Features used for dimensional reduction are automatically set as VariableFeatures() for the Seurat object by this function. + +* Dimension reduction: We next run singular value decomposition (SVD) on the TD-IDF matrix, using the features (peaks) selected above. This returns a reduced dimension representation of the object (for users who are more familiar with scRNA-seq, you can think of this as analogous to the output of PCA). + +The combined steps of TF-IDF followed by SVD are known as latent semantic indexing (LSI), and were first introduced for the analysis of scATAC-seq data by [Cusanovich et al. 2015.](https://www.science.org/doi/10.1126/science.aax6234) + +The first LSI component often captures sequencing depth (techni ccal variation) rather than biological variation. If this is the case, the component should be removed from downstream analysis. We can assess the correlation between each LSI component and sequencing depth using the DepthCor() function (see below). +Here we see there is a very strong correlation between the first LSI component and the total number of counts for the cell, so we will perform downstream steps without this component. + +```{r results='asis'} +DepthCor(seurat, assay = 'MACS2peaks') +cat('\n\n') +``` + +## UMAP plots + +```{r results='asis'} +DimPlot(object = seurat, group.by = 'sample', reduction = "umapATAC") + + scale_color_cb_friendly() + +cat('\n\n') +``` + + + + +# Conclusion + +We will apply the following filters to **keep the high-quality cells**. For most of the criteria below, we don't have an upper limit, as we have removed doublets after demultiplexing. + +**RNA** + +* Number of UMIs (nCount_RNA) > `r params$minUMIs` +* Number of detected genes (nFeature_RNA) > `r params$minGenes` +* mitochondrial percentages (nFeature_RNA) < `r params$maxMitoPer` + +We don't apply any filter for complexity, as all the cells have relatively high complexity (> 0.85). + +**ADT** + +* Number of UMIs (nCount_ADT) > `r params$min_nCount_ADT` +* Number of detected proteins (nFeature_ADT) < `r params$max_nFeature_ADT` and > `r params$min_nFeature_ADT` + +**ATAC** + +* Total number of fragments in peaks (atac_peak_region_fragments) > `r params$min_PRF` +* Percentage of fragments in peaks (pct_reads_in_peaks) > `r params$min_FRiP` +* TSS enrichment (TSS.enrichment) > `r params$min_TSS` +* Nucleosome signal (nucleosome_signal) < `r params$max_NS` +* Blacklisted regions ratio (blacklist_fraction) < `r params$max_blacklistratio` + + +Number of cells before filtering: + +```{r} +table(seurat$sample) %>% as.data.frame() %>% set_colnames(c('sample', '#Cells')) %>% kable() %>% kable_styling() +``` + + +```{r} +# Please select your cutoffs +rna_keep <- seurat@meta.data[(seurat@meta.data$nCount_RNA > params$minUMIs & + seurat@meta.data$nFeature_RNA > params$minGenes & + seurat@meta.data$percent_mito < params$maxMitoPer), ] +rna_filt <- seurat@meta.data[rownames(seurat@meta.data) %!in% rownames(rna_keep),] + +adt_keep <- seurat@meta.data[(seurat@meta.data$nCount_ADT > params$min_nCount_ADT & + seurat@meta.data$nFeature_ADT > params$min_nFeature_ADT & + seurat@meta.data$nFeature_ADT < params$max_nFeature_ADT), ] + +adt_filt <- seurat@meta.data[rownames(seurat@meta.data) %!in% rownames(adt_keep),] + +atac_keep <- seurat@meta.data[(seurat@meta.data$atac_peak_region_fragments > params$min_PRF & + seurat@meta.data$pct_reads_in_peaks > params$min_FRiP & + seurat@meta.data$TSS.enrichment > params$min_TSS & + seurat@meta.data$nucleosome_signal < params$max_NS & + seurat@meta.data$blacklist_fraction < params$max_blacklistratio), ] + +atac_filt <- seurat@meta.data[rownames(seurat@meta.data) %!in% rownames(atac_keep),] + +library(ggvenn) +x = list( + RNA = rownames(rna_filt), + ADT = rownames(adt_filt), + ATAC = rownames(atac_filt) +) +ggvenn(x, names(x)) +``` + +```{r eval=FALSE} +# this is only run once before rendering +unfiltered_seurat <- seurat +seurat <- subset( + x = seurat, + subset = nCount_RNA > params$minUMIs & + nFeature_RNA > params$minGenes & + percent_mito < params$maxMitoPer & + nCount_ADT > params$min_nCount_ADT & + nFeature_ADT > params$min_nFeature_ADT & + nFeature_ADT < params$max_nFeature_ADT & + atac_peak_region_fragments > params$min_PRF & + TSS.enrichment > params$min_TSS & + nucleosome_signal < params$max_NS & + blacklist_fraction < params$max_blacklistratio + ) + +# reclustering +# RNA +DefaultAssay(seurat) <- 'RNA' +seurat <- SCTransform(seurat, conserve.memory=TRUE) +seurat <- RunPCA(object = seurat, verbose = T) +co <- 40 +seurat <- FindNeighbors(object = seurat, reduction = "pca", dims = 1:co) +seurat <- RunUMAP(object = seurat, dims=1:co ) + +# clustering by ADT data +seurat <- NormalizeData(seurat, assay = "ADT", normalization.method = "CLR", margin = 2) +seurat <- ScaleData(seurat, assay = "ADT") +seurat <- RunPCA(seurat, features = rownames(seurat@assays$ADT@counts), verbose = TRUE, assay= "ADT", reduction.name = "pcaADT", reduction.key = "pcaADT_") +seurat <- RunUMAP(seurat, dims = 1:20, reduction = "pcaADT", assay = "ADT", reduction.name = "umapADT", reduction.key = "umapADT_") + +# ATAC data normalization, dimension reduction, and clustering +seurat <- RunTFIDF(seurat, assay = "MACS2peaks") +seurat <- FindTopFeatures(seurat, min.cutoff = 'q0', assay = "MACS2peaks") +seurat <- RunSVD(seurat, assay = "MACS2peaks") +seurat <- RunUMAP(object = seurat, dims = 2:30, reduction = 'lsi', assay = "MACS2peaks", reduction.name = "umapATAC", reduction.key = "umapATAC_" ) + +qsave(seurat, 'data/tea_seurat_filtered_clustered.qs') +``` + +Number of cells **after filtering**: + +```{r} +# get the filtered seurat +stopifnot(file.exists('data/tea_seurat_filtered_clustered.qs')) +seurat <- qread('data/tea_seurat_filtered_clustered.qs') +table(seurat$sample) %>% as.data.frame() %>% set_colnames(c('sample', '#Cells')) %>% kable() %>% kable_styling() +``` + +**We take a look at the UMAPs after filtering out low quality cells for each modality.** + +```{r eval=FALSE, fig.width=12, fig.height=12} +plot_grid( + DimPlot(object = seurat, group.by = "sample", reduction = "umap") + ggtitle('RNA'), + DimPlot(object = seurat, group.by = "sample", reduction = "umapADT") + ggtitle('ADT'), + DimPlot(object = seurat, group.by = "sample", reduction = "umapATAC") + ggtitle('ATAC') +) +``` + + +# Session info + +```{r} +devtools::session_info() +``` + diff --git a/inst/rmarkdown/templates/teaseq/skeleton/README.md b/inst/rmarkdown/templates/teaseq/skeleton/README.md new file mode 100644 index 0000000..de1ce6c --- /dev/null +++ b/inst/rmarkdown/templates/teaseq/skeleton/README.md @@ -0,0 +1,60 @@ +# Tipical steps for scRNAseq downstream analysis + +# Set up + +This template assumes there is 4 folders with FASTQ files. + +- GEX +- ATAC +- ADT +- HTO + +**NOTE** use absolute paths inside all these files to avoid errors. + +## Check file names + +FASTQ files should all start with the same sufix, cellrange will identify that as FASTQ from same samples. You may not have that, and you need to fix the filenames to looks like: + +`20240417_GEX1_6_AT12013_S41_L001_R2_001.fastq.gz` -> `20240417_GEX1_AT12013_S41_L001_R2_001.fastq.gz` + +The R scripts `scripts/fix_filenames.R` has some code to do that. + +## Analysis of scRNAseq and scATACseq + +`cellrange-arc` is used for this analysis. It needs a `libraries.csv` file. In the `meta` folder you will find a template. It needs the FASTQ_PATH to scRNAseq and scATACseq with filenames corrected. + +`scripts/gex_atac.sbatch` is an example of how to run this. It is assuming using HMS O2 cluster. + +## Analysis of scADT with Hashing + +`cellranger multi` is used for this analysis. It needs two config files: + +- `feature_reference.csv` is a list of barcodes used to identify the proteins and the hashing. The example file in `meta` folder shows how to add 7 different hashing barcodes. +- `multiconfig.csv` has three sections, to point to reference genome, input files (scRNAseq, scADT, HTO), the `feature_reference.csv` and samples. The example file in `meta` folder shows an example with 7 hasjing barcodes. + +`scripts/gex_adt_hto.sbatch` is an example of how to run this. It is assuming using HMS O2 cluster. + +# QC analysis + +The file `QC-01-load_data.R` helps loading the output of `cellranger` and creating the right object. It stores R objects in `.qs` format under a `data` folder. + +The file `QC-02-run_analysis.R` helps analyzing the object for the three type of data. + +The file `QC.Rmd` helps visualizing the analyzed data. It stores R objects in `.qs` format under a `data` folder. + + +# DropBox + +- In `reports/QC` + - [ ] copy QC `Rmd/R/html/figures` +- In `reports/Clusters` + - [ ] the analysis of `SCTransform`, ,`RunPCA` ,`FindNeighbors`, ,`FindClusters`, `RunUMAP` + - [ ] the analysis of `FindMarkers` and `Cell Identification` +- In `reports/DE`, for *each analysis*: + - TBD + +# GitHub + +- [ ] Push all `*Rmd` `*R` files used for the *QC* and *DE* analysis respecting folder structure. + +Please, ignore `*html/figures/csv` and any output of the code. diff --git a/inst/rmarkdown/templates/teaseq/skeleton/information.R b/inst/rmarkdown/templates/teaseq/skeleton/information.R new file mode 100644 index 0000000..3444196 --- /dev/null +++ b/inst/rmarkdown/templates/teaseq/skeleton/information.R @@ -0,0 +1,8 @@ +# project params +root = "../" +date = "YYYYMMDD" +column = "treatment" +subset_column = 'cell' +metadata_fn = "../meta/samplesheet.csv" +counts_fn = '../data/tximport-counts.csv' +basedir <- 'reports' diff --git a/inst/rmarkdown/templates/teaseq/skeleton/scripts/fix_filenames.R b/inst/rmarkdown/templates/teaseq/skeleton/scripts/fix_filenames.R new file mode 100644 index 0000000..4d3727a --- /dev/null +++ b/inst/rmarkdown/templates/teaseq/skeleton/scripts/fix_filenames.R @@ -0,0 +1,40 @@ +library(stringr) +fsq=list.files("data/GEX_fastq/", pattern = "_S[1-9]") + + +dir.create("data/GEX_fastq_fixed") +sapply(fsq, function(fn){ + ending=str_extract(fn, "_S.*.gz") + fnnew=paste0("GEX",ending) + fs::link_create(path = fs::path_abs(file.path("data/GEX_fastq",fn)), + new_path = fs::path_abs(file.path("data/GEX_fastq_fixed",fnnew))) +}) + + +fsq=list.files("data/ATAC_fastq", pattern = "_S[1-9]") +dir.create("data/ATAC_fastq_fixed") +sapply(fsq, function(fn){ + ending=str_extract(fn, "_S.*.gz") + fnnew=paste0("ATAC",ending) + fs::link_create(path = fs::path_abs(file.path("data/ATAC_fastq",fn)), + new_path = fs::path_abs(file.path("data/ATAC_fastq_fixed",fnnew))) +}) + + +fsq=list.files("data/ADT_fastq/", pattern = "_S[1-9]") +dir.create("data/ADT_fastq_fixed") +sapply(fsq, function(fn){ + ending=str_extract(fn, "_S.*.gz") + fnnew=paste0("ADT",ending) + fs::link_create(path = fs::path_abs(file.path("data/ADT_fastq",fn)), + new_path = fs::path_abs(file.path("data/ADT_fastq_fixed",fnnew))) +}) + +fsq=list.files("data/HTO_fastq/", pattern = "_S[1-9]") +dir.create("data/HTO_fastq_fixed") +sapply(fsq, function(fn){ + ending=str_extract(fn, "_S.*.gz") + fnnew=paste0("HTO",ending) + fs::link_create(path = fs::path_abs(file.path("data/HTO_fastq",fn)), + new_path = fs::path_abs(file.path("data/HTO_fastq_fixed",fnnew))) +}) diff --git a/inst/rmarkdown/templates/teaseq/skeleton/scripts/gex_adt_hto.sbatch b/inst/rmarkdown/templates/teaseq/skeleton/scripts/gex_adt_hto.sbatch new file mode 100644 index 0000000..cd86c5b --- /dev/null +++ b/inst/rmarkdown/templates/teaseq/skeleton/scripts/gex_adt_hto.sbatch @@ -0,0 +1,23 @@ +#!/bin/bash + +#SBATCH --job-name=CellrangerMulti # Job name +#SBATCH --partition=priority # Partition name +#SBATCH --time=1-23:59 # Runtime in D-HH:MM format +#SBATCH --nodes=1 # Number of nodes (keep at 1) +#SBATCH --ntasks=1 # Number of tasks per node (keep at 1) +#SBATCH --cpus-per-task=16 # CPU cores requested per task (change for threaded jobs) +#SBATCH --mem=128G # Memory needed per node (total) +#SBATCH --error=jobid_%j.err # File to which STDERR will be written, including job ID +#SBATCH --output=jobid_%j.out # File to which STDOUT will be written, including job ID +#SBATCH --mail-type=ALL # Type of email notification (BEGIN, END, FAIL, ALL) + +module load cellranger/7.0.0 + +localcores=$SLURM_CPUS_PER_TASK +localmem=$SLURM_MEM_PER_NODE + +#change CSV to full path +cellranger multi --id=AtheTeaSeqMulti --csv=meta/multiconfig.csv + + + diff --git a/inst/rmarkdown/templates/teaseq/skeleton/scripts/gex_atac.sbatch b/inst/rmarkdown/templates/teaseq/skeleton/scripts/gex_atac.sbatch new file mode 100644 index 0000000..2af343c --- /dev/null +++ b/inst/rmarkdown/templates/teaseq/skeleton/scripts/gex_atac.sbatch @@ -0,0 +1,24 @@ +#!/bin/bash + +#SBATCH --job-name=Cellranger-Arc # Job name +#SBATCH --partition=priority # Partition name +#SBATCH --time=1-23:59 # Runtime in D-HH:MM format +#SBATCH --nodes=1 # Number of nodes (keep at 1) +#SBATCH --ntasks=1 # Number of tasks per node (keep at 1) +#SBATCH --cpus-per-task=16 # CPU cores requested per task (change for threaded jobs) +#SBATCH --mem=128G # Memory needed per node (total) +#SBATCH --error=jobid_%j.err # File to which STDERR will be written, including job ID +#SBATCH --output=jobid_%j.out # File to which STDOUT will be written, including job ID +#SBATCH --mail-type=ALL # Type of email notification (BEGIN, END, FAIL, ALL) + +module load cellranger-ARC/2.0.0 + +localcores=$SLURM_CPUS_PER_TASK +localmem=$SLURM_MEM_PER_NODE + +# Change libraries to full path +cellranger-arc count --id=cellranger-arc_output \ + --reference=/n/shared_db/GRCh38/uk/cellranger-ARC/2.0.0/2.0.0/refdata-cellranger-arc-GRCh38-2020-A-2.0.0 \ + --libraries=meta/libraries.csv \ + --localcores=16 \ + --localmem=128 diff --git a/inst/rmarkdown/templates/teaseq/skeleton/skeleton.Rmd b/inst/rmarkdown/templates/teaseq/skeleton/skeleton.Rmd new file mode 100644 index 0000000..dc4bbf5 --- /dev/null +++ b/inst/rmarkdown/templates/teaseq/skeleton/skeleton.Rmd @@ -0,0 +1,25 @@ +--- +title: "General Project Information" +author: "Harvard Chan Bioinformatics Core" +date: "`r Sys.Date()`" +output: + html_document: + code_folding: hide + df_print: paged + highlights: pygments + number_sections: true + self_contained: true + theme: default + toc: true + toc_float: + collapsed: true + smooth_scroll: true +editor_options: + chunk_output_type: console +params: + params_file: information.R +--- + +```{r echo = F} +source(params$params_file) +``` diff --git a/inst/rmarkdown/templates/teaseq/template.yml b/inst/rmarkdown/templates/teaseq/template.yml new file mode 100644 index 0000000..6838f13 --- /dev/null +++ b/inst/rmarkdown/templates/teaseq/template.yml @@ -0,0 +1,3 @@ +name: bcbio TEAseq +description: Standard TEAseq down-stream analyses +create_dir: false diff --git a/man/bcbio_nfcore_check.Rd b/man/bcbio_nfcore_check.Rd new file mode 100644 index 0000000..76c954f --- /dev/null +++ b/man/bcbio_nfcore_check.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/hello.R +\name{bcbio_nfcore_check} +\alias{bcbio_nfcore_check} +\title{Function to check samplesheet for nf-core} +\usage{ +bcbio_nfcore_check(file) +} +\arguments{ +\item{file}{path to CSV file for nf-core} +} +\description{ +Function to check samplesheet for nf-core +} +\examples{ + +bcbio_nfcore_check(system.file("extdata", "rnaseq_good.csv", package = "bcbioR") ) + +} diff --git a/man/bcbio_templates.Rd b/man/bcbio_templates.Rd index bc2e50f..45c9ad9 100644 --- a/man/bcbio_templates.Rd +++ b/man/bcbio_templates.Rd @@ -7,7 +7,13 @@ bcbio_templates(type = "rnaseq", outpath) } \arguments{ -\item{type}{string indicating the type of analysis, supported: rnaseq.} +\item{type}{string indicating the type of analysis, supported: +\itemize{ +\item base +\item rnaseq, scrnaseq, +\item teaseq +\item cosmx +}} \item{outpath}{string path indicating where to copy all the files to} } diff --git a/man/bcbio_vsd_data.Rd b/man/bcbio_vsd_data.Rd new file mode 100644 index 0000000..9dbd982 --- /dev/null +++ b/man/bcbio_vsd_data.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data.R +\docType{data} +\name{bcbio_vsd_data} +\alias{bcbio_vsd_data} +\title{varianceStabilizingTransformation Object from Test Data} +\format{ +\subsection{\code{bcbio_vsd_data}}{ + +VSD object from DESeq2 data +} +} +\usage{ +bcbio_vsd_data +} +\description{ +An example of the output of varianceStabilizingTransformation to +show different visualization code +} +\keyword{datasets} diff --git a/tests/testthat.R b/tests/testthat.R new file mode 100644 index 0000000..93f2dbb --- /dev/null +++ b/tests/testthat.R @@ -0,0 +1,12 @@ +# This file is part of the standard setup for testthat. +# It is recommended that you do not modify it. +# +# Where should you do additional test configuration? +# Learn more about the roles of various files in: +# * https://r-pkgs.org/testing-design.html#sec-tests-files-overview +# * https://testthat.r-lib.org/articles/special-files.html + +library(testthat) +library(bcbioR) + +test_check("bcbioR") diff --git a/tests/testthat/misc.R b/tests/testthat/misc.R new file mode 100644 index 0000000..eebdfc8 --- /dev/null +++ b/tests/testthat/misc.R @@ -0,0 +1,10 @@ +library(bcbioR) + + +test_that("rnaseq samplesheet", { + + expect_no_error(bcbio_nfcore_check(system.file("extdata", "rnaseq_good.csv", package = "bcbioR"))) + expect_error(bcbio_nfcore_check(system.file("extdata", "rnaseq_missing_col.csv", package = "bcbioR"))) + expect_error(bcbio_nfcore_check(system.file("extdata", "rnaseq_wnumber.csv", package = "bcbioR"))) + expect_warning(bcbio_nfcore_check(system.file("extdata", "rnaseq_na.csv", package = "bcbioR"))) +}) diff --git a/tests/testthat/rnaseq.R b/tests/testthat/rnaseq.R new file mode 100644 index 0000000..9a683ac --- /dev/null +++ b/tests/testthat/rnaseq.R @@ -0,0 +1,29 @@ +library(bcbioR) + + +test_that("rnaseq testing", { + path <- withr::local_tempdir() + print(path) + bcbio_templates(type="rnaseq", outpath=path) + numerator="tumor" + denominator="normal" + subset_value=NA + rmarkdown::render(input = file.path(path,"DE/DEG.Rmd"), + output_dir = file.path(path,"DE"), + output_format = "html_document", + output_file = ifelse(!is.na(subset_value), + paste0('DE_', subset_value, '_', numerator, '_vs_', denominator, '.html'), + paste0('DE_', numerator, '_vs_', denominator, '.html') + ), + clean = TRUE, + envir = new.env(), + params = list( + subset_value = subset_value, + numerator = numerator, + denominator = denominator, + params_file = file.path(path,'DE/params_de.R'), + project_file = file.path(path,'information.R'), + functions_file = file.path(path,'DE/load_data.R') + ) + ) +}) diff --git a/tests/testthat/setup.R b/tests/testthat/setup.R new file mode 100644 index 0000000..03fe938 --- /dev/null +++ b/tests/testthat/setup.R @@ -0,0 +1,3 @@ +op <- options(reprex.clipboard = FALSE, reprex.html_preview = FALSE) + +withr::defer(options(op), teardown_env()) diff --git a/vignettes/bcbioR_quick_start.Rmd b/vignettes/bcbioR_quick_start.Rmd index 7282482..a05670a 100644 --- a/vignettes/bcbioR_quick_start.Rmd +++ b/vignettes/bcbioR_quick_start.Rmd @@ -42,8 +42,37 @@ And get the colors directly: ```{r} cb_friendly_cols(1:16) ``` -```{r} +This is the full palette: + +```{r fig.width=9, fig.height=6} library(hues) swatch(cb_friendly_cols(1:16)) ``` +# Set projects + +HCBC uses a structured based directory to organize projects. You can set up this by using: + +```{r} +tmp_dir=withr::local_tempdir() +bcbio_templates(type="base", outpath=tmp_dir) +fs::dir_ls(tmp_dir, recurse=TRUE) +``` + + +We support multiple analyses type: + +- RNAseq +- scRNAseq +- TEAseq +- COSMX + +To get the example code for any of them you can use a similar command: + +```{r} +analysis_tmp=fs::path_join(c(tmp_dir, "reports")) +bcbio_templates(type="rnaseq", outpath=analysis_tmp) +fs::dir_ls(analysis_tmp, recurse=TRUE) +``` + +Use `scrnaseq`, `teaseq` or `cosmx` to get those other templates.