Merge pull request #199 from mskcc/release/0.9.0

Release 0.9.0
mskcc · Apr 30, 2019 · 2d3c54d · 2d3c54d
2 parents 9238d1f + 582af9e
commit 2d3c54d
Show file tree

Hide file tree

Showing 18 changed files with 539 additions and 642 deletions.
diff --git a/README.md b/README.md
diff --git a/conf/awsbatch.config.template b/conf/awsbatch.config.template
@@ -5,7 +5,7 @@
 
 aws.region = <AWS-REGION> 
 aws.client.storageEncryption = <STORAGE-ENCRYPTION>
-aws.mountPoint = '/scratch'
+aws.batch.volumes = ['/scratch']
 workDir = <AWS-S3-WORKDIR>
 outDir = <AWS-S3-OUTDIR>
 

diff --git a/conf/containers.config b/conf/containers.config
@@ -79,14 +79,17 @@
   withName:MergeDellyAndManta {
     container = "cmopipeline/bcftools:v1.9"
   }
+  withName:MergeStrelka2Vcfs {
+    container = "cmopipeline/bcftools:v1.9"
+  }
   withName:RunBcfToolsOnDellyManta {
     container = "cmopipeline/bcftools:v1.9"
   }
   withName:RunBcfToolsFilterOnDellyManta {
     container = "cmopipeline/bcftools:v1.9"
   }
   withName:CombineChannel {
-    container = "ubuntu:latest"
+    container = "cmopipeline/bcftools:v1.9"
   }
   withName:RunBcfToolsFilterNorm {
     container = "cmopipeline/bcftools:v1.9"
@@ -103,4 +106,7 @@
   withName:RunHlaPolysolver {
     container = "sachet/polysolver:v4"
   }
+  withName:RunConpair {
+    container = "cmopipeline/conpair:v0.3.3"
+  }
 }
diff --git a/conf/references.config b/conf/references.config
@@ -41,8 +41,6 @@ params {
       snpeffDb    = "GRCh37.75"
       vepCacheVersion  = "95"
       vepCache = "${params.reference_base}/mskcc-igenomes/grch37/vep/cache"
-      vcf2mafFilterVcf = "${vepCache}/ExAC_nonTCGA.r0.3.1.sites.vep.vcf.gz"
-      vcf2mafFilterVcfIndex = "${vcf2mafFilterVcf}.tbi"
       msiSensorList = "${params.genome_base}/Sequence/WholeGenomeFasta/human_g1k_v37_decoy.fasta.microsatellites.list"
       facetsVcf   = "${params.reference_base}/mskcc-igenomes/igenomes/Homo_sapiens/GATK/b37/dbsnp_137.b37__RmDupsClean__plusPseudo50__DROP_SORT.vcf"
       svCallingExcludeRegions = "${params.reference_base}/mskcc-igenomes/grch37/sv_calling/human.hg19.excl.tsv"
@@ -54,6 +52,11 @@ params {
       agilentTargetsIndex = "${agilentTargets}.tbi"
       wgsTargets = "${params.reference_base}/mskcc-igenomes/grch37/targets/b37_wgs_calling_regions.v1.bed.gz"
       wgsTargetsIndex = "${wgsTargets}.tbi"
+      repeatMasker = "${params.reference_base}/mskcc-igenomes/grch37/annotation/rmsk_mod.bed.gz"
+      repeatMaskerIndex = "${repeatMasker}.tbi"
+      mapabilityBlacklist = "${params.reference_base}/mskcc-igenomes/grch37/annotation/wgEncodeDacMapabilityConsensusExcludable.bed.gz"
+      mapabilityBlacklistIndex = "${mapabilityBlacklist}.tbi"
+      isoforms = "${params.reference_base}/mskcc-igenomes/grch37/annotation/isoforms"
     } 
     'GRCh38' {
       acLoci           = "${params.genome_base}/Annotation/ASCAT/1000G_phase3_GRCh38_maf0.3.loci"

diff --git a/conf/resources.config b/conf/resources.config
@@ -86,6 +86,10 @@
     cpus = { 2 }
     memory = { 6.GB }
   }
+  withName:MergeStrelka2Vcfs {
+    cpus = { 2 }
+    memory = { 6.GB }
+  }
   withName:MergeDellyAndManta {
     cpus = { 2 }
     memory = { 6.GB }
@@ -122,5 +126,8 @@
     cpus = { 2 }
     memory = { 6.GB }
   }
+  withName:RunConpair {
+    cpus = { 2 }
+    memory = { 6.GB }
+  }
 }
-
diff --git a/conf/resources_aws.config b/conf/resources_aws.config
@@ -86,6 +86,10 @@
     cpus = { 8 }
     memory = { 32.GB }
   }
+  withName:MergeStrelka2Vcfs {
+    cpus = { 8 }
+    memory = { 32.GB }
+  }
   withName:MergeDellyAndManta {
     cpus = { 8 }
     memory = { 32.GB }
@@ -103,8 +107,8 @@
     memory = { 32.GB }
   }
   withName:CombineChannel{
-    cpus = { 2 }
-    memory = { 4.GB }
+    cpus = { 8 }
+    memory = { 32.GB }
   }
   withName:RunBcfToolsFilterNorm {
     cpus = { 8 }
@@ -122,5 +126,8 @@
     cpus = { 8 }
     memory = { 32.GB }
   }
+  withName:RunConpair {
+    cpus = { 8 }
+    memory = { 32.GB }
+  }
 }
-
diff --git a/conf/resources_juno.config b/conf/resources_juno.config
@@ -86,6 +86,10 @@
     cpus = { 8 }
     memory = { 4.MB }
   }
+  withName:MergeStrelka2Vcfs {
+    cpus = { 8 }
+    memory = { 4.MB }
+  }
   withName:MergeDellyAndManta {
     cpus = { 8 }
     memory = { 4.MB }
@@ -103,8 +107,8 @@
     memory = { 4.MB }
   }
   withName:CombineChannel{
-    cpus = { 2 }
-    memory = { 2.MB }
+    cpus = { 8 }
+    memory = { 4.MB }
   }
   withName:RunBcfToolsFilterNorm {
     cpus = { 8 }
@@ -122,5 +126,9 @@
     cpus = { 8 }
     memory = { 4.MB }
   }
+  withName:RunConpair {
+    cpus = { 8 }
+    memory = { 4.MB }
+  }
 }
 
diff --git a/containers/conpair/Dockerfile b/containers/conpair/Dockerfile
@@ -0,0 +1,56 @@
+FROM ubuntu:16.04
+
+# Image metadata.
+# NOTE(review): source.r points at the Alpine R package page although this image
+# is based on Ubuntu and installs R through apt below -- confirm the intended URL.
+LABEL authors="Nikhil Kumar ([email protected]), Evan Biederstedt ([email protected]), C. Allan Bolipata ([email protected])" \
+      version.image="1.0.0" \
+      version.conpair="0.3.3" \
+      version.gatk="3.8-1" \
+      source.conpair="https://github.com/mskcc/Conpair/releases/tag/0.3.3" \
+      source.r="https://pkgs.alpinelinux.org/package/edge/community/x86/R"
+
+# Conpair release to fetch; used below for the archive URL and the unpacked directory name.
+ENV CONPAIR_VERSION 0.3.3
+
+# System packages. python-pip is installed explicitly because the `pip install`
+# step further down needs a pip executable, and the `python` apt package on its
+# own does not ship one.
+RUN apt-get update && \
+    apt-get upgrade -y && \
+    apt-get install -y python python-pip && \
+    apt-get install -y wget \
+    curl \
+    bc \
+    unzip \
+    less \
+    bedtools \
+    samtools \
+    openjdk-8-jdk \
+    tabix \
+    bzip2 \
+    software-properties-common && \
+    apt-get -y clean  && \
+    apt-get -y autoclean  && \
+    apt-get -y autoremove
+
+# Download GATK 3.8-1, move the jar to /usr/bin/GenomeAnalysisTK.jar and clear /tmp
+RUN cd /tmp \
+    && wget "https://software.broadinstitute.org/gatk/download/auth?package=GATK-archive&version=3.8-1-0-gf15c1c3ef" -O gatk-3.8-1.tar.bz2 \
+    && tar xvjf gatk-3.8-1.tar.bz2 \
+    && mv /tmp/GenomeAnalysisTK-3.8-1-0-gf15c1c3ef/GenomeAnalysisTK.jar /usr/bin/GenomeAnalysisTK.jar \
+    && rm -rf /tmp/*
+
+## install R
+RUN apt-get install -y r-base-core r-base-dev \
+    ## install R dependencies
+    && R -e "install.packages('ggplot2', dependencies=TRUE, repos='http://cran.rstudio.com/')" \
+    && R -e "install.packages('reshape2', dependencies=TRUE, repos='http://cran.rstudio.com/')"
+
+## Python dependencies
+## install pinned versions of numpy and scipy
+RUN pip install numpy==1.15.4 \
+    && pip install scipy==1.1.0
+
+## download Conpair
+RUN cd /tmp && wget https://github.com/mskcc/Conpair/archive/${CONPAIR_VERSION}.tar.gz \
+    && tar xvzf ${CONPAIR_VERSION}.tar.gz \
+    ## install conpair
+    && mv /tmp/Conpair-${CONPAIR_VERSION} /usr/bin/conpair \
+    ## clean up
+    && rm -rf /tmp/*
+
+# PYTHONNOUSERSITE is set (to the literal value "set") for processes run in the container
+ENV PYTHONNOUSERSITE set
diff --git a/containers/lohhla/Dockerfile b/containers/lohhla/Dockerfile
@@ -0,0 +1,28 @@
+FROM continuumio/anaconda3:5.3.0
+
+LABEL authors="Evan Biederstedt ([email protected])" \
+      version.image="1.0.0" \
+      version.lohhla="1.0.0"
+
+# Create a Python 3.6 environment named "env".
+# -y answers conda's confirmation prompt so the build never waits on stdin.
+RUN conda create -y -n env python=3.6
+# Make interactive bash sessions activate "env" (this overwrites ~/.bashrc)
+RUN echo "source activate env" > ~/.bashrc
+
+# Put the environment's bin directory first on PATH
+ENV PATH /opt/conda/envs/env/bin:$PATH
+
+# Updating Anaconda packages
+RUN conda update -y conda
+RUN conda update -y anaconda
+RUN conda update -y --all
+
+## Set up channels
+## https://bioconda.github.io/index.html
+RUN conda config --add channels defaults
+RUN conda config --add channels bioconda
+RUN conda config --add channels conda-forge
+
+## install lohhla
+## https://bioconda.github.io/recipes/lohhla/README.html
+## NOTE(review): no `-n env` is given here, so lohhla presumably lands in the
+## base environment rather than in "env" -- confirm this is intended.
+RUN conda install -y lohhla
+RUN conda update -y lohhla
+
+
diff --git a/docs/ANNOTATION.md b/docs/ANNOTATION.md
@@ -0,0 +1,32 @@
+# Variant annotation
+
+## GRCh37
+
+### SNVs and indels
+Basic annotation of the merged `vcf` files from the individual variant callers is carried out in two steps. First, the combined `vcf` is annotated with information from [RepeatMasker](http://www.repeatmasker.org/) and the [ENCODE consortium](http://rohsdb.cmb.usc.edu/GBshape/ENCODE/index.html). These files are retrieved from the [UCSC genome browser](https://genome.ucsc.edu) and processed as follows:
+
+``` shell
+wget http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/rmsk.txt.gz
+gunzip rmsk.txt.gz
+cut -f6-8,12 rmsk.txt | \
+    grep -e "Low_complexity" -e "Simple_repeat" | \
+    sed 's/^chr//g'> rmsk_mod.bed
+bgzip rmsk_mod.bed
+tabix --preset bed rmsk_mod.bed.gz
+
+wget http://hgdownload.cse.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeMapability/wgEncodeDacMapabilityConsensusExcludable.bed.gz
+gunzip wgEncodeDacMapabilityConsensusExcludable.bed.gz
+sed -i 's/^chr//g' wgEncodeDacMapabilityConsensusExcludable.bed
+bgzip wgEncodeDacMapabilityConsensusExcludable.bed
+tabix --preset bed wgEncodeDacMapabilityConsensusExcludable.bed.gz
+```
+
+Subsequently, [vcf2maf](https://github.com/mskcc/vcf2maf) is used to annotate the functional effects of mutations, as well as other metadata, using [VEP](https://www.ensembl.org/vep). The `--custom-enst` argument to vcf2maf takes a list of preferred gene transcript isoforms onto which mutations are mapped. We supply a consensus list built from [`isoform_overrides_at_mskcc` and `isoform_overrides_uniprot`](https://github.com/mskcc/vcf2maf/tree/master/data), generated as follows:
+``` r
+t1 = readr::read_tsv('isoform_overrides_at_mskcc')
+t2 = readr::read_tsv('isoform_overrides_uniprot')
+t2 %>%
+    dplyr::filter(gene_name %nin% t1$gene_name) %>%
+    dplyr::bind_rows(., t1) %>%
+    readr::write_tsv('isoforms')
+```
diff --git a/docs/INTERVALS.md b/docs/INTERVALS.md
@@ -1,3 +1,5 @@
+# Genomic intervals
+
 ## GRCh37
 
 ### Genome

diff --git a/docs/README.md b/docs/README.md
@@ -1,6 +1,6 @@
 # Contents
 
-1. [References](https://github.mskcc/vaporware/blob/master/docs/REFERENCES.md)
-  1. [Intervals](https://github.mskcc/vaporware/blob/master/docs/INTERVALS.md)
-2. ...
-3. ...
+1. [References](https://github.com/mskcc/vaporware/blob/master/docs/REFERENCES.md)
+2. [Intervals](https://github.com/mskcc/vaporware/blob/master/docs/INTERVALS.md)
+3. [Variant Annotation](https://github.com/mskcc/vaporware/blob/master/docs/ANNOTATION.md)
+4. ...
diff --git a/germline.nf b/germline.nf
@@ -594,20 +594,37 @@ def extractBamFiles(tsvFile) {
   Channel.from(tsvFile)
   .splitCsv(sep: '\t')
   .map { row ->
-    VaporwareUtils.checkNumberOfItem(row, 8)
+    checkNumberOfItem(row, 8)
     def assay = row[0]
     def target = row[1]
     def idTumor = row[2]
     def idNormal = row[3]
-    def bamTumor = VaporwareUtils.returnFile(row[4])
-    def bamNormal = VaporwareUtils.returnFile(row[5])
-    def baiTumor = VaporwareUtils.returnFile(row[6])
-    def baiNormal = VaporwareUtils.returnFile(row[7])
-
-    VaporwareUtils.checkFileExtension(bamTumor,".bam")
-    VaporwareUtils.checkFileExtension(bamNormal,".bam")
-    VaporwareUtils.checkFileExtension(baiTumor,".bai")
-    VaporwareUtils.checkFileExtension(baiNormal,".bai")
+    def bamTumor = returnFile(row[4])
+    def bamNormal = returnFile(row[5])
+    def baiTumor = returnFile(row[6])
+    def baiNormal = returnFile(row[7])
+
+    checkFileExtension(bamTumor,".bam")
+    checkFileExtension(bamNormal,".bam")
+    checkFileExtension(baiTumor,".bai")
+    checkFileExtension(baiNormal,".bai")
     [ assay, target, idTumor, idNormal, bamTumor, bamNormal, baiTumor, baiNormal ]
   }
 }
+
+// Verify that a file name ends with the expected extension (case-insensitive).
+//   it        - file or path; compared via its toString() form
+//   extension - expected suffix, e.g. ".bam"
+// On a mismatch, calls `exit` with status 1 and an explanatory message.
+def checkFileExtension(it, extension) {
+  def loweredName = it.toString().toLowerCase()
+  def loweredSuffix = extension.toLowerCase()
+  if (loweredName.endsWith(loweredSuffix)) {
+    return
+  }
+  exit 1, "File: ${it} has the wrong extension: ${extension} see --help for more information"
+}
+
+// Check that a TSV row has the expected number of items.
+//   row    - parsed row; its size() is compared against `number`
+//   number - required item count
+// Calls `exit` with status 1 and an error message when the count differs;
+// otherwise returns true.
+def checkNumberOfItem(row, number) {
+  if (row.size() != number) {
+    exit 1, "Malformed row in TSV file: ${row}, see --help for more information"
+  }
+  // Not part of the guard above: reached whenever the size matches.
+  return true
+}
+
+// Resolve a path from the TSV with `file()` and return it if it exists.
+//   it - path value taken from a TSV column
+// Calls `exit` with status 1 and an error message when the resolved file does
+// not exist; otherwise returns the object produced by `file(it)`.
+def returnFile(it) {
+  // Resolve once and reuse for both the existence check and the return value.
+  def resolved = file(it)
+  if (!resolved.exists()) {
+    exit 1, "Missing file in TSV file: ${it}, see --help for more information"
+  }
+  return resolved
+}