From ab275b64403e0ea179047ae61e60ded3d5d87884 Mon Sep 17 00:00:00 2001 From: Sarah Griffiths Date: Wed, 31 May 2023 20:48:35 +0000 Subject: [PATCH] CW-1920-Annotation-file-types --- .gitlab-ci.yml | 58 +++++++++++++++-- CHANGELOG.md | 6 ++ README.md | 2 +- bin/de_analysis.R | 7 +- bin/workflow_glue/de_plots.py | 29 +++++---- docs/intro.md | 2 +- main.nf | 85 ++++++++++++++++++++----- nextflow.config | 2 +- nextflow_schema.json | 4 +- subworkflows/differential_expression.nf | 33 +++++----- subworkflows/reference_assembly.nf | 6 +- 11 files changed, 172 insertions(+), 62 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 78fd841..90c7463 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -18,6 +18,22 @@ macos-run: - x86 docker-run: + artifacts: + when: always + paths: + - ${CI_PROJECT_NAME} + - .nextflow.log + exclude: + - ${CI_PROJECT_NAME}/**/*.gtf + - ${CI_PROJECT_NAME}/**/*.gtf.gz + - ${CI_PROJECT_NAME}/**/*.gff3 + - ${CI_PROJECT_NAME}/**/*.gff3.gz + - ${CI_PROJECT_NAME}/**/*.gff + - ${CI_PROJECT_NAME}/**/*.gff.gz + - ${CI_PROJECT_NAME}/**/*.fna + - ${CI_PROJECT_NAME}/**/*.fasta + - ${CI_PROJECT_NAME}/**/*.mmi + # Define a 1D job matrix to inject a variable named MATRIX_NAME into # the CI environment, we can use the value of MATRIX_NAME to determine @@ -27,7 +43,9 @@ docker-run: parallel: matrix: - MATRIX_NAME: [ - "fusions", "differential_expression", "only_differential_expression", "isoforms" + "fusions", "differential_expression", "isoforms", + "only_differential_expression", "differential_expression_gff3", + "ncbi_gzip" ] rules: # NOTE As we're overriding the rules block for the included docker-run @@ -40,14 +58,14 @@ docker-run: NF_BEFORE_SCRIPT: wget -O test_data.tar.gz https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-isoforms/wf-isoforms_test_data.tar.gz && tar -xzvf test_data.tar.gz NF_WORKFLOW_OPTS: "--fastq ERR6053095_chr20.fastq --transcriptome-source reference-guided \ --ref_genome chr20/hg38_chr20.fa --ref_annotation chr20/gencode.v22.annotation.chr20.gtf" - NF_IGNORE_PROCESSES: preprocess_reads,merge_transcriptomes + NF_IGNORE_PROCESSES: preprocess_reads,merge_transcriptomes,decompress_annotation,decompress_ref - if: $MATRIX_NAME == "fusions" variables: NF_BEFORE_SCRIPT: wget -O test_data.tar.gz https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-isoforms/wf-isoforms_test_data.tar.gz && tar -xzvf test_data.tar.gz NF_WORKFLOW_OPTS: "--fastq ERR6053095_chr20.fastq --transcriptome-source reference-guided \ --ref_genome chr20/hg38_chr20.fa --ref_annotation chr20/gencode.v22.annotation.chr20.gtf \ --jaffal_refBase chr20/ --jaffal_genome hg38_chr20 --jaffal_annotation genCode22" - NF_IGNORE_PROCESSES: preprocess_reads,merge_transcriptomes + NF_IGNORE_PROCESSES: preprocess_reads,merge_transcriptomes,decompress_annotation,decompress_ref - if: $MATRIX_NAME == "differential_expression" variables: NF_BEFORE_SCRIPT: wget -O differential_expression.tar.gz https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-isoforms/differential_expression.tar.gz && tar -xzvf differential_expression.tar.gz @@ -56,7 +74,7 @@ docker-run: --ref_genome differential_expression/hg38_chr20.fa --transcriptome-source reference-guided \ --ref_annotation differential_expression/gencode.v22.annotation.chr20.gtf \ --direct_rna --minimap_index_opts \\-k15" - NF_IGNORE_PROCESSES: preprocess_reads,merge_transcriptomes + NF_IGNORE_PROCESSES: preprocess_reads,merge_transcriptomes,decompress_annotation,decompress_ref - if: $MATRIX_NAME == "only_differential_expression" variables: NF_BEFORE_SCRIPT: wget -O differential_expression.tar.gz https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-isoforms/differential_expression.tar.gz && tar -xzvf differential_expression.tar.gz @@ -64,10 +82,38 @@ docker-run: --transcriptome-source precomputed \ --de_analysis \ --ref_genome differential_expression/hg38_chr20.fa \ - --ref_annotation differential_expression/gencode.v22.annotation.chr20.gtf \ + --ref_annotation differential_expression/gencode.v22.annotation.chr20.gff \ --direct_rna --minimap_index_opts \\-k15 \ --ref_transcriptome differential_expression/ref_transcriptome.fasta \ --transcriptome_assembly false" NF_IGNORE_PROCESSES: > - preprocess_reads,merge_transcriptomes,assemble_transcripts, + preprocess_reads,merge_transcriptomes,assemble_transcripts,decompress_annotation,decompress_ref, + build_minimap_index,get_transcriptome,merge_gff_bundles,run_gffcompare,build_minimap_index,split_bam + - if: $MATRIX_NAME == "differential_expression_gff3" + variables: + NF_BEFORE_SCRIPT: wget -O differential_expression.tar.gz https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-isoforms/differential_expression.tar.gz && tar -xzvf differential_expression.tar.gz + NF_WORKFLOW_OPTS: "--fastq differential_expression/differential_expression_fastq \ + --transcriptome-source precomputed \ + --de_analysis \ + --ref_genome differential_expression/hg38_chr20.fa \ + --ref_annotation differential_expression/gencode.v22.annotation.chr20.gff3 \ + --direct_rna --minimap_index_opts \\-k15 \ + --ref_transcriptome differential_expression/ref_transcriptome.fasta \ + --transcriptome_assembly false" + NF_IGNORE_PROCESSES: > + preprocess_reads,merge_transcriptomes,assemble_transcripts,decompress_annotation,decompress_ref, build_minimap_index,get_transcriptome,merge_gff_bundles,run_gffcompare,build_minimap_index,split_bam + - if: $MATRIX_NAME == "ncbi_gzip" + variables: + NF_BEFORE_SCRIPT: wget -O differential_expression.tar.gz https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-isoforms/differential_expression.tar.gz && tar -xzvf differential_expression.tar.gz + NF_WORKFLOW_OPTS: "-executor.$$local.memory 16GB \ +--fastq differential_expression/differential_expression_fastq \ + --transcriptome-source precomputed \ + --de_analysis \ + --ref_genome differential_expression/GRCh38.p14.NCBI_test.fna.gz \ + --ref_annotation differential_expression/GRCh38.p14_NCBI_test.gtf.gz \ + --direct_rna --minimap_index_opts \\-w25 \ + --transcriptome_assembly false" + NF_IGNORE_PROCESSES: > + preprocess_reads,merge_transcriptomes,assemble_transcripts, + build_minimap_index,get_transcriptome,merge_gff_bundles,run_gffcompare,build_minimap_index,split_bam \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index b10eb6b..e7a9449 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,12 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [v0.1.12] +### Added +- Handling for GFF3 reference_annotation file type. +- Handling gzip input reference and annotation parameters. +- Handling for NCBI gtfs that contain some empty transcript ID fields. + ## [v0.1.11] ### Changed - LICENSE to Oxford Nanopore Technologies PLC. Public License Version 1.0. diff --git a/README.md b/README.md index 34570e9..ed47794 100644 --- a/README.md +++ b/README.md @@ -82,7 +82,7 @@ Differential gene expression is sensitive to the input data quantity and quality - Directory containing cDNA/direct RNA reads. Or a directory containing subdirectories each with reads from different samples (in fastq/fastq.gz format) - Reference genome in fasta format (required for reference-based assembly). -- Optional reference annotation in GFF2/3 format (required for differential expression analysis `--de_analysis`). +- Optional reference annotation in GFF2/3 format (extensions allowed are .gtf(.gz), .gff(.gz), .gff3(.gz)) (required for differential expression analysis `--de_analysis`). - For fusion detection, JAFFAL reference files (see Quickstart) diff --git a/bin/de_analysis.R b/bin/de_analysis.R index 2b32c45..f1a46a1 100755 --- a/bin/de_analysis.R +++ b/bin/de_analysis.R @@ -16,8 +16,9 @@ coldata$condition <- factor(coldata$condition, levels=rev(levels(coldata$conditi de_params <- read.csv("de_analysis/de_params.tsv", sep="\t", stringsAsFactors=FALSE) cat("Loading annotation database.\n") -#txdb <- makeTxDbFromGFF(de_params$Annotation[[1]]) -txdb <- makeTxDbFromGFF("annotation.gtf") + +annotationtype <- de_params$annotation_type[[1]] +txdb <- makeTxDbFromGFF("annotation.gtf", format = annotationtype) txdf <- select(txdb, keys(txdb,"GENEID"), "TXNAME", "GENEID") tab <- table(txdf$GENEID) txdf$ntx<- tab[match(txdf$GENEID, names(tab))] @@ -42,7 +43,7 @@ rownames(txdf) <- NULL counts<-data.frame(gene_id=txdf$GENEID, feature_id=txdf$TXNAME, cts) cat("Filtering counts using DRIMSeq.\n") -print(coldata) + d <- dmDSdata(counts=counts, samples=coldata) trs_cts_unfiltered <- counts(d) diff --git a/bin/workflow_glue/de_plots.py b/bin/workflow_glue/de_plots.py index 0847f1f..98174ca 100755 --- a/bin/workflow_glue/de_plots.py +++ b/bin/workflow_glue/de_plots.py @@ -305,30 +305,31 @@ def salmon_table(salmon_counts, section): def get_translations(gtf): - """Create dic with gene_name and gene_references.""" + """Create dict with gene_name and gene_references.""" fn = open(gtf).readlines() gene_txid = {} gene_geid = {} + + def get_feature(row, feature): + return row.split(feature)[1].split( + ";")[0].replace('=', '').replace("\"", "").strip() + for i in fn: if i.startswith("#"): continue try: - gene_name = i.split("gene_name")[1].split(";")[0] - except Exception: - gene_name = i.split("gene_id")[1].split(";")[0] + gene_name = get_feature(i, 'gene_name') + except IndexError: + gene_name = get_feature(i, 'gene_id') try: - gene_reference = i.split("ref_gene_id")[1].split(";")[0] - except Exception: - gene_reference = i.split("gene_id")[1].split(";")[0] + gene_reference = get_feature(i, 'ref_gene_id') + except IndexError: + gene_reference = get_feature(i, 'gene_id') try: - transcript_id = i.split("transcript_id")[1].split(";")[0] - except Exception: + transcript_id = get_feature(i, 'transcript_id') + except IndexError: transcript_id = "unknown" - transcript_id = transcript_id.replace("\"", "").strip() - gene_id = i.split("gene_id")[1].split(";")[0] - gene_id = gene_id.replace("\"", "").strip() - gene_name = gene_name.replace("\"", "").strip() - gene_reference = gene_reference.replace("\"", "").strip() + gene_id = get_feature(i, 'gene_id') gene_txid[transcript_id] = gene_name gene_geid[gene_id] = gene_reference return gene_txid, gene_geid diff --git a/docs/intro.md b/docs/intro.md index b3a9458..a7c0fa3 100644 --- a/docs/intro.md +++ b/docs/intro.md @@ -70,5 +70,5 @@ Differential gene expression is sensitive to the input data quantity and quality - Directory containing cDNA/direct RNA reads. Or a directory containing subdirectories each with reads from different samples (in fastq/fastq.gz format) - Reference genome in fasta format (required for reference-based assembly). -- Optional reference annotation in GFF2/3 format (required for differential expression analysis `--de_analysis`). +- Optional reference annotation in GFF2/3 format (extensions allowed are .gtf(.gz), .gff(.gz), .gff3(.gz)) (required for differential expression analysis `--de_analysis`). - For fusion detection, JAFFAL reference files (see Quickstart) diff --git a/main.nf b/main.nf index dc8d283..e47fb20 100644 --- a/main.nf +++ b/main.nf @@ -57,6 +57,47 @@ process getParams { """ } + +process decompress_ref { + label "isoforms" + cpus 1 + input: + path compressed_ref + output: + path "${compressed_ref.baseName}", emit: decompressed_ref + """ + gzip -df ${compressed_ref} + """ +} + + +process decompress_annotation { + label "isoforms" + cpus 1 + input: + path compressed_annotation + output: + path "${compressed_annotation.baseName}" + """ + gzip -df ${compressed_annotation} + """ +} + +// Remove empty transcript ID fields +process preprocess_ref_annotation { + label "isoforms" + cpus 1 + input: + path ref_annotation + output: + path "ammended.${ref_annotation}" + """ + sed -i -e 's/transcript_id "";//g' ${ref_annotation} + mv ${ref_annotation} "ammended.${ref_annotation}" + """ +} + + process preprocess_reads { /* Concatenate reads from a sample directory. @@ -157,12 +198,12 @@ process assemble_transcripts{ cpus params.threads input: - tuple val(sample_id), path(bam) - path ref_annotation + tuple val(sample_id), path(bam), path(ref_annotation) + val use_ref_ann output: tuple val(sample_id), path('*.gff'), emit: gff_bundles script: - def G_FLAG = ref_annotation.name.startsWith('OPTIONAL_FILE') ? '' : "-G ${ref_annotation}" + def G_FLAG = use_ref_ann == false ? '' : "-G ${ref_annotation}" def prefix = bam.name.split(/\./)[0] """ @@ -311,7 +352,7 @@ process makeReport { dereport="" else dereport="--de_report true --de_stats "seqkit/*"" - mv de_report/*.gtf de_report/stringtie_merged.gtf + mv de_report/*.g*f* de_report/stringtie_merged.gtf fi if [ -f "gff_annotation/OPTIONAL_FILE" ]; then OPT_GFF="" @@ -411,7 +452,24 @@ workflow pipeline { jaffal_annotation condition_sheet ref_transcriptome + use_ref_ann main: + if (params.ref_genome.toLowerCase().endsWith("gz")) { + // gzipped ref not supported by some downstream tools + // easier to just decompress and pass it around. + ref_genome = decompress_ref(ref_genome) + }else { + ref_genome = Channel.fromPath(ref_genome) + } + if (params.ref_annotation.toLowerCase().endsWith("gz")) { + // gzipped ref not supported by some downstream tools + // easier to just decompress and pass it around. + decompress_annot= decompress_annotation(ref_annotation) + ref_annotation = preprocess_ref_annotation(decompress_annot) + }else { + ref_annotation = preprocess_ref_annotation(ref_annotation) + } + fastq_ingress_results = reads // replace `null` with path to optional file | map { [ it[0], it[1] ?: OPTIONAL_FILE, it[2] ?: OPTIONAL_FILE ] } @@ -471,13 +529,10 @@ workflow pipeline { assembly_stats = assembly.stats.map{ it -> it[1]}.collect() split_bam(assembly.bam) - - assemble_transcripts(split_bam.out.bundles.flatMap(map_sample_ids_cls), ref_annotation) - merge_gff_bundles(assemble_transcripts.out.gff_bundles.groupTuple()) - - use_ref_ann = !ref_annotation.name.startsWith('OPTIONAL_FILE') + assemble_transcripts(split_bam.out.bundles.flatMap(map_sample_ids_cls).combine(ref_annotation),use_ref_ann) + merge_gff_bundles(assemble_transcripts.out.gff_bundles.groupTuple()) run_gffcompare(merge_gff_bundles.out.gff, ref_annotation) if (params.transcriptome_source == "denovo"){ @@ -486,7 +541,7 @@ workflow pipeline { }else { // For reference based assembly, there is only one reference // So map this reference to all sample_ids - seq_for_transcriptome_build = sample_ids.flatten().combine(Channel.fromPath(params.ref_genome)) + seq_for_transcriptome_build = sample_ids.flatten().combine(ref_genome) } get_transcriptome( merge_gff_bundles.out.gff @@ -521,7 +576,7 @@ workflow pipeline { } else { transcriptome = ref_transcriptome - gtf = Channel.fromPath(ref_annotation) + gtf = ref_annotation } check_match = Channel.fromPath(params.condition_sheet) check_condition_sheet = check_match.splitCsv(header: true).map{ row -> tuple( @@ -634,7 +689,6 @@ workflow { }else { ref_genome = file("$projectDir/data/OPTIONAL_FILE") } - if (params.transcriptome_source == "denovo" && params.ref_annotation) { error = "Reference annotation with de denovo assembly is not supported" } @@ -645,8 +699,10 @@ workflow { if (!ref_annotation.exists()) { error = "--ref_annotation: File doesn't exist, check path." } + use_ref_ann = true }else{ - ref_annotation = file("$projectDir/data/OPTIONAL_FILE") + ref_annotation= file("$projectDir/data/OPTIONAL_FILE") + use_ref_ann = false } if (params.jaffal_refBase){ jaffal_refBase = file(params.jaffal_refBase, type: "dir") @@ -666,7 +722,6 @@ workflow { error = "You must provide a reference annotation." } if (!params.condition_sheet){ - error = "You must provide a condition_sheet or set de_analysis to false." } condition_sheet = file(params.condition_sheet, type:"file") @@ -686,7 +741,7 @@ workflow { pipeline(reads, ref_genome, ref_annotation, jaffal_refBase, params.jaffal_genome, params.jaffal_annotation, - condition_sheet, ref_transcriptome) + condition_sheet, ref_transcriptome, use_ref_ann) output(pipeline.out.results) } diff --git a/nextflow.config b/nextflow.config index 985099f..70beb61 100644 --- a/nextflow.config +++ b/nextflow.config @@ -116,7 +116,7 @@ manifest { description = 'Transcriptome analysis including gene fusions, differential expression as well as assembly and annotation of cDNA and direct RNA sequencing data.' mainScript = 'main.nf' nextflowVersion = '>=20.10.0' - version = 'v0.1.11' + version = 'v0.1.12' } executor { diff --git a/nextflow_schema.json b/nextflow_schema.json index 84b1dbd..98731e9 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -49,7 +49,7 @@ "type": "string", "format": "file-path", "demo_data": "${projectDir}/test_data/SIRV_isoforms.gtf", - "description": "A reference annotation in GTF format", + "description": "A reference annotation in GFF2 or GFF3 format (extensions .gtf(.gz), .gff(.gz), .gff3(.gz))", "help_text": "This will be used for guiding the transcriptome assembly and to label transcripts with their corresponding gene identifiers." }, "direct_rna": { @@ -352,7 +352,7 @@ } }, "docs": { - "intro": "## Introduction\n\nThis workflow identifies RNA isoforms using either cDNA or direct RNA (dRNA) \nOxford Nanopore reads.\n\n### Preprocesing\ncDNA reads are initially preprocessed by [pychopper](https://github.com/epi2me-labs/pychopper) \nfor the identification of full-length reads, as well as trimming and orientation correction (This step is omitted for \n direct RNA reads).\n\n\n### Transcript assembly\n\n#### Reference-aided transcript assembly approach\n* Full length reads are mapped to a supplied reference genome using [minimap2](https://github.com/lh3/minimap2)\n* Transcripts are assembled by [stringtie](http://ccb.jhu.edu/software/stringtie) \nin long read mode (with or without a guide reference annotation) to generate the GFF annotation.\n* The annotation generated by the pipeline is compared to the reference annotation. \nusing [gffcompare](http://ccb.jhu.edu/software/stringtie/gffcompare.shtml)\n\n#### de novo-based transcript assembly (experimental!)\n* Sequence clusters are generated using [isONclust2](https://github.com/nanoporetech/isONclust2)\n * If a reference genome is supplied, cluster quality metrics are determined by comparing \n with clusters generated from a minimap2 alignment.\n* A consensus sequence for each cluster is generated using [spoa](https://github.com/rvaser/spoa)\n* Three rounds of polishing using racon and minimap2 to give a final polished CDS for each gene.\n* Full-length reads are then mapped to these polished CDS.\n* Transcripts are assembled by stringtie as for the reference-based approach.\n* __Note__: This approach is currently not supported with direct RNA reads.\n\n### Fusion gene detection\nFusion gene detection is performed using [JAFFA](https://github.com/Oshlack/JAFFA), with the JAFFAL extension for use \nwith ONT long reads. \n\n### Differential expression analysis\n\nDifferential gene expression (DGE) and differential transcript usage (DTU) analyses aim to identify genes and/or transcripts that show statistically altered expression patterns in a studied biological system. The results of the differential analyses are presented in a quantitative format and therefore the degree of change (up or down regulation) between experimental conditions can be calculated for each gene identified.\n\nThese differential analyses work by taking a \u201csnapshot\u201d of mRNA abundance and calculating the relative levels of transcripts and isoforms. In this context, expression corresponds to the number of messenger RNAs (mRNA) measured from each gene isoform within the organism / tissue / culture being investigated. In order to determine expression levels across the whole genome, sequence data specifically targeting the mRNA molecules can be generated.\n\nOxford Nanopore Technologies provides a number of sequencing solutions to allow users to generate the required snapshot of gene expression. This can be achieved by both sequencing the mRNA directly, or via a complementary DNA (cDNA) proxy. In contrast to short read sequencing technologies, entire mRNA transcripts can be captured as single reads. The example data provided with this tutorial is from a study based on the PCR-cDNA kit. This is a robust choice for performing differential transcript usage studies. This kit is suitable for preparation of sequence libraries from low mRNA input quantities. The cDNA population is enriched through PCR with low bias; an important prerequisite for the subsequent statistical analysis.\n\n[Workflow-transcriptomes](https://github.com/epi2me-labs/wf-transcriptomes) includes a subworkflow for DGE and DTU. The first step involves using either a reference alignment or _de novo_ assembly approach to create a set of mRNA sequences per sample. These are merged into a non-redundant transcriptome using [stringtie merge](http://ccb.jhu.edu/software/stringtie). The reads are then aligned to the transcriptome using minimap2 in a splice-aware manner. [Salmon](https://github.com/COMBINE-lab/salmon) is used for transcript quantification, giving per transcript counts and then the following R packages are used for analysis.\n\n### Pre-filtering of quantitative data using DRIMSeq\nDRIMSeq (Nowicka and Robinson (2016)) is used to filter the transcript count data from the salmon analysis. The filter step will be used to select for genes and transcripts that satisfy rules for the number of samples in which a gene or transcript must be observed and minimum threshold levels for the number of observed reads. The parameters used for filtering are defined in the config.yaml file. The default parameters defined for this analysis include\n* min_samps_gene_expr = 3 - a transcript must be mapped to a gene in at least this minimum number of samples for the gene be included in the analysis\n*\tmin_samps_feature_expr = 1 - a transcript must be mapped to an isoform in at least this this minimum number of samples for the gene isoform to be included in the analysis\n*\tmin_gene_expr = 10 - the minimum number of total mapped sequence reads for a gene to be considered expressed\n*\tmin_feature_expr = 3 - the minimum number of total mapped sequence reads for a gene isoform to be considered\n\n### edgeR based differential expression analysis\n+A statistical analysis is first performed using edgeR (Robinson, McCarthy, and Smyth (2010), McCarthy et al. (2012)) to identify the subset of differentially expressed genes. The filtered list of gene counts is used as input. A normalisation factor is calculated for each sequence library (using the default TMM method - please see McCarthy et al. (2012) for further details). The defined experimental design is used to calculate estimates of dispersion for each of the gene features. Statistical tests are calculated using the contrasts defined in the experimental design. The differentially expressed genes are corrected for false discovery (fdr) using the method of Benjamini & Hochberg (Benjamini and Hochberg (1995))\n\n### Differential transcript usage using DEXSeq\nDifferential transcript usage analysis is performed using the R DEXSeq package (Reyes et al. (2013)). Similar to the edgeR package, DEXSeq estimates the variance between the biological replicates and applies generalised linear models for the statistical testing. The key difference is that the DEXSeq method looks for differences at the exon count level. DEXSeq uses the filtered transcript count data prepared earlier in this analysis. \n\n### StageR stage-wise analysis of DGE and DTU\nThe final component of this isoform analysis is a stage-wise statistical test using the R software package `stageR` (Van den Berge and Clement (2018)). stageR uses (1) the raw p-values for DTU from the DEXSeq analysis in the previous section and (2) a false-discovery corrected set of p-values from testing whether individual genes contain at least one exon showing DTU. A hierarchical two-stage statistical testing evaluates the set of genes for DTU.\n\n## Running the workflow\nFor the differential expression analysis section you should have at least 3 repeats for each sample. \nYour fastq data will need to be organised in to 6 directories that represent 3 repeats for each condition. You may also need to provide a condition sheet. \n\n\n## Analysis \nDifferential gene expression is sensitive to the input data quantity and quality. There should be equivalence between samples in the number of sequence reads, mapped reads and quality scores. The sequence and alignment summary plots in the report can be used to assess these metrics. There is also a table that shows the transcript per million(TPM) calculated from the salmon counts. TPM normalizes the data for gene length and then sequencing depth, and makes it easier to compare across samples compared to counts.\n\n### Workflow inputs\n- Directory containing cDNA/direct RNA reads. Or a directory containing subdirectories each with reads from different samples\n (in fastq/fastq.gz format)\n- Reference genome in fasta format (required for reference-based assembly).\n- Optional reference annotation in GFF2/3 format (required for differential expression analysis `--de_analysis`).\n- For fusion detection, JAFFAL reference files (see Quickstart) \n", + "intro": "## Introduction\n\nThis workflow identifies RNA isoforms using either cDNA or direct RNA (dRNA) \nOxford Nanopore reads.\n\n### Preprocesing\ncDNA reads are initially preprocessed by [pychopper](https://github.com/epi2me-labs/pychopper) \nfor the identification of full-length reads, as well as trimming and orientation correction (This step is omitted for \n direct RNA reads).\n\n\n### Transcript assembly\n\n#### Reference-aided transcript assembly approach\n* Full length reads are mapped to a supplied reference genome using [minimap2](https://github.com/lh3/minimap2)\n* Transcripts are assembled by [stringtie](http://ccb.jhu.edu/software/stringtie) \nin long read mode (with or without a guide reference annotation) to generate the GFF annotation.\n* The annotation generated by the pipeline is compared to the reference annotation. \nusing [gffcompare](http://ccb.jhu.edu/software/stringtie/gffcompare.shtml)\n\n#### de novo-based transcript assembly (experimental!)\n* Sequence clusters are generated using [isONclust2](https://github.com/nanoporetech/isONclust2)\n * If a reference genome is supplied, cluster quality metrics are determined by comparing \n with clusters generated from a minimap2 alignment.\n* A consensus sequence for each cluster is generated using [spoa](https://github.com/rvaser/spoa)\n* Three rounds of polishing using racon and minimap2 to give a final polished CDS for each gene.\n* Full-length reads are then mapped to these polished CDS.\n* Transcripts are assembled by stringtie as for the reference-based approach.\n* __Note__: This approach is currently not supported with direct RNA reads.\n\n### Fusion gene detection\nFusion gene detection is performed using [JAFFA](https://github.com/Oshlack/JAFFA), with the JAFFAL extension for use \nwith ONT long reads. \n\n### Differential expression analysis\n\nDifferential gene expression (DGE) and differential transcript usage (DTU) analyses aim to identify genes and/or transcripts that show statistically altered expression patterns in a studied biological system. The results of the differential analyses are presented in a quantitative format and therefore the degree of change (up or down regulation) between experimental conditions can be calculated for each gene identified.\n\nThese differential analyses work by taking a \u201csnapshot\u201d of mRNA abundance and calculating the relative levels of transcripts and isoforms. In this context, expression corresponds to the number of messenger RNAs (mRNA) measured from each gene isoform within the organism / tissue / culture being investigated. In order to determine expression levels across the whole genome, sequence data specifically targeting the mRNA molecules can be generated.\n\nOxford Nanopore Technologies provides a number of sequencing solutions to allow users to generate the required snapshot of gene expression. This can be achieved by both sequencing the mRNA directly, or via a complementary DNA (cDNA) proxy. In contrast to short read sequencing technologies, entire mRNA transcripts can be captured as single reads. The example data provided with this tutorial is from a study based on the PCR-cDNA kit. This is a robust choice for performing differential transcript usage studies. This kit is suitable for preparation of sequence libraries from low mRNA input quantities. The cDNA population is enriched through PCR with low bias; an important prerequisite for the subsequent statistical analysis.\n\n[Workflow-transcriptomes](https://github.com/epi2me-labs/wf-transcriptomes) includes a subworkflow for DGE and DTU. The first step involves using either a reference alignment or _de novo_ assembly approach to create a set of mRNA sequences per sample. These are merged into a non-redundant transcriptome using [stringtie merge](http://ccb.jhu.edu/software/stringtie). The reads are then aligned to the transcriptome using minimap2 in a splice-aware manner. [Salmon](https://github.com/COMBINE-lab/salmon) is used for transcript quantification, giving per transcript counts and then the following R packages are used for analysis.\n\n### Pre-filtering of quantitative data using DRIMSeq\nDRIMSeq (Nowicka and Robinson (2016)) is used to filter the transcript count data from the salmon analysis. The filter step will be used to select for genes and transcripts that satisfy rules for the number of samples in which a gene or transcript must be observed and minimum threshold levels for the number of observed reads. The parameters used for filtering are defined in the config.yaml file. The default parameters defined for this analysis include\n* min_samps_gene_expr = 3 - a transcript must be mapped to a gene in at least this minimum number of samples for the gene be included in the analysis\n*\tmin_samps_feature_expr = 1 - a transcript must be mapped to an isoform in at least this this minimum number of samples for the gene isoform to be included in the analysis\n*\tmin_gene_expr = 10 - the minimum number of total mapped sequence reads for a gene to be considered expressed\n*\tmin_feature_expr = 3 - the minimum number of total mapped sequence reads for a gene isoform to be considered\n\n### edgeR based differential expression analysis\n+A statistical analysis is first performed using edgeR (Robinson, McCarthy, and Smyth (2010), McCarthy et al. (2012)) to identify the subset of differentially expressed genes. The filtered list of gene counts is used as input. A normalisation factor is calculated for each sequence library (using the default TMM method - please see McCarthy et al. (2012) for further details). The defined experimental design is used to calculate estimates of dispersion for each of the gene features. Statistical tests are calculated using the contrasts defined in the experimental design. The differentially expressed genes are corrected for false discovery (fdr) using the method of Benjamini & Hochberg (Benjamini and Hochberg (1995))\n\n### Differential transcript usage using DEXSeq\nDifferential transcript usage analysis is performed using the R DEXSeq package (Reyes et al. (2013)). Similar to the edgeR package, DEXSeq estimates the variance between the biological replicates and applies generalised linear models for the statistical testing. The key difference is that the DEXSeq method looks for differences at the exon count level. DEXSeq uses the filtered transcript count data prepared earlier in this analysis. \n\n### StageR stage-wise analysis of DGE and DTU\nThe final component of this isoform analysis is a stage-wise statistical test using the R software package `stageR` (Van den Berge and Clement (2018)). stageR uses (1) the raw p-values for DTU from the DEXSeq analysis in the previous section and (2) a false-discovery corrected set of p-values from testing whether individual genes contain at least one exon showing DTU. A hierarchical two-stage statistical testing evaluates the set of genes for DTU.\n\n## Running the workflow\nFor the differential expression analysis section you should have at least 3 repeats for each sample. \nYour fastq data will need to be organised in to 6 directories that represent 3 repeats for each condition. You may also need to provide a condition sheet. \n\n\n## Analysis \nDifferential gene expression is sensitive to the input data quantity and quality. There should be equivalence between samples in the number of sequence reads, mapped reads and quality scores. The sequence and alignment summary plots in the report can be used to assess these metrics. There is also a table that shows the transcript per million(TPM) calculated from the salmon counts. TPM normalizes the data for gene length and then sequencing depth, and makes it easier to compare across samples compared to counts.\n\n### Workflow inputs\n- Directory containing cDNA/direct RNA reads. Or a directory containing subdirectories each with reads from different samples\n (in fastq/fastq.gz format)\n- Reference genome in fasta format (required for reference-based assembly).\n- Optional reference annotation in GFF2/3 format (extensions allowed are .gtf(.gz), .gff(.gz), .gff3(.gz)) (required for differential expression analysis `--de_analysis`).\n- For fusion detection, JAFFAL reference files (see Quickstart) \n", "links": "## Useful links\n\n* [nextflow](https://www.nextflow.io/)\n* [docker](https://www.docker.com/products/docker-desktop)\n* [Singularity](https://sylabs.io/singularity/)\n* [racon](https://github.com/isovic/racon)\n* [spoa](https://github.com/rvaser/spoa)\n* [inONclust](https://github.com/ksahlin/isONclust)\n* [isONclust2](https://github.com/nanoporetech/isONclust2)" } } \ No newline at end of file diff --git a/subworkflows/differential_expression.nf b/subworkflows/differential_expression.nf index 8a5998d..a3fda47 100644 --- a/subworkflows/differential_expression.nf +++ b/subworkflows/differential_expression.nf @@ -4,15 +4,14 @@ process count_transcripts { label "isoforms" cpus params.threads input: - tuple val(sample_id), path(bam) - path ref_transcriptome + tuple val(meta), path(bam), path(ref_transcriptome) output: path "*transcript_counts.tsv", emit: counts path "*seqkit.stats", emit: seqkit_stats """ salmon quant --noErrorModel -p "${task.cpus}" -t "${ref_transcriptome}" -l SF -a "${bam}" -o counts - mv counts/quant.sf "${sample_id}.transcript_counts.tsv" - seqkit bam "${bam}" 2> "${sample_id}.seqkit.stats" + mv counts/quant.sf "${meta.alias}.transcript_counts.tsv" + seqkit bam "${bam}" 2> "${meta.alias}.seqkit.stats" """ } @@ -42,6 +41,8 @@ process mergeTPM { process deAnalysis { label "isoforms" + errorStrategy "retry" + maxRetries 1 input: path condition_sheet path merged_tsv @@ -53,13 +54,18 @@ process deAnalysis { path "de_analysis/results_dge.tsv", emit: dge path "de_analysis/results_dexseq.tsv", emit: dexseq path "de_analysis", emit: de_analysis - + script: + // Just try both annotation file type because a .gff extension may be gff2(gtf) or gff3 + String annotation_type = "gtf" + if (task.attempt == 2){ + annotation_type = "gff3" + } """ cp $annotation annotation.gtf echo \$(realpath annotation.gtf) - echo Annotation\$'\t'min_samps_gene_expr\$'\t'min_samps_feature_expr\$'\t'min_gene_expr\$'\t'min_feature_expr > params.tsv + echo Annotation\$'\t'min_samps_gene_expr\$'\t'min_samps_feature_expr\$'\t'min_gene_expr\$'\t'min_feature_expr\$'\t'annotation_type > params.tsv echo \$(realpath $params.ref_annotation)\$'\t'$params.min_samps_gene_expr\$'\t'\ - $params.min_samps_feature_expr\$'\t'$params.min_gene_expr\$'\t'$params.min_feature_expr >> params.tsv + $params.min_samps_feature_expr\$'\t'$params.min_gene_expr\$'\t'$params.min_feature_expr\$'\t'$annotation_type >> params.tsv mkdir merged mkdir de_analysis mv $merged_tsv merged/all_counts.tsv @@ -99,7 +105,7 @@ process build_minimap_index_transcriptome{ input: path reference output: - path "genome_index.mmi", emit: index + tuple path("genome_index.mmi"), path(reference), emit: index script: """ minimap2 -t "${task.cpus}" ${params.minimap_index_opts} -I 1000G -d "genome_index.mmi" "${reference}" @@ -118,16 +124,13 @@ process map_transcriptome{ cpus params.threads input: - tuple val(meta), path (fastq_reads) - file index - file transcript_reference + tuple val(meta), path (fastq_reads), path(index) output: - tuple val("${meta.alias}"), path("${meta.alias}_reads_aln_sorted.bam"), emit: bam + tuple val(meta), path("${meta.alias}_reads_aln_sorted.bam"), emit: bam """ minimap2 -t ${task.cpus} -ax splice -uf -p 1.0 "${index}" "${fastq_reads}" \ | samtools view -Sb > "output.bam" samtools sort -@ ${task.cpus} "output.bam" -o "${meta.alias}_reads_aln_sorted.bam" - samtools index "${meta.alias}_reads_aln_sorted.bam" """ } @@ -140,8 +143,8 @@ workflow differential_expression { ref_annotation main: t_index = build_minimap_index_transcriptome(ref_transcriptome) - mapped = map_transcriptome(full_len_reads, t_index, ref_transcriptome) - count_transcripts(mapped.bam, ref_transcriptome) + mapped = map_transcriptome(full_len_reads.combine(t_index)) + count_transcripts(mapped.bam.combine(t_index.map{ mmi, reference -> reference})) merged = mergeCounts(count_transcripts.out.counts.collect()) merged_TPM = mergeTPM(count_transcripts.out.counts.collect()) analysis = deAnalysis(condition_sheet, merged, ref_annotation) diff --git a/subworkflows/reference_assembly.nf b/subworkflows/reference_assembly.nf index 76a8c15..215af54 100644 --- a/subworkflows/reference_assembly.nf +++ b/subworkflows/reference_assembly.nf @@ -8,9 +8,7 @@ process map_reads{ cpus params.threads input: - path index - path reference - tuple val(sample_id), path (fastq_reads) + tuple val(sample_id), path (fastq_reads), path(index), path(reference) output: tuple val(sample_id), path("${sample_id}_reads_aln_sorted.bam"), emit: bam @@ -40,7 +38,7 @@ workflow reference_assembly { reference fastq_reads main: - map_reads(index, reference, fastq_reads) + map_reads(fastq_reads.combine(index).combine(reference)) emit: bam = map_reads.out.bam stats = map_reads.out.stats