CW-1920-Annotation-file-types

epi2me-labs · May 31, 2023 · ab275b6 · ab275b6
1 parent 2b95323
commit ab275b6
Show file tree

Hide file tree

Showing 11 changed files with 172 additions and 62 deletions.
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
@@ -18,6 +18,22 @@ macos-run:
     - x86
 
 docker-run:
+    artifacts:
+        when: always
+        paths:
+            - ${CI_PROJECT_NAME}
+            - .nextflow.log
+        exclude:
+            - ${CI_PROJECT_NAME}/**/*.gtf
+            - ${CI_PROJECT_NAME}/**/*.gtf.gz
+            - ${CI_PROJECT_NAME}/**/*.gff3
+            - ${CI_PROJECT_NAME}/**/*.gff3.gz
+            - ${CI_PROJECT_NAME}/**/*.gff
+            - ${CI_PROJECT_NAME}/**/*.gff.gz
+            - ${CI_PROJECT_NAME}/**/*.fna
+            - ${CI_PROJECT_NAME}/**/*.fasta
+            - ${CI_PROJECT_NAME}/**/*.mmi
+
 
     # Define a 1D job matrix to inject a variable named MATRIX_NAME into
     #   the CI environment, we can use the value of MATRIX_NAME to determine
@@ -27,7 +43,9 @@ docker-run:
     parallel:
         matrix:
             - MATRIX_NAME: [
-                "fusions", "differential_expression", "only_differential_expression", "isoforms"
+                "fusions", "differential_expression", "isoforms",
+                "only_differential_expression", "differential_expression_gff3",
+                "ncbi_gzip"
             ]
     rules:
         # NOTE As we're overriding the rules block for the included docker-run
@@ -40,14 +58,14 @@ docker-run:
               NF_BEFORE_SCRIPT: wget -O test_data.tar.gz https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-isoforms/wf-isoforms_test_data.tar.gz && tar -xzvf  test_data.tar.gz
               NF_WORKFLOW_OPTS: "--fastq  ERR6053095_chr20.fastq --transcriptome-source reference-guided \
                   --ref_genome chr20/hg38_chr20.fa --ref_annotation chr20/gencode.v22.annotation.chr20.gtf"
-              NF_IGNORE_PROCESSES: preprocess_reads,merge_transcriptomes
+              NF_IGNORE_PROCESSES: preprocess_reads,merge_transcriptomes,decompress_annotation,decompress_ref
         - if: $MATRIX_NAME == "fusions"
           variables:
               NF_BEFORE_SCRIPT: wget -O test_data.tar.gz https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-isoforms/wf-isoforms_test_data.tar.gz && tar -xzvf  test_data.tar.gz
               NF_WORKFLOW_OPTS: "--fastq  ERR6053095_chr20.fastq --transcriptome-source reference-guided \
                   --ref_genome chr20/hg38_chr20.fa --ref_annotation chr20/gencode.v22.annotation.chr20.gtf \
                   --jaffal_refBase chr20/ --jaffal_genome hg38_chr20 --jaffal_annotation genCode22"
-              NF_IGNORE_PROCESSES: preprocess_reads,merge_transcriptomes
+              NF_IGNORE_PROCESSES: preprocess_reads,merge_transcriptomes,decompress_annotation,decompress_ref
         - if: $MATRIX_NAME == "differential_expression"
           variables:
               NF_BEFORE_SCRIPT: wget -O differential_expression.tar.gz https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-isoforms/differential_expression.tar.gz && tar -xzvf differential_expression.tar.gz
@@ -56,18 +74,46 @@ docker-run:
                   --ref_genome differential_expression/hg38_chr20.fa --transcriptome-source reference-guided \
                 --ref_annotation differential_expression/gencode.v22.annotation.chr20.gtf \
                 --direct_rna --minimap_index_opts \\-k15"
-              NF_IGNORE_PROCESSES: preprocess_reads,merge_transcriptomes
+              NF_IGNORE_PROCESSES: preprocess_reads,merge_transcriptomes,decompress_annotation,decompress_ref
         - if: $MATRIX_NAME == "only_differential_expression"
           variables:
               NF_BEFORE_SCRIPT: wget -O differential_expression.tar.gz https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-isoforms/differential_expression.tar.gz && tar -xzvf differential_expression.tar.gz
               NF_WORKFLOW_OPTS: "--fastq  differential_expression/differential_expression_fastq \
                   --transcriptome-source precomputed \
                   --de_analysis \
                   --ref_genome differential_expression/hg38_chr20.fa \
-                --ref_annotation differential_expression/gencode.v22.annotation.chr20.gtf \
+                --ref_annotation differential_expression/gencode.v22.annotation.chr20.gff \
                 --direct_rna --minimap_index_opts \\-k15 \
                 --ref_transcriptome differential_expression/ref_transcriptome.fasta \
                 --transcriptome_assembly false"
               NF_IGNORE_PROCESSES: >
-                preprocess_reads,merge_transcriptomes,assemble_transcripts,
+                preprocess_reads,merge_transcriptomes,assemble_transcripts,decompress_annotation,decompress_ref,
+                build_minimap_index,get_transcriptome,merge_gff_bundles,run_gffcompare,build_minimap_index,split_bam
+        - if: $MATRIX_NAME == "differential_expression_gff3"
+          variables:
+              NF_BEFORE_SCRIPT: wget -O differential_expression.tar.gz https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-isoforms/differential_expression.tar.gz && tar -xzvf differential_expression.tar.gz
+              NF_WORKFLOW_OPTS: "--fastq  differential_expression/differential_expression_fastq \
+                  --transcriptome-source precomputed \
+                  --de_analysis \
+                  --ref_genome differential_expression/hg38_chr20.fa \
+                --ref_annotation differential_expression/gencode.v22.annotation.chr20.gff3 \
+                --direct_rna --minimap_index_opts \\-k15 \
+                --ref_transcriptome differential_expression/ref_transcriptome.fasta \
+                --transcriptome_assembly false"
+              NF_IGNORE_PROCESSES: >
+                preprocess_reads,merge_transcriptomes,assemble_transcripts,decompress_annotation,decompress_ref,
                 build_minimap_index,get_transcriptome,merge_gff_bundles,run_gffcompare,build_minimap_index,split_bam
+        - if: $MATRIX_NAME == "ncbi_gzip"
+          variables:
+              NF_BEFORE_SCRIPT: wget -O differential_expression.tar.gz https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-isoforms/differential_expression.tar.gz && tar -xzvf differential_expression.tar.gz
+              NF_WORKFLOW_OPTS: "-executor.$$local.memory 16GB \
+--fastq  differential_expression/differential_expression_fastq \
+                  --transcriptome-source precomputed \
+                  --de_analysis \
+                  --ref_genome differential_expression/GRCh38.p14.NCBI_test.fna.gz \
+                --ref_annotation differential_expression/GRCh38.p14_NCBI_test.gtf.gz \
+                --direct_rna --minimap_index_opts \\-w25 \
+               --transcriptome_assembly false"
+              NF_IGNORE_PROCESSES: >
+                preprocess_reads,merge_transcriptomes,assemble_transcripts,
+                build_minimap_index,get_transcriptome,merge_gff_bundles,run_gffcompare,build_minimap_index,split_bam           
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,6 +4,12 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [v0.1.12]
+### Added
+- Handling for GFF3 reference_annotation file type.
+- Handling gzip input reference and annotation parameters.
+- Handling for NCBI gtfs that contain some empty transcript ID fields.
+
 ## [v0.1.11]
 ### Changed
 - LICENSE to Oxford Nanopore Technologies PLC. Public License Version 1.0.

diff --git a/README.md b/README.md
@@ -82,7 +82,7 @@ Differential gene expression is sensitive to the input data quantity and quality
 - Directory containing cDNA/direct RNA reads. Or a directory containing subdirectories each with reads from different samples
   (in fastq/fastq.gz format)
 - Reference genome in fasta format (required for reference-based assembly).
-- Optional reference annotation in GFF2/3 format (required for differential expression analysis `--de_analysis`).
+- Optional reference annotation in GFF2/3 format (extensions allowed are .gtf(.gz), .gff(.gz), .gff3(.gz)) (required for differential expression analysis `--de_analysis`).
 - For fusion detection, JAFFAL reference files (see Quickstart) 
 
 

diff --git a/bin/de_analysis.R b/bin/de_analysis.R
@@ -16,8 +16,9 @@ coldata$condition <- factor(coldata$condition, levels=rev(levels(coldata$conditi
 de_params <- read.csv("de_analysis/de_params.tsv", sep="\t", stringsAsFactors=FALSE)
 
 cat("Loading annotation database.\n")
-#txdb <- makeTxDbFromGFF(de_params$Annotation[[1]])
-txdb <- makeTxDbFromGFF("annotation.gtf")
+
+annotationtype <- de_params$annotation_type[[1]]
+txdb <- makeTxDbFromGFF("annotation.gtf",  format = annotationtype)
 txdf <- select(txdb, keys(txdb,"GENEID"), "TXNAME", "GENEID")
 tab <- table(txdf$GENEID)
 txdf$ntx<- tab[match(txdf$GENEID, names(tab))]
@@ -42,7 +43,7 @@ rownames(txdf) <- NULL
 counts<-data.frame(gene_id=txdf$GENEID, feature_id=txdf$TXNAME, cts)
 
 cat("Filtering counts using DRIMSeq.\n")
-print(coldata)
+
 d <- dmDSdata(counts=counts, samples=coldata)
 trs_cts_unfiltered <- counts(d)
 

diff --git a/bin/workflow_glue/de_plots.py b/bin/workflow_glue/de_plots.py
@@ -305,30 +305,31 @@ def salmon_table(salmon_counts, section):
 
 
 def get_translations(gtf):
-    """Create dic with gene_name and gene_references."""
+    """Create dict with gene_name and gene_references."""
     fn = open(gtf).readlines()
     gene_txid = {}
     gene_geid = {}
+
+    def get_feature(row, feature):
+        return row.split(feature)[1].split(
+            ";")[0].replace('=', '').replace("\"", "").strip()
+
     for i in fn:
         if i.startswith("#"):
             continue
         try:
-            gene_name = i.split("gene_name")[1].split(";")[0]
-        except Exception:
-            gene_name = i.split("gene_id")[1].split(";")[0]
+            gene_name = get_feature(i, 'gene_name')
+        except IndexError:
+            gene_name = get_feature(i, 'gene_id')
         try:
-            gene_reference = i.split("ref_gene_id")[1].split(";")[0]
-        except Exception:
-            gene_reference = i.split("gene_id")[1].split(";")[0]
+            gene_reference = get_feature(i, 'ref_gene_id')
+        except IndexError:
+            gene_reference = get_feature(i, 'gene_id')
         try:
-            transcript_id = i.split("transcript_id")[1].split(";")[0]
-        except Exception:
+            transcript_id = get_feature(i, 'transcript_id')
+        except IndexError:
             transcript_id = "unknown"
-        transcript_id = transcript_id.replace("\"", "").strip()
-        gene_id = i.split("gene_id")[1].split(";")[0]
-        gene_id = gene_id.replace("\"", "").strip()
-        gene_name = gene_name.replace("\"", "").strip()
-        gene_reference = gene_reference.replace("\"", "").strip()
+        gene_id = get_feature(i, 'gene_id')
         gene_txid[transcript_id] = gene_name
         gene_geid[gene_id] = gene_reference
     return gene_txid, gene_geid

diff --git a/docs/intro.md b/docs/intro.md
@@ -70,5 +70,5 @@ Differential gene expression is sensitive to the input data quantity and quality
 - Directory containing cDNA/direct RNA reads. Or a directory containing subdirectories each with reads from different samples
   (in fastq/fastq.gz format)
 - Reference genome in fasta format (required for reference-based assembly).
-- Optional reference annotation in GFF2/3 format (required for differential expression analysis `--de_analysis`).
+- Optional reference annotation in GFF2/3 format (extensions allowed are .gtf(.gz), .gff(.gz), .gff3(.gz)) (required for differential expression analysis `--de_analysis`).
 - For fusion detection, JAFFAL reference files (see Quickstart) 
diff --git a/main.nf b/main.nf
@@ -57,6 +57,47 @@ process getParams {
     """
 }
 
+
+process decompress_ref {
+    label "isoforms"
+    cpus 1
+    input:
+        path compressed_ref
+    output:
+        path "${compressed_ref.baseName}", emit: decompressed_ref
+    """
+    gzip -df ${compressed_ref}
+    """
+}
+
+
+process decompress_annotation {
+    label "isoforms"
+    cpus 1
+    input:
+        path compressed_annotation
+    output:
+        path "${compressed_annotation.baseName}"
+    """
+    gzip -df ${compressed_annotation}
+    """
+}
+
+// Remove empty transcript ID fields
+process preprocess_ref_annotation {
+    label "isoforms"
+    cpus 1
+    input:
+        path ref_annotation
+    output:
+        path "ammended.${ref_annotation}"
+    """
+    sed -i -e 's/transcript_id "";//g' ${ref_annotation}
+    mv ${ref_annotation} "ammended.${ref_annotation}"
+    """
+}
+
+
 process preprocess_reads {
     /*
     Concatenate reads from a sample directory.
@@ -157,12 +198,12 @@ process assemble_transcripts{
     cpus params.threads
 
     input:
-        tuple val(sample_id), path(bam)
-        path ref_annotation
+        tuple val(sample_id), path(bam), path(ref_annotation)
+        val use_ref_ann
     output:
         tuple val(sample_id), path('*.gff'), emit: gff_bundles
     script:
-        def G_FLAG = ref_annotation.name.startsWith('OPTIONAL_FILE') ? '' : "-G ${ref_annotation}"
+        def G_FLAG = use_ref_ann == false ? '' : "-G ${ref_annotation}"
         def prefix =  bam.name.split(/\./)[0]
 
     """
@@ -311,7 +352,7 @@ process makeReport {
         dereport=""
     else
         dereport="--de_report true --de_stats "seqkit/*""
-        mv de_report/*.gtf de_report/stringtie_merged.gtf
+        mv de_report/*.g*f* de_report/stringtie_merged.gtf
     fi
     if [ -f "gff_annotation/OPTIONAL_FILE" ]; then
         OPT_GFF=""
@@ -411,7 +452,24 @@ workflow pipeline {
         jaffal_annotation
         condition_sheet
         ref_transcriptome
+        use_ref_ann
     main:
+        if (params.ref_genome.toLowerCase().endsWith("gz")) {
+            // gzipped ref not supported by some downstream tools
+            // easier to just decompress and pass it around.
+            ref_genome = decompress_ref(ref_genome)
+        }else {
+            ref_genome = Channel.fromPath(ref_genome)
+        }
+        if (params.ref_annotation.toLowerCase().endsWith("gz")) {
+            // gzipped ref not supported by some downstream tools
+            // easier to just decompress and pass it around.
+            decompress_annot= decompress_annotation(ref_annotation)
+            ref_annotation = preprocess_ref_annotation(decompress_annot)
+        }else {
+            ref_annotation = preprocess_ref_annotation(ref_annotation)
+        }
+
         fastq_ingress_results = reads
         // replace `null` with path to optional file
         | map { [ it[0], it[1] ?: OPTIONAL_FILE, it[2] ?: OPTIONAL_FILE ] }
@@ -471,13 +529,10 @@ workflow pipeline {
             assembly_stats = assembly.stats.map{ it -> it[1]}.collect()
 
             split_bam(assembly.bam)
-
-            assemble_transcripts(split_bam.out.bundles.flatMap(map_sample_ids_cls), ref_annotation)
 
-            merge_gff_bundles(assemble_transcripts.out.gff_bundles.groupTuple())
-
-            use_ref_ann = !ref_annotation.name.startsWith('OPTIONAL_FILE')
+            assemble_transcripts(split_bam.out.bundles.flatMap(map_sample_ids_cls).combine(ref_annotation),use_ref_ann)
 
+            merge_gff_bundles(assemble_transcripts.out.gff_bundles.groupTuple())
             run_gffcompare(merge_gff_bundles.out.gff, ref_annotation)
 
             if (params.transcriptome_source == "denovo"){
@@ -486,7 +541,7 @@ workflow pipeline {
             }else {
                 // For reference based assembly, there is only one reference
                 // So map this reference to all sample_ids
-                seq_for_transcriptome_build = sample_ids.flatten().combine(Channel.fromPath(params.ref_genome))
+                seq_for_transcriptome_build = sample_ids.flatten().combine(ref_genome)
             }
             get_transcriptome(
                 merge_gff_bundles.out.gff
@@ -521,7 +576,7 @@ workflow pipeline {
             }
             else {
                 transcriptome = ref_transcriptome
-                gtf = Channel.fromPath(ref_annotation)
+                gtf = ref_annotation
             }
             check_match = Channel.fromPath(params.condition_sheet)
             check_condition_sheet = check_match.splitCsv(header: true).map{ row -> tuple(
@@ -634,7 +689,6 @@ workflow {
     }else {
         ref_genome = file("$projectDir/data/OPTIONAL_FILE")
     }
-
     if (params.transcriptome_source == "denovo" && params.ref_annotation) {
         error = "Reference annotation with de denovo assembly is not supported"
     }
@@ -645,8 +699,10 @@ workflow {
         if (!ref_annotation.exists()) {
             error = "--ref_annotation: File doesn't exist, check path."
         }
+        use_ref_ann = true
     }else{
-        ref_annotation = file("$projectDir/data/OPTIONAL_FILE")
+        ref_annotation= file("$projectDir/data/OPTIONAL_FILE")
+        use_ref_ann = false
     }
     if (params.jaffal_refBase){
         jaffal_refBase = file(params.jaffal_refBase, type: "dir")
@@ -666,7 +722,6 @@ workflow {
             error = "You must provide a reference annotation."
         }
         if (!params.condition_sheet){
-
             error = "You must provide a condition_sheet or set de_analysis to false."
         }
         condition_sheet = file(params.condition_sheet, type:"file")
@@ -686,7 +741,7 @@ workflow {
 
         pipeline(reads, ref_genome, ref_annotation,
             jaffal_refBase, params.jaffal_genome, params.jaffal_annotation,
-            condition_sheet, ref_transcriptome)
+            condition_sheet, ref_transcriptome, use_ref_ann)
 
         output(pipeline.out.results)
     }

diff --git a/nextflow.config b/nextflow.config
@@ -116,7 +116,7 @@ manifest {
     description     = 'Transcriptome analysis including gene fusions, differential expression as well as assembly and annotation of cDNA and direct RNA sequencing data.'
     mainScript      = 'main.nf'
     nextflowVersion = '>=20.10.0'
-    version         = 'v0.1.11'
+    version         = 'v0.1.12'
 }
 
 executor {