Skip to content

Commit

Permalink
CW-1920-Annotation-file-types
Browse files Browse the repository at this point in the history
  • Loading branch information
sarahjeeeze committed May 31, 2023
1 parent 2b95323 commit ab275b6
Show file tree
Hide file tree
Showing 11 changed files with 172 additions and 62 deletions.
58 changes: 52 additions & 6 deletions .gitlab-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,22 @@ macos-run:
- x86

docker-run:
artifacts:
when: always
paths:
- ${CI_PROJECT_NAME}
- .nextflow.log
exclude:
- ${CI_PROJECT_NAME}/**/*.gtf
- ${CI_PROJECT_NAME}/**/*.gtf.gz
- ${CI_PROJECT_NAME}/**/*.gff3
- ${CI_PROJECT_NAME}/**/*.gff3.gz
- ${CI_PROJECT_NAME}/**/*.gff
- ${CI_PROJECT_NAME}/**/*.gff.gz
- ${CI_PROJECT_NAME}/**/*.fna
- ${CI_PROJECT_NAME}/**/*.fasta
- ${CI_PROJECT_NAME}/**/*.mmi


# Define a 1D job matrix to inject a variable named MATRIX_NAME into
# the CI environment, we can use the value of MATRIX_NAME to determine
Expand All @@ -27,7 +43,9 @@ docker-run:
parallel:
matrix:
- MATRIX_NAME: [
"fusions", "differential_expression", "only_differential_expression", "isoforms"
"fusions", "differential_expression", "isoforms",
"only_differential_expression", "differential_expression_gff3",
"ncbi_gzip"
]
rules:
# NOTE As we're overriding the rules block for the included docker-run
Expand All @@ -40,14 +58,14 @@ docker-run:
NF_BEFORE_SCRIPT: wget -O test_data.tar.gz https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-isoforms/wf-isoforms_test_data.tar.gz && tar -xzvf test_data.tar.gz
NF_WORKFLOW_OPTS: "--fastq ERR6053095_chr20.fastq --transcriptome-source reference-guided \
--ref_genome chr20/hg38_chr20.fa --ref_annotation chr20/gencode.v22.annotation.chr20.gtf"
NF_IGNORE_PROCESSES: preprocess_reads,merge_transcriptomes
NF_IGNORE_PROCESSES: preprocess_reads,merge_transcriptomes,decompress_annotation,decompress_ref
- if: $MATRIX_NAME == "fusions"
variables:
NF_BEFORE_SCRIPT: wget -O test_data.tar.gz https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-isoforms/wf-isoforms_test_data.tar.gz && tar -xzvf test_data.tar.gz
NF_WORKFLOW_OPTS: "--fastq ERR6053095_chr20.fastq --transcriptome-source reference-guided \
--ref_genome chr20/hg38_chr20.fa --ref_annotation chr20/gencode.v22.annotation.chr20.gtf \
--jaffal_refBase chr20/ --jaffal_genome hg38_chr20 --jaffal_annotation genCode22"
NF_IGNORE_PROCESSES: preprocess_reads,merge_transcriptomes
NF_IGNORE_PROCESSES: preprocess_reads,merge_transcriptomes,decompress_annotation,decompress_ref
- if: $MATRIX_NAME == "differential_expression"
variables:
NF_BEFORE_SCRIPT: wget -O differential_expression.tar.gz https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-isoforms/differential_expression.tar.gz && tar -xzvf differential_expression.tar.gz
Expand All @@ -56,18 +74,46 @@ docker-run:
--ref_genome differential_expression/hg38_chr20.fa --transcriptome-source reference-guided \
--ref_annotation differential_expression/gencode.v22.annotation.chr20.gtf \
--direct_rna --minimap_index_opts \\-k15"
NF_IGNORE_PROCESSES: preprocess_reads,merge_transcriptomes
NF_IGNORE_PROCESSES: preprocess_reads,merge_transcriptomes,decompress_annotation,decompress_ref
- if: $MATRIX_NAME == "only_differential_expression"
variables:
NF_BEFORE_SCRIPT: wget -O differential_expression.tar.gz https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-isoforms/differential_expression.tar.gz && tar -xzvf differential_expression.tar.gz
NF_WORKFLOW_OPTS: "--fastq differential_expression/differential_expression_fastq \
--transcriptome-source precomputed \
--de_analysis \
--ref_genome differential_expression/hg38_chr20.fa \
--ref_annotation differential_expression/gencode.v22.annotation.chr20.gtf \
--ref_annotation differential_expression/gencode.v22.annotation.chr20.gff \
--direct_rna --minimap_index_opts \\-k15 \
--ref_transcriptome differential_expression/ref_transcriptome.fasta \
--transcriptome_assembly false"
NF_IGNORE_PROCESSES: >
preprocess_reads,merge_transcriptomes,assemble_transcripts,
preprocess_reads,merge_transcriptomes,assemble_transcripts,decompress_annotation,decompress_ref,
build_minimap_index,get_transcriptome,merge_gff_bundles,run_gffcompare,build_minimap_index,split_bam
- if: $MATRIX_NAME == "differential_expression_gff3"
variables:
NF_BEFORE_SCRIPT: wget -O differential_expression.tar.gz https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-isoforms/differential_expression.tar.gz && tar -xzvf differential_expression.tar.gz
NF_WORKFLOW_OPTS: "--fastq differential_expression/differential_expression_fastq \
--transcriptome-source precomputed \
--de_analysis \
--ref_genome differential_expression/hg38_chr20.fa \
--ref_annotation differential_expression/gencode.v22.annotation.chr20.gff3 \
--direct_rna --minimap_index_opts \\-k15 \
--ref_transcriptome differential_expression/ref_transcriptome.fasta \
--transcriptome_assembly false"
NF_IGNORE_PROCESSES: >
preprocess_reads,merge_transcriptomes,assemble_transcripts,decompress_annotation,decompress_ref,
build_minimap_index,get_transcriptome,merge_gff_bundles,run_gffcompare,build_minimap_index,split_bam
- if: $MATRIX_NAME == "ncbi_gzip"
variables:
NF_BEFORE_SCRIPT: wget -O differential_expression.tar.gz https://ont-exd-int-s3-euwst1-epi2me-labs.s3.amazonaws.com/wf-isoforms/differential_expression.tar.gz && tar -xzvf differential_expression.tar.gz
NF_WORKFLOW_OPTS: "-executor.$$local.memory 16GB \
--fastq differential_expression/differential_expression_fastq \
--transcriptome-source precomputed \
--de_analysis \
--ref_genome differential_expression/GRCh38.p14.NCBI_test.fna.gz \
--ref_annotation differential_expression/GRCh38.p14_NCBI_test.gtf.gz \
--direct_rna --minimap_index_opts \\-w25 \
--transcriptome_assembly false"
NF_IGNORE_PROCESSES: >
preprocess_reads,merge_transcriptomes,assemble_transcripts,
build_minimap_index,get_transcriptome,merge_gff_bundles,run_gffcompare,build_minimap_index,split_bam
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,12 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [v0.1.12]
### Added
- Handling for GFF3 reference_annotation file type.
- Handling gzip input reference and annotation parameters.
- Handling for NCBI gtfs that contain some empty transcript ID fields.

## [v0.1.11]
### Changed
- LICENSE to Oxford Nanopore Technologies PLC. Public License Version 1.0.
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ Differential gene expression is sensitive to the input data quantity and quality
- Directory containing cDNA/direct RNA reads. Or a directory containing subdirectories each with reads from different samples
(in fastq/fastq.gz format)
- Reference genome in fasta format (required for reference-based assembly).
- Optional reference annotation in GFF2/3 format (required for differential expression analysis `--de_analysis`).
- Optional reference annotation in GFF2/3 format (extensions allowed are .gtf(.gz), .gff(.gz), .gff3(.gz)) (required for differential expression analysis `--de_analysis`).
- For fusion detection, JAFFAL reference files (see Quickstart)


Expand Down
7 changes: 4 additions & 3 deletions bin/de_analysis.R
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,9 @@ coldata$condition <- factor(coldata$condition, levels=rev(levels(coldata$conditi
de_params <- read.csv("de_analysis/de_params.tsv", sep="\t", stringsAsFactors=FALSE)

cat("Loading annotation database.\n")
#txdb <- makeTxDbFromGFF(de_params$Annotation[[1]])
txdb <- makeTxDbFromGFF("annotation.gtf")

annotationtype <- de_params$annotation_type[[1]]
txdb <- makeTxDbFromGFF("annotation.gtf", format = annotationtype)
txdf <- select(txdb, keys(txdb,"GENEID"), "TXNAME", "GENEID")
tab <- table(txdf$GENEID)
txdf$ntx<- tab[match(txdf$GENEID, names(tab))]
Expand All @@ -42,7 +43,7 @@ rownames(txdf) <- NULL
counts<-data.frame(gene_id=txdf$GENEID, feature_id=txdf$TXNAME, cts)

cat("Filtering counts using DRIMSeq.\n")
print(coldata)

d <- dmDSdata(counts=counts, samples=coldata)
trs_cts_unfiltered <- counts(d)

Expand Down
29 changes: 15 additions & 14 deletions bin/workflow_glue/de_plots.py
Original file line number Diff line number Diff line change
Expand Up @@ -305,30 +305,31 @@ def salmon_table(salmon_counts, section):


def get_translations(gtf):
"""Create dic with gene_name and gene_references."""
"""Create dict with gene_name and gene_references."""
fn = open(gtf).readlines()
gene_txid = {}
gene_geid = {}

def get_feature(row, feature):
return row.split(feature)[1].split(
";")[0].replace('=', '').replace("\"", "").strip()

for i in fn:
if i.startswith("#"):
continue
try:
gene_name = i.split("gene_name")[1].split(";")[0]
except Exception:
gene_name = i.split("gene_id")[1].split(";")[0]
gene_name = get_feature(i, 'gene_name')
except IndexError:
gene_name = get_feature(i, 'gene_id')
try:
gene_reference = i.split("ref_gene_id")[1].split(";")[0]
except Exception:
gene_reference = i.split("gene_id")[1].split(";")[0]
gene_reference = get_feature(i, 'ref_gene_id')
except IndexError:
gene_reference = get_feature(i, 'gene_id')
try:
transcript_id = i.split("transcript_id")[1].split(";")[0]
except Exception:
transcript_id = get_feature(i, 'transcript_id')
except IndexError:
transcript_id = "unknown"
transcript_id = transcript_id.replace("\"", "").strip()
gene_id = i.split("gene_id")[1].split(";")[0]
gene_id = gene_id.replace("\"", "").strip()
gene_name = gene_name.replace("\"", "").strip()
gene_reference = gene_reference.replace("\"", "").strip()
gene_id = get_feature(i, 'gene_id')
gene_txid[transcript_id] = gene_name
gene_geid[gene_id] = gene_reference
return gene_txid, gene_geid
Expand Down
2 changes: 1 addition & 1 deletion docs/intro.md
Original file line number Diff line number Diff line change
Expand Up @@ -70,5 +70,5 @@ Differential gene expression is sensitive to the input data quantity and quality
- Directory containing cDNA/direct RNA reads. Or a directory containing subdirectories each with reads from different samples
(in fastq/fastq.gz format)
- Reference genome in fasta format (required for reference-based assembly).
- Optional reference annotation in GFF2/3 format (required for differential expression analysis `--de_analysis`).
- Optional reference annotation in GFF2/3 format (extensions allowed are .gtf(.gz), .gff(.gz), .gff3(.gz)) (required for differential expression analysis `--de_analysis`).
- For fusion detection, JAFFAL reference files (see Quickstart)
85 changes: 70 additions & 15 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,47 @@ process getParams {
"""
}


process decompress_ref {
label "isoforms"
cpus 1
input:
path compressed_ref
output:
path "${compressed_ref.baseName}", emit: decompressed_ref
"""
gzip -df ${compressed_ref}
"""
}


process decompress_annotation {
label "isoforms"
cpus 1
input:
path compressed_annotation
output:
path "${compressed_annotation.baseName}"
"""
gzip -df ${compressed_annotation}
"""
}

// Remove empty transcript ID fields
process preprocess_ref_annotation {
label "isoforms"
cpus 1
input:
path ref_annotation
output:
path "ammended.${ref_annotation}"
"""
sed -i -e 's/transcript_id "";//g' ${ref_annotation}
mv ${ref_annotation} "ammended.${ref_annotation}"
"""
}


process preprocess_reads {
/*
Concatenate reads from a sample directory.
Expand Down Expand Up @@ -157,12 +198,12 @@ process assemble_transcripts{
cpus params.threads

input:
tuple val(sample_id), path(bam)
path ref_annotation
tuple val(sample_id), path(bam), path(ref_annotation)
val use_ref_ann
output:
tuple val(sample_id), path('*.gff'), emit: gff_bundles
script:
def G_FLAG = ref_annotation.name.startsWith('OPTIONAL_FILE') ? '' : "-G ${ref_annotation}"
def G_FLAG = use_ref_ann == false ? '' : "-G ${ref_annotation}"
def prefix = bam.name.split(/\./)[0]

"""
Expand Down Expand Up @@ -311,7 +352,7 @@ process makeReport {
dereport=""
else
dereport="--de_report true --de_stats "seqkit/*""
mv de_report/*.gtf de_report/stringtie_merged.gtf
mv de_report/*.g*f* de_report/stringtie_merged.gtf
fi
if [ -f "gff_annotation/OPTIONAL_FILE" ]; then
OPT_GFF=""
Expand Down Expand Up @@ -411,7 +452,24 @@ workflow pipeline {
jaffal_annotation
condition_sheet
ref_transcriptome
use_ref_ann
main:
if (params.ref_genome.toLowerCase().endsWith("gz")) {
// gzipped ref not supported by some downstream tools
// easier to just decompress and pass it around.
ref_genome = decompress_ref(ref_genome)
}else {
ref_genome = Channel.fromPath(ref_genome)
}
if (params.ref_annotation.toLowerCase().endsWith("gz")) {
// gzipped ref not supported by some downstream tools
// easier to just decompress and pass it around.
decompress_annot= decompress_annotation(ref_annotation)
ref_annotation = preprocess_ref_annotation(decompress_annot)
}else {
ref_annotation = preprocess_ref_annotation(ref_annotation)
}

fastq_ingress_results = reads
// replace `null` with path to optional file
| map { [ it[0], it[1] ?: OPTIONAL_FILE, it[2] ?: OPTIONAL_FILE ] }
Expand Down Expand Up @@ -471,13 +529,10 @@ workflow pipeline {
assembly_stats = assembly.stats.map{ it -> it[1]}.collect()

split_bam(assembly.bam)

assemble_transcripts(split_bam.out.bundles.flatMap(map_sample_ids_cls), ref_annotation)

merge_gff_bundles(assemble_transcripts.out.gff_bundles.groupTuple())

use_ref_ann = !ref_annotation.name.startsWith('OPTIONAL_FILE')
assemble_transcripts(split_bam.out.bundles.flatMap(map_sample_ids_cls).combine(ref_annotation),use_ref_ann)

merge_gff_bundles(assemble_transcripts.out.gff_bundles.groupTuple())
run_gffcompare(merge_gff_bundles.out.gff, ref_annotation)

if (params.transcriptome_source == "denovo"){
Expand All @@ -486,7 +541,7 @@ workflow pipeline {
}else {
// For reference based assembly, there is only one reference
// So map this reference to all sample_ids
seq_for_transcriptome_build = sample_ids.flatten().combine(Channel.fromPath(params.ref_genome))
seq_for_transcriptome_build = sample_ids.flatten().combine(ref_genome)
}
get_transcriptome(
merge_gff_bundles.out.gff
Expand Down Expand Up @@ -521,7 +576,7 @@ workflow pipeline {
}
else {
transcriptome = ref_transcriptome
gtf = Channel.fromPath(ref_annotation)
gtf = ref_annotation
}
check_match = Channel.fromPath(params.condition_sheet)
check_condition_sheet = check_match.splitCsv(header: true).map{ row -> tuple(
Expand Down Expand Up @@ -634,7 +689,6 @@ workflow {
}else {
ref_genome = file("$projectDir/data/OPTIONAL_FILE")
}

if (params.transcriptome_source == "denovo" && params.ref_annotation) {
error = "Reference annotation with de denovo assembly is not supported"
}
Expand All @@ -645,8 +699,10 @@ workflow {
if (!ref_annotation.exists()) {
error = "--ref_annotation: File doesn't exist, check path."
}
use_ref_ann = true
}else{
ref_annotation = file("$projectDir/data/OPTIONAL_FILE")
ref_annotation= file("$projectDir/data/OPTIONAL_FILE")
use_ref_ann = false
}
if (params.jaffal_refBase){
jaffal_refBase = file(params.jaffal_refBase, type: "dir")
Expand All @@ -666,7 +722,6 @@ workflow {
error = "You must provide a reference annotation."
}
if (!params.condition_sheet){

error = "You must provide a condition_sheet or set de_analysis to false."
}
condition_sheet = file(params.condition_sheet, type:"file")
Expand All @@ -686,7 +741,7 @@ workflow {

pipeline(reads, ref_genome, ref_annotation,
jaffal_refBase, params.jaffal_genome, params.jaffal_annotation,
condition_sheet, ref_transcriptome)
condition_sheet, ref_transcriptome, use_ref_ann)

output(pipeline.out.results)
}
Expand Down
2 changes: 1 addition & 1 deletion nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ manifest {
description = 'Transcriptome analysis including gene fusions, differential expression as well as assembly and annotation of cDNA and direct RNA sequencing data.'
mainScript = 'main.nf'
nextflowVersion = '>=20.10.0'
version = 'v0.1.11'
version = 'v0.1.12'
}

executor {
Expand Down
Loading

0 comments on commit ab275b6

Please sign in to comment.