diff --git a/.test/config/samples.tsv b/.test/config/samples.tsv index 567f8e6..3e84e6b 100644 --- a/.test/config/samples.tsv +++ b/.test/config/samples.tsv @@ -1,5 +1,5 @@ sample_name condition -A treated -B untreated +A1 treated +B1 untreated A2 treated B2 untreated diff --git a/.test/config/units.tsv b/.test/config/units.tsv index 95a6b57..ed1a602 100644 --- a/.test/config/units.tsv +++ b/.test/config/units.tsv @@ -1,5 +1,5 @@ sample_name unit_name fq1 fq2 sra adapters strandedness A1 1 ngs-test-data/reads/a.scerevisiae.1.fq ngs-test-data/reads/a.scerevisiae.2.fq B1 1 ngs-test-data/reads/c.scerevisiae.1.fq ngs-test-data/reads/c.scerevisiae.2.fq -A2 1 ngs-test-data/reads/a.scerevisiae.1.fq ngs-test-data/reads/a.scerevisiae.2.fq -B2 1 ngs-test-data/reads/c.scerevisiae.1.fq ngs-test-data/reads/c.scerevisiae.2.fq +A2 1 ngs-test-data/reads/c.scerevisiae.1.fq ngs-test-data/reads/c.scerevisiae.2.fq +B2 1 ngs-test-data/reads/b.scerevisiae.1.fq ngs-test-data/reads/b.scerevisiae.2.fq diff --git a/config/config.yaml b/config/config.yaml index d4aacec..9abea28 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -26,12 +26,12 @@ pca: diffexp: # contrasts for the deseq2 results method contrasts: - treated-vs-untreated: - - treated - - untreated + A-vs-B: + - A + - B model: ~condition params: cutadapt-pe: "" cutadapt-se: "" - star: "" + star: "--outSAMtype BAM Unsorted" diff --git a/workflow/envs/biomart.yaml b/workflow/envs/biomart.yaml new file mode 100644 index 0000000..9a4947b --- /dev/null +++ b/workflow/envs/biomart.yaml @@ -0,0 +1,6 @@ +channels: + - bioconda + - conda-forge +dependencies: + - bioconductor-biomart =2.46 + - r-tidyverse =1.3 diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index bf78f6a..3059958 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -17,9 +17,11 @@ samples = ( def get_final_output(): final_output = expand( - "results/diffexp/{contrast}.diffexp.tsv", + "results/diffexp/{contrast}.diffexp.symbol.tsv", contrast=config["diffexp"]["contrasts"], ) + final_output.append("results/deseq2/normcounts.symbol.tsv") + final_output.append("results/counts/all.symbol.tsv") return final_output @@ -144,6 +146,12 @@ def is_activated(xpath): return bool(c.get("activate", False)) +def get_bioc_species_name(): + first_letter = config["ref"]["species"][0] + subspecies = config["ref"]["species"].split("_")[1] + return first_letter + subspecies + + def get_fastqs(wc): if config["trimming"]["activate"]: return expand( diff --git a/workflow/rules/diffexp.smk b/workflow/rules/diffexp.smk index c7128fe..7b9d02d 100644 --- a/workflow/rules/diffexp.smk +++ b/workflow/rules/diffexp.smk @@ -17,6 +17,21 @@ rule count_matrix: "../scripts/count-matrix.py" +rule gene_2_symbol: + input: + counts="{prefix}.tsv", + output: + symbol="{prefix}.symbol.tsv", + params: + species=get_bioc_species_name(), + log: + "logs/gene2symbol/{prefix}.log", + conda: + "../envs/biomart.yaml" + script: + "../scripts/gene2symbol.R" + + rule deseq2_init: input: counts="results/counts/all.tsv", diff --git a/workflow/scripts/deseq2-init.R b/workflow/scripts/deseq2-init.R index e44aac0..03cf4c9 100644 --- a/workflow/scripts/deseq2-init.R +++ b/workflow/scripts/deseq2-init.R @@ -15,7 +15,10 @@ if (snakemake@threads > 1) { # colData and countData must have the same sample order, but this is ensured # by the way we create the count matrix cts <- read.table(snakemake@input[["counts"]], header=TRUE, row.names="gene", check.names=FALSE) +cts <- cts[ , order(names(cts))] + coldata <- read.table(snakemake@params[["samples"]], header=TRUE, row.names="sample_name", check.names=FALSE) +coldata <- coldata[order(row.names(coldata)), , drop=F] dds <- DESeqDataSetFromMatrix(countData=cts, colData=coldata, diff --git a/workflow/scripts/gene2symbol.R b/workflow/scripts/gene2symbol.R new file mode 100644 index 0000000..c14ef85 --- /dev/null +++ b/workflow/scripts/gene2symbol.R @@ -0,0 +1,62 @@ +library(biomaRt) +library(tidyverse) + +# this variable holds a mirror name until +# useEnsembl succeeds ("www" is last, because +# of very frequent "Internal Server Error"s) +mart <- "useast" +rounds <- 0 +while ( class(mart)[[1]] != "Mart" ) { + mart <- tryCatch( + { + # done here, because error function does not + # modify outer scope variables, I tried + if (mart == "www") rounds <- rounds + 1 + # equivalent to useMart, but you can choose + # the mirror instead of specifying a host + biomaRt::useEnsembl( + biomart = "ENSEMBL_MART_ENSEMBL", + dataset = str_c(snakemake@params[["species"]], "_gene_ensembl"), + mirror = mart + ) + }, + error = function(e) { + # change or make configurable if you want more or + # less rounds of tries of all the mirrors + if (rounds >= 3) { + stop( + ) + } + # hop to next mirror + mart <- switch(mart, + useast = "uswest", + uswest = "asia", + asia = "www", + www = { + # wait before starting another round through the mirrors, + # hoping that intermittent problems disappear + Sys.sleep(30) + "useast" + } + ) + } + ) +} + + +df <- read.table(snakemake@input[["counts"]], sep='\t', header=1) + +g2g <- biomaRt::getBM( + attributes = c( "ensembl_gene_id", + "external_gene_name"), + filters = "ensembl_gene_id", + values = df$gene, + mart = mart, + ) + +annotated <- merge(df, g2g, by.x="gene", by.y="ensembl_gene_id") +annotated$gene <- ifelse(annotated$external_gene_name == '', annotated$gene, annotated$external_gene_name) +annotated$external_gene_name <- NULL +write.table(annotated, snakemake@output[["symbol"]], sep='\t', row.names=F) + +