diff --git a/workflow/Snakefile b/workflow/Snakefile index 23b642b..1fdc0a5 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -37,8 +37,8 @@ rule combine_outputs_split: "logs/emu/combined_split.log", conda: "envs/dplyr.yaml" - shell: - "Rscript --vanilla workflow/scripts/combine_outputs.R {input} {output} 2>&1 | tee {log}" + script: + "scripts/combine_outputs.R" rule abundance: input: diff --git a/workflow/scripts/combine_outputs.R b/workflow/scripts/combine_outputs.R index abaf0de..1d44b22 100644 --- a/workflow/scripts/combine_outputs.R +++ b/workflow/scripts/combine_outputs.R @@ -1,30 +1,34 @@ # Get list of files from command line library(dplyr) -args <- commandArgs(trailingOnly = TRUE) -infiles <- args[1:length(args)-1] -outfile <- args[length(args)] -# Read all TSV files into dataframes and combine them -samples_names <- gsub(".tsv", "", basename(infiles)) -process_file <- function(infile) { - sample_name <- gsub(".tsv", "", basename(infile)) - df <- infile %>% - read.table(, header = TRUE, sep = "\t", stringsAsFactors = FALSE) %>% - select(-abundance) - # Replace estimated.counts with sample name - colnames(df)[grep("estimated.counts", colnames(df))] <- sample_name - df -} -dfs <- lapply(infiles, process_file) +run <- function(infiles, outfile){ + # Read all TSV files into dataframes and combine them + samples_names <- gsub(".tsv", "", basename(infiles)) + process_file <- function(infile) { + sample_name <- gsub(".tsv", "", basename(infile)) + df <- infile %>% + read.table(, header = TRUE, sep = "\t", stringsAsFactors = FALSE) %>% + select(-abundance) + # Replace estimated.counts with sample name + colnames(df)[grep("estimated.counts", colnames(df))] <- sample_name + df + } + dfs <- lapply(infiles, process_file) -# Combine all dataframes into one using left_join with all columns that are not samples_names as keys -taxa_cols <- setdiff(colnames(dfs[[1]]), samples_names) -combined_df <- do.call(\(x, y) dplyr::full_join(x, y, by = taxa_cols), dfs) -# 
Replace NA with 0 for every column in samples_names -replace_na <- function(x) { - x[is.na(x)] <- 0 - x + # Combine all dataframes into one by folding full_join pairwise over the list (any number of samples), using all columns that are not samples_names as keys + taxa_cols <- setdiff(colnames(dfs[[1]]), samples_names) + combined_df <- Reduce(\(x, y) dplyr::full_join(x, y, by = taxa_cols), dfs) + # Replace NA with 0 for every column in samples_names + replace_na <- function(x) { + x[is.na(x)] <- 0 + x + } + combined_df[samples_names] <- lapply(combined_df[samples_names], replace_na) + # Write combined dataframe to file + write.table(combined_df, outfile, sep = "\t", row.names = FALSE) } -combined_df[samples_names] <- lapply(combined_df[samples_names], replace_na) -# Write combined dataframe to file -write.table(combined_df, outfile, sep = "\t", row.names = FALSE) \ No newline at end of file + +run( + as.character(snakemake@input), + snakemake@output[[1]] +) \ No newline at end of file