Skip to content

Commit

Permalink
Snakemake script rather than Rscript
Browse files Browse the repository at this point in the history
  • Loading branch information
currocam committed Apr 23, 2024
1 parent 2bb9386 commit 54794ab
Show file tree
Hide file tree
Showing 2 changed files with 31 additions and 27 deletions.
4 changes: 2 additions & 2 deletions workflow/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,8 @@ rule combine_outputs_split:
"logs/emu/combined_split.log",
conda:
"envs/dplyr.yaml"
shell:
"Rscript --vanilla workflow/scripts/combine_outputs.R {input} {output} 2>&1 | tee {log}"
script:
"scripts/combine_outputs.R"

rule abundance:
input:
Expand Down
54 changes: 29 additions & 25 deletions workflow/scripts/combine_outputs.R
Original file line number Diff line number Diff line change
@@ -1,30 +1,34 @@
# Get list of files from command line
library(dplyr)
# Command-line layout: every argument except the last is an input TSV file;
# the last argument is the output path.
args <- commandArgs(trailingOnly = TRUE)
# `1:length(args)-1` parses as `(1:length(args)) - 1` and only worked because
# index 0 is silently dropped; negative indexing states the intent directly.
infiles <- args[-length(args)]
outfile <- args[length(args)]
# Read all TSV files into dataframes and combine them.
# Sample names are the input basenames with a trailing ".tsv" removed. The
# pattern is escaped and anchored so that only a literal ".tsv" suffix is
# stripped (the unanchored ".tsv" regex also removed e.g. "_tsv" mid-name).
samples_names <- sub("\\.tsv$", "", basename(infiles))

# Read one per-sample table and prepare it for joining.
#
# infile: path to a tab-separated file with a header row.
# Returns the table as a data frame without its `abundance` column and with
# every column whose name matches "estimated.counts" renamed to the sample
# name derived from the file name.
process_file <- function(infile) {
  sample_name <- sub("\\.tsv$", "", basename(infile))
  df <- infile %>%
    read.table(header = TRUE, sep = "\t", stringsAsFactors = FALSE) %>%
    select(-abundance)
  # Replace estimated.counts with sample name
  colnames(df)[grep("estimated.counts", colnames(df))] <- sample_name
  df
}
dfs <- lapply(infiles, process_file)
#' Combine per-sample count tables into a single wide table.
#'
#' Each input file is read as a tab-separated table with a header row. Its
#' `abundance` column is dropped and every column whose name matches
#' "estimated.counts" is renamed to the sample name (the file's basename
#' without a trailing ".tsv"). The per-sample tables are then full-joined on
#' all remaining columns, missing counts are set to 0, and the result is
#' written to `outfile` as a tab-separated file without row names.
#'
#' @param infiles Character vector of paths to the per-sample TSV files.
#' @param outfile Path of the combined TSV file to write.
#' @return The value of `write.table()`; called for its side effect.
run <- function(infiles, outfile) {
  if (length(infiles) == 0L) {
    stop("run() needs at least one input file", call. = FALSE)
  }

  # Strip only a literal, trailing ".tsv". The previous unanchored ".tsv"
  # regex treated "." as a wildcard and could eat text inside the name.
  strip_tsv <- function(path) sub("\\.tsv$", "", basename(path))
  samples_names <- strip_tsv(infiles)

  # Read one file and turn its counts column into a column named after the
  # sample.
  process_file <- function(infile) {
    sample_name <- strip_tsv(infile)
    df <- infile %>%
      read.table(header = TRUE, sep = "\t", stringsAsFactors = FALSE) %>%
      select(-abundance)
    # Replace estimated.counts with sample name
    colnames(df)[grep("estimated.counts", colnames(df))] <- sample_name
    df
  }
  dfs <- lapply(infiles, process_file)

  # Full-join all dataframes, using every column that is not a sample column
  # as key. Reduce() folds the join pairwise over the list; do.call() would
  # pass every data frame as a separate argument to the two-argument lambda
  # and therefore fail for one input or for three or more.
  taxa_cols <- setdiff(colnames(dfs[[1]]), samples_names)
  combined_df <- Reduce(\(x, y) dplyr::full_join(x, y, by = taxa_cols), dfs)

  # Rows absent from a sample come out of the full join as NA; report them as
  # a count of 0 in every sample column.
  na_to_zero <- function(x) {
    x[is.na(x)] <- 0
    x
  }
  combined_df[samples_names] <- lapply(combined_df[samples_names], na_to_zero)

  # Write combined dataframe to file
  write.table(combined_df, outfile, sep = "\t", row.names = FALSE)
}
# Apply replace_na to every column named in samples_names, so that NA entries
# in those columns become 0.
combined_df[samples_names] <- lapply(combined_df[samples_names], replace_na)
# Write the combined dataframe to outfile, tab-separated and without row names
write.table(combined_df, outfile, sep = "\t", row.names = FALSE)

# Entry point when executed through Snakemake's `script:` directive: all rule
# inputs become the input files, the first rule output is the destination.
run(infiles = as.character(snakemake@input), outfile = snakemake@output[[1]])

0 comments on commit 54794ab

Please sign in to comment.