From 7c2de043b30963422b980633220beb506068b20e Mon Sep 17 00:00:00 2001 From: "Luis M. Rodriguez-R" Date: Mon, 1 Apr 2024 19:41:46 -0400 Subject: [PATCH] Vectorize distances to allow for larger matrices --- lib/miga/cli/action/browse.rb | 2 +- lib/miga/project/result.rb | 7 ++++++- lib/miga/version.rb | 2 +- manual/part5/workflow.md | 6 ++++-- scripts/aai_distances.bash | 19 ++++++++++--------- scripts/ani_distances.bash | 19 ++++++++++--------- utils/find-medoid.R | 19 ++++++++++++------- utils/subclade/pipeline.rb | 5 +++-- utils/subclades.R | 29 ++++++++++++++++++----------- 9 files changed, 65 insertions(+), 43 deletions(-) diff --git a/lib/miga/cli/action/browse.rb b/lib/miga/cli/action/browse.rb index 193a361..81cd7cd 100644 --- a/lib/miga/cli/action/browse.rb +++ b/lib/miga/cli/action/browse.rb @@ -179,7 +179,7 @@ def format_name(str) str .to_s.unmiga_name .sub(/^./, &:upcase) - .gsub(/(Aai|Ani|Ogs|Cds|Ssu|Rds|ani95|aai90| db$| ssu )/, &:upcase) + .gsub(/(Aai|Ani|Ogs|Cds|Ssu|Rds|Rda|ani95|aai90| db$| ssu )/, &:upcase) .sub(/Haai/, 'hAAI') .sub(/Mytaxa/, 'MyTaxa') .sub(/ pvalue$/, ' p-value') diff --git a/lib/miga/project/result.rb b/lib/miga/project/result.rb index a8d899b..6a46b57 100644 --- a/lib/miga/project/result.rb +++ b/lib/miga/project/result.rb @@ -55,10 +55,13 @@ def next_inclade(save = true) ## # Add result of any type +:*_distances+ at +base+ (no +_opts+ supported). def add_result_distances(base, _opts) - return nil unless result_files_exist?(base, %w[.rds .txt]) + return nil unless result_files_exist?(base, ['.txt']) && + (result_files_exist?(base, ['.rds']) || + result_files_exist?(base, ['.rda'])) r = MiGA::Result.new("#{base}.json") r.add_file(:rds, 'miga-project.rds') + r.add_file(:rda, 'miga-project.rda') r.add_file(:rdata, 'miga-project.Rdata') # Legacy file r.add_file(:matrix, 'miga-project.txt') r.add_file(:log, 'miga-project.log') # Legacy file @@ -84,6 +87,7 @@ def add_result_clade_finding(base, _opts) r = add_result_iter_clades(base) r.add_file(:aai_dist_rds, 'miga-project.dist.rds') + r.add_file(:aai_dist_rda, 'miga-project.dist.rda') r.add_file(:aai_tree, 'miga-project.aai.nwk') r.add_file(:proposal, 'miga-project.proposed-clades') r.add_file(:clades_aai90, 'miga-project.aai90-clades') @@ -108,6 +112,7 @@ def add_result_subclades(base, _opts) r = add_result_iter_clades(base) r.add_file(:ani_tree, 'miga-project.ani.nwk') r.add_file(:ani_dist_rds, 'miga-project.dist.rds') + r.add_file(:ani_dist_rda, 'miga-project.dist.rda') r end diff --git a/lib/miga/version.rb b/lib/miga/version.rb index 9e61637..8ba039b 100644 --- a/lib/miga/version.rb +++ b/lib/miga/version.rb @@ -12,7 +12,7 @@ module MiGA # - String indicating release status: # - rc* release candidate, not released as gem # - [0-9]+ stable release, released as gem - VERSION = [1.3, 13, 10].freeze + VERSION = [1.3, 14, 1].freeze ## # Nickname for the current major.minor version. diff --git a/manual/part5/workflow.md b/manual/part5/workflow.md index 997317e..9cf6dfa 100644 --- a/manual/part5/workflow.md +++ b/manual/part5/workflow.md @@ -463,10 +463,11 @@ Consolidation of AAI distances. Supported file keys: -* `rds` (*req*): Pairwise values in a `data.frame` for `R` +* `rda` (*req*): Pairwise values for `R` in three vectors * `matrix` (*req*): Pairwise values in a raw tab-delimited file * `log` (*req*): List of datasets included in the matrix * `hist`: Histogram of AAI values as raw tab-delimited file +* `rds` (*deprecated*): Pairwise values in a `data.frame` for `R` MiGA symbol: `aai_distances`. @@ -476,10 +477,11 @@ Consolidation of ANI distances. Supported file keys: -* `rds` (*req*): Pairwise values in a `data.frame` for `R` +* `rda` (*req*): Pairwise values for `R` in three vectors * `matrix` (*req*): Pairwise values in a raw tab-delimited file * `log` (*req*): List of datasets included in the matrix * `hist`: Histogram of ANI values as raw tab-delimited file +* `rds` (*deprecated*): Pairwise values in a `data.frame` for `R` MiGA symbol: `ani_distances`. diff --git a/scripts/aai_distances.bash b/scripts/aai_distances.bash index dd9e4cb..c14e4b4 100755 --- a/scripts/aai_distances.bash +++ b/scripts/aai_distances.bash @@ -40,15 +40,16 @@ rm "miga-project.txt.lno" # R-ify cat < 0) { - h <- hist(aai[aai[, "a"] != aai[, "b"], "value"], breaks = 100, plot = FALSE) +text <- readLines(file, n = $LNO + 1, ok = FALSE) +list <- strsplit(text[-1], "\t", fixed = TRUE) +a <- sapply(list, function(x) x[1]) +b <- sapply(list, function(x) x[2]) +d <- sapply(list, function(x) 1 - (as.numeric(x[3]) / 100)) +save(a, b, d, file = "miga-project.rda") + +non_self <- a != b +if(sum(non_self) > 0) { + h <- hist((1 - d[non_self]) * 100, breaks = 100, plot = FALSE) len <- length(h[["breaks"]]) write.table( cbind(h[["breaks"]][-len], h[["breaks"]][-1], h[["counts"]]), diff --git a/scripts/ani_distances.bash b/scripts/ani_distances.bash index ab51a77..ad9dc59 100755 --- a/scripts/ani_distances.bash +++ b/scripts/ani_distances.bash @@ -34,15 +34,16 @@ rm "miga-project.txt.lno" # R-ify cat < 0) { - h <- hist(ani[ani[, "a"] != ani[, "b"], "value"], breaks = 100, plot = FALSE) +text <- readLines(file, n = $LNO + 1, ok = FALSE) +list <- strsplit(text[-1], "\t", fixed = TRUE) +a <- sapply(list, function(x) x[1]) +b <- sapply(list, function(x) x[2]) +d <- sapply(list, function(x) 1 - (as.numeric(x[3]) / 100)) +save(a, b, d, file = "miga-project.rda") + +non_self <- a != b +if(sum(non_self) > 0) { + h <- hist((1 - d[non_self]) * 100, breaks = 100, plot = FALSE) len <- length(h[["breaks"]]) write.table( cbind(h[["breaks"]][-len], h[["breaks"]][-1], h[["counts"]]), diff --git a/utils/find-medoid.R b/utils/find-medoid.R index 1ae6f96..ef11f9c 100755 --- a/utils/find-medoid.R +++ b/utils/find-medoid.R @@ -16,15 +16,14 @@ if(Sys.getenv("MIGA") == ""){ )) } -find_medoids <- function (ani.df, out, clades) { - if(nrow(ani.df) == 0) return(NULL) - ani.df$d <- 1 - (ani.df$value/100) - dist <- enve.df2dist(ani.df, "a", "b", "d", default.d = max(ani.df$d) * 1.2) +find_medoids <- function (a, b, d, out, clades) { + if (length(d) == 0) return(NULL) + dist <- enve.df2dist(cbind(a, b, d), "a", "b", "d", default.d = max(d) * 1.2) dist <- as.matrix(dist) cl <- read.table(clades, header = FALSE, sep = "\t", as.is = TRUE)[,1] cl.s <- c() medoids <- c() - for(i in cl){ + for (i in cl) { lab <- strsplit(i, ",")[[1]] if(length(lab) == 1) { lab.s <- lab @@ -44,6 +43,12 @@ find_medoids <- function (ani.df, out, clades) { #= Main cat("Finding Medoids\n") -ani <- readRDS(argv[1]) -find_medoids(ani.df = ani, out = argv[2], clades = argv[3]) +if (grepl("\\.rds$", argv[1])) { + ani <- readRDS(argv[1]) + find_medoids(ani$a, ani$b, 1 - (ani$value / 100), + out = argv[2], clades = argv[3]) +} else { + load(argv[1]) # assume .rda + find_medoids(a, b, d, out = argv[2], clades = argv[3]) +} diff --git a/utils/subclade/pipeline.rb b/utils/subclade/pipeline.rb index 013bf3d..5477ef6 100644 --- a/utils/subclade/pipeline.rb +++ b/utils/subclade/pipeline.rb @@ -48,9 +48,10 @@ def cluster_species # Find genomospecies medoids src = File.expand_path('utils/find-medoid.R', MiGA::MiGA.root_path) dir = opts[:gsp_metric] == 'aai' ? '02.aai' : '03.ani' + dat = "../../09.distances/#{dir}/miga-project.rda" + dat = "../../09.distances/#{dir}/miga-project.rds" unless File.exist?(dat) run_cmd([ - 'Rscript', src, "../../09.distances/#{dir}/miga-project.rds", - 'miga-project.gsp-medoids', 'miga-project.gsp-clades' + 'Rscript', src, dat, 'miga-project.gsp-medoids', 'miga-project.gsp-clades' ]) if File.exist? 'miga-project.gsp-clades.sorted' File.rename 'miga-project.gsp-clades.sorted', 'miga-project.gsp-clades' diff --git a/utils/subclades.R b/utils/subclades.R index 0c98715..fad803b 100755 --- a/utils/subclades.R +++ b/utils/subclades.R @@ -338,18 +338,25 @@ ggplotColours <- function (n = 6, h = c(0, 360) + 15, alpha = 1) { } ani_distance <- function (ani_file, sel) { - # Try to locate rds, otherwise read gzipped table - rds <- gsub("\\.txt\\.gz$", ".rds", ani_file) - if (file.exists(rds)) { - sim <- readRDS(rds) + # Try to locate rda, then rds, and otherwise read gzipped table + rda <- gsub("\\.txt\\.gz$", ".rda", ani_file) + if (file.exists(rda)) { + load(rda) # Should already contain `a`, `b`, and `d` as vectors } else { - sim <- read.table(gzfile(ani_file), sep = "\t", header = TRUE, as.is = TRUE) - } + rds <- gsub("\\.txt\\.gz$", ".rds", ani_file) + if (file.exists(rds)) { + sim <- readRDS(rds) + } else { + sim <- read.table( + gzfile(ani_file), sep = "\t", header = TRUE, as.is = TRUE + ) + } - # Extract individual variables to deal with very large matrices - a <- sim$a - b <- sim$b - d <- 1 - (sim$value / 100) + # Extract individual variables to deal with very large matrices + a <- sim$a + b <- sim$b + d <- 1 - (sim$value / 100) + } # If there is no data, end process if (length(a) == 0) return(NULL) @@ -359,7 +366,7 @@ ani_distance <- function (ani_file, sel) { if (!is.na(sel) && file.exists(sel)) { say("Filter selection") ids <- read.table(sel, sep = "\t", head = FALSE, as.is = TRUE)[,1] - sel.idx <- which(sim$a %in% ids & sim$b %in% ids) + sel.idx <- which(a %in% ids & b %in% ids) a <- a[sel.idx] b <- b[sel.idx] d <- d[sel.idx]