Skip to content

Commit

Permalink
Vectorize distances to allow for larger matrices
Browse files Browse the repository at this point in the history
  • Loading branch information
lmrodriguezr committed Apr 1, 2024
1 parent e18c651 commit 7c2de04
Show file tree
Hide file tree
Showing 9 changed files with 65 additions and 43 deletions.
2 changes: 1 addition & 1 deletion lib/miga/cli/action/browse.rb
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,7 @@ def format_name(str)
str
.to_s.unmiga_name
.sub(/^./, &:upcase)
.gsub(/(Aai|Ani|Ogs|Cds|Ssu|Rds|ani95|aai90| db$| ssu )/, &:upcase)
.gsub(/(Aai|Ani|Ogs|Cds|Ssu|Rds|Rda|ani95|aai90| db$| ssu )/, &:upcase)
.sub(/Haai/, 'hAAI')
.sub(/Mytaxa/, 'MyTaxa')
.sub(/ pvalue$/, ' p-value')
Expand Down
7 changes: 6 additions & 1 deletion lib/miga/project/result.rb
Original file line number Diff line number Diff line change
Expand Up @@ -55,10 +55,13 @@ def next_inclade(save = true)
##
# Add result of any type +:*_distances+ at +base+ (no +_opts+ supported).
def add_result_distances(base, _opts)
return nil unless result_files_exist?(base, %w[.rds .txt])
return nil unless result_files_exist?(base, ['.txt']) &&
(result_files_exist?(base, ['.rds']) ||
result_files_exist?(base, ['.rda']))

r = MiGA::Result.new("#{base}.json")
r.add_file(:rds, 'miga-project.rds')
r.add_file(:rda, 'miga-project.rda')
r.add_file(:rdata, 'miga-project.Rdata') # Legacy file
r.add_file(:matrix, 'miga-project.txt')
r.add_file(:log, 'miga-project.log') # Legacy file
Expand All @@ -84,6 +87,7 @@ def add_result_clade_finding(base, _opts)

r = add_result_iter_clades(base)
r.add_file(:aai_dist_rds, 'miga-project.dist.rds')
r.add_file(:aai_dist_rda, 'miga-project.dist.rda')
r.add_file(:aai_tree, 'miga-project.aai.nwk')
r.add_file(:proposal, 'miga-project.proposed-clades')
r.add_file(:clades_aai90, 'miga-project.aai90-clades')
Expand All @@ -108,6 +112,7 @@ def add_result_subclades(base, _opts)
r = add_result_iter_clades(base)
r.add_file(:ani_tree, 'miga-project.ani.nwk')
r.add_file(:ani_dist_rds, 'miga-project.dist.rds')
r.add_file(:ani_dist_rda, 'miga-project.dist.rda')
r
end

Expand Down
2 changes: 1 addition & 1 deletion lib/miga/version.rb
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ module MiGA
# - String indicating release status:
# - rc* release candidate, not released as gem
# - [0-9]+ stable release, released as gem
VERSION = [1.3, 13, 10].freeze
VERSION = [1.3, 14, 1].freeze

##
# Nickname for the current major.minor version.
Expand Down
6 changes: 4 additions & 2 deletions manual/part5/workflow.md
Original file line number Diff line number Diff line change
Expand Up @@ -463,10 +463,11 @@ Consolidation of AAI distances.

Supported file keys:

* `rds` (*req*): Pairwise values in a `data.frame` for `R`
* `rda` (*req*): Pairwise values for `R` in three vectors
* `matrix` (*req*): Pairwise values in a raw tab-delimited file
* `log` (*req*): List of datasets included in the matrix
* `hist`: Histogram of AAI values as raw tab-delimited file
* `rds` (*deprecated*): Pairwise values in a `data.frame` for `R`

MiGA symbol: `aai_distances`.

Expand All @@ -476,10 +477,11 @@ Consolidation of ANI distances.

Supported file keys:

* `rds` (*req*): Pairwise values in a `data.frame` for `R`
* `rda` (*req*): Pairwise values for `R` in three vectors
* `matrix` (*req*): Pairwise values in a raw tab-delimited file
* `log` (*req*): List of datasets included in the matrix
* `hist`: Histogram of ANI values as raw tab-delimited file
* `rds` (*deprecated*): Pairwise values in a `data.frame` for `R`

MiGA symbol: `ani_distances`.

Expand Down
19 changes: 10 additions & 9 deletions scripts/aai_distances.bash
Original file line number Diff line number Diff line change
Expand Up @@ -40,15 +40,16 @@ rm "miga-project.txt.lno"
# R-ify
cat <<R | R --vanilla
file <- gzfile("miga-project.txt.gz")
aai <- read.table(
file, sep = "\t", header = TRUE, as.is = TRUE, quote = "",
stringsAsFactors = FALSE, comment.char = "", nrows = $LNO,
colClasses = c("character", "character",
"numeric", "numeric", "integer", "integer")
)
saveRDS(aai, file = "miga-project.rds")
if(sum(aai[, "a"] != aai[, "b"]) > 0) {
h <- hist(aai[aai[, "a"] != aai[, "b"], "value"], breaks = 100, plot = FALSE)
text <- readLines(file, n = $LNO + 1, ok = FALSE)
list <- strsplit(text[-1], "\t", fixed = TRUE)
a <- sapply(list, function(x) x[1])
b <- sapply(list, function(x) x[2])
d <- sapply(list, function(x) 1 - (as.numeric(x[3]) / 100))
save(a, b, d, file = "miga-project.rda")
non_self <- a != b
if(sum(non_self) > 0) {
h <- hist((1 - d[non_self]) * 100, breaks = 100, plot = FALSE)
len <- length(h[["breaks"]])
write.table(
cbind(h[["breaks"]][-len], h[["breaks"]][-1], h[["counts"]]),
Expand Down
19 changes: 10 additions & 9 deletions scripts/ani_distances.bash
Original file line number Diff line number Diff line change
Expand Up @@ -34,15 +34,16 @@ rm "miga-project.txt.lno"
# R-ify
cat <<R | R --vanilla
file <- gzfile("miga-project.txt.gz")
ani <- read.table(
file, sep = "\t", header = TRUE, as.is = TRUE, quote = "",
stringsAsFactors = FALSE, comment.char = "", nrows = $LNO,
colClasses = c("character", "character",
"numeric", "numeric", "integer", "integer")
)
saveRDS(ani, file = "miga-project.rds")
if(sum(ani[, "a"] != ani[, "b"]) > 0) {
h <- hist(ani[ani[, "a"] != ani[, "b"], "value"], breaks = 100, plot = FALSE)
text <- readLines(file, n = $LNO + 1, ok = FALSE)
list <- strsplit(text[-1], "\t", fixed = TRUE)
a <- sapply(list, function(x) x[1])
b <- sapply(list, function(x) x[2])
d <- sapply(list, function(x) 1 - (as.numeric(x[3]) / 100))
save(a, b, d, file = "miga-project.rda")
non_self <- a != b
if(sum(non_self) > 0) {
h <- hist((1 - d[non_self]) * 100, breaks = 100, plot = FALSE)
len <- length(h[["breaks"]])
write.table(
cbind(h[["breaks"]][-len], h[["breaks"]][-1], h[["counts"]]),
Expand Down
19 changes: 12 additions & 7 deletions utils/find-medoid.R
Original file line number Diff line number Diff line change
Expand Up @@ -16,15 +16,14 @@ if(Sys.getenv("MIGA") == ""){
))
}

find_medoids <- function (ani.df, out, clades) {
if(nrow(ani.df) == 0) return(NULL)
ani.df$d <- 1 - (ani.df$value/100)
dist <- enve.df2dist(ani.df, "a", "b", "d", default.d = max(ani.df$d) * 1.2)
find_medoids <- function (a, b, d, out, clades) {
if (length(d) == 0) return(NULL)
dist <- enve.df2dist(cbind(a, b, d), "a", "b", "d", default.d = max(d) * 1.2)
dist <- as.matrix(dist)
cl <- read.table(clades, header = FALSE, sep = "\t", as.is = TRUE)[,1]
cl.s <- c()
medoids <- c()
for(i in cl){
for (i in cl) {
lab <- strsplit(i, ",")[[1]]
if(length(lab) == 1) {
lab.s <- lab
Expand All @@ -44,6 +43,12 @@ find_medoids <- function (ani.df, out, clades) {

#= Main
cat("Finding Medoids\n")
ani <- readRDS(argv[1])
find_medoids(ani.df = ani, out = argv[2], clades = argv[3])
if (grepl("\\.rds$", argv[1])) {
ani <- readRDS(argv[1])
find_medoids(ani$a, ani$b, 1 - (ani$value / 100),
out = argv[2], clades = argv[3])
} else {
load(argv[1]) # assume .rda
find_medoids(a, b, d, out = argv[2], clades = argv[3])
}

5 changes: 3 additions & 2 deletions utils/subclade/pipeline.rb
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,10 @@ def cluster_species
# Find genomospecies medoids
src = File.expand_path('utils/find-medoid.R', MiGA::MiGA.root_path)
dir = opts[:gsp_metric] == 'aai' ? '02.aai' : '03.ani'
dat = "../../09.distances/#{dir}/miga-project.rda"
dat = "../../09.distances/#{dir}/miga-project.rds" unless File.exist?(dat)
run_cmd([
'Rscript', src, "../../09.distances/#{dir}/miga-project.rds",
'miga-project.gsp-medoids', 'miga-project.gsp-clades'
'Rscript', src, dat, 'miga-project.gsp-medoids', 'miga-project.gsp-clades'
])
if File.exist? 'miga-project.gsp-clades.sorted'
File.rename 'miga-project.gsp-clades.sorted', 'miga-project.gsp-clades'
Expand Down
29 changes: 18 additions & 11 deletions utils/subclades.R
Original file line number Diff line number Diff line change
Expand Up @@ -338,18 +338,25 @@ ggplotColours <- function (n = 6, h = c(0, 360) + 15, alpha = 1) {
}

ani_distance <- function (ani_file, sel) {
# Try to locate rds, otherwise read gzipped table
rds <- gsub("\\.txt\\.gz$", ".rds", ani_file)
if (file.exists(rds)) {
sim <- readRDS(rds)
# Try to locate rda, then rds, and otherwise read gzipped table
rda <- gsub("\\.txt\\.gz$", ".rda", ani_file)
if (file.exists(rda)) {
load(rda) # Should already contain `a`, `b`, and `d` as vectors
} else {
sim <- read.table(gzfile(ani_file), sep = "\t", header = TRUE, as.is = TRUE)
}
rds <- gsub("\\.txt\\.gz$", ".rds", ani_file)
if (file.exists(rds)) {
sim <- readRDS(rds)
} else {
sim <- read.table(
gzfile(ani_file), sep = "\t", header = TRUE, as.is = TRUE
)
}

# Extract individual variables to deal with very large matrices
a <- sim$a
b <- sim$b
d <- 1 - (sim$value / 100)
# Extract individual variables to deal with very large matrices
a <- sim$a
b <- sim$b
d <- 1 - (sim$value / 100)
}

# If there is no data, end process
if (length(a) == 0) return(NULL)
Expand All @@ -359,7 +366,7 @@ ani_distance <- function (ani_file, sel) {
if (!is.na(sel) && file.exists(sel)) {
say("Filter selection")
ids <- read.table(sel, sep = "\t", head = FALSE, as.is = TRUE)[,1]
sel.idx <- which(sim$a %in% ids & sim$b %in% ids)
sel.idx <- which(a %in% ids & b %in% ids)
a <- a[sel.idx]
b <- b[sel.idx]
d <- d[sel.idx]
Expand Down

0 comments on commit 7c2de04

Please sign in to comment.