Skip to content

Commit

Permalink
Support indexing of extremely large collections
Browse files Browse the repository at this point in the history
  • Loading branch information
lmrodriguezr committed Apr 15, 2024
1 parent 8ede7ac commit 270d367
Show file tree
Hide file tree
Showing 4 changed files with 44 additions and 25 deletions.
4 changes: 2 additions & 2 deletions lib/miga/version.rb
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,15 @@ module MiGA
# - String indicating release status:
# - rc* release candidate, not released as gem
# - [0-9]+ stable release, released as gem
VERSION = [1.3, 14, 4].freeze
VERSION = [1.3, 14, 5].freeze

##
# Nickname for the current major.minor version.
VERSION_NAME = 'mezzotint'

##
# Date of the current gem release.
VERSION_DATE = Date.new(2024, 4, 13)
VERSION_DATE = Date.new(2024, 4, 15)

##
# References of MiGA
Expand Down
31 changes: 21 additions & 10 deletions scripts/aai_distances.bash
Original file line number Diff line number Diff line change
Expand Up @@ -31,20 +31,31 @@ function aai_tsv {
fi
}

rm -f "miga-project.txt"
aai_tsv | tee >(wc -l | awk '{print $1-1}' > "miga-project.txt.lno") \
| gzip -9c > "miga-project.txt.gz"
LNO=$(cat "miga-project.txt.lno")
rm "miga-project.txt.lno"
# Build the gzipped AAI table unless a non-empty copy already exists, so an
# interrupted run can resume without regenerating it.
if [[ ! -s "miga-project.txt.gz" ]] ; then
  rm -f "miga-project.txt"
  aai_tsv | gzip -9c > "miga-project.txt.gz"
fi

# Number of data rows: all lines minus the first one, which the R code below
# discards. Counting from the finished .gz file (rather than from a side file
# written inside a process substitution) avoids reading the count before the
# asynchronous counter has flushed it, and keeps both code paths identical.
LNO=$(gzip -cd "miga-project.txt.gz" | wc -l | awk '{print $1-1}')

# R-ify
cat <<R | R --vanilla
file <- gzfile("miga-project.txt.gz")
text <- readLines(file, n = $LNO + 1, ok = FALSE)
list <- strsplit(text[-1], "\t", fixed = TRUE)
a <- sapply(list, function(x) x[1])
b <- sapply(list, function(x) x[2])
d <- sapply(list, function(x) 1 - (as.numeric(x[3]) / 100))
# Read the header plus every data row, then drop the header line.
text <- readLines(file, n = $LNO + 1, ok = FALSE)[-1]
# Preallocate the output columns: a and b hold the first two tab-separated
# fields of each row, d holds 1 - (third field / 100).
a <- vector("character", $LNO)
b <- vector("character", $LNO)
d <- vector("numeric", $LNO)
# Parse the rows in fixed-size chunks instead of splitting every line at
# once, to support extremely large collections.
chunk.n <- 1024 * 1024
for (chunk in seq_len(ceiling(length(text) / chunk.n))) {
  # Row indexes covered by this chunk (the last chunk may be shorter).
  sel <- (chunk * chunk.n - chunk.n + 1):min(chunk * chunk.n, length(text))
  fields <- strsplit(text[sel], "\t", fixed = TRUE)
  # vapply guarantees one value of the expected type per row.
  a[sel] <- vapply(fields, function(x) x[1], character(1))
  b[sel] <- vapply(fields, function(x) x[2], character(1))
  d[sel] <- vapply(fields, function(x) 1 - (as.numeric(x[3]) / 100), numeric(1))
}
save(a, b, d, file = "miga-project.rda")
non_self <- a != b
Expand Down
29 changes: 20 additions & 9 deletions scripts/ani_distances.bash
Original file line number Diff line number Diff line change
Expand Up @@ -25,20 +25,31 @@ function ani_tsv {
foreach_database_ani
}

rm -f "miga-project.txt"
ani_tsv | tee >(wc -l | awk '{print $1-1}' > "miga-project.txt.lno") \
| gzip -9c > "miga-project.txt.gz"
LNO=$(cat "miga-project.txt.lno")
rm "miga-project.txt.lno"
# Build the gzipped ANI table unless a non-empty copy already exists, so an
# interrupted run can resume without regenerating it.
if [[ ! -s "miga-project.txt.gz" ]] ; then
  rm -f "miga-project.txt"
  ani_tsv | gzip -9c > "miga-project.txt.gz"
fi

# Number of data rows: all lines minus the first (header) line. Counting from
# the finished .gz file (rather than from a side file written inside a
# process substitution) avoids reading the count before the asynchronous
# counter has flushed it, and keeps both code paths identical.
LNO=$(gzip -cd "miga-project.txt.gz" | wc -l | awk '{print $1-1}')

# R-ify
cat <<R | R --vanilla
file <- gzfile("miga-project.txt.gz")
text <- readLines(file, n = $LNO + 1, ok = FALSE)
list <- strsplit(text[-1], "\t", fixed = TRUE)
a <- sapply(list, function(x) x[1])
b <- sapply(list, function(x) x[2])
d <- sapply(list, function(x) 1 - (as.numeric(x[3]) / 100))
# The first line read into text is the table header, not a data row. Drop it
# so that text holds exactly the LNO data rows and the header is never parsed
# as a distance (which would add a bogus entry with d = NA).
text <- text[-1]
# Preallocate the output columns: a and b hold the first two tab-separated
# fields of each row, d holds 1 - (third field / 100).
a <- vector("character", $LNO)
b <- vector("character", $LNO)
d <- vector("numeric", $LNO)
# Parse the rows in fixed-size chunks instead of splitting every line at
# once, to support extremely large collections.
chunk.n <- 1024 * 1024
for (chunk in seq_len(ceiling(length(text) / chunk.n))) {
  # Row indexes covered by this chunk (the last chunk may be shorter).
  sel <- (chunk * chunk.n - chunk.n + 1):min(chunk * chunk.n, length(text))
  fields <- strsplit(text[sel], "\t", fixed = TRUE)
  # vapply guarantees one value of the expected type per row.
  a[sel] <- vapply(fields, function(x) x[1], character(1))
  b[sel] <- vapply(fields, function(x) x[2], character(1))
  d[sel] <- vapply(fields, function(x) 1 - (as.numeric(x[3]) / 100), numeric(1))
}
save(a, b, d, file = "miga-project.rda")
non_self <- a != b
Expand Down
5 changes: 1 addition & 4 deletions test/remote_dataset_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,7 @@ def test_bad_remote_dataset

def test_get
hiv2 = 'M30502.1'
# TODO
# EMBL Temporarily down, enable back whenever possible!
#{ embl: :ebi, nuccore: :ncbi }.each do |db, universe|
{ nuccore: :ncbi }.each do |db, universe|
{ embl: :ebi, nuccore: :ncbi }.each do |db, universe|
rd = MiGA::RemoteDataset.new(hiv2, db, universe)
assert_equal([hiv2], rd.ids)

Expand Down

0 comments on commit 270d367

Please sign in to comment.