Skip to content

Commit

Permalink
Efficiency improvement for large projects
Browse files Browse the repository at this point in the history
Projects with a large number of datasets should now work much faster with two improvements:
- Using name Set instead of Hash
- Adding names to Set instead of re-creating it when a new dataset is added
  • Loading branch information
lmrodriguezr committed Feb 6, 2024
1 parent c911688 commit ac5c66b
Show file tree
Hide file tree
Showing 7 changed files with 84 additions and 24 deletions.
43 changes: 35 additions & 8 deletions lib/miga/cli/action/download/ncbi.rb
Original file line number Diff line number Diff line change
Expand Up @@ -57,14 +57,14 @@ def sanitize_cli
def remote_list
if cli[:ncbi_taxonomy_dump]
cli.say "Reading NCBI Taxonomy dump: #{cli[:ncbi_taxonomy_dump]}"
MiGA::RemoteDataset.use_ncbi_taxonomy_dump(cli[:ncbi_taxonomy_dump])
MiGA::RemoteDataset.use_ncbi_taxonomy_dump(cli[:ncbi_taxonomy_dump], cli)
end

if cli[:ncbi_list_json] && File.size?(cli[:ncbi_list_json])
cli.say "Reusing remote list: #{cli[:ncbi_list_json]}"
return MiGA::Json.parse(cli[:ncbi_list_json])
return read_ncbi_list_json(cli[:ncbi_list_json])
end

cli.say "Obtaining remote list of datasets"
list = {}
query = remote_list_query
loop do
Expand All @@ -79,18 +79,45 @@ def remote_list
list.merge!(parse_reports_as_datasets(page[:reports]))

# Next page
cli.advance('Datasets:', list.size, page[:total_count])
break unless page[:next_page_token]
query[:page_token] = page[:next_page_token]
end
cli.say

if cli[:ncbi_list_json]
cli.say "Saving remote list: #{cli[:ncbi_list_json]}"
MiGA::Json.generate_fast(list, cli[:ncbi_list_json])
write_ncbi_list_json(cli[:ncbi_list_json], list) if cli[:ncbi_list_json]
list
end

##
# Reads the list of remote datasets stored in +file+ and returns it as a Hash
#
# The expected layout is the one written by +write_ncbi_list_json+: a first
# line of the form <tt># N</tt> (N being the number of entries, only used to
# report progress) followed by one line per entry holding the key and its JSON
# document separated by a tab. Each JSON document is parsed with
# MiGA::Json.parse. An empty file yields an empty Hash
def read_ncbi_list_json(file)
  cli.say "Reusing remote list: #{file}"
  list = {}
  File.open(file, 'r') do |fh|
    # +gets+ returns nil when the file is empty, hence the +to_s+
    n_tot = fh.gets.to_s.chomp.sub(/^# /, '').to_i
    # Count lines from 1 so the last report reads N of N, consistent with the
    # progress reported when the file is written
    fh.each_line.with_index(1) do |ln, k|
      key, json = ln.chomp.split("\t", 2)
      list[key] = MiGA::Json.parse(json, contents: true)
      cli.advance('Lines:', k, n_tot)
    end
    cli.say
  end
  list
end

list
##
# Saves +list+ (a Hash) into +file+ as a tab-delimited text file
#
# The first line is a header with the number of entries (<tt># N</tt>). Every
# following line holds one key and the JSON generated for its value by
# MiGA::Json.generate_fast, separated by a tab. Progress is reported through
# +cli+ after each entry
def write_ncbi_list_json(file, list)
  cli.say "Saving remote list: #{file}"
  total = list.size
  File.open(file, 'w') do |out|
    out.puts(format('# %i', total))
    list.each_with_index do |(key, value), idx|
      json = MiGA::Json.generate_fast(value)
      out.puts([key, json].join("\t"))
      cli.advance('Datasets:', idx + 1, total)
    end
    cli.say
  end
end

def parse_reports_as_datasets(reports)
ds = {}
reports.each do |r|
Expand Down
3 changes: 2 additions & 1 deletion lib/miga/dataset.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
# @package MiGA
# @license Artistic-2.0

require'set'
require 'miga/metadata'
require 'miga/dataset/result'
require 'miga/dataset/status'
Expand All @@ -27,7 +28,7 @@ class << self
##
# Does the +project+ already have a dataset with that +name+?
# The answer comes from a single membership test on the Set returned by
# +project.dataset_names_set+; the Hash-based lookup is not evaluated
def exist?(project, name)
  project.dataset_names_set.include?(name)
end

##
Expand Down
19 changes: 14 additions & 5 deletions lib/miga/json.rb
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ class << self
# - +:symbolize+: If names should be symbolized. By default it's true if
# additions is false, or false otherwise. They can both be false, but an
# exception will be raised if both are true
# - +:large_file+: If passed, the file is treated as a file with very long
# lines (possibly a single long line)
def default_opts(opts = {})
opts[:contents] ||= false
opts[:additions] ||= false
Expand All @@ -36,11 +38,18 @@ def parse(path, opts = {})

# Read JSON
cont = path
12.times do
cont = File.read(path)
break unless cont.empty?
sleep 1 # Wait up to 12 seconds for racing processes (iff empty file)
end unless opts[:contents]
if opts[:large_file]
cont = ''
File.open(path, 'r') do |fh|
cont += fh.read(2 ** 16) until fh.eof?
end
elsif !opts[:contents]
12.times do
cont = File.read(path)
break unless cont.empty?
sleep 1 # Wait up to 12 seconds for racing processes (iff empty file)
end
end
raise "Empty descriptor: #{opts[:contents] ? "''" : path}" if cont.empty?

# Parse JSON
Expand Down
1 change: 1 addition & 0 deletions lib/miga/project.rb
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ def save!
def load
@datasets = {}
@dataset_names_hash = nil
@dataset_names_set = nil
@metadata = MiGA::Metadata.load "#{path}/miga.project.json"
raise "Couldn't find project metadata at #{path}" if metadata.nil?

Expand Down
19 changes: 14 additions & 5 deletions lib/miga/project/dataset.rb
Original file line number Diff line number Diff line change
Expand Up @@ -5,24 +5,32 @@
# Helper module including specific functions handle datasets.
module MiGA::Project::Dataset
##
# Returns Array of MiGA::Dataset.
# Returns Array of MiGA::Dataset
# Each name listed in +metadata[:datasets]+ is passed to #dataset, keeping the
# same order
def datasets
  names = metadata[:datasets]
  names.map { |ds_name| dataset(ds_name) }
end

##
# Returns Array of String (without evaluating dataset objects).
# Returns Array of String (without evaluating dataset objects)
# The value is whatever +metadata[:datasets]+ yields; no copy is made here
def dataset_names
metadata[:datasets]
end

##
# Returns Hash of Strings => true. Similar to +dataset_names+ but as
# Hash for efficiency.
# Returns Hash of +{ String => true }+. Similar to +dataset_names+ but as
# Hash for efficiency
# A deprecation warning is emitted on every call. The Hash is built from
# +dataset_names+ on first use and cached in +@dataset_names_hash+
def dataset_names_hash
  warn 'The Project#dataset_names_hash method will be deprecated soon'
  return @dataset_names_hash if @dataset_names_hash

  @dataset_names_hash =
    dataset_names.each_with_object({}) { |name, lookup| lookup[name] = true }
end

##
# Returns Set of Strings. Similar to +dataset_names+ but as Set for
# efficiency
# The Set is built from +dataset_names+ on first use and cached in
# +@dataset_names_set+
def dataset_names_set
  return @dataset_names_set if @dataset_names_set

  @dataset_names_set = Set.new(dataset_names)
end

##
# Returns MiGA::Dataset
def dataset(name)
Expand Down Expand Up @@ -50,7 +58,8 @@ def add_dataset(name)
unless metadata[:datasets].include? name
d = MiGA::Dataset.new(self, name)
@metadata[:datasets] << name
@dataset_names_hash = nil # Ensure loading even if +do_not_save+ is true
@dataset_names_hash[name] = true if @dataset_names_hash
@dataset_names_set << name if @dataset_names_set
save
if d.ref? && d.active?
recalculate_tasks("Reference dataset added: #{d.name}")
Expand Down
19 changes: 16 additions & 3 deletions lib/miga/remote_dataset.rb
Original file line number Diff line number Diff line change
Expand Up @@ -16,31 +16,44 @@ class << self
# Path to a directory with a recent NCBI Taxonomy dump to use instead of
# making API calls to NCBI servers, which can be obtained at:
# https://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz
#
# The +cli+ parameter, if passed, should be a MiGA::Cli object that will
# be used to report progress while reading. Other objects can be passed,
# minimally supporting the MiGA::Cli#say and MiGA::Cli#advance method
# interfaces
#
# Raises a RuntimeError if +path+ is not an existing directory
def use_ncbi_taxonomy_dump(path, cli = nil)
  raise "Directory doesn't exist: #{path}" unless File.directory?(path)

  # Structure: { TaxID => ["name", "rank", parent TaxID] }
  MiGA::MiGA.DEBUG "Loading NCBI Taxonomy dump: #{path}"
  @ncbi_taxonomy_names = {}

  # Read names.dmp
  File.open(file = File.join(path, 'names.dmp')) do |fh|
    read = 0
    size = File.size(file)
    fh.each do |ln|
      # Progress is counted in bytes because +size+ comes from File.size
      cli&.advance('- names.dmp:', read += ln.bytesize, size)
      row = ln.split(/\t\|\t?/)
      # Only the scientific name of each TaxID is kept
      next unless row[3] == 'scientific name'

      @ncbi_taxonomy_names[row[0].to_i] = [row[1].strip]
    end
    cli&.say
  end

  # Read nodes.dmp
  File.open(file = File.join(path, 'nodes.dmp')) do |fh|
    read = 0
    size = File.size(file)
    fh.each do |ln|
      cli&.advance('- nodes.dmp:', read += ln.bytesize, size)
      row = ln.split(/\t\|\t?/)
      child = row[0].to_i
      parent = row[1].to_i
      # NOTE: a node whose TaxID had no scientific name in names.dmp has no
      # entry here, and the assignment below raises NoMethodError
      @ncbi_taxonomy_names[child][1] = row[2]
      # A node that is its own parent gets no parent TaxID
      @ncbi_taxonomy_names[child][2] = parent unless parent == child
    end
    cli&.say
  end
end

Expand Down
4 changes: 2 additions & 2 deletions lib/miga/version.rb
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,15 @@ module MiGA
# - String indicating release status:
# - rc* release candidate, not released as gem
# - [0-9]+ stable release, released as gem
VERSION = [1.3, 10, 1].freeze
VERSION = [1.3, 10, 2].freeze

##
# Nickname for the current major.minor version.
VERSION_NAME = 'mezzotint'

##
# Date of the current gem release.
VERSION_DATE = Date.new(2024, 1, 31)
VERSION_DATE = Date.new(2024, 2, 6)

##
# References of MiGA
Expand Down

0 comments on commit ac5c66b

Please sign in to comment.