Skip to content

Commit

Permalink
Merge pull request #281 from ddbj/parse-error
Browse files Browse the repository at this point in the history
Fix a bug that prevents parsing errors in certain situations
  • Loading branch information
ursm authored Jan 7, 2024
2 parents bac721a + 1b6ea79 commit c1865f2
Show file tree
Hide file tree
Showing 10 changed files with 326 additions and 117 deletions.
95 changes: 11 additions & 84 deletions backend/app/jobs/extract_metadata_job.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,102 +3,29 @@ def perform(extraction)
ActiveRecord::Base.transaction do
begin
extraction.prepare_files
rescue ExtractionError => e
rescue DfastExtraction::ExtractionError => e
extraction.update! state: 'rejected', error: {id: e.id, **e.data}
return
end

extraction.files.find_each do |file|
file.update!(
parsing: false,
parsed_data: parse(file),
parsed_data: file.parse,
_errors: []
)
end

extraction.update! state: 'fulfilled'
end
rescue => e
Rails.logger.error '******'
Rails.logger.error e

raise
end

private

def parse(file)
case File.extname(file.name)
when *MassDirectoryExtraction::ANN_EXT.map { ".#{_1}" }
parse_ann(file)
when *MassDirectoryExtraction::SEQ_EXT.map { ".#{_1}" }
parse_seq(file)
else
raise "unsupported file: #{file.name}"
end
end

def parse_ann(file)
in_common = false
full_name = nil
email = nil
affiliation = nil
hold_date = nil

file.fullpath.each_line chomp: true do |line|
break if full_name && email && affiliation && hold_date

entry, _feature, _location, qualifier, value = line.split("\t")

break if in_common && entry.present?

in_common = entry == 'COMMON' if entry.present?

next unless in_common
rescue ExtractionFile::ParseError => e
file.update!(
parsing: false,
parsed_data: nil,

case qualifier
when 'contact'
full_name = value
when 'email'
email = value
when 'institute'
affiliation = value
when 'hold_date'
hold_date = Date.strptime(value, '%Y%m%d').strftime('%Y-%m-%d')
else
# do nothing
_errors: [
{id: e.id, value: e.value}
]
)
end
end

{
contactPerson: {
fullName: full_name,
email:,
affiliation:
},

holdDate: hold_date
}
end

def parse_seq(file)
count = 0
buf = String.new(capacity: 1.megabyte)
bol = true

file.fullpath.open 'rb' do |io|
while io.readpartial(1.megabyte, buf)
count += 1 if bol && buf.start_with?('>')
count += buf.scan(/[\r\n]>/).count

bol = buf.end_with?("\r", "\n")
end
rescue EOFError
# done
extraction.update! state: 'fulfilled'
end

{
entriesCount: count
}
end
end
95 changes: 95 additions & 0 deletions backend/app/models/concerns/extraction_file.rb
Original file line number Diff line number Diff line change
@@ -1,4 +1,99 @@
# Behavior shared by files that belong to an extraction: locating the file on
# disk and parsing submission metadata out of it.
#
# NOTE(review): the including class must provide `name` and `extraction`
# (whose `working_dir` responds to `join`) — confirm against the includers.
module ExtractionFile
  # Raised when a file's content cannot be accepted.
  #
  # `id` is a machine-readable error identifier; `value` is the offending
  # input when there is one (nil otherwise).
  class ParseError < StandardError
    attr_reader :id, :value

    def initialize(id, value = nil)
      # Hand the id to StandardError so `message` (and therefore log output)
      # carries it instead of the bare class name.
      super(id)

      @id = id
      @value = value
    end
  end

  # Location of this file inside the extraction's working directory.
  def fullpath = extraction.working_dir.join(name)

  # Size of the file, as reported by `fullpath`.
  def size = fullpath.size

  # Parses the file according to its extension.
  #
  # Returns the hash built by the annotation or sequence parser.
  # Raises ParseError when the content is rejected, and RuntimeError when the
  # extension is in neither MassDirectoryExtraction::ANN_EXT nor ::SEQ_EXT.
  def parse
    case File.extname(name)
    when *MassDirectoryExtraction::ANN_EXT.map { ".#{_1}" }
      parse_ann
    when *MassDirectoryExtraction::SEQ_EXT.map { ".#{_1}" }
      parse_seq
    else
      raise "unsupported file: #{name}"
    end
  end

  private

  # Reads contact person and hold date from the COMMON entry of a
  # tab-separated annotation file.
  #
  # Raises ParseError when the hold date is unusable or when the contact
  # person is absent or only partially given.
  def parse_ann
    in_common = false
    full_name = nil
    email = nil
    affiliation = nil
    hold_date = nil

    fullpath.each_line chomp: true do |line|
      # Everything we look for has been found; no need to read further.
      break if full_name && email && affiliation && hold_date

      entry, _feature, _location, qualifier, value = line.split("\t")

      # A non-blank entry column while inside COMMON starts the next entry.
      break if in_common && entry.present?

      in_common = entry == 'COMMON' if entry.present?

      next unless in_common

      case qualifier
      when 'contact'
        full_name = value
      when 'email'
        email = value
      when 'institute'
        affiliation = value
      when 'hold_date'
        hold_date = parse_hold_date(value)
      else
        # do nothing
      end
    end

    raise ParseError.new('annotation-file-parser.missing-contact-person') if !full_name && !email && !affiliation
    raise ParseError.new('annotation-file-parser.invalid-contact-person') if !full_name || !email || !affiliation

    {
      contactPerson: {
        fullName: full_name,
        email:,
        affiliation:
      },

      holdDate: hold_date
    }
  end

  # Converts a YYYYMMDD hold date into YYYY-MM-DD.
  #
  # Raises ParseError for an unparsable date and also for a missing value:
  # a `hold_date` line without a value column yields nil, for which
  # Date.strptime raises TypeError rather than Date::Error.
  def parse_hold_date(value)
    Date.strptime(value, '%Y%m%d').strftime('%Y-%m-%d')
  rescue Date::Error, TypeError
    raise ParseError.new('annotation-file-parser.invalid-hold-date', value)
  end

  # Counts the entries of a sequence file, i.e. the lines starting with '>'.
  #
  # The file is read in chunks of up to 1 megabyte into a reused buffer.
  # Raises ParseError when no entry is found.
  def parse_seq
    count = 0
    buf = String.new(capacity: 1.megabyte)
    # True while the next chunk begins at the start of a line.
    bol = true

    fullpath.open 'rb' do |io|
      while io.readpartial(1.megabyte, buf)
        # A '>' opening the chunk has no preceding line terminator inside
        # this chunk, so the scan below cannot see it.
        count += 1 if bol && buf.start_with?('>')
        count += buf.scan(/[\r\n]>/).count

        bol = buf.end_with?("\r", "\n")
      end
    rescue EOFError
      # done
    end

    raise ParseError.new('sequence-file-parser.no-entries') if count.zero?

    {
      entriesCount: count
    }
  end
end
9 changes: 9 additions & 0 deletions backend/app/models/dfast_extraction.rb
Original file line number Diff line number Diff line change
@@ -1,6 +1,15 @@
require 'open-uri'

class DfastExtraction < ApplicationRecord
# Raised when an extraction cannot proceed.
#
# `id` is a machine-readable error identifier; `data` holds the extra keyword
# arguments given at construction as a hash.
class ExtractionError < StandardError
  attr_reader :id, :data

  def initialize(id, **data)
    # Pass only the id upward: a bare `super` would forward `data` as well,
    # and without any super call `message` is just the class name.
    super(id)

    @id = id
    @data = data
  end
end

belongs_to :user

has_many :files, dependent: :destroy, class_name: 'DfastExtractionFile', foreign_key: :extraction_id
Expand Down
8 changes: 0 additions & 8 deletions backend/app/models/extraction_error.rb

This file was deleted.

11 changes: 11 additions & 0 deletions backend/spec/factories/mass_directory_extractions.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Factories for mass-directory extractions and the files attached to them.
FactoryBot.define do
  # An extraction is built together with its owning user.
  factory :mass_directory_extraction do
    association :user
  end

  # A file is built together with a parent extraction from the factory above.
  factory :mass_directory_extraction_file do
    association :extraction, factory: :mass_directory_extraction

    parsing { false }
  end
end
10 changes: 10 additions & 0 deletions backend/spec/factories/users.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,16 @@
factory :user do
sequence(:openid_sub) {|i| "user:#{i}" }

id_token {|user|
uid = user.openid_sub.sub(':', '_')

{
sub: user.openid_sub,
preferred_username: uid,
email: "#{uid}@example.com"
}
}

trait :alice do
id_token {|user|
{
Expand Down
Loading

0 comments on commit c1865f2

Please sign in to comment.