From 1b6ea795f4b77ffb3e6cb2133dc46dfaa410f388 Mon Sep 17 00:00:00 2001 From: Keita Urashima Date: Sun, 7 Jan 2024 16:20:38 +0900 Subject: [PATCH] Fix a bug that prevents parsing errors in certain situations --- backend/app/jobs/extract_metadata_job.rb | 95 ++--------- .../app/models/concerns/extraction_file.rb | 95 +++++++++++ backend/app/models/dfast_extraction.rb | 9 ++ backend/app/models/extraction_error.rb | 8 - .../factories/mass_directory_extractions.rb | 11 ++ backend/spec/factories/users.rb | 10 ++ .../spec/jobs/extract_metadata_job_spec.rb | 147 ++++++++++++++++++ backend/spec/support/env.rb | 30 ++-- .../public/workers/sequence-file-parser.js | 2 +- .../tests/unit/models/sequence-file-test.js | 36 +++-- 10 files changed, 326 insertions(+), 117 deletions(-) delete mode 100644 backend/app/models/extraction_error.rb create mode 100644 backend/spec/factories/mass_directory_extractions.rb create mode 100644 backend/spec/jobs/extract_metadata_job_spec.rb diff --git a/backend/app/jobs/extract_metadata_job.rb b/backend/app/jobs/extract_metadata_job.rb index cd80d87b..d3ffdde5 100644 --- a/backend/app/jobs/extract_metadata_job.rb +++ b/backend/app/jobs/extract_metadata_job.rb @@ -3,7 +3,7 @@ def perform(extraction) ActiveRecord::Base.transaction do begin extraction.prepare_files - rescue ExtractionError => e + rescue DfastExtraction::ExtractionError => e extraction.update! state: 'rejected', error: {id: e.id, **e.data} return end @@ -11,94 +11,21 @@ def perform(extraction) extraction.files.find_each do |file| file.update!( parsing: false, - parsed_data: parse(file), + parsed_data: file.parse, _errors: [] ) - end - - extraction.update! state: 'fulfilled' - end - rescue => e - Rails.logger.error '******' - Rails.logger.error e - - raise - end - - private - - def parse(file) - case File.extname(file.name) - when *MassDirectoryExtraction::ANN_EXT.map { ".#{_1}" } - parse_ann(file) - when *MassDirectoryExtraction::SEQ_EXT.map { ".#{_1}" } - parse_seq(file) - else - raise "unsupported file: #{file.name}" - end - end - - def parse_ann(file) - in_common = false - full_name = nil - email = nil - affiliation = nil - hold_date = nil - - file.fullpath.each_line chomp: true do |line| - break if full_name && email && affiliation && hold_date - - entry, _feature, _location, qualifier, value = line.split("\t") - - break if in_common && entry.present? - - in_common = entry == 'COMMON' if entry.present? - - next unless in_common + rescue ExtractionFile::ParseError => e + file.update!( + parsing: false, + parsed_data: nil, - case qualifier - when 'contact' - full_name = value - when 'email' - email = value - when 'institute' - affiliation = value - when 'hold_date' - hold_date = Date.strptime(value, '%Y%m%d').strftime('%Y-%m-%d') - else - # do nothing + _errors: [ + {id: e.id, value: e.value} + ] + ) end - end - - { - contactPerson: { - fullName: full_name, - email:, - affiliation: - }, - holdDate: hold_date - } - end - - def parse_seq(file) - count = 0 - buf = String.new(capacity: 1.megabyte) - bol = true - - file.fullpath.open 'rb' do |io| - while io.readpartial(1.megabyte, buf) - count += 1 if bol && buf.start_with?('>') - count += buf.scan(/[\r\n]>/).count - - bol = buf.end_with?("\r", "\n") - end - rescue EOFError - # done + extraction.update! state: 'fulfilled' end - - { - entriesCount: count - } end end diff --git a/backend/app/models/concerns/extraction_file.rb b/backend/app/models/concerns/extraction_file.rb index ecf2317b..6d983560 100644 --- a/backend/app/models/concerns/extraction_file.rb +++ b/backend/app/models/concerns/extraction_file.rb @@ -1,4 +1,99 @@ module ExtractionFile + class ParseError < StandardError + def initialize(id, value = nil) + @id = id + @value = value + end + + attr_reader :id, :value + end + def fullpath = extraction.working_dir.join(name) def size = fullpath.size + + def parse + case File.extname(name) + when *MassDirectoryExtraction::ANN_EXT.map { ".#{_1}" } + parse_ann + when *MassDirectoryExtraction::SEQ_EXT.map { ".#{_1}" } + parse_seq + else + raise "unsupported file: #{name}" + end + end + + private + + def parse_ann + in_common = false + full_name = nil + email = nil + affiliation = nil + hold_date = nil + + fullpath.each_line chomp: true do |line| + break if full_name && email && affiliation && hold_date + + entry, _feature, _location, qualifier, value = line.split("\t") + + break if in_common && entry.present? + + in_common = entry == 'COMMON' if entry.present? + + next unless in_common + + case qualifier + when 'contact' + full_name = value + when 'email' + email = value + when 'institute' + affiliation = value + when 'hold_date' + begin + hold_date = Date.strptime(value, '%Y%m%d').strftime('%Y-%m-%d') + rescue Date::Error + raise ParseError.new('annotation-file-parser.invalid-hold-date', value) + end + else + # do nothing + end + end + + raise ParseError.new('annotation-file-parser.missing-contact-person') if !full_name && !email && !affiliation + raise ParseError.new('annotation-file-parser.invalid-contact-person') if !full_name || !email || !affiliation + + { + contactPerson: { + fullName: full_name, + email:, + affiliation: + }, + + holdDate: hold_date + } + end + + def parse_seq + count = 0 + buf = String.new(capacity: 1.megabyte) + bol = true + + fullpath.open 'rb' do |io| + while io.readpartial(1.megabyte, buf) + count += 1 if bol && buf.start_with?('>') + count += buf.scan(/[\r\n]>/).count + + bol = buf.end_with?("\r", "\n") + end + rescue EOFError + # done + end + + raise ParseError.new('sequence-file-parser.no-entries') if count.zero? + + { + entriesCount: count + } + end end diff --git a/backend/app/models/dfast_extraction.rb b/backend/app/models/dfast_extraction.rb index 41f0e085..c444f625 100644 --- a/backend/app/models/dfast_extraction.rb +++ b/backend/app/models/dfast_extraction.rb @@ -1,6 +1,15 @@ require 'open-uri' class DfastExtraction < ApplicationRecord + class ExtractionError < StandardError + def initialize(id, **data) + @id = id + @data = data + end + + attr_reader :id, :data + end + belongs_to :user has_many :files, dependent: :destroy, class_name: 'DfastExtractionFile', foreign_key: :extraction_id diff --git a/backend/app/models/extraction_error.rb b/backend/app/models/extraction_error.rb deleted file mode 100644 index 351cfc2a..00000000 --- a/backend/app/models/extraction_error.rb +++ /dev/null @@ -1,8 +0,0 @@ -class ExtractionError < StandardError - def initialize(id, **data) - @id = id - @data = data - end - - attr_reader :id, :data -end diff --git a/backend/spec/factories/mass_directory_extractions.rb b/backend/spec/factories/mass_directory_extractions.rb new file mode 100644 index 00000000..19596a94 --- /dev/null +++ b/backend/spec/factories/mass_directory_extractions.rb @@ -0,0 +1,11 @@ +FactoryBot.define do + factory :mass_directory_extraction do + user + end + + factory :mass_directory_extraction_file do + extraction factory: :mass_directory_extraction + + parsing { false } + end +end diff --git a/backend/spec/factories/users.rb b/backend/spec/factories/users.rb index 990f3ce0..65107610 100644 --- a/backend/spec/factories/users.rb +++ b/backend/spec/factories/users.rb @@ -2,6 +2,16 @@ factory :user do sequence(:openid_sub) {|i| "user:#{i}" } + id_token {|user| + uid = user.openid_sub.sub(':', '_') + + { + sub: user.openid_sub, + preferred_username: uid, + email: "#{uid}@example.com" + } + } + trait :alice do id_token {|user| { diff --git a/backend/spec/jobs/extract_metadata_job_spec.rb b/backend/spec/jobs/extract_metadata_job_spec.rb new file mode 100644 index 00000000..bf0a8330 --- /dev/null +++ b/backend/spec/jobs/extract_metadata_job_spec.rb @@ -0,0 +1,147 @@ +require 'rails_helper' + +RSpec.describe ExtractMetadataJob, type: :job do + def write_file(path, content) + File.write File.join(ENV.fetch('MASS_DIR_PATH_TEMPLATE'), path), content + end + + let(:extraction) { extraction = create(:mass_directory_extraction) } + + describe 'ann' do + example 'ok' do + write_file 'foo.ann', <<~ANN + COMMON SUBMITTER contact Alice Liddell + email alice@example.com + institute Wonderland Inc. + DATE hold_date 20200102 + ANN + + ExtractMetadataJob.perform_now extraction + + expect(extraction.files.first).to have_attributes( + parsing: false, + + parsed_data: { + 'contactPerson' => { + 'fullName' => 'Alice Liddell', + 'email' => 'alice@example.com', + 'affiliation' => 'Wonderland Inc.' + }, + 'holdDate' => '2020-01-02' + }, + + _errors: [] + ) + end + + example 'empty' do + write_file 'foo.ann', '' + + ExtractMetadataJob.perform_now extraction + + expect(extraction.files.first).to have_attributes( + parsing: false, + parsed_data: nil, + + _errors: [ + {'id' => 'annotation-file-parser.missing-contact-person', 'value' => nil} + ] + ) + end + + example 'missing contact person' do + write_file 'foo.ann', <<~ANN + COMMON DATE hold_date 20231126 + ANN + + ExtractMetadataJob.perform_now extraction + + expect(extraction.files.first).to have_attributes( + parsing: false, + parsed_data: nil, + + _errors: [ + {'id' => 'annotation-file-parser.missing-contact-person', 'value' => nil} + ] + ) + end + + example 'invalid contact person' do + write_file 'foo.ann', <<~ANN + COMMON SUBMITTER contact Alice Liddell + ANN + + ExtractMetadataJob.perform_now extraction + + expect(extraction.files.first).to have_attributes( + parsing: false, + parsed_data: nil, + + _errors: [ + {'id' => 'annotation-file-parser.invalid-contact-person', 'value' => nil} + ] + ) + end + + example 'invalid hold_date' do + write_file 'foo.ann', <<~ANN + COMMON DATE hold_date foo + ANN + + ExtractMetadataJob.perform_now extraction + + expect(extraction.files.first).to have_attributes( + parsing: false, + parsed_data: nil, + + _errors: [ + {'id' => 'annotation-file-parser.invalid-hold-date', 'value' => 'foo'} + ] + ) + end + end + + describe 'seq' do + example 'ok' do + write_file 'foo.fasta', <<~SEQ + >CLN01 + ggacaggctgccgcaggagccaggccgggagcaggaagaggcttcgggggagccggagaa + ctgggccagatgcgcttcgtgggcgaagcctgaggaaaaagagagtgaggcaggagaatc + gcttgaaccccggaggcggaaccgcactccagcctgggcgacagagtgagactta + // + >CLN02 + ctcacacagatgcgcgcacaccagtggttgtaacagaagcctgaggtgcgctcgtggtca + gaagagggcatgcgcttcagtcgtgggcgaagcctgaggaaaaaatagtcattcatataa + atttgaacacacctgctgtggctgtaactctgagatgtgctaaataaaccctctt + // + SEQ + + ExtractMetadataJob.perform_now extraction + + expect(extraction.files.first).to have_attributes( + parsing: false, + + parsed_data: { + 'entriesCount' => 2 + }, + + _errors: [] + ) + end + + example 'empty' do + write_file 'foo.fasta', '' + + ExtractMetadataJob.perform_now extraction + + expect(extraction.files.first).to have_attributes( + parsing: false, + parsed_data: nil, + + _errors: [ + {'id' => 'sequence-file-parser.no-entries', 'value' => nil} + ] + ) + end + end +end diff --git a/backend/spec/support/env.rb b/backend/spec/support/env.rb index fdde38ad..ae16b886 100644 --- a/backend/spec/support/env.rb +++ b/backend/spec/support/env.rb @@ -1,18 +1,24 @@ RSpec.configure do |config| config.around do |example| - Dir.mktmpdir do |dir| - env = { - CURATOR_ML_ADDRESS: 'Admin ', - MAIL_ALLOWED_DOMAINS: nil, - MSSFORM_URL: 'http://mssform.example.com', - MSS_WORKING_LIST_SHEET_ID: 'SHEET_ID', - MSS_WORKING_LIST_SHEET_NAME: 'SHEET_NAME', - OPENID_CLIENT_ID: 'CLIENT_ID', - STAGE: nil, - SUBMISSIONS_DIR: dir - } + Dir.mktmpdir do |workdir| + Dir.mktmpdir do |mass_dir| + Dir.mktmpdir do |submissions_dir| + env = { + CURATOR_ML_ADDRESS: 'Admin ', + EXTRACTION_WORKDIR: workdir, + MAIL_ALLOWED_DOMAINS: nil, + MASS_DIR_PATH_TEMPLATE: mass_dir, + MSSFORM_URL: 'http://mssform.example.com', + MSS_WORKING_LIST_SHEET_ID: 'SHEET_ID', + MSS_WORKING_LIST_SHEET_NAME: 'SHEET_NAME', + OPENID_CLIENT_ID: 'CLIENT_ID', + STAGE: nil, + SUBMISSIONS_DIR: submissions_dir + } - ClimateControl.modify(env, &example) + ClimateControl.modify(env, &example) + end + end end end end diff --git a/frontend/public/workers/sequence-file-parser.js b/frontend/public/workers/sequence-file-parser.js index b7c5073a..7787b578 100644 --- a/frontend/public/workers/sequence-file-parser.js +++ b/frontend/public/workers/sequence-file-parser.js @@ -6,7 +6,7 @@ addEventListener('message', async ({data: {file}}) => { } catch (err) { console.error(err); - postMessage([err, null]); + postMessage([err.message, null]); } }); diff --git a/frontend/tests/unit/models/sequence-file-test.js b/frontend/tests/unit/models/sequence-file-test.js index 8561b5a6..5b8417a1 100644 --- a/frontend/tests/unit/models/sequence-file-test.js +++ b/frontend/tests/unit/models/sequence-file-test.js @@ -10,22 +10,34 @@ module('Unit | Model | sequence file', function(hooks) { for (const newline of ['\n', '\r\n', '\r']) { test(`parse (newline: ${JSON.stringify(newline)})`, async function(assert) { - const file = new File([outdent({newline})` ->CLN01 -ggacaggctgccgcaggagccaggccgggagcaggaagaggcttcgggggagccggagaa -ctgggccagatgcgcttcgtgggcgaagcctgaggaaaaagagagtgaggcaggagaatc -gcttgaaccccggaggcggaaccgcactccagcctgggcgacagagtgagactta -// ->CLN02 -ctcacacagatgcgcgcacaccagtggttgtaacagaagcctgaggtgcgctcgtggtca -gaagagggcatgcgcttcagtcgtgggcgaagcctgaggaaaaaatagtcattcatataa -atttgaacacacctgctgtggctgtaactctgagatgtgctaaataaaccctctt -// + const raw = new File([outdent({newline})` + >CLN01 + ggacaggctgccgcaggagccaggccgggagcaggaagaggcttcgggggagccggagaa + ctgggccagatgcgcttcgtgggcgaagcctgaggaaaaagagagtgaggcaggagaatc + gcttgaaccccggaggcggaaccgcactccagcctgggcgacagagtgagactta + // + >CLN02 + ctcacacagatgcgcgcacaccagtggttgtaacagaagcctgaggtgcgctcgtggtca + gaagagggcatgcgcttcagtcgtgggcgaagcctgaggaaaaaatagtcattcatataa + atttgaacacacctgctgtggctgtaactctgagatgtgctaaataaaccctctt + // `], 'foo.fasta'); - const {entriesCount} = await new SequenceFile(file).parse(); + const file = new SequenceFile(raw); + const {entriesCount} = await file.parse(); assert.strictEqual(entriesCount, 2); }); } + + test('empty', async function(assert) { + const raw = new File([''], 'foo.fasta'); + + const file = new SequenceFile(raw); + await file.parse(); + + assert.deepEqual(file.errors, [ + {id: 'sequence-file-parser.no-entries', value: undefined} + ]); + }); });