From 1b6ea795f4b77ffb3e6cb2133dc46dfaa410f388 Mon Sep 17 00:00:00 2001
From: Keita Urashima <ursm@ursm.jp>
Date: Sun, 7 Jan 2024 16:20:38 +0900
Subject: [PATCH] Fix a bug that prevents parsing errors in certain situations

---
 backend/app/jobs/extract_metadata_job.rb      |  95 ++---------
 .../app/models/concerns/extraction_file.rb    |  95 +++++++++++
 backend/app/models/dfast_extraction.rb        |   9 ++
 backend/app/models/extraction_error.rb        |   8 -
 .../factories/mass_directory_extractions.rb   |  11 ++
 backend/spec/factories/users.rb               |  10 ++
 .../spec/jobs/extract_metadata_job_spec.rb    | 147 ++++++++++++++++++
 backend/spec/support/env.rb                   |  30 ++--
 .../public/workers/sequence-file-parser.js    |   2 +-
 .../tests/unit/models/sequence-file-test.js   |  36 +++--
 10 files changed, 326 insertions(+), 117 deletions(-)
 delete mode 100644 backend/app/models/extraction_error.rb
 create mode 100644 backend/spec/factories/mass_directory_extractions.rb
 create mode 100644 backend/spec/jobs/extract_metadata_job_spec.rb

diff --git a/backend/app/jobs/extract_metadata_job.rb b/backend/app/jobs/extract_metadata_job.rb
index cd80d87b..d3ffdde5 100644
--- a/backend/app/jobs/extract_metadata_job.rb
+++ b/backend/app/jobs/extract_metadata_job.rb
@@ -3,7 +3,7 @@ def perform(extraction)
     ActiveRecord::Base.transaction do
       begin
         extraction.prepare_files
-      rescue ExtractionError => e
+      rescue DfastExtraction::ExtractionError => e
         extraction.update! state: 'rejected', error: {id: e.id, **e.data}
         return
       end
@@ -11,94 +11,21 @@ def perform(extraction)
       extraction.files.find_each do |file|
         file.update!(
           parsing:     false,
-          parsed_data: parse(file),
+          parsed_data: file.parse,
           _errors:     []
         )
-      end
-
-      extraction.update! state: 'fulfilled'
-    end
-  rescue => e
-    Rails.logger.error '******'
-    Rails.logger.error e
-
-    raise
-  end
-
-  private
-
-  def parse(file)
-    case File.extname(file.name)
-    when *MassDirectoryExtraction::ANN_EXT.map { ".#{_1}" }
-      parse_ann(file)
-    when *MassDirectoryExtraction::SEQ_EXT.map { ".#{_1}" }
-      parse_seq(file)
-    else
-      raise "unsupported file: #{file.name}"
-    end
-  end
-
-  def parse_ann(file)
-    in_common   = false
-    full_name   = nil
-    email       = nil
-    affiliation = nil
-    hold_date   = nil
-
-    file.fullpath.each_line chomp: true do |line|
-      break if full_name && email && affiliation && hold_date
-
-      entry, _feature, _location, qualifier, value = line.split("\t")
-
-      break if in_common && entry.present?
-
-      in_common = entry == 'COMMON' if entry.present?
-
-      next unless in_common
+      rescue ExtractionFile::ParseError => e
+        file.update!(
+          parsing:     false,
+          parsed_data: nil,
 
-      case qualifier
-      when 'contact'
-        full_name = value
-      when 'email'
-        email = value
-      when 'institute'
-        affiliation = value
-      when 'hold_date'
-        hold_date = Date.strptime(value, '%Y%m%d').strftime('%Y-%m-%d')
-      else
-        # do nothing
+          _errors: [
+            {id: e.id, value: e.value}
+          ]
+        )
       end
-    end
-
-    {
-      contactPerson: {
-        fullName: full_name,
-        email:,
-        affiliation:
-      },
 
-      holdDate: hold_date
-    }
-  end
-
-  def parse_seq(file)
-    count = 0
-    buf   = String.new(capacity: 1.megabyte)
-    bol   = true
-
-    file.fullpath.open 'rb' do |io|
-      while io.readpartial(1.megabyte, buf)
-        count += 1 if bol && buf.start_with?('>')
-        count += buf.scan(/[\r\n]>/).count
-
-        bol = buf.end_with?("\r", "\n")
-      end
-    rescue EOFError
-      # done
+      extraction.update! state: 'fulfilled'
     end
-
-    {
-      entriesCount: count
-    }
   end
 end
diff --git a/backend/app/models/concerns/extraction_file.rb b/backend/app/models/concerns/extraction_file.rb
index ecf2317b..6d983560 100644
--- a/backend/app/models/concerns/extraction_file.rb
+++ b/backend/app/models/concerns/extraction_file.rb
@@ -1,4 +1,99 @@
 module ExtractionFile
+  class ParseError < StandardError
+    def initialize(id, value = nil)
+      @id    = id
+      @value = value
+    end
+
+    attr_reader :id, :value
+  end
+
   def fullpath = extraction.working_dir.join(name)
   def size     = fullpath.size
+
+  def parse
+    case File.extname(name)
+    when *MassDirectoryExtraction::ANN_EXT.map { ".#{_1}" }
+      parse_ann
+    when *MassDirectoryExtraction::SEQ_EXT.map { ".#{_1}" }
+      parse_seq
+    else
+      raise "unsupported file: #{name}"
+    end
+  end
+
+  private
+
+  def parse_ann
+    in_common   = false
+    full_name   = nil
+    email       = nil
+    affiliation = nil
+    hold_date   = nil
+
+    fullpath.each_line chomp: true do |line|
+      break if full_name && email && affiliation && hold_date
+
+      entry, _feature, _location, qualifier, value = line.split("\t")
+
+      break if in_common && entry.present?
+
+      in_common = entry == 'COMMON' if entry.present?
+
+      next unless in_common
+
+      case qualifier
+      when 'contact'
+        full_name = value
+      when 'email'
+        email = value
+      when 'institute'
+        affiliation = value
+      when 'hold_date'
+        begin
+          hold_date = Date.strptime(value, '%Y%m%d').strftime('%Y-%m-%d')
+        rescue Date::Error
+          raise ParseError.new('annotation-file-parser.invalid-hold-date', value)
+        end
+      else
+        # do nothing
+      end
+    end
+
+    raise ParseError.new('annotation-file-parser.missing-contact-person') if !full_name && !email && !affiliation
+    raise ParseError.new('annotation-file-parser.invalid-contact-person') if !full_name || !email || !affiliation
+
+    {
+      contactPerson: {
+        fullName: full_name,
+        email:,
+        affiliation:
+      },
+
+      holdDate: hold_date
+    }
+  end
+
+  def parse_seq
+    count = 0
+    buf   = String.new(capacity: 1.megabyte)
+    bol   = true
+
+    fullpath.open 'rb' do |io|
+      while io.readpartial(1.megabyte, buf)
+        count += 1 if bol && buf.start_with?('>')
+        count += buf.scan(/[\r\n]>/).count
+
+        bol = buf.end_with?("\r", "\n")
+      end
+    rescue EOFError
+      # done
+    end
+
+    raise ParseError.new('sequence-file-parser.no-entries') if count.zero?
+
+    {
+      entriesCount: count
+    }
+  end
 end
diff --git a/backend/app/models/dfast_extraction.rb b/backend/app/models/dfast_extraction.rb
index 41f0e085..c444f625 100644
--- a/backend/app/models/dfast_extraction.rb
+++ b/backend/app/models/dfast_extraction.rb
@@ -1,6 +1,15 @@
 require 'open-uri'
 
 class DfastExtraction < ApplicationRecord
+  class ExtractionError < StandardError
+    def initialize(id, **data)
+      @id   = id
+      @data = data
+    end
+
+    attr_reader :id, :data
+  end
+
   belongs_to :user
 
   has_many :files, dependent: :destroy, class_name: 'DfastExtractionFile', foreign_key: :extraction_id
diff --git a/backend/app/models/extraction_error.rb b/backend/app/models/extraction_error.rb
deleted file mode 100644
index 351cfc2a..00000000
--- a/backend/app/models/extraction_error.rb
+++ /dev/null
@@ -1,8 +0,0 @@
-class ExtractionError < StandardError
-  def initialize(id, **data)
-    @id   = id
-    @data = data
-  end
-
-  attr_reader :id, :data
-end
diff --git a/backend/spec/factories/mass_directory_extractions.rb b/backend/spec/factories/mass_directory_extractions.rb
new file mode 100644
index 00000000..19596a94
--- /dev/null
+++ b/backend/spec/factories/mass_directory_extractions.rb
@@ -0,0 +1,11 @@
+FactoryBot.define do
+  factory :mass_directory_extraction do
+    user
+  end
+
+  factory :mass_directory_extraction_file do
+    extraction factory: :mass_directory_extraction
+
+    parsing { false }
+  end
+end
diff --git a/backend/spec/factories/users.rb b/backend/spec/factories/users.rb
index 990f3ce0..65107610 100644
--- a/backend/spec/factories/users.rb
+++ b/backend/spec/factories/users.rb
@@ -2,6 +2,16 @@
   factory :user do
     sequence(:openid_sub) {|i| "user:#{i}" }
 
+    id_token {|user|
+      uid = user.openid_sub.sub(':', '_')
+
+      {
+        sub:                user.openid_sub,
+        preferred_username: uid,
+        email:              "#{uid}@example.com"
+      }
+    }
+
     trait :alice do
       id_token {|user|
         {
diff --git a/backend/spec/jobs/extract_metadata_job_spec.rb b/backend/spec/jobs/extract_metadata_job_spec.rb
new file mode 100644
index 00000000..bf0a8330
--- /dev/null
+++ b/backend/spec/jobs/extract_metadata_job_spec.rb
@@ -0,0 +1,147 @@
+require 'rails_helper'
+
+RSpec.describe ExtractMetadataJob, type: :job do
+  def write_file(path, content)
+    File.write File.join(ENV.fetch('MASS_DIR_PATH_TEMPLATE'), path), content
+  end
+
+  let(:extraction) { extraction = create(:mass_directory_extraction) }
+
+  describe 'ann' do
+    example 'ok' do
+      write_file 'foo.ann', <<~ANN
+        COMMON	SUBMITTER		contact	Alice Liddell
+        			email	alice@example.com
+        			institute	Wonderland Inc.
+        	DATE		hold_date	20200102
+      ANN
+
+      ExtractMetadataJob.perform_now extraction
+
+      expect(extraction.files.first).to have_attributes(
+        parsing: false,
+
+        parsed_data: {
+          'contactPerson' => {
+            'fullName'    => 'Alice Liddell',
+            'email'       => 'alice@example.com',
+            'affiliation' => 'Wonderland Inc.'
+          },
+          'holdDate' => '2020-01-02'
+        },
+
+        _errors: []
+      )
+    end
+
+    example 'empty' do
+      write_file 'foo.ann', ''
+
+      ExtractMetadataJob.perform_now extraction
+
+      expect(extraction.files.first).to have_attributes(
+        parsing: false,
+        parsed_data: nil,
+
+        _errors: [
+          {'id' => 'annotation-file-parser.missing-contact-person', 'value' => nil}
+        ]
+      )
+    end
+
+    example 'missing contact person' do
+      write_file 'foo.ann', <<~ANN
+        COMMON	DATE		hold_date	20231126
+      ANN
+
+      ExtractMetadataJob.perform_now extraction
+
+      expect(extraction.files.first).to have_attributes(
+        parsing: false,
+        parsed_data: nil,
+
+        _errors: [
+          {'id' => 'annotation-file-parser.missing-contact-person', 'value' => nil}
+        ]
+      )
+    end
+
+    example 'invalid contact person' do
+      write_file 'foo.ann', <<~ANN
+        COMMON	SUBMITTER		contact	Alice Liddell
+      ANN
+
+      ExtractMetadataJob.perform_now extraction
+
+      expect(extraction.files.first).to have_attributes(
+        parsing: false,
+        parsed_data: nil,
+
+        _errors: [
+          {'id' => 'annotation-file-parser.invalid-contact-person', 'value' => nil}
+        ]
+      )
+    end
+
+    example 'invalid hold_date' do
+      write_file 'foo.ann', <<~ANN
+        COMMON	DATE		hold_date	foo
+      ANN
+
+      ExtractMetadataJob.perform_now extraction
+
+      expect(extraction.files.first).to have_attributes(
+        parsing: false,
+        parsed_data: nil,
+
+        _errors: [
+          {'id' => 'annotation-file-parser.invalid-hold-date', 'value' => 'foo'}
+        ]
+      )
+    end
+  end
+
+  describe 'seq' do
+    example 'ok' do
+      write_file 'foo.fasta', <<~SEQ
+        >CLN01
+        ggacaggctgccgcaggagccaggccgggagcaggaagaggcttcgggggagccggagaa
+        ctgggccagatgcgcttcgtgggcgaagcctgaggaaaaagagagtgaggcaggagaatc
+        gcttgaaccccggaggcggaaccgcactccagcctgggcgacagagtgagactta
+        //
+        >CLN02
+        ctcacacagatgcgcgcacaccagtggttgtaacagaagcctgaggtgcgctcgtggtca
+        gaagagggcatgcgcttcagtcgtgggcgaagcctgaggaaaaaatagtcattcatataa
+        atttgaacacacctgctgtggctgtaactctgagatgtgctaaataaaccctctt
+        //
+      SEQ
+
+      ExtractMetadataJob.perform_now extraction
+
+      expect(extraction.files.first).to have_attributes(
+        parsing: false,
+
+        parsed_data: {
+          'entriesCount' => 2
+        },
+
+        _errors: []
+      )
+    end
+
+    example 'empty' do
+      write_file 'foo.fasta', ''
+
+      ExtractMetadataJob.perform_now extraction
+
+      expect(extraction.files.first).to have_attributes(
+        parsing:     false,
+        parsed_data: nil,
+
+        _errors: [
+          {'id' => 'sequence-file-parser.no-entries', 'value' => nil}
+        ]
+      )
+    end
+  end
+end
diff --git a/backend/spec/support/env.rb b/backend/spec/support/env.rb
index fdde38ad..ae16b886 100644
--- a/backend/spec/support/env.rb
+++ b/backend/spec/support/env.rb
@@ -1,18 +1,24 @@
 RSpec.configure do |config|
   config.around do |example|
-    Dir.mktmpdir do |dir|
-      env = {
-        CURATOR_ML_ADDRESS:          'Admin <mssform@example.com>',
-        MAIL_ALLOWED_DOMAINS:        nil,
-        MSSFORM_URL:                 'http://mssform.example.com',
-        MSS_WORKING_LIST_SHEET_ID:   'SHEET_ID',
-        MSS_WORKING_LIST_SHEET_NAME: 'SHEET_NAME',
-        OPENID_CLIENT_ID:            'CLIENT_ID',
-        STAGE:                       nil,
-        SUBMISSIONS_DIR:             dir
-      }
+    Dir.mktmpdir do |workdir|
+      Dir.mktmpdir do |mass_dir|
+        Dir.mktmpdir do |submissions_dir|
+          env = {
+            CURATOR_ML_ADDRESS:          'Admin <mssform@example.com>',
+            EXTRACTION_WORKDIR:          workdir,
+            MAIL_ALLOWED_DOMAINS:        nil,
+            MASS_DIR_PATH_TEMPLATE:      mass_dir,
+            MSSFORM_URL:                 'http://mssform.example.com',
+            MSS_WORKING_LIST_SHEET_ID:   'SHEET_ID',
+            MSS_WORKING_LIST_SHEET_NAME: 'SHEET_NAME',
+            OPENID_CLIENT_ID:            'CLIENT_ID',
+            STAGE:                       nil,
+            SUBMISSIONS_DIR:             submissions_dir
+          }
 
-      ClimateControl.modify(env, &example)
+          ClimateControl.modify(env, &example)
+        end
+      end
     end
   end
 end
diff --git a/frontend/public/workers/sequence-file-parser.js b/frontend/public/workers/sequence-file-parser.js
index b7c5073a..7787b578 100644
--- a/frontend/public/workers/sequence-file-parser.js
+++ b/frontend/public/workers/sequence-file-parser.js
@@ -6,7 +6,7 @@ addEventListener('message', async ({data: {file}}) => {
   } catch (err) {
     console.error(err);
 
-    postMessage([err, null]);
+    postMessage([err.message, null]);
   }
 });
 
diff --git a/frontend/tests/unit/models/sequence-file-test.js b/frontend/tests/unit/models/sequence-file-test.js
index 8561b5a6..5b8417a1 100644
--- a/frontend/tests/unit/models/sequence-file-test.js
+++ b/frontend/tests/unit/models/sequence-file-test.js
@@ -10,22 +10,34 @@ module('Unit | Model | sequence file', function(hooks) {
 
   for (const newline of ['\n', '\r\n', '\r']) {
     test(`parse (newline: ${JSON.stringify(newline)})`, async function(assert) {
-      const file = new File([outdent({newline})`
->CLN01
-ggacaggctgccgcaggagccaggccgggagcaggaagaggcttcgggggagccggagaa
-ctgggccagatgcgcttcgtgggcgaagcctgaggaaaaagagagtgaggcaggagaatc
-gcttgaaccccggaggcggaaccgcactccagcctgggcgacagagtgagactta
-//
->CLN02
-ctcacacagatgcgcgcacaccagtggttgtaacagaagcctgaggtgcgctcgtggtca
-gaagagggcatgcgcttcagtcgtgggcgaagcctgaggaaaaaatagtcattcatataa
-atttgaacacacctgctgtggctgtaactctgagatgtgctaaataaaccctctt
-//
+      const raw = new File([outdent({newline})`
+        >CLN01
+        ggacaggctgccgcaggagccaggccgggagcaggaagaggcttcgggggagccggagaa
+        ctgggccagatgcgcttcgtgggcgaagcctgaggaaaaagagagtgaggcaggagaatc
+        gcttgaaccccggaggcggaaccgcactccagcctgggcgacagagtgagactta
+        //
+        >CLN02
+        ctcacacagatgcgcgcacaccagtggttgtaacagaagcctgaggtgcgctcgtggtca
+        gaagagggcatgcgcttcagtcgtgggcgaagcctgaggaaaaaatagtcattcatataa
+        atttgaacacacctgctgtggctgtaactctgagatgtgctaaataaaccctctt
+        //
       `], 'foo.fasta');
 
-      const {entriesCount} = await new SequenceFile(file).parse();
+      const file           = new SequenceFile(raw);
+      const {entriesCount} = await file.parse();
 
       assert.strictEqual(entriesCount, 2);
     });
   }
+
+  test('empty', async function(assert) {
+    const raw = new File([''], 'foo.fasta');
+
+    const file = new SequenceFile(raw);
+    await file.parse();
+
+    assert.deepEqual(file.errors, [
+      {id: 'sequence-file-parser.no-entries', value: undefined}
+    ]);
+  });
 });