From 74b729ede19c665c2c18907b1e018be03e8ac0a1 Mon Sep 17 00:00:00 2001 From: Jeremy Friesen Date: Tue, 6 Jun 2023 14:32:47 -0400 Subject: [PATCH] =?UTF-8?q?=F0=9F=8E=81=20Add=20ability=20to=20find=20page?= =?UTF-8?q?s=20split=20from=20a=20PDF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit leverages the conventions established in the DerivativeRodeo around where we're writing split pages and their derivatives. The inline comments that are written/amended with this PR should be read closely for clarity and intention. Related to: - https://github.com/scientist-softserv/iiif_print/issues/251 - https://github.com/scientist-softserv/derivative_rodeo/pull/48 Co-authored-by: Kirk Wang --- .../iiif_print/derivative_rodeo_service.rb | 61 ++++++++----------- iiif_print.gemspec | 2 +- lib/iiif_print.rb | 19 ++++++ .../derivative_rodeo_splitter_spec.rb | 2 +- .../derivative_rodeo_service_spec.rb | 17 +++--- 5 files changed, 56 insertions(+), 45 deletions(-) diff --git a/app/services/iiif_print/derivative_rodeo_service.rb b/app/services/iiif_print/derivative_rodeo_service.rb index 448037e9..569467cd 100644 --- a/app/services/iiif_print/derivative_rodeo_service.rb +++ b/app/services/iiif_print/derivative_rodeo_service.rb @@ -68,19 +68,31 @@ class DerivativeRodeoService # @return [String] # rubocop:disable Metrics/MethodLength def self.derivative_rodeo_uri(file_set:, filename: nil, extension: nil) - parent = IiifPrint.parent_for(file_set) - raise IiifPrint::DataError, "Parent not found for #{file_set.class} ID=#{file_set.id}" unless parent - - dirname = parent.public_send(parent_work_identifier_property_name) + # In the case of a page split from a PDF, we need to know the grandparent's identifier to + # find the file(s) in the DerivativeRodeo. + ancestor_method = if DerivativeRodeo::Generators::PdfSplitGenerator.filename_for_a_derived_page_from_a_pdf?(filename: filename) + :grandparent_for + else + # When not split from a PDF, we can use the parent to find the identifier. + :parent_for + end + + ancestor = IiifPrint.public_send(ancestor_method, file_set) + raise IiifPrint::DataError, "#{ancestor_method} not found for #{file_set.class} ID=#{file_set.id}" unless ancestor + + # By convention, we're putting the files of a work in a "directory" that is based on some + # identifying value (e.g. an object's AARK ID) + dirname = ancestor.public_send(parent_work_identifier_property_name) # TODO: This is a hack that knows about the inner workings of Hydra::Works, but for # expendiency, I'm using it. See # https://github.com/samvera/hydra-works/blob/c9b9dd0cf11de671920ba0a7161db68ccf9b7f6d/lib/hydra/works/services/add_file_to_file_set.rb#L49-L53 - - # These filename, basename, extension name is here to allow for us to take an original file - # and see if we've pre-processed the derivative file. In the pre-processed derivative case, - # that would mean we have a different extension than the original. filename ||= Hydra::Works::DetermineOriginalName.call(file_set.original_file) + + # The aforementioned filename and the following basename and extension are here to allow for + # us to take an original file and see if we've pre-processed the derivative file. In the + # pre-processed derivative case, that would mean we have a different extension than the + # original. extension ||= File.extname(filename) extension = ".#{extension}" unless extension.start_with?(".") basename = File.basename(filename, extension) @@ -89,27 +101,7 @@ def self.derivative_rodeo_uri(file_set:, filename: nil, extension: nil) # to "validate" it in another step. location = DerivativeRodeo::StorageLocations::BaseLocation.load_location(preprocessed_location_adapter_name) - # Is the given filename one that is of the format of a file that was split off from a PDF. - # - # In this case the basename will include the "filename--page-\d.extension" type format where - # "\d" is a digit. - if DerivativeRodeo::Generators::PdfSplitGenerator.filename_for_a_derived_page_from_a_pdf?(filename: filename) - # Note in this case the basename will have a suffix of "--page-" - # - # https://github.com/scientist-softserv/derivative_rodeo/blob/de8ab3993cc9d8719f70c6e990867ceb37d1dfd8/lib/derivative_rodeo/generators/pdf_split_generator.rb#L19-L56 - parent_pdf_file_basename = basename.sub(%r{--page-\d+$}) - page_basename = basename - File.join(location.adapter_prefix, dirname, parent_pdf_file_basename, "pages", "#{page_basename}#{extension}") - else - # TODO: This is based on the provided output template in - # https://github.com/scientist-softserv/space_stone-serverless/blob/0dbe2b6fa13b9f4bf8b9580ec14d0af5c98e2b00/awslambda/bin/sample_post.rb#L1 - # and is very much hard-coded. We likely want to "store" the template in a common place for - # the application. - # - # s3://s3-antics/:aark_id/:file_name_with_extension - # s3://s3-antics/12345/hello-world.pdf - File.join(location.adapter_prefix, dirname, "#{basename}#{extension}") - end + File.join(location.adapter_prefix, dirname, "#{basename}#{extension}") end # rubocop:enable Metrics/MethodLength @@ -184,12 +176,11 @@ def input_uri end def in_the_rodeo? - # We can assume that we are not going to process a supported mime type; and there is a cost - # for looking in the rodeo. - return false unless supported_mime_types.include?(mime_type) - - location = DerivativeRodeo::StorageLocations::BaseLocation.from_uri(input_uri) - location.exist? + # We can assume that we are not going to have pre-processed an unsupported mime type. We + # could check if the original file is in the rodeo, but the way it's designed thee rodeo is + # capable of generating all of the enumerated derivatives (see + # .named_derivatives_and_generators_by_type) for the supported mime type. + supported_mime_types.include?(mime_type) end end end diff --git a/iiif_print.gemspec b/iiif_print.gemspec index 7b142a14..5026263e 100644 --- a/iiif_print.gemspec +++ b/iiif_print.gemspec @@ -23,7 +23,7 @@ SUMMARY spec.files = `git ls-files`.split($OUTPUT_RECORD_SEPARATOR) spec.test_files = spec.files.grep(%r{^(test|spec|features)/}) spec.add_dependency 'blacklight_iiif_search', '~> 1.0' - spec.add_dependency 'derivative-rodeo', "~> 0.3" + spec.add_dependency 'derivative-rodeo', "~> 0.4" spec.add_dependency 'dry-monads', '~> 1.4.0' spec.add_dependency 'hyrax', '>= 2.5', '< 4' spec.add_dependency 'nokogiri', '>=1.13.2' diff --git a/lib/iiif_print.rb b/lib/iiif_print.rb index a9d164ea..5f3cddf3 100644 --- a/lib/iiif_print.rb +++ b/lib/iiif_print.rb @@ -41,14 +41,33 @@ def self.config(&block) end ## + # Return the immediate parent of the given :file_set. + # # @param file_set [FileSet] # @return [#work?, Hydra::PCDM::Work] + # @return [NilClass] when no parent is found. def self.parent_for(file_set) # fallback to Fedora-stored relationships if work's aggregation of # file set is not indexed in Solr file_set.parent || file_set.member_of.find(&:work?) end + ## + # Return the parent's parent of the given :file_set. + # + # @param file_set [FileSet] + # @return [#work?, Hydra::PCDM::Work] + # @return [NilClass] when no grand parent is found. + def self.grandparent_for(file_set) + parent = parent_for(file_set) + # HACK: This is an assumption about the file_set structure, namely that an image page split from + # a PDF is part of a file set that is a child of a work that is a child of a single work. That + # is, it only has one grand parent. Which is a reasonable assumption for IIIF Print but is not + # valid when extended beyond IIIF Print. That is GenericWork does not have a parent method but + # does have a parents method. + parent&.parents&.first || parent&.member_of&.find(:work?) + end + DEFAULT_MODEL_CONFIGURATION = { # Split a PDF into individual page images and create a new child work for each image. pdf_splitter_job: IiifPrint::Jobs::ChildWorksFromPdfJob, diff --git a/spec/iiif_print/split_pdfs/derivative_rodeo_splitter_spec.rb b/spec/iiif_print/split_pdfs/derivative_rodeo_splitter_spec.rb index 46333f37..ac688901 100644 --- a/spec/iiif_print/split_pdfs/derivative_rodeo_splitter_spec.rb +++ b/spec/iiif_print/split_pdfs/derivative_rodeo_splitter_spec.rb @@ -3,7 +3,7 @@ require 'spec_helper' RSpec.describe IiifPrint::SplitPdfs::DerivativeRodeoSplitter do - let(:path) { nil } + let(:path) { __FILE__ } let(:work) { double(MyWork, aark_id: '12345') } let(:file_set) { FileSet.new.tap { |fs| fs.save!(validate: false) } } diff --git a/spec/services/iiif_print/derivative_rodeo_service_spec.rb b/spec/services/iiif_print/derivative_rodeo_service_spec.rb index d5acf79a..214d7e24 100644 --- a/spec/services/iiif_print/derivative_rodeo_service_spec.rb +++ b/spec/services/iiif_print/derivative_rodeo_service_spec.rb @@ -8,6 +8,14 @@ let(:generator) { DerivativeRodeo::Generators::CopyGenerator } let(:output_extension) { "rb" } + before do + allow(file_set).to receive(:parent).and_return(work) + + # TODO: This is a hack that leverages the internals oof Hydra::Works; not excited about it but + # this part is only one piece of the over all integration. + allow(file_set).to receive(:original_file).and_return(double(original_filename: __FILE__)) + end + let(:instance) { described_class.new(file_set) } subject(:klass) { described_class } @@ -28,7 +36,7 @@ end describe '.derivative_rodeo_uri' do - subject { described_class.derivative_rodeo_uri(file_set: file_set) } + subject { described_class.derivative_rodeo_uri(file_set: file_set, filename: __FILE__) } context 'when the file_set does not have a parent' do xit 'is expected to raise an error' do @@ -37,13 +45,6 @@ end context 'when the file_set has a parent' do - before do - allow(file_set).to receive(:parent).and_return(work) - # TODO: This is a hack that leverages the internals oof Hydra::Works; not excited about it but - # this part is only one piece of the over all integration. - allow(file_set).to receive(:original_file).and_return(double(original_filename: __FILE__)) - end - it { is_expected.to start_with("#{described_class.preprocessed_location_adapter_name}://") } it { is_expected.to end_with(File.basename(__FILE__)) } end