♻️ Flattening assumptive dir structure for PDFs
Yes, it would be nice to have sub-directories for the pages ripped from a
PDF.  However, that aspirational state complicates the implementation of
the DerivativeRodeo; by moving from a tail glob to a regular expression, we
create a more powerful mechanism for finding files.
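
For illustration only (the directory, basename, and extension below are
hypothetical), the shift amounts to roughly the following:

    # Before: assume a "<basename>/pages/" sub-directory and glob inside it.
    tail_glob = "hello-world/pages/*.tiff"
    Dir.glob(File.join("/derivatives/ocr_color", tail_glob))

    # After: match flattened page files directly in the file's own directory.
    tail_regexp = %r{hello-world--page-\d+\.tiff$}
    Dir.glob(File.join("/derivatives/ocr_color", "*")).select { |path| tail_regexp.match(path) }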

These changes also surfaced a few implementation bugs (namely, ensuring
that the newly renamed function returns the expected value).

In IIIF Print we still need to consider how to find the derivatives of an
original PDF's child pages.  That is, however, not a problem for this
repository.

Related to:

- scientist-softserv/iiif_print#251

Co-authored-by: Kirk Wang <[email protected]>
jeremyf and kirkkwang committed Jun 6, 2023
1 parent dacb533 commit eae0bb2
Showing 8 changed files with 49 additions and 50 deletions.
1 change: 0 additions & 1 deletion lib/derivative_rodeo/generators/base_generator.rb
@@ -39,7 +39,6 @@ class BaseGenerator
# {Services::ConvertUriViaTemplateService} with the given
# :preprocessed_location_template.
def initialize(input_uris:, output_location_template:, preprocessed_location_template: nil)
- # NOTE: Are we using this preprocessed_location_template? Wondering?
@input_uris = Array.wrap(input_uris)
@output_location_template = output_location_template
@preprocessed_location_template = preprocessed_location_template
32 changes: 17 additions & 15 deletions lib/derivative_rodeo/generators/pdf_split_generator.rb
@@ -52,7 +52,7 @@ def self.filename_for_a_derived_page_from_a_pdf?(filename:, extension: nil)
# @see #existing_page_locations
# @see .filename_for_a_derived_page_from_a_pdf?
def image_file_basename_template(basename:)
- "#{basename}/pages/#{basename}--page-%d.#{output_extension}"
+ "#{basename}--page-%d.#{output_extension}"
end

##
@@ -62,21 +62,21 @@ def image_file_basename_template(basename:)
# @param input_location [StorageLocations::BaseLocation]
#
# @return [Enumerable<StorageLocations::BaseLocation>] the files at the given :input_location
- #   with :tail_glob.
+ #   with :tail_regexp.
#
# @note There is relation to {Generators::BaseGenerator#destination} and this method.
#
# @note The tail_glob is in relation to the {#image_file_basename_template}
def existing_page_locations(input_location:)
# See image_file_basename_template
- tail_glob = "#{input_location.file_basename}/pages/*.#{output_extension}"
+ tail_regexp = %r{#{input_location.file_basename}--page-\d+\.#{output_extension}$}

- output_locations = input_location.derived_file_from(template: output_location_template).globbed_tail_locations(tail_glob: tail_glob)
+ output_locations = input_location.derived_file_from(template: output_location_template).matching_locations_in_file_dir(tail_regexp: tail_regexp)
return output_locations if output_locations.count.positive?

return [] if preprocessed_location_template.blank?

- input_location.derived_file_from(template: preprocessed_location_template).globbed_tail_loations(tail_glob: tail_glob)
+ input_location.derived_file_from(template: preprocessed_location_template).globbed_tail_loations(tail_regexp: tail_regexp)
end

##
@@ -101,20 +101,22 @@ def existing_page_locations(input_location:)
def with_each_requisite_location_and_tmp_file_path
input_files.each do |input_location|
input_location.with_existing_tmp_path do |input_tmp_file_path|
- ## We want a single call for a directory listing of the image_file_basename_template
- generated_files = existing_page_locations(input_location: input_location)
+ existing_locations = existing_page_locations(input_location: input_location)

- if generated_files.count.zero?
- generated_files = Services::PdfSplitter.call(
+ if existing_locations.count.positive?
+ existing_locations.each do |location|
+ yield(location, location.file_path)
+ end
+ else
+ # We're going to need to create the files and "cast" them to locations.
+ Services::PdfSplitter.call(
input_tmp_file_path,
image_extension: output_extension,
image_file_basename_template: image_file_basename_template(basename: input_location.file_basename)
- )
- end
-
- generated_files.each do |image_path|
- image_location = StorageLocations::FileLocation.new("file://#{image_path}")
- yield(image_location, image_path)
+ ).each do |image_path|
+ image_location = StorageLocations::FileLocation.new("file://#{image_path}")
+ yield(image_location, image_path)
+ end
end
end
end
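
A minimal sketch of the naming convention in play here (the "hello-world"
basename and TIFF extension are assumptions for illustration; the %d is
presumably filled in with the page number by the splitter):

    # image_file_basename_template produces a sprintf-style template...
    template = "hello-world--page-%d.tiff"
    format(template, 3) # => "hello-world--page-3.tiff"

    # ...and existing_page_locations later recognizes those flattened names.
    %r{hello-world--page-\d+\.tiff$}.match?("hello-world--page-3.tiff") # => true
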
8 changes: 4 additions & 4 deletions lib/derivative_rodeo/storage_locations/base_location.rb
@@ -210,18 +210,18 @@ def derived_file_from(template:)

##
# When you have a known location and want to check for files that are within that location,
- # use the {#globbed_tail_locations} method. In the case of {Generators::PdfSplitGenerator} we
+ # use the {#matching_locations_in_file_dir} method. In the case of {Generators::PdfSplitGenerator} we
# need to know the path to all of the image files we "split" off of the given PDF.
#
# We can use the :file_path as the prefix the given :tail_glob as the suffix for a "fully
# qualified" Dir.glob type search.
#
- # @param tail_glob [String]
+ # @param tail_regexp [Regexp]
#
# @return [Enumerable<StorageLocations::BaseLocation>] the locations of the files; an empty
# array when there are none.
- def globbed_tail_locations(tail_glob:)
- raise NotImplementedError, "#{self.class}#globbed_locations"
+ def matching_locations_in_file_dir(tail_regexp:)
+ raise NotImplementedError, "#{self.class}#matching_locations_in_file_dir"
end

##
12 changes: 10 additions & 2 deletions lib/derivative_rodeo/storage_locations/file_location.rb
@@ -35,8 +35,16 @@ def write
file_uri
end

- def globbed_tail_locations(tail_glob:)
- Dir.glob(File.join(file_dir, tail_glob))
+ ##
+ # @return [Enumerable<DerivativeRodeo::StorageLocations::FileLocation>]
+ #
+ # @see Generators::PdfSplitGenerator#image_file_basename_template
+ #
+ # @param tail_regexp [Regexp]
+ def matching_locations_in_file_dir(tail_regexp:)
+ Dir.glob(File.join(file_dir, "*")).each_with_object([]) do |filename, accumulator|
+ accumulator << derived_file_from(template: "file://#{filename}") if tail_regexp.match(filename)
+ end
end
end
end
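
A hedged usage sketch of the new FileLocation behavior (the temp directory,
file names, and regexp are illustrative only, and assume the gem is loaded):

    require "tmpdir"
    require "fileutils"

    Dir.mktmpdir do |dir|
      FileUtils.touch(File.join(dir, "hello-world--page-1.tiff"))
      FileUtils.touch(File.join(dir, "hello-world.pdf"))

      location = DerivativeRodeo::StorageLocations::FileLocation.new("file://#{dir}/hello-world.pdf")

      # Lists everything in the file's directory, then keeps only the regexp matches.
      matches = location.matching_locations_in_file_dir(tail_regexp: %r{hello-world--page-\d+\.tiff$})
      matches.map(&:file_name) # => expected: ["hello-world--page-1.tiff"]
    end
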
26 changes: 7 additions & 19 deletions lib/derivative_rodeo/storage_locations/s3_location.rb
@@ -65,31 +65,19 @@ def exist?
##
# @return [Enumerable<DerivativeRodeo::StorageLocations::S3Location>]
#
- # @note S3 allows searching on a prefix but does not allow for "wildcard" searches. We can
- # use the components of the file_path to fake that behavior.
+ # @note S3 allows searching on a prefix but does not allow for "wildcard" searches.
#
# @see Generators::PdfSplitGenerator#image_file_basename_template
- def globbed_tail_locations(tail_glob:)
- # file_path = "s3://blah/1234/hello-world/pages/*.tiff"
- #
- # NOTE: Should we be storing our files as such? The pattern we need is
- # :parent_identifier/:file_set_identifier/files There are probably cases where a work has
- # more than one PDF (that we intend to split); we don't want to trample on those split files
- # and miscolate two PDFs.
- #
- # file_path = "s3://blah/1234/hello-world/hello-world.pdf
- globname = File.join(file_dir, tail_glob)
- regexp = %r{#{File.extname(globname)}$}
-
- # NOTE: We're making some informed guesses, needing to include the fully qualified template
- # based on both the key of the item in the bucket as well as the bucket's host.
+ #
+ # @param tail_regexp [Regexp]
+ def matching_locations_in_file_dir(tail_regexp:)
uri = URI.parse(file_uri)
scheme_and_host = "#{uri.scheme}://#{uri.host}"

- bucket.objects(prefix: File.dirname(globname)).flat_map do |object|
- if object.key.match(regexp)
+ bucket.objects(prefix: file_dir).each_with_object([]) do |object, accumulator|
+ if tail_regexp.match(object.key)
template = File.join(scheme_and_host, object.key)
- derived_file_from(template: template)
+ accumulator << derived_file_from(template: template)
end
end
end
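
Because S3 can only narrow a listing by key prefix, the regexp filtering has
to happen client-side. A rough illustration with the aws-sdk-s3 gem (the
bucket name, prefix, and regexp are all hypothetical):

    require "aws-sdk-s3"

    bucket = Aws::S3::Resource.new.bucket("blah")
    tail_regexp = %r{hello-world--page-\d+\.tiff$}

    # List everything under the prefix, then keep only the keys the regexp matches.
    matching_keys = bucket.objects(prefix: "files/")
                          .select { |object| tail_regexp.match(object.key) }
                          .map(&:key)
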
@@ -44,7 +44,7 @@
output_location = DerivativeRodeo::StorageLocations::FileLocation.build(from_uri: input_uris.first, template: output_location_template)

# Let's fake a nice TIFF being in a pre-processed location.
- pre_existing_tiff_path = File.join(output_location.file_dir, output_location.file_basename, "pages/1.tiff")
+ pre_existing_tiff_path = File.join(output_location.file_dir, "#{output_location.file_basename}--page-1.tiff")
FileUtils.mkdir_p(File.dirname(pre_existing_tiff_path))
File.open(pre_existing_tiff_path, "w+") do |f|
f.puts "🤠🐮🐴 A muppet man parading as a TIFF."
6 changes: 4 additions & 2 deletions spec/derivative_rodeo/storage_locations/file_location_spec.rb
@@ -69,11 +69,13 @@
end
xit "write cases or mark private the rest of the methods"

- context '#globbed_tail_locations' do
+ context '#matching_locations_in_file_dir' do
let(:args) { "file://#{__FILE__}" }

it "searches for files within the file_dir that match the given glob" do
- expect(instance.globbed_tail_locations(tail_glob: "*.rb")).to include(__FILE__)
+ locations = instance.matching_locations_in_file_dir(tail_regexp: %r{file_location_spec\.rb$})
+ expect(locations.size).to eq(1)
+ expect(locations.map(&:file_name)).to match_array([File.basename(__FILE__)])
end
end
end
12 changes: 6 additions & 6 deletions spec/derivative_rodeo/storage_locations/s3_location_spec.rb
@@ -83,22 +83,22 @@
xit "write additional write cases"
xit "write cases or mark private the rest of the methods"

- describe '#globbed_tail_locations' do
+ describe '#matching_locations_in_file_dir' do
it 'searched the bucket' do
# Because we instantiated the subject as a location to the :file_path (e.g. let(:file_path))
# we are encoding where things are relative to this file. In other words, this logic is
# mirroring the generator logic that says where we're writing derivatives relative to their
# original file/input file.
- bucket_dir = "files/#{File.basename(file_path, '.tiff')}"
+ bucket_dir = "files"

- basename = File.basename(__FILE__)
- key = File.join(bucket_dir, "pages", basename)
+ original_basename_no_ext = File.basename(file_path, '.tiff')
+ key = File.join(bucket_dir, "#{original_basename_no_ext}--page-1.tiff")
subject.bucket.object(key).upload_file(__FILE__)

- non_matching_key = File.join(bucket_dir, "missing", basename)
+ non_matching_key = File.join(bucket_dir, "#{original_basename_no_ext}-skip--page-1.tiff")
subject.bucket.object(non_matching_key).upload_file(__FILE__)

- locations = subject.globbed_tail_locations(tail_glob: "ocr_color/pages/*.rb")
+ locations = subject.matching_locations_in_file_dir(tail_regexp: %r{#{original_basename_no_ext}--page-\d+\.tiff$})

expect(locations.size).to eq(1)
end
