scientist-softserv · jeremyf · May 30, 2023 · May 25, 2023 · May 26, 2023 · May 30, 2023
diff --git a/lib/derivative_rodeo/generators/pdf_split_generator.rb b/lib/derivative_rodeo/generators/pdf_split_generator.rb
@@ -25,12 +25,36 @@ class PdfSplitGenerator < BaseGenerator
       # @note This must include "%d" in the returning value, as that is how Ghostscript will assign
       # the page number.
       #
-      # @note I have extracted this function to make it abundantly clear the expected filename of
-      # each split image.
+      # @note I have extracted this function to make it abundantly clear the expected location
+      # each split image.  Further there is an interaction in this
+      #
+      # @see #existing_page_locations
       def image_file_basename_template(basename:)
-        # TODO: Rather urgently we need to decide if this is a reasonable format?  Do we want to
-        # have subfolders instead?  Will that make it easier to find things.
-        "#{basename}-page%d.#{output_extension}"
+        "#{basename}/pages/#{basename}-%d.#{output_extension}"
+      end
+
+      ##
+      # We want to check the output location and pre-processed location for the existence of already
+      # split pages.  This method checks both places.
+      #
+      # @param input_location [StorageLocations::BaseLocation]
+      #
+      # @return [Enumerable<StorageLocations::BaseLocation>] the files at the given :input_location
+      #         with :tail_glob.
+      #
+      # @note There is relation to {Generators::BaseGenerator#destination} and this method.
+      #
+      # @note The tail_glob is in relation to the {#image_file_basename_template}
+      def existing_page_locations(input_location:)
+        # See image_file_basename_template
+        tail_glob = "#{input_location.file_basename}/pages/*.#{output_extension}"
+
+        output_locations = input_location.derived_file_from(template: output_location_template).globbed_tail_locations(tail_glob: tail_glob)
+        return output_locations if output_locations.count.positive?
+
+        return [] if preprocessed_location_template.blank?
+
+        input_location.derived_file_from(template: preprocessed_location_template).globbed_tail_loations(tail_glob: tail_glob)
       end
 
       ##
@@ -45,21 +69,35 @@ def image_file_basename_template(basename:)
       # @yieldparam image_location [StorageLocations::FileLocation] the file and adapter logic.
       # @yieldparam image_path [String] where to find this file in the tmp space
       #
+      # @note This function makes a concession; namely that if it encounters any
+      # {#existing_page_locations} it will use all of that result as the entire number of pages.
+      # We could make this smarter but at the moment we're deferring on that.
+      #
       # @see BaseGenerator#with_each_requisite_location_and_tmp_file_path for further discussion
+      #
+      # rubocop:disable Metrics/MethodLength
       def with_each_requisite_location_and_tmp_file_path
         input_files.each do |input_location|
           input_location.with_existing_tmp_path do |input_tmp_file_path|
-            Services::PdfSplitter.call(
-              input_tmp_file_path,
-              image_extension: output_extension,
-              image_file_basename_template: image_file_basename_template(basename: input_location.file_basename)
-            ).each do |image_path|
+            ## We want a single call for a directory listing of the image_file_basename_template
+            generated_files = existing_page_locations(input_location: input_location)
+
+            if generated_files.count.zero?
+              generated_files = Services::PdfSplitter.call(
+                input_tmp_file_path,
+                image_extension: output_extension,
+                image_file_basename_template: image_file_basename_template(basename: input_location.file_basename)
+              )
+            end
+
+            generated_files.each do |image_path|
               image_location = StorageLocations::FileLocation.new("file://#{image_path}")
               yield(image_location, image_path)
             end
           end
         end
       end
+      # rubocop:enable Metrics/MethodLength
     end
   end
 end
diff --git a/lib/derivative_rodeo/services/pdf_splitter/base.rb b/lib/derivative_rodeo/services/pdf_splitter/base.rb
@@ -2,8 +2,6 @@
 
 require 'open3'
 require 'securerandom'
-require 'tmpdir'
-
 module DerivativeRodeo
   module Services
     ##
@@ -24,13 +22,12 @@ module PdfSplitter
       #        for an image "split" from the given PDF.  It must include "%d" as part of the
       #        declaration.  For example if the template is "hello-world-%d.png" then the first
       #        split page will be "hello-world-1.png".
-      # @param tmpdir [String] place to perform the "work" of splitting the PDF.
       #
       # @return [Enumerable, Utilities::PdfSplitter::Base, #each] see {Base#each}
-      def self.call(path, image_extension:, image_file_basename_template:, tmpdir: File.dirname(path))
+      def self.call(path, image_extension:, image_file_basename_template:)
         klass_name = "#{image_extension.to_s.classify}_page".classify
         klass = "DerivativeRodeo::Services::PdfSplitter::#{klass_name}".constantize
-        klass.new(path, tmpdir: tmpdir, image_file_basename_template: image_file_basename_template)
+        klass.new(path, image_file_basename_template: image_file_basename_template)
       end
 
       ##
@@ -52,14 +49,15 @@ class Base
 
         def initialize(path,
                        image_file_basename_template:,
-                       # TODO: Do we need to provide the :tmpdir for the application?  Based on
-                       # implementation, no, this can be extracted from the provided path.
-                       tmpdir: Dir.mktmpdir,
                        pdf_pages_summary: PagesSummary.extract_from(path: path))
           @pdfpath = path
           @pdf_pages_summary = pdf_pages_summary
-          @tmpdir = tmpdir
-          @ghost_script_output_file_template = File.join(tmpdir, image_file_basename_template)
+          @ghost_script_output_file_template = File.join(File.dirname(path), image_file_basename_template)
+
+          # We need to ensure that this temporary directory exists so we can write the files to it.
+          # Fortunately, because this file space must be "local" tmp dir, we don't need to work
+          # through any of the location antics of {StorageLocations::BaseLocation}.
+          FileUtils.mkdir_p(File.dirname(@ghost_script_output_file_template))
         end
 
         attr_reader :ghost_script_output_file_template
@@ -82,8 +80,8 @@ def invalid_pdf?
           !pdf_pages_summary.valid?
         end
 
-        attr_reader :pdf_pages_summary, :tmpdir, :basename, :pdfpath
-        private :pdf_pages_summary, :tmpdir, :basename, :pdfpath
+        attr_reader :pdf_pages_summary, :basename, :pdfpath
+        private :pdf_pages_summary, :basename, :pdfpath
 
         # @api private
         def gsdevice

diff --git a/lib/derivative_rodeo/storage_locations/base_location.rb b/lib/derivative_rodeo/storage_locations/base_location.rb
@@ -206,6 +206,22 @@ def derived_file_from(template:)
         klass.build(from_uri: file_path, template: template)
       end
 
+      ##
+      # When you have a known location and want to check for files that are within that location,
+      # use the {#globbed_tail_locations} method.  In the case of {Generators::PdfSplitGenerator} we
+      # need to know the path to all of the image files we "split" off of the given PDF.
+      #
+      # We can use the :file_path as the prefix the given :tail_glob as the suffix for a "fully
+      # qualified" Dir.glob type search.
+      #
+      # @param tail_glob [String]
+      #
+      # @return [Enumerable<StorageLocations::BaseLocation>] the locations of the files; an empty
+      #         array when there are none.
+      def globbed_tail_locations(tail_glob:)
+        raise NotImplementedError, "#{self.class}#globbed_locations"
+      end
+
       ##
       # @param extension [String, StorageLocations::SAME]
       # @return [String] the path for the new extension; when given {StorageLocations::SAME} re-use

diff --git a/lib/derivative_rodeo/storage_locations/file_location.rb b/lib/derivative_rodeo/storage_locations/file_location.rb
@@ -34,6 +34,10 @@ def write
         FileUtils.cp_r(tmp_file_path, file_path)
         file_uri
       end
+
+      def globbed_tail_locations(tail_glob:)
+        Dir.glob(File.join(file_dir, tail_glob))
+      end
     end
   end
 end
diff --git a/lib/derivative_rodeo/storage_locations/s3_location.rb b/lib/derivative_rodeo/storage_locations/s3_location.rb
@@ -61,6 +61,38 @@ def exist?
         bucket.objects(prefix: file_path).count.positive?
       end
 
+      ##
+      # @return [Enumerable<DerivativeRodeo::StorageLocations::S3Location>]
+      #
+      # @note S3 allows searching on a prefix but does not allow for "wildcard" searches.  We can
+      #       use the components of the file_path to fake that behavior.
+      #
+      # @see Generators::PdfSplitGenerator#image_file_basename_template
+      def globbed_tail_locations(tail_glob:)
+        # file_path = "s3://blah/1234/hello-world/pages/*.tiff"
+        #
+        # NOTE: Should we be storing our files as such?  The pattern we need is
+        # :parent_identifier/:file_set_identifier/files There are probably cases where a work has
+        # more than one PDF (that we intend to split); we don't want to trample on those split files
+        # and miscolate two PDFs.
+        #
+        # file_path = "s3://blah/1234/hello-world/hello-world.pdf
+        globname = File.join(file_dir, tail_glob)
+        regexp = %r{#{File.extname(globname)}$}
+
+        # NOTE: We're making some informed guesses, needing to include the fully qualified template
+        # based on both the key of the item in the bucket as well as the bucket's host.
+        uri = URI.parse(file_uri)
+        scheme_and_host = "#{uri.scheme}://#{uri.host}"
+
+        bucket.objects(prefix: File.dirname(globname)).flat_map do |object|
+          if object.key.match(regexp)
+            template = File.join(scheme_and_host, object.key)
+            derived_file_from(template: template)
+          end
+        end
+      end
+
       ##
       # @api public
       # write the tmp file to the file_uri

diff --git a/lib/spec/support/aws_s3_faux_bucket.rb → lib/spec_support/aws_s3_faux_bucket.rb b/lib/spec/support/aws_s3_faux_bucket.rb → lib/spec_support/aws_s3_faux_bucket.rb
@@ -15,20 +15,23 @@ def initialize
   attr_reader :storage
   def object(path)
     # Yup, we've got nested buckets
-    @storage[path] ||= Storage.new
+    @storage[path] ||= Storage.new(key: path)
   end
 
   def objects(prefix:)
-    @storage.select do |path, _file|
-      path.start_with?(prefix)
+    @storage.each_with_object([]) do |(path, obj), accumulator|
+      accumulator << obj if path.start_with?(prefix)
     end
   end
 
   class Storage
-    def initialize
+    # Because we're now coping with the glob tail finder, we need to account for the bucket entry's
+    # key.
+    def initialize(key:)
+      @key = key
       @storage = {}
     end
-    attr_reader :storage
+    attr_reader :storage, :key
 
     def upload_file(path)
       @storage[:upload] = path

diff --git a/spec/derivative_rodeo/generators/pdf_split_generator_spec.rb b/spec/derivative_rodeo/generators/pdf_split_generator_spec.rb
@@ -17,6 +17,33 @@
   end
 
   describe '#generated_files' do
+    context 'when given an already split PDF' do
+      it 'uses the already split components' do
+        Fixtures.with_file_uris_for("minimal-2-page.pdf") do |input_uris|
+          Fixtures.with_temporary_directory do |output_temporary_path|
+            output_location_template = "file://#{output_temporary_path}/{{ dir_parts[-1..-1] }}/{{ filename }}"
+            instance = described_class.new(input_uris: input_uris, output_location_template: output_location_template)
+            output_location = DerivativeRodeo::StorageLocations::FileLocation.build(from_uri: input_uris.first, template: output_location_template)
+
+            # Let's fake a nice TIFF being in a pre-processed location.
+            pre_existing_tiff_path = File.join(output_location.file_dir, output_location.file_basename, "pages/1.tiff")
+            FileUtils.mkdir_p(File.dirname(pre_existing_tiff_path))
+            File.open(pre_existing_tiff_path, "w+") do |f|
+              f.puts "🤠🐮🐴 A muppet man parading as a TIFF."
+            end
+
+            generated_files = instance.generated_files
+            # TODO: The PDF is two pages yet we only check for the presence of one
+            # or more derived files; hence our faked pre-processed derivative is all that we find.
+            expect(generated_files.size).to eq(1)
+
+            # Ensuring that we do in fact have the pre-made file.
+            expect(File.read(generated_files.first.file_path)).to start_with("🤠🐮🐴")
+          end
+        end
+      end
+    end
+
     context 'when given a PDF to split' do
       it 'will create one image per page, writing that to the storage adapter, and then enqueue each page for processing' do
         generated_files = nil

diff --git a/spec/derivative_rodeo/storage_locations/file_location_spec.rb b/spec/derivative_rodeo/storage_locations/file_location_spec.rb
@@ -68,4 +68,12 @@
     end
   end
   xit "write cases or mark private the rest of the methods"
+
+  context '#globbed_tail_locations' do
+    let(:args) { "file://#{__FILE__}" }
+
+    it "searches for files within the file_dir that match the given glob" do
+      expect(instance.globbed_tail_locations(tail_glob: "*.rb")).to include(__FILE__)
+    end
+  end
 end
diff --git a/spec/derivative_rodeo/storage_locations/s3_location_spec.rb b/spec/derivative_rodeo/storage_locations/s3_location_spec.rb
@@ -10,6 +10,7 @@
   before do
     # Let's use a FakeBucket instead!
     subject.use_actual_s3_bucket = false
+
     DerivativeRodeo.config do |config|
       config.aws_s3_bucket = 'fake-bucket'
       config.aws_s3_access_key_id = "FAKEFAKEFAKE"
@@ -63,4 +64,25 @@
 
   xit "write additional write cases"
   xit "write cases or mark private the rest of the methods"
+
+  describe '#globbed_tail_locations' do
+    it 'searched the bucket' do
+      # Because we instantiated the subject as a location to the :file_path (e.g. let(:file_path))
+      # we are encoding where things are relative to this file.  In other words, this logic is
+      # mirroring the generator logic that says where we're writing derivatives relative to their
+      # original file/input file.
+      bucket_dir = "files/#{File.basename(file_path, '.tiff')}"
+
+      basename = File.basename(__FILE__)
+      key = File.join(bucket_dir, "pages", basename)
+      subject.bucket.object(key).upload_file(__FILE__)
+
+      non_matching_key = File.join(bucket_dir, "missing", basename)
+      subject.bucket.object(non_matching_key).upload_file(__FILE__)
+
+      locations = subject.globbed_tail_locations(tail_glob: "ocr_color/pages/*.rb")
+
+      expect(locations.size).to eq(1)
+    end
+  end
 end
diff --git a/spec/spec_helper.rb b/spec/spec_helper.rb
@@ -36,7 +36,7 @@
 # of increasing the boot-up time by auto-requiring all files in the support
 # directory. Alternatively, in the individual `*_spec.rb` files, manually
 # require only the support files necessary.
-Dir.glob(File.expand_path("../lib/spec/support/**/*.rb", __dir__)).each { |f| require f }
+Dir.glob(File.expand_path("../lib/spec_support/**/*.rb", __dir__)).each { |f| require f }
 Dir.glob(File.expand_path('./support/**/*.rb', __dir__)).each { |f| require f }
 
 RSpec.configure do |config|