Direct upload of large files (#550)
* for direct upload to s3

* for direct upload (and download) to (and from) s3

* add the file_actor_decorator with the s3 upload code in it... 1GB threshold

* we've already subclassed file_set_indexer so leave the decorator alone for a one-line change

* pulled over

* debug import url errors

* turn off puma-hang-on

* more debug for importUrlJob

* remove rescue get error

* remove url_job_override, update Rob's code to fit the Hyrax version we are using

* add some debug to downloads controller to see what's up with filename

* don't use the S3 URL to name the file

* file_set.filename not file_set.file_name :/

* debug

* remove debug, using @file_set.label for file.file_name when doing the external file attaching thing
cziaarm authored Oct 31, 2024
1 parent 5274278 commit bcbcf40
Showing 8 changed files with 95 additions and 21 deletions.
34 changes: 34 additions & 0 deletions app/actors/hyrax/actors/file_actor_decorator.rb
@@ -0,0 +1,34 @@
module Hyrax
  module Actors
    # Actions for a file identified by file_set and relation (maps to use predicate)
    # @note Spawns asynchronous jobs
    module FileActorDecorator
      def ingest_file(io)
        Rails.logger.error("[FileActor] starting write for #{file_set.id}")
        if io.size.to_i >= 1.gigabytes
          Rails.logger.error("[FileActor] Uploading directly to S3 for file_set #{file_set.id}")
          digest = `sha1sum #{io.path}`.split.first
          file_set.s3_only = digest
          s3_object = Aws::S3::Object.new(ENV['AWS_BUCKET'], digest)
          s3_object.upload_file(io.path) unless s3_object.exists?
          Hydra::Works::AddExternalFileToFileSet.call(file_set, s3_object.public_url, relation)
          # how do we make sure the sha gets indexed?
        else
          Rails.logger.error("[FileActor] writing to fcrepo #{file_set.id}")
          # Skip versioning because versions will be minted by VersionCommitter as necessary during save_characterize_and_record_committer.
          Hydra::Works::AddFileToFileSet.call(file_set,
                                              io,
                                              relation,
                                              versioning: false)
        end
        return false unless file_set.save
        repository_file = related_file
        Hyrax::VersioningService.create(repository_file, user)
        pathhint = io.uploaded_file.uploader.path if io.uploaded_file # in case next worker is on same filesystem
        CharacterizeJob.perform_later(file_set, repository_file.id, pathhint || io.path)
      end
    end
  end
end

Hyrax::Actors::FileActor.prepend(Hyrax::Actors::FileActorDecorator)
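
The decorator shells out to `sha1sum` and leans on `upload_file` to move the bytes. A minimal standalone sketch of the same large-file path, assuming the `aws-sdk-s3` gem, an `AWS_BUCKET` environment variable, and ambient AWS credentials (`upload_large_file` is a hypothetical helper, not part of this commit):

    require 'digest'
    require 'aws-sdk-s3'

    # Hypothetical helper mirroring the >= 1 GB branch of ingest_file.
    def upload_large_file(path)
      # Digest::SHA1.file streams the file from disk, so it copes with
      # multi-GB inputs without shelling out to `sha1sum`.
      digest = Digest::SHA1.file(path).hexdigest

      s3_object = Aws::S3::Object.new(ENV['AWS_BUCKET'], digest)
      # upload_file switches to multipart upload automatically once the file
      # crosses the SDK's multipart threshold (100 MB by default).
      s3_object.upload_file(path) unless s3_object.exists?
      digest # stored on the FileSet as s3_only and reused as the S3 key
    end

Keying the object by its SHA-1 makes re-uploading the same bytes a no-op, which is why the `exists?` guard is safe.
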
28 changes: 22 additions & 6 deletions app/controllers/hyrax/downloads_controller.rb
@@ -41,13 +41,29 @@ def item_identifier_for_irus_analytics
   # OVERRIDE Hyrax 2.9.6 allow downloading directly from S3
   def send_file_contents
     if ENV['S3_DOWNLOADS']
-      s3_object = Aws::S3::Object.new(ENV['AWS_BUCKET'], file.digest.first.to_s.gsub('urn:sha1:', ''))
-      redirect_to s3_object.presigned_url(:get, expires_in: 3600, response_content_disposition: "attachment\; filename=#{file.original_name}")
-    else
-      self.status = 200
-      prepare_file_headers
-      stream_body file.stream
+      #s3_object = Aws::S3::Object.new(ENV['AWS_BUCKET'], file.digest.first.to_s.gsub('urn:sha1:', ''))
+      s3_object = if asset.respond_to?(:s3_only) && asset.s3_only
+                    Aws::S3::Object.new(ENV['AWS_BUCKET'], asset.s3_only)
+                  else
+                    Aws::S3::Object.new(ENV['AWS_BUCKET'], file.digest.first.to_s.gsub('urn:sha1:', ''))
+                  end
+      if s3_object.exists?
+        STDERR.puts "##################################"
+        STDERR.puts "Redirecting to S3 using the filename #{file.original_name}"
+        STDERR.puts "File object: #{file}"
+        redirect_to s3_object.presigned_url(
+          :get,
+          expires_in: 3600,
+          response_content_disposition: "attachment\; filename=#{file.original_name}"
+        )
+        return
+      end
     end
+    # from here on this is effectively `super` if this was a decorator
+    # will fall back to streaming object via fedora
+    self.status = 200
+    prepare_file_headers
+    stream_body file.stream
   end

# Override the Hydra::Controller::DownloadBehavior#content_options so that
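
For the redirect itself, `presigned_url` does all the work. A sketch under the same assumptions (the bucket name comes from the environment; the key and filename here are illustrative):

    require 'aws-sdk-s3'

    # Illustrative key; in the controller above it is the FileSet's SHA-1 digest.
    s3_object = Aws::S3::Object.new(ENV['AWS_BUCKET'], '2fd4e1c67a2d28fced849ee1bb76e7391b93eb12')

    # A time-limited GET URL; response_content_disposition overrides the
    # Content-Disposition header S3 sends, forcing a sensible download filename.
    url = s3_object.presigned_url(
      :get,
      expires_in: 3600, # seconds; SigV4 presigned URLs max out at 7 days
      response_content_disposition: 'attachment; filename=thesis.pdf'
    )

Because the URL is signed, the bucket can stay private: the browser fetches from S3 directly and the Rails process never streams the bytes.
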
1 change: 1 addition & 0 deletions app/indexers/file_set_indexer.rb
@@ -3,6 +3,7 @@ def generate_solr_document

   super.tap do |solr_doc|
     solr_doc['hasFormat_ssim'] = object.rendering_ids
+    solr_doc['digest_ssim'] = "urn:sha1:#{object.s3_only}" if object.s3_only.present?
   end

 rescue Ldp::HttpError => exception
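
With `digest_ssim` in the Solr document, an S3-only file can be found by checksum without a Fedora round trip. A hedged sketch using ActiveFedora's standard Solr helper (`digest` is assumed to hold the SHA-1 hex string):

    # Look up the FileSet that owns a given S3 object by its checksum key.
    digest = '2fd4e1c67a2d28fced849ee1bb76e7391b93eb12' # assumed input
    hits = ActiveFedora::SolrService.query(
      "digest_ssim:\"urn:sha1:#{digest}\"",
      rows: 1
    )
    file_set_id = hits.first&.id
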
7 changes: 7 additions & 0 deletions app/models/file_set.rb
@@ -3,5 +3,12 @@
 # Generated by hyrax:models:install
 class FileSet < ActiveFedora::Base
   include Ubiquity::UniversalMetadata
+
+  property :s3_only,
+           predicate: ::RDF::URI("https://hykucommons.org/terms/s3_only"),
+           multiple: false do |index|
+    index.as :stored_searchable, :facetable
+  end
+
   include ::Hyrax::FileSetBehavior
 end
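
Since `s3_only` is declared with `multiple: false`, it behaves as a scalar RDF property on the FileSet. A quick sketch of the intended usage (the digest value is illustrative):

    file_set = FileSet.new
    file_set.s3_only = '2fd4e1c67a2d28fced849ee1bb76e7391b93eb12' # SHA-1 hex digest
    file_set.s3_only          # => the digest string, not an array
    file_set.s3_only.present? # the guard both the actor and the indexer rely on
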
2 changes: 1 addition & 1 deletion bin/web
@@ -3,4 +3,4 @@ if ENV['GOOGLE_OAUTH_PRIVATE_KEY_VALUE'] && !ENV['GOOGLE_OAUTH_PRIVATE_KEY_VALUE
   %x{echo #{ENV['GOOGLE_OAUTH_PRIVATE_KEY_VALUE']} | base64 --decode > prod-cred.p12}
 end

-exec "bundle exec puma -v -b tcp://0.0.0.0:3000"
+exec "bundle exec puma -v -b tcp://0.0.0.0:3000"
2 changes: 1 addition & 1 deletion config/environments/production.rb
@@ -48,7 +48,7 @@

   # Use the lowest log level to ensure availability of diagnostic information
   # when problems arise.
-  config.log_level = :info
+  config.log_level = :debug

   # Prepend all log lines with the following tags.
   config.log_tags = [ :request_id ]
@@ -0,0 +1,17 @@
# OVERRIDE Hydra-works 2.0.0 to deal with fcrepo + S3's inability to upload empty files

module Hydra
  module Works
    module UpdaterDecorator
      def attach_attributes(external_file_url, filename = nil)
        current_file.content = StringIO.new('-') # anything but blank
        # filename will be the url... but we will use file_set.label
        # because making the filename the url of an s3 key is problematic for humans
        current_file.original_name = @file_set.label
        current_file.mime_type = "message/external-body; access-type=URL; URL=\"#{external_file_url}\""
      end
    end
  end
end

Hydra::Works::AddExternalFileToFileSet::Updater.prepend(Hydra::Works::UpdaterDecorator)
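
The `message/external-body` mime type (RFC 1521) carries the real location in its parameter list, so the S3 URL stays recoverable from the stored metadata. A small sketch (the URL is illustrative):

    mime = 'message/external-body; access-type=URL; ' \
           'URL="https://example-bucket.s3.amazonaws.com/2fd4e1c6"'

    # String#[] with a regexp and capture-group index pulls the URL back out.
    external_url = mime[/URL="([^"]+)"/, 1]
    # => "https://example-bucket.s3.amazonaws.com/2fd4e1c6"
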
25 changes: 12 additions & 13 deletions ops/provision/main.tf
@@ -38,31 +38,30 @@ data "local_file" "efs_name" {
   filename = "efs_name"
 }

resource "helm_release" "aws-load-balancer" {
  chart      = "aws-load-balancer-controller"
  name       = "aws-load-balancer-controller"
  namespace  = "kube-system"
  repository = "https://aws.github.io/eks-charts"
  set {
    name  = "clusterName"
    value = "r2-bl"
  }
}

resource "helm_release" "ingress-nginx" {
  name             = "ingress-nginx"
  namespace        = "ingress-nginx"
  create_namespace = true
  version          = "4.5.2"
  repository       = "https://kubernetes.github.io/ingress-nginx"
  chart            = "ingress-nginx"
  depends_on       = [helm_release.aws-load-balancer]
  values = [
    file("k8s/ingress-nginx-values.yaml")
  ]
}

resource "helm_release" "eks_efs_csi_driver" {
  chart            = "aws-efs-csi-driver"
  name             = "efs"
  namespace        = "storage"
  create_namespace = true
  repository       = "https://kubernetes-sigs.github.io/aws-efs-csi-driver/"

  set {
    name  = "image.repository"
    value = "602401143452.dkr.ecr.${var.region}.amazonaws.com/eks/aws-efs-csi-driver"
  }
}

resource "kubernetes_storage_class" "storage_class" {
  storage_provisioner = "efs.csi.aws.com"