junk script to create mxf csv for tagging
johnf committed Oct 13, 2024
1 parent f511a1b commit cb88fc2
Showing 1 changed file with 97 additions and 134 deletions.
231 changes: 97 additions & 134 deletions app/services/junk_service.rb
@@ -17,138 +17,101 @@ def initialize(env)
@s3 = Aws::S3::Client.new(region: 'ap-southeast-2')
end

# def run
# inventory_dir = find_recent_inventory_dir
# inventory_csv = fetch_inventory_csv(inventory_dir)
#
# s3_files = extract_s3_files(inventory_csv)
#
# s3_files.select! { |filename| filename.end_with?('PDSC_ADMIN.pdf') }
#
# collections = {}
# s3_files.each do |filename|
# collection = filename.split('/')[0]
# collections[collection] ||= []
# collections[collection] << filename
# end
#
# catalog_bucket = 'nabu-catalog-prod'
#
# collections.each do |collection, files|
# if files.size != 1
# puts "Collection: #{collection}"
# files.each do |file|
# puts " #{file}"
# end
# puts
# end
#
# # TODO MOVE
# src = files[0]
# dst = "#{collection}/pdsc_admin/#{collection}-deposit.pdf"
# print "Moving #{src} to #{dst} - "
#
# begin
# @s3.head_object(bucket: catalog_bucket, key: dst)
# puts 'ERROR: dst exists, skipping'
# next
# rescue Aws::S3::Errors::NotFound
# We don't want it to exist
# end
#
# begin
# @s3.copy_object(bucket: catalog_bucket, copy_source: "#{catalog_bucket}/#{src}", key: dst)
# @s3.delete_object(bucket: catalog_bucket, key: src)
# rescue Aws::S3::Errors::NoSuchKey => e
# puts 'Something went wrong moving'
# puts e
# exit 1
# end
#
# puts 'OK'
# end
# end
#
# private
#
# def extract_s3_files(inventory_csv)
# s3_files = []
#
# CSV.parse(inventory_csv, headers: false) do |row|
# _bucket_name, filename, _version_id, is_latest, delete_marker, _size, _last_modified, _etag,
# storage_class, multiple_upload, multipart_upload_flag, replication_status, checksum_algo = row
#
# next if is_latest == 'false' || delete_marker == 'true'
#
# s3_files << CGI.unescape(filename)
# end
#
# if s3_files.size != s3_files.uniq.size
# raise 'Duplicate files in S3 inventory'
# end
#
# s3_files
# end
#
# def fetch_inventory_csv(inventory_dir)
# manifest_json = @s3.get_object(bucket: @bucket, key: "#{inventory_dir}manifest.json").body.read
# manifest = JSON.parse(manifest_json)
#
# files = manifest['files']
# if files.size > 1
# raise 'Multiple files in manifest'
# end
#
# file = files.first['key']
#
# # Download the S3 Inventory CSV file
# puts "Downloading S3 Inventory CSV file: #{file}"
# inventory_gzipped = @s3.get_object(bucket: @bucket, key: file).body.read
# puts "Unzipping file: #{file}\n\n"
# inventory_csv = Zlib::GzipReader.new(StringIO.new(inventory_gzipped)).read
# end
#
# def find_recent_inventory_dir
# inventory_files = fetch_inventory_files
#
# # Extract the timestamp part from each key and convert it to Time object
# timestamped_files = inventory_files.map do |key|
# match = key.match(/CatalogBucketInventory0\/(\d{4})-(\d{2})-(\d{2})T(\d{2})-(\d{2})Z/)
# if match
# year, month, day, hour, minute = match.captures
# time = Time.new(year, month, day, hour, minute)
# { key: key, time: time }
# end
# end.compact
#
# # Find the most recent file
# most_recent_dir = timestamped_files.max_by { |file| file[:time] }
#
# puts "Most recent inventory file: #{most_recent_dir[:key]}"
#
# most_recent_dir[:key]
# end
#
# def fetch_inventory_files
# inventory_files = []
# next_token = nil
#
# loop do
# response = @s3.list_objects_v2(
# bucket: @bucket,
# prefix: @prefix,
# delimiter: '/',
# continuation_token: next_token
# )
#
# # Collect all object keys
# inventory_files += response.common_prefixes.map(&:prefix)
#
# break unless response.is_truncated
#
# next_token = response.next_continuation_token
# end
#
# inventory_files
# end
def run
inventory_dir = find_recent_inventory_dir
inventory_csv = fetch_inventory_csv(inventory_dir)

s3_files = extract_s3_files(inventory_csv)

# Keep only files with the .mxf extension
s3_files.select! { |filename| filename.end_with?('.mxf') }

puts "Found #{s3_files.size} MXF files"

# Emit one "bucket,key" CSV row per file for the tagging step
s3_files.each do |filename|
puts "nabu-catalog-prod,#{filename}"
end
end
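
# A sketch of what this prints, assuming typical catalog keys (the
# collection/item paths below are illustrative, not taken from this diff):
#
#   Found 2 MXF files
#   nabu-catalog-prod,COLL1/ITEM1/COLL1-ITEM1-video.mxf
#   nabu-catalog-prod,COLL2/ITEM5/COLL2-ITEM5-video.mxf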

private

def extract_s3_files(inventory_csv)
s3_files = []

CSV.parse(inventory_csv, headers: false) do |row|
_bucket_name, filename, _version_id, is_latest, delete_marker, _size, _last_modified, _etag,
_storage_class, _multiple_upload, _multipart_upload_flag, _replication_status, _checksum_algo = row

next if is_latest == 'false' || delete_marker == 'true'

s3_files << CGI.unescape(filename)
end

if s3_files.size != s3_files.uniq.size
raise 'Duplicate files in S3 inventory'
end

s3_files
end
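
# For reference, a rough sketch of the inventory rows parsed above, assuming
# the standard S3 Inventory CSV column order (values are illustrative):
#
#   "nabu-catalog-prod","COLL1%2FITEM1%2Ffile.mxf","","true","false","1024",...
#
# Keys arrive URL-encoded, which is why extract_s3_files calls CGI.unescape.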

def fetch_inventory_csv(inventory_dir)
manifest_json = @s3.get_object(bucket: @bucket, key: "#{inventory_dir}manifest.json").body.read
manifest = JSON.parse(manifest_json)

files = manifest['files']
if files.size > 1
raise 'Multiple files in manifest'
end

file = files.first['key']

# Download the S3 Inventory CSV file
puts "Downloading S3 Inventory CSV file: #{file}"
inventory_gzipped = @s3.get_object(bucket: @bucket, key: file).body.read
puts "Unzipping file: #{file}\n\n"
Zlib::GzipReader.new(StringIO.new(inventory_gzipped)).read
end
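
# The manifest.json read above looks roughly like this (a trimmed sketch of
# the AWS S3 Inventory manifest; fields this script doesn't use are omitted):
#
#   {
#     "files": [
#       { "key": "nabu-catalog-prod/CatalogBucketInventory0/data/....csv.gz",
#         "size": 12345, "MD5checksum": "..." }
#     ]
#   }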

def find_recent_inventory_dir
inventory_files = fetch_inventory_files

# Extract the timestamp from each key and convert it to a Time object
timestamped_files = inventory_files.map do |key|
match = key.match(/CatalogBucketInventory0\/(\d{4})-(\d{2})-(\d{2})T(\d{2})-(\d{2})Z/)
if match
year, month, day, hour, minute = match.captures
time = Time.new(year, month, day, hour, minute)
{ key: key, time: time }
end
end.compact

# Find the most recent inventory directory
most_recent_dir = timestamped_files.max_by { |file| file[:time] }
raise 'No inventory directories found' if most_recent_dir.nil?

puts "Most recent inventory dir: #{most_recent_dir[:key]}"

most_recent_dir[:key]
end
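
# Example of a prefix the timestamp regex above matches (illustrative date):
#
#   nabu-catalog-prod/CatalogBucketInventory0/2024-10-13T01-00Z/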

def fetch_inventory_files
inventory_files = []
next_token = nil

loop do
response = @s3.list_objects_v2(
bucket: @bucket,
prefix: @prefix,
delimiter: '/',
continuation_token: next_token
)

# Collect the dated common prefixes (one directory per inventory run)
inventory_files += response.common_prefixes.map(&:prefix)

break unless response.is_truncated

next_token = response.next_continuation_token
end

inventory_files
end
end
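
# Usage sketch (hypothetical; the invocation isn't shown in this diff).
# Assuming the class is named JunkService, that initialize takes the
# environment, and that this runs inside the Rails app, the CSV can be
# captured with:
#
#   bundle exec rails runner 'JunkService.new("prod").run' > mxf_files.csv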
