Skip to content

Commit

Permalink
Add Replication report
Browse files Browse the repository at this point in the history
  • Loading branch information
johnf committed Oct 13, 2024
1 parent 687bb90 commit e3d335f
Show file tree
Hide file tree
Showing 7 changed files with 170 additions and 5 deletions.
7 changes: 7 additions & 0 deletions app/mailers/admin_mailer.rb
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,11 @@ def catalog_s3_sync_report

mail(subject: "[NABU Admin] Catalog S3 Sync Report: #{Date.today.strftime('%F')}")
end

# Emails the weekly catalog replication report.
#
# Expects mailer params :prod_only and :dr_only — lists of files present on
# only one side of the replication (see CatalogReplicationValidatorService).
def catalog_replication_report
  @prod_only, @dr_only = params.values_at(:prod_only, :dr_only)

  mail(subject: "[NABU Admin] Catalog Replication Report: #{Date.today.strftime('%F')}")
end
end
127 changes: 127 additions & 0 deletions app/services/catalog_replication_validator_service.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
require 'csv'
require 'aws-sdk-s3'

# Validates that the DR (Melbourne) catalog bucket is fully replicated from the
# prod (Sydney) catalog bucket by diffing their most recent S3 Inventory
# reports, then emails the differences via AdminMailer.
class CatalogReplicationValidatorService
  # Files modified within this window are excluded from the "prod only" list,
  # since they may simply not have replicated yet.
  ONE_WEEK_IN_SECONDS = 7 * 24 * 60 * 60

  # NOTE(review): neither attribute is ever assigned in this class — these
  # readers always return nil and look like leftovers from a sibling service.
  attr_reader :catalog_dir, :verbose

  def initialize
    # Strange bug in dev docker: stale credentials in the environment break the
    # default AWS credential chain, so drop them explicitly.
    ENV.delete('AWS_SECRET_ACCESS_KEY')
    ENV.delete('AWS_ACCESS_KEY_ID')
    ENV.delete('AWS_SESSION_TOKEN')

    @s3 = Aws::S3::Client.new(region: 'ap-southeast-2')
  end

  # Fetches both inventories, diffs them, and delivers the report email.
  def run
    prod_inventory = fetch_inventory_csv('prod')
    dr_inventory = fetch_inventory_csv('dr')

    prod_files, prod_new = extract_s3_files(prod_inventory)
    dr_files, _dr_new = extract_s3_files(dr_inventory)

    # Recently-modified prod files are excluded: replication lag is expected.
    prod_only = prod_files - dr_files - prod_new
    dr_only = dr_files - prod_files

    AdminMailer.with(prod_only:, dr_only:).catalog_replication_report.deliver_now
  end

  private

  # Parses an S3 Inventory CSV and returns [s3_files, new_files]:
  # - s3_files: current (latest, non-deleted) object keys, minus known
  #   administrative files that are deliberately not replicated
  # - new_files: keys modified within the last week (before the exclusions)
  #
  # Raises RuntimeError if the inventory contains duplicate keys, which would
  # indicate a malformed inventory.
  def extract_s3_files(inventory_csv)
    s3_files = []
    new_files = []

    CSV.parse(inventory_csv, headers: false) do |row|
      # S3 Inventory column order: bucket, key, version_id, is_latest,
      # delete_marker, size, last_modified, etag, ... (trailing columns unused)
      _bucket_name, filename, _version_id, is_latest, delete_marker, _size, last_modified = row

      # Only consider the current, non-deleted version of each object.
      next if is_latest == 'false' || delete_marker == 'true'

      # Inventory keys are URL-encoded.
      file = CGI.unescape(filename)

      s3_files << file

      # If the file was modified in the last week add it to the new list
      new_files << file if Time.parse(last_modified) > Time.now - ONE_WEEK_IN_SECONDS
    end

    # Administrative files that are intentionally excluded from replication.
    # TODO: Remove the df-PDSC_ADMIN exclusion after we migrate all the df files
    s3_files = s3_files.reject do |name|
      name.end_with?('pdsc_admin/ro-crate-metadata.json') ||
        (name.start_with?('pdsc_admin/') && name.end_with?('-deposit.pdf')) ||
        name.end_with?('df-PDSC_ADMIN.pdf')
    end

    raise 'Duplicate files in S3 inventory' if s3_files.size != s3_files.uniq.size

    [s3_files, new_files]
  end

  # Bucket holding the inventory reports for the given env ('prod' or 'dr').
  def meta_bucket(env)
    env == 'prod' ? 'nabu-meta-prod' : 'nabu-metadr-prod'
  end

  # Downloads the most recent inventory for env and returns the decompressed
  # CSV contents as a String.
  def fetch_inventory_csv(env)
    inventory_dir = find_recent_inventory_dir(env)

    manifest_json = @s3.get_object(bucket: meta_bucket(env), key: "#{inventory_dir}manifest.json").body.read
    manifest = JSON.parse(manifest_json)

    files = manifest['files']
    raise 'Multiple files in manifest' if files.size > 1

    file = files.first['key']

    # Download the gzipped S3 Inventory CSV and decompress it in memory.
    inventory_gzipped = @s3.get_object(bucket: meta_bucket(env), key: file).body.read
    Zlib.gunzip(inventory_gzipped)
  end

  # Returns the S3 key prefix of the most recent inventory directory for env.
  # Raises RuntimeError if no inventory directories are found.
  def find_recent_inventory_dir(env)
    inventory_files = fetch_inventory_files(env)

    # Directory names embed a timestamp, e.g.
    # ".../CatalogBucketInventory0/2024-10-13T04-00Z/" — parse it out.
    timestamped_dirs = inventory_files.filter_map do |key|
      match = key.match(%r{(?:Catalog|Dr)BucketInventory0/(\d{4})-(\d{2})-(\d{2})T(\d{2})-(\d{2})Z})
      next unless match

      year, month, day, hour, minute = match.captures
      { key: key, time: Time.new(year, month, day, hour, minute) }
    end

    most_recent_dir = timestamped_dirs.max_by { |dir| dir[:time] }
    raise "No inventory directories found for #{env}" unless most_recent_dir

    most_recent_dir[:key]
  end

  # Lists the inventory date directories (S3 common prefixes) for env,
  # following list_objects_v2 pagination.
  def fetch_inventory_files(env)
    prefix = if env == 'prod'
               'inventories/catalog/nabu-catalog-prod/CatalogBucketInventory0/'
             else
               'inventories/catalogdr/nabu-catalogdr-prod/DrBucketInventory0/'
             end

    inventory_files = []
    next_token = nil

    loop do
      response = @s3.list_objects_v2(
        bucket: meta_bucket(env),
        prefix: prefix,
        delimiter: '/',
        continuation_token: next_token
      )

      # With a '/' delimiter each date directory appears as a common prefix.
      inventory_files += response.common_prefixes.map(&:prefix)

      break unless response.is_truncated

      next_token = response.next_continuation_token
    end

    inventory_files
  end
end
16 changes: 16 additions & 0 deletions app/views/admin_mailer/catalog_replication_report.text.erb
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
This report validates that the Sydney bucket is fully synced with the Melbourne bucket.

## Files in Sydney but not Melbourne

<% @prod_only.each do |file| -%>
= <%= file %>
<% end -%>

## Files in Melbourne but not Sydney

<% @dr_only.each do |file| -%>
= <%= file %>
<% end -%>

NOTE: It is normal to see a handful of files here, as this report is based on inventories of Sydney and Melbourne created by AWS at different times.
However, you shouldn't see this list grow, or see the same files appear week after week.
1 change: 0 additions & 1 deletion cdk/lib/app-stack.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ import * as ecrAssets from 'aws-cdk-lib/aws-ecr-assets';
import * as ecs from 'aws-cdk-lib/aws-ecs';
import * as elbv2 from 'aws-cdk-lib/aws-elasticloadbalancingv2';
import * as events from 'aws-cdk-lib/aws-events';
import * as eventbridge from 'aws-cdk-lib/aws-events';
import * as targets from 'aws-cdk-lib/aws-events-targets';
import * as iam from 'aws-cdk-lib/aws-iam';
import * as opensearch from 'aws-cdk-lib/aws-opensearchservice';
Expand Down
3 changes: 0 additions & 3 deletions cdk/lib/main-stack.ts
Original file line number Diff line number Diff line change
Expand Up @@ -98,9 +98,6 @@ export class MainStack extends cdk.Stack {
encryption: s3.BucketEncryption.S3_MANAGED,
blockPublicAccess: s3.BlockPublicAccess.BLOCK_ALL,
enforceSSL: true,
// TODO: Do we want tiering?
// intelligentTieringConfigurations: [ ],
// TODO: Decide on lifecycle rules
lifecycleRules: [{ abortIncompleteMultipartUploadAfter: cdk.Duration.days(7) }],
versioned: env === 'prod',
inventories: [
Expand Down
15 changes: 14 additions & 1 deletion cron-worker/cron.rb
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
scheduler = Rufus::Scheduler.new

scheduler.cron '27 4 * * 2' do
name = 'DB/S3 Sync'
name = 'Check DB S3 Sync'
task = 'catalog:check_db_s3_sync'

puts "#{Time.current}: Starting task #{name}"
Expand All @@ -26,6 +26,19 @@
end
end

# Weekly (Tuesday 05:27): run the catalog replication check.
scheduler.cron '27 5 * * 2' do
  name = 'Check Replication'
  task = 'catalog:check_replication'

  puts "#{Time.current}: Starting task #{name}"

  begin
    Rake::Task[task].invoke
  ensure
    # Rake tasks only run once per process unless re-enabled; the scheduler
    # process is long-lived, so re-enable for next week's run.
    Rake::Task[task].reenable
  end
end

scheduler.cron '10 1 * * *' do
name = 'Mint Dois'
task = 'catalog:mint_dois'
Expand Down
6 changes: 6 additions & 0 deletions lib/tasks/catalog.rake
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,12 @@ namespace :catalog do
validator.run
end

# Diffs the prod and DR S3 inventories and emails the replication report.
desc 'Validate DR Replication'
task check_replication: :environment do
  CatalogReplicationValidatorService.new.run
end

desc "Mint DOIs for objects that don't have one"
task mint_dois: :environment do
dry_run = ENV['DRY_RUN'] ? true : false
Expand Down

0 comments on commit e3d335f

Please sign in to comment.