Skip to content
This repository has been archived by the owner on Jan 8, 2022. It is now read-only.

[HOLD] Migration third run #543

Draft
wants to merge 24 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
6cb769f
Add a script to export collections
jcoyne Dec 14, 2020
4841e19
Export all the collections
jcoyne Dec 14, 2020
f42d2ea
Add required columns
jcoyne Dec 14, 2020
c1cb30d
Export the version
jcoyne Dec 14, 2020
45dd09c
Warn on invalid APOs
jcoyne Dec 14, 2020
d4485fb
Add related items to the export list
jcoyne Dec 14, 2020
7b6314b
Fix apo identifier method
jcoyne Dec 14, 2020
8d9d3d8
Don't export empty related items
jcoyne Dec 15, 2020
34afe25
Export more collection fields
jcoyne Dec 15, 2020
9084e88
Add a script for exporting items
jcoyne Dec 15, 2020
cde889a
Stream output and handle errors
jcoyne Dec 15, 2020
f5ec36f
Export contributors
jcoyne Dec 15, 2020
c753a21
Export timestamps and files
jcoyne Dec 16, 2020
0e90294
Export keywords
jcoyne Dec 16, 2020
8dad1b7
Export related citations
jcoyne Dec 16, 2020
c8aefde
Allow for filtering which collections and items are exported
mjgiarlo Feb 17, 2021
366dbb1
Fix bug in collection exporter (referencing an item)
mjgiarlo Feb 18, 2021
c47a429
Fix bug with item exporter such that the list is always an AF::Relation
mjgiarlo Feb 18, 2021
6bbba60
Export events for items and collections
mjgiarlo Feb 23, 2021
a3fbe51
Filter out graveyard APO collections in item exporter
mjgiarlo Feb 24, 2021
7d66be1
Filter out items belonging to the graveyard APO when exporting
mjgiarlo Feb 26, 2021
19e58fb
Filter out items and collections from being exported if they have a catkey
mjgiarlo Feb 26, 2021
c89d5a6
Export files "hide" bit when exporting items
mjgiarlo Feb 26, 2021
e1a872b
Add a list of items to exclude
jcoyne Jun 3, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
77 changes: 77 additions & 0 deletions bin/export-collections
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
#!/usr/bin/env ruby
# frozen_string_literal: true

# Usage:
# RAILS_ENV=production bin/export-collections > collections.jsonl

# Optional file, looked up relative to the working directory, holding
# whitespace-separated bare druids of the collections to export.
COLLECTION_DRUIDS_LIST = 'collection_druids.txt'
# Collections governed by these APOs are not serialized (see `serialize`).
GRAVEYARD_APO = 'druid:kg712km1576'
UNIVERSITY_ARCHIVES_APO = 'druid:yf767bj4831'

require_relative '../config/environment'

# Druids (with the "druid:" prefix added) read from the filter file, or an
# empty list when the file does not exist.
collection_druids =
  if File.exist?(COLLECTION_DRUIDS_LIST)
    File.read(COLLECTION_DRUIDS_LIST).split.map { |bare_druid| "druid:#{bare_druid}" }
  else
    []
  end

# Without a filter every Hydrus collection is exported; otherwise only the
# listed ones are loaded.
list =
  if collection_druids.empty?
    Hydrus::Collection.all
  else
    Hydrus::Collection.find(collection_druids)
  end

# Converts the Hydrus events of a collection into plain hashes for export.
#
# Field notes for the importing side:
#   who  - a bare sunetid string, maps to user_id
#   when - a UTC timestamp string, maps to created_at/updated_at
#   text - a string, maps to event_type ('update_metadata') and description
# The mapping also requires an eventable_type ('Collection') and an
# eventable_id (the collection ID).
#
# @param coll [#get_hydrus_events] the collection whose events are exported
# @return [Array<Hash>] one hash per event, with keys :who, :when and :text
def events_for(coll)
  hydrus_events = coll.get_hydrus_events
  hydrus_events.map { |evt| { who: evt.who, when: evt.when, text: evt.text } }
end

# Builds the export attributes for a single collection.
#
# @param coll [Hydrus::Collection] the collection to export
# @return [Hash, nil] the attributes to emit as JSON, or nil when the
#   collection is governed by the Graveyard APO or the University Archives APO
#   and therefore must not be exported
def serialize(coll)
  # Check the governing APO before reading any datastream, so collections that
  # are skipped anyway cost nothing and cannot fail on unrelated metadata.
  apo_id = coll.admin_policy_object_id
  return if apo_id == GRAVEYARD_APO

  if apo_id == UNIVERSITY_ARCHIVES_APO
    warn "Collection #{coll.id} is in the University Archives APO, which is not a Hydrus::APO"
    return
  end

  # The creator is the `who` attribute of the "Collection created" event.
  creator = coll.events.ng_xml.xpath('//event[text()="Collection created"]/@who').to_s
  apo = coll.apo
  # Drop blank titles/urls, then drop related items that end up with neither.
  related_items = coll.related_items
                      .map { |rel| { link_title: rel.title.presence, url: rel.url.presence }.compact }
                      .reject(&:blank?)

  {
    druid: coll.id,
    version: coll.current_version.to_i,
    creator: { sunetid: creator },
    name: coll.title,
    description: coll.abstract,
    contact_email: coll.contact,
    visibility_option: coll.visibility_option,
    visibility: coll.visibility,
    embargo_option: coll.embargo_option,
    embargo_terms: coll.embargo_terms,
    requires_human_approval: coll.requires_human_approval,
    license_option: coll.license_option,
    license: coll.license,
    object_status: coll.object_status,
    managers: apo.persons_with_role('hydrus-collection-manager'),
    # Both depositor roles are merged into a single depositors list.
    depositors: apo.persons_with_role('hydrus-collection-item-depositor') + apo.persons_with_role('hydrus-collection-depositor'),
    reviewers: apo.persons_with_role('hydrus-collection-reviewer'),
    related_items: related_items,
    events: events_for(coll),
    created_at: coll.create_date,
    updated_at: coll.modified_date
  }
end

warn "Exporting #{list.count} collections"

# Write one JSON document per exportable collection to stdout (JSON Lines).
list.each do |collection|
  if collection.catkey.present?
    warn "Skipping #{collection.pid} because it has a catkey"
    next
  end

  attributes = serialize(collection)
  # serialize returns nil for collections that must not be exported (Graveyard
  # or University Archives APO); emitting them would put a bare `null` line
  # into the JSONL output.
  next if attributes.nil?

  puts attributes.to_json
end
151 changes: 151 additions & 0 deletions bin/export-items
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
#!/usr/bin/env ruby
# frozen_string_literal: true

# Usage:
# RAILS_ENV=production bin/export-items > items.jsonl

# Optional file, looked up relative to the working directory, holding
# whitespace-separated bare druids; when present, only items that are members
# of these collections are exported.
COLLECTION_DRUIDS_LIST = 'collection_druids.txt'
# Items governed by this APO are skipped by the export loop.
GRAVEYARD_APO = 'druid:kg712km1576'

# This list from: https://docs.google.com/spreadsheets/d/1Gu0TIUpNByNgNtFDn5MJARUvtsgcNXJAHiKf5s_NsUc/edit#gid=0
# NOTE: we don't need to worry about the "hydrus object with DOI", because these are not returned in the query
# for Hydrus::Items as they have been converted to Dor::Item.
# Frozen so the exclusion list cannot be modified once the script has loaded.
ITEMS_TO_EXCLUDE = %w[
  druid:ty334nd6571
  druid:bx749bs2681
  druid:zc000fq4044
  druid:pr213sh5046
  druid:jr671fk0644
].freeze

require_relative '../config/environment'

# Monkey-patch AF to allow using HTTP POST (for querying items by their collection)
module ActiveFedora
class SolrService
# Runs a Solr query by POSTing to the 'select' path of the Solr connection.
#
# @param query [String] the Solr query, sent as the :q parameter
# @param args [Hash] extra parameters sent in the POST data; a :raw entry is
#   removed from the hash that was passed in and is not sent
# @return the value returned by conn.post when :raw was truthy, otherwise
#   result['response']['docs']
def self.query(query, args={})
# delete mutates the hash the caller passed in
raw = args.delete(:raw)
# merge builds a new hash, so :q and :qt are not added to the caller's hash
args = args.merge(:q=>query, :qt=>'standard')
result = SolrService.instance.conn.post('select', :data=>args)
return result if raw
result['response']['docs']
end
end

module FinderMethods
# Fetches the documents matching +conditions+ page by page over HTTP POST and
# yields the docs of each page to the given block.
#
# @param conditions passed to create_query to build the :q value
# @param opts [Hash] request parameters; this hash is modified in place
#   (:qt is always set, :sort is filled in when absent, :batch_size is removed).
#   :batch_size is the number of docs per page and defaults to 1000.
# @yield [docs] the value of response["response"]["docs"] for one page
def find_in_batches conditions, opts={}
# the query travels in the POST data rather than in the request params
data = { :q => create_query(conditions) }
opts[:qt] = @klass.solr_query_handler
#set default sort to created date ascending
unless opts[:sort].present?
opts[:sort]= @klass.default_sort_params
end

batch_size = opts.delete(:batch_size) || 1000

# counter is passed as the first argument to paginate; presumably the 1-based
# page number -- confirm against the RSolr version in use
counter = 0
# do-while: the body runs at least once and repeats while the last page
# reports that another page follows
begin
counter += 1
response = ActiveFedora::SolrService.instance.conn.paginate counter, batch_size, "select", { :method => :post, :params => opts, :data => data }
docs = response["response"]["docs"]
yield docs
# NOTE(review): has_next? is presumably added to docs by RSolr's pagination support -- confirm
end while docs.has_next?
end
end
end

# Druids (with the "druid:" prefix added) read from the optional filter file;
# stays empty when the file does not exist.
collection_druids = []
if File.exist?(COLLECTION_DRUIDS_LIST)
  collection_druids = File.read(COLLECTION_DRUIDS_LIST).split.map { |bare_druid| "druid:#{bare_druid}" }
end

# Without a filter every Hydrus item is exported; otherwise only the items
# that are members of one of the listed collections.
list =
  if collection_druids.empty?
    Hydrus::Item.all
  else
    membership_clauses = collection_druids.map do |druid|
      [:is_member_of_collection, "info:fedora/#{druid}"]
    end
    Hydrus::Item.where(
      ActiveFedora::SolrService.construct_query_for_rel(membership_clauses, ' OR ')
    )
  end

# Lists the contributors of an item, leaving out entries without a name.
#
# @param item [#contributors] the item whose contributors are exported
# @return [Array<Hash>] hashes with keys :full_name, :role and :name_type
def contributors(item)
  item.contributors.each_with_object([]) do |contrib, exported|
    entry = { full_name: contrib.name, role: contrib.role, name_type: contrib.name_type }
    exported << entry unless entry[:full_name].blank?
  end
end

# Describes the files attached to an item.
#
# @param item [#files] the item whose files are exported
# @return [Array<Hash>] hashes with keys :path (the file's current path run
#   through File.realdirpath), :label and :hide
def files(item)
  item.files.map do |object_file|
    resolved_path = File.realdirpath(object_file.current_path)
    { path: resolved_path, label: object_file.label, hide: object_file.hide }
  end
end

# Converts the Hydrus events of an item into plain hashes for export.
#
# Field notes for the importing side:
#   who  - a bare sunetid string, maps to user_id
#   when - a UTC timestamp string, maps to created_at/updated_at
#   text - a string, maps to event_type ('update_metadata') and description
# The mapping also requires an eventable_type ('Work') and an eventable_id
# (the work ID).
#
# @param item [#get_hydrus_events] the item whose events are exported
# @return [Array<Hash>] one hash per event, with keys :who, :when and :text
def events_for(item)
  item.get_hydrus_events.each_with_object([]) do |event, exported|
    exported << { who: event.who, when: event.when, text: event.text }
  end
end

# Builds the export attributes for a single item.
#
# @param item [Hydrus::Item] the item to export
# @return [Hash] the attributes to emit as JSON; values may be nil
def serialize(item)
  # The creator is the text of the person identifier(s) in the role metadata.
  identifier_nodes = item.roleMetadata.ng_xml.xpath('//role/person/identifier')
  creator_sunetid = identifier_nodes.text

  # Drop blank titles/urls, then drop related items that end up with neither.
  links = item.related_items.map do |rel|
    { link_title: rel.title.presence, url: rel.url.presence }.compact
  end
  links = links.reject(&:blank?)

  {
    druid: item.id,
    version: item.current_version.to_i,
    creator: { sunetid: creator_sunetid },
    title: item.title,
    abstract: item.abstract,
    contact_email: item.contact,
    collection: item.collection_id,
    visibility: item.visibility.first,
    license: item.license,
    embargo_release_date: item.rmd_embargo_release_date,
    date_created: item.date_created,
    object_status: item.object_status,
    item_type: item.item_type,
    citation: item.preferred_citation,
    related_items: links,
    related_citations: item.related_citation,
    contributors: contributors(item),
    keywords: item.keywords,
    files: files(item),
    events: events_for(item),
    created_at: item.create_date,
    updated_at: item.modified_date
  }
end

warn "Exporting #{list.count} items"

# Running tally of items visited (exported or not), echoed to stderr as a
# progress indicator.
processed = 0
list.each do |item|
  processed += 1
  warn processed
  begin
    # Exactly one branch runs per item: either a skip notice on stderr or the
    # JSON line on stdout.
    if item.apo_pid == GRAVEYARD_APO
      warn "Skipping #{item.pid} because it belongs to the Graveyard APO"
    elsif item.catkey.present?
      warn "Skipping #{item.pid} because it has a catkey"
    elsif ITEMS_TO_EXCLUDE.include?(item.pid)
      warn "Skipping #{item.pid} because it is on the list of items to exclude"
    else
      # Attributes whose value is nil are left out of the JSON document.
      puts serialize(item).compact.to_json
    end
  rescue => e
    # A failing item is reported and the export carries on with the next one.
    warn "Error with #{item.pid}. #{e.message}"
  end
end