diff --git a/bin/export-collections b/bin/export-collections
new file mode 100755
index 000000000..87e7bd83a
--- /dev/null
+++ b/bin/export-collections
@@ -0,0 +1,105 @@
+#!/usr/bin/env ruby
+# frozen_string_literal: true
+
+# Exports Hydrus collections as JSON Lines on stdout (one JSON object per
+# collection); the progress message and skip notices go to stderr via warn.
+#
+# If a file named collection_druids.txt exists in the working directory, only
+# the collections whose bare druids it lists (whitespace-separated) are
+# exported; otherwise every Hydrus::Collection is exported.
+#
+# Usage:
+#   RAILS_ENV=production bin/export-collections > collections.jsonl
+
+COLLECTION_DRUIDS_LIST = 'collection_druids.txt'
+GRAVEYARD_APO = 'druid:kg712km1576'
+UNIVERSITY_ARCHIVES_APO = 'druid:yf767bj4831'
+
+require_relative '../config/environment'
+
+collection_druids = if File.exist?(COLLECTION_DRUIDS_LIST)
+                      File.read(COLLECTION_DRUIDS_LIST).split.map { |bare_druid| "druid:#{bare_druid}" }
+                    else
+                      []
+                    end
+
+list = if collection_druids.any?
+         Hydrus::Collection.find(collection_druids)
+       else
+         Hydrus::Collection.all
+       end
+
+# Collects the Hydrus events recorded on a collection.
+#
+# @param coll [Hydrus::Collection]
+# @return [Array<Hash>] one hash per event with :who, :when and :text keys
+def events_for(coll)
+  coll.get_hydrus_events.map do |event|
+    {
+      who: event.who, # is a bare sunetid string, maps to user_id
+      when: event.when, # is a UTC timestamp string, maps to created_at/updated_at
+      text: event.text # is a string, maps to event_type ('update_metadata') and description
+      # mapping also requires an eventable_type ('Collection') and an eventable_id (the collection ID)
+    }
+  end
+end
+
+# Builds the export record for one collection.
+#
+# @param coll [Hydrus::Collection]
+# @return [Hash, nil] the attributes to export, or nil when the collection is
+#   under the Graveyard APO or the University Archives APO and is left out
+def serialize(coll)
+  return if coll.admin_policy_object_id == GRAVEYARD_APO
+
+  if coll.admin_policy_object_id == UNIVERSITY_ARCHIVES_APO
+    warn "Collection #{coll.id} is in the University Archives APO, which is not a Hydrus::APO"
+    return
+  end
+
+  # The creator is the "who" recorded on the "Collection created" event.
+  creator = coll.events.ng_xml.xpath('//event[text()="Collection created"]/@who').to_s
+  apo = coll.apo
+  # Drop blank titles/urls, then drop related items left with neither.
+  related_items = coll.related_items.map { |rel| { link_title: rel.title.presence, url: rel.url.presence }.compact }.reject(&:blank?)
+
+  {
+    druid: coll.id,
+    version: coll.current_version.to_i,
+    creator: { sunetid: creator },
+    name: coll.title,
+    description: coll.abstract,
+    contact_email: coll.contact,
+    visibility_option: coll.visibility_option,
+    visibility: coll.visibility,
+    embargo_option: coll.embargo_option,
+    embargo_terms: coll.embargo_terms,
+    requires_human_approval: coll.requires_human_approval,
+    license_option: coll.license_option,
+    license: coll.license,
+    object_status: coll.object_status,
+    managers: apo.persons_with_role('hydrus-collection-manager'),
+    depositors: apo.persons_with_role('hydrus-collection-item-depositor') + apo.persons_with_role('hydrus-collection-depositor'),
+    reviewers: apo.persons_with_role('hydrus-collection-reviewer'),
+    related_items: related_items,
+    events: events_for(coll),
+    created_at: coll.create_date,
+    updated_at: coll.modified_date
+  }
+end
+
+warn "Exporting #{list.count} collections"
+
+list.each do |collection|
+  if collection.catkey.present?
+    warn "Skipping #{collection.pid} because it has a catkey"
+    next
+  end
+
+  record = serialize(collection)
+  # serialize returns nil for collections that must not be exported; without
+  # this guard a literal "null" line would be written to the JSONL output.
+  next if record.nil?
+
+  puts record.to_json
+end
diff --git a/bin/export-items b/bin/export-items
new file mode 100755
index 000000000..909247f46
--- /dev/null
+++ b/bin/export-items
@@ -0,0 +1,194 @@
+#!/usr/bin/env ruby
+# frozen_string_literal: true
+
+# Exports Hydrus items as JSON Lines on stdout (one JSON object per item);
+# progress, skip notices and per-item errors go to stderr via warn.
+#
+# If a file named collection_druids.txt exists in the working directory, only
+# items that are members of the collections it lists are exported; otherwise
+# every Hydrus::Item is exported.
+#
+# Usage:
+#   RAILS_ENV=production bin/export-items > items.jsonl
+
+COLLECTION_DRUIDS_LIST = 'collection_druids.txt'
+GRAVEYARD_APO = 'druid:kg712km1576'
+
+# This list from: https://docs.google.com/spreadsheets/d/1Gu0TIUpNByNgNtFDn5MJARUvtsgcNXJAHiKf5s_NsUc/edit#gid=0
+# NOTE: we don't need to worry about the "hydrus object with DOI", because these are not returned in the query
+#       for Hydrus::Items as they have been converted to Dor::Item.
+ITEMS_TO_EXCLUDE = %w[
+  druid:ty334nd6571
+  druid:bx749bs2681
+  druid:zc000fq4044
+  druid:pr213sh5046
+  druid:jr671fk0644
+].freeze
+
+require_relative '../config/environment'
+
+# Monkey-patch AF to allow using HTTP POST (for querying items by their collection)
+module ActiveFedora
+  class SolrService
+    # Sends the query to Solr's select handler with HTTP POST.
+    #
+    # @param query [String] the Solr query, sent as :q
+    # @param args [Hash] additional Solr parameters; :raw is removed from this
+    #   hash and, when truthy, makes the method return the whole response
+    # @return the whole response when :raw is truthy, otherwise the value at
+    #   result['response']['docs']
+    def self.query(query, args={})
+      raw = args.delete(:raw)
+      args = args.merge(:q=>query, :qt=>'standard')
+      result = SolrService.instance.conn.post('select', :data=>args)
+      return result if raw
+      result['response']['docs']
+    end
+  end
+
+  module FinderMethods
+    # Pages through the Solr results for +conditions+ using HTTP POST and
+    # yields the docs of each page to the block.
+    #
+    # @param conditions passed to create_query to build the :q value
+    # @param opts [Hash] Solr parameters; :batch_size (default 1000) is
+    #   removed and used as the page size, :sort falls back to
+    #   @klass.default_sort_params, and :qt is overwritten
+    # @yield [docs] the value at response["response"]["docs"] for each page
+    def find_in_batches conditions, opts={}
+      data = { :q => create_query(conditions) }
+      opts[:qt] = @klass.solr_query_handler
+      #set default sort to created date ascending
+      unless opts[:sort].present?
+        opts[:sort]= @klass.default_sort_params
+      end
+
+      batch_size = opts.delete(:batch_size) || 1000
+
+      counter = 0
+      begin
+        counter += 1
+        response = ActiveFedora::SolrService.instance.conn.paginate counter, batch_size, "select", { :method => :post, :params => opts, :data => data }
+        docs = response["response"]["docs"]
+        yield docs
+      end while docs.has_next?
+    end
+  end
+end
+
+collection_druids = if File.exist?(COLLECTION_DRUIDS_LIST)
+                      File.read(COLLECTION_DRUIDS_LIST).split.map { |bare_druid| "druid:#{bare_druid}" }
+                    else
+                      []
+                    end
+
+list = if collection_druids.any?
+         where_collection_in_list_query = ActiveFedora::SolrService.construct_query_for_rel(
+           collection_druids.map { |druid| [:is_member_of_collection, "info:fedora/#{druid}"] },
+           ' OR '
+         )
+         Hydrus::Item.where(where_collection_in_list_query)
+       else
+         Hydrus::Item.all
+       end
+
+# Lists an item's contributors, leaving out those with a blank name.
+#
+# @param item [Hydrus::Item]
+# @return [Array<Hash>] hashes with :full_name, :role and :name_type keys
+def contributors(item)
+  item.contributors
+      .map { |contrib| { full_name: contrib.name, role: contrib.role, name_type: contrib.name_type } }
+      .reject { |contrib| contrib[:full_name].blank? }
+end
+
+# Describes the files attached to an item.
+#
+# @param item [Hydrus::Item]
+# @return [Array<Hash>] hashes with :path (resolved with File.realdirpath),
+#   :label and :hide keys
+def files(item)
+  item.files.map do |object_file|
+    {
+      path: File.realdirpath(object_file.current_path),
+      label: object_file.label,
+      hide: object_file.hide
+    }
+  end
+end
+
+# Collects the Hydrus events recorded on an item.
+#
+# @param item [Hydrus::Item]
+# @return [Array<Hash>] one hash per event with :who, :when and :text keys
+def events_for(item)
+  item.get_hydrus_events.map do |event|
+    {
+      who: event.who, # is a bare sunetid string, maps to user_id
+      when: event.when, # is a UTC timestamp string, maps to created_at/updated_at
+      text: event.text # is a string, maps to event_type ('update_metadata') and description
+      # mapping also requires an eventable_type ('Work') and an eventable_id (the work ID)
+    }
+  end
+end
+
+# Builds the export record for one item.
+#
+# @param item [Hydrus::Item]
+# @return [Hash] the attributes to export; nil values are stripped by the
+#   caller before the record is written
+def serialize(item)
+  # Text of every //role/person/identifier node in the item's roleMetadata.
+  creator = item.roleMetadata.ng_xml.xpath('//role/person/identifier').text
+  # Drop blank titles/urls, then drop related items left with neither.
+  related_items = item.related_items.map { |rel| { link_title: rel.title.presence, url: rel.url.presence }.compact }.reject(&:blank?)
+
+  {
+    druid: item.id,
+    version: item.current_version.to_i,
+    creator: { sunetid: creator },
+    title: item.title,
+    abstract: item.abstract,
+    contact_email: item.contact,
+    collection: item.collection_id,
+    visibility: item.visibility.first,
+    license: item.license,
+    embargo_release_date: item.rmd_embargo_release_date,
+    date_created: item.date_created,
+    object_status: item.object_status,
+    item_type: item.item_type,
+    citation: item.preferred_citation,
+    related_items: related_items,
+    related_citations: item.related_citation,
+    contributors: contributors(item),
+    keywords: item.keywords,
+    files: files(item),
+    events: events_for(item),
+    created_at: item.create_date,
+    updated_at: item.modified_date
+  }
+end
+
+warn "Exporting #{list.count} items"
+count = 0
+list.each do |item|
+  count += 1
+  warn count # progress counter on stderr
+  begin
+    if item.apo_pid == GRAVEYARD_APO
+      warn "Skipping #{item.pid} because it belongs to the Graveyard APO"
+      next
+    elsif item.catkey.present?
+      warn "Skipping #{item.pid} because it has a catkey"
+      next
+    elsif ITEMS_TO_EXCLUDE.include?(item.pid)
+      warn "Skipping #{item.pid} because it is on the list of items to exclude"
+      next
+    end
+    attributes = serialize(item)
+    puts attributes.compact.to_json
+  rescue StandardError => e
+    # Best effort: report the failure and carry on with the next item.
+    warn "Error with #{item.pid}. #{e.message}"
+  end
+end