Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ENG-6185][ENG-6186][ENG-6187][ENG-6188] additional searchable metadata for institutional dashboard #10779

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 46 additions & 18 deletions api/caching/tasks.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
import logging
from urllib.parse import urlparse

from django.apps import apps
from django.contrib.contenttypes.models import ContentType
from django.db import connection
from django.db.models import Sum

import requests
import logging

from django.apps import apps
from api.caching.utils import storage_usage_cache
from framework.postcommit_tasks.handlers import enqueue_postcommit_task

Expand All @@ -16,6 +17,9 @@
logger = logging.getLogger(__name__)


_DEFAULT_FILEVERSION_PAGE_SIZE = 500000


def get_varnish_servers():
    """Return the list of varnish servers to target for cache purge/ban requests."""
    # TODO: this should get the varnish servers from HAProxy or a setting
    return settings.VARNISH_SERVERS
Expand Down Expand Up @@ -111,35 +115,59 @@ def ban_url(instance):


@app.task(max_retries=5, default_retry_delay=10)
def update_storage_usage_cache(target_id, target_guid, per_page=500000):
def update_storage_usage_cache(target_id, target_guid, per_page=_DEFAULT_FILEVERSION_PAGE_SIZE):
if not settings.ENABLE_STORAGE_USAGE_CACHE:
return
from osf.models import Guid
storage_usage_total = compute_storage_usage_total(Guid.load(target_guid).referent, per_page=per_page)
key = cache_settings.STORAGE_USAGE_KEY.format(target_id=target_guid)
storage_usage_cache.set(key, storage_usage_total, settings.STORAGE_USAGE_CACHE_TIMEOUT)


def compute_storage_usage_total(target_obj, per_page=_DEFAULT_FILEVERSION_PAGE_SIZE):
sql = """
SELECT count(size), sum(size) from
(SELECT size FROM osf_basefileversionsthrough AS obfnv
LEFT JOIN osf_basefilenode file ON obfnv.basefilenode_id = file.id
LEFT JOIN osf_fileversion version ON obfnv.fileversion_id = version.id
LEFT JOIN django_content_type type on file.target_content_type_id = type.id
WHERE file.provider = 'osfstorage'
AND type.model = 'abstractnode'
AND file.deleted_on IS NULL
AND file.target_object_id=%s
AND file.target_object_id=%(target_pk)s
AND file.target_content_type_id=%(target_content_type_pk)s
ORDER BY version.id
LIMIT %s OFFSET %s) file_page
LIMIT %(per_page)s OFFSET %(offset)s
) file_page
"""
count = per_page
last_count = 1 # initialize non-zero
offset = 0
storage_usage_total = 0
content_type_pk = ContentType.objects.get_for_model(target_obj).pk
with connection.cursor() as cursor:
while count:
cursor.execute(sql, [target_id, per_page, offset])
result = cursor.fetchall()
storage_usage_total += int(result[0][1]) if result[0][1] else 0
count = int(result[0][0]) if result[0][0] else 0
offset += count

key = cache_settings.STORAGE_USAGE_KEY.format(target_id=target_guid)
storage_usage_cache.set(key, storage_usage_total, settings.STORAGE_USAGE_CACHE_TIMEOUT)
while last_count:
cursor.execute(
sql, {
'target_pk': target_obj.pk,
'target_content_type_pk': content_type_pk,
'per_page': per_page,
'offset': offset,
},
)
this_count, size_sum = cursor.fetchall()[0]
storage_usage_total += int(size_sum or 0)
last_count = (this_count or 0)
offset += last_count
return storage_usage_total


def get_storage_usage_total(target_obj):
    """Return the total osfstorage usage (in bytes) for ``target_obj``.

    When the storage-usage cache is enabled, serve from the cache and
    recompute (then re-cache) on a miss; otherwise always compute fresh.
    """
    if not settings.ENABLE_STORAGE_USAGE_CACHE:
        return compute_storage_usage_total(target_obj)
    cache_key = cache_settings.STORAGE_USAGE_KEY.format(target_id=target_obj._id)
    cached_total = storage_usage_cache.get(cache_key)
    if cached_total is not None:
        return cached_total
    fresh_total = compute_storage_usage_total(target_obj)
    storage_usage_cache.set(cache_key, fresh_total, settings.STORAGE_USAGE_CACHE_TIMEOUT)
    return fresh_total


def update_storage_usage(target):
Expand Down
93 changes: 81 additions & 12 deletions osf/metadata/osf_gathering.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from django import db
import rdflib

from api.caching.tasks import get_storage_usage_total
from osf import models as osfdb
from osf.metadata import gather
from osf.metadata.rdfutils import (
Expand All @@ -21,6 +22,7 @@
OSF,
OSFIO,
OWL,
PROV,
RDF,
ROR,
SKOS,
Expand All @@ -31,7 +33,10 @@
)
from osf.metrics.reports import PublicItemUsageReport
from osf.metrics.utils import YearMonth
from osf.utils import workflows as osfworkflows
from osf.utils import (
workflows as osfworkflows,
permissions as osfpermissions,
)
from osf.utils.outcomes import ArtifactTypes
from website import settings as website_settings

Expand Down Expand Up @@ -85,6 +90,7 @@ def pls_get_magic_metadata_basket(osf_item) -> gather.Basket:
OSF.isContainedBy: OSF_OBJECT_REFERENCE,
OSF.fileName: None,
OSF.filePath: None,
OSF.hasFileVersion: None,
}

OSF_OBJECT = {
Expand Down Expand Up @@ -128,16 +134,7 @@ def pls_get_magic_metadata_basket(osf_item) -> gather.Basket:
DCTERMS.creator: OSF_AGENT_REFERENCE,
},
OWL.sameAs: None,
}

OSF_FILEVERSION = {
DCTERMS.created: None,
DCTERMS.creator: OSF_AGENT_REFERENCE,
DCTERMS.extent: None,
DCTERMS.modified: None,
DCTERMS.requires: None,
DCTERMS['format']: None,
OSF.versionNumber: None,
PROV.qualifiedAttribution: None,
}

OSFMAP = {
Expand Down Expand Up @@ -190,7 +187,7 @@ def pls_get_magic_metadata_basket(osf_item) -> gather.Basket:
DCTERMS.modified: None,
DCTERMS.title: None,
DCTERMS.type: None,
OSF.hasFileVersion: OSF_FILEVERSION,
OSF.hasFileVersion: None,
OSF.isContainedBy: OSF_OBJECT_REFERENCE,
OSF.fileName: None,
OSF.filePath: None,
Expand All @@ -211,14 +208,26 @@ def pls_get_magic_metadata_basket(osf_item) -> gather.Basket:
# metadata not included in the core record
OSFMAP_SUPPLEMENT = {
OSF.Project: {
OSF.hasOsfAddon: None,
OSF.storageByteCount: None,
OSF.storageRegion: None,
},
OSF.ProjectComponent: {
OSF.hasOsfAddon: None,
OSF.storageByteCount: None,
OSF.storageRegion: None,
},
OSF.Registration: {
OSF.storageByteCount: None,
OSF.storageRegion: None,
},
OSF.RegistrationComponent: {
OSF.storageByteCount: None,
OSF.storageRegion: None,
},
OSF.Preprint: {
OSF.storageByteCount: None,
OSF.storageRegion: None,
},
OSF.File: {
},
Expand Down Expand Up @@ -254,6 +263,11 @@ def pls_get_magic_metadata_basket(osf_item) -> gather.Basket:
ArtifactTypes.PAPERS: OSF.hasPapersResource,
ArtifactTypes.SUPPLEMENTS: OSF.hasSupplementalResource,
}
# Map osf permission levels to contributor-role references (used as the
# dcat:hadRole object in prov:qualifiedAttribution triples).
OSF_CONTRIBUTOR_ROLES = {
    osfpermissions.READ: OSF['readonly-contributor'],
    osfpermissions.WRITE: OSF['write-contributor'],
    osfpermissions.ADMIN: OSF['admin-contributor'],
}

BEPRESS_SUBJECT_SCHEME_URI = 'https://bepress.com/reference_guide_dc/disciplines/'
BEPRESS_SUBJECT_SCHEME_TITLE = 'bepress Digital Commons Three-Tiered Taxonomy'
Expand Down Expand Up @@ -686,6 +700,8 @@ def _gather_fileversion(fileversion, fileversion_iri):
version_sha256 = (fileversion.metadata or {}).get('sha256')
if version_sha256:
yield (fileversion_iri, DCTERMS.requires, checksum_iri('sha-256', version_sha256))
if fileversion.region is not None:
yield from _storage_region_triples(fileversion.region, subject_ref=fileversion_iri)


@gather.er(OSF.contains)
Expand Down Expand Up @@ -886,6 +902,19 @@ def gather_agents(focus):
# TODO: preserve order via rdflib.Seq


@gather.er(PROV.qualifiedAttribution)
def gather_qualified_attributions(focus):
    """Yield prov:qualifiedAttribution triples for each visible contributor,
    linking the contributor (prov:agent) with their osf role (dcat:hadRole).
    """
    contributors = getattr(focus.dbmodel, 'contributor_set', None)
    if contributors is None:
        return  # focus item has no contributors relation
    for contributor in contributors.filter(visible=True).select_related('user'):
        role_ref = OSF_CONTRIBUTOR_ROLES.get(contributor.permission)
        if role_ref is None:
            continue  # permission level not in the known role map; skip
        attribution_ref = rdflib.BNode()
        yield (PROV.qualifiedAttribution, attribution_ref)
        yield (attribution_ref, PROV.agent, OsfFocus(contributor.user))
        yield (attribution_ref, DCAT.hadRole, role_ref)


@gather.er(OSF.affiliation)
def gather_affiliated_institutions(focus):
if hasattr(focus.dbmodel, 'get_affiliated_institutions'): # like OSFUser
Expand Down Expand Up @@ -1116,3 +1145,43 @@ def gather_last_month_usage(focus):
yield (_usage_report_ref, OSF.viewSessionCount, _usage_report.view_session_count)
yield (_usage_report_ref, OSF.downloadCount, _usage_report.download_count)
yield (_usage_report_ref, OSF.downloadSessionCount, _usage_report.download_session_count)


@gather.er(OSF.hasOsfAddon)
def gather_addons(focus):
    """Yield osf:hasOsfAddon triples describing each explicitly-enabled addon
    on the focus item (always-on default addons are omitted).
    """
    # note: when gravyvalet exists, use `iterate_addons_for_resource`
    # from osf.external.gravy_valet.request_helpers and get urls like
    # "https://addons.osf.example/v1/addon-imps/..." instead of a urn
    for addon_settings in focus.dbmodel.get_addons():
        if addon_settings.config.added_default:
            continue  # skip always-on addons
        addon_ref = rdflib.URIRef(f'urn:osf.io:addons:{addon_settings.short_name}')
        yield (OSF.hasOsfAddon, addon_ref)
        yield (addon_ref, RDF.type, OSF.AddonImplementation)
        yield (addon_ref, DCTERMS.identifier, addon_settings.short_name)
        yield (addon_ref, SKOS.prefLabel, addon_settings.config.full_name)


@gather.er(OSF.storageRegion)
def gather_storage_region(focus):
    """Yield osf:storageRegion triples for the focus item's osfstorage region."""
    region = getattr(focus.dbmodel, 'osfstorage_region', None)
    if region is None:
        return  # no region attribute (or region unset) on this model
    yield from _storage_region_triples(region)


def _storage_region_triples(region, *, subject_ref=None):
    """Yield triples describing a storage ``region``.

    When ``subject_ref`` is given, the osf:storageRegion triple is a full
    (subject, predicate, object) triple on that subject; otherwise it is a
    two-tuple attributed to the current gathering focus.
    """
    region_ref = rdflib.URIRef(region.absolute_api_v2_url)
    if subject_ref is not None:
        yield (subject_ref, OSF.storageRegion, region_ref)
    else:
        yield (OSF.storageRegion, region_ref)
    yield (region_ref, SKOS.prefLabel, rdflib.Literal(region.name, lang='en'))


@gather.er(
    OSF.storageByteCount,
    focustype_iris=[OSF.Project, OSF.ProjectComponent, OSF.Registration, OSF.RegistrationComponent, OSF.Preprint]
)
def gather_storage_byte_count(focus):
    """Yield an osf:storageByteCount triple with the focus item's total storage usage."""
    byte_count = get_storage_usage_total(focus.dbmodel)
    if byte_count is None:
        return  # total unavailable; nothing to yield
    yield (OSF.storageByteCount, byte_count)
2 changes: 2 additions & 0 deletions osf/metadata/rdfutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
RDF = rdflib.Namespace('http://www.w3.org/1999/02/22-rdf-syntax-ns#') # "resource description framework"
SKOS = rdflib.Namespace('http://www.w3.org/2004/02/skos/core#') # "simple knowledge organization system"
DCAT = rdflib.Namespace('http://www.w3.org/ns/dcat#') # "data catalog (vocabulary)"
PROV = rdflib.Namespace('http://www.w3.org/ns/prov#') # "provenance"
# non-standard namespace for datacite terms (resolves to datacite docs)
DATACITE = rdflib.Namespace('https://schema.datacite.org/meta/kernel-4/#')

Expand All @@ -38,6 +39,7 @@
'skos': SKOS,
'dcmitype': DCMITYPE,
'dcat': DCAT,
'prov': PROV,
}


Expand Down
17 changes: 8 additions & 9 deletions osf_tests/metadata/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,23 +3,23 @@
from osf.metadata import gather
from osf.metadata.rdfutils import contextualized_graph

def assert_triples(actual_triples, expected_triples):
def assert_triples(actual_triples, expected_triples, label=''):
_expected_graph, _expected_focuses = _get_graph_and_focuses(expected_triples)
_actual_graph, _actual_focuses = _get_graph_and_focuses(actual_triples)
assert_graphs_equal(_actual_graph, _expected_graph)
assert_graphs_equal(_actual_graph, _expected_graph, label=label)
assert _expected_focuses == _actual_focuses


def assert_graphs_equal(actual_rdflib_graph, expected_rdflib_graph):
def assert_graphs_equal(actual_rdflib_graph, expected_rdflib_graph, label=''):
(_overlap, _expected_but_absent, _unexpected_but_present) = rdflib.compare.graph_diff(
expected_rdflib_graph,
actual_rdflib_graph,
)
assert not _expected_but_absent and not _unexpected_but_present, '\n\t'.join((
'unequal triple-sets!',
(f'unequal triplesets for "{label}"!' if label else 'unequal triple-sets!'),
f'overlap size: {len(_overlap)}',
f'expected (but absent): {_friendly_graph(_expected_but_absent)}',
f'unexpected (but present): {_friendly_graph(_unexpected_but_present)}',
f'expected (but absent): {_indented_graph(_expected_but_absent)}',
f'unexpected (but present): {_indented_graph(_unexpected_but_present)}',
))


Expand All @@ -35,10 +35,9 @@ def _get_graph_and_focuses(triples):
return _graph, _focuses


def _friendly_graph(rdfgraph) -> str:
def _indented_graph(rdfgraph) -> str:
_graph_to_print = contextualized_graph(rdfgraph)
_delim = '\n\t\t'
return _delim + _delim.join(
' '.join(_term.n3() for _term in triple)
for triple in _graph_to_print
_graph_to_print.serialize(format='turtle').strip().split('\n')
)
6 changes: 5 additions & 1 deletion osf_tests/metadata/expected_metadata_files/file_basic.turtle
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
@prefix foaf: <http://xmlns.com/foaf/0.1/> .
@prefix osf: <https://osf.io/vocab/2022/> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .
@prefix skos: <http://www.w3.org/2004/02/skos/core#> .

<http://localhost:5000/w3ibb> a osf:File ;
dcat:accessService <http://localhost:5000> ;
Expand Down Expand Up @@ -33,7 +34,8 @@
dcterms:extent "0.000007 MB" ;
dcterms:format "img/png" ;
dcterms:modified "2123-05-04" ;
dcterms:requires <urn:checksum:sha-256::6ac3c336e4094835293a3fed8a4b5fedde1b5e2626d9838fed50693bba00af0e> ;
dcterms:requires <urn:checksum:sha-256::shashasha> ;
osf:storageRegion <http://localhost:8000/v2/regions/us/> ;
osf:versionNumber "1" .

<http://localhost:5000/w1ibb> a dcterms:Agent,
Expand All @@ -45,3 +47,5 @@
foaf:Organization ;
dcterms:identifier "http://localhost:5000" ;
foaf:name "OSF" .

<http://localhost:8000/v2/regions/us/> skos:prefLabel "United States"@en .
6 changes: 5 additions & 1 deletion osf_tests/metadata/expected_metadata_files/file_full.turtle
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
@prefix osf: <https://osf.io/vocab/2022/> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix skos: <http://www.w3.org/2004/02/skos/core#> .

<http://localhost:5000/w3ibb> a osf:File ;
dcat:accessService <http://localhost:5000> ;
Expand Down Expand Up @@ -39,7 +40,8 @@
dcterms:extent "0.000007 MB" ;
dcterms:format "img/png" ;
dcterms:modified "2123-05-04" ;
dcterms:requires <urn:checksum:sha-256::6ac3c336e4094835293a3fed8a4b5fedde1b5e2626d9838fed50693bba00af0e> ;
dcterms:requires <urn:checksum:sha-256::shashasha> ;
osf:storageRegion <http://localhost:8000/v2/regions/us/> ;
osf:versionNumber "1" .

<https://moneypockets.example/millions> a osf:FundingAward ;
Expand Down Expand Up @@ -76,3 +78,5 @@
foaf:name "OSF" .

<https://schema.datacite.org/meta/kernel-4/#Dataset> rdfs:label "Dataset"@en .

<http://localhost:8000/v2/regions/us/> skos:prefLabel "United States"@en .
Loading
Loading