diff --git a/osf/metrics/reporters/__init__.py b/osf/metrics/reporters/__init__.py
index 1f8e0fba862..26738fc0418 100644
--- a/osf/metrics/reporters/__init__.py
+++ b/osf/metrics/reporters/__init__.py
@@ -4,6 +4,7 @@
 from .storage_addon_usage import StorageAddonUsageReporter
 from .download_count import DownloadCountReporter
 from .institution_summary import InstitutionSummaryReporter
+from .institutional_users import InstitutionalUsersReporter
 from .new_user_domain import NewUserDomainReporter
 from .node_count import NodeCountReporter
 from .osfstorage_file_count import OsfstorageFileCountReporter
@@ -26,3 +27,4 @@ class AllDailyReporters(enum.Enum):
 
 class AllMonthlyReporters(enum.Enum):
     SPAM_COUNT = SpamCountReporter
+    INSTITUTIONAL_USERS = InstitutionalUsersReporter
diff --git a/osf/metrics/reporters/_base.py b/osf/metrics/reporters/_base.py
index d3bf1722523..24499971ede 100644
--- a/osf/metrics/reporters/_base.py
+++ b/osf/metrics/reporters/_base.py
@@ -1,5 +1,7 @@
+from collections import abc
 import logging
 
+from osf.metrics.reports import MonthlyReport
 from osf.metrics.utils import YearMonth
 
 
@@ -7,12 +9,15 @@
 
 
 class MonthlyReporter:
-    def report(self, report_yearmonth: YearMonth):
+    def report(
+        self,
+        report_yearmonth: YearMonth,
+    ) -> abc.Iterable[MonthlyReport] | abc.Iterator[MonthlyReport]:
         """build a report for the given month
         """
-        raise NotImplementedError(f'{self.__name__} must implement `report`')
+        raise NotImplementedError(f'{type(self).__name__} must implement `report`')
 
-    def run_and_record_for_month(self, report_yearmonth: YearMonth):
+    def run_and_record_for_month(self, report_yearmonth: YearMonth) -> None:
         reports = self.report(report_yearmonth)
         for report in reports:
             assert report.report_yearmonth == str(report_yearmonth)
diff --git a/osf/metrics/reporters/institutional_users.py b/osf/metrics/reporters/institutional_users.py
new file mode 100644
index 00000000000..0a0598172e0
--- /dev/null
+++ b/osf/metrics/reporters/institutional_users.py
@@ -0,0 +1,147 @@
+import dataclasses
+import datetime
+
+from django.contrib.contenttypes.models import ContentType
+from django.db.models import Q, F, Sum
+
+from osf import models as osfdb
+from osf.models.spam import SpamStatus
+from addons.osfstorage.models import OsfStorageFile
+from osf.metrics.reports import InstitutionalUserReport
+from osf.metrics.utils import YearMonth
+from ._base import MonthlyReporter
+
+
+_CHUNK_SIZE = 500
+
+
+class InstitutionalUsersReporter(MonthlyReporter):
+    '''build an InstitutionalUserReport for each institution-user affiliation
+
+    built for the institution dashboard at https://osf.example/institutions/<id>/dashboard/,
+    which offers institutional admins insight into how people at their institution are
+    using osf, based on their explicitly-affiliated osf objects
+    '''
+    def report(self, yearmonth: YearMonth):
+        _before_datetime = yearmonth.next_month()
+        for _institution in osfdb.Institution.objects.filter(created__lt=_before_datetime):
+            _user_qs = _institution.get_institution_users().filter(created__lt=_before_datetime)
+            for _user in _user_qs.iterator(chunk_size=_CHUNK_SIZE):
+                _helper = _InstiUserReportHelper(_institution, _user, yearmonth, _before_datetime)
+                yield _helper.report
+
+
+# helper
+@dataclasses.dataclass
+class _InstiUserReportHelper:
+    '''compute one InstitutionalUserReport (stored on `report`) for an institution-user pair'''
+    institution: osfdb.Institution
+    user: osfdb.OSFUser
+    yearmonth: YearMonth
+    before_datetime: datetime.datetime
+    report: InstitutionalUserReport = dataclasses.field(init=False)
+
+    def __post_init__(self):
+        _affiliation = self.user.get_institution_affiliation(self.institution._id)
+        self.report = InstitutionalUserReport(
+            report_yearmonth=self.yearmonth,
+            institution_id=self.institution._id,
+            user_id=self.user._id,
+            user_name=self.user.fullname,
+            department_name=(_affiliation.sso_department or None),
+            # guard: `date_last_login` may be unset (e.g. a user who never logged in)
+            month_last_login=(
+                YearMonth.from_date(self.user.date_last_login)
+                if self.user.date_last_login is not None
+                else None
+            ),
+            account_creation_date=YearMonth.from_date(self.user.created),
+            orcid_id=self.user.get_verified_external_id('ORCID', verified_only=True),
+            public_project_count=self._public_project_queryset().count(),
+            private_project_count=self._private_project_queryset().count(),
+            public_registration_count=self._public_registration_queryset().count(),
+            embargoed_registration_count=self._embargoed_registration_queryset().count(),
+            public_file_count=self._public_osfstorage_file_queryset().count(),
+            published_preprint_count=self._published_preprint_queryset().count(),
+            storage_byte_count=self._storage_byte_count(),
+        )
+
+    def _node_queryset(self):
+        _institution_node_qs = self.institution.nodes.filter(
+            created__lt=self.before_datetime,
+            is_deleted=False,
+        ).exclude(spam_status=SpamStatus.SPAM)
+        return osfdb.Node.objects.get_nodes_for_user(
+            user=self.user,
+            base_queryset=_institution_node_qs,
+        )
+
+    def _public_project_queryset(self):
+        return self._node_queryset().filter(
+            type='osf.node',  # `type` field from TypedModel
+            is_public=True,
+            root_id=F('pk'),  # only root nodes
+        )
+
+    def _private_project_queryset(self):
+        return self._node_queryset().filter(
+            type='osf.node',  # `type` field from TypedModel
+            is_public=False,
+            root_id=F('pk'),  # only root nodes
+        )
+
+    def _public_registration_queryset(self):
+        return self._node_queryset().filter(
+            type='osf.registration',  # `type` field from TypedModel
+            is_public=True,
+            root_id=F('pk'),  # only root nodes
+        )
+
+    def _embargoed_registration_queryset(self):
+        return self._node_queryset().filter(
+            type='osf.registration',  # `type` field from TypedModel
+            is_public=False,
+            root_id=F('pk'),  # only root nodes
+            embargo__end_date__gte=self.before_datetime,
+        )
+
+    def _published_preprint_queryset(self):
+        if not hasattr(osfdb.Preprint, 'affiliated_institutions'):
+            return osfdb.Preprint.objects.none()  # HACK: preprints affiliation project still in-progress
+        return (
+            osfdb.Preprint.objects.can_view()  # published/publicly-viewable
+            .filter(
+                affiliated_institutions=self.institution,
+                _contributors=self.user,
+                date_published__lt=self.before_datetime,
+            )
+            .exclude(spam_status=SpamStatus.SPAM)
+        )
+
+    def _public_osfstorage_file_queryset(self):
+        _target_node_q = Q(
+            # any public project, registration, project component, or registration component
+            target_object_id__in=self._node_queryset().filter(is_public=True).values('pk'),
+            target_content_type=ContentType.objects.get_for_model(osfdb.AbstractNode),
+        )
+        _target_preprint_q = Q(
+            target_object_id__in=self._published_preprint_queryset().values('pk'),
+            target_content_type=ContentType.objects.get_for_model(osfdb.Preprint),
+        )
+        return (
+            OsfStorageFile.objects
+            .filter(
+                created__lt=self.before_datetime,
+                deleted__isnull=True,
+                purged__isnull=True,
+            )
+            .filter(_target_node_q | _target_preprint_q)
+        )
+
+    def _storage_byte_count(self):
+        return osfdb.FileVersion.objects.filter(
+            size__gt=0,
+            created__lt=self.before_datetime,
+            purged__isnull=True,
+            basefilenode__in=self._public_osfstorage_file_queryset(),
+        ).aggregate(storage_bytes=Sum('size', default=0))['storage_bytes']
diff --git a/osf/metrics/reports.py b/osf/metrics/reports.py
index 22758f91aea..68127d8f958 100644
--- a/osf/metrics/reports.py
+++ b/osf/metrics/reports.py
@@ -225,3 +225,23 @@ class SpamSummaryReport(MonthlyReport):
     preprint_flagged = metrics.Integer()
     user_marked_as_spam = metrics.Integer()
     user_marked_as_ham = metrics.Integer()
+
+
+class InstitutionalUserReport(MonthlyReport):
+    UNIQUE_TOGETHER_FIELDS = ('report_yearmonth', 'institution_id', 'user_id',)
+    institution_id = metrics.Keyword()
+    # user info:
+    user_id = metrics.Keyword()
+    user_name = metrics.Text()
+    department_name = metrics.Text()
+    month_last_login = YearmonthField()
+    account_creation_date = YearmonthField()
+    orcid_id = metrics.Keyword()
+    # counts:
+    public_project_count = metrics.Integer()
+    private_project_count = metrics.Integer()
+    public_registration_count = metrics.Integer()
+    embargoed_registration_count = metrics.Integer()
+    published_preprint_count = metrics.Integer()
+    public_file_count = metrics.Integer()
+    storage_byte_count = metrics.Long()  # byte totals can exceed the 32-bit range of Integer
diff --git a/osf_tests/metrics/reporters/test_institutional_users_reporter.py b/osf_tests/metrics/reporters/test_institutional_users_reporter.py
new file mode 100644
index 00000000000..6af191cc669
--- /dev/null
+++ b/osf_tests/metrics/reporters/test_institutional_users_reporter.py
@@ -0,0 +1,257 @@
+from __future__ import annotations
+import dataclasses
+import datetime
+from unittest import mock
+
+from django.test import TestCase
+
+from api_tests.utils import create_test_file
+from osf import models as osfdb
+from osf.metrics.reports import InstitutionalUserReport
+from osf.metrics.reporters import InstitutionalUsersReporter
+from osf.metrics.utils import YearMonth
+from osf_tests.factories import (
+    InstitutionFactory,
+    PreprintFactory,
+    ProjectFactory,
+    RegistrationFactory,
+    UserFactory,
+    EmbargoFactory,
+)
+
+
+def _can_affiliate_preprints() -> bool:
+    # HACK: preprints affiliation project still in-progress
+    return hasattr(osfdb.Preprint, 'affiliated_institutions')
+
+
+def _patch_now(fakenow: datetime.datetime):
+    return mock.patch('django.utils.timezone.now', return_value=fakenow)
+
+
+class TestInstiUsersReporter(TestCase):
+    @classmethod
+    def setUpTestData(cls):
+        cls._yearmonth = YearMonth(2012, 7)
+        cls._now = datetime.datetime(
+            cls._yearmonth.year,
+            cls._yearmonth.month,
+            13,  # just some day in the month
+            tzinfo=datetime.UTC,
+        )
+        with _patch_now(cls._now):
+            cls._institution = InstitutionFactory()
+            cls._user_setup_with_nothing = _InstiUserSetup(0, 0, 0, 0, 0, cls._institution, cls._now)
+            cls._user_setup_with_ones = _InstiUserSetup(1, 1, 1, 1, 1, cls._institution, cls._now)
+            cls._user_setup_with_stuff = _InstiUserSetup(
+                2, 3, 5, 3, 2, cls._institution, cls._now,
+                orcid_id='1111-2222-3333-4444',
+                department_name='blargl studies',
+            )
+            cls._user_setup_with_stuff.fill_uncounted_objects()
+
+    def _assert_report_matches_setup(self, report: InstitutionalUserReport, setup: _InstiUserSetup):
+        self.assertEqual(report.institution_id, setup.institution._id)
+        # user info:
+        self.assertEqual(report.user_id, setup.user._id)
+        self.assertEqual(report.user_name, setup.user.fullname)
+        self.assertEqual(report.department_name, setup.department_name)
+        self.assertEqual(report.month_last_login, YearMonth.from_date(setup.user.date_last_login))
+        self.assertEqual(report.account_creation_date, YearMonth.from_date(setup.user.created))
+        self.assertEqual(report.orcid_id, setup.orcid_id)
+        # counts (NOTE: report.public_file_count and report.storage_byte_count tested separately)
+        self.assertEqual(report.public_project_count, setup.public_project_count)
+        self.assertEqual(report.private_project_count, setup.private_project_count)
+        self.assertEqual(report.public_registration_count, setup.public_registration_count)
+        self.assertEqual(report.embargoed_registration_count, setup.embargoed_registration_count)
+        if _can_affiliate_preprints():
+            self.assertEqual(report.published_preprint_count, setup.published_preprint_count)
+        else:
+            self.assertEqual(report.published_preprint_count, 0)
+
+    def test_no_users(self):
+        _actual_reports = list(InstitutionalUsersReporter().report(self._yearmonth))
+        self.assertEqual(_actual_reports, [])
+
+    def test_one_user_with_nothing(self):
+        self._user_setup_with_nothing.affiliate_user()
+        _reports = list(InstitutionalUsersReporter().report(self._yearmonth))
+        self.assertEqual(len(_reports), 1)
+        self._assert_report_matches_setup(_reports[0], self._user_setup_with_nothing)
+
+    def test_one_user_with_ones(self):
+        self._user_setup_with_ones.affiliate_user()
+        _reports = list(InstitutionalUsersReporter().report(self._yearmonth))
+        self.assertEqual(len(_reports), 1)
+        self._assert_report_matches_setup(_reports[0], self._user_setup_with_ones)
+
+    def test_one_user_with_stuff_and_no_files(self):
+        self._user_setup_with_stuff.affiliate_user()
+        _reports = list(InstitutionalUsersReporter().report(self._yearmonth))
+        self.assertEqual(len(_reports), 1)
+        self._assert_report_matches_setup(_reports[0], self._user_setup_with_stuff)
+        self.assertEqual(_reports[0].public_file_count, 0)
+        self.assertEqual(_reports[0].storage_byte_count, 0)
+
+    def test_one_user_with_stuff_and_a_file(self):
+        self._user_setup_with_stuff.affiliate_user()
+        _user = self._user_setup_with_stuff.user
+        _project = _user.nodes.first()
+        with _patch_now(self._now):
+            create_test_file(target=_project, user=_user, size=37)
+        (_report,) = InstitutionalUsersReporter().report(self._yearmonth)
+        self._assert_report_matches_setup(_report, self._user_setup_with_stuff)
+        self.assertEqual(_report.public_file_count, 1)
+        self.assertEqual(_report.storage_byte_count, 37)
+
+    def test_one_user_with_stuff_and_multiple_files(self):
+        self._user_setup_with_stuff.affiliate_user()
+        _user = self._user_setup_with_stuff.user
+        _project = _user.nodes.first()
+        with _patch_now(self._now):
+            create_test_file(target=_project, user=_user, size=37, filename='b')
+            create_test_file(target=_project, user=_user, size=73, filename='bl')
+            _component = ProjectFactory(parent=_project, creator=_user, is_public=True)
+            _component.affiliated_institutions.add(self._institution)
+            create_test_file(target=_component, user=_user, size=53, filename='bla')
+            create_test_file(target=_component, user=_user, size=51, filename='blar')
+            create_test_file(target=_component, user=_user, size=47, filename='blarg')
+        (_report,) = InstitutionalUsersReporter().report(self._yearmonth)
+        self._assert_report_matches_setup(_report, self._user_setup_with_stuff)
+        self.assertEqual(_report.public_file_count, 5)
+        self.assertEqual(_report.storage_byte_count, 37 + 73 + 53 + 51 + 47)
+
+    def test_several_users(self):
+        _setups = [
+            self._user_setup_with_nothing,
+            self._user_setup_with_ones,
+            self._user_setup_with_stuff,
+        ]
+        for _setup in _setups:
+            _setup.affiliate_user()
+        _setup_by_userid = {
+            _setup.user._id: _setup
+            for _setup in _setups
+        }
+        _reports = list(InstitutionalUsersReporter().report(self._yearmonth))
+        self.assertEqual(len(_reports), len(_setup_by_userid))
+        for _actual_report in _reports:
+            _setup = _setup_by_userid[_actual_report.user_id]
+            self._assert_report_matches_setup(_actual_report, _setup)
+
+
+@dataclasses.dataclass
+class _InstiUserSetup:
+    '''helper class to simplify database setup for a test-case
+
+    (note: public_file_count and storage_byte_count set up separately)
+    '''
+    public_project_count: int
+    private_project_count: int
+    public_registration_count: int
+    embargoed_registration_count: int
+    published_preprint_count: int
+    institution: osfdb.Institution
+    now: datetime.datetime
+    department_name: str | None = None
+    orcid_id: str | None = None
+    user: osfdb.OSFUser = dataclasses.field(init=False)
+
+    def __post_init__(self):
+        self.user = UserFactory(
+            date_last_login=self.now,
+            external_identity=(
+                {'ORCID': {self.orcid_id: 'VERIFIED'}}
+                if self.orcid_id
+                else {}
+            ),
+        )
+        self._add_affiliations(self._generate_counted_objects())
+
+    def affiliate_user(self):
+        self.user.add_or_update_affiliated_institution(
+            self.institution,
+            sso_department=self.department_name,
+        )
+
+    @property
+    def future_timestamp(self):
+        return self.now + datetime.timedelta(days=123)
+
+    def fill_uncounted_objects(self):
+        # uncounted because not affiliated:
+        self._add_public_project()
+        self._add_private_project()
+        self._add_public_registration()
+        self._add_embargoed_registration()
+        self._add_published_preprint()
+        # uncounted because affiliated with another institution:
+        self._add_affiliations((
+            self._add_public_project(),
+            self._add_private_project(),
+            self._add_public_registration(),
+            self._add_embargoed_registration(),
+            self._add_published_preprint(),
+        ), institution=InstitutionFactory())
+        # uncounted because created after the report's time range:
+        with _patch_now(self.future_timestamp):
+            self._add_affiliations((
+                self._add_public_project(),
+                self._add_private_project(),
+                self._add_public_registration(),
+                self._add_embargoed_registration(),
+                self._add_published_preprint(),
+            ))
+
+    def _add_affiliations(self, objs, institution=None):
+        for _obj in objs:
+            if _obj is not None:
+                _obj.affiliated_institutions.add(institution or self.institution)
+
+    def _generate_counted_objects(self):
+        for _ in range(self.public_project_count):
+            yield self._add_public_project()
+        for _ in range(self.private_project_count):
+            yield self._add_private_project()
+        for _ in range(self.public_registration_count):
+            yield self._add_public_registration()
+        for _ in range(self.embargoed_registration_count):
+            yield self._add_embargoed_registration()
+        for _ in range(self.published_preprint_count):
+            yield self._add_published_preprint()
+
+    def _add_public_project(self) -> osfdb.Node:
+        return ProjectFactory(
+            creator=self.user,
+            is_public=True,
+        )
+
+    def _add_private_project(self) -> osfdb.Node:
+        return ProjectFactory(
+            creator=self.user,
+            is_public=False,
+        )
+
+    def _add_public_registration(self) -> osfdb.Registration:
+        return RegistrationFactory(
+            creator=self.user,
+            is_public=True,
+        )
+
+    def _add_embargoed_registration(self) -> osfdb.Registration:
+        return RegistrationFactory(
+            creator=self.user,
+            is_public=False,
+            embargo=EmbargoFactory(
+                user=self.user,
+                end_date=self.future_timestamp,
+            ),
+        )
+
+    def _add_published_preprint(self) -> osfdb.Preprint | None:
+        if _can_affiliate_preprints():  # HACK: preprints affiliation project still in-progress
+            return PreprintFactory(
+                creator=self.user,
+                is_public=True,
+            )
+        return None