Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ENG-6124] Create Monthly Reporter for Institution Summary #10756

Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions osf/metrics/reporters/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from .download_count import DownloadCountReporter
from .institution_summary import InstitutionSummaryReporter
from .institutional_users import InstitutionalUsersReporter
from .institution_summary_monthly import InstitutionalSummaryMonthlyReporter
from .new_user_domain import NewUserDomainReporter
from .node_count import NodeCountReporter
from .osfstorage_file_count import OsfstorageFileCountReporter
Expand All @@ -28,3 +29,4 @@ class AllDailyReporters(enum.Enum):
class AllMonthlyReporters(enum.Enum):
    """Registry of monthly reporters: each member's value is a reporter class.

    NOTE(review): presumably iterated by the monthly metrics job to decide
    which reporters to run -- confirm against the caller.
    """
    SPAM_COUNT = SpamCountReporter
    INSTITUTIONAL_USERS = InstitutionalUsersReporter
    INSTITUTIONAL_SUMMARY = InstitutionalSummaryMonthlyReporter
103 changes: 103 additions & 0 deletions osf/metrics/reporters/institution_summary_monthly.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
from django.contrib.contenttypes.models import ContentType
from django.db.models import Q, F, Sum

from osf.models import Institution, Preprint, AbstractNode, FileVersion
from osf.models.spam import SpamStatus
from addons.osfstorage.models import OsfStorageFile
from osf.metrics.reports import InstitutionMonthlySummaryReport
from osf.metrics.utils import YearMonth
from ._base import MonthlyReporter


class InstitutionalSummaryMonthlyReporter(MonthlyReporter):
    """Generate an InstitutionMonthlySummaryReport for each institution."""

    def report(self, yearmonth: YearMonth):
        """Yield one summary report per ``Institution`` row for ``yearmonth``."""
        for institution in Institution.objects.all():
            yield self.generate_report(institution, yearmonth)

    def generate_report(self, institution, yearmonth):
        """Build the summary report for a single institution.

        Nodes are limited to the institution's nodes that are not deleted,
        not marked as spam, and created before ``yearmonth.next_month()``.
        Preprints come from ``get_published_preprints``.

        NOTE(review): ``user_count`` is not bounded by ``yearmonth``; it is
        whatever ``institution.get_institution_users()`` returns when the
        reporter runs.
        """
        node_queryset = institution.nodes.filter(
            deleted__isnull=True,
            created__lt=yearmonth.next_month(),
        ).exclude(
            spam_status=SpamStatus.SPAM,
        )

        preprint_queryset = self.get_published_preprints(institution, yearmonth)

        return InstitutionMonthlySummaryReport(
            institution_id=institution._id,
            user_count=institution.get_institution_users().count(),
            private_project_count=self._get_count(node_queryset, 'osf.node', is_public=False),
            public_project_count=self._get_count(node_queryset, 'osf.node', is_public=True),
            public_registration_count=self._get_count(node_queryset, 'osf.registration', is_public=True),
            embargoed_registration_count=self._get_count(node_queryset, 'osf.registration', is_public=False),
            published_preprint_count=preprint_queryset.count(),
            storage_byte_count=self.get_storage_size(node_queryset, preprint_queryset),
            public_file_count=self.get_files(node_queryset, preprint_queryset, is_public=True).count(),
            monthly_logged_in_user_count=self.get_monthly_logged_in_user_count(institution, yearmonth),
            monthly_active_user_count=self.get_monthly_active_user_count(institution, yearmonth),
        )

    def _get_count(self, node_queryset, node_type, is_public):
        """Count nodes of ``node_type`` with the given ``is_public`` value.

        Only nodes whose ``root_id`` equals their own primary key are counted,
        i.e. nodes that are their own root.
        """
        return node_queryset.filter(
            type=node_type,
            is_public=is_public,
            root_id=F('pk'),
        ).count()

    def get_published_preprints(self, institution, yearmonth):
        """Return the institution's viewable, non-spam preprints up to the month.

        The upper bound is exclusive (``created__lt``) so that it matches the
        node queryset in ``generate_report``; an inclusive bound would count a
        preprint created exactly at ``yearmonth.next_month()`` in this month.

        NOTE(review): "published" relies on ``Preprint.objects.can_view()``,
        which is defined elsewhere -- confirm it matches the intended meaning.
        """
        return Preprint.objects.can_view().filter(
            affiliated_institutions=institution,
            created__lt=yearmonth.next_month(),
        ).exclude(
            spam_status=SpamStatus.SPAM,
        )

    def get_files(self, node_queryset, preprint_queryset, is_public=None):
        """Return live OsfStorageFile rows attached to the given nodes/preprints.

        "Live" means neither deleted nor purged.

        :param node_queryset: nodes whose files should be included
        :param preprint_queryset: preprints whose files should be included
            (used as given; ``is_public`` is not applied to it)
        :param is_public: ``None`` applies no visibility filter to the nodes;
            ``True``/``False`` restricts the nodes to that ``is_public`` value
        """
        # Compare against None so an explicit ``is_public=False`` is honoured
        # instead of being silently treated as "no filter".
        node_filters = {} if is_public is None else {'is_public': is_public}

        target_node_q = Q(
            target_object_id__in=node_queryset.filter(**node_filters).values('pk'),
            target_content_type=ContentType.objects.get_for_model(AbstractNode),
        )
        target_preprint_q = Q(
            target_object_id__in=preprint_queryset.values('pk'),
            target_content_type=ContentType.objects.get_for_model(Preprint),
        )
        return OsfStorageFile.objects.filter(
            deleted__isnull=True,
            purged__isnull=True,
        ).filter(target_node_q | target_preprint_q)

    def get_storage_size(self, node_queryset, preprint_queryset):
        """Return the summed ``size`` of unpurged, non-empty file versions.

        Covers every file returned by ``get_files`` without a visibility
        filter; yields ``0`` when no version matches.
        """
        files = self.get_files(node_queryset, preprint_queryset)
        return FileVersion.objects.filter(
            size__gt=0,
            purged__isnull=True,
            basefilenode__in=files,
        ).aggregate(storage_bytes=Sum('size', default=0))['storage_bytes']

    def get_monthly_logged_in_user_count(self, institution, yearmonth):
        """Count institution users whose ``date_last_login`` is in the month.

        The window is ``[target_month(), next_month())``.
        """
        month_start = yearmonth.target_month()
        month_end = yearmonth.next_month()
        return institution.get_institution_users().filter(
            date_last_login__gte=month_start,
            date_last_login__lt=month_end,
        ).count()

    def get_monthly_active_user_count(self, institution, yearmonth):
        """Count enabled institution users with a log entry in the month.

        A user qualifies with at least one related ``logs`` row or
        ``preprint_logs`` row created inside
        ``[target_month(), next_month())``. Users with ``date_disabled`` set
        are excluded.
        """
        month_start = yearmonth.target_month()
        month_end = yearmonth.next_month()

        institution_users = institution.get_institution_users().filter(
            date_disabled__isnull=True,
        )

        # Both bounds sit in the same Q so they constrain the same related
        # row; distinct() collapses the duplicates produced by the joins.
        active_users = institution_users.filter(
            Q(
                logs__created__gte=month_start,
                logs__created__lt=month_end,
            ) |
            Q(
                preprint_logs__created__gte=month_start,
                preprint_logs__created__lt=month_end,
            )
        ).distinct()

        return active_users.count()
15 changes: 15 additions & 0 deletions osf/metrics/reports.py
Original file line number Diff line number Diff line change
Expand Up @@ -270,3 +270,18 @@ class InstitutionalUserReport(MonthlyReport):
published_preprint_count = metrics.Integer()
public_file_count = metrics.Long()
storage_byte_count = metrics.Long()


class InstitutionMonthlySummaryReport(MonthlyReport):
    """Monthly report holding summary counts for a single institution."""
    # NOTE(review): presumably used by the MonthlyReport base to keep one
    # report per (month, institution) pair -- confirm against the base class.
    UNIQUE_TOGETHER_FIELDS = ('report_yearmonth', 'institution_id', )
    institution_id = metrics.Keyword()
    # Counts stored as Integer.
    user_count = metrics.Integer()
    public_project_count = metrics.Integer()
    private_project_count = metrics.Integer()
    public_registration_count = metrics.Integer()
    embargoed_registration_count = metrics.Integer()
    published_preprint_count = metrics.Integer()
    # Counts stored as Long.
    storage_byte_count = metrics.Long()
    public_file_count = metrics.Long()
    monthly_logged_in_user_count = metrics.Long()
    monthly_active_user_count = metrics.Long()
135 changes: 135 additions & 0 deletions osf_tests/metrics/reporters/test_institutional_summary_reporter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
import datetime
from django.test import TestCase
from osf.metrics.reporters import InstitutionalSummaryMonthlyReporter
from osf.metrics.utils import YearMonth
from osf_tests.factories import (
InstitutionFactory,
ProjectFactory,
RegistrationFactory,
PreprintFactory,
AuthUserFactory,
)


class TestInstiSummaryMonthlyReporter(TestCase):

@classmethod
def setUpTestData(cls):
    """Create one institution with one of each counted object, dated 2018-02-04."""
    cls._yearmonth = YearMonth(2018, 2)  # February 2018
    institution = cls._institution = InstitutionFactory()
    when = cls._now = datetime.datetime(2018, 2, 4, tzinfo=datetime.UTC)

    # Existing data for the primary institution
    cls._public_project = cls._create_affiliated_project(
        institution, is_public=True, created=when,
    )
    cls._private_project = cls._create_affiliated_project(
        institution, is_public=False, created=when,
    )
    cls._public_registration = cls._create_affiliated_registration(
        institution, is_public=True, created=when,
    )
    cls._embargoed_registration = cls._create_affiliated_registration(
        institution, is_public=False, created=when,
    )

    cls._published_preprint = cls._create_affiliated_preprint(
        institution, is_public=True, created=when,
    )

    cls._logged_in_user = cls._create_logged_in_user(institution, date_last_login=when)
    # The active user's log is dated one day earlier, still inside February.
    cls._active_user = cls._create_active_user(
        institution,
        date_confirmed=when - datetime.timedelta(days=1),
    )

@classmethod
def _create_affiliated_preprint(cls, institution, is_public, created):
    """Create a preprint affiliated with ``institution`` and backdate its ``created``."""
    preprint = PreprintFactory(is_public=is_public)
    preprint.affiliated_institutions.add(institution)
    # ``created`` is overwritten after the factory call, then persisted.
    preprint.created = created
    preprint.save()
    return preprint

@classmethod
def _create_affiliated_project(cls, institution, is_public, created):
    """Create a project affiliated with ``institution`` and backdate its ``created``."""
    node = ProjectFactory(is_public=is_public)
    node.affiliated_institutions.add(institution)
    # ``created`` is overwritten after the factory call, then persisted.
    node.created = created
    node.save()
    return node

@classmethod
def _create_affiliated_registration(cls, institution, is_public, created):
    """Create a registration affiliated with ``institution`` and backdate its ``created``."""
    reg = RegistrationFactory(is_public=is_public)
    reg.affiliated_institutions.add(institution)
    # ``created`` is overwritten after the factory call, then persisted.
    reg.created = created
    reg.save()
    return reg

@classmethod
def _create_logged_in_user(cls, institution, date_last_login):
    """Create a user affiliated with ``institution`` with the given ``date_last_login``."""
    member = AuthUserFactory()
    member.add_or_update_affiliated_institution(institution)
    member.date_last_login = date_last_login
    member.save()
    return member

@classmethod
def _create_active_user(cls, institution, date_confirmed):
    """Create an affiliated user whose single log entry is dated ``date_confirmed``.

    ``date_confirmed`` is applied both to the user and to the ``created``
    timestamp of the user's one log entry.
    """
    member = AuthUserFactory()
    member.add_or_update_affiliated_institution(institution)
    member.date_confirmed = date_confirmed

    ProjectFactory(creator=member)  # adds log to make active
    # ``.get()`` requires that exactly one log exists for the user here.
    activity_log = member.logs.get()
    activity_log.created = date_confirmed
    activity_log.save()

    member.save()
    return member

def test_report_generation(self):
    """The reporter yields exactly one report whose fields match the fixture data."""
    generated = list(InstitutionalSummaryMonthlyReporter().report(self._yearmonth))
    self.assertEqual(len(generated), 1)

    summary = generated[0]
    # Checked in declaration order; the first mismatch fails the test.
    expected_fields = (
        ('institution_id', self._institution._id),
        ('user_count', 2),  # _logged_in_user and _active_user
        ('public_project_count', 1),
        ('private_project_count', 1),
        ('public_registration_count', 1),
        ('embargoed_registration_count', 1),
        ('published_preprint_count', 1),
        ('storage_byte_count', 1337),  # test value for one file
        ('public_file_count', 1),
        ('monthly_logged_in_user_count', 1),
        ('monthly_active_user_count', 1),
    )
    for field_name, expected_value in expected_fields:
        self.assertEqual(getattr(summary, field_name), expected_value)
Comment on lines +85 to +95
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

would ideally have more than one test case


def test_report_generation_multiple_institutions(self):
institution2 = InstitutionFactory()
institution3 = InstitutionFactory()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: why create institution3 and then not assert anything on its report?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just making sure it can handle an empty one.


# Set up dates for different months
now = datetime.datetime(2018, 2, 4, tzinfo=datetime.UTC)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: this is already at self._now? why this test method so inconsistent from the last one? when setting up a different scenario with different data, might be cleaner in a separate TestCase

last_month = datetime.datetime(2018, 1, 15, tzinfo=datetime.UTC)
next_month = datetime.datetime(2018, 3, 10, tzinfo=datetime.UTC)

self._create_affiliated_project(institution2, is_public=True, created=now)
self._create_affiliated_project(institution3, is_public=True, created=last_month)

# Create future projects for self._institution (should not be counted)
self._create_affiliated_project(self._institution, is_public=True, created=next_month)

# Create users affiliated with different institutions
self._create_active_user(institution2, date_confirmed=now)
self._create_active_user(institution3, date_confirmed=last_month)

# Run the reporter for the current month (February 2018)
reporter = InstitutionalSummaryMonthlyReporter()
reports = list(reporter.report(self._yearmonth))
self.assertEqual(len(reports), 3) # Reports for self._institution, institution2, institution3

# Extract reports by institution
report_institution = next(r for r in reports if r.institution_id == self._institution._id)
report_institution2 = next(r for r in reports if r.institution_id == institution2._id)

# Validate report for self._institution
self.assertEqual(report_institution.public_project_count, 1)
self.assertEqual(report_institution.private_project_count, 1)
self.assertEqual(report_institution.user_count, 2)
self.assertEqual(report_institution.monthly_active_user_count, 1)
self.assertEqual(report_institution.monthly_logged_in_user_count, 1)

# Validate report for institution2
self.assertEqual(report_institution2.public_project_count, 1)
self.assertEqual(report_institution2.private_project_count, 0)
self.assertEqual(report_institution2.user_count, 1)
self.assertEqual(report_institution2.monthly_active_user_count, 1)
self.assertEqual(report_institution2.monthly_logged_in_user_count, 0) # No logged-in users
Comment on lines +132 to +136
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

having no counts greater than 1 doesn't test counting logic very well -- when i suggested "more than one test case" i meant to imply a small variety of situations that yield different results, not roughly the same situation a second time

(tho don't get me wrong, "multiple institutions" is a good step in that direction)

Loading