Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ENG-6164] allow monthly metrics reports on multiple subjects #10718

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 32 additions & 21 deletions osf/metrics/reports.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from collections import abc
import datetime

from django.dispatch import receiver
Expand All @@ -20,10 +21,14 @@ class DailyReport(metrics.Metric):
There's something we'd like to know about every so often,
so let's regularly run a report and stash the results here.
"""
DAILY_UNIQUE_FIELD = None # set in subclasses that expect multiple reports per day
UNIQUE_TOGETHER_FIELDS = ('report_date',) # override in subclasses for multiple reports per day

report_date = metrics.Date(format='strict_date', required=True)

def __init_subclass__(cls, **kwargs):
super().__init_subclass__(**kwargs)
assert 'report_date' in cls.UNIQUE_TOGETHER_FIELDS, f'DailyReport subclasses must have "report_date" in UNIQUE_TOGETHER_FIELDS (on {cls.__qualname__}, got {cls.UNIQUE_TOGETHER_FIELDS})'

class Meta:
abstract = True
dynamic = metrics.MetaField('strict')
Expand Down Expand Up @@ -58,6 +63,7 @@ def serialize(self, data):
class MonthlyReport(metrics.Metric):
"""MonthlyReport (abstract base for report-based metrics that run monthly)
"""
UNIQUE_TOGETHER_FIELDS = ('report_yearmonth',) # override in subclasses for multiple reports per month

report_yearmonth = YearmonthField()

Expand All @@ -66,26 +72,31 @@ class Meta:
dynamic = metrics.MetaField('strict')
source = metrics.MetaField(enabled=True)

def __init_subclass__(cls, **kwargs):
super().__init_subclass__(**kwargs)
assert 'report_yearmonth' in cls.UNIQUE_TOGETHER_FIELDS, f'MonthlyReport subclasses must have "report_yearmonth" in UNIQUE_TOGETHER_FIELDS (on {cls.__qualname__}, got {cls.UNIQUE_TOGETHER_FIELDS})'
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Seems unnecessary KISS.



@receiver(metrics_pre_save)
def set_report_id(sender, instance, **kwargs):
# Set the document id to a hash of "unique together"
# values (just `report_date` by default) to get
# "ON CONFLICT UPDATE" behavior -- if the document
# already exists, it will be updated rather than duplicated.
# Cannot detect/avoid conflicts this way, but that's ok.

if issubclass(sender, DailyReport):
duf_name = instance.DAILY_UNIQUE_FIELD
if duf_name is None:
instance.meta.id = stable_key(instance.report_date)
else:
duf_value = getattr(instance, duf_name)
if not duf_value or not isinstance(duf_value, str):
raise ReportInvalid(f'{sender.__name__}.{duf_name} MUST have a non-empty string value (got {duf_value})')
instance.meta.id = stable_key(instance.report_date, duf_value)
elif issubclass(sender, MonthlyReport):
instance.meta.id = stable_key(instance.report_yearmonth)
try:
_unique_together_fields = instance.UNIQUE_TOGETHER_FIELDS
except AttributeError:
pass
else:
# Set the document id to a hash of "unique together" fields
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Seems like a lot of comments for a simple set_id function, there's a code smell here that this is over engineered.

# for "ON CONFLICT UPDATE" behavior -- if the document
# already exists, it will be updated rather than duplicated.
# Cannot detect/avoid conflicts this way, but that's ok.
_key_values = []
for _field_name in _unique_together_fields:
_field_value = getattr(instance, _field_name)
if not _field_value or (
isinstance(_field_value, abc.Iterable) and not isinstance(_field_value, str)
):
raise ReportInvalid(f'because "{_field_name}" is in {sender.__name__}.UNIQUE_TOGETHER_FIELDS, {sender.__name__}.{_field_name} MUST have a non-empty scalar value (got {_field_value} of type {type(_field_value)})')
_key_values.append(_field_value)
instance.meta.id = stable_key(*_key_values)


#### BEGIN reusable inner objects #####
Expand Down Expand Up @@ -157,7 +168,7 @@ class DownloadCountReport(DailyReport):


class InstitutionSummaryReport(DailyReport):
DAILY_UNIQUE_FIELD = 'institution_id'
UNIQUE_TOGETHER_FIELDS = ('report_date', 'institution_id',)

institution_id = metrics.Keyword()
institution_name = metrics.Keyword()
Expand All @@ -169,7 +180,7 @@ class InstitutionSummaryReport(DailyReport):


class NewUserDomainReport(DailyReport):
DAILY_UNIQUE_FIELD = 'domain_name'
UNIQUE_TOGETHER_FIELDS = ('report_date', 'domain_name',)

domain_name = metrics.Keyword()
new_user_count = metrics.Integer()
Expand All @@ -187,7 +198,7 @@ class OsfstorageFileCountReport(DailyReport):


class PreprintSummaryReport(DailyReport):
DAILY_UNIQUE_FIELD = 'provider_key'
UNIQUE_TOGETHER_FIELDS = ('report_date', 'provider_key',)

provider_key = metrics.Keyword()
preprint_count = metrics.Integer()
Expand Down
16 changes: 8 additions & 8 deletions osf/metrics/utils.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
import dataclasses
import re
import datetime
import typing
from hashlib import sha256

import pytz
from typing import ClassVar


def stable_key(*key_parts):
Expand All @@ -20,11 +19,12 @@ def stable_key(*key_parts):
return sha256(bytes(plain_key, encoding='utf')).hexdigest()


class YearMonth(typing.NamedTuple):
@dataclasses.dataclass(frozen=True)
class YearMonth:
year: int
month: int

YEARMONTH_RE = re.compile(r'(?P<year>\d{4})-(?P<month>\d{2})')
YEARMONTH_RE: ClassVar[re.Pattern] = re.compile(r'(?P<year>\d{4})-(?P<month>\d{2})')

@classmethod
def from_date(cls, date):
Expand All @@ -46,9 +46,9 @@ def __str__(self):
return f'{self.year}-{self.month:0>2}'

def target_month(self):
return datetime.datetime(self.year, self.month, 1, tzinfo=pytz.utc)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is this changing?

return datetime.datetime(self.year, self.month, 1, tzinfo=datetime.UTC)

def next_month(self):
if self.month == 12:
return datetime.datetime(self.year + 1, 1, 1, tzinfo=pytz.utc)
return datetime.datetime(self.year, self.month + 1, 1, tzinfo=pytz.utc)
return datetime.datetime(self.year + 1, 1, 1, tzinfo=datetime.UTC)
return datetime.datetime(self.year, self.month + 1, 1, tzinfo=datetime.UTC)
19 changes: 11 additions & 8 deletions osf_tests/metrics/test_daily_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,33 +37,36 @@ class Meta:
assert report.meta.id == expected_key
mock_save.reset_mock()

def test_with_duf(self, mock_save):
def test_with_unique_together(self, mock_save):
# multiple reports of this type per day, unique by given field
class UniqueByDateAndField(DailyReport):
DAILY_UNIQUE_FIELD = 'duf'
duf = metrics.Keyword()
UNIQUE_TOGETHER_FIELDS = ('report_date', 'uniquefield',)
uniquefield = metrics.Keyword()

class Meta:
app_label = 'osf'

today = date(2022, 5, 18)

expected_blah = 'dca57e6cde89b19274ea24bc713971dab137a896b8e06d43a11a3f437cd1d151'
blah_report = UniqueByDateAndField(report_date=today, duf='blah')
blah_report = UniqueByDateAndField(report_date=today, uniquefield='blah')
blah_report.save()
assert mock_save.call_count == 1
assert mock_save.call_args[0][0] is blah_report
assert blah_report.meta.id == expected_blah
mock_save.reset_mock()

expected_fleh = 'e7dd5ff6b087807efcfa958077dc713878f21c65af79b3ccdb5dc2409bf5ad99'
fleh_report = UniqueByDateAndField(report_date=today, duf='fleh')
fleh_report = UniqueByDateAndField(report_date=today, uniquefield='fleh')
fleh_report.save()
assert mock_save.call_count == 1
assert mock_save.call_args[0][0] is fleh_report
assert fleh_report.meta.id == expected_fleh
mock_save.reset_mock()

bad_report = UniqueByDateAndField(report_date=today)
with pytest.raises(ReportInvalid):
bad_report.save()
for _bad_report in (
UniqueByDateAndField(report_date=today),
UniqueByDateAndField(report_date=today, uniquefield=['list', 'of', 'things']),
):
with pytest.raises(ReportInvalid):
_bad_report.save()
72 changes: 72 additions & 0 deletions osf_tests/metrics/test_monthly_report.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
from unittest import mock

import pytest
from elasticsearch_metrics import metrics

from osf.metrics.reports import MonthlyReport, ReportInvalid
from osf.metrics.utils import YearMonth


class TestMonthlyReportKey:
@pytest.fixture
def mock_save(self):
with mock.patch('elasticsearch6_dsl.Document.save', autospec=True) as mock_save:
yield mock_save

def test_default(self, mock_save):
# only one of this type of report per month
class UniqueByMonth(MonthlyReport):
blah = metrics.Keyword()

class Meta:
app_label = 'osf'

yearmonth = YearMonth(2022, 5)

reports = [
UniqueByMonth(report_yearmonth=yearmonth),
UniqueByMonth(report_yearmonth=yearmonth, blah='blah'),
UniqueByMonth(report_yearmonth=yearmonth, blah='fleh'),
]
expected_key = '8463aac67c1e5a038049196781d8f100f069225352d1829651892cf3fbfc50e2'

for report in reports:
report.save()
assert mock_save.call_count == 1
assert mock_save.call_args[0][0] is report
assert report.meta.id == expected_key
mock_save.reset_mock()

def test_with_unique_together(self, mock_save):
# multiple reports of this type per day, unique by given field
class UniqueByMonthAndField(MonthlyReport):
UNIQUE_TOGETHER_FIELDS = ('report_yearmonth', 'uniquefield',)
uniquefield = metrics.Keyword()

class Meta:
app_label = 'osf'

yearmonth = YearMonth(2022, 5)

expected_blah = '62ebf38317cd8402e27a50ce99f836d1734b3f545adf7d144d0e1cf37a0d9d08'
blah_report = UniqueByMonthAndField(report_yearmonth=yearmonth, uniquefield='blah')
blah_report.save()
assert mock_save.call_count == 1
assert mock_save.call_args[0][0] is blah_report
assert blah_report.meta.id == expected_blah
mock_save.reset_mock()

expected_fleh = '385700db282f6d6089a0d21836db5ee8423f548615e515b6e034bcc90a14500f'
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What makes this different, break these up by case.

fleh_report = UniqueByMonthAndField(report_yearmonth=yearmonth, uniquefield='fleh')
fleh_report.save()
assert mock_save.call_count == 1
assert mock_save.call_args[0][0] is fleh_report
assert fleh_report.meta.id == expected_fleh
mock_save.reset_mock()

for _bad_report in (
UniqueByMonthAndField(report_yearmonth=yearmonth),
UniqueByMonthAndField(report_yearmonth=yearmonth, uniquefield=['list']),
):
with pytest.raises(ReportInvalid):
_bad_report.save()
Loading