-
Notifications
You must be signed in to change notification settings - Fork 330
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[ENG-6164] allow monthly metrics reports on multiple subjects #10718
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,4 @@ | ||
from collections import abc | ||
import datetime | ||
|
||
from django.dispatch import receiver | ||
|
@@ -20,10 +21,14 @@ class DailyReport(metrics.Metric): | |
There's something we'd like to know about every so often, | ||
so let's regularly run a report and stash the results here. | ||
""" | ||
DAILY_UNIQUE_FIELD = None # set in subclasses that expect multiple reports per day | ||
UNIQUE_TOGETHER_FIELDS = ('report_date',) # override in subclasses for multiple reports per day | ||
|
||
report_date = metrics.Date(format='strict_date', required=True) | ||
|
||
def __init_subclass__(cls, **kwargs): | ||
super().__init_subclass__(**kwargs) | ||
assert 'report_date' in cls.UNIQUE_TOGETHER_FIELDS, f'DailyReport subclasses must have "report_date" in UNIQUE_TOGETHER_FIELDS (on {cls.__qualname__}, got {cls.UNIQUE_TOGETHER_FIELDS})' | ||
|
||
class Meta: | ||
abstract = True | ||
dynamic = metrics.MetaField('strict') | ||
|
@@ -58,6 +63,7 @@ def serialize(self, data): | |
class MonthlyReport(metrics.Metric): | ||
"""MonthlyReport (abstract base for report-based metrics that run monthly) | ||
""" | ||
UNIQUE_TOGETHER_FIELDS = ('report_yearmonth',) # override in subclasses for multiple reports per month | ||
|
||
report_yearmonth = YearmonthField() | ||
|
||
|
@@ -66,26 +72,31 @@ class Meta: | |
dynamic = metrics.MetaField('strict') | ||
source = metrics.MetaField(enabled=True) | ||
|
||
def __init_subclass__(cls, **kwargs): | ||
super().__init_subclass__(**kwargs) | ||
assert 'report_yearmonth' in cls.UNIQUE_TOGETHER_FIELDS, f'MonthlyReport subclasses must have "report_yearmonth" in UNIQUE_TOGETHER_FIELDS (on {cls.__qualname__}, got {cls.UNIQUE_TOGETHER_FIELDS})' | ||
|
||
|
||
@receiver(metrics_pre_save) | ||
def set_report_id(sender, instance, **kwargs): | ||
# Set the document id to a hash of "unique together" | ||
# values (just `report_date` by default) to get | ||
# "ON CONFLICT UPDATE" behavior -- if the document | ||
# already exists, it will be updated rather than duplicated. | ||
# Cannot detect/avoid conflicts this way, but that's ok. | ||
|
||
if issubclass(sender, DailyReport): | ||
duf_name = instance.DAILY_UNIQUE_FIELD | ||
if duf_name is None: | ||
instance.meta.id = stable_key(instance.report_date) | ||
else: | ||
duf_value = getattr(instance, duf_name) | ||
if not duf_value or not isinstance(duf_value, str): | ||
raise ReportInvalid(f'{sender.__name__}.{duf_name} MUST have a non-empty string value (got {duf_value})') | ||
instance.meta.id = stable_key(instance.report_date, duf_value) | ||
elif issubclass(sender, MonthlyReport): | ||
instance.meta.id = stable_key(instance.report_yearmonth) | ||
try: | ||
_unique_together_fields = instance.UNIQUE_TOGETHER_FIELDS | ||
except AttributeError: | ||
pass | ||
else: | ||
# Set the document id to a hash of "unique together" fields | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Seems like a lot of comments for a simple set_id function, there's a code smell here that this is over engineered. |
||
# for "ON CONFLICT UPDATE" behavior -- if the document | ||
# already exists, it will be updated rather than duplicated. | ||
# Cannot detect/avoid conflicts this way, but that's ok. | ||
_key_values = [] | ||
for _field_name in _unique_together_fields: | ||
_field_value = getattr(instance, _field_name) | ||
if not _field_value or ( | ||
isinstance(_field_value, abc.Iterable) and not isinstance(_field_value, str) | ||
): | ||
raise ReportInvalid(f'because "{_field_name}" is in {sender.__name__}.UNIQUE_TOGETHER_FIELDS, {sender.__name__}.{_field_name} MUST have a non-empty scalar value (got {_field_value} of type {type(_field_value)})') | ||
_key_values.append(_field_value) | ||
instance.meta.id = stable_key(*_key_values) | ||
|
||
|
||
#### BEGIN reusable inner objects ##### | ||
|
@@ -157,7 +168,7 @@ class DownloadCountReport(DailyReport): | |
|
||
|
||
class InstitutionSummaryReport(DailyReport): | ||
DAILY_UNIQUE_FIELD = 'institution_id' | ||
UNIQUE_TOGETHER_FIELDS = ('report_date', 'institution_id',) | ||
|
||
institution_id = metrics.Keyword() | ||
institution_name = metrics.Keyword() | ||
|
@@ -169,7 +180,7 @@ class InstitutionSummaryReport(DailyReport): | |
|
||
|
||
class NewUserDomainReport(DailyReport): | ||
DAILY_UNIQUE_FIELD = 'domain_name' | ||
UNIQUE_TOGETHER_FIELDS = ('report_date', 'domain_name',) | ||
|
||
domain_name = metrics.Keyword() | ||
new_user_count = metrics.Integer() | ||
|
@@ -187,7 +198,7 @@ class OsfstorageFileCountReport(DailyReport): | |
|
||
|
||
class PreprintSummaryReport(DailyReport): | ||
DAILY_UNIQUE_FIELD = 'provider_key' | ||
UNIQUE_TOGETHER_FIELDS = ('report_date', 'provider_key',) | ||
|
||
provider_key = metrics.Keyword() | ||
preprint_count = metrics.Integer() | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,9 +1,8 @@ | ||
import dataclasses | ||
import re | ||
import datetime | ||
import typing | ||
from hashlib import sha256 | ||
|
||
import pytz | ||
from typing import ClassVar | ||
|
||
|
||
def stable_key(*key_parts): | ||
|
@@ -20,11 +19,12 @@ def stable_key(*key_parts): | |
return sha256(bytes(plain_key, encoding='utf')).hexdigest() | ||
|
||
|
||
class YearMonth(typing.NamedTuple): | ||
@dataclasses.dataclass(frozen=True) | ||
class YearMonth: | ||
year: int | ||
month: int | ||
|
||
YEARMONTH_RE = re.compile(r'(?P<year>\d{4})-(?P<month>\d{2})') | ||
YEARMONTH_RE: ClassVar[re.Pattern] = re.compile(r'(?P<year>\d{4})-(?P<month>\d{2})') | ||
|
||
@classmethod | ||
def from_date(cls, date): | ||
|
@@ -46,9 +46,9 @@ def __str__(self): | |
return f'{self.year}-{self.month:0>2}' | ||
|
||
def target_month(self): | ||
return datetime.datetime(self.year, self.month, 1, tzinfo=pytz.utc) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why is this changing? |
||
return datetime.datetime(self.year, self.month, 1, tzinfo=datetime.UTC) | ||
|
||
def next_month(self): | ||
if self.month == 12: | ||
return datetime.datetime(self.year + 1, 1, 1, tzinfo=pytz.utc) | ||
return datetime.datetime(self.year, self.month + 1, 1, tzinfo=pytz.utc) | ||
return datetime.datetime(self.year + 1, 1, 1, tzinfo=datetime.UTC) | ||
return datetime.datetime(self.year, self.month + 1, 1, tzinfo=datetime.UTC) |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
from unittest import mock | ||
|
||
import pytest | ||
from elasticsearch_metrics import metrics | ||
|
||
from osf.metrics.reports import MonthlyReport, ReportInvalid | ||
from osf.metrics.utils import YearMonth | ||
|
||
|
||
class TestMonthlyReportKey: | ||
@pytest.fixture | ||
def mock_save(self): | ||
with mock.patch('elasticsearch6_dsl.Document.save', autospec=True) as mock_save: | ||
yield mock_save | ||
|
||
def test_default(self, mock_save): | ||
# only one of this type of report per month | ||
class UniqueByMonth(MonthlyReport): | ||
blah = metrics.Keyword() | ||
|
||
class Meta: | ||
app_label = 'osf' | ||
|
||
yearmonth = YearMonth(2022, 5) | ||
|
||
reports = [ | ||
UniqueByMonth(report_yearmonth=yearmonth), | ||
UniqueByMonth(report_yearmonth=yearmonth, blah='blah'), | ||
UniqueByMonth(report_yearmonth=yearmonth, blah='fleh'), | ||
] | ||
expected_key = '8463aac67c1e5a038049196781d8f100f069225352d1829651892cf3fbfc50e2' | ||
|
||
for report in reports: | ||
report.save() | ||
assert mock_save.call_count == 1 | ||
assert mock_save.call_args[0][0] is report | ||
assert report.meta.id == expected_key | ||
mock_save.reset_mock() | ||
|
||
def test_with_unique_together(self, mock_save): | ||
# multiple reports of this type per day, unique by given field | ||
class UniqueByMonthAndField(MonthlyReport): | ||
UNIQUE_TOGETHER_FIELDS = ('report_yearmonth', 'uniquefield',) | ||
uniquefield = metrics.Keyword() | ||
|
||
class Meta: | ||
app_label = 'osf' | ||
|
||
yearmonth = YearMonth(2022, 5) | ||
|
||
expected_blah = '62ebf38317cd8402e27a50ce99f836d1734b3f545adf7d144d0e1cf37a0d9d08' | ||
blah_report = UniqueByMonthAndField(report_yearmonth=yearmonth, uniquefield='blah') | ||
blah_report.save() | ||
assert mock_save.call_count == 1 | ||
assert mock_save.call_args[0][0] is blah_report | ||
assert blah_report.meta.id == expected_blah | ||
mock_save.reset_mock() | ||
|
||
expected_fleh = '385700db282f6d6089a0d21836db5ee8423f548615e515b6e034bcc90a14500f' | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What makes this different, break these up by case. |
||
fleh_report = UniqueByMonthAndField(report_yearmonth=yearmonth, uniquefield='fleh') | ||
fleh_report.save() | ||
assert mock_save.call_count == 1 | ||
assert mock_save.call_args[0][0] is fleh_report | ||
assert fleh_report.meta.id == expected_fleh | ||
mock_save.reset_mock() | ||
|
||
for _bad_report in ( | ||
UniqueByMonthAndField(report_yearmonth=yearmonth), | ||
UniqueByMonthAndField(report_yearmonth=yearmonth, uniquefield=['list']), | ||
): | ||
with pytest.raises(ReportInvalid): | ||
_bad_report.save() |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Seems unnecessary KISS.