From ff346522d9a1052fd9bc70b6f0af6f3e37fec577 Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Fri, 23 Aug 2024 13:34:08 -0400 Subject: [PATCH 1/3] schedule metrics reporters in their own tasks --- osf/management/commands/daily_reporters_go.py | 39 ++++++++++--------- .../commands/monthly_reporters_go.py | 31 +++++++++------ osf/metrics/reporters/__init__.py | 29 +++++++------- 3 files changed, 54 insertions(+), 45 deletions(-) diff --git a/osf/management/commands/daily_reporters_go.py b/osf/management/commands/daily_reporters_go.py index d45f02fe54b..b2f73ea5842 100644 --- a/osf/management/commands/daily_reporters_go.py +++ b/osf/management/commands/daily_reporters_go.py @@ -2,11 +2,11 @@ import logging from django.core.management.base import BaseCommand +from django.db.utils import OperationalError from django.utils import timezone -from framework import sentry from framework.celery_tasks import app as celery_app -from osf.metrics.reporters import DAILY_REPORTERS +from osf.metrics.reporters import AllDailyReporters from website.app import init_app @@ -20,25 +20,26 @@ def daily_reporters_go(also_send_to_keen=False, report_date=None, reporter_filte if report_date is None: # default to yesterday report_date = (timezone.now() - datetime.timedelta(days=1)).date() - errors = {} - for reporter_class in DAILY_REPORTERS: - if reporter_filter and (reporter_filter.lower() not in reporter_class.__name__.lower()): + for _reporter_key, _reporter_class in AllDailyReporters.__members__.items(): + if reporter_filter and (reporter_filter.lower() not in _reporter_class.__name__.lower()): continue - try: - reporter_class().run_and_record_for_date( - report_date=report_date, - also_send_to_keen=also_send_to_keen, - ) - except Exception as e: - errors[reporter_class.__name__] = repr(e) - logger.exception(e) - sentry.log_exception(e) - # continue with the next reporter - return errors + daily_reporter_go.apply_async(kwargs={ + 'reporter_key': _reporter_key, + 'report_date': report_date.isoformat(), + }) -def date_fromisoformat(date_str): - return datetime.datetime.strptime(date_str, '%Y-%m-%d').date() +@celery_app.task( + name='management.commands.daily_reporter_go', + autoretry_for=(OperationalError,), + max_retries=5, + retry_backoff=True, + bind=True, +) +def daily_reporter_go(task, reporter_key: str, report_date: str): + _reporter_class = AllDailyReporters[reporter_key].value + _parsed_date = datetime.date.fromisoformat(report_date) + _reporter_class().run_and_record_for_date(report_date=_parsed_date) class Command(BaseCommand): @@ -51,7 +52,7 @@ def add_arguments(self, parser): ) parser.add_argument( '--date', - type=date_fromisoformat, # in python 3.7+, could pass datetime.date.fromisoformat + type=datetime.date.fromisoformat, help='run for a specific date (default: yesterday)', ) parser.add_argument( diff --git a/osf/management/commands/monthly_reporters_go.py b/osf/management/commands/monthly_reporters_go.py index 74bd69da6ab..8f9854a722b 100644 --- a/osf/management/commands/monthly_reporters_go.py +++ b/osf/management/commands/monthly_reporters_go.py @@ -1,11 +1,11 @@ import logging from django.core.management.base import BaseCommand +from django.db.utils import OperationalError from django.utils import timezone -from framework import sentry from framework.celery_tasks import app as celery_app -from osf.metrics.reporters import MONTHLY_REPORTERS +from osf.metrics.reporters import AllMonthlyReporters from osf.metrics.utils import YearMonth from website.app import init_app @@ -28,17 +28,24 @@ def monthly_reporters_go(report_year=None, report_month=None): year=today.year if today.month > 1 else today.year - 1, month=today.month - 1 or MAXMONTH, ) + for _reporter_key in AllMonthlyReporters.__members__.keys(): + monthly_reporter_go.apply_async(kwargs={ + 'reporter_key': _reporter_key, + 'yearmonth': str(report_yearmonth), + }) - errors = {} - for reporter_class in MONTHLY_REPORTERS: - try: - reporter_class().run_and_record_for_month(report_yearmonth) - except Exception as e: - errors[reporter_class.__name__] = str(e) - logger.exception(e) - sentry.log_exception(e) - # continue with the next reporter - return errors + +@celery_app.task( + name='management.commands.monthly_reporter_go', + autoretry_for=(OperationalError,), + max_retries=5, + retry_backoff=True, + bind=True, +) +def monthly_reporter_go(task, reporter_key: str, yearmonth: str): + _reporter_class = AllMonthlyReporters[reporter_key].value + _parsed_yearmonth = YearMonth.from_str(yearmonth) + _reporter_class().run_and_record_for_month(_parsed_yearmonth) class Command(BaseCommand): diff --git a/osf/metrics/reporters/__init__.py b/osf/metrics/reporters/__init__.py index b7a0f5e5363..1f8e0fba862 100644 --- a/osf/metrics/reporters/__init__.py +++ b/osf/metrics/reporters/__init__.py @@ -1,3 +1,5 @@ +import enum + # from .active_users import ActiveUserReporter from .storage_addon_usage import StorageAddonUsageReporter from .download_count import DownloadCountReporter @@ -10,18 +12,17 @@ from .spam_count import SpamCountReporter -DAILY_REPORTERS = ( - # ActiveUserReporter, - DownloadCountReporter, - InstitutionSummaryReporter, - NewUserDomainReporter, - NodeCountReporter, - OsfstorageFileCountReporter, - PreprintCountReporter, - StorageAddonUsageReporter, - UserCountReporter, -) +class AllDailyReporters(enum.Enum): + # ACTIVE_USER = ActiveUserReporter + DOWNLOAD_COUNT = DownloadCountReporter + INSTITUTION_SUMMARY = InstitutionSummaryReporter + NEW_USER_DOMAIN = NewUserDomainReporter + NODE_COUNT = NodeCountReporter + OSFSTORAGE_FILE_COUNT = OsfstorageFileCountReporter + PREPRINT_COUNT = PreprintCountReporter + STORAGE_ADDON_USAGE = StorageAddonUsageReporter + USER_COUNT = UserCountReporter + -MONTHLY_REPORTERS = ( - SpamCountReporter, -) +class AllMonthlyReporters(enum.Enum): + SPAM_COUNT = SpamCountReporter From 647cf1b54a19060b2f0fcbc730ec9763347a4010 Mon Sep 17 00:00:00 2001 From: abram axel booth Date: Fri, 23 Aug 2024 13:35:55 -0400 Subject: [PATCH 2/3] stop trying to send metrics reports to keen --- admin/management/views.py | 2 - admin/templates/management/commands.html | 5 -- osf/management/commands/daily_reporters_go.py | 9 +--- osf/metrics/reporters/_base.py | 47 +------------------ osf/metrics/reporters/download_count.py | 8 ---- osf/metrics/reporters/institution_summary.py | 14 ------ osf/metrics/reporters/new_user_domain.py | 9 ---- osf/metrics/reporters/node_count.py | 9 ---- .../reporters/osfstorage_file_count.py | 6 --- osf/metrics/reporters/preprint_count.py | 9 ---- osf/metrics/reporters/storage_addon_usage.py | 20 -------- osf/metrics/reporters/user_count.py | 13 ----- website/settings/defaults.py | 1 - 13 files changed, 2 insertions(+), 150 deletions(-) diff --git a/admin/management/views.py b/admin/management/views.py index 4323d7fd429..3bd675790dd 100644 --- a/admin/management/views.py +++ b/admin/management/views.py @@ -100,7 +100,6 @@ def post(self, request, *args, **kwargs): class DailyReportersGo(ManagementCommandPermissionView): def post(self, request, *args, **kwargs): - also_keen = bool(request.POST.get('also_send_to_keen', False)) report_date = request.POST.get('report_date', None) if report_date: report_date = isoparse(report_date).date() @@ -109,7 +108,6 @@ def post(self, request, *args, **kwargs): daily_reporters_go.apply_async(kwargs={ 'report_date': report_date, - 'also_send_to_keen': also_keen }) messages.success(request, 'Daily reporters going!') return redirect(reverse('management:commands')) diff --git a/admin/templates/management/commands.html b/admin/templates/management/commands.html index 269ead1bd3d..91471394c71 100644 --- a/admin/templates/management/commands.html +++ b/admin/templates/management/commands.html @@ -89,11 +89,6 @@

Daily Reporters, Go!

(default: yesterday)
- - - (may result in duplicates) diff --git a/osf/management/commands/daily_reporters_go.py b/osf/management/commands/daily_reporters_go.py index b2f73ea5842..5c62e6fbaa6 100644 --- a/osf/management/commands/daily_reporters_go.py +++ b/osf/management/commands/daily_reporters_go.py @@ -14,7 +14,7 @@ @celery_app.task(name='management.commands.daily_reporters_go') -def daily_reporters_go(also_send_to_keen=False, report_date=None, reporter_filter=None): +def daily_reporters_go(report_date=None, reporter_filter=None, **kwargs): init_app() # OSF-specific setup if report_date is None: # default to yesterday @@ -44,12 +44,6 @@ def daily_reporter_go(task, reporter_key: str, report_date: str): class Command(BaseCommand): def add_arguments(self, parser): - parser.add_argument( - '--keen', - type=bool, - default=False, - help='also send reports to keen', - ) parser.add_argument( '--date', type=datetime.date.fromisoformat, @@ -63,7 +57,6 @@ def add_arguments(self, parser): def handle(self, *args, **options): errors = daily_reporters_go( report_date=options.get('date'), - also_send_to_keen=options['keen'], reporter_filter=options.get('filter'), ) for error_key, error_val in errors.items(): diff --git a/osf/metrics/reporters/_base.py b/osf/metrics/reporters/_base.py index 94d35bbaad2..d3bf1722523 100644 --- a/osf/metrics/reporters/_base.py +++ b/osf/metrics/reporters/_base.py @@ -1,12 +1,6 @@ -from collections import defaultdict -from datetime import datetime import logging -import pytz - -from keen.client import KeenClient from osf.metrics.utils import YearMonth -from website.settings import KEEN as keen_settings logger = logging.getLogger(__name__) @@ -33,49 +27,10 @@ def report(self, report_date): """ raise NotImplementedError(f'{self.__name__} must implement `report`') - def keen_events_from_report(self, report): - """given one of this reporter's own reports, build equivalent keen events - (for back-compat; to be deleted once we don't need keen anymore) - - return a mapping from keen collection name to iterable of events - e.g. {'my_keen_collection': [event1, event2, ...]} - """ - raise NotImplementedError(f'{self.__name__} should probably implement keen_events_from_report') - - def run_and_record_for_date(self, report_date, *, also_send_to_keen=False): + def run_and_record_for_date(self, report_date): reports = self.report(report_date) # expecting each reporter to spit out only a handful of reports per day; # not bothering with bulk-create for report in reports: report.save() - - if also_send_to_keen: - self.send_to_keen(reports) - - def send_to_keen(self, reports): - keen_project = keen_settings['private']['project_id'] - write_key = keen_settings['private']['write_key'] - if not (keen_project and write_key): - logger.warning(f'keen not configured; not sending events for {self.__class__.__name__}') - return - - keen_events_by_collection = defaultdict(list) - for report in reports: - keen_event_timestamp = datetime( - report.report_date.year, - report.report_date.month, - report.report_date.day, - tzinfo=pytz.utc, - ) - - for collection_name, keen_events in self.keen_events_from_report(report).items(): - for event in keen_events: - event['keen'] = {'timestamp': keen_event_timestamp.isoformat()} - keen_events_by_collection[collection_name].extend(keen_events) - - client = KeenClient( - project_id=keen_project, - write_key=write_key, - ) - client.add_events(keen_events_by_collection) diff --git a/osf/metrics/reporters/download_count.py b/osf/metrics/reporters/download_count.py index f6ed14df198..f772722dc31 100644 --- a/osf/metrics/reporters/download_count.py +++ b/osf/metrics/reporters/download_count.py @@ -12,11 +12,3 @@ def report(self, date): report_date=date, ), ] - - def keen_events_from_report(self, report): - event = { - 'files': { - 'total': report.daily_file_downloads, - }, - } - return {'download_count_summary': [event]} diff --git a/osf/metrics/reporters/institution_summary.py b/osf/metrics/reporters/institution_summary.py index d51657e83b6..892e337aec4 100644 --- a/osf/metrics/reporters/institution_summary.py +++ b/osf/metrics/reporters/institution_summary.py @@ -93,17 +93,3 @@ def report(self, date): reports.append(report) return reports - - def keen_events_from_report(self, report): - event = { - 'institution': { - 'id': report.institution_id, - 'name': report.institution_name, - }, - 'users': report.users.to_dict(), - 'nodes': report.nodes.to_dict(), - 'projects': report.projects.to_dict(), - 'registered_nodes': report.registered_nodes.to_dict(), - 'registered_projects': report.registered_projects.to_dict(), - } - return {'institution_summary': [event]} diff --git a/osf/metrics/reporters/new_user_domain.py b/osf/metrics/reporters/new_user_domain.py index be28079e331..ec13aad860f 100644 --- a/osf/metrics/reporters/new_user_domain.py +++ b/osf/metrics/reporters/new_user_domain.py @@ -28,12 +28,3 @@ def report(self, date): ) for domain_name, count in domain_names.items() ] - - def keen_events_from_report(self, report): - events = [ - {'domain': report.domain_name, 'date': str(report.report_date)} - for _ in range(report.new_user_count) - ] - return { - 'user_domain_events': events, - } diff --git a/osf/metrics/reporters/node_count.py b/osf/metrics/reporters/node_count.py index d90a23fda0b..0a4120ca1f9 100644 --- a/osf/metrics/reporters/node_count.py +++ b/osf/metrics/reporters/node_count.py @@ -90,12 +90,3 @@ def report(self, date): ) return [report] - - def keen_events_from_report(self, report): - event = { - 'nodes': report.nodes.to_dict(), - 'projects': report.projects.to_dict(), - 'registered_nodes': report.registered_nodes.to_dict(), - 'registered_projects': report.registered_projects.to_dict(), - } - return {'node_summary': [event]} diff --git a/osf/metrics/reporters/osfstorage_file_count.py b/osf/metrics/reporters/osfstorage_file_count.py index 339838dce78..2f35e1e81fd 100644 --- a/osf/metrics/reporters/osfstorage_file_count.py +++ b/osf/metrics/reporters/osfstorage_file_count.py @@ -45,9 +45,3 @@ def report(self, date): ) return [report] - - def keen_events_from_report(self, report): - event = { - 'osfstorage_files_including_quickfiles': report.files.to_dict(), - } - return {'file_summary': [event]} diff --git a/osf/metrics/reporters/preprint_count.py b/osf/metrics/reporters/preprint_count.py index 319f72ae319..23f68bc7736 100644 --- a/osf/metrics/reporters/preprint_count.py +++ b/osf/metrics/reporters/preprint_count.py @@ -58,12 +58,3 @@ def report(self, date): logger.info('{} Preprints counted for the provider {}'.format(resp['hits']['total'], preprint_provider.name)) return reports - - def keen_events_from_report(self, report): - event = { - 'provider': { - 'name': report.provider_key, - 'total': report.preprint_count, - }, - } - return {'preprint_summary': [event]} diff --git a/osf/metrics/reporters/storage_addon_usage.py b/osf/metrics/reporters/storage_addon_usage.py index 242be243b57..704254795f0 100644 --- a/osf/metrics/reporters/storage_addon_usage.py +++ b/osf/metrics/reporters/storage_addon_usage.py @@ -167,23 +167,3 @@ def report(self, date): report_date=date, usage_by_addon=usage_by_addon, )] - - def keen_events_from_report(self, report): - events = [ - { - 'provider': { - 'name': addon_usage.addon_shortname, - }, - 'users': { - 'enabled': addon_usage.enabled_usersettings, - 'linked': addon_usage.linked_usersettings, - }, - 'nodes': { - 'connected': addon_usage.connected_nodesettings, - 'deleted': addon_usage.deleted_nodesettings, - 'disconnected': addon_usage.disconnected_nodesettings - }, - } - for addon_usage in report.usage_by_addon - ] - return {'addon_snapshot': events} diff --git a/osf/metrics/reporters/user_count.py b/osf/metrics/reporters/user_count.py index fc9f3d6df54..e0a61c7bb10 100644 --- a/osf/metrics/reporters/user_count.py +++ b/osf/metrics/reporters/user_count.py @@ -18,16 +18,3 @@ def report(self, report_date): ) return [report] - - def keen_events_from_report(self, report): - event = { - 'status': { - 'active': report.active, - 'deactivated': report.deactivated, - 'merged': report.merged, - 'new_users_daily': report.new_users_daily, - 'new_users_with_institution_daily': report.new_users_with_institution_daily, - 'unconfirmed': report.unconfirmed, - } - } - return {'user_summary': [event]} diff --git a/website/settings/defaults.py b/website/settings/defaults.py index 63535dac1dd..7d6df427336 100644 --- a/website/settings/defaults.py +++ b/website/settings/defaults.py @@ -646,7 +646,6 @@ class CeleryConfig: 'daily_reporters_go': { 'task': 'management.commands.daily_reporters_go', 'schedule': crontab(minute=0, hour=6), # Daily 1:00 a.m. - 'kwargs': {'also_send_to_keen': True}, }, 'monthly_reporters_go': { 'task': 'management.commands.monthly_reporters_go', From 979eded66a7e29c3677692452ca80e45549027aa Mon Sep 17 00:00:00 2001 From: Matt Frazier Date: Fri, 23 Aug 2024 14:22:39 -0400 Subject: [PATCH 3/3] Don't record preprint metrics from contributors --- addons/base/views.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/addons/base/views.py b/addons/base/views.py index 6fea2444421..6253f7bc91b 100644 --- a/addons/base/views.py +++ b/addons/base/views.py @@ -696,6 +696,10 @@ def osfstoragefile_mark_viewed(self, auth, fileversion, file_node): @file_signals.file_viewed.connect def osfstoragefile_update_view_analytics(self, auth, fileversion, file_node): resource = file_node.target + user = getattr(auth, 'user', None) + if hasattr(resource, 'is_contributor_or_group_member') and resource.is_contributor_or_group_member(user): + # Don't record views by contributors + return enqueue_update_analytics( resource, file_node, @@ -707,6 +711,10 @@ def osfstoragefile_update_view_analytics(self, auth, fileversion, file_node): @file_signals.file_viewed.connect def osfstoragefile_viewed_update_metrics(self, auth, fileversion, file_node): resource = file_node.target + user = getattr(auth, 'user', None) + if hasattr(resource, 'is_contributor_or_group_member') and resource.is_contributor_or_group_member(user): + # Don't record views by contributors + return if waffle.switch_is_active(features.ELASTICSEARCH_METRICS) and isinstance(resource, Preprint): try: PreprintView.record_for_preprint( @@ -730,6 +738,10 @@ def osfstoragefile_downloaded_update_analytics(self, auth, fileversion, file_nod @file_signals.file_downloaded.connect def osfstoragefile_downloaded_update_metrics(self, auth, fileversion, file_node): resource = file_node.target + user = getattr(auth, 'user', None) + if hasattr(resource, 'is_contributor_or_group_member') and resource.is_contributor_or_group_member(user): + # Don't record downloads by contributors + return if waffle.switch_is_active(features.ELASTICSEARCH_METRICS) and isinstance(resource, Preprint): try: PreprintDownload.record_for_preprint(