From f0539d9c113c4a530c9a33494f2e1d683f1032e6 Mon Sep 17 00:00:00 2001 From: Rebecca Pearce Date: Thu, 18 Jun 2020 12:25:15 +0100 Subject: [PATCH] Add a job to rerun master etl This adds a job to repopulate the data that has been collected by the [content data api etl job](https://deploy.blue.production.govuk.digital/job/content_data_api_import_etl_master_process/). The data collected by this initial etl job was giving inaccurate results; we traced this down to a [delay in results showing up in Google Analytics](https://support.google.com/analytics/answer/1070983?hl=en#:~:text=Data%20processing%20latency,for%20up%20to%20two%20days). Results can take between 24 and 48 hours to appear in GA, but the initial etl is run at 7am, leaving only 7 hours for the data to appear in GA. The newly added job will collect the data after 2 days, leaving time for the data to appear correctly in GA. We have added this new job instead of moving the current job back since there are many references in the code of content-data to data collected yesterday. The date ranges are also done from data received yesterday. Changing this would be a big job, so initially we use the potentially inaccurate data and then correct it after 2 days. 
Trello card: https://trello.com/c/ODGWWnXt/2005-5-content-data-incorrect-users-who-found-this-useful-data-for-march --- Rakefile | 1 + hieradata_aws/class/integration/jenkins.yaml | 1 + hieradata_aws/class/production/jenkins.yaml | 1 + hieradata_aws/class/staging/jenkins.yaml | 1 + hieradata_aws/common.yaml | 1 + .../manifests/jobs/content_data_api_re_run.pp | 22 +++++++++++++++++++ .../jobs/content_data_api_re_run.yaml.erb | 21 ++++++++++++++++++ spec/fixtures/hieradata/common.yaml | 1 + 8 files changed, 49 insertions(+) create mode 100644 modules/govuk_jenkins/manifests/jobs/content_data_api_re_run.pp create mode 100644 modules/govuk_jenkins/templates/jobs/content_data_api_re_run.yaml.erb diff --git a/Rakefile b/Rakefile index 8924647bc8..7423e695d1 100644 --- a/Rakefile +++ b/Rakefile @@ -423,6 +423,7 @@ task :check_consistency_between_aws_and_carrenza do govuk_jenkins::deploy_all_apps::apps_on_nodes govuk_jenkins::deploy_all_apps::deploy_environment govuk_jenkins::jobs::content_data_api::rake_etl_master_process_cron_schedule + govuk_jenkins::jobs::content_data_api_re_run::re_run_rake_etl_master_process_cron_schedule govuk_jenkins::jobs::deploy_app::graphite_host govuk_jenkins::jobs::deploy_app::graphite_port govuk_jenkins::jobs::deploy_emergency_banner::clear_cdn_cache diff --git a/hieradata_aws/class/integration/jenkins.yaml b/hieradata_aws/class/integration/jenkins.yaml index 82e943b6db..abed3d037b 100644 --- a/hieradata_aws/class/integration/jenkins.yaml +++ b/hieradata_aws/class/integration/jenkins.yaml @@ -24,6 +24,7 @@ govuk_jenkins::job_builder::jobs: - govuk_jenkins::jobs::govuk_taxonomy_supervised_learning - govuk_jenkins::jobs::monitor_taxonomy_health - govuk_jenkins::jobs::passive_checks + - govuk_jenkins::jobs::content_data_api_re_run - govuk_jenkins::jobs::publication_delay_report - govuk_jenkins::jobs::publish_special_routes - govuk_jenkins::jobs::record_taxonomy_metrics diff --git a/hieradata_aws/class/production/jenkins.yaml 
b/hieradata_aws/class/production/jenkins.yaml index 6fe01c9a4d..39c0d2db88 100644 --- a/hieradata_aws/class/production/jenkins.yaml +++ b/hieradata_aws/class/production/jenkins.yaml @@ -53,6 +53,7 @@ govuk_jenkins::job_builder::jobs: - govuk_jenkins::jobs::email_alert_check - govuk_jenkins::jobs::enhanced_ecommerce_search_api - govuk_jenkins::jobs::passive_checks + - govuk_jenkins::jobs::content_data_api_re_run - govuk_jenkins::jobs::publish_special_routes - govuk_jenkins::jobs::publishing_api_archive_events - govuk_jenkins::jobs::remove_emergency_banner diff --git a/hieradata_aws/class/staging/jenkins.yaml b/hieradata_aws/class/staging/jenkins.yaml index 7e7a724ad7..6b7d99ce50 100644 --- a/hieradata_aws/class/staging/jenkins.yaml +++ b/hieradata_aws/class/staging/jenkins.yaml @@ -59,6 +59,7 @@ govuk_jenkins::job_builder::jobs: - govuk_jenkins::jobs::govuk_taxonomy_supervised_learning - govuk_jenkins::jobs::monitor_taxonomy_health - govuk_jenkins::jobs::passive_checks + - govuk_jenkins::jobs::content_data_api_re_run - govuk_jenkins::jobs::publish_special_routes - govuk_jenkins::jobs::record_taxonomy_metrics - govuk_jenkins::jobs::remove_emergency_banner diff --git a/hieradata_aws/common.yaml b/hieradata_aws/common.yaml index b279f369be..66cbc43aab 100644 --- a/hieradata_aws/common.yaml +++ b/hieradata_aws/common.yaml @@ -879,6 +879,7 @@ govuk_jenkins::packages::govuk_python::apt_mirror_gpg_key_fingerprint: "%{hiera( govuk_jenkins::jobs::deploy_app::graphite_host: "graphite.%{hiera('app_domain_internal')}" govuk_jenkins::jobs::deploy_app::graphite_port: '443' +govuk_jenkins::jobs::content_data_api_re_run::re_run_rake_etl_master_process_cron_schedule: '0 3 * * *' govuk_jenkins::deploy_all_apps::deploy_environment: "%{hiera('govuk_jenkins::job_builder::environment')}" diff --git a/modules/govuk_jenkins/manifests/jobs/content_data_api_re_run.pp b/modules/govuk_jenkins/manifests/jobs/content_data_api_re_run.pp new file mode 100644 index 0000000000..aa2c44d2fd --- 
/dev/null +++ b/modules/govuk_jenkins/manifests/jobs/content_data_api_re_run.pp @@ -0,0 +1,22 @@ +# == Class: govuk_jenkins::jobs::content_data_api_re_run +# +# Create a jenkins job to periodically run rake for the following tasks: +# - rake etl:rerun_master +# +# === Parameters: +# +# [*re_run_rake_etl_master_process_cron_schedule*] +# The cron timings for the etl:rerun_master process +# Required (no default) +# +class govuk_jenkins::jobs::content_data_api_re_run ( + $re_run_rake_etl_master_process_cron_schedule, + $app_domain = hiera('app_domain'), +) { + + file { '/etc/jenkins_jobs/jobs/content_data_api_re_run.yaml': + ensure => present, + content => template('govuk_jenkins/jobs/content_data_api_re_run.yaml.erb'), + notify => Exec['jenkins_jobs_update'], + } +} diff --git a/modules/govuk_jenkins/templates/jobs/content_data_api_re_run.yaml.erb b/modules/govuk_jenkins/templates/jobs/content_data_api_re_run.yaml.erb new file mode 100644 index 0000000000..b1a325972c --- /dev/null +++ b/modules/govuk_jenkins/templates/jobs/content_data_api_re_run.yaml.erb @@ -0,0 +1,21 @@ +--- +- job: + name: content_data_api_re_run_re_run_rake_etl_master_process_cron_schedule + display-name: Content Data API - rerun ETL master + project-type: freestyle + description: "

Rerun the etl:master rake task to populate missing data.

" + builders: + - trigger-builds: + - project: run-rake-task + block: true + predefined-parameters: | + TARGET_APPLICATION=content-data-api + MACHINE_CLASS=backend + RAKE_TASK=etl:rerun_master['<%= "#{(Time.now-2*24*60*60).strftime("%Y-%m-%d")}, #{(Time.now-2*24*60*60).strftime("%Y-%m-%d")}" %>'] + wrappers: + - ansicolor: + colormap: xterm + triggers: + - timed: <%= @re_run_rake_etl_master_process_cron_schedule %> + logrotate: + daysToKeep: 365 diff --git a/spec/fixtures/hieradata/common.yaml b/spec/fixtures/hieradata/common.yaml index f52d2a296f..d8e830e5a4 100644 --- a/spec/fixtures/hieradata/common.yaml +++ b/spec/fixtures/hieradata/common.yaml @@ -70,6 +70,7 @@ govuk_jenkins::config::github_web_uri: wibble govuk_jenkins::jobs::deploy_app::applications: *deployable_applications govuk_jenkins::jobs::deploy_app_downstream::applications: *deployable_applications govuk_jenkins::jobs::run_rake_task::applications: *deployable_applications +govuk_jenkins::jobs::content_data_api_re_run::re_run_rake_etl_master_process_cron_schedule: '0 3 * * *' govuk_jenkins::packages::gcloud::apt_mirror_hostname: "%{hiera('apt_mirror_hostname')}" govuk_jenkins::packages::gcloud::apt_mirror_gpg_key_fingerprint: "%{hiera('apt_mirror_fingerprint')}"