From 878e80c7977749bdee7c5d31c5b4c433c26863d9 Mon Sep 17 00:00:00 2001 From: Ben Fradet Date: Wed, 30 May 2018 12:17:38 +0100 Subject: [PATCH 01/19] Clojure Collector: bump ring to 1.6.3 (closes #3778) --- 2-collectors/clojure-collector/java-servlet/project.clj | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/2-collectors/clojure-collector/java-servlet/project.clj b/2-collectors/clojure-collector/java-servlet/project.clj index f5fc1a5da7..54633232e9 100644 --- a/2-collectors/clojure-collector/java-servlet/project.clj +++ b/2-collectors/clojure-collector/java-servlet/project.clj @@ -22,14 +22,14 @@ :url "http://www.apache.org/licenses/LICENSE-2.0"} :description "A SnowPlow event collector written in Clojure. AWS Elastic Beanstalk compatible." :dependencies [[org.clojure/clojure "1.4.0"] - [ring/ring-core "1.1.8"] - [ring/ring-devel "1.1.8"] + [ring/ring-core "1.6.3"] + [ring/ring-devel "1.6.3"] [compojure "1.1.3"] [metrics-clojure "0.9.2"] [metrics-clojure-ring "0.9.2"] [commons-codec/commons-codec "1.7"]] ;; The jetty adapter is only used during development - :profiles {:dev {:dependencies [[ring/ring-jetty-adapter "1.1.8"]]}} + :profiles {:dev {:dependencies [[ring/ring-jetty-adapter "1.6.3"]]}} :war-resources-path "war-resources" :plugins [[lein-ring "0.8.3"] [lein-beanstalk "0.2.6"]] From 8aa5be947d17000ed0ff682dd6c029b31e1cae1a Mon Sep 17 00:00:00 2001 From: Ben Fradet Date: Wed, 30 May 2018 15:54:01 +0100 Subject: [PATCH 02/19] Clojure Collector: bump clojure to 1.9.0 (closes #3779) --- 2-collectors/clojure-collector/java-servlet/project.clj | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/2-collectors/clojure-collector/java-servlet/project.clj b/2-collectors/clojure-collector/java-servlet/project.clj index 54633232e9..d58fb4da75 100644 --- a/2-collectors/clojure-collector/java-servlet/project.clj +++ b/2-collectors/clojure-collector/java-servlet/project.clj @@ -21,7 +21,7 @@ :license {:name "Apache Version 2.0" :url "http://www.apache.org/licenses/LICENSE-2.0"} :description "A SnowPlow event collector written in Clojure. AWS Elastic Beanstalk compatible." - :dependencies [[org.clojure/clojure "1.4.0"] + :dependencies [[org.clojure/clojure "1.9.0"] [ring/ring-core "1.6.3"] [ring/ring-devel "1.6.3"] [compojure "1.1.3"] From 87a70fa228f7399fa762a257691245ec3d689546 Mon Sep 17 00:00:00 2001 From: Ben Fradet Date: Wed, 30 May 2018 15:55:56 +0100 Subject: [PATCH 03/19] Clojure Collector: bump compojure to 1.6.1 (closes #3780) --- 2-collectors/clojure-collector/java-servlet/project.clj | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/2-collectors/clojure-collector/java-servlet/project.clj b/2-collectors/clojure-collector/java-servlet/project.clj index d58fb4da75..12ed451fce 100644 --- a/2-collectors/clojure-collector/java-servlet/project.clj +++ b/2-collectors/clojure-collector/java-servlet/project.clj @@ -24,7 +24,7 @@ :dependencies [[org.clojure/clojure "1.9.0"] [ring/ring-core "1.6.3"] [ring/ring-devel "1.6.3"] - [compojure "1.1.3"] + [compojure "1.6.1"] [metrics-clojure "0.9.2"] [metrics-clojure-ring "0.9.2"] [commons-codec/commons-codec "1.7"]] From 496ca0fd08de4ecf3dd26e3d7daeb37573bf3d36 Mon Sep 17 00:00:00 2001 From: Ben Fradet Date: Wed, 30 May 2018 16:00:53 +0100 Subject: [PATCH 04/19] Clojure Collector: bump metrics-clojure to 2.10.0 (closes #3781) --- 2-collectors/clojure-collector/java-servlet/project.clj | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/2-collectors/clojure-collector/java-servlet/project.clj b/2-collectors/clojure-collector/java-servlet/project.clj index 12ed451fce..f449a6368e 100644 --- a/2-collectors/clojure-collector/java-servlet/project.clj +++ b/2-collectors/clojure-collector/java-servlet/project.clj @@ -25,8 +25,8 @@ [ring/ring-core "1.6.3"] [ring/ring-devel "1.6.3"] [compojure "1.6.1"] - [metrics-clojure "0.9.2"] - [metrics-clojure-ring "0.9.2"] + [metrics-clojure "2.10.0"] + [metrics-clojure-ring "2.10.0"] [commons-codec/commons-codec "1.7"]] ;; The jetty adapter is only used during development :profiles {:dev {:dependencies [[ring/ring-jetty-adapter "1.6.3"]]}} From e31438725e3a3c8036d23807945c331e167bdb7e Mon Sep 17 00:00:00 2001 From: Ben Fradet Date: Wed, 30 May 2018 16:04:17 +0100 Subject: [PATCH 05/19] Clojure Collector: bump commons-codec to 1.11 (closes #3782) --- 2-collectors/clojure-collector/java-servlet/project.clj | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/2-collectors/clojure-collector/java-servlet/project.clj b/2-collectors/clojure-collector/java-servlet/project.clj index f449a6368e..8530713c52 100644 --- a/2-collectors/clojure-collector/java-servlet/project.clj +++ b/2-collectors/clojure-collector/java-servlet/project.clj @@ -27,7 +27,7 @@ [compojure "1.6.1"] [metrics-clojure "2.10.0"] [metrics-clojure-ring "2.10.0"] - [commons-codec/commons-codec "1.7"]] + [commons-codec/commons-codec "1.11"]] ;; The jetty adapter is only used during development :profiles {:dev {:dependencies [[ring/ring-jetty-adapter "1.6.3"]]}} :war-resources-path "war-resources" From fd2685597855188a6f77b6cac3c3999bc81300f0 Mon Sep 17 00:00:00 2001 From: Ben Fradet Date: Wed, 30 May 2018 16:08:41 +0100 Subject: [PATCH 06/19] Clojure Collector: bump lein-ring to 0.12.4 (closes #3783) --- 2-collectors/clojure-collector/java-servlet/project.clj | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/2-collectors/clojure-collector/java-servlet/project.clj b/2-collectors/clojure-collector/java-servlet/project.clj index 8530713c52..02b6769766 100644 --- a/2-collectors/clojure-collector/java-servlet/project.clj +++ b/2-collectors/clojure-collector/java-servlet/project.clj @@ -31,7 +31,7 @@ ;; The jetty adapter is only used during development :profiles {:dev {:dependencies [[ring/ring-jetty-adapter "1.6.3"]]}} :war-resources-path "war-resources" - :plugins [[lein-ring "0.8.3"] + :plugins [[lein-ring "0.12.4"] [lein-beanstalk "0.2.6"]] :ring {:handler snowplow.clojure-collector.beanstalk/app}) ; .beanstalk -> .core if you don't need Beanstalk support From 5d3c4d5f48968e703a955f6644d16135154b3735 Mon Sep 17 00:00:00 2001 From: Ben Fradet Date: Wed, 30 May 2018 16:12:22 +0100 Subject: [PATCH 07/19] Clojure Collector: remove lein-beanstalk (closes #3784) --- 2-collectors/clojure-collector/java-servlet/project.clj | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/2-collectors/clojure-collector/java-servlet/project.clj b/2-collectors/clojure-collector/java-servlet/project.clj index 02b6769766..f0dc984d9f 100644 --- a/2-collectors/clojure-collector/java-servlet/project.clj +++ b/2-collectors/clojure-collector/java-servlet/project.clj @@ -31,7 +31,6 @@ ;; The jetty adapter is only used during development :profiles {:dev {:dependencies [[ring/ring-jetty-adapter "1.6.3"]]}} :war-resources-path "war-resources" - :plugins [[lein-ring "0.12.4"] - [lein-beanstalk "0.2.6"]] + :plugins [[lein-ring "0.12.4"]] :ring {:handler snowplow.clojure-collector.beanstalk/app}) ; .beanstalk -> .core if you don't need Beanstalk support From 044a4049f4ef7fc4bf99cf4f0fbe97f5d81d5eb8 Mon Sep 17 00:00:00 2001 From: Ben Fradet Date: Wed, 30 May 2018 16:26:45 +0100 Subject: [PATCH 08/19] Clojure Collector: do not allow dependencies requiring an HTTP repository (closes #3559) --- 2-collectors/clojure-collector/java-servlet/project.clj | 4 ---- 1 file changed, 4 deletions(-) diff --git a/2-collectors/clojure-collector/java-servlet/project.clj b/2-collectors/clojure-collector/java-servlet/project.clj index f0dc984d9f..6ecfb586b5 100644 --- a/2-collectors/clojure-collector/java-servlet/project.clj +++ b/2-collectors/clojure-collector/java-servlet/project.clj @@ -13,10 +13,6 @@ ;;;; Copyright: Copyright (c) 2012-2013 Snowplow Analytics Ltd ;;;; License: Apache License Version 2.0 -(require 'cemerick.pomegranate.aether) -(cemerick.pomegranate.aether/register-wagon-factory! - "http" #(org.apache.maven.wagon.providers.http.HttpWagon.)) - (defproject snowplow/clojure-collector "2.0.0" ;; MUST also bump version in server.xml :license {:name "Apache Version 2.0" :url "http://www.apache.org/licenses/LICENSE-2.0"} From 980b2243403c07d955b9d001b4ebdf072b215444 Mon Sep 17 00:00:00 2001 From: Ben Fradet Date: Wed, 30 May 2018 17:33:20 +0100 Subject: [PATCH 09/19] Clojure Collector: make cookie path configurable (closes #2739) --- .../src/snowplow/clojure_collector/config.clj | 8 +++++++- .../src/snowplow/clojure_collector/core.clj | 1 + .../src/snowplow/clojure_collector/responses.clj | 16 ++++++++++------ 3 files changed, 18 insertions(+), 7 deletions(-) diff --git a/2-collectors/clojure-collector/java-servlet/src/snowplow/clojure_collector/config.clj b/2-collectors/clojure-collector/java-servlet/src/snowplow/clojure_collector/config.clj index 2ed69659aa..f336652662 100644 --- a/2-collectors/clojure-collector/java-servlet/src/snowplow/clojure_collector/config.clj +++ b/2-collectors/clojure-collector/java-servlet/src/snowplow/clojure_collector/config.clj @@ -24,6 +24,7 @@ (def ^:const duration-varnames ["PARAM4", "SP_DURATION"]) (def ^:const cross-domain-policy-domain-varnames ["PARAM5", "SP_CDP_DOMAIN"]) (def ^:const cross-domain-policy-secure-varnames ["PARAM6", "SP_CDP_SECURE"]) +(def ^:const path-varnames ["PARAM7", "SP_PATH"]) ;; Defaults (def ^:const default-p3p-header "policyref=\"/w3c/p3p.xml\", CP=\"NOI DSP COR NID PSA OUR IND COM NAV STA\"") @@ -69,9 +70,14 @@ (def domain "Get the domain the name cookies will be set on. Can be a wildcard e.g. '.foo.com'. - If undefined we'll just use the FQDN of the host" + If undefined we'll just use the FQDN of the host." (get-var domain-varnames)) +(def path + "Get the path the cookies will be set on. + If undefined we'll just use /" + (get-var path-varnames)) + (def cross-domain-policy-domain "Get the cross domain policy domain. See the specification for reference: diff --git a/2-collectors/clojure-collector/java-servlet/src/snowplow/clojure_collector/core.clj b/2-collectors/clojure-collector/java-servlet/src/snowplow/clojure_collector/core.clj index 9bbd1de678..2cf272f4d2 100644 --- a/2-collectors/clojure-collector/java-servlet/src/snowplow/clojure_collector/core.clj +++ b/2-collectors/clojure-collector/java-servlet/src/snowplow/clojure_collector/core.clj @@ -35,6 +35,7 @@ cookies config/duration config/domain + config/path config/p3p-header pixel vendor diff --git a/2-collectors/clojure-collector/java-servlet/src/snowplow/clojure_collector/responses.clj b/2-collectors/clojure-collector/java-servlet/src/snowplow/clojure_collector/responses.clj index 0fdccf8f61..d0ab25046b 100644 --- a/2-collectors/clojure-collector/java-servlet/src/snowplow/clojure_collector/responses.clj +++ b/2-collectors/clojure-collector/java-servlet/src/snowplow/clojure_collector/responses.clj @@ -38,13 +38,17 @@ (defn- set-cookie "Sets a Snowplow cookie with visitor `id`, - to last `duration` seconds for `domain`. + to last `duration` seconds for `domain` at `path`. If domain is nil, leave out so the FQDN - of the host can be used instead" - [id duration domain] + of the host can be used instead. + If path is nil, path will be /." + [id duration domain path] (merge {:value id - :expires (now-plus duration)} + :expires (now-plus duration) + :path (if (nil? path) + "/" + path)} (when-let [d domain] {:domain d}))) @@ -95,11 +99,11 @@ (defn send-cookie-pixel-or-200-or-redirect "Respond with the cookie and either a transparent pixel, a 200 or a redirect" - [cookies duration domain p3p-header pixel vendor params] + [cookies duration domain path p3p-header pixel vendor params] (let [id (generate-id cookies) cookies (if (= duration 0) {} - {cookie-name (set-cookie id duration domain)}) + {cookie-name (set-cookie id duration domain path)}) headers {"P3P" p3p-header}] (if (= vendor "r") (send-redirect cookies headers params) From 311a4dcc60a8bc826cd7c190d9164344d1523aaa Mon Sep 17 00:00:00 2001 From: Ben Fradet Date: Wed, 6 Jun 2018 10:14:04 +0200 Subject: [PATCH 10/19] EmrEtlRunner: replace Sluice by aws-sdk-s3 (closes #3524) --- 3-enrich/emr-etl-runner/Gemfile | 2 +- 3-enrich/emr-etl-runner/Gemfile.lock | 104 +++++------------- .../lib/snowplow-emr-etl-runner.rb | 1 + .../lib/snowplow-emr-etl-runner/emr_job.rb | 94 +++++++--------- .../lib/snowplow-emr-etl-runner/s3.rb | 101 +++++++++++++++++ .../spec/snowplow-emr-etl-runner/s3_spec.rb | 81 ++++++++++++++ 6 files changed, 254 insertions(+), 129 deletions(-) create mode 100644 3-enrich/emr-etl-runner/lib/snowplow-emr-etl-runner/s3.rb create mode 100644 3-enrich/emr-etl-runner/spec/snowplow-emr-etl-runner/s3_spec.rb diff --git a/3-enrich/emr-etl-runner/Gemfile b/3-enrich/emr-etl-runner/Gemfile index 98ef9bba0b..6eb50b1d03 100755 --- a/3-enrich/emr-etl-runner/Gemfile +++ b/3-enrich/emr-etl-runner/Gemfile @@ -25,7 +25,7 @@ gem "avro", "~> 1.8.1" gem "awrence", "~> 0.1.0" gem "snowplow-tracker", "~> 0.5.2" gem "iglu-ruby-client", ">= 0.1.0" -gem "sluice", "~> 0.4.0" +gem "aws-sdk-s3", "~> 1" gem "diplomat", "~> 2.0.1" group :development do diff --git a/3-enrich/emr-etl-runner/Gemfile.lock b/3-enrich/emr-etl-runner/Gemfile.lock index 6ac13aec88..5afa1ad3d9 100644 --- a/3-enrich/emr-etl-runner/Gemfile.lock +++ b/3-enrich/emr-etl-runner/Gemfile.lock @@ -1,12 +1,26 @@ GEM remote: https://rubygems.org/ specs: - CFPropertyList (2.3.5) addressable (2.5.0) public_suffix (~> 2.0, >= 2.0.2) avro (1.8.1) multi_json awrence (0.1.0) + aws-eventstream (1.0.0) + aws-partitions (1.89.1) + aws-sdk-core (3.21.2) + aws-eventstream (~> 1.0) + aws-partitions (~> 1.0) + aws-sigv4 (~> 1.0) + jmespath (~> 1.0) + aws-sdk-kms (1.5.0) + aws-sdk-core (~> 3) + aws-sigv4 (~> 1.0) + aws-sdk-s3 (1.13.0) + aws-sdk-core (~> 3, >= 3.21.2) + aws-sdk-kms (~> 1) + aws-sigv4 (~> 1.0) + aws-sigv4 (1.0.2) builder (3.2.3) contracts (0.11.0) coveralls (0.8.19) @@ -26,67 +40,24 @@ GEM fog-aws (~> 1.0) rest-client (~> 1.0) unf (~> 0.1) - excon (0.52.0) + excon (0.62.0) faraday (0.12.2) multipart-post (>= 1.2, < 3) - fission (0.5.0) - CFPropertyList (~> 2.2) - fog (1.25.0) - fog-brightbox (~> 0.4) - fog-core (~> 1.25) - fog-json - fog-profitbricks - fog-radosgw (>= 0.0.2) - fog-sakuracloud (>= 0.0.4) - fog-softlayer - fog-terremark - fog-vmfusion - fog-voxel - fog-xml (~> 0.1.1) - ipaddress (~> 0.5) - nokogiri (~> 1.5, >= 1.5.11) - opennebula - fog-aws (1.4.0) + fog-aws (1.4.1) fog-core (~> 1.38) fog-json (~> 1.0) fog-xml (~> 0.1) ipaddress (~> 0.8) - fog-brightbox (0.11.0) - fog-core (~> 1.22) - fog-json - inflecto (~> 0.0.2) - fog-core (1.42.0) + fog-core (1.45.0) builder - excon (~> 0.49) + excon (~> 0.58) formatador (~> 0.2) fog-json (1.0.2) fog-core (~> 1.0) multi_json (~> 1.10) - fog-profitbricks (3.0.0) - fog-core (~> 1.42) - fog-json (~> 1.0) - fog-radosgw (0.0.5) - fog-core (>= 1.21.0) - fog-json - fog-xml (>= 0.0.1) - fog-sakuracloud (1.7.5) - fog-core - fog-json - fog-softlayer (1.1.4) - fog-core - fog-json - fog-terremark (0.1.0) - fog-core - fog-xml - fog-vmfusion (0.1.0) - fission + fog-xml (0.1.3) fog-core - fog-voxel (0.1.0) - fog-core - fog-xml - fog-xml (0.1.2) - fog-core - nokogiri (~> 1.5, >= 1.5.11) + nokogiri (>= 1.5.11, < 2.0.0) formatador (0.2.5) http-cookie (1.0.3) domain_name (~> 0.5) @@ -95,31 +66,21 @@ GEM iglu-ruby-client (0.1.0) httparty (<= 0.14.0) json-schema (~> 2.7.0, >= 2.7.0) - inflecto (0.0.2) ipaddress (0.8.3) - jruby-jars (9.1.7.0) - jruby-rack (1.1.20) + jmespath (1.4.0) + jruby-jars (9.2.0.0) + jruby-rack (1.1.21) json (2.0.3-java) json-schema (2.7.0) addressable (>= 2.4) mime-types (2.99.3) - multi_json (1.12.1) + multi_json (1.13.1) multi_xml (0.6.0) multipart-post (2.0.0) - net-ssh (2.9.4) netrc (0.11.0) - nokogiri (1.7.0.1-java) - opennebula (5.4.0) - json - nokogiri - rbvmomi + nokogiri (1.8.2-java) public_suffix (2.0.5) - rake (12.0.0) - rbvmomi (1.11.3) - builder (~> 3.0) - json (>= 1.8) - nokogiri (~> 1.5) - trollop (~> 2.1) + rake (12.3.1) rest-client (1.8.0) http-cookie (>= 1.0.2, < 2.0) mime-types (>= 1.16, < 3.0) @@ -143,19 +104,14 @@ GEM json (>= 1.8, < 3) simplecov-html (~> 0.10.0) simplecov-html (0.10.0) - sluice (0.4.0) - contracts (~> 0.9) - fog (= 1.25) - net-ssh (~> 2.9.2) snowplow-tracker (0.5.2) contracts (~> 0.7, <= 0.11) term-ansicolor (1.4.0) tins (~> 1.0) thor (0.19.4) tins (1.13.2) - trollop (2.1.2) unf (0.1.4-java) - warbler (2.0.4) + warbler (2.0.5) jruby-jars (>= 9.0.0.0) jruby-rack (>= 1.1.1, < 1.3) rake (>= 10.1.0) @@ -167,13 +123,13 @@ PLATFORMS DEPENDENCIES avro (~> 1.8.1) awrence (~> 0.1.0) + aws-sdk-s3 (~> 1) contracts (~> 0.9, <= 0.11) coveralls diplomat (~> 2.0.1) elasticity (~> 6.0.12) iglu-ruby-client (>= 0.1.0) rspec (~> 3.5.0) - sluice (~> 0.4.0) snowplow-tracker (~> 0.5.2) warbler @@ -181,4 +137,4 @@ RUBY VERSION ruby 2.3.1p0 (jruby 9.1.6.0) BUNDLED WITH - 1.15.3 + 1.15.4 diff --git a/3-enrich/emr-etl-runner/lib/snowplow-emr-etl-runner.rb b/3-enrich/emr-etl-runner/lib/snowplow-emr-etl-runner.rb index 3c74e433fc..d9db70a10f 100755 --- a/3-enrich/emr-etl-runner/lib/snowplow-emr-etl-runner.rb +++ b/3-enrich/emr-etl-runner/lib/snowplow-emr-etl-runner.rb @@ -21,6 +21,7 @@ require_relative 'snowplow-emr-etl-runner/monitoring/snowplow' require_relative 'snowplow-emr-etl-runner/cli' require_relative 'snowplow-emr-etl-runner/job_result' +require_relative 'snowplow-emr-etl-runner/s3' require_relative 'snowplow-emr-etl-runner/emr_job' require_relative 'snowplow-emr-etl-runner/lock/lock' require_relative 'snowplow-emr-etl-runner/lock/file_lock' diff --git a/3-enrich/emr-etl-runner/lib/snowplow-emr-etl-runner/emr_job.rb b/3-enrich/emr-etl-runner/lib/snowplow-emr-etl-runner/emr_job.rb index 57465d21aa..d36d8c15a5 100755 --- a/3-enrich/emr-etl-runner/lib/snowplow-emr-etl-runner/emr_job.rb +++ b/3-enrich/emr-etl-runner/lib/snowplow-emr-etl-runner/emr_job.rb @@ -15,7 +15,7 @@ require 'set' require 'elasticity' -require 'sluice' +require 'aws-sdk-s3' require 'awrence' require 'json' require 'base64' @@ -23,7 +23,6 @@ require 'iglu-client' require 'securerandom' require 'tempfile' -require 'fog' # Ruby class to execute Snowplow's Hive jobs against Amazon EMR # using Elasticity (https://github.com/rslifka/elasticity). @@ -55,6 +54,7 @@ class EmrJob include Monitoring::Logging include Snowplow::EmrEtlRunner::Utils + include Snowplow::EmrEtlRunner::S3 # Initializes our wrapper for the Amazon EMR client. Contract Bool, Bool, Bool, Bool, Bool, Bool, Bool, Bool, ArchiveStep, ArchiveStep, ConfigHash, ArrayOf[String], String, TargetsHash, RdbLoaderSteps => EmrJob @@ -83,10 +83,10 @@ def initialize(debug, staging, enrich, staging_stream_enrich, shred, es, archive etl_tstamp = (run_tstamp.to_f * 1000).to_i.to_s output_codec = output_codec_from_compression_format(config.dig(:enrich, :output_compression)) - s3 = Sluice::Storage::S3::new_fog_s3_from( - config[:aws][:s3][:region], - config[:aws][:access_key_id], - config[:aws][:secret_access_key]) + s3 = Aws::S3::Client.new( + :access_key_id => config[:aws][:access_key_id], + :secret_access_key => config[:aws][:secret_access_key], + :region => config[:aws][:s3][:region]) ami_version = Gem::Version.new(config[:aws][:emr][:ami_version]) @@ -137,18 +137,15 @@ def initialize(debug, staging, enrich, staging_stream_enrich, shred, es, archive # staging if staging - csbr_processing_loc = Sluice::Storage::S3::Location.new(csbr[:processing]) - unless Sluice::Storage::S3::is_empty?(s3, csbr_processing_loc) - raise DirectoryNotEmptyError, "Cannot safely add staging step to jobflow, #{csbr_processing_loc} is not empty" + unless empty?(s3, csbr[:processing]) + raise DirectoryNotEmptyError, "Cannot safely add staging step to jobflow, #{csbr[:processing]} is not empty" end src_pattern = collector_format == 'clj-tomcat' ? '.*localhost\_access\_log.*\.txt.*' : '.+' src_pattern_regex = Regexp.new src_pattern non_empty_locs = csbr[:in].select { |l| - loc = Sluice::Storage::S3::Location.new(l) - files = Sluice::Storage::S3::list_files(s3, loc) - .select { |f| !(f.key =~ src_pattern_regex).nil? } - files.length > 0 + not empty?(s3, l, + lambda { |k| !(k =~ /\/$/) and !(k =~ /\$folder\$$/) and !(k =~ src_pattern_regex).nil? }) } if non_empty_locs.empty? @@ -369,9 +366,8 @@ def initialize(debug, staging, enrich, staging_stream_enrich, shred, es, archive end # Late check whether our enrichment directory is empty. We do an early check too - csbe_good_loc = Sluice::Storage::S3::Location.new(csbe[:good]) - unless Sluice::Storage::S3::is_empty?(s3, csbe_good_loc) - raise DirectoryNotEmptyError, "Cannot safely add enrichment step to jobflow, #{csbe_good_loc} is not empty" + unless empty?(s3, csbe[:good]) + raise DirectoryNotEmptyError, "Cannot safely add enrichment step to jobflow, #{csbe[:good]} is not empty" end @jobflow.add_step(enrich_step) @@ -399,16 +395,12 @@ def initialize(debug, staging, enrich, staging_stream_enrich, shred, es, archive # Staging data produced by Stream Enrich if staging_stream_enrich - csbe_good_loc = Sluice::Storage::S3::Location.new(csbe[:good]) - unless Sluice::Storage::S3::is_empty?(s3, csbe_good_loc) - raise DirectoryNotEmptyError, "Cannot safely add stream staging step to jobflow, #{csbe_good_loc} is not empty" + unless empty?(s3, csbe[:good]) + raise DirectoryNotEmptyError, "Cannot safely add stream staging step to jobflow, #{csbe[:good]} is not empty" end - stream_enrich_loc = Sluice::Storage::S3::Location.new(csbe[:stream]) - src_pattern_regex = Regexp.new STREAM_ENRICH_REGEXP - files = Sluice::Storage::S3::list_files(s3, stream_enrich_loc).select { |f| !(f.key =~ src_pattern_regex).nil? } - if files.empty? + if empty?(s3, csbe[:stream], lambda { |k| !(k =~ /\/$/) and !(k =~ /\$folder\$$/) and !(k =~ src_pattern_regex).nil? }) raise NoDataToProcessError, "No Snowplow enriched stream logs to process since last run" end @@ -431,11 +423,11 @@ def initialize(debug, staging, enrich, staging_stream_enrich, shred, es, archive # Add processing manifest if available processing_manifest = get_processing_manifest(targets) - processing_manifest_shred_args = - if not processing_manifest.nil? + processing_manifest_shred_args = + if not processing_manifest.nil? if Gem::Version.new(config[:storage][:versions][:rdb_shredder]) >= SHRED_JOB_WITH_PROCESSING_MANIFEST { 'processing-manifest-table' => processing_manifest, 'item-id' => shred_final_output } - else + else {} end else @@ -457,7 +449,6 @@ def initialize(debug, staging, enrich, staging_stream_enrich, shred, es, archive "--outputCodec", "none", "--s3Endpoint" , s3_endpoint ] - copy_to_hdfs_step.name << ": Enriched S3 -> HDFS" @jobflow.add_step(copy_to_hdfs_step) end @@ -496,9 +487,8 @@ def initialize(debug, staging, enrich, staging_stream_enrich, shred, es, archive end # Late check whether our target directory is empty - csbs_good_loc = Sluice::Storage::S3::Location.new(csbs[:good]) - unless Sluice::Storage::S3::is_empty?(s3, csbs_good_loc) - raise DirectoryNotEmptyError, "Cannot safely add shredding step to jobflow, #{csbs_good_loc} is not empty" + unless empty?(s3, csbs[:good]) + raise DirectoryNotEmptyError, "Cannot safely add shredding step to jobflow, #{csbs[:good]} is not empty" end @jobflow.add_step(shred_step) @@ -674,23 +664,26 @@ def run(config) Contract String, String, String, String => nil def output_rdb_loader_logs(region, aws_access_key_id, aws_secret_key, log_level) - s3 = Sluice::Storage::S3::new_fog_s3_from(region, aws_access_key_id, aws_secret_key) - - loc = Sluice::Storage::S3::Location.new(@rdb_loader_log_base) + s3 = Aws::S3::Client.new( + :access_key_id => aws_access_key_id, + :secret_access_key => aws_secret_key, + :region => region) - if @rdb_loader_logs.empty? or Sluice::Storage::S3::is_empty?(s3, loc) + if @rdb_loader_logs.empty? or empty?(s3, @rdb_loader_log_base) logger.info "No RDB Loader logs" else logger.info "RDB Loader logs" @rdb_loader_logs.each do |l| tmp = Tempfile.new("rdbloader") - uri = URI.parse(l[1]) - bucket, key = uri.host, uri.path[1..-1] - logger.debug "Downloading #{uri} to #{tmp.path}" + bucket, key = parse_bucket_prefix(l[1]) + logger.debug "Downloading #{l[1]} to #{tmp.path}" begin - log = s3.directories.get(bucket).files.head(key) - Sluice::Storage::S3::download_file(s3, log, tmp) + s3.get_object({ + response_target: tmp, + bucket: bucket, + key: key, + }) if log_level == 'info' logger.info l[0] logger.info tmp.read @@ -774,22 +767,15 @@ def get_rdb_loader_steps(config, targets, resolver, jar, rdbloader_steps, skip_m # +s3+:: AWS S3 client # +s3_path+:: Full S3 path to folder def get_latest_run_id(s3, s3_path) - uri = URI.parse(s3_path) - folders = s3.directories.get(uri.host, delimiter: '/', prefix: uri.path[1..-1]).files.common_prefixes - # each is mandatory, otherwise there'll be pagination issues if there are > 1k objects - # cf snowplow/snowplow#3434 - run_folders = [] - folders.each { |f| - if f.include?('run=') - run_folders << f - end - } - begin - folder = run_folders[-1].split('/')[-1] - folder.slice('run='.length, folder.length) - rescue NoMethodError => _ + run_id_regex = /.*\/run=((\d|-)+)\/.*/ + folders = list_object_names(s3, s3_path, + lambda { |k| !(k =~ /\$folder\$$/) and !k[run_id_regex, 1].nil? }) + .map { |k| k[run_id_regex, 1] } + if folders.empty? logger.error "No run folders in [#{s3_path}] found" raise UnexpectedStateError, "No run folders in [#{s3_path}] found" + else + folders.first end end @@ -802,7 +788,7 @@ def get_latest_run_id(s3, s3_path) # +name+:: step description to show in EMR console # # Returns a step ready for adding to the Elasticity Jobflow. - Contract String, String, String, String, String => Elasticity::S3DistCpStep + Contract String, String, String, String, String, Bool => Elasticity::S3DistCpStep def get_archive_step(good_path, archive_path, run_id_folder, s3_endpoint, name) archive_step = Elasticity::S3DistCpStep.new(legacy = @legacy) archive_step.arguments = [ diff --git a/3-enrich/emr-etl-runner/lib/snowplow-emr-etl-runner/s3.rb b/3-enrich/emr-etl-runner/lib/snowplow-emr-etl-runner/s3.rb new file mode 100644 index 0000000000..6877436782 --- /dev/null +++ b/3-enrich/emr-etl-runner/lib/snowplow-emr-etl-runner/s3.rb @@ -0,0 +1,101 @@ +# Copyright (c) 2012-2018 Snowplow Analytics Ltd. All rights reserved. +# +# This program is licensed to you under the Apache License Version 2.0, +# and you may not use this file except in compliance with the Apache License Version 2.0. +# You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the Apache License Version 2.0 is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. + +# Author:: Ben Fradet (mailto:support@snowplowanalytics.com) +# Copyright:: Copyright (c) 2012-2018 Snowplow Analytics Ltd +# License:: Apache License Version 2.0 + +require 'aws-sdk-s3' +require 'contracts' +require 'pathname' +require 'uri' + +module Snowplow + module EmrEtlRunner + module S3 + + include Contracts + + # Check a location on S3 is empty. + # + # Parameters: + # +client+:: S3 client + # +location+:: S3 url of the folder to check for emptiness + # +key_filter+:: filter to apply on the keys, filters folders and $folder$ files by default + def empty?(client, location, + key_filter = lambda { |k| !(k =~ /\/$/) and !(k =~ /\$folder\$$/) }) + bucket, prefix = parse_bucket_prefix(location) + empty_impl(client, bucket, prefix, key_filter) + end + + # List all object names satisfying a key filter. + # + # Parameters: + # +client+:: S3 client + # +location+:: S3 url of the folder to list the object names for + # +key_filter+:: filter to apply on the keys + def list_object_names(client, location, key_filter) + bucket, prefix = parse_bucket_prefix(location) + list_object_names_impl(client, bucket, prefix, key_filter) + end + + # Extract the bucket and prefix from an S3 url. + # + # Parameters: + # +location+:: the S3 url to parse + Contract String => [String, String] + def parse_bucket_prefix(location) + u = URI.parse(location) + return u.host, u.path[1..-1] + end + + private + + def list_object_names_impl(client, bucket, prefix, key_filter, max_keys = 50, token = nil) + response = list_objects(client, bucket, prefix, max_keys, token) + filtered = response.contents + .select { |c| key_filter[c.key] } + .map { |c| c.key } + if response.is_truncated + filtered + list_object_names_impl( + client, bucket, prefix, key_filter, max_keys, response.next_continuation_token) + else + filtered + end + end + + def empty_impl(client, bucket, prefix, key_filter, max_keys = 50, token = nil) + response = list_objects(client, bucket, prefix, max_keys, token) + filtered = response.contents.select { |c| key_filter[c.key] } + if filtered.empty? + if response.is_truncated + empty_impl(client, bucket, prefix, key_filter, max_keys, response.next_continuation_token) + else + true + end + else + false + end + end + + def list_objects(client, bucket, prefix, max_keys, token) + options = { + bucket: bucket, + prefix: prefix, + max_keys: max_keys, + } + options[:continuation_token] = token if !token.nil? + client.list_objects_v2(options) + end + + end + end +end diff --git a/3-enrich/emr-etl-runner/spec/snowplow-emr-etl-runner/s3_spec.rb b/3-enrich/emr-etl-runner/spec/snowplow-emr-etl-runner/s3_spec.rb new file mode 100644 index 0000000000..80c4bb482d --- /dev/null +++ b/3-enrich/emr-etl-runner/spec/snowplow-emr-etl-runner/s3_spec.rb @@ -0,0 +1,81 @@ +# Copyright (c) 2012-2018 Snowplow Analytics Ltd. All rights reserved. +# +# This program is licensed to you under the Apache License Version 2.0, +# and you may not use this file except in compliance with the Apache License Version 2.0. +# You may obtain a copy of the Apache License Version 2.0 at http://www.apache.org/licenses/LICENSE-2.0. +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the Apache License Version 2.0 is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the Apache License Version 2.0 for the specific language governing permissions and limitations there under. + +# Author:: Ben Fradet (mailto:support@snowplowanalytics.com) +# Copyright:: Copyright (c) 2012-2018 Snowplow Analytics Ltd +# License:: Apache License Version 2.0 + +require 'aws-sdk-s3' +require 'spec_helper' + +S3 = Snowplow::EmrEtlRunner::S3 + +describe S3 do + subject { Object.new.extend described_class } + + s3 = Aws::S3::Client.new(stub_responses: true) + + describe '#empty?' do + it 'should take a client and location argument' do + expect(subject).to respond_to(:empty?).with(2).argument + end + + it 'should take a client, a location and a filter argument' do + expect(subject).to respond_to(:empty?).with(3).argument + end + + it 'should check a folder on S3 is empty' do + s3.stub_responses(:list_objects_v2, + { contents: [{ key: 'prefix/example1.jpg' }, { key: 'prefix/example2.jpg' }]}) + expect(subject.empty?(s3, 's3://bucket/prefix')).to eq(false) + end + + it 'should filter folders by default' do + s3.stub_responses(:list_objects_v2, { contents: [{ key: 'prefix/folder/' }]}) + expect(subject.empty?(s3, 's3://bucket/prefix')).to eq(true) + end + + it 'should filter $folder$ files by default' do + s3.stub_responses(:list_objects_v2, { contents: [{ key: 'prefix/something_$folder$' }]}) + expect(subject.empty?(s3, 's3://bucket/prefix')).to eq(true) + end + + it 'should support custom filters' do + s3.stub_responses(:list_objects_v2, { contents: [{ key: 'abc' }]}) + expect(subject.empty?(s3, 's3://bucket/prefix', lambda { |k| k.length == 3})).to eq(false) + end + end + + describe '#list_object_names' do + it 'should take a client, a location and a filter argument' do + expect(subject).to respond_to(:list_object_names).with(3).argument + end + + it 'should filter file names based on the filter' do + s3.stub_responses(:list_objects_v2, { contents: [{ key: 'abc' }, { key: 'defg' }]}) + expect(subject.list_object_names(s3, 's3://bucket/prefix', lambda { |k| k.length == 3})) + .to eq(['abc']) + end + + end + + describe '#parse_bucket_prefix' do + it 'should take a s3 url argument' do + expect(subject).to respond_to(:parse_bucket_prefix).with(1).argument + end + + it 'should parse the bucket and prefix' do + expect(subject.parse_bucket_prefix('s3://bucket/prefix')).to eq(['bucket', 'prefix']) + expect(subject.parse_bucket_prefix('s3://bucket/prefix/file.jpg')).to eq( + ['bucket', 'prefix/file.jpg']) + end + end +end From b8d09def1c81b1a3bb326cc968e57042dea85e95 Mon Sep 17 00:00:00 2001 From: Ben Fradet Date: Wed, 6 Jun 2018 13:05:16 +0200 Subject: [PATCH 11/19] EmrEtlRunner: make protocol in Snowplow monitoring configurable (closes #3791) --- 3-enrich/emr-etl-runner/config/config.yml.sample | 1 + .../emr-etl-runner/config/stream_config.yml.sample | 1 + .../snowplow-emr-etl-runner/monitoring/snowplow.rb | 11 ++++++++--- 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/3-enrich/emr-etl-runner/config/config.yml.sample b/3-enrich/emr-etl-runner/config/config.yml.sample index f2bb962e9a..dd071d2586 100644 --- a/3-enrich/emr-etl-runner/config/config.yml.sample +++ b/3-enrich/emr-etl-runner/config/config.yml.sample @@ -75,5 +75,6 @@ monitoring: level: DEBUG # You can optionally switch to INFO for production snowplow: method: get + protocol: http app_id: ADD HERE # e.g. snowplow collector: ADD HERE # e.g. d3rkrsqld9gmqf.cloudfront.net diff --git a/3-enrich/emr-etl-runner/config/stream_config.yml.sample b/3-enrich/emr-etl-runner/config/stream_config.yml.sample index ac76d3f6a4..bf30c53c87 100644 --- a/3-enrich/emr-etl-runner/config/stream_config.yml.sample +++ b/3-enrich/emr-etl-runner/config/stream_config.yml.sample @@ -67,3 +67,4 @@ monitoring: method: get app_id: ADD HERE # e.g. snowplow collector: ADD HERE # e.g. d3rkrsqld9gmqf.cloudfront.net + protocol: http diff --git a/3-enrich/emr-etl-runner/lib/snowplow-emr-etl-runner/monitoring/snowplow.rb b/3-enrich/emr-etl-runner/lib/snowplow-emr-etl-runner/monitoring/snowplow.rb index 8717cbb439..94829d3165 100644 --- a/3-enrich/emr-etl-runner/lib/snowplow-emr-etl-runner/monitoring/snowplow.rb +++ b/3-enrich/emr-etl-runner/lib/snowplow-emr-etl-runner/monitoring/snowplow.rb @@ -28,7 +28,6 @@ class Snowplow include Contracts # Constants - PROTOCOL = "http" PORT = 80 BUFFER_SIZE = 0 APPLICATION_CONTEXT_SCHEMA = "iglu:com.snowplowanalytics.monitoring.batch/application_context/jsonschema/1-0-0" @@ -40,6 +39,7 @@ class Snowplow # Parameters @@method = "get" + @@protocol = "http" @@collector_uri = nil @@app_id = nil @@ -51,9 +51,14 @@ def self.parameterize(config) cm = config[:monitoring] cms = cm[:snowplow] @@method = cms[:method].downcase || @@method + @@protocol = if not cms[:protocol].nil? + cms[:protocol].downcase + else + @@protocol + end @@collector_uri = cms[:collector] # Could be nil @@app_id = cms[:app_id] # Could be nil - + @@app_context = SnowplowTracker::SelfDescribingJson.new(APPLICATION_CONTEXT_SCHEMA, { :name => NAME, :version => VERSION, @@ -70,7 +75,7 @@ def initialize @tracker = if @@collector_uri emitter = SnowplowTracker::Emitter.new(@@collector_uri, { - :protocol => PROTOCOL, + :protocol => @@protocol, :method => @@method, :port => PORT, :buffer_size => BUFFER_SIZE From f708724df99b599013ad22c99ddd8b36c42cc61d Mon Sep 17 00:00:00 2001 From: Ben Fradet Date: Wed, 6 Jun 2018 13:06:29 +0200 Subject: [PATCH 12/19] EmrEtlRunner: make port in Snowplow monitoring configurable (closes #3236) --- 3-enrich/emr-etl-runner/config/config.yml.sample | 1 + 3-enrich/emr-etl-runner/config/stream_config.yml.sample | 1 + .../lib/snowplow-emr-etl-runner/monitoring/snowplow.rb | 5 +++-- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/3-enrich/emr-etl-runner/config/config.yml.sample b/3-enrich/emr-etl-runner/config/config.yml.sample index dd071d2586..64eb75aca6 100644 --- a/3-enrich/emr-etl-runner/config/config.yml.sample +++ b/3-enrich/emr-etl-runner/config/config.yml.sample @@ -76,5 +76,6 @@ monitoring: snowplow: method: get protocol: http + port: 80 app_id: ADD HERE # e.g. snowplow collector: ADD HERE # e.g. d3rkrsqld9gmqf.cloudfront.net diff --git a/3-enrich/emr-etl-runner/config/stream_config.yml.sample b/3-enrich/emr-etl-runner/config/stream_config.yml.sample index bf30c53c87..f1958e3f98 100644 --- a/3-enrich/emr-etl-runner/config/stream_config.yml.sample +++ b/3-enrich/emr-etl-runner/config/stream_config.yml.sample @@ -68,3 +68,4 @@ monitoring: app_id: ADD HERE # e.g. snowplow collector: ADD HERE # e.g. d3rkrsqld9gmqf.cloudfront.net protocol: http + port: 80 diff --git a/3-enrich/emr-etl-runner/lib/snowplow-emr-etl-runner/monitoring/snowplow.rb b/3-enrich/emr-etl-runner/lib/snowplow-emr-etl-runner/monitoring/snowplow.rb index 94829d3165..d7f52c6020 100644 --- a/3-enrich/emr-etl-runner/lib/snowplow-emr-etl-runner/monitoring/snowplow.rb +++ b/3-enrich/emr-etl-runner/lib/snowplow-emr-etl-runner/monitoring/snowplow.rb @@ -28,7 +28,6 @@ class Snowplow include Contracts # Constants - PORT = 80 BUFFER_SIZE = 0 APPLICATION_CONTEXT_SCHEMA = "iglu:com.snowplowanalytics.monitoring.batch/application_context/jsonschema/1-0-0" JOB_STATUS_SCHEMA = "iglu:com.snowplowanalytics.monitoring.batch/emr_job_status/jsonschema/1-0-0" @@ -40,6 +39,7 @@ class Snowplow # Parameters @@method = "get" @@protocol = "http" + @@port = 80 @@collector_uri = nil @@app_id = nil @@ -56,6 +56,7 @@ def self.parameterize(config) else @@protocol end + @@port = cms[:port] || @@port @@collector_uri = cms[:collector] # Could be nil @@app_id = cms[:app_id] # Could be nil @@ -77,7 +78,7 @@ def initialize emitter = SnowplowTracker::Emitter.new(@@collector_uri, { :protocol => @@protocol, :method => @@method, - :port => PORT, + :port => @@port, :buffer_size => BUFFER_SIZE }) From f8c1574e876c7a97fb7facb4253a753f1e6a2737 Mon Sep 17 00:00:00 2001 From: Ben Fradet Date: Wed, 6 Jun 2018 15:10:20 +0200 Subject: [PATCH 13/19] EmrEtlRunner: add --ignore-lock-on-start option (closes #3537) --- .../emr-etl-runner/lib/snowplow-emr-etl-runner/cli.rb | 9 ++++++--- .../emr-etl-runner/lib/snowplow-emr-etl-runner/runner.rb | 4 ++-- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/3-enrich/emr-etl-runner/lib/snowplow-emr-etl-runner/cli.rb b/3-enrich/emr-etl-runner/lib/snowplow-emr-etl-runner/cli.rb index 10f317a086..6d75acc339 100755 --- a/3-enrich/emr-etl-runner/lib/snowplow-emr-etl-runner/cli.rb +++ b/3-enrich/emr-etl-runner/lib/snowplow-emr-etl-runner/cli.rb @@ -56,7 +56,8 @@ def self.get_args_config_enrichments_resolver options = { :debug => false, :skip => [], - :include => [] + :include => [], + :ignore_lock_on_start => false, } commands = { @@ -72,6 +73,7 @@ def self.get_args_config_enrichments_resolver opts.on('-x', "--skip {#{SKIPPABLES.to_a.join(',')}}", Array, 'skip the specified step(s)') { |config| options[:skip] = config } opts.on('-i', "--include {#{INCLUDES.to_a.join(',')}}", Array, 'include additional step(s)') { |config| options[:include] = config } opts.on('-l', '--lock PATH', 'where to store the lock') { |config| options[:lock] = config } + opts.on('--ignore-lock-on-start', 'ignore the lock if it is set when starting') { |config| options[:ignore_lock_on_start] = true } opts.on('--consul ADDRESS', 'address to the Consul server') { |config| options[:consul] = config } end, 'generate emr-config' => OptionParser.new do |opts| @@ -183,6 +185,7 @@ def self.process_options(options, optparse, cmd_name) :resume_from => options[:resume_from], :include => options[:include], :lock => options[:lock], + :ignore_lock_on_start => options[:ignore_lock_on_start], :consul => options[:consul] } @@ -267,7 +270,7 @@ def self.load_targets(targets_path) Dir.entries(targets_path).select do |f| f.end_with?('.json') end.map do |f| - json = JSON.parse(File.read(targets_path + '/' + f), {:symbolize_names => true}) + json = JSON.parse(File.read(targets_path + '/' + f), {:symbolize_names => true}) id = json.dig(:data, :id) unless id.nil? ids.push(id) @@ -322,7 +325,7 @@ def self.validate_and_coalesce(args, config) if args[:resume_from] == "enrich" raise ConfigError, 'cannot resume from enrich in stream enrich mode' end - if args[:skip].include?('staging') || args[:skip].include?('enrich') + if args[:skip].include?('staging') || args[:skip].include?('enrich') raise ConfigError, 'cannot skip staging nor enrich in stream enrich mode. Either skip staging_stream_enrich or resume from shred' end if args[:skip].include?('archive_raw') || args[:resume_from] == "archive_raw" diff --git a/3-enrich/emr-etl-runner/lib/snowplow-emr-etl-runner/runner.rb b/3-enrich/emr-etl-runner/lib/snowplow-emr-etl-runner/runner.rb index b998281e39..612f86762c 100755 --- a/3-enrich/emr-etl-runner/lib/snowplow-emr-etl-runner/runner.rb +++ b/3-enrich/emr-etl-runner/lib/snowplow-emr-etl-runner/runner.rb @@ -39,7 +39,7 @@ def self.get_steps(skips, resume, enriched_stream) not skips.include?('shred')), :es => ((resume.nil? or [ 'enrich', 'shred', 'elasticsearch' ].include?(resume)) and not skips.include?('elasticsearch')), - :archive_raw => (enriched_stream.nil? and (resume.nil? or [ 'enrich', 'shred', 'elasticsearch', 'archive_raw' ].include?(resume)) and + :archive_raw => (enriched_stream.nil? and (resume.nil? or [ 'enrich', 'shred', 'elasticsearch', 'archive_raw' ].include?(resume)) and not skips.include?('archive_raw')), :rdb_load => ((resume.nil? or [ 'enrich', 'shred', 'elasticsearch', 'archive_raw', 'rdb_load' ].include?(resume)) and not skips.include?('rdb_load')), @@ -122,7 +122,7 @@ def run end lock = get_lock(@args[:lock], @args[:consul]) - if not lock.nil? + if not lock.nil? and not @args[:ignore_lock_on_start] lock.try_lock end From 156643074a9e68fa1d22cb5da7a07ad81e2efa41 Mon Sep 17 00:00:00 2001 From: Ben Fradet Date: Fri, 8 Jun 2018 17:50:54 +0200 Subject: [PATCH 14/19] EmrEtlRunner: handle SSE-S3 encrypted S3 buckets (closes #3456) --- .../emr-etl-runner/config/config.yml.sample | 1 + .../config/stream_config.yml.sample | 1 + .../lib/snowplow-emr-etl-runner/contracts.rb | 4 +- .../lib/snowplow-emr-etl-runner/emr_job.rb | 42 ++++++++++++++++--- .../resources/sparse_config.yml | 1 + .../resources/stream_config.yml | 1 + 6 files changed, 44 insertions(+), 6 deletions(-) diff --git a/3-enrich/emr-etl-runner/config/config.yml.sample b/3-enrich/emr-etl-runner/config/config.yml.sample index 64eb75aca6..35fbcfbb52 100644 --- a/3-enrich/emr-etl-runner/config/config.yml.sample +++ b/3-enrich/emr-etl-runner/config/config.yml.sample @@ -8,6 +8,7 @@ aws: assets: s3://snowplow-hosted-assets # DO NOT CHANGE unless you are hosting the jarfiles etc yourself in your own bucket jsonpath_assets: # If you have defined your own JSON Schemas, add the s3:// path to your own JSON Path files in your own bucket here log: ADD HERE + encrypted: false # Whether the buckets below are enrcrypted using server side encryption (SSE-S3) raw: in: # This is a YAML array of one or more in buckets - you MUST use hyphens before each entry in the array, as below - ADD HERE # e.g. s3://my-old-collector-bucket diff --git a/3-enrich/emr-etl-runner/config/stream_config.yml.sample b/3-enrich/emr-etl-runner/config/stream_config.yml.sample index f1958e3f98..d8c9043745 100644 --- a/3-enrich/emr-etl-runner/config/stream_config.yml.sample +++ b/3-enrich/emr-etl-runner/config/stream_config.yml.sample @@ -8,6 +8,7 @@ aws: assets: s3://snowplow-hosted-assets # DO NOT CHANGE unless you are hosting the jarfiles etc yourself in your own bucket jsonpath_assets: # If you have defined your own JSON Schemas, add the s3:// path to your own JSON Path files in your own bucket here log: ADD HERE + encrypted: false enriched: good: ADD HERE # e.g. s3://my-out-bucket/enriched/good archive: ADD HERE # Where to archive enriched events to, e.g. s3://my-archive-bucket/enriched diff --git a/3-enrich/emr-etl-runner/lib/snowplow-emr-etl-runner/contracts.rb b/3-enrich/emr-etl-runner/lib/snowplow-emr-etl-runner/contracts.rb index 30ac6fbc69..c474636280 100755 --- a/3-enrich/emr-etl-runner/lib/snowplow-emr-etl-runner/contracts.rb +++ b/3-enrich/emr-etl-runner/lib/snowplow-emr-etl-runner/contracts.rb @@ -42,7 +42,8 @@ module EmrEtlRunner :skip => Maybe[ArrayOf[String]], :include => Maybe[ArrayOf[String]], :lock => Maybe[String], - :consul => Maybe[String] + :consul => Maybe[String], + :ignore_lock_on_start => Maybe[Bool] }) # The Hash containing the buckets field from the configuration YAML @@ -50,6 +51,7 @@ module EmrEtlRunner :assets => String, :jsonpath_assets => Maybe[String], :log => String, + :encrypted => Bool, :raw => Maybe[({ :in => ArrayOf[String], :processing => String, diff --git a/3-enrich/emr-etl-runner/lib/snowplow-emr-etl-runner/emr_job.rb b/3-enrich/emr-etl-runner/lib/snowplow-emr-etl-runner/emr_job.rb index d36d8c15a5..2557346d41 100755 --- a/3-enrich/emr-etl-runner/lib/snowplow-emr-etl-runner/emr_job.rb +++ b/3-enrich/emr-etl-runner/lib/snowplow-emr-etl-runner/emr_job.rb @@ -82,6 +82,7 @@ def initialize(debug, staging, enrich, staging_stream_enrich, shred, es, archive @rdb_loader_logs = [] # pairs of target name and associated log etl_tstamp = (run_tstamp.to_f * 1000).to_i.to_s output_codec = output_codec_from_compression_format(config.dig(:enrich, :output_compression)) + encrypted = config[:aws][:s3][:buckets][:encrypted] s3 = Aws::S3::Client.new( :access_key_id => config[:aws][:access_key_id], @@ -163,6 +164,9 @@ def initialize(debug, staging, enrich, staging_stream_enrich, shred, es, archive if collector_format == 'clj-tomcat' staging_step.arguments = staging_step.arguments + [ '--groupBy', '.*/_*(.+)' ] end + if encrypted + staging_step.arguments = staging_step.arguments + [ '--s3ServerSideEncryption' ] + end staging_step.name << ": Raw #{l} -> Raw Staging S3" @jobflow.add_step(staging_step) } @@ -318,6 +322,9 @@ def initialize(debug, staging, enrich, staging_stream_enrich, shred, es, archive if collector_format == "clj-tomcat" then compact_to_hdfs_step.arguments << "--outputCodec" << "none" end + if encrypted + compact_to_hdfs_step.arguments = compact_to_hdfs_step.arguments + [ '--s3ServerSideEncryption' ] + end compact_to_hdfs_step.name << ": Raw S3 -> Raw HDFS" @jobflow.add_step(compact_to_hdfs_step) @@ -379,6 +386,9 @@ def initialize(debug, staging, enrich, staging_stream_enrich, shred, es, archive "--srcPattern" , PARTFILE_REGEXP, "--s3Endpoint" , s3_endpoint ] + output_codec + if encrypted + copy_to_s3_step.arguments = copy_to_s3_step.arguments + [ '--s3ServerSideEncryption' ] + end copy_to_s3_step.name << ": Enriched HDFS -> S3" @jobflow.add_step(copy_to_s3_step) @@ -389,6 +399,9 @@ def initialize(debug, staging, enrich, staging_stream_enrich, shred, es, archive "--srcPattern" , SUCCESS_REGEXP, "--s3Endpoint" , s3_endpoint ] + if encrypted + copy_success_file_step.arguments = copy_success_file_step.arguments + [ '--s3ServerSideEncryption' ] + end copy_success_file_step.name << ": Enriched HDFS _SUCCESS -> S3" @jobflow.add_step(copy_success_file_step) end @@ -412,6 +425,9 @@ def initialize(debug, staging, enrich, staging_stream_enrich, shred, es, archive "--srcPattern" , STREAM_ENRICH_REGEXP, "--deleteOnSuccess" ] + if encrypted + staging_step.arguments = staging_step.arguments + [ '--s3ServerSideEncryption' ] + end staging_step.name << ": Stream Enriched #{csbe[:stream]} -> Enriched Staging S3" @jobflow.add_step(staging_step) end @@ -449,6 +465,9 @@ def initialize(debug, staging, enrich, staging_stream_enrich, shred, es, archive "--outputCodec", "none", "--s3Endpoint" , s3_endpoint ] + if encrypted + copy_to_hdfs_step.arguments = copy_to_hdfs_step.arguments + [ '--s3ServerSideEncryption' ] + end copy_to_hdfs_step.name << ": Enriched S3 -> HDFS" @jobflow.add_step(copy_to_hdfs_step) end @@ -500,6 +519,9 @@ def initialize(debug, staging, enrich, staging_stream_enrich, shred, es, archive "--srcPattern" , PARTFILE_REGEXP, "--s3Endpoint" , s3_endpoint ] + output_codec + if encrypted + copy_to_s3_step.arguments = copy_to_s3_step.arguments + [ '--s3ServerSideEncryption' ] + end copy_to_s3_step.name << ": Shredded HDFS -> S3" @jobflow.add_step(copy_to_s3_step) @@ -510,6 +532,9 @@ def initialize(debug, staging, enrich, staging_stream_enrich, shred, es, archive "--srcPattern" , SUCCESS_REGEXP, "--s3Endpoint" , s3_endpoint ] + if encrypted + copy_success_file_step.arguments = copy_success_file_step.arguments + [ '--s3ServerSideEncryption' ] + end copy_success_file_step.name << ": Shredded HDFS _SUCCESS -> S3" @jobflow.add_step(copy_success_file_step) end @@ -529,6 +554,9 @@ def initialize(debug, staging, enrich, staging_stream_enrich, shred, es, archive "--s3Endpoint" , s3_endpoint, "--deleteOnSuccess" ] + if encrypted + archive_raw_step.arguments = archive_raw_step.arguments + [ '--s3ServerSideEncryption' ] + end archive_raw_step.name << ": Raw Staging S3 -> Raw Archive S3" @jobflow.add_step(archive_raw_step) end @@ -542,22 +570,22 @@ def initialize(debug, staging, enrich, staging_stream_enrich, shred, es, archive end if archive_enriched == 'pipeline' - archive_enriched_step = get_archive_step(csbe[:good], csbe[:archive], run_id, s3_endpoint, ": Enriched S3 -> Enriched Archive S3") + archive_enriched_step = get_archive_step(csbe[:good], csbe[:archive], run_id, s3_endpoint, ": Enriched S3 -> Enriched Archive S3", encrypted) @jobflow.add_step(archive_enriched_step) elsif archive_enriched == 'recover' latest_run_id = get_latest_run_id(s3, csbe[:good]) - archive_enriched_step = get_archive_step(csbe[:good], csbe[:archive], latest_run_id, s3_endpoint, ': Enriched S3 -> S3 Enriched Archive') + archive_enriched_step = get_archive_step(csbe[:good], csbe[:archive], latest_run_id, s3_endpoint, ': Enriched S3 -> S3 Enriched Archive', encrypted) @jobflow.add_step(archive_enriched_step) else # skip nil end if archive_shredded == 'pipeline' - archive_shredded_step = get_archive_step(csbs[:good], csbs[:archive], run_id, s3_endpoint, ": Shredded S3 -> Shredded Archive S3") + archive_shredded_step = get_archive_step(csbs[:good], csbs[:archive], run_id, s3_endpoint, ": Shredded S3 -> Shredded Archive S3", encrypted) @jobflow.add_step(archive_shredded_step) elsif archive_shredded == 'recover' latest_run_id = get_latest_run_id(s3, csbs[:good]) - archive_shredded_step = get_archive_step(csbs[:good], csbs[:archive], latest_run_id, s3_endpoint, ": Shredded S3 -> S3 Shredded Archive") + archive_shredded_step = get_archive_step(csbs[:good], csbs[:archive], latest_run_id, s3_endpoint, ": Shredded S3 -> S3 Shredded Archive", encrypted) @jobflow.add_step(archive_shredded_step) else # skip nil @@ -786,10 +814,11 @@ def get_latest_run_id(s3, s3_path) # +archive_path+:: enriched:archive or shredded:archive full S3 path # +run_id_folder+:: run id foler name (2017-05-10-02-45-30, without `=run`) # +name+:: step description to show in EMR console + # +encrypted+:: whether the destination bucket is encrypted # # Returns a step ready for adding to the Elasticity Jobflow. Contract String, String, String, String, String, Bool => Elasticity::S3DistCpStep - def get_archive_step(good_path, archive_path, run_id_folder, s3_endpoint, name) + def get_archive_step(good_path, archive_path, run_id_folder, s3_endpoint, name, encrypted) archive_step = Elasticity::S3DistCpStep.new(legacy = @legacy) archive_step.arguments = [ "--src" , partition_by_run(good_path, run_id_folder), @@ -797,6 +826,9 @@ def get_archive_step(good_path, archive_path, run_id_folder, s3_endpoint, name) "--s3Endpoint" , s3_endpoint, "--deleteOnSuccess" ] + if encrypted + archive_step.arguments = archive_step.arguments + [ '--s3ServerSideEncryption' ] + end archive_step.name << name archive_step end diff --git a/3-enrich/emr-etl-runner/spec/snowplow-emr-etl-runner/resources/sparse_config.yml b/3-enrich/emr-etl-runner/spec/snowplow-emr-etl-runner/resources/sparse_config.yml index 5ba478efe5..f3beae1bb8 100644 --- a/3-enrich/emr-etl-runner/spec/snowplow-emr-etl-runner/resources/sparse_config.yml +++ b/3-enrich/emr-etl-runner/spec/snowplow-emr-etl-runner/resources/sparse_config.yml @@ -9,6 +9,7 @@ aws: assets: spha # DO NOT CHANGE unless you are hosting the jarfiles etc yourself in your own bucket jsonpath_assets: # If you have defined your own JSON Schemas, add the s3:// path to your own JSON Path files in your own bucket here log: s3://not-a-bucket + encrypted: false raw: in: # Multiple in buckets are permitted - ri # e.g. s3://my-archive-bucket/raw diff --git a/3-enrich/emr-etl-runner/spec/snowplow-emr-etl-runner/resources/stream_config.yml b/3-enrich/emr-etl-runner/spec/snowplow-emr-etl-runner/resources/stream_config.yml index c124147372..a2d1fc412a 100644 --- a/3-enrich/emr-etl-runner/spec/snowplow-emr-etl-runner/resources/stream_config.yml +++ b/3-enrich/emr-etl-runner/spec/snowplow-emr-etl-runner/resources/stream_config.yml @@ -9,6 +9,7 @@ aws: assets: spha # DO NOT CHANGE unless you are hosting the jarfiles etc yourself in your own bucket jsonpath_assets: # If you have defined your own JSON Schemas, add the s3:// path to your own JSON Path files in your own bucket here log: s3://not-a-bucket + encrypted: false enriched: good: eg # e.g. s3://my-out-bucket/enriched/good archive: ea # Where to archive enriched events to, e.g. s3://my-out-bucket/enriched/archive From 8a3b15d93cc60fc8cc2180bb3604ee1140c0d914 Mon Sep 17 00:00:00 2001 From: Ben Fradet Date: Tue, 12 Jun 2018 18:19:08 +0200 Subject: [PATCH 15/19] EmrEtlRunner: add ability to specify an EMR security configuration (closes #3798) --- 3-enrich/emr-etl-runner/config/config.yml.sample | 1 + 3-enrich/emr-etl-runner/config/stream_config.yml.sample | 1 + .../emr-etl-runner/lib/snowplow-emr-etl-runner/contracts.rb | 1 + .../emr-etl-runner/lib/snowplow-emr-etl-runner/emr_job.rb | 4 ++++ 4 files changed, 7 insertions(+) diff --git a/3-enrich/emr-etl-runner/config/config.yml.sample b/3-enrich/emr-etl-runner/config/config.yml.sample index 35fbcfbb52..5747da50ae 100644 --- a/3-enrich/emr-etl-runner/config/config.yml.sample +++ b/3-enrich/emr-etl-runner/config/config.yml.sample @@ -33,6 +33,7 @@ aws: placement: ADD HERE # Set this if not running in VPC. Leave blank otherwise ec2_subnet_id: ADD HERE # Set this if running in VPC. Leave blank otherwise ec2_key_name: ADD HERE + security_configuration: ADD HERE # Specify your EMR security configuration if needed. Leave blank otherwise bootstrap: [] # Set this to specify custom boostrap actions. Leave empty otherwise software: hbase: # Optional. To launch on cluster, provide version, "0.92.0", keep quotes. Leave empty otherwise. diff --git a/3-enrich/emr-etl-runner/config/stream_config.yml.sample b/3-enrich/emr-etl-runner/config/stream_config.yml.sample index d8c9043745..2c3f730597 100644 --- a/3-enrich/emr-etl-runner/config/stream_config.yml.sample +++ b/3-enrich/emr-etl-runner/config/stream_config.yml.sample @@ -26,6 +26,7 @@ aws: placement: ADD HERE # Set this if not running in VPC. Leave blank otherwise ec2_subnet_id: ADD HERE # Set this if running in VPC. Leave blank otherwise ec2_key_name: ADD HERE + security_configuration: ADD HERE # Specify your EMR security configuration if needed. Leave blank otherwise bootstrap: [] # Set this to specify custom boostrap actions. Leave empty otherwise software: hbase: # Optional. To launch on cluster, provide version, "0.92.0", keep quotes. Leave empty otherwise. diff --git a/3-enrich/emr-etl-runner/lib/snowplow-emr-etl-runner/contracts.rb b/3-enrich/emr-etl-runner/lib/snowplow-emr-etl-runner/contracts.rb index c474636280..d9032b1d77 100755 --- a/3-enrich/emr-etl-runner/lib/snowplow-emr-etl-runner/contracts.rb +++ b/3-enrich/emr-etl-runner/lib/snowplow-emr-etl-runner/contracts.rb @@ -97,6 +97,7 @@ module EmrEtlRunner :placement => Maybe[String], :ec2_subnet_id => Maybe[String], :ec2_key_name => String, + :security_configuration => Maybe[String], :bootstrap => Maybe[ArrayOf[String]], :software => ({ :hbase => Maybe[String], diff --git a/3-enrich/emr-etl-runner/lib/snowplow-emr-etl-runner/emr_job.rb b/3-enrich/emr-etl-runner/lib/snowplow-emr-etl-runner/emr_job.rb index 2557346d41..94d06e5638 100755 --- a/3-enrich/emr-etl-runner/lib/snowplow-emr-etl-runner/emr_job.rb +++ b/3-enrich/emr-etl-runner/lib/snowplow-emr-etl-runner/emr_job.rb @@ -123,6 +123,10 @@ def initialize(debug, staging, enrich, staging_stream_enrich, shred, es, archive @jobflow.ec2_subnet_id = config[:aws][:emr][:ec2_subnet_id] end + unless config[:aws][:emr][:security_configuration].nil? + @jobflow.security_configuration = config[:aws][:emr][:security_configuration] + end + @jobflow.log_uri = config[:aws][:s3][:buckets][:log] @jobflow.enable_debugging = debug @jobflow.visible_to_all_users = true From 927a35278e9fe6c59d58a0f25ce76eab9f0e2ea7 Mon Sep 17 00:00:00 2001 From: Ben Fradet Date: Wed, 13 Jun 2018 17:26:20 +0200 Subject: [PATCH 16/19] Clojure Collector: bump to 2.1.0 (closes #3801) --- 2-collectors/clojure-collector/java-servlet/project.clj | 2 +- .../java-servlet/war-resources/.ebextensions/server.xml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/2-collectors/clojure-collector/java-servlet/project.clj b/2-collectors/clojure-collector/java-servlet/project.clj index 6ecfb586b5..07e95e26dc 100644 --- a/2-collectors/clojure-collector/java-servlet/project.clj +++ b/2-collectors/clojure-collector/java-servlet/project.clj @@ -13,7 +13,7 @@ ;;;; Copyright: Copyright (c) 2012-2013 Snowplow Analytics Ltd ;;;; License: Apache License Version 2.0 -(defproject snowplow/clojure-collector "2.0.0" ;; MUST also bump version in server.xml +(defproject snowplow/clojure-collector "2.1.0" ;; MUST also bump version in server.xml :license {:name "Apache Version 2.0" :url "http://www.apache.org/licenses/LICENSE-2.0"} :description "A SnowPlow event collector written in Clojure. AWS Elastic Beanstalk compatible." diff --git a/2-collectors/clojure-collector/java-servlet/war-resources/.ebextensions/server.xml b/2-collectors/clojure-collector/java-servlet/war-resources/.ebextensions/server.xml index be895c3f50..f6ac7d5d90 100644 --- a/2-collectors/clojure-collector/java-servlet/war-resources/.ebextensions/server.xml +++ b/2-collectors/clojure-collector/java-servlet/war-resources/.ebextensions/server.xml @@ -135,7 +135,7 @@ + pattern="%{yyyy-MM-dd}t %{HH:mm:ss}t - %b %a %m %h %U %s %{Referer}i %{User-Agent}I %q&cv=clj-2.1.0-%v&nuid=%{sp}C - - - %~ %w" /> From 3371cb6efab22bb7c6f3ec2182cb61d28f0181ce Mon Sep 17 00:00:00 2001 From: Ben Fradet Date: Wed, 13 Jun 2018 17:28:03 +0200 Subject: [PATCH 17/19] EmrEtlRunner: bump to 0.33.0 (closes #3800) --- 3-enrich/emr-etl-runner/lib/snowplow-emr-etl-runner.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/3-enrich/emr-etl-runner/lib/snowplow-emr-etl-runner.rb b/3-enrich/emr-etl-runner/lib/snowplow-emr-etl-runner.rb index d9db70a10f..0258240221 100755 --- a/3-enrich/emr-etl-runner/lib/snowplow-emr-etl-runner.rb +++ b/3-enrich/emr-etl-runner/lib/snowplow-emr-etl-runner.rb @@ -36,6 +36,6 @@ module Snowplow module EmrEtlRunner NAME = "snowplow-emr-etl-runner" - VERSION = "0.32.0" + VERSION = "0.33.0" end end From 56656dd0d813af105869f2d1cb45d836d16848bf Mon Sep 17 00:00:00 2001 From: Ben Fradet Date: Thu, 14 Jun 2018 15:38:16 +0200 Subject: [PATCH 18/19] EmrEtlRunner: check the processing folder for emptiness when resuming from enrich (closes #3803) --- .../emr-etl-runner/lib/snowplow-emr-etl-runner/emr_job.rb | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/3-enrich/emr-etl-runner/lib/snowplow-emr-etl-runner/emr_job.rb b/3-enrich/emr-etl-runner/lib/snowplow-emr-etl-runner/emr_job.rb index 94d06e5638..6dfe823879 100755 --- a/3-enrich/emr-etl-runner/lib/snowplow-emr-etl-runner/emr_job.rb +++ b/3-enrich/emr-etl-runner/lib/snowplow-emr-etl-runner/emr_job.rb @@ -305,6 +305,11 @@ def initialize(debug, staging, enrich, staging_stream_enrich, shred, es, archive raw_input = csbr[:processing] + # When resuming from enrich, we need to check for emptiness of the processing bucket + if !staging and empty?(s3, raw_input) + raise NoDataToProcessError, "No Snowplow logs in #{raw_input}, can't resume from enrich" + end + # for ndjson/urbanairship we can group by everything, just aim for the target size group_by = is_ua_ndjson(collector_format) ? ".*\/(\w+)\/.*" : ".*([0-9]+-[0-9]+-[0-9]+)-[0-9]+.*" From 34cd6ca3f298356b23d06756bdeaabb4d3f4c1d3 Mon Sep 17 00:00:00 2001 From: Ben Fradet Date: Thu, 14 Jun 2018 15:38:55 +0200 Subject: [PATCH 19/19] Prepared for release --- CHANGELOG | 21 +++++++++++++++++++++ README.md | 2 +- VERSION | 2 +- 3 files changed, 23 insertions(+), 2 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index ecf8541886..b09d274015 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,3 +1,24 @@ +Release 108 Val Camonica (2018-07-24) +------------------------------------- +EmrEtlRunner: bump to 0.33.0 (#3800) +EmrEtlRunner: add ability to specify an EMR security configuration (#3798) +EmrEtlRunner: handle SSE-S3 encrypted S3 buckets (#3456) +EmrEtlRunner: replace Sluice by aws-sdk-s3 (#3524) +EmrEtlRunner: add --ignore-lock-on-start option (#3537) +EmrEtlRunner: check the processing folder for emptiness when resuming from enrich (#3803) +EmrEtlRunner: make port in Snowplow monitoring configurable (#3236) +EmrEtlRunner: make protocol in Snowplow monitoring configurable (#3791) +Clojure Collector: bump to 2.1.0 (#3801) +Clojure Collector: make cookie path configurable (#2739) +Clojure Collector: do not allow dependencies requiring an HTTP repository (#3559) +Clojure Collector: bump lein-ring to 0.12.4 (#3783) +Clojure Collector: bump commons-codec to 1.11 (#3782) +Clojure Collector: bump metrics-clojure to 2.10.0 (#3781) +Clojure Collector: bump compojure to 1.6.1 (#3780) +Clojure Collector: bump clojure to 1.9.0 (#3779) +Clojure Collector: bump ring to 1.6.3 (#3778) +Clojure Collector: remove lein-beanstalk (#3784) + Release 107 Trypillia (2018-07-17) ---------------------------------- Enrich: update example config for PII to version 2-0-0 (#3812) diff --git a/README.md b/README.md index 1a51536e8f..73ec5f042b 100644 --- a/README.md +++ b/README.md @@ -76,7 +76,7 @@ limitations under the License. [travis-image]: https://travis-ci.org/snowplow/snowplow.png?branch=master [travis]: http://travis-ci.org/snowplow/snowplow -[release-image]: https://img.shields.io/badge/release-107_Trypillia-orange.svg?style=flat +[release-image]: https://img.shields.io/badge/release-108_Val_Camonica-orange.svg?style=flat [releases]: https://github.com/snowplow/snowplow/releases [license-image]: http://img.shields.io/badge/license-Apache--2-blue.svg?style=flat diff --git a/VERSION b/VERSION index 9fd6ca9538..d333070a13 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -r107-trypillia +r108-val-camonica