Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add options for statistical tracking of success #20

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,9 @@ Each service can be further configured with the following:
* `seconds_before_retry` - The number of seconds to wait before sending a new request when an outage is reported. Every N seconds, a new request will be sent, and if it succeeds the outage will be ended. Defaults to 60.
* `error_threshold` - The percentage of errors over which an outage will be reported. Defaults to 50.
* `data_retention_seconds` - The number of seconds for which data will be stored in Redis for successful and unsuccessful request counts. See below for information on the structure of data within Redis. Defaults to 30 days.
* `min_errors` - At least this many errors need to occur in the observation period before an outage will be reported. Defaults to 1.
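* `success_sample_per` - Sample successes instead of recording every one. With a value of N, each success has a 1-in-N chance of being recorded, and a recorded success increments the count by N; e.g., 5 will increment the success count by 5, 20% (1/5) of the time. Reduces write traffic to Redis. Defaults to 1 (no sampling).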
* `seconds_between_outage_checks` - Check Redis for a recorded outage at most once per this many seconds. Reduces read traffic to Redis. Defaults to 0 (always check).

### Client

Expand Down
47 changes: 39 additions & 8 deletions lib/breakers/service.rb
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,10 @@ class Service
# Defaults applied to every option the caller does not supply.
DEFAULT_OPTS = {
  seconds_before_retry: 60,
  error_threshold: 50,
  data_retention_seconds: 60 * 60 * 24 * 30, # 30 days
  min_errors: 1,
  success_sample_per: 1,
  seconds_between_outage_checks: 0
}.freeze

# Create a new service
Expand All @@ -17,9 +20,14 @@ class Service
# @option opts [Integer] :seconds_before_retry The number of seconds to wait after an outage begins before testing with a new request
# @option opts [Integer] :error_threshold The percentage of errors over the last two minutes that indicates an outage
# @option opts [Integer] :data_retention_seconds The number of seconds to retain success and error data in Redis
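# @option opts [Integer] :min_errors The minimum number of errors in the observation period before an outage is reported
# @option opts [Integer] :success_sample_per Record roughly one in every N successes, counting each recorded success as N (1 disables sampling)
# @option opts [Integer] :seconds_between_outage_checks Check Redis for a recorded outage at most once per this many seconds (0 always checks)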
# @option opts [Proc] :exception_handler A proc taking an exception and returns true if it represents an error on the service
def initialize(opts)
  @configuration = DEFAULT_OPTS.merge(opts)
  # Coerce the sampling interval to an Integer and keep it within 1..1_000_000
  # so a missing, zero, negative or absurdly large value cannot break sampling.
  requested = @configuration[:success_sample_per].to_i
  @configuration[:success_sample_per] = requested.clamp(1, 1_000_000)
  @configuration
end

# Get the name of the service
Expand All @@ -44,6 +52,13 @@ def seconds_before_retry
@configuration[:seconds_before_retry]
end

# Get the configured success sampling interval (the :success_sample_per option)
#
# @return [Integer] the value stored under :success_sample_per in the configuration
def success_sample_per
@configuration[:success_sample_per]
end

# Returns true if a given exception represents an error with the service
#
# @return [Boolean] is it an error?
Expand All @@ -59,7 +74,11 @@ def add_error

# Indicate that a successful response has occurred.
#
# When success_sample_per is 1 every success is counted. Otherwise a success
# is recorded only when a random draw falls below 1/N, and is then counted as
# N, so the stored total is a statistical estimate of the true count.
def add_success
  sample_per = success_sample_per
  if sample_per == 1
    increment_key(key: successes_key)
  elsif rand < 1.0 / sample_per
    increment_key(key: successes_key, by: sample_per)
  end
end

# Force an outage to begin on the service. Forced outages are not periodically retested.
Expand All @@ -75,9 +94,16 @@ def end_forced_outage!
end
end

# Return the most recent outage on the service, throttled to reference
# redis at most once every `seconds_between_outage_checks`. Between checks the
# previously fetched value is returned, so it can lag by up to that many seconds.
def latest_outage
  throttle = @configuration[:seconds_between_outage_checks]
  now = Time.now
  # Serve the cached value while the last fetch is still inside the throttle window.
  return @latest_outage if @latest_outage_fetched && @latest_outage_fetched > now - throttle

  @latest_outage_fetched = now
  @latest_outage = Outage.find_latest(service: self)
end

# Return a list of all outages in the given time range
Expand All @@ -93,7 +119,7 @@ def outages_in_range(start_time:, end_time:)
)
end

# Return data about the successful request counts in the time range
# Return data about the successful request counts in the time range (a statistical estimate)
#
# @param start_time [Time] the beginning of the range
# @param end_time [Time] the end of the range
Expand Down Expand Up @@ -142,9 +168,13 @@ def values_in_range(start_time:, end_time:, type:, sample_minutes:)
end
end

# Increment a counter key and refresh its expiry inside a single `multi` block.
#
# @param key [String] the Redis key to increment
# @param by [Integer] the amount to add; 1 issues `incr`, any other value `incrby`
def increment_key(key:, by: 1)
  Breakers.client.redis_connection.multi do |pipeline|
    if by == 1
      pipeline.incr(key)
    else
      pipeline.incrby(key, by)
    end
    # Re-arm the TTL on every write so the key outlives its last update by
    # data_retention_seconds.
    pipeline.expire(key, @configuration[:data_retention_seconds])
  end
end
Expand All @@ -164,8 +194,9 @@ def maybe_create_outage
end
failure_count = data[0].to_i + data[1].to_i
success_count = data[2].to_i + data[3].to_i
return if failure_count < @configuration[:min_errors]

if failure_count > 0 && success_count == 0
if success_count == 0
Outage.create(service: self)
else
failure_rate = failure_count / (failure_count + success_count).to_f
Expand Down
158 changes: 156 additions & 2 deletions spec/integration_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,28 @@
expect(service.latest_outage).to be
end

# Exercises the `min_errors` option: with a minimum of 3, no outage should be
# recorded until at least three errors have occurred.
# NOTE(review): assumes `connection.get '/'` produces an error response here,
# as stubbed by the enclosing context outside this view — confirm.
context 'with min_errors' do
# Same service as the surrounding examples, plus min_errors: 3.
let(:service) do
Breakers::Service.new(
name: 'VA',
request_matcher: proc { |request_env| request_env.url.host =~ /.*va.gov/ },
seconds_before_retry: 60,
error_threshold: 50,
min_errors: 3
)
end

# One request is below the min_errors threshold of 3.
it 'does not create an outage with a single error' do
connection.get '/'
expect(service.latest_outage).to be_nil
end

# Three requests reach the min_errors threshold of 3.
it 'creates an outage after many errors' do
3.times { connection.get '/' }
expect(service.latest_outage).to be_truthy
end
end

it 'logs the error' do
expect(logger).to receive(:warn).with(
msg: 'Breakers failed request', service: 'VA', url: 'http://va.gov/', error: 500
Expand Down Expand Up @@ -290,11 +312,12 @@
end
end

context 'there is a completed outage' do
context 'there is a completed outage with guaranteed success INCRs' do
let(:start_time) { Time.now.utc - (60 * 60) }
let(:end_time) { Time.now.utc - 60 }
let(:now_time) { Time.now.utc }
before do
service.instance_variable_get(:@configuration)[:success_sample_per] = 1
Timecop.freeze(now_time)
redis.zadd('VA-outages', start_time.to_i, MultiJson.dump(start_time: start_time.to_i, end_time: end_time))
stub_request(:get, 'va.gov').to_return(status: 200)
Expand All @@ -312,7 +335,86 @@
expect(count).to eq('1')
end

it 'informs the plugin about the success' do
it 'adds two successes to redis' do
response = connection.get '/'
response = connection.get '/'
rounded_time = now_time.to_i - (now_time.to_i % 60)
count = redis.get("VA-successes-#{rounded_time}")
expect(count).to eq('2')
end

it 'informs the plugin about a success' do
expect(plugin).to receive(:on_success).with(service, instance_of(Faraday::Env), instance_of(Faraday::Env))
connection.get '/'
end

it 'should not tell the plugin about a skipped request' do
expect(plugin).not_to receive(:on_skipped_request)
connection.get '/'
end
end

context 'there is a completed outage with pseudo-random success INCRs' do
let(:start_time) { Time.now.utc - (60 * 60) }
let(:end_time) { Time.now.utc - 60 }
let(:now_time) { Time.now.utc }
before do
service.instance_variable_get(:@configuration)[:success_sample_per] = 2
Timecop.freeze(now_time)
redis.zadd('VA-outages', start_time.to_i, MultiJson.dump(start_time: start_time.to_i, end_time: end_time))
stub_request(:get, 'va.gov').to_return(status: 200)
end

# Run the given block with Ruby warnings disabled ($VERBOSE = nil) and return
# the block's result. The previous verbosity is restored even when the block
# raises, so a failure cannot leave warnings switched off for later code.
def silence_warnings
  original_verbosity = $VERBOSE
  $VERBOSE = nil
  yield
ensure
  $VERBOSE = original_verbosity
end

# Wrap the examples to ensure exactly half of status messages get written
# to (our mocked in-memory) redis, alternating, starting with false.
around(:example) do |example|
# presumably silenced because reopening the class below triggers Ruby
# warnings — TODO confirm which ones
silence_warnings do
# Reopen the class under test so `rand` called inside it is deterministic.
# Each call rotates the two-element list and returns its new last element,
# so successive calls yield 0.75, 0.25, 0.75, 0.25, ...
# The list is reassigned here, so every example starts the sequence at 0.75.
class Breakers::Service
@@_fake_rand = [0.75, 0.25]
def rand
@@_fake_rand.push(@@_fake_rand.shift)
@@_fake_rand[-1]
end
end
end
result = example.run
silence_warnings do
# Remove the override again once the example has run.
class Breakers::Service
remove_method :rand
end
end
result
end

it 'makes the request' do
  expect(connection.get('/').status).to eq(200)
end

it 'adds success to redis after every other request' do
  minute_start = now_time.to_i - (now_time.to_i % 60)
  # Stored count expected after the 1st, 2nd, 3rd and 4th request.
  [nil, '2', '2', '4'].each do |expected_count|
    connection.get '/'
    expect(redis.get("VA-successes-#{minute_start}")).to eq(expected_count)
  end
end

it 'informs the plugin about a success regardless of sample_per' do
  expect(plugin).to receive(:on_success).with(
    service,
    instance_of(Faraday::Env),
    instance_of(Faraday::Env)
  )
  connection.get '/'
end
Expand Down Expand Up @@ -417,6 +519,37 @@

context 'with a bunch of successes over the last few minutes' do
let(:now) { Time.now.utc }
before do
service.instance_variable_get(:@configuration)[:success_sample_per] = 2
end

# Run the given block with Ruby warnings disabled ($VERBOSE = nil) and return
# the block's result. The previous verbosity is restored even when the block
# raises, so a failure cannot leave warnings switched off for later code.
def silence_warnings
  original_verbosity = $VERBOSE
  $VERBOSE = nil
  yield
ensure
  $VERBOSE = original_verbosity
end

# Wrap the examples to ensure exactly half of status messages get written
# to (our mocked in-memory) redis, alternating, starting with false.
around(:example) do |example|
# presumably silenced because reopening the class below triggers Ruby
# warnings — TODO confirm which ones
silence_warnings do
# Reopen the class under test so `rand` called inside it is deterministic.
# Each call rotates the two-element list and returns its new last element,
# so successive calls yield 0.75, 0.25, 0.75, 0.25, ...
# The list is reassigned here, so every example starts the sequence at 0.75.
class Breakers::Service
@@_fake_rand = [0.75, 0.25]
def rand
@@_fake_rand.push(@@_fake_rand.shift)
@@_fake_rand[-1]
end
end
end
result = example.run
silence_warnings do
# Remove the override again once the example has run.
class Breakers::Service
remove_method :rand
end
end
result
end

before do
Timecop.freeze(now - 90)
Expand Down Expand Up @@ -543,4 +676,25 @@
expect(response.env[:duration]).to be
end
end

# Exercises `seconds_between_outage_checks`: four `latest_outage` calls, two
# before and two after a 10-second jump, should reach redis only twice.
context 'with throttling of outage checks' do
# Lazily evaluated: first referenced by the Timecop.freeze call below, i.e.
# after the first two latest_outage calls have already run.
let(:now) { Time.now.utc }
let(:service) do
Breakers::Service.new(
name: 'VA',
request_matcher: proc { |request_env| request_env.url.host =~ /.*va.gov/ },
seconds_before_retry: 60,
error_threshold: 50,
seconds_between_outage_checks: 10
)
end

it 'only checks for outages once every 10 seconds' do
# presumably each un-throttled latest_outage call issues exactly one
# zrange — verify against Outage.find_latest
expect(redis).to receive(:zrange).twice.and_return([])
2.times { service.latest_outage }
Timecop.freeze(now + 10)
2.times { service.latest_outage }
# NOTE(review): this line is skipped if anything above raises, which would
# leave time frozen unless a hook outside this view also unfreezes it — confirm.
Timecop.return
end
end
end