Skip to content

Commit

Permalink
Merge pull request #18 from Financial-Times/fix/0-replicas-healthcheck
Browse files Browse the repository at this point in the history
Services with 0 desired replicas should appear as healthy
  • Loading branch information
Mihai Sorin Moisa authored Feb 13, 2018
2 parents 9181b39 + 7013ff6 commit 9ebd3cf
Show file tree
Hide file tree
Showing 13 changed files with 193 additions and 87 deletions.
20 changes: 15 additions & 5 deletions cachingController.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,11 @@ package main

import (
"fmt"
fthealth "github.com/Financial-Times/go-fthealth/v1_1"
"math"
"reflect"
"time"

fthealth "github.com/Financial-Times/go-fthealth/v1_1"
)

const (
Expand All @@ -19,7 +20,7 @@ func newMeasuredService(service service) measuredService {
return measuredService{service, cachedHealth, bufferedHealths}
}

func (c *healthCheckController) collectChecksFromCachesFor(categories map[string]category) ([]fthealth.CheckResult, map[string][]fthealth.CheckResult) {
func (c *healthCheckController) collectChecksFromCachesFor(categories map[string]category) ([]fthealth.CheckResult, map[string][]fthealth.CheckResult, error) {
var checkResults []fthealth.CheckResult
categorisedResults := make(map[string][]fthealth.CheckResult)
serviceNames := getServiceNamesFromCategories(categories)
Expand All @@ -35,11 +36,14 @@ func (c *healthCheckController) collectChecksFromCachesFor(categories map[string
}

if len(servicesThatAreNotInCache) != 0 {
notCachedChecks := c.runServiceChecksByServiceNames(servicesThatAreNotInCache, categories)
notCachedChecks, err := c.runServiceChecksByServiceNames(servicesThatAreNotInCache, categories)
if err != nil {
return nil, nil, err
}
checkResults = append(checkResults, notCachedChecks...)
}

return checkResults, categorisedResults
return checkResults, categorisedResults, nil
}

func (c *healthCheckController) updateCachedHealth(services map[string]service, categories map[string]category) {
Expand Down Expand Up @@ -85,10 +89,16 @@ func (c *healthCheckController) scheduleCheck(mService measuredService, refreshP
}

// run check
deployments, err := c.healthCheckService.getDeployments()
if err != nil {
errorLogger.Printf("Cannot run scheduled health check: %s", err.Error())
return
}

serviceToBeChecked := mService.service

var checks []fthealth.Check
checks = append(checks, newServiceHealthCheck(serviceToBeChecked, c.healthCheckService))
checks = append(checks, newServiceHealthCheck(serviceToBeChecked, deployments, c.healthCheckService))

checkResult := fthealth.RunCheck(fthealth.HealthCheck{
serviceToBeChecked.name,
Expand Down
38 changes: 26 additions & 12 deletions checkerService.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,10 @@ import (
"encoding/json"
"errors"
"fmt"
fthealth "github.com/Financial-Times/go-fthealth/v1_1"
"io/ioutil"
"net/http"

fthealth "github.com/Financial-Times/go-fthealth/v1_1"
)

type healthcheckResponse struct {
Expand All @@ -18,11 +19,11 @@ type healthcheckResponse struct {
}
}

func (hs *k8sHealthcheckService) checkServiceHealth(service service) (string, error) {
func (hs *k8sHealthcheckService) checkServiceHealth(service service, deployments map[string]deployment) (string, error) {
var err error
pods, err := hs.getPodsForService(service.name)
if err != nil {
return "", fmt.Errorf("Cannot retrieve pods for service with name %s to perform healthcheck, error was: %s", service.name, err)
return "", fmt.Errorf("cannot retrieve pods for service with name %s to perform healthcheck: %s", service.name, err.Error())
}

noOfUnavailablePods := 0
Expand All @@ -34,24 +35,37 @@ func (hs *k8sHealthcheckService) checkServiceHealth(service service) (string, er
}

totalNoOfPods := len(pods)
outputMsg := fmt.Sprintf("%v/%v pods available", totalNoOfPods - noOfUnavailablePods, totalNoOfPods)
if totalNoOfPods == 0 || noOfUnavailablePods != 0 {
outputMsg := fmt.Sprintf("%v/%v pods available", totalNoOfPods-noOfUnavailablePods, totalNoOfPods)

if noOfUnavailablePods != 0 {
return "", errors.New(outputMsg)
}
if service.isDaemon {
if totalNoOfPods == 0 {
return "", errors.New(outputMsg)
}
} else {
if _, exists := deployments[service.name]; !exists {
return "", fmt.Errorf("cannot find deployment for service with name %s", service.name)
}
if totalNoOfPods == 0 && deployments[service.name].desiredReplicas != 0 {
return "", errors.New(outputMsg)
}
}

return outputMsg, nil
}

func (hs *k8sHealthcheckService) checkPodHealth(pod pod, appPort int32) error {
health, err := hs.getHealthChecksForPod(pod, appPort)
if err != nil {
errorLogger.Printf("Cannot perform healthcheck for pod with name %s. Error was: %s", pod.name, err.Error())
return errors.New("Cannot perform healthcheck for pod")
errorLogger.Printf("Cannot perform healthcheck for pod with name %s: %s", pod.name, err.Error())
return errors.New("cannot perform healthcheck for pod")
}

for _, check := range health.Checks {
if !check.OK {
return fmt.Errorf("Failing check is: %s", check.Name)
return fmt.Errorf("failing check is: %s", check.Name)
}
}

Expand All @@ -62,7 +76,7 @@ func (hs *k8sHealthcheckService) getIndividualPodSeverity(pod pod, appPort int32
health, err := hs.getHealthChecksForPod(pod, appPort)

if err != nil {
return defaultSeverity, fmt.Errorf("Cannot get severity for pod with name %s. Error was: %s", pod.name, err.Error())
return defaultSeverity, fmt.Errorf("cannot get severity for pod with name %s: %s", pod.name, err.Error())
}

finalSeverity := uint8(2)
Expand Down Expand Up @@ -97,7 +111,7 @@ func (hs *k8sHealthcheckService) getHealthChecksForPod(pod pod, appPort int32) (
}()

if resp.StatusCode != 200 {
return healthcheckResponse{}, fmt.Errorf("Healthcheck endpoint returned non-200 status (%v)", resp.StatusCode)
return healthcheckResponse{}, fmt.Errorf("healthcheck endpoint returned non-200 status (%v)", resp.StatusCode)
}

body, err := ioutil.ReadAll(resp.Body)
Expand Down Expand Up @@ -133,15 +147,15 @@ func newPodHealthCheck(pod pod, service service, healthcheckService healthcheckS
}
}

func newServiceHealthCheck(service service, healthcheckService healthcheckService) fthealth.Check {
func newServiceHealthCheck(service service, deployments map[string]deployment, healthcheckService healthcheckService) fthealth.Check {
return fthealth.Check{
BusinessImpact: "On its own this failure does not have a business impact but it represents a degradation of the cluster health.",
Name: service.name,
PanicGuide: "https://sites.google.com/a/ft.com/technology/systems/dynamic-semantic-publishing/coco/runbook",
Severity: defaultSeverity,
TechnicalSummary: "The service is not healthy. Please check the panic guide.",
Checker: func() (string, error) {
return healthcheckService.checkServiceHealth(service)
return healthcheckService.checkServiceHealth(service, deployments)
},
}
}
46 changes: 29 additions & 17 deletions controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,11 @@ package main

import (
"fmt"
fthealth "github.com/Financial-Times/go-fthealth/v1_1"
"sort"
"sync"
"time"

fthealth "github.com/Financial-Times/go-fthealth/v1_1"
)

type healthCheckController struct {
Expand All @@ -16,11 +17,11 @@ type healthCheckController struct {

type controller interface {
buildServicesHealthResult([]string, bool) (fthealth.HealthResult, map[string]category, map[string]category, error)
runServiceChecksByServiceNames(map[string]service, map[string]category) []fthealth.CheckResult
runServiceChecksFor(map[string]category) ([]fthealth.CheckResult, map[string][]fthealth.CheckResult)
runServiceChecksByServiceNames(map[string]service, map[string]category) ([]fthealth.CheckResult, error)
runServiceChecksFor(map[string]category) ([]fthealth.CheckResult, map[string][]fthealth.CheckResult, error)
buildPodsHealthResult(string) (fthealth.HealthResult, error)
runPodChecksFor(string) ([]fthealth.CheckResult, error)
collectChecksFromCachesFor(map[string]category) ([]fthealth.CheckResult, map[string][]fthealth.CheckResult)
collectChecksFromCachesFor(map[string]category) ([]fthealth.CheckResult, map[string][]fthealth.CheckResult, error)
updateCachedHealth(map[string]service, map[string]category)
scheduleCheck(measuredService, time.Duration, *time.Timer)
getIndividualPodHealth(string) ([]byte, string, error)
Expand Down Expand Up @@ -54,27 +55,27 @@ func (c *healthCheckController) updateStickyCategory(categoryName string, isEnab

func (c *healthCheckController) removeAck(serviceName string) error {
if !c.healthCheckService.isServicePresent(serviceName) {
return fmt.Errorf("Cannot find service with name %s", serviceName)
return fmt.Errorf("cannot find service with name %s", serviceName)
}

err := c.healthCheckService.removeAck(serviceName)

if err != nil {
return fmt.Errorf("Failed to remove ack for service %s. Error was: %s", serviceName, err.Error())
return fmt.Errorf("failed to remove ack for service %s: %s", serviceName, err.Error())
}

return nil
}

func (c *healthCheckController) addAck(serviceName string, ackMessage string) error {
if !c.healthCheckService.isServicePresent(serviceName) {
return fmt.Errorf("Cannot find service with name %s", serviceName)
return fmt.Errorf("cannot find service with name %s", serviceName)
}

err := c.healthCheckService.addAck(serviceName, ackMessage)

if err != nil {
return fmt.Errorf("Failed to add ack message [%s] for service %s. Error was: %s", ackMessage, serviceName, err.Error())
return fmt.Errorf("failed to add ack message [%s] for service %s: %s", ackMessage, serviceName, err.Error())
}

return nil
Expand All @@ -85,16 +86,19 @@ func (c *healthCheckController) buildServicesHealthResult(providedCategories []s
desc := "Health of the whole cluster of the moment served without cache."
availableCategories, err := c.healthCheckService.getCategories()
if err != nil {
return fthealth.HealthResult{}, nil, nil, fmt.Errorf("Cannot build health check result for services. Error was: %v", err.Error())
return fthealth.HealthResult{}, nil, nil, fmt.Errorf("cannot build health check result for services: %v", err.Error())
}

matchingCategories := getMatchingCategories(providedCategories, availableCategories)

if useCache {
desc = "Health of the whole cluster served from cache."
checkResults, _ = c.collectChecksFromCachesFor(matchingCategories)
checkResults, _, err = c.collectChecksFromCachesFor(matchingCategories)
} else {
checkResults, _ = c.runServiceChecksFor(matchingCategories)
checkResults, _, err = c.runServiceChecksFor(matchingCategories)
}
if err != nil {
return fthealth.HealthResult{}, nil, nil, fmt.Errorf("cannot build health check result for services: %v", err.Error())
}

c.disableStickyFailingCategories(matchingCategories, checkResults)
Expand All @@ -115,11 +119,16 @@ func (c *healthCheckController) buildServicesHealthResult(providedCategories []s
return health, matchingCategories, nil, nil
}

func (c *healthCheckController) runServiceChecksByServiceNames(services map[string]service, categories map[string]category) []fthealth.CheckResult {
func (c *healthCheckController) runServiceChecksByServiceNames(services map[string]service, categories map[string]category) ([]fthealth.CheckResult, error) {
var checks []fthealth.Check

deployments, err := c.healthCheckService.getDeployments()
if err != nil {
return nil, err
}

for _, service := range services {
check := newServiceHealthCheck(service, c.healthCheckService)
check := newServiceHealthCheck(service, deployments, c.healthCheckService)
checks = append(checks, check)
}

Expand Down Expand Up @@ -156,16 +165,19 @@ func (c *healthCheckController) runServiceChecksByServiceNames(services map[stri

c.updateCachedHealth(services, categories)

return healthChecks
return healthChecks, nil
}

func (c *healthCheckController) runServiceChecksFor(categories map[string]category) ([]fthealth.CheckResult, map[string][]fthealth.CheckResult) {
func (c *healthCheckController) runServiceChecksFor(categories map[string]category) ([]fthealth.CheckResult, map[string][]fthealth.CheckResult, error) {
categorisedResults := make(map[string][]fthealth.CheckResult)
serviceNames := getServiceNamesFromCategories(categories)
services := c.healthCheckService.getServicesMapByNames(serviceNames)
healthChecks := c.runServiceChecksByServiceNames(services, categories)
healthChecks, err := c.runServiceChecksByServiceNames(services, categories)
if err != nil {
return nil, nil, err
}

return healthChecks, categorisedResults
return healthChecks, categorisedResults, nil
}

func (c *healthCheckController) disableStickyFailingCategories(categories map[string]category, healthChecks []fthealth.CheckResult) {
Expand Down
18 changes: 15 additions & 3 deletions controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,13 @@ package main
import (
"errors"
"fmt"
fthealth "github.com/Financial-Times/go-fthealth/v1_1"
"github.com/stretchr/testify/assert"
"net/http"
"os"
"testing"
"time"

fthealth "github.com/Financial-Times/go-fthealth/v1_1"
"github.com/stretchr/testify/assert"
)

const (
Expand Down Expand Up @@ -51,6 +52,17 @@ func (m *MockService) updateCategory(categoryName string, isEnabled bool) error
return nil
}

func (m *MockService) getDeployments() (map[string]deployment, error) {
return map[string]deployment{
"test-service-name": {
desiredReplicas: 2,
},
"test-service-name-2": {
desiredReplicas: 2,
},
}, nil
}

func (m *MockService) isServicePresent(serviceName string) bool {
if serviceName == nonExistingServiceName {
return false
Expand Down Expand Up @@ -123,7 +135,7 @@ func (m *MockService) getPodByName(podName string) (pod, error) {
}, nil
}

func (m *MockService) checkServiceHealth(service service) (string, error) {
func (m *MockService) checkServiceHealth(service service, deployments map[string]deployment) (string, error) {
return "", errors.New("Error reading healthcheck response: ")
}

Expand Down
3 changes: 2 additions & 1 deletion graphiteFeeder.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,11 @@ package main
import (
"errors"
"fmt"
fthealth "github.com/Financial-Times/go-fthealth/v1_1"
"net"
"strings"
"time"

fthealth "github.com/Financial-Times/go-fthealth/v1_1"
)

const (
Expand Down
5 changes: 3 additions & 2 deletions graphiteFeeder_test.go
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
package main

import (
fthealth "github.com/Financial-Times/go-fthealth/v1_1"
"github.com/stretchr/testify/assert"
"net"
"testing"
"time"

fthealth "github.com/Financial-Times/go-fthealth/v1_1"
"github.com/stretchr/testify/assert"
)

type DummyConnection struct {
Expand Down
3 changes: 2 additions & 1 deletion handler.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,12 @@ package main
import (
"encoding/json"
"fmt"
fthealth "github.com/Financial-Times/go-fthealth/v1_1"
"html/template"
"net/http"
"net/url"
"strings"

fthealth "github.com/Financial-Times/go-fthealth/v1_1"
)

type httpHandler struct {
Expand Down
Loading

0 comments on commit 9ebd3cf

Please sign in to comment.