From 12b3037abbb1631f62337e21e43ae131b59056a4 Mon Sep 17 00:00:00 2001 From: David Balazs Date: Wed, 3 May 2017 16:20:18 +0300 Subject: [PATCH] =?UTF-8?q?Removed=20path=20prefix=20for=20gtg=20endpoint,?= =?UTF-8?q?=20logged=20input=20params=20and=20updated=E2=80=A6=20(#8)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Removed path prefix for gtg endpoint, logged input params and updated documentation * Fixed log * Added more tests * Moved gtg handler a few lines up in the file to emphasize the fact that it uses the main router * Decreased timeout value for httpClient * Added parallel severity check * Fixed pod severity checking, decreased httpClient timeout * Fixed tests * Fixed tests and code format * Updated readme --- README.md | 92 ++++++++++++++++++++++++++++++++++++---------- checkerService.go | 2 +- controller.go | 26 ++++++++----- controller_test.go | 10 ++++- handler_test.go | 2 +- main.go | 6 +-- podController.go | 27 +++++++++----- service.go | 9 ++--- service_test.go | 14 +++---- 9 files changed, 129 insertions(+), 59 deletions(-) diff --git a/README.md b/README.md index 1c58c6f..84b523d 100644 --- a/README.md +++ b/README.md @@ -1,30 +1,32 @@ # upp-aggregate-healthcheck [![Circle CI](https://circleci.com/gh/Financial-Times/upp-aggregate-healthcheck.svg?style=shield)](https://circleci.com/gh/Financial-Times/upp-aggregate-healthcheck) [![Go Report Card](https://goreportcard.com/badge/github.com/Financial-Times/upp-aggregate-healthcheck)](https://goreportcard.com/report/github.com/Financial-Times/upp-aggregate-healthcheck) [![Coverage Status](https://coveralls.io/repos/github/Financial-Times/upp-aggregate-healthcheck/badge.svg)](https://coveralls.io/github/Financial-Times/upp-aggregate-healthcheck) -The purpose of this service is to serve the functionality of aggregating healthchecks from services and pods in the Kubernetes cluster. 
+The purpose of this service is to aggregate the healthchecks from services and pods in the Kubernetes cluster. -## Usage +## Introduction In this section, the aggregate-healthcheck functionalities are described. ### Get services health A service is considered to be healthy if it has at least one pod that is able to serve requests. To determine which pods are able to serve requests, - there is a readinessProbe configured on each of them, which checks the GoodToGo endpoint of the app running inside the pod. If the GoodToGo responds + there is a readinessProbe configured on the deployment, which checks the GoodToGo endpoint of the app running inside the pod. If the GoodToGo responds with a 503 Service Unavailable status code, the pod will not serve requests anymore, until it will receive 200 OK status code on GoodToGo endpoint. For a service, if there is at least one pod that can serve requests, the service will be considered healthy, but if there are pods that are unavailable, a message will be displayed in the "Output" section of the corresponding service. + As an exception, if a service is marked as non-resilient (it has the __isResilient: "false"__ label), it will be considered unhealthy if there is at least one pod which is unhealthy. Not that for services are grouped into categories, therefore there is the possibility to query the aggregate-healthcheck only for a certain list of categories. If no category is provided, the healthchecks of all services will be displayed. ### Get pods health for a service - Pods health is evaluated by querying the health endpoint of apps inside the pods. Given a pod, if there is at least one check that fails, - the pod health will be considered warning or critical, based on the severity level of the check that fails. + The healths of the pods are evaluated by querying the __health endpoint of apps inside the pods. 
Given a pod, if there is at least one check that fails, + the pod health will be considered warning or critical, based on the severity level of the check that fails. ### Acknowledge a service When a service is unhealthy, there is a possibility to acknowledge the warning. By acknowledging all the services that are unhealthy, the general status of the aggregate-healthcheck will become healthy (it will also mention that there are 'n' services acknowledged). ### Sticky categories Categories can be sticky, meaning that if one of the services become unhealthy, the category will be disabled, meaning that it will be unhealthy, until manual re-enabling it. There is an endpoint for enabling a category. + ## Running locally To run the service locally, you will need to run the following commands first to get the vendored dependencies for this project: `go get github.com/kardianos/govendor` and @@ -35,6 +37,35 @@ To run the service locally, you will need to run the following commands first to The list of functionalities that can be used outside of the cluster are: * Add/Remove acknowledge * Enable/Disable sticky categories + +## Build and deployment + To build Docker images for this service, use the following repo: [coco/upp-aggregate-healthcheck](https://hub.docker.com/r/coco/upp-aggregate-healthcheck/) +## How to configure services for aggregate-healthcheck + For a service to be taken into consideration by aggregate-healthcheck it needs to have the following: + * The Kubernetes service should have __hasHealthcheck: "true"__ label. + * The container should have Kubernetes `readinessProbe` configured to check the `__gtg` endpoint of the app + * The app should have `__gtg` and `__health` endpoints. + * Optionally the Kubernetes service can have: + - `isResilient: "false"` label which will cause the service to be unhealthy if there is at least one pod that is unhealthy. 
Default value for `isResilient` flag is `true` + - `isDaemon: "true"` label which indicates that the pods are managed by a daemonSet instead of a deployment. Default value for `isDaemon` flag is `false`, meaning that pods are managed by a Deployment. + +## How to configure categories for aggregate-healthcheck + Categories are stored in Kubernetes ConfigMaps. + The template of a ConfigMap for a category is shown below: +
+  kind: ConfigMap
+      apiVersion: v1
+      metadata:
+        name: category.CATEGORY-NAME # name of the category
+        labels:
+          healthcheck-categories-for: aggregate-healthcheck # this flag is used by aggregate-healthcheck service to pick up only ConfigMaps that store categories.
+      data:
+        category.name: CATEGORY-NAME # name of the category
+        category.services: serviceName1, serviceName2, serviceName3 # services that belong to this category
+        category.refreshrate: "60" # refresh rate in seconds for cache (by default it is 60)
+        category.issticky: "false" # boolean flag that marks category as sticky. By default this flag is set to false.
+        category.enabled: "true" # boolean flag that marks category as enabled. By default, this flag is set to true.
+  
## Endpoints In the following section, aggregate-healthcheck endpoints are described. Note that this app has two options of retrieving healthchecks: @@ -42,38 +73,59 @@ To run the service locally, you will need to run the following commands first to - `HTML format` - this is the default format of displaying healthchecks. ### Service endpoints - * `__health` - Perform services healthcheck. + Note that there is a configurable __pathPrefix__ which will be the prefix of each endpoint's path (E.g. if the + prefix is `__health`, the endpoint path for add-ack is `__health/add-ack`). The default value for __pathPrefix__ is the empty string. + In the provided examples, it is assumed that the __pathPrefix__ is `__health`. + * `__gtg` - the GoodToGo endpoint + - params: + - `categories` - the healthcheck will be performed on the services belonging to the provided categories. + - `cache` - if set to false, the healthchecks will be performed without the help of cache. By default, the cache is used. + - returns a __503 Service Unavailable__ status code in the following cases: + - if at least one of the provided categories is disabled (see sticky functionality) + - if at least one of the checked services is unhealthy + - returns a __200 OK__ status code otherwise + - example: + `localhost:8080/__gtg?cache=false&categories=read,publish` + * `/__health` or simply `` - Perform services healthcheck. - params: - `categories` - the healthcheck will be performed on the services belonging to the provided categories. - `cache` - if set to false, the healthchecks will be performed without the help of cache. By default, the cache is used. - * `__pods-health` - Perform pods healthcheck for a service. + - example: + `localhost:8080/__health?cache=false&categories=read,publish` + * `/__pods-health` - Perform pods healthcheck for a service. - params: - `service-name` - The healthcheck will be performed only for pods belonging to the provided service. 
- * `__pod-individual-health` - Retrieves the healthchecks of the app running inside the pod. + - example: + `localhost:8080/__health/__pods-health?service-name=api-policy-component` + * `/__pod-individual-health` - Retrieves the healthchecks of the app running inside the pod. - params: - `pod-name` - The name of the pod for which the healthchecks will be retrieved. - * `add-ack` - (POST) Acknowledges a service + - example: + `localhost:8080/__health/__pod-individual-health?pod-name=api-policy-component2912-12341` + * `/add-ack` - (POST) Acknowledges a service - params: - `service-name` - The service to be acknowledged. + - example: + `localhost:8080/__health/add-ack?service-name=api-policy-component` (request body: `ack-msg=this is the message for ack`) - request body: - `ack-msg` the acknowledge message. - * `rem-ack` - Removes the acknowledge of a service + * `/rem-ack` - Removes the acknowledge of a service - params: - `service-name` - The service to be updated. - * `enable-category` - Enables a category. This is used for sticky categories which are unhealthy. + - example: + `localhost:8080/__health/rem-ack?service-name=api-policy-component` + * `/enable-category` - Enables a category. This is used for sticky categories which are unhealthy. - params: - `category-name` - The category to be enabled. - * `disable-category` - Disables a category. This is useful when doing a failover. + - example: + `localhost:8080/__health/enable-category?category-name=read` + * `/disable-category` - Disables a category. This is useful when doing a failover. - params: - `category-name` - The category to be disabled. - + - example: + `localhost:8080/__health/disable-category?category-name=read` + ### Admin endpoints * `__health` * `__gtg` - - params: - - `categories` - the healthcheck will be performed on the services belonging to the provided categories. - - `cache` - if set to false, the healthchecks will be performed without the help of cache. By default, the cache is used. 
- - returns a __503 Service Unavailable__ status code in the following cases: - - if at least one of the provided categories is disabled (see sticky functionality) - - if at least one of the checked services is unhealthy - - returns a __200 OK__ status code otherwise + diff --git a/checkerService.go b/checkerService.go index cfde77e..78175d8 100644 --- a/checkerService.go +++ b/checkerService.go @@ -80,7 +80,7 @@ func checkServiceHealthByResiliency(service service, noOfAvailablePods int32, no func (hs *k8sHealthcheckService) checkPodHealth(pod pod, appPort int32) error { health, err := hs.getHealthChecksForPod(pod, appPort) if err != nil { - errorLogger.Printf("Cannot perform healthcheck for pod. Error was: %s", err.Error()) + errorLogger.Printf("Cannot perform healthcheck for pod with name %s. Error was: %s", pod.name, err.Error()) return errors.New("Cannot perform healthcheck for pod") } diff --git a/controller.go b/controller.go index d11fb06..766f5bc 100644 --- a/controller.go +++ b/controller.go @@ -4,6 +4,7 @@ import ( "fmt" fthealth "github.com/Financial-Times/go-fthealth/v1a" "sort" + "sync" "time" ) @@ -123,16 +124,23 @@ func (c *healthCheckController) runServiceChecksByServiceNames(services map[stri healthChecks := fthealth.RunCheck("Forced check run", "", true, checks...).Checks - for i, individualHealthcheck := range healthChecks { - if !individualHealthcheck.Ok { - if unhealthyService, ok := services[individualHealthcheck.Name]; ok { - severity := c.getSeverityForService(individualHealthcheck.Name, unhealthyService.appPort) - healthChecks[i].Severity = severity - } else { - warnLogger.Printf("Cannot compute severity for service with name %s because it was not found. 
Using default value.", individualHealthcheck.Name) + wg := sync.WaitGroup{} + for i := range healthChecks { + wg.Add(1) + go func(i int) { + healthCheck := healthChecks[i] + if !healthCheck.Ok { + if unhealthyService, ok := services[healthCheck.Name]; ok { + severity := c.getSeverityForService(healthCheck.Name, unhealthyService.appPort) + healthChecks[i].Severity = severity + } else { + warnLogger.Printf("Cannot compute severity for service with name %s because it was not found. Using default value.", healthCheck.Name) + } } - } + wg.Done() + }(i) } + wg.Wait() for _, service := range services { if service.ack != "" { @@ -190,7 +198,7 @@ func updateHealthCheckWithAckMsg(healthChecks []fthealth.CheckResult, name strin func getFinalResult(checkResults []fthealth.CheckResult, categories map[string]category) (bool, uint8) { finalOk := true - var finalSeverity uint8 = 2 + finalSeverity := defaultSeverity if len(checkResults) == 0 { return false, finalSeverity diff --git a/controller_test.go b/controller_test.go index d184487..fa1fbfe 100644 --- a/controller_test.go +++ b/controller_test.go @@ -94,11 +94,11 @@ func (m *MockService) getPodsForService(serviceName string) ([]pod, error) { return []pod{ { - name: "test-pod-name1-8425234-9hdfg ", + name: "test-pod-name2-8425234-9hdfg ", ip: "10.2.51.2", }, { - name: "test-pod-name2-8425234-9hdfg ", + name: "test-pod-name1-8425234-9hdfg ", ip: "10.2.51.2", }, }, nil @@ -430,6 +430,12 @@ func TestGetFinalResultCategoryDisabled(t *testing.T) { assert.False(t, finalOk) } +func TestGetFinalResultEmptyCheckResultsList(t *testing.T) { + finalOk, finalSeverity := getFinalResult([]fthealth.CheckResult{}, map[string]category{}) + assert.False(t, finalOk) + assert.Equal(t, defaultSeverity, finalSeverity) +} + func TestGetEnvironment(t *testing.T) { healthCheckController := &healthCheckController{ environment: validEnvName, diff --git a/handler_test.go b/handler_test.go index 070d868..49a32b4 100644 --- a/handler_test.go +++ 
b/handler_test.go @@ -1,9 +1,9 @@ package main import ( + "errors" "fmt" fthealth "github.com/Financial-Times/go-fthealth/v1a" - "errors" "github.com/stretchr/testify/assert" "net/http" "net/http/httptest" diff --git a/main.go b/main.go index 6312190..cb54f15 100644 --- a/main.go +++ b/main.go @@ -21,7 +21,7 @@ func main() { environment := app.String(cli.StringOpt{ Name: "environment", - Value: "Kubernetes", + Value: "Default-environment", Desc: "Environment tag (e.g. local, pre-prod, prod-uk)", EnvVar: "ENVIRONMENT", }) @@ -42,6 +42,7 @@ func main() { app.Action = func() { initLogs(os.Stdout, os.Stdout, os.Stderr) + infoLogger.Printf("Starting app with params: [environment: %s], [pathPrefix: %s], [graphiteURL: %s]", *environment, *pathPrefix, *graphiteURL) controller := initializeController(*environment) handler := &httpHandler{ @@ -63,6 +64,7 @@ func main() { func listen(httpHandler *httpHandler, pathPrefix string) { r := mux.NewRouter() + r.HandleFunc("/__gtg", httpHandler.handleGoodToGo) s := r.PathPrefix(pathPrefix).Subrouter() s.HandleFunc("/add-ack", httpHandler.handleAddAck).Methods("POST") s.HandleFunc("/enable-category", httpHandler.handleEnableCategory) @@ -73,9 +75,7 @@ func listen(httpHandler *httpHandler, pathPrefix string) { s.HandleFunc("/", httpHandler.handleServicesHealthCheck) s.HandleFunc("/__pods-health", httpHandler.handlePodsHealthCheck) s.HandleFunc("/__pod-individual-health", httpHandler.handleIndividualPodHealthCheck) - s.HandleFunc("/__gtg", httpHandler.handleGoodToGo) s.PathPrefix("/").Handler(http.StripPrefix("/", http.FileServer(http.Dir("resources/")))) - err := http.ListenAndServe(":8080", r) if err != nil { panic(fmt.Sprintf("Cannot set up HTTP listener. 
Error was: %v", err)) diff --git a/podController.go b/podController.go index 5f118f2..bceba28 100644 --- a/podController.go +++ b/podController.go @@ -7,6 +7,7 @@ import ( "io/ioutil" "net/http" "sort" + "sync" ) func (c *healthCheckController) buildPodsHealthResult(serviceName string) (fthealth.HealthResult, error) { @@ -52,17 +53,23 @@ func (c *healthCheckController) runPodChecksFor(serviceName string) ([]fthealth. } healthChecks := fthealth.RunCheck("Forced check run", "", true, checks...).Checks - - for i, check := range healthChecks { - if !check.Ok { - severity := c.getSeverityForPod(check.Name, serviceToBeChecked.appPort) - healthChecks[i].Severity = severity - } - - if serviceToBeChecked.ack != "" { - healthChecks[i].Ack = serviceToBeChecked.ack - } + wg := sync.WaitGroup{} + for i := range healthChecks { + wg.Add(1) + go func(i int, serviceToBeChecked service) { + healthCheck := healthChecks[i] + if !healthCheck.Ok { + severity := c.getSeverityForPod(healthCheck.Name, serviceToBeChecked.appPort) + healthChecks[i].Severity = severity + } + + if serviceToBeChecked.ack != "" { + healthChecks[i].Ack = serviceToBeChecked.ack + } + wg.Done() + }(i, serviceToBeChecked) } + wg.Wait() return healthChecks, nil } diff --git a/service.go b/service.go index 86ea2e8..dd2bb6a 100644 --- a/service.go +++ b/service.go @@ -68,7 +68,7 @@ func (hs *k8sHealthcheckService) watchAcks() { errorLogger.Printf("Error while starting to watch acks configMap with label selector: %s. 
Error was: %s", ackMessagesConfigMapLabelSelector, err.Error()) } - infoLogger.Print("Started watching services") + infoLogger.Print("Started watching acks configMap") resultChannel := watcher.ResultChan() for msg := range resultChannel { switch msg.Type { @@ -162,7 +162,7 @@ func (hs *k8sHealthcheckService) watchDeployments() { func initializeHealthCheckService() *k8sHealthcheckService { httpClient := &http.Client{ - Timeout: 60 * time.Second, + Timeout: 5 * time.Second, Transport: &http.Transport{ MaxIdleConnsPerHost: 100, Dial: (&net.Dialer{ @@ -179,7 +179,7 @@ func initializeHealthCheckService() *k8sHealthcheckService { // creates the clientset k8sClient, err := kubernetes.NewForConfig(config) if err != nil { - errorLogger.Printf("Failed to create k8s client, error was: %v", err.Error()) + panic(fmt.Sprintf("Failed to create k8s client, error was: %v", err.Error())) } deployments := make(map[string]deployment) @@ -298,7 +298,6 @@ func (hs *k8sHealthcheckService) getServicesMapByNames(serviceNames []string) ma if len(serviceNames) == 0 { hs.services.RLock() defer hs.services.RUnlock() - //TODO: check if this map can be modified after it is returned. return hs.services.m } @@ -354,13 +353,11 @@ func populateCategory(k8sCatData map[string]string) category { categoryName := k8sCatData["category.name"] isSticky, err := strconv.ParseBool(k8sCatData["category.issticky"]) if err != nil { - infoLogger.Printf("isSticky flag is not set for category with name [%s]. Using default value of false.", categoryName) isSticky = false } isEnabled, err := strconv.ParseBool(k8sCatData["category.enabled"]) if err != nil { - infoLogger.Printf("isEnabled flag is not set for category with name [%s]. 
Using default value of true.", categoryName) isEnabled = true } diff --git a/service_test.go b/service_test.go index 0acd2bf..7fcbc34 100644 --- a/service_test.go +++ b/service_test.go @@ -5,9 +5,9 @@ import ( "io/ioutil" "k8s.io/client-go/kubernetes/fake" "net/http" + "os" "strings" "testing" - "os" ) type MockWebClient struct{} @@ -17,12 +17,12 @@ type mockTransport struct { } const ( - validIP = "1.0.0.0" - validK8sServiceName = "validServiceName" - validK8sServiceNameWithAck = "validK8sServiceNameWithAck" - nonExistingK8sServiceName = "vnonExistingServiceName" - validSeverity = uint8(1) - ackMsg = "ack-msg" + validIP = "1.0.0.0" + validK8sServiceName = "validServiceName" + validK8sServiceNameWithAck = "validK8sServiceNameWithAck" + nonExistingK8sServiceName = "vnonExistingServiceName" + validSeverity = uint8(1) + ackMsg = "ack-msg" validFailingHealthCheckResponseBody = `{ "schemaVersion": 1, "name": "CMSNotifierApplication",