Skip to content

Commit

Permalink
add cluster alarms
Browse files Browse the repository at this point in the history
add run once option
add custom config option
  • Loading branch information
yacut committed Mar 6, 2021
1 parent 2f49cd9 commit 4bcfaf4
Show file tree
Hide file tree
Showing 19 changed files with 696 additions and 391 deletions.
7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,10 @@
## v1.6.0 / 2021-03-05

- [FEATURE] Add cluster alarms
- [FEATURE] Add serverless support via `watcher.RunOnce(...)`
- [FEATURE] Add option to run checks only once via `--run-once` flag
- [ENHANCEMENT] Add insecure option for the kubernetes API server

## v1.5.0 / 2021-03-02

- [ENHANCEMENT] Split CPU and memory resource config for better configuration opportunities
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ server and generates incidents about the health state of the pods and the nodes.
Simply build and run ilert-kube-agent to get Kubernetes cluster alarms.
| Flag | Description |
| ----------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `--alarms.cluster.enabled` | Enables cluster alarms. Triggers an alarm if any cluster problem occurred e.g. API server not available [Default: true] |
| `--alarms.pods.terminate.enabled` | Enables terminate pod alarms. Triggers an alarm if any pod terminated e.g. Terminated, OOMKilled, Error, ContainerCannotRun, DeadlineExceeded [Default: true] |
| `--alarms.pods.waiting.enabled` | Enables waiting pod alarms. Triggers an alarm if any pod in waiting status e.g. CrashLoopBackOff, ErrImagePull, ImagePullBackOff, CreateContainerConfigError, InvalidImageName, CreateContainerError [Default: true] |
| `--alarms.pods.restarts.enabled` | Enables restarts pod alarms. Triggers an alarm if any pod restarts count reached threshold [Default: true] |
Expand Down
112 changes: 12 additions & 100 deletions cmd/flag.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ import (
"flag"
"fmt"
"os"
"strings"

"github.com/rs/zerolog/log"
"github.com/spf13/pflag"
Expand All @@ -13,23 +12,25 @@ import (
shared "github.com/iLert/ilert-kube-agent"
"github.com/iLert/ilert-kube-agent/pkg/config"
"github.com/iLert/ilert-kube-agent/pkg/logger"
"github.com/iLert/ilert-kube-agent/pkg/utils"
)

var (
help bool
version bool
runOnce bool
cfgFile string
)

func parseAndValidateFlags() *config.Config {

flag.BoolVar(&help, "help", false, "Print this help.")
flag.BoolVar(&version, "version", false, "Print version.")
flag.BoolVar(&runOnce, "run-once", false, "Run checks only once and exit.")
flag.StringVar(&cfgFile, "config", "", "Config file")

flag.String("settings.kubeconfig", "", "Path to a kubeconfig. Only required if out-of-cluster.")
flag.String("settings.master", "", "The address of the Kubernetes API server. Overrides any value in kubeconfig. Only required if out-of-cluster.")
flag.Bool("settings.insecure", false, "The Kubernetes API server should be accessed without verifying the TLS certificate. Overrides any value in kubeconfig. Only required if out-of-cluster.")
flag.String("settings.namespace", "kube-system", "Namespace in which agent run.")
flag.String("settings.log.level", "info", "Log level (debug, info, warn, error, fatal).")
flag.Bool("settings.log.json", false, "Enable json format log")
Expand All @@ -38,6 +39,9 @@ func parseAndValidateFlags() *config.Config {
flag.String("settings.apiKey", "", "(REQUIRED) The iLert alert source api key")
flag.String("settings.checkInterval", "15s", "The evaluation check interval e.g. resources check")

flag.Bool("alarms.cluster.enabled", true, "Enable cluster alarms")
flag.String("alarms.cluster.priority", "HIGH", "The cluster alarm incident priority")

flag.Bool("alarms.pods.enabled", true, "Enable pod alarms")
flag.Bool("alarms.pods.terminate.enabled", true, "Enable pod terminate alarms")
flag.String("alarms.pods.terminate.priority", "HIGH", "The pod terminate alarm incident priority")
Expand Down Expand Up @@ -69,9 +73,6 @@ func parseAndValidateFlags() *config.Config {
pflag.Parse()

viper.RegisterAlias("settings.api-key", "settings.apiKey")
viper.SetEnvKeyReplacer(strings.NewReplacer(".", "_", "-", "_"))
viper.SetEnvPrefix("ilert")
viper.AutomaticEnv()

err := viper.BindPFlags(pflag.CommandLine)
if err != nil {
Expand All @@ -88,105 +89,16 @@ func parseAndValidateFlags() *config.Config {
os.Exit(0)
}

if cfgFile != "" {
log.Debug().Str("file", cfgFile).Msg("Reading config file")
viper.SetConfigFile(cfgFile)
err := viper.ReadInConfig()
if err != nil {
log.Fatal().Err(err).Msg("Unable to read config")
}
}

cfg := &config.Config{}
err = viper.Unmarshal(cfg)
if err != nil {
log.Fatal().Err(err).Msg("Unable to decode config")
}

if cfg.Links.Pods == nil {
cfg.Links.Pods = make([]config.ConfigLinksSetting, 0)
}
if cfg.Links.Nodes == nil {
cfg.Links.Nodes = make([]config.ConfigLinksSetting, 0)
if cfgFile != "" {
cfg.SetConfigFile(cfgFile)
}

for _, e := range os.Environ() {
pair := strings.SplitN(e, "=", 2)
if strings.HasPrefix(pair[0], "ILERT_LINKS_PODS_") {
link := strings.ReplaceAll(pair[0], "ILERT_LINKS_PODS_", "")
cfg.Links.Pods = append(cfg.Links.Pods, config.ConfigLinksSetting{
Name: strings.Title(strings.ToLower(strings.ReplaceAll(link, "_", " "))),
Href: pair[1],
})
}

if strings.HasPrefix(pair[0], "ILERT_LINKS_NODES_") {
cfg.Links.Nodes = append(cfg.Links.Nodes, config.ConfigLinksSetting{
Name: strings.Title(strings.ToLower(strings.ReplaceAll(strings.ReplaceAll(pair[0], "ILERT_LINKS_NODES_", ""), "_", " "))),
Href: pair[1],
})
}
if runOnce {
cfg.SetRunOnce(true)
}

cfg.Load()
cfg.Validate()
logger.Init(cfg.Settings.Log)

ilertAPIKeyEnv := utils.GetEnv("ILERT_API_KEY", "")
if ilertAPIKeyEnv != "" {
cfg.Settings.APIKey = ilertAPIKeyEnv
}

namespaceEnv := utils.GetEnv("NAMESPACE", "")
if namespaceEnv != "" {
cfg.Settings.Namespace = namespaceEnv
}

logLevelEnv := utils.GetEnv("LOG_LEVEL", "")
if logLevelEnv != "" {
cfg.Settings.Log.Level = logLevelEnv
}

if cfg.Settings.ElectionID == "" {
log.Fatal().Msg("Election ID is required.")
}

if cfg.Settings.Namespace == "" {
log.Fatal().Msg("Namespace is required. Use --settings.namespace flag or NAMESPACE env var")
}

if cfg.Settings.APIKey == "" {
log.Fatal().Msg("iLert api key is required. Use --settings.apiKey flag or ILERT_API_KEY env var")
}

if cfg.Settings.Log.Level != "debug" && cfg.Settings.Log.Level != "info" && cfg.Settings.Log.Level != "warn" && cfg.Settings.Log.Level != "error" && cfg.Settings.Log.Level != "fatal" {
log.Fatal().Msg("Invalid --settings.log.level flag value or config.")
}

checkPriorityConfig(cfg.Alarms.Pods.Terminate.Priority, "--alarms.pods.terminate.priority")
checkPriorityConfig(cfg.Alarms.Pods.Waiting.Priority, "--alarms.pods.waiting.priority")
checkPriorityConfig(cfg.Alarms.Pods.Restarts.Priority, "--alarms.pods.restarts.priority")
checkPriorityConfig(cfg.Alarms.Pods.Resources.CPU.Priority, "--alarms.pods.resources.cpu.priority")
checkPriorityConfig(cfg.Alarms.Pods.Resources.Memory.Priority, "--alarms.pods.resources.memory.priority")
checkPriorityConfig(cfg.Alarms.Nodes.Terminate.Priority, "--alarms.nodes.terminate.priority")
checkPriorityConfig(cfg.Alarms.Nodes.Resources.CPU.Priority, "--alarms.nodes.resources.cpu.priority")
checkPriorityConfig(cfg.Alarms.Nodes.Resources.Memory.Priority, "--alarms.nodes.resources.memory.priority")

checkThresholdConfig(cfg.Alarms.Pods.Resources.CPU.Threshold, 1, 100, "--alarms.pods.resources.cpu.threshold")
checkThresholdConfig(cfg.Alarms.Pods.Resources.Memory.Threshold, 1, 100, "--alarms.pods.resources.memory.threshold")
checkThresholdConfig(cfg.Alarms.Pods.Restarts.Threshold, 1, 1000000, "--alarms.pods.restarts.threshold")
checkThresholdConfig(cfg.Alarms.Pods.Resources.CPU.Threshold, 1, 100, "--alarms.nodes.resources.cpu.threshold")
checkThresholdConfig(cfg.Alarms.Pods.Resources.Memory.Threshold, 1, 100, "--alarms.nodes.resources.memory.threshold")

return cfg
}

// checkPriorityConfig accepts only the priorities "HIGH" and "LOW".
// Any other value is reported through a fatal-level log event that names
// the offending flag.
func checkPriorityConfig(priority string, flag string) {
	switch priority {
	case "HIGH", "LOW":
		// Supported priority, nothing to report.
		return
	}
	log.Fatal().Msgf("Invalid %s flag value.", flag)
}

// checkThresholdConfig verifies that threshold lies within the inclusive
// range [min, max]. A value outside the range is reported through a
// fatal-level log event that names the offending flag and the allowed bounds.
func checkThresholdConfig(threshold int32, min int32, max int32, flag string) {
	if min <= threshold && threshold <= max {
		return
	}
	log.Fatal().Msgf("Invalid %s flag value (min=%d max=%d).", flag, min, max)
}
39 changes: 7 additions & 32 deletions cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,20 +9,16 @@ import (
"syscall"
"time"

agentclientset "github.com/iLert/ilert-kube-agent/pkg/client/clientset/versioned"
"github.com/iLert/ilert-kube-agent/pkg/router"
"github.com/iLert/ilert-kube-agent/pkg/storage"
"github.com/iLert/ilert-kube-agent/pkg/watcher"

"github.com/rs/zerolog/log"

metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/kubernetes"
_ "k8s.io/client-go/plugin/pkg/client/auth"
"k8s.io/client-go/tools/clientcmd"
"k8s.io/client-go/tools/leaderelection"
"k8s.io/client-go/tools/leaderelection/resourcelock"
metrics "k8s.io/metrics/pkg/client/clientset/versioned"
)

const (
Expand All @@ -36,6 +32,11 @@ func main() {

log.Info().Interface("config", cfg).Msg("Starting agent with config")

if cfg.GetRunOnce() {
watcher.RunOnce(cfg)
return
}

srg := &storage.Storage{}
srg.Init()
router := router.Setup(srg)
Expand All @@ -60,38 +61,12 @@ func main() {
log.Fatal().Err(err).Msg("Unable to get hostname")
}

config, err := clientcmd.BuildConfigFromFlags(cfg.Settings.Master, cfg.Settings.KubeConfig)
if err != nil {
log.Fatal().Err(err).Msg("Failed to build kubeconfig")
}

kubeClient, err := kubernetes.NewForConfig(config)
if err != nil {
log.Fatal().Err(err).Msg("Failed to create kube client")
}

agentKubeClient, err := agentclientset.NewForConfig(config)
if err != nil {
log.Fatal().Err(err).Msg("Failed to create kube client")
}

metricsClient, err := metrics.NewForConfig(config)
if err != nil {
log.Fatal().Err(err).Msg("Failed to create metrics client")
}

// Validate that the client is ok.
_, err = kubeClient.CoreV1().Pods(metav1.NamespaceAll).List(metav1.ListOptions{})
if err != nil {
log.Fatal().Err(err).Msg("Failed to get nodes from apiserver")
}

lock := &resourcelock.LeaseLock{
LeaseMeta: metav1.ObjectMeta{
Name: cfg.Settings.ElectionID,
Namespace: cfg.Settings.Namespace,
},
Client: kubeClient.CoordinationV1(),
Client: cfg.KubeClient.CoordinationV1(),
LockConfig: resourcelock.ResourceLockConfig{
Identity: id,
},
Expand All @@ -117,7 +92,7 @@ func main() {
Callbacks: leaderelection.LeaderCallbacks{
OnStartedLeading: func(_ context.Context) {
log.Info().Str("identity", id).Msg("I am the new leader")
watcher.Start(kubeClient, metricsClient, agentKubeClient, cfg)
watcher.Start(cfg)
},
OnStoppedLeading: func() {
watcher.Stop()
Expand Down
9 changes: 9 additions & 0 deletions config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@ settings:
## The address of the Kubernetes API server. Overrides any value in kubeconfig. Only required if out-of-cluster.
# master: ""

## The Kubernetes API server should be accessed without verifying the TLS certificate. Overrides any value in kubeconfig. Only required if out-of-cluster.
# insecure: false

## Namespace in which the agent runs.
namespace: kube-systems

Expand All @@ -27,6 +30,12 @@ settings:
json: false

alarms:
cluster:
## Enables cluster alarms
enabled: true
## The cluster alarm incident priority
priority: HIGH

pods:
## Enables all pod alarms
enabled: false
Expand Down
58 changes: 58 additions & 0 deletions pkg/config/factory.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
package config

import (
"github.com/rs/zerolog/log"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/rest"
"k8s.io/client-go/tools/clientcmd"
metrics "k8s.io/metrics/pkg/client/clientset/versioned"

agentclientset "github.com/iLert/ilert-kube-agent/pkg/client/clientset/versioned"
)

// SetKubeConfig overrides the default kube config.
//
// initializeClients only builds a REST config from Settings.Master and
// Settings.KubeConfig when cfg.KubeConfig is nil, so a config supplied here
// is the one the clients are created from.
func (cfg *Config) SetKubeConfig(config *rest.Config) {
cfg.KubeConfig = config
}

// initializeClients fills in every client field of cfg that is still nil:
// the REST config, the core Kubernetes client, the agent clientset and the
// metrics client. Values that were set beforehand (e.g. via SetKubeConfig)
// are left untouched. Any failure is logged at fatal level.
func (cfg *Config) initializeClients() {
	if cfg.KubeConfig == nil {
		config, err := clientcmd.BuildConfigFromFlags(cfg.Settings.Master, cfg.Settings.KubeConfig)
		if err != nil {
			log.Fatal().Err(err).Msg("Failed to build kubeconfig")
		}

		if cfg.Settings.Insecure {
			// client-go refuses a config that disables TLS verification
			// while still carrying a root CA, so drop the CA along with
			// enabling the insecure flag.
			config.Insecure = true
			config.CAFile = ""
			config.CAData = nil
		}

		cfg.KubeConfig = config
	}

	if cfg.KubeClient == nil {
		kubeClient, err := kubernetes.NewForConfig(cfg.KubeConfig)
		if err != nil {
			log.Fatal().Err(err).Msg("Failed to create kube client")
		}
		cfg.KubeClient = kubeClient
	}

	if cfg.AgentKubeClient == nil {
		agentKubeClient, err := agentclientset.NewForConfig(cfg.KubeConfig)
		if err != nil {
			log.Fatal().Err(err).Msg("Failed to create kube agent client")
		}
		cfg.AgentKubeClient = agentKubeClient
	}

	if cfg.MetricsClient == nil {
		metricsClient, err := metrics.NewForConfig(cfg.KubeConfig)
		if err != nil {
			log.Fatal().Err(err).Msg("Failed to create metrics client")
		}
		cfg.MetricsClient = metricsClient
	}
}
Loading

0 comments on commit 4bcfaf4

Please sign in to comment.