diff --git a/pkg/controller/migration/BUILD.bazel b/pkg/controller/migration/BUILD.bazel
index 14acaed6b..34119146d 100644
--- a/pkg/controller/migration/BUILD.bazel
+++ b/pkg/controller/migration/BUILD.bazel
@@ -5,7 +5,6 @@ go_library(
     srcs = [
         "controller.go",
         "doc.go",
-        "metrics.go",
         "predicate.go",
         "validation.go",
     ],
@@ -20,9 +19,8 @@ go_library(
         "//pkg/lib/error",
         "//pkg/lib/logging",
         "//pkg/lib/ref",
+        "//pkg/monitoring/metrics/forklift-controller",
         "//pkg/settings",
-        "//vendor/github.com/prometheus/client_golang/prometheus",
-        "//vendor/github.com/prometheus/client_golang/prometheus/promauto",
         "//vendor/k8s.io/apimachinery/pkg/api/errors",
         "//vendor/k8s.io/apiserver/pkg/storage/names",
         "//vendor/sigs.k8s.io/controller-runtime/pkg/client",
diff --git a/pkg/controller/migration/controller.go b/pkg/controller/migration/controller.go
index dc0235a5b..a5fc60107 100644
--- a/pkg/controller/migration/controller.go
+++ b/pkg/controller/migration/controller.go
@@ -24,6 +24,7 @@ import (
     libcnd "github.com/konveyor/forklift-controller/pkg/lib/condition"
     "github.com/konveyor/forklift-controller/pkg/lib/logging"
     libref "github.com/konveyor/forklift-controller/pkg/lib/ref"
+    metrics "github.com/konveyor/forklift-controller/pkg/monitoring/metrics/forklift-controller"
     "github.com/konveyor/forklift-controller/pkg/settings"
     k8serr "k8s.io/apimachinery/pkg/api/errors"
     "k8s.io/apiserver/pkg/storage/names"
@@ -84,7 +85,7 @@ func Add(mgr manager.Manager) error {
     }

     // Gather migration metrics
-    recordMetrics(mgr.GetClient())
+    metrics.RecordMigrationMetrics(mgr.GetClient())

     return nil
 }
diff --git a/pkg/controller/migration/metrics.go b/pkg/controller/migration/metrics.go
deleted file mode 100644
index 6eec1d450..000000000
--- a/pkg/controller/migration/metrics.go
+++ /dev/null
@@ -1,72 +0,0 @@
-package migration
-
-import (
-    "context"
-    "time"
-
-    api "github.com/konveyor/forklift-controller/pkg/apis/forklift/v1beta1"
-    "github.com/prometheus/client_golang/prometheus"
-    "github.com/prometheus/client_golang/prometheus/promauto"
-    "sigs.k8s.io/controller-runtime/pkg/client"
-)
-
-var (
-    // 'status' - [ executing, succeeded, failed, canceled ]
-    migrationGauge = promauto.NewGaugeVec(prometheus.GaugeOpts{
-        Name: "mtv_workload_migrations",
-        Help: "VM Migrations sorted by status",
-    },
-        []string{"status"},
-    )
-)
-
-// Calculate Migrations metrics every 10 seconds
-func recordMetrics(client client.Client) {
-    go func() {
-        for {
-            time.Sleep(10 * time.Second)
-
-            // get all migration objects
-            migrations := api.MigrationList{}
-            err := client.List(context.TODO(), &migrations)
-
-            // if error occurs, retry 10 seconds later
-            if err != nil {
-                log.Info("Metrics Migrations list error: %v", err)
-                continue
-            }
-
-            // Holding counter vars used to make gauge update "atomic"
-            var executing, succeeded, failed, canceled float64
-
-            // for all migrations, count # in executing, succeeded, failed, canceled
-            for _, m := range migrations.Items {
-                if m.Status.HasCondition(Executing) {
-                    executing++
-                    continue
-                }
-                if m.Status.HasCondition(Succeeded) {
-                    succeeded++
-                    continue
-                }
-                if m.Status.HasCondition(Failed) {
-                    failed++
-                    continue
-                }
-                if m.Status.HasCondition(Canceled) {
-                    canceled++
-                    continue
-                }
-            }
-
-            migrationGauge.With(
-                prometheus.Labels{"status": Executing}).Set(executing)
-            migrationGauge.With(
-                prometheus.Labels{"status": Succeeded}).Set(succeeded)
-            migrationGauge.With(
-                prometheus.Labels{"status": Failed}).Set(failed)
-            migrationGauge.With(
-                prometheus.Labels{"status": Canceled}).Set(canceled)
-        }
-    }()
-}
diff --git a/pkg/controller/plan/BUILD.bazel b/pkg/controller/plan/BUILD.bazel
index 8172e0ac9..dc42defc5 100644
--- a/pkg/controller/plan/BUILD.bazel
+++ b/pkg/controller/plan/BUILD.bazel
@@ -7,7 +7,6 @@ go_library(
         "doc.go",
         "hook.go",
         "kubevirt.go",
-        "metrics.go",
         "migration.go",
         "predicate.go",
        "util.go",
@@ -36,13 +35,12 @@ go_library(
         "//pkg/lib/itinerary",
         "//pkg/lib/logging",
         "//pkg/lib/ref",
+        "//pkg/monitoring/metrics/forklift-controller",
         "//pkg/settings",
         "//vendor/github.com/k8snetworkplumbingwg/network-attachment-definition-client/pkg/apis/k8s.cni.cncf.io/v1:k8s_cni_cncf_io",
         "//vendor/github.com/openshift/api/template/v1:template",
         "//vendor/github.com/openshift/library-go/pkg/template/generator",
         "//vendor/github.com/openshift/library-go/pkg/template/templateprocessing",
-        "//vendor/github.com/prometheus/client_golang/prometheus",
-        "//vendor/github.com/prometheus/client_golang/prometheus/promauto",
         "//vendor/gopkg.in/yaml.v2:yaml_v2",
         "//vendor/k8s.io/api/batch/v1:batch",
         "//vendor/k8s.io/api/core/v1:core",
diff --git a/pkg/controller/plan/controller.go b/pkg/controller/plan/controller.go
index c23fb087d..17c0cb6ca 100644
--- a/pkg/controller/plan/controller.go
+++ b/pkg/controller/plan/controller.go
@@ -30,6 +30,7 @@ import (
     liberr "github.com/konveyor/forklift-controller/pkg/lib/error"
     "github.com/konveyor/forklift-controller/pkg/lib/logging"
     libref "github.com/konveyor/forklift-controller/pkg/lib/ref"
+    metrics "github.com/konveyor/forklift-controller/pkg/monitoring/metrics/forklift-controller"
     "github.com/konveyor/forklift-controller/pkg/settings"
     k8serr "k8s.io/apimachinery/pkg/api/errors"
     "k8s.io/apiserver/pkg/storage/names"
@@ -145,7 +146,7 @@ func Add(mgr manager.Manager) error {
     }

     // Gather migration Plan metrics
-    recordMetrics(mgr.GetClient())
+    metrics.RecordPlanMetrics(mgr.GetClient())

     return nil
 }
diff --git a/pkg/controller/plan/metrics.go b/pkg/controller/plan/metrics.go
deleted file mode 100644
index 4d9aa739c..000000000
--- a/pkg/controller/plan/metrics.go
+++ /dev/null
@@ -1,106 +0,0 @@
-package plan
-
-import (
-    "context"
-    "time"
-
-    api "github.com/konveyor/forklift-controller/pkg/apis/forklift/v1beta1"
-    "github.com/prometheus/client_golang/prometheus"
-    "github.com/prometheus/client_golang/prometheus/promauto"
-    "sigs.k8s.io/controller-runtime/pkg/client"
-)
-
-var (
-    // 'status' - [ idle, executing, succeeded, failed, canceled, deleted, paused, pending, running, blocked ]
-    migrationGauge = promauto.NewGaugeVec(prometheus.GaugeOpts{
-        Name: "mtv_workload_plans",
-        Help: "VM migration Plans sorted by status",
-    },
-        []string{"status"},
-    )
-)
-
-// Calculate Plans metrics every 10 seconds
-func recordMetrics(client client.Client) {
-    go func() {
-        for {
-            time.Sleep(10 * time.Second)
-
-            // get all migration objects
-            plans := api.PlanList{}
-            err := client.List(context.TODO(), &plans)
-
-            // if error occurs, retry 10 seconds later
-            if err != nil {
-                log.Info("Metrics Plans list error: %v", err)
-                continue
-            }
-
-            // Holding counter vars used to make gauge update "atomic"
-            var idle, executing, succeeded, failed, canceled, deleted, paused, pending, running, blocked float64
-
-            // for all plans, count # in Idle, Executing, Succeeded, Failed, Canceled, Deleted, Paused, Pending, Running, Blocked
-            for _, m := range plans.Items {
-                if m.Status.HasCondition(Executing) {
-                    executing++
-                    continue
-                }
-                if m.Status.HasCondition(Succeeded) {
-                    succeeded++
-                    continue
-                }
-                if m.Status.HasCondition(Failed) {
-                    failed++
-                    continue
-                }
-                if m.Status.HasCondition(Canceled) {
-                    canceled++
-                    continue
-                }
-                if m.Status.HasCondition(Deleted) {
-                    deleted++
-                    continue
-                }
-                if m.Status.HasCondition(Paused) {
-                    paused++
-                    continue
-                }
-                if m.Status.HasCondition(Pending) {
-                    pending++
-                    continue
-                }
-                if m.Status.HasCondition(Running) {
-                    running++
-                    continue
-                }
-                if m.Status.HasCondition(Blocked) {
-                    blocked++
-                    continue
-                }
-                // If the Plan has no matching condition, but exists, it should be counted as Idle
-                idle++
-            }
-
-            migrationGauge.With(
-                prometheus.Labels{"status": "Idle"}).Set(idle)
-            migrationGauge.With(
-                prometheus.Labels{"status": Executing}).Set(executing)
-            migrationGauge.With(
-                prometheus.Labels{"status": Succeeded}).Set(succeeded)
-            migrationGauge.With(
-                prometheus.Labels{"status": Failed}).Set(failed)
-            migrationGauge.With(
-                prometheus.Labels{"status": Canceled}).Set(canceled)
-            migrationGauge.With(
-                prometheus.Labels{"status": Deleted}).Set(deleted)
-            migrationGauge.With(
-                prometheus.Labels{"status": Paused}).Set(paused)
-            migrationGauge.With(
-                prometheus.Labels{"status": Pending}).Set(pending)
-            migrationGauge.With(
-                prometheus.Labels{"status": Running}).Set(running)
-            migrationGauge.With(
-                prometheus.Labels{"status": Blocked}).Set(blocked)
-        }
-    }()
-}
diff --git a/pkg/monitoring/metrics/forklift-controller/BUILD.bazel b/pkg/monitoring/metrics/forklift-controller/BUILD.bazel
new file mode 100644
index 000000000..cdddd80b5
--- /dev/null
+++ b/pkg/monitoring/metrics/forklift-controller/BUILD.bazel
@@ -0,0 +1,18 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_library")
+
+go_library(
+    name = "forklift-controller",
+    srcs = [
+        "metrics.go",
+        "migration_metrics.go",
+        "plan_metrics.go",
+    ],
+    importpath = "github.com/konveyor/forklift-controller/pkg/monitoring/metrics/forklift-controller",
+    visibility = ["//visibility:public"],
+    deps = [
+        "//pkg/apis/forklift/v1beta1",
+        "//vendor/github.com/prometheus/client_golang/prometheus",
+        "//vendor/github.com/prometheus/client_golang/prometheus/promauto",
+        "//vendor/sigs.k8s.io/controller-runtime/pkg/client",
+    ],
+)
diff --git a/pkg/monitoring/metrics/forklift-controller/metrics.go b/pkg/monitoring/metrics/forklift-controller/metrics.go
new file mode 100644
index 000000000..f0c009a77
--- /dev/null
+++ b/pkg/monitoring/metrics/forklift-controller/metrics.go
@@ -0,0 +1,117 @@
+package forklift_controller
+
+import (
+    "github.com/prometheus/client_golang/prometheus"
+    "github.com/prometheus/client_golang/prometheus/promauto"
+)
+
+const (
+    Succeeded = "Succeeded"
+    Failed    = "Failed"
+    Executing = "Executing"
+    Running   = "Running"
+    Pending   = "Pending"
+    Canceled  = "Canceled"
+    Blocked   = "Blocked"
+    Ready     = "Ready"
+    Deleted   = "Deleted"
+    Warm      = "Warm"
+    Cold      = "Cold"
+    Local     = "Local"
+    Remote    = "Remote"
+)
+
+var (
+    // 'status' - [ Succeeded, Failed, Executing, Canceled ]
+    // 'provider' - [ oVirt, VSphere, Openstack, OVA, Openshift ]
+    // 'mode' - [ Cold, Warm ]
+    // 'target' - [ Local, Remote ]
+    migrationStatusGauge = promauto.NewGaugeVec(prometheus.GaugeOpts{
+        Name: "mtv_migrations_status",
+        Help: "VM Migrations sorted by status, provider, mode and destination",
+    },
+        []string{
+            "status",
+            "provider",
+            "mode",
+            "target",
+        },
+    )
+
+    // 'status' - [ Succeeded, Failed, Executing, Running, Pending, Canceled, Blocked, Deleted ]
+    // 'provider' - [ oVirt, VSphere, Openstack, OVA, Openshift ]
+    // 'mode' - [ Cold, Warm ]
+    // 'target' - [ Local, Remote ]
+    planStatusGauge = promauto.NewGaugeVec(prometheus.GaugeOpts{
"mtv_plans_status", + Help: "VM migration Plans sorted by status, provider, mode and destination", + }, + []string{ + "status", + "provider", + "mode", + "target", + }, + ) + + // 'status' - [ Succeeded, Failed, Executing, Canceled] + // 'provider' - [oVirt, VSphere, Openstack, OVA, Openshift] + // 'mode' - [Cold, Warm] + // 'target' - [Local, Remote] + // 'plan' - [Id] + migrationDurationGauge = promauto.NewGaugeVec(prometheus.GaugeOpts{ + Name: "mtv_migration_duration_seconds", + Help: "Duration of VM migrations in seconds", + }, + []string{"provider", "mode", "target", "plan"}, + ) + + // 'provider' - [oVirt, VSphere, Openstack, OVA, Openshift] + // 'mode' - [Cold, Warm] + // 'target' - [Local, Remote] + // 'plan' - [Id] + dataTransferredGauge = promauto.NewGaugeVec(prometheus.GaugeOpts{ + Name: "mtv_migration_data_transferred_bytes", + Help: "Total data transferred during VM migrations in bytes", + }, + []string{ + "provider", + "mode", + "target", + "plan", + }, + ) + + // 'status' - [ Succeeded, Failed, Executing, Canceled] + // 'provider' - [oVirt, VSphere, Openstack, OVA, Openshift] + // 'mode' - [Cold, Warm] + // 'target' - [Local, Remote] + // 'plan' - [Id] + migrationPlanCorrelationStatusGauge = promauto.NewGaugeVec(prometheus.GaugeOpts{ + Name: "mtv_workload_migrations_status", + Help: "VM Migrations status by provider, mode, destination and plan", + }, + []string{ + "status", + "provider", + "mode", + "target", + "plan", + }, + ) + + // 'provider' - [oVirt, VSphere, Openstack, OVA, Openshift] + // 'mode' - [Cold, Warm] + // 'target' - [Local, Remote] + migrationDurationHistogram = promauto.NewHistogramVec(prometheus.HistogramOpts{ + Name: "mtv_migrations_duration_seconds", + Help: "Histogram of VM migrations duration in seconds", + Buckets: []float64{1 * 3600, 2 * 3600, 5 * 3600, 10 * 3600, 24 * 3600, 48 * 3600}, // 1, 2, 5, 10, 24, 48 hours in seconds + }, + []string{ + "provider", + "mode", + "target", + }, + ) +) diff --git a/pkg/monitoring/metrics/forklift-controller/migration_metrics.go b/pkg/monitoring/metrics/forklift-controller/migration_metrics.go new file mode 100644 index 000000000..23252ede0 --- /dev/null +++ b/pkg/monitoring/metrics/forklift-controller/migration_metrics.go @@ -0,0 +1,119 @@ +package forklift_controller + +import ( + "context" + "fmt" + "strings" + "time" + + api "github.com/konveyor/forklift-controller/pkg/apis/forklift/v1beta1" + "github.com/prometheus/client_golang/prometheus" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +var processedSucceededMigrations = make(map[string]struct{}) + +// Calculate Migrations metrics every 10 seconds +func RecordMigrationMetrics(c client.Client) { + go func() { + for { + time.Sleep(10 * time.Second) + + // get all migration objects + migrations := api.MigrationList{} + err := c.List(context.TODO(), &migrations) + + // if error occurs, retry 10 seconds later + if err != nil { + fmt.Printf("Metrics Migrations list error: %v\n", err) + continue + } + + // Initialize or reset the counter map at the beginning of each iteration + counterMap := make(map[string]float64) + + for _, m := range migrations.Items { + plan := api.Plan{} + err := c.Get(context.TODO(), client.ObjectKey{Namespace: m.Spec.Plan.Namespace, Name: m.Spec.Plan.Name}, &plan) + if err != nil { + continue + } + + sourceProvider := api.Provider{} + err = c.Get(context.TODO(), client.ObjectKey{Namespace: plan.Spec.Provider.Source.Namespace, Name: plan.Spec.Provider.Source.Name}, &sourceProvider) + if err != nil { + continue + } + + destProvider := 
+                destProvider := api.Provider{}
+                err = c.Get(context.TODO(), client.ObjectKey{Namespace: plan.Spec.Provider.Destination.Namespace, Name: plan.Spec.Provider.Destination.Name}, &destProvider)
+                if err != nil {
+                    continue
+                }
+
+                isLocal := destProvider.Spec.URL == ""
+                isWarm := plan.Spec.Warm
+
+                var target, mode, key string
+                if isLocal {
+                    target = Local
+                } else {
+                    target = Remote
+                }
+                if isWarm {
+                    mode = Warm
+                } else {
+                    mode = Cold
+                }
+
+                provider := sourceProvider.Type().String()
+
+                if m.Status.HasCondition(Succeeded) {
+                    key = fmt.Sprintf("%s|%s|%s|%s|%s", Succeeded, provider, mode, target, string(plan.UID))
+                    counterMap[key]++
+
+                    startTime := m.Status.Started.Time
+                    endTime := m.Status.Completed.Time
+                    duration := endTime.Sub(startTime).Seconds()
+
+                    var totalDataTransferred float64
+                    for _, vm := range m.Status.VMs {
+                        for _, step := range vm.Pipeline {
+                            if step.Name == "DiskTransferV2v" || step.Name == "DiskTransfer" {
+                                for _, task := range step.Tasks {
+                                    totalDataTransferred += float64(task.Progress.Completed) * 1024 * 1024 // task progress is tracked in MiB; convert to bytes
+                                }
+                            }
+                        }
+                    }
+
+                    // Set the metrics for duration and data transferred, and mark the migration as processed
+                    if _, exists := processedSucceededMigrations[string(m.UID)]; !exists {
+                        migrationDurationGauge.With(prometheus.Labels{"provider": provider, "mode": mode, "target": target, "plan": string(plan.UID)}).Set(duration)
+                        migrationDurationHistogram.With(prometheus.Labels{"provider": provider, "mode": mode, "target": target}).Observe(duration)
+                        dataTransferredGauge.With(prometheus.Labels{"provider": provider, "mode": mode, "target": target, "plan": string(plan.UID)}).Set(totalDataTransferred)
+                        processedSucceededMigrations[string(m.UID)] = struct{}{}
+                    }
+                }
+                if m.Status.HasCondition(Failed) {
+                    key = fmt.Sprintf("%s|%s|%s|%s|%s", Failed, provider, mode, target, string(plan.UID))
+                    counterMap[key]++
+                }
+                if m.Status.HasCondition(Executing) {
+                    key = fmt.Sprintf("%s|%s|%s|%s|%s", Executing, provider, mode, target, string(plan.UID))
+                    counterMap[key]++
+                }
+                if m.Status.HasCondition(Canceled) {
+                    key = fmt.Sprintf("%s|%s|%s|%s|%s", Canceled, provider, mode, target, string(plan.UID))
+                    counterMap[key]++
+                }
+            }
+
+            for key, value := range counterMap {
+                parts := strings.Split(key, "|")
+                migrationStatusGauge.With(prometheus.Labels{"status": parts[0], "provider": parts[1], "mode": parts[2], "target": parts[3]}).Set(value)
+                migrationPlanCorrelationStatusGauge.With(prometheus.Labels{"status": parts[0], "provider": parts[1], "mode": parts[2], "target": parts[3], "plan": parts[4]}).Set(value)
+            }
+        }
+    }()
+}
diff --git a/pkg/monitoring/metrics/forklift-controller/plan_metrics.go b/pkg/monitoring/metrics/forklift-controller/plan_metrics.go
new file mode 100644
index 000000000..3ab928e48
--- /dev/null
+++ b/pkg/monitoring/metrics/forklift-controller/plan_metrics.go
@@ -0,0 +1,103 @@
+package forklift_controller
+
+import (
+    "context"
+    "fmt"
+    "strings"
+    "time"
+
+    api "github.com/konveyor/forklift-controller/pkg/apis/forklift/v1beta1"
+    "github.com/prometheus/client_golang/prometheus"
+    "sigs.k8s.io/controller-runtime/pkg/client"
+)
+
+// RecordPlanMetrics recalculates the Plan metrics every 10 seconds
+func RecordPlanMetrics(c client.Client) {
+    go func() {
+        for {
+            time.Sleep(10 * time.Second)
+
+            // get all plan objects
+            plans := api.PlanList{}
+            err := c.List(context.TODO(), &plans)
+
+            // if an error occurs, retry 10 seconds later
+            if err != nil {
+                fmt.Printf("Metrics Plans list error: %v\n", err)
+                continue
+            }
+
+            // Initialize or reset the counter map at the beginning of each iteration
+            plansCounterMap := make(map[string]float64)
+
+            for _, m := range plans.Items {
+                sourceProvider := api.Provider{}
+                err = c.Get(context.TODO(), client.ObjectKey{Namespace: m.Spec.Provider.Source.Namespace, Name: m.Spec.Provider.Source.Name}, &sourceProvider)
+                if err != nil {
+                    continue
+                }
+
+                destProvider := api.Provider{}
+                err = c.Get(context.TODO(), client.ObjectKey{Namespace: m.Spec.Provider.Destination.Namespace, Name: m.Spec.Provider.Destination.Name}, &destProvider)
+                if err != nil {
+                    continue
+                }
+
+                isLocal := destProvider.Spec.URL == ""
+                isWarm := m.Spec.Warm
+
+                var target, mode, key string
+                if isLocal {
+                    target = Local
+                } else {
+                    target = Remote
+                }
+                if isWarm {
+                    mode = Warm
+                } else {
+                    mode = Cold
+                }
+
+                provider := sourceProvider.Type().String()
+
+                if m.Status.HasCondition(Succeeded) {
+                    key = fmt.Sprintf("%s|%s|%s|%s", Succeeded, provider, mode, target)
+                    plansCounterMap[key]++
+                }
+                if m.Status.HasCondition(Failed) {
+                    key = fmt.Sprintf("%s|%s|%s|%s", Failed, provider, mode, target)
+                    plansCounterMap[key]++
+                }
+                if m.Status.HasCondition(Executing) {
+                    key = fmt.Sprintf("%s|%s|%s|%s", Executing, provider, mode, target)
+                    plansCounterMap[key]++
+                }
+                if m.Status.HasCondition(Running) {
+                    key = fmt.Sprintf("%s|%s|%s|%s", Running, provider, mode, target)
+                    plansCounterMap[key]++
+                }
+                if m.Status.HasCondition(Pending) {
+                    key = fmt.Sprintf("%s|%s|%s|%s", Pending, provider, mode, target)
+                    plansCounterMap[key]++
+                }
+                if m.Status.HasCondition(Canceled) {
+                    key = fmt.Sprintf("%s|%s|%s|%s", Canceled, provider, mode, target)
+                    plansCounterMap[key]++
+                }
+                if m.Status.HasCondition(Blocked) {
+                    key = fmt.Sprintf("%s|%s|%s|%s", Blocked, provider, mode, target)
+                    plansCounterMap[key]++
+                }
+                if m.Status.HasCondition(Deleted) {
+                    key = fmt.Sprintf("%s|%s|%s|%s", Deleted, provider, mode, target)
+                    plansCounterMap[key]++
+                }
+            }
+
+            for key, value := range plansCounterMap {
+                parts := strings.Split(key, "|")
+                planStatusGauge.With(prometheus.Labels{"status": parts[0], "provider": parts[1], "mode": parts[2], "target": parts[3]}).Set(value)
+            }
+        }
+    }()
+}
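
Note: after this change, both controllers delegate metrics recording to the shared pkg/monitoring/metrics/forklift-controller package. A minimal wiring sketch follows; the wireMetrics helper and the example package are hypothetical, for illustration only — the import path and the two exported recorders are exactly those added above.

package example // hypothetical package, not part of this change

import (
    metrics "github.com/konveyor/forklift-controller/pkg/monitoring/metrics/forklift-controller"
    "sigs.k8s.io/controller-runtime/pkg/manager"
)

// wireMetrics starts the shared recorders. Both calls return immediately:
// each spawns a goroutine that re-lists its CRs and recomputes the gauges
// every 10 seconds, as implemented in the new package above.
func wireMetrics(mgr manager.Manager) {
    metrics.RecordMigrationMetrics(mgr.GetClient()) // mtv_migrations_status, duration, data transferred
    metrics.RecordPlanMetrics(mgr.GetClient())      // mtv_plans_status
}

Because every gauge family is rebuilt from a fresh List on each tick, a query such as sum by (status) (mtv_migrations_status) reflects cluster state that is at most about 10 seconds stale.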