Skip to content

Commit

Permalink
Add node metrics for time to register, ready
Browse files Browse the repository at this point in the history
  • Loading branch information
cartermckinnon committed Jun 19, 2024
1 parent 56cbd21 commit c1eb247
Show file tree
Hide file tree
Showing 3 changed files with 125 additions and 26 deletions.
29 changes: 12 additions & 17 deletions kubetest2/internal/deployers/eksapi/deployer.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@ package eksapi
import (
"flag"
"fmt"
"path"
"path/filepath"
"time"

Expand All @@ -13,7 +12,6 @@ import (
"github.com/aws/aws-k8s-tester/kubetest2/internal/util"

"github.com/aws/aws-sdk-go-v2/service/cloudwatch"
cloudwatchtypes "github.com/aws/aws-sdk-go-v2/service/cloudwatch/types"
ekstypes "github.com/aws/aws-sdk-go-v2/service/eks/types"
"github.com/octago/sflags/gen/gpflag"
"github.com/spf13/pflag"
Expand All @@ -29,16 +27,6 @@ const ResourcePrefix = "kubetest2-" + DeployerName

var SupportedNodeNameStrategy = []string{"SessionName", "EC2PrivateDNSName"}

var DeployerMetricNamespace = path.Join("kubetest2", DeployerName)

var (
totalRuntimeSeconds = &metrics.MetricSpec{
Namespace: DeployerMetricNamespace,
Metric: "TotalRuntimeSeconds",
Unit: cloudwatchtypes.StandardUnitSeconds,
}
)

// assert that deployer implements optional interfaces
var _ types.DeployerWithKubeconfig = &deployer{}
var _ types.DeployerWithInit = &deployer{}
Expand All @@ -54,6 +42,8 @@ type deployer struct {
addonManager *AddonManager
nodegroupManager *NodegroupManager

awsClients *awsClients

infra *Infrastructure
cluster *Cluster

Expand Down Expand Up @@ -113,18 +103,18 @@ func (d *deployer) Version() string {
func (d *deployer) Init() error {
d.initTime = time.Now()
awsConfig := awssdk.NewConfig()
awsClients := newAWSClients(awsConfig, d.EKSEndpointURL)
d.awsClients = newAWSClients(awsConfig, d.EKSEndpointURL)
resourceID := ResourcePrefix + "-" + d.commonOptions.RunID()
if d.deployerOptions.EmitMetrics {
client := cloudwatch.NewFromConfig(awsConfig)
d.metrics = metrics.NewCloudWatchRegistry(client)
} else {
d.metrics = metrics.NewNoopMetricRegistry()
}
d.infraManager = NewInfrastructureManager(awsClients, resourceID, d.metrics)
d.clusterManager = NewClusterManager(awsClients, resourceID)
d.addonManager = NewAddonManager(awsClients)
d.nodegroupManager = NewNodegroupManager(awsClients, resourceID)
d.infraManager = NewInfrastructureManager(d.awsClients, resourceID, d.metrics)
d.clusterManager = NewClusterManager(d.awsClients, resourceID)
d.addonManager = NewAddonManager(d.awsClients)
d.nodegroupManager = NewNodegroupManager(d.awsClients, resourceID)
return nil
}

Expand Down Expand Up @@ -205,6 +195,11 @@ func (d *deployer) Up() error {
if err := waitForReadyNodes(k8sClient, d.Nodes, d.NodeReadyTimeout); err != nil {
return err
}
if d.EmitMetrics {
if err := emitNodeMetrics(d.metrics, k8sClient, d.awsClients.EC2()); err != nil {
return err
}
}
return nil
}

Expand Down
92 changes: 83 additions & 9 deletions kubetest2/internal/deployers/eksapi/k8s.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,12 @@ package eksapi
import (
"context"
"fmt"
"net/url"
"strings"
"time"

"github.com/aws/aws-k8s-tester/kubetest2/internal/metrics"
"github.com/aws/aws-sdk-go-v2/service/ec2"
"github.com/pkg/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
Expand Down Expand Up @@ -33,10 +37,11 @@ func waitForReadyNodes(client *kubernetes.Clientset, nodeCount int, timeout time
return errors.Wrap(err, "creating node watcher")
}
defer watcher.Stop()
counter, err := getReadyNodes(client)
initialReadyNodes, err := getReadyNodes(client)
if err != nil {
return errors.Wrap(err, "listing nodes")
}
counter := len(initialReadyNodes)
ctx, _ := context.WithTimeout(context.Background(), timeout)
for {
select {
Expand Down Expand Up @@ -70,27 +75,35 @@ func waitForReadyNodes(client *kubernetes.Clientset, nodeCount int, timeout time
return nil
}

func getReadyNodes(client kubernetes.Interface) (int, error) {
func getReadyNodes(client kubernetes.Interface) ([]corev1.Node, error) {
nodes, err := client.CoreV1().Nodes().List(context.TODO(), v1.ListOptions{})
if err != nil {
return 0, err
return nil, err
}
counter := 0
var readyNodes []corev1.Node
for _, node := range nodes.Items {
if isNodeReady(&node) {
counter++
readyNodes = append(readyNodes, node)
}
}
return counter, nil
return readyNodes, nil
}

func isNodeReady(node *corev1.Node) bool {
c := getNodeReadyCondition(node)
if c == nil {
return false
}
return c.Status == corev1.ConditionTrue
}

func getNodeReadyCondition(node *corev1.Node) *corev1.NodeCondition {
for _, c := range node.Status.Conditions {
if c.Type == corev1.NodeReady && c.Status == corev1.ConditionTrue {
return true
if c.Type == corev1.NodeReady {
return &c
}
}
return false
return nil
}

func createAWSAuthConfigMap(client *kubernetes.Clientset, nodeNameStrategy string, nodeRoleARN string) error {
Expand All @@ -110,3 +123,64 @@ func createAWSAuthConfigMap(client *kubernetes.Clientset, nodeNameStrategy strin
}, metav1.CreateOptions{})
return err
}

func emitNodeMetrics(metricRegistry metrics.MetricRegistry, k8sClient *kubernetes.Clientset, ec2Client *ec2.Client) error {
nodes, err := getReadyNodes(k8sClient)
if err != nil {
return err
}
var errs []error
for _, node := range nodes {
providerId, err := parseKubernetesProviderID(node.Spec.ProviderID)
if err != nil {
errs = append(errs, err)
continue
}
instanceInfo, err := ec2Client.DescribeInstances(context.TODO(), &ec2.DescribeInstancesInput{
InstanceIds: []string{providerId.InstanceID},
})
if err != nil {
errs = append(errs, err)
continue
}
launchTime := *instanceInfo.Reservations[0].Instances[0].LaunchTime
timeToRegistration := node.ObjectMeta.CreationTimestamp.Time.Sub(launchTime)
timeToReady := getNodeReadyCondition(&node).LastTransitionTime.Time.Sub(launchTime)

nodeDimensions := map[string]string{
"instanceType": node.ObjectMeta.Labels[corev1.LabelInstanceTypeStable],
"os": node.ObjectMeta.Labels[corev1.LabelOSStable],
"arch": node.ObjectMeta.Labels[corev1.LabelArchStable],
}

metricRegistry.Record(nodeTimeToRegistrationSeconds, timeToRegistration.Seconds(), nodeDimensions)
metricRegistry.Record(nodeTimeToReadySeconds, timeToReady.Seconds(), nodeDimensions)
}
return nil
}

type KubernetesProviderID struct {
AvailabilityZone string
InstanceID string
}

func parseKubernetesProviderID(rawProviderId string) (*KubernetesProviderID, error) {
url, err := url.Parse(rawProviderId)
if err != nil {
return nil, fmt.Errorf("malformed provider ID: %s", rawProviderId)
}
if url.Scheme != "aws" {
return nil, fmt.Errorf("usupported provider ID scheme: %s", url.Scheme)
}
if url.Path == "" {
return nil, fmt.Errorf("provider ID path is empty: %s", rawProviderId)
}
parts := strings.Split(url.Path, "/")
if len(parts) != 2 {
return nil, fmt.Errorf("provider ID path does not have 2 parts: %s", url.Path)
}
return &KubernetesProviderID{
AvailabilityZone: parts[0],
InstanceID: parts[1],
}, nil
}
30 changes: 30 additions & 0 deletions kubetest2/internal/deployers/eksapi/metrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
package eksapi

import (
"path"

"github.com/aws/aws-k8s-tester/kubetest2/internal/metrics"
cloudwatchtypes "github.com/aws/aws-sdk-go-v2/service/cloudwatch/types"
)

var DeployerMetricNamespace = path.Join("kubetest2", DeployerName)

var (
totalRuntimeSeconds = &metrics.MetricSpec{
Namespace: DeployerMetricNamespace,
Metric: "TotalRuntimeSeconds",
Unit: cloudwatchtypes.StandardUnitSeconds,
}

nodeTimeToRegistrationSeconds = &metrics.MetricSpec{
Namespace: DeployerMetricNamespace,
Metric: "NodeTimeToRegistrationSeconds",
Unit: cloudwatchtypes.StandardUnitSeconds,
}

nodeTimeToReadySeconds = &metrics.MetricSpec{
Namespace: DeployerMetricNamespace,
Metric: "NodeTimeToReadySeconds",
Unit: cloudwatchtypes.StandardUnitSeconds,
}
)

0 comments on commit c1eb247

Please sign in to comment.