Skip to content

Commit

Permalink
Revert change that causes px deploy to timeout on initial deployment (
Browse files Browse the repository at this point in the history
#1899)

Summary: Revert change that causes `px deploy` to timeout on initial
deployment

This reverts #1670, which creates a circular dependency between the
operator and the cloud connector service. The cloud connector service is
responsible for [registering a
vizier](https://github.com/pixie-io/pixie/blob/fb18345808e6fbd8aed02dac8163469322b84205/src/vizier/services/cloud_connector/bridge/server.go#L290-L302)
and populating the `pl-cluster-secrets` k8s secret with its vizier ID.
The operator is responsible for creating the vizier services (including
the cloud connector), so [this
call](https://github.com/pixie-io/pixie/blob/fb18345808e6fbd8aed02dac8163469322b84205/src/operator/controllers/vizier_controller.go#L503)
will never succeed on a fresh install.

This bug doesn't cause pixie installs to fail completely. A `px deploy`
cli command will time out after 10 minutes and then the vizier is
deployed following that timeout expiration. While it eventually
converges to a healthy vizier, this is a poor user experience. The perf
tool is also experiencing this problem, but because it requires `px
deploy` to return a successful status code it is causing it to fail
completely.

Relevant Issues: Reopens #1632

Type of change: /kind bug

Test Plan: Reverted this change and verified that the a `skaffold`'ed
operator doesn't timeout

Signed-off-by: Dom Del Nano <[email protected]>
  • Loading branch information
ddelnano authored May 13, 2024
1 parent fb18345 commit 66b0515
Show file tree
Hide file tree
Showing 2 changed files with 2 additions and 44 deletions.
2 changes: 0 additions & 2 deletions src/operator/controllers/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -29,13 +29,11 @@ go_library(
visibility = ["//visibility:public"],
deps = [
"//src/api/proto/cloudpb:cloudapi_pl_go_proto",
"//src/api/proto/uuidpb:uuid_pl_go_proto",
"//src/api/proto/vizierconfigpb:vizier_pl_go_proto",
"//src/operator/apis/px.dev/v1alpha1",
"//src/shared/goversion",
"//src/shared/services",
"//src/shared/status",
"//src/utils",
"//src/utils/shared/certs",
"//src/utils/shared/k8s",
"@com_github_blang_semver//:semver",
Expand Down
44 changes: 2 additions & 42 deletions src/operator/controllers/vizier_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,13 +48,11 @@ import (
"sigs.k8s.io/controller-runtime/pkg/client"

"px.dev/pixie/src/api/proto/cloudpb"
"px.dev/pixie/src/api/proto/uuidpb"
"px.dev/pixie/src/api/proto/vizierconfigpb"
"px.dev/pixie/src/operator/apis/px.dev/v1alpha1"
version "px.dev/pixie/src/shared/goversion"
"px.dev/pixie/src/shared/services"
"px.dev/pixie/src/shared/status"
"px.dev/pixie/src/utils"
"px.dev/pixie/src/utils/shared/certs"
"px.dev/pixie/src/utils/shared/k8s"
)
Expand Down Expand Up @@ -499,13 +497,7 @@ func (r *VizierReconciler) deployVizier(ctx context.Context, req ctrl.Request, v
return err
}

// Get the Vizier's ID from the cluster's secrets.
vizierID, err := getVizierID(r.Clientset, req.Namespace)
if err != nil {
log.WithError(err).Error("Failed to retrieve the Vizier ID from the cluster's secrets")
}

configForVizierResp, err := generateVizierYAMLsConfig(ctx, req.Namespace, r.K8sVersion, vizierID, vz, cloudClient)
configForVizierResp, err := generateVizierYAMLsConfig(ctx, req.Namespace, r.K8sVersion, vz, cloudClient)
if err != nil {
log.WithError(err).Error("Failed to generate configs for Vizier YAMLs")
return err
Expand Down Expand Up @@ -817,14 +809,13 @@ func convertResourceType(originalLst v1.ResourceList) *vizierconfigpb.ResourceLi

// generateVizierYAMLsConfig is responsible retrieving a yaml map of configurations from
// Pixie Cloud.
func generateVizierYAMLsConfig(ctx context.Context, ns string, k8sVersion string, vizierID *uuidpb.UUID, vz *v1alpha1.Vizier, conn *grpc.ClientConn) (*cloudpb.ConfigForVizierResponse,
func generateVizierYAMLsConfig(ctx context.Context, ns string, k8sVersion string, vz *v1alpha1.Vizier, conn *grpc.ClientConn) (*cloudpb.ConfigForVizierResponse,
error) {
client := cloudpb.NewConfigServiceClient(conn)

req := &cloudpb.ConfigForVizierRequest{
Namespace: ns,
K8sVersion: k8sVersion,
VizierID: vizierID,
VzSpec: &vizierconfigpb.VizierSpec{
Version: vz.Spec.Version,
DeployKey: vz.Spec.DeployKey,
Expand Down Expand Up @@ -1143,37 +1134,6 @@ func getClusterUID(clientset *kubernetes.Clientset) (string, error) {
return string(ksNS.UID), nil
}

// getVizierID gets the ID of the cluster the Vizier is in.
func getVizierID(clientset *kubernetes.Clientset, namespace string) (*uuidpb.UUID, error) {
op := func() (*uuidpb.UUID, error) {
var vizierID *uuidpb.UUID
s := k8s.GetSecret(clientset, namespace, "pl-cluster-secrets")
if s == nil {
return nil, errors.New("Missing cluster secrets, retrying again")
}
if id, ok := s.Data["cluster-id"]; ok {
vizierID = utils.ProtoFromUUIDStrOrNil(string(id))
if vizierID == nil {
return nil, errors.New("Couldn't convert ID to proto")
}
}

return vizierID, nil
}

expBackoff := backoff.NewExponentialBackOff()
expBackoff.InitialInterval = 10 * time.Second
expBackoff.Multiplier = 2
expBackoff.MaxElapsedTime = 10 * time.Minute

vizierID, err := backoff.RetryWithData(op, expBackoff)
if err != nil {
return nil, errors.New("Timed out waiting for the Vizier ID")
}

return vizierID, nil
}

// getConfigForOperator is responsible retrieving the Operator config from from Pixie Cloud.
func getConfigForOperator(ctx context.Context, conn *grpc.ClientConn) (*cloudpb.ConfigForOperatorResponse, error) {
client := cloudpb.NewConfigServiceClient(conn)
Expand Down

0 comments on commit 66b0515

Please sign in to comment.