ci: drop cloudwatch agent (#6269)
jmdeal authored May 25, 2024
1 parent 47608b1 commit b3076dc
Showing 4 changed files with 4 additions and 47 deletions.
.github/actions/e2e/run-tests-private-cluster/action.yaml (4 changes: 0 additions & 4 deletions)
@@ -125,10 +125,6 @@ runs:
- kubectl delete ec2nodeclass --all
- kubectl delete deployment --all
- PRIVATE_CLUSTER=$CLUSTER_NAME TEST_SUITE=$SUITE ENABLE_METRICS=$ENABLE_METRICS METRICS_REGION=$METRICS_REGION GIT_REF="$(git rev-parse HEAD)" CLUSTER_NAME=$CLUSTER_NAME CLUSTER_ENDPOINT="$(aws eks describe-cluster --name $CLUSTER_NAME --query "cluster.endpoint" --output text)" INTERRUPTION_QUEUE=$CLUSTER_NAME make e2etests
- aws logs put-retention-policy --log-group-name /aws/containerinsights/$CLUSTER_NAME/application --retention-in-days 30
- aws logs put-retention-policy --log-group-name /aws/containerinsights/$CLUSTER_NAME/dataplane --retention-in-days 30
- aws logs put-retention-policy --log-group-name /aws/containerinsights/$CLUSTER_NAME/host --retention-in-days 30
- aws logs put-retention-policy --log-group-name /aws/containerinsights/$CLUSTER_NAME/performance --retention-in-days 30
post_build:
commands:
# Describe karpenter pods
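For reference, the dropped buildspec commands only set a 30-day retention policy on the Container Insights log groups that the CloudWatch agent used to populate. A minimal standalone sketch equivalent to the four removed lines, assuming CLUSTER_NAME is exported and the caller has logs:PutRetentionPolicy permissions:

    # Apply a 30-day retention policy to each Container Insights log group
    # (application, dataplane, host, performance) for the test cluster.
    for GROUP in application dataplane host performance; do
      aws logs put-retention-policy \
        --log-group-name "/aws/containerinsights/${CLUSTER_NAME}/${GROUP}" \
        --retention-in-days 30
    done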
.github/actions/e2e/setup-cluster/action.yaml (29 changes: 4 additions & 25 deletions)
@@ -78,7 +78,6 @@ runs:
--capabilities CAPABILITY_NAMED_IAM \
--parameter-overrides "ClusterName=$CLUSTER_NAME" \
--tags "testing/type=e2e" "testing/cluster=$CLUSTER_NAME" "github.com/run-url=https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}" "karpenter.sh/discovery=$CLUSTER_NAME"
aws iam attach-role-policy --policy-arn arn:aws:iam::aws:policy/CloudWatchAgentServerPolicy --role-name KarpenterNodeRole-$CLUSTER_NAME
- name: create or upgrade cluster
shell: bash
env:
@@ -153,9 +152,11 @@ runs:
minSize: 2
maxSize: 2
iam:
withAddonPolicies:
cloudWatch: true
instanceRolePermissionsBoundary: "arn:aws:iam::$ACCOUNT_ID:policy/GithubActionsPermissionsBoundary"
taints:
- key: CriticalAddonsOnly
value: "true"
effect: NoSchedule
cloudWatch:
clusterLogging:
enableTypes: ["*"]
@@ -174,10 +175,6 @@
$KARPENTER_IAM
withOIDC: true
addons:
- name: amazon-cloudwatch-observability
# Pin addon version due to undiagnosed e2e failures after 1.6.0 release
version: '1.5.5-eksbuild.1'
permissionsBoundary: "arn:aws:iam::$ACCOUNT_ID:policy/GithubActionsPermissionsBoundary"
- name: vpc-cni
permissionsBoundary: "arn:aws:iam::$ACCOUNT_ID:policy/GithubActionsPermissionsBoundary"
- name: coredns
@@ -219,24 +216,6 @@
# Remove the serviceaccount created by eksctl as of 1.77.0 for pod identity associations. This service account causes our helm chart installation to fail.
# Reference: https://github.com/eksctl-io/eksctl/issues/7775
kubectl delete sa -n kube-system karpenter || true
# Adding taints after all necessary pods have scheduled to the managed node group nodes
# amazon-cloudwatch-observability pods do not tolerate CriticalAddonsOnly=true:NoSchedule and
# the amazon-cloudwatch-observability addon does not allow adding tolerations to the addon pods as part of the advanced configuration
# Overwrite existing taints to ensure we don't fail here on upgrade
kubectl taint nodes CriticalAddonsOnly=true:NoSchedule --all --overwrite
# We delete DaemonSets that we don't care about because they cause inconsistencies in scheduling due to
# dcgm-exporter and neuron-monitor selecting on specific instance types
# See https://github.com/kubernetes-sigs/karpenter/issues/715 for more detail
kubectl delete daemonsets -n amazon-cloudwatch dcgm-exporter neuron-monitor --ignore-not-found
# We patch the priorityClass onto all DaemonSets to ensure that DaemonSets always schedule to nodes so we don't get scheduling inconsistencies
# See https://karpenter.sh/docs/faq/#when-deploying-an-additional-daemonset-to-my-cluster-why-does-karpenter-not-scale-up-my-nodes-to-support-the-extra-daemonset for more detail
# Additionally, we patch an everything toleration onto the daemonsets to prevent them from being included in drain operations.
for DAEMONSET in "cloudwatch-agent" "cloudwatch-agent-windows" "fluent-bit" "fluent-bit-windows"; do
kubectl patch daemonset -n amazon-cloudwatch $DAEMONSET -p '{"spec":{"template":{"spec":{"priorityClassName":"system-node-critical","tolerations": [{"operator": "Exists"}]}}}}' --type=merge
done
- name: tag oidc provider of the cluster
if: always()
shell: bash
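Because the CriticalAddonsOnly taint is now applied through the eksctl nodegroup config above rather than a post-create kubectl taint command, any remaining system pod that still has to run on the managed node group needs a matching toleration. A minimal, hypothetical pod-spec fragment illustrating that (not part of this commit):

    # Tolerate the taint applied to the managed node group by the eksctl config.
    tolerations:
      - key: CriticalAddonsOnly
        operator: Equal
        value: "true"
        effect: NoSchedule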
.github/workflows/e2e-upgrade.yaml (9 changes: 0 additions & 9 deletions)
@@ -136,15 +136,6 @@ jobs:
url: ${{ secrets.SLACK_WEBHOOK_URL }}
suite: Upgrade
git_ref: ${{ inputs.to_git_ref }}
- name: add log retention policy
if: always() && inputs.workflow_trigger != 'private_cluster'
env:
CLUSTER_NAME: ${{ steps.generate-cluster-name.outputs.CLUSTER_NAME }}
run: |
aws logs put-retention-policy --log-group-name /aws/containerinsights/"$CLUSTER_NAME"/application --retention-in-days 30
aws logs put-retention-policy --log-group-name /aws/containerinsights/"$CLUSTER_NAME"/dataplane --retention-in-days 30
aws logs put-retention-policy --log-group-name /aws/containerinsights/"$CLUSTER_NAME"/host --retention-in-days 30
aws logs put-retention-policy --log-group-name /aws/containerinsights/"$CLUSTER_NAME"/performance --retention-in-days 30
- name: dump logs on failure
uses: ./.github/actions/e2e/dump-logs
if: failure() || cancelled()
.github/workflows/e2e.yaml (9 changes: 0 additions & 9 deletions)
@@ -188,15 +188,6 @@ jobs:
suite: ${{ inputs.suite }}
git_ref: ${{ inputs.git_ref }}
workflow_trigger: ${{ inputs.workflow_trigger }}
- name: add log retention policy
if: always() && inputs.workflow_trigger != 'private_cluster'
env:
CLUSTER_NAME: ${{ steps.generate-cluster-name.outputs.CLUSTER_NAME }}
run: |
aws logs put-retention-policy --log-group-name /aws/containerinsights/"$CLUSTER_NAME"/application --retention-in-days 30
aws logs put-retention-policy --log-group-name /aws/containerinsights/"$CLUSTER_NAME"/dataplane --retention-in-days 30
aws logs put-retention-policy --log-group-name /aws/containerinsights/"$CLUSTER_NAME"/host --retention-in-days 30
aws logs put-retention-policy --log-group-name /aws/containerinsights/"$CLUSTER_NAME"/performance --retention-in-days 30
- name: dump logs on failure
uses: ./.github/actions/e2e/dump-logs
if: (failure() || cancelled()) && inputs.workflow_trigger != 'private_cluster'
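With the retention step removed from both workflows, any Container Insights log groups left behind by earlier runs keep whatever retention they already had. A minimal sketch for auditing them, assuming the AWS CLI is configured for the test account and CLUSTER_NAME is set:

    # List leftover Container Insights log groups and their retention settings
    # (an empty retention value means the group never expires).
    aws logs describe-log-groups \
      --log-group-name-prefix "/aws/containerinsights/${CLUSTER_NAME}/" \
      --query 'logGroups[].{name: logGroupName, retention: retentionInDays}' \
      --output table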
