ci: drop cloudwatch agent (#6269)
jmdeal authored May 25, 2024
1 parent 47608b1 commit b3076dc
Showing 4 changed files with 4 additions and 47 deletions.
.github/actions/e2e/run-tests-private-cluster/action.yaml (4 changes: 0 additions & 4 deletions)
@@ -125,10 +125,6 @@ runs:
- kubectl delete ec2nodeclass --all
- kubectl delete deployment --all
- PRIVATE_CLUSTER=$CLUSTER_NAME TEST_SUITE=$SUITE ENABLE_METRICS=$ENABLE_METRICS METRICS_REGION=$METRICS_REGION GIT_REF="$(git rev-parse HEAD)" CLUSTER_NAME=$CLUSTER_NAME CLUSTER_ENDPOINT="$(aws eks describe-cluster --name $CLUSTER_NAME --query "cluster.endpoint" --output text)" INTERRUPTION_QUEUE=$CLUSTER_NAME make e2etests
- aws logs put-retention-policy --log-group-name /aws/containerinsights/$CLUSTER_NAME/application --retention-in-days 30
- aws logs put-retention-policy --log-group-name /aws/containerinsights/$CLUSTER_NAME/dataplane --retention-in-days 30
- aws logs put-retention-policy --log-group-name /aws/containerinsights/$CLUSTER_NAME/host --retention-in-days 30
- aws logs put-retention-policy --log-group-name /aws/containerinsights/$CLUSTER_NAME/performance --retention-in-days 30
post_build:
commands:
# Describe karpenter pods
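For reference, the dropped buildspec commands only set a 30-day retention policy on the Container Insights log groups that the CloudWatch agent used to populate. A minimal standalone sketch equivalent to the four removed lines, assuming CLUSTER_NAME is exported and the caller has logs:PutRetentionPolicy permissions:

    # Apply a 30-day retention policy to each Container Insights log group
    # (application, dataplane, host, performance) for the test cluster.
    for GROUP in application dataplane host performance; do
      aws logs put-retention-policy \
        --log-group-name "/aws/containerinsights/${CLUSTER_NAME}/${GROUP}" \
        --retention-in-days 30
    done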
.github/actions/e2e/setup-cluster/action.yaml (29 changes: 4 additions & 25 deletions)
@@ -78,7 +78,6 @@ runs:
--capabilities CAPABILITY_NAMED_IAM \
--parameter-overrides "ClusterName=$CLUSTER_NAME" \
--tags "testing/type=e2e" "testing/cluster=$CLUSTER_NAME" "github.com/run-url=https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}" "karpenter.sh/discovery=$CLUSTER_NAME"
aws iam attach-role-policy --policy-arn arn:aws:iam::aws:policy/CloudWatchAgentServerPolicy --role-name KarpenterNodeRole-$CLUSTER_NAME
- name: create or upgrade cluster
shell: bash
env:
@@ -153,9 +152,11 @@ runs:
minSize: 2
maxSize: 2
iam:
withAddonPolicies:
cloudWatch: true
instanceRolePermissionsBoundary: "arn:aws:iam::$ACCOUNT_ID:policy/GithubActionsPermissionsBoundary"
taints:
- key: CriticalAddonsOnly
value: "true"
effect: NoSchedule
cloudWatch:
clusterLogging:
enableTypes: ["*"]
@@ -174,10 +175,6 @@
$KARPENTER_IAM
withOIDC: true
addons:
- name: amazon-cloudwatch-observability
# Pin addon version due to undiagnosed e2e failures after 1.6.0 release
version: '1.5.5-eksbuild.1'
permissionsBoundary: "arn:aws:iam::$ACCOUNT_ID:policy/GithubActionsPermissionsBoundary"
- name: vpc-cni
permissionsBoundary: "arn:aws:iam::$ACCOUNT_ID:policy/GithubActionsPermissionsBoundary"
- name: coredns
@@ -219,24 +216,6 @@
# Remove the serviceaccount created by eksctl as of 1.77.0 for pod identity associations. This service account causes our helm chart installation to fail.
# Reference: https://github.com/eksctl-io/eksctl/issues/7775
kubectl delete sa -n kube-system karpenter || true
# Adding taints after all necessary pods have scheduled to the managed node group nodes
# amazon-cloudwatch-observability pods do not tolerate CriticalAddonsOnly=true:NoSchedule and
# the amazon-cloudwatch-observability addon does not allow adding tolerations to the addon pods as part of the advanced configuration
# Overwrite existing taints to ensure we don't fail here on upgrade
kubectl taint nodes CriticalAddonsOnly=true:NoSchedule --all --overwrite
# We delete DaemonSets that we don't care about because they cause inconsistencies in scheduling due to
# dcgm-exporter and neuron-monitor selecting on specific instance types
# See https://github.com/kubernetes-sigs/karpenter/issues/715 for more detail
kubectl delete daemonsets -n amazon-cloudwatch dcgm-exporter neuron-monitor --ignore-not-found
# We patch the priorityClass onto all DaemonSets to ensure that DaemonSets always schedule to nodes so we don't get scheduling inconsistencies
# See https://karpenter.sh/docs/faq/#when-deploying-an-additional-daemonset-to-my-cluster-why-does-karpenter-not-scale-up-my-nodes-to-support-the-extra-daemonset for more detail
# Additionally, we patch an everything toleration onto the daemonsets to prevent them from being included in drain operations.
for DAEMONSET in "cloudwatch-agent" "cloudwatch-agent-windows" "fluent-bit" "fluent-bit-windows"; do
kubectl patch daemonset -n amazon-cloudwatch $DAEMONSET -p '{"spec":{"template":{"spec":{"priorityClassName":"system-node-critical","tolerations": [{"operator": "Exists"}]}}}}' --type=merge
done
- name: tag oidc provider of the cluster
if: always()
shell: bash
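Because the CriticalAddonsOnly taint is now applied through the eksctl nodegroup config above rather than a post-create kubectl taint command, any remaining system pod that still has to run on the managed node group needs a matching toleration. A minimal, hypothetical pod-spec fragment illustrating that (not part of this commit):

    # Tolerate the taint applied to the managed node group by the eksctl config.
    tolerations:
      - key: CriticalAddonsOnly
        operator: Equal
        value: "true"
        effect: NoSchedule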
.github/workflows/e2e-upgrade.yaml (9 changes: 0 additions & 9 deletions)
@@ -136,15 +136,6 @@ jobs:
url: ${{ secrets.SLACK_WEBHOOK_URL }}
suite: Upgrade
git_ref: ${{ inputs.to_git_ref }}
- name: add log retention policy
if: always() && inputs.workflow_trigger != 'private_cluster'
env:
CLUSTER_NAME: ${{ steps.generate-cluster-name.outputs.CLUSTER_NAME }}
run: |
aws logs put-retention-policy --log-group-name /aws/containerinsights/"$CLUSTER_NAME"/application --retention-in-days 30
aws logs put-retention-policy --log-group-name /aws/containerinsights/"$CLUSTER_NAME"/dataplane --retention-in-days 30
aws logs put-retention-policy --log-group-name /aws/containerinsights/"$CLUSTER_NAME"/host --retention-in-days 30
aws logs put-retention-policy --log-group-name /aws/containerinsights/"$CLUSTER_NAME"/performance --retention-in-days 30
- name: dump logs on failure
uses: ./.github/actions/e2e/dump-logs
if: failure() || cancelled()
.github/workflows/e2e.yaml (9 changes: 0 additions & 9 deletions)
@@ -188,15 +188,6 @@ jobs:
suite: ${{ inputs.suite }}
git_ref: ${{ inputs.git_ref }}
workflow_trigger: ${{ inputs.workflow_trigger }}
- name: add log retention policy
if: always() && inputs.workflow_trigger != 'private_cluster'
env:
CLUSTER_NAME: ${{ steps.generate-cluster-name.outputs.CLUSTER_NAME }}
run: |
aws logs put-retention-policy --log-group-name /aws/containerinsights/"$CLUSTER_NAME"/application --retention-in-days 30
aws logs put-retention-policy --log-group-name /aws/containerinsights/"$CLUSTER_NAME"/dataplane --retention-in-days 30
aws logs put-retention-policy --log-group-name /aws/containerinsights/"$CLUSTER_NAME"/host --retention-in-days 30
aws logs put-retention-policy --log-group-name /aws/containerinsights/"$CLUSTER_NAME"/performance --retention-in-days 30
- name: dump logs on failure
uses: ./.github/actions/e2e/dump-logs
if: (failure() || cancelled()) && inputs.workflow_trigger != 'private_cluster'
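With the retention step removed from both workflows, any Container Insights log groups left behind by earlier runs keep whatever retention they already had. A minimal sketch for auditing them, assuming the AWS CLI is configured for the test account and CLUSTER_NAME is set:

    # List leftover Container Insights log groups and their retention settings
    # (an empty retention value means the group never expires).
    aws logs describe-log-groups \
      --log-group-name-prefix "/aws/containerinsights/${CLUSTER_NAME}/" \
      --query 'logGroups[].{name: logGroupName, retention: retentionInDays}' \
      --output table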
