From 71c8d8a7b07bc76bac035d010429305b1386ee01 Mon Sep 17 00:00:00 2001 From: Cameron Meissner Date: Thu, 10 Oct 2024 08:57:50 -0700 Subject: [PATCH] cleanup: remove unused self-contained content (#5060) Co-authored-by: Cameron Meissner --- .pipelines/scripts/verify_shell.sh | 2 +- self-contained/bootstrap_cmd.sh | 156 ------- self-contained/bootstrap_config.sh | 697 ---------------------------- self-contained/bootstrap_helpers.sh | 400 ---------------- self-contained/bootstrap_install.sh | 531 --------------------- self-contained/bootstrap_main.sh | 424 ----------------- self-contained/bootstrap_start.sh | 96 ---- 7 files changed, 1 insertion(+), 2305 deletions(-) delete mode 100644 self-contained/bootstrap_cmd.sh delete mode 100755 self-contained/bootstrap_config.sh delete mode 100755 self-contained/bootstrap_helpers.sh delete mode 100755 self-contained/bootstrap_install.sh delete mode 100755 self-contained/bootstrap_main.sh delete mode 100644 self-contained/bootstrap_start.sh diff --git a/.pipelines/scripts/verify_shell.sh b/.pipelines/scripts/verify_shell.sh index d6b9fefa121..29abae2b0ed 100755 --- a/.pipelines/scripts/verify_shell.sh +++ b/.pipelines/scripts/verify_shell.sh @@ -25,7 +25,7 @@ else echo "shellcheck installed" fi -filesToCheck=$(find . -type f -name "*.sh" -not -path './parts/linux/cloud-init/artifacts/*' -not -path './pkg/agent/testdata/*' -not -path './vendor/*' -not -path './hack/tools/vendor/*' -not -path './.git/*' -not -path './self-contained/*' -not -path './hack/tools/bin/shellspecsrc/*') +filesToCheck=$(find . -type f -name "*.sh" -not -path './parts/linux/cloud-init/artifacts/*' -not -path './pkg/agent/testdata/*' -not -path './vendor/*' -not -path './hack/tools/vendor/*' -not -path './.git/*' -not -path './hack/tools/bin/shellspecsrc/*') # also shell-check generated test data generatedTestData=$(find ./pkg/agent/testdata -type f -name "*.sh" ) diff --git a/self-contained/bootstrap_cmd.sh b/self-contained/bootstrap_cmd.sh deleted file mode 100644 index b7fd3969594..00000000000 --- a/self-contained/bootstrap_cmd.sh +++ /dev/null @@ -1,156 +0,0 @@ -PROVISION_OUTPUT="/var/log/azure/cluster-provision-cse-output.log"; -echo $(date),$(hostname) > ${PROVISION_OUTPUT}; -{{if ShouldEnableCustomData}} -cloud-init status --wait > /dev/null 2>&1; -[ $? -ne 0 ] && echo 'cloud-init failed' >> ${PROVISION_OUTPUT} && exit 1; -echo "cloud-init succeeded" >> ${PROVISION_OUTPUT}; -{{end}} -{{if IsAKSCustomCloud}} -REPO_DEPOT_ENDPOINT="{{AKSCustomCloudRepoDepotEndpoint}}" -{{GetInitAKSCustomCloudFilepath}} >> /var/log/azure/cluster-provision.log 2>&1; -{{end}} -ADMINUSER={{GetParameter "linuxAdminUsername"}} -MOBY_VERSION={{GetParameter "mobyVersion"}} -TENANT_ID={{GetVariable "tenantID"}} -KUBERNETES_VERSION={{GetParameter "kubernetesVersion"}} -HYPERKUBE_URL={{GetParameter "kubernetesHyperkubeSpec"}} -KUBE_BINARY_URL={{GetParameter "kubeBinaryURL"}} -CUSTOM_KUBE_BINARY_URL={{GetParameter "customKubeBinaryURL"}} -PRIVATE_KUBE_BINARY_URL="{{GetLinuxPrivatePackageURL}}" -KUBEPROXY_URL={{GetParameter "kubeProxySpec"}} -APISERVER_PUBLIC_KEY={{GetParameter "apiServerCertificate"}} -SUBSCRIPTION_ID={{GetVariable "subscriptionId"}} -RESOURCE_GROUP={{GetVariable "resourceGroup"}} -LOCATION={{GetVariable "location"}} -VM_TYPE={{GetVariable "vmType"}} -SUBNET={{GetVariable "subnetName"}} -NETWORK_SECURITY_GROUP={{GetVariable "nsgName"}} -VIRTUAL_NETWORK={{GetVariable "virtualNetworkName"}} -VIRTUAL_NETWORK_RESOURCE_GROUP={{GetVariable "virtualNetworkResourceGroupName"}} -ROUTE_TABLE={{GetVariable "routeTableName"}} -PRIMARY_AVAILABILITY_SET={{GetVariable "primaryAvailabilitySetName"}} -PRIMARY_SCALE_SET={{GetVariable "primaryScaleSetName"}} -SERVICE_PRINCIPAL_CLIENT_ID={{GetParameter "servicePrincipalClientId"}} -NETWORK_PLUGIN={{GetParameter "networkPlugin"}} -NETWORK_POLICY={{GetParameter "networkPolicy"}} -VNET_CNI_PLUGINS_URL={{GetParameter "vnetCniLinuxPluginsURL"}} -CLOUDPROVIDER_BACKOFF={{GetParameterProperty "cloudproviderConfig" "cloudProviderBackoff"}} -CLOUDPROVIDER_BACKOFF_MODE={{GetParameterProperty "cloudproviderConfig" "cloudProviderBackoffMode"}} -CLOUDPROVIDER_BACKOFF_RETRIES={{GetParameterProperty "cloudproviderConfig" "cloudProviderBackoffRetries"}} -CLOUDPROVIDER_BACKOFF_EXPONENT={{GetParameterProperty "cloudproviderConfig" "cloudProviderBackoffExponent"}} -CLOUDPROVIDER_BACKOFF_DURATION={{GetParameterProperty "cloudproviderConfig" "cloudProviderBackoffDuration"}} -CLOUDPROVIDER_BACKOFF_JITTER={{GetParameterProperty "cloudproviderConfig" "cloudProviderBackoffJitter"}} -CLOUDPROVIDER_RATELIMIT={{GetParameterProperty "cloudproviderConfig" "cloudProviderRateLimit"}} -CLOUDPROVIDER_RATELIMIT_QPS={{GetParameterProperty "cloudproviderConfig" "cloudProviderRateLimitQPS"}} -CLOUDPROVIDER_RATELIMIT_QPS_WRITE={{GetParameterProperty "cloudproviderConfig" "cloudProviderRateLimitQPSWrite"}} -CLOUDPROVIDER_RATELIMIT_BUCKET={{GetParameterProperty "cloudproviderConfig" "cloudProviderRateLimitBucket"}} -CLOUDPROVIDER_RATELIMIT_BUCKET_WRITE={{GetParameterProperty "cloudproviderConfig" "cloudProviderRateLimitBucketWrite"}} -LOAD_BALANCER_DISABLE_OUTBOUND_SNAT={{GetParameterProperty "cloudproviderConfig" "cloudProviderDisableOutboundSNAT"}} -USE_MANAGED_IDENTITY_EXTENSION={{GetVariable "useManagedIdentityExtension"}} -USE_INSTANCE_METADATA={{GetVariable "useInstanceMetadata"}} -LOAD_BALANCER_SKU={{GetVariable "loadBalancerSku"}} -EXCLUDE_MASTER_FROM_STANDARD_LB={{GetVariable "excludeMasterFromStandardLB"}} -MAXIMUM_LOADBALANCER_RULE_COUNT={{GetVariable "maximumLoadBalancerRuleCount"}} -CONTAINER_RUNTIME={{GetParameter "containerRuntime"}} -CLI_TOOL={{GetParameter "cliTool"}} -CONTAINERD_DOWNLOAD_URL_BASE={{GetParameter "containerdDownloadURLBase"}} -NETWORK_MODE={{GetParameter "networkMode"}} -KUBE_BINARY_URL={{GetParameter "kubeBinaryURL"}} -USER_ASSIGNED_IDENTITY_ID={{GetVariable "userAssignedIdentityID"}} -API_SERVER_NAME={{GetKubernetesEndpoint}} -IS_VHD={{GetVariable "isVHD"}} -GPU_NODE={{GetVariable "gpuNode"}} -SGX_NODE={{GetVariable "sgxNode"}} -MIG_NODE={{GetVariable "migNode"}} -CONFIG_GPU_DRIVER_IF_NEEDED={{GetVariable "configGPUDriverIfNeeded"}} -ENABLE_GPU_DEVICE_PLUGIN_IF_NEEDED={{GetVariable "enableGPUDevicePluginIfNeeded"}} -TELEPORTD_PLUGIN_DOWNLOAD_URL={{GetParameter "teleportdPluginURL"}} -CONTAINERD_VERSION={{GetParameter "containerdVersion"}} -CONTAINERD_PACKAGE_URL={{GetParameter "containerdPackageURL"}} -RUNC_VERSION={{GetParameter "runcVersion"}} -RUNC_PACKAGE_URL={{GetParameter "runcPackageURL"}} -ENABLE_HOSTS_CONFIG_AGENT="{{EnableHostsConfigAgent}}" -DISABLE_SSH="{{ShouldDisableSSH}}" -NEEDS_CONTAINERD="{{NeedsContainerd}}" -TELEPORT_ENABLED="{{TeleportEnabled}}" -SHOULD_CONFIGURE_HTTP_PROXY="{{ShouldConfigureHTTPProxy}}" -SHOULD_CONFIGURE_HTTP_PROXY_CA="{{ShouldConfigureHTTPProxyCA}}" -HTTP_PROXY_TRUSTED_CA="{{GetHTTPProxyCA}}" -SHOULD_CONFIGURE_CUSTOM_CA_TRUST="{{ShouldConfigureCustomCATrust}}" -CUSTOM_CA_TRUST_COUNT="{{len GetCustomCATrustConfigCerts}}" -{{range $i, $cert := GetCustomCATrustConfigCerts}} -CUSTOM_CA_CERT_{{$i}}="{{$cert}}" -{{end}} -IS_KRUSTLET="{{IsKrustlet}}" -GPU_NEEDS_FABRIC_MANAGER="{{GPUNeedsFabricManager}}" -#NEEDS_DOCKER_LOGIN="{{and IsDockerContainerRuntime HasPrivateAzureRegistryServer}}" This field is no longer required for the new contract since Docker is out of support and its value depends on Container Runtime = Docker -IPV6_DUAL_STACK_ENABLED="{{IsIPv6DualStackFeatureEnabled}}" -OUTBOUND_COMMAND="{{GetOutboundCommand}}" -ENABLE_UNATTENDED_UPGRADES="{{EnableUnattendedUpgrade}}" -ENSURE_NO_DUPE_PROMISCUOUS_BRIDGE="{{ and NeedsContainerd IsKubenet (not HasCalicoNetworkPolicy) }}" -SHOULD_CONFIG_SWAP_FILE="{{ShouldConfigSwapFile}}" -SHOULD_CONFIG_TRANSPARENT_HUGE_PAGE="{{ShouldConfigTransparentHugePage}}" -SHOULD_CONFIG_CONTAINERD_ULIMITS="{{ShouldConfigContainerdUlimits}}" -CONTAINERD_ULIMITS="{{GetContainerdUlimitString}}" -{{/* both CLOUD and ENVIRONMENT have special values when IsAKSCustomCloud == true */}} -{{/* CLOUD uses AzureStackCloud and seems to be used by kubelet, k8s cloud provider */}} -{{/* target environment seems to go to ARM SDK config */}} -{{/* not sure why separate/inconsistent? */}} -{{/* see GetCustomEnvironmentJSON for more weirdness. */}} -TARGET_CLOUD="{{- if IsAKSCustomCloud -}} AzureStackCloud {{- else -}} {{GetTargetEnvironment}} {{- end -}}" -TARGET_ENVIRONMENT="{{GetTargetEnvironment}}" -CUSTOM_ENV_JSON="{{GetBase64EncodedEnvironmentJSON}}" -IS_CUSTOM_CLOUD="{{IsAKSCustomCloud}}" -CSE_HELPERS_FILEPATH="{{GetCSEHelpersScriptFilepath}}" -CSE_DISTRO_HELPERS_FILEPATH="{{GetCSEHelpersScriptDistroFilepath}}" -CSE_INSTALL_FILEPATH="{{GetCSEInstallScriptFilepath}}" -CSE_DISTRO_INSTALL_FILEPATH="{{GetCSEInstallScriptDistroFilepath}}" -CSE_CONFIG_FILEPATH="{{GetCSEConfigScriptFilepath}}" -AZURE_PRIVATE_REGISTRY_SERVER="{{GetPrivateAzureRegistryServer}}" -HAS_CUSTOM_SEARCH_DOMAIN="{{HasCustomSearchDomain}}" -CUSTOM_SEARCH_DOMAIN_FILEPATH="{{GetCustomSearchDomainsCSEScriptFilepath}}" -HTTP_PROXY_URLS="{{GetHTTPProxy}}" -HTTPS_PROXY_URLS="{{GetHTTPSProxy}}" -NO_PROXY_URLS="{{GetNoProxy}}" -PROXY_VARS="{{GetProxyVariables}}" -ENABLE_TLS_BOOTSTRAPPING="{{EnableTLSBootstrapping}}" -ENABLE_SECURE_TLS_BOOTSTRAPPING="{{EnableSecureTLSBootstrapping}}" -DHCPV6_SERVICE_FILEPATH="{{GetDHCPv6ServiceCSEScriptFilepath}}" -DHCPV6_CONFIG_FILEPATH="{{GetDHCPv6ConfigCSEScriptFilepath}}" -THP_ENABLED="{{GetTransparentHugePageEnabled}}" -THP_DEFRAG="{{GetTransparentHugePageDefrag}}" -SERVICE_PRINCIPAL_FILE_CONTENT="{{GetServicePrincipalSecret}}" -KUBELET_CLIENT_CONTENT="{{GetKubeletClientKey}}" -KUBELET_CLIENT_CERT_CONTENT="{{GetKubeletClientCert}}" -KUBELET_CONFIG_FILE_ENABLED="{{IsKubeletConfigFileEnabled}}" -KUBELET_CONFIG_FILE_CONTENT="{{GetKubeletConfigFileContentBase64}}" -SWAP_FILE_SIZE_MB="{{GetSwapFileSizeMB}}" -GPU_DRIVER_VERSION="{{GPUDriverVersion}}" -GPU_INSTANCE_PROFILE="{{GetGPUInstanceProfile}}" -CUSTOM_SEARCH_DOMAIN_NAME="{{GetSearchDomainName}}" -CUSTOM_SEARCH_REALM_USER="{{GetSearchDomainRealmUser}}" -CUSTOM_SEARCH_REALM_PASSWORD="{{GetSearchDomainRealmPassword}}" -MESSAGE_OF_THE_DAY="{{GetMessageOfTheDay}}" -HAS_KUBELET_DISK_TYPE="{{HasKubeletDiskType}}" -NEEDS_CGROUPV2="{{IsCgroupV2}}" -TLS_BOOTSTRAP_TOKEN="{{GetTLSBootstrapTokenForKubeConfig}}" -KUBELET_FLAGS="{{GetKubeletConfigKeyVals}}" -NETWORK_POLICY="{{GetParameter "networkPolicy"}}" -{{- if not (IsKubernetesVersionGe "1.17.0")}} -KUBELET_IMAGE="{{GetHyperkubeImageReference}}" -{{end}} -{{if IsKubernetesVersionGe "1.16.0"}} -KUBELET_NODE_LABELS="{{GetAgentKubernetesLabels . }}" -{{else}} -KUBELET_NODE_LABELS="{{GetAgentKubernetesLabelsDeprecated . }}" -{{end}} -AZURE_ENVIRONMENT_FILEPATH="{{- if IsAKSCustomCloud}}/etc/kubernetes/{{GetTargetEnvironment}}.json{{end}}" -KUBE_CA_CRT="{{GetParameter "caCertificate"}}" -KUBENET_TEMPLATE="{{GetKubenetTemplate}}" -CONTAINERD_CONFIG_CONTENT="{{GetContainerdConfigContent}}" -CONTAINERD_CONFIG_NO_GPU_CONTENT="{{GetContainerdConfigNoGPUContent}}" -IS_KATA="{{IsKata}}" -ARTIFACT_STREAMING_ENABLED="{{IsArtifactStreamingEnabled}}" -SYSCTL_CONTENT="{{GetSysctlContent}}" -PRIVATE_EGRESS_PROXY_ADDRESS="{{GetPrivateEgressProxyAddress}}" -/usr/bin/nohup /bin/bash -c "/bin/bash /opt/azure/containers/provision_start.sh" \ No newline at end of file diff --git a/self-contained/bootstrap_config.sh b/self-contained/bootstrap_config.sh deleted file mode 100755 index 3a8c157cef5..00000000000 --- a/self-contained/bootstrap_config.sh +++ /dev/null @@ -1,697 +0,0 @@ -#!/bin/bash -NODE_INDEX=$(hostname | tail -c 2) -NODE_NAME=$(hostname) - -configureAdminUser(){ - chage -E -1 -I -1 -m 0 -M 99999 "${ADMINUSER}" - chage -l "${ADMINUSER}" -} - -configPrivateClusterHosts() { - mkdir -p /etc/systemd/system/reconcile-private-hosts.service.d/ - touch /etc/systemd/system/reconcile-private-hosts.service.d/10-fqdn.conf - tee /etc/systemd/system/reconcile-private-hosts.service.d/10-fqdn.conf > /dev/null < /sys/kernel/mm/transparent_hugepage/enabled - echo "kernel/mm/transparent_hugepage/enabled=${THP_ENABLED}" >> ${ETC_SYSFS_CONF} - fi - if [[ "${THP_DEFRAG}" != "" ]]; then - echo "${THP_DEFRAG}" > /sys/kernel/mm/transparent_hugepage/defrag - echo "kernel/mm/transparent_hugepage/defrag=${THP_DEFRAG}" >> ${ETC_SYSFS_CONF} - fi -} - -configureSwapFile() { - # https://learn.microsoft.com/en-us/troubleshoot/azure/virtual-machines/troubleshoot-device-names-problems#identify-disk-luns - swap_size_kb=$(expr ${SWAP_FILE_SIZE_MB} \* 1000) - swap_location="" - - # Attempt to use the resource disk - if [[ -L /dev/disk/azure/resource-part1 ]]; then - resource_disk_path=$(findmnt -nr -o target -S $(readlink -f /dev/disk/azure/resource-part1)) - disk_free_kb=$(df ${resource_disk_path} | sed 1d | awk '{print $4}') - if [[ ${disk_free_kb} -gt ${swap_size_kb} ]]; then - echo "Will use resource disk for swap file" - swap_location=${resource_disk_path}/swapfile - else - echo "Insufficient disk space on resource disk to create swap file: request ${swap_size_kb} free ${disk_free_kb}, attempting to fall back to OS disk..." - fi - fi - - # If we couldn't use the resource disk, attempt to use the OS disk - if [[ -z "${swap_location}" ]]; then - # Directly check size on the root directory since we can't rely on 'root-part1' always being the correct label - os_device=$(readlink -f /dev/disk/azure/root) - disk_free_kb=$(df -P / | sed 1d | awk '{print $4}') - if [[ ${disk_free_kb} -gt ${swap_size_kb} ]]; then - echo "Will use OS disk for swap file" - swap_location=/swapfile - else - echo "Insufficient disk space on OS device ${os_device} to create swap file: request ${swap_size_kb} free ${disk_free_kb}" - exit $ERR_SWAP_CREATE_INSUFFICIENT_DISK_SPACE - fi - fi - - echo "Swap file will be saved to: ${swap_location}" - retrycmd_if_failure 24 5 25 fallocate -l ${swap_size_kb}K ${swap_location} || exit $ERR_SWAP_CREATE_FAIL - chmod 600 ${swap_location} - retrycmd_if_failure 24 5 25 mkswap ${swap_location} || exit $ERR_SWAP_CREATE_FAIL - retrycmd_if_failure 24 5 25 swapon ${swap_location} || exit $ERR_SWAP_CREATE_FAIL - retrycmd_if_failure 24 5 25 swapon --show | grep ${swap_location} || exit $ERR_SWAP_CREATE_FAIL - echo "${swap_location} none swap sw 0 0" >> /etc/fstab -} - -configureEtcEnvironment() { - mkdir -p /etc/systemd/system.conf.d/ - touch /etc/systemd/system.conf.d/proxy.conf - chmod 0644 /etc/systemd/system.conf.d/proxy.conf - - mkdir -p /etc/apt/apt.conf.d - touch /etc/apt/apt.conf.d/95proxy - chmod 0644 /etc/apt/apt.conf.d/95proxy - - # TODO(ace): this pains me but quick and dirty refactor - echo "[Manager]" >> /etc/systemd/system.conf.d/proxy.conf - if [ "${HTTP_PROXY_URLS}" != "" ]; then - echo "HTTP_PROXY=${HTTP_PROXY_URLS}" >> /etc/environment - echo "http_proxy=${HTTP_PROXY_URLS}" >> /etc/environment - echo "Acquire::http::proxy \"${HTTP_PROXY_URLS}\";" >> /etc/apt/apt.conf.d/95proxy - echo "DefaultEnvironment=\"HTTP_PROXY=${HTTP_PROXY_URLS}\"" >> /etc/systemd/system.conf.d/proxy.conf - echo "DefaultEnvironment=\"http_proxy=${HTTP_PROXY_URLS}\"" >> /etc/systemd/system.conf.d/proxy.conf - fi - if [ "${HTTPS_PROXY_URLS}" != "" ]; then - echo "HTTPS_PROXY=${HTTPS_PROXY_URLS}" >> /etc/environment - echo "https_proxy=${HTTPS_PROXY_URLS}" >> /etc/environment - echo "Acquire::https::proxy \"${HTTPS_PROXY_URLS}\";" >> /etc/apt/apt.conf.d/95proxy - echo "DefaultEnvironment=\"HTTPS_PROXY=${HTTPS_PROXY_URLS}\"" >> /etc/systemd/system.conf.d/proxy.conf - echo "DefaultEnvironment=\"https_proxy=${HTTPS_PROXY_URLS}\"" >> /etc/systemd/system.conf.d/proxy.conf - fi - if [ "${NO_PROXY_URLS}" != "" ]; then - echo "NO_PROXY=${NO_PROXY_URLS}" >> /etc/environment - echo "no_proxy=${NO_PROXY_URLS}" >> /etc/environment - echo "DefaultEnvironment=\"NO_PROXY=${NO_PROXY_URLS}\"" >> /etc/systemd/system.conf.d/proxy.conf - echo "DefaultEnvironment=\"no_proxy=${NO_PROXY_URLS}\"" >> /etc/systemd/system.conf.d/proxy.conf - fi - - # for kubelet to pick up the proxy - mkdir -p "/etc/systemd/system/kubelet.service.d" - tee "/etc/systemd/system/kubelet.service.d/10-httpproxy.conf" > /dev/null <<'EOF' -[Service] -EnvironmentFile=/etc/environment -EOF -} - -configureHTTPProxyCA() { - if isMarinerOrAzureLinux "$OS"; then - cert_dest="/usr/share/pki/ca-trust-source/anchors" - update_cmd="update-ca-trust" - else - cert_dest="/usr/local/share/ca-certificates" - update_cmd="update-ca-certificates" - fi - echo "${HTTP_PROXY_TRUSTED_CA}" | base64 -d > "${cert_dest}/proxyCA.crt" || exit $ERR_UPDATE_CA_CERTS - $update_cmd || exit $ERR_UPDATE_CA_CERTS -} - -configureCustomCaCertificate() { - mkdir -p /opt/certs - for i in $(seq 0 $((${CUSTOM_CA_TRUST_COUNT} - 1))); do - # directly referring to the variable as "${CUSTOM_CA_CERT_${i}}" - # causes bad substitution errors in bash - # dynamically declare and use `!` to add a layer of indirection - declare varname=CUSTOM_CA_CERT_${i} - echo "${!varname}" | base64 -d > /opt/certs/00000000000000cert${i}.crt - done - # This will block until the service is considered active. - # Update_certs.service is a oneshot type of unit that - # is considered active when the ExecStart= command terminates with a zero status code. - systemctl restart update_certs.service || exit $ERR_UPDATE_CA_CERTS - # after new certs are added to trust store, containerd will not pick them up properly before restart. - # aim here is to have this working straight away for a freshly provisioned node - # so we force a restart after the certs are updated - # custom CA daemonset copies certs passed by the user to the node, what then triggers update_certs.path unit - # path unit then triggers the script that copies over cert files to correct location on the node and updates the trust store - # as a part of this flow we could restart containerd everytime a new cert is added to the trust store using custom CA - systemctl restart containerd -} - -configureContainerdUlimits() { - CONTAINERD_ULIMIT_DROP_IN_FILE_PATH="/etc/systemd/system/containerd.service.d/set_ulimits.conf" - touch "${CONTAINERD_ULIMIT_DROP_IN_FILE_PATH}" - chmod 0600 "${CONTAINERD_ULIMIT_DROP_IN_FILE_PATH}" - tee "${CONTAINERD_ULIMIT_DROP_IN_FILE_PATH}" > /dev/null < /etc/kubernetes/certs/client.key - fi - if [ -n "${KUBELET_CLIENT_CERT_CONTENT}" ]; then - echo "${KUBELET_CLIENT_CERT_CONTENT}" | base64 -d > /etc/kubernetes/certs/client.crt - fi - if [ -n "${SERVICE_PRINCIPAL_FILE_CONTENT}" ]; then - echo "${SERVICE_PRINCIPAL_FILE_CONTENT}" | base64 -d > /etc/kubernetes/sp.txt - fi - - echo "${APISERVER_PUBLIC_KEY}" | base64 --decode > "${APISERVER_PUBLIC_KEY_PATH}" - # Perform the required JSON escaping - SP_FILE="/etc/kubernetes/sp.txt" - SERVICE_PRINCIPAL_CLIENT_SECRET="$(cat "$SP_FILE")" - SERVICE_PRINCIPAL_CLIENT_SECRET=${SERVICE_PRINCIPAL_CLIENT_SECRET//\\/\\\\} - SERVICE_PRINCIPAL_CLIENT_SECRET=${SERVICE_PRINCIPAL_CLIENT_SECRET//\"/\\\"} - rm "$SP_FILE" # unneeded after reading from disk. - cat << EOF > "${AZURE_JSON_PATH}" -{ - "cloud": "${TARGET_CLOUD}", - "tenantId": "${TENANT_ID}", - "subscriptionId": "${SUBSCRIPTION_ID}", - "aadClientId": "${SERVICE_PRINCIPAL_CLIENT_ID}", - "aadClientSecret": "${SERVICE_PRINCIPAL_CLIENT_SECRET}", - "resourceGroup": "${RESOURCE_GROUP}", - "location": "${LOCATION}", - "vmType": "${VM_TYPE}", - "subnetName": "${SUBNET}", - "securityGroupName": "${NETWORK_SECURITY_GROUP}", - "vnetName": "${VIRTUAL_NETWORK}", - "vnetResourceGroup": "${VIRTUAL_NETWORK_RESOURCE_GROUP}", - "routeTableName": "${ROUTE_TABLE}", - "primaryAvailabilitySetName": "${PRIMARY_AVAILABILITY_SET}", - "primaryScaleSetName": "${PRIMARY_SCALE_SET}", - "cloudProviderBackoffMode": "${CLOUDPROVIDER_BACKOFF_MODE}", - "cloudProviderBackoff": ${CLOUDPROVIDER_BACKOFF}, - "cloudProviderBackoffRetries": ${CLOUDPROVIDER_BACKOFF_RETRIES}, - "cloudProviderBackoffExponent": ${CLOUDPROVIDER_BACKOFF_EXPONENT}, - "cloudProviderBackoffDuration": ${CLOUDPROVIDER_BACKOFF_DURATION}, - "cloudProviderBackoffJitter": ${CLOUDPROVIDER_BACKOFF_JITTER}, - "cloudProviderRateLimit": ${CLOUDPROVIDER_RATELIMIT}, - "cloudProviderRateLimitQPS": ${CLOUDPROVIDER_RATELIMIT_QPS}, - "cloudProviderRateLimitBucket": ${CLOUDPROVIDER_RATELIMIT_BUCKET}, - "cloudProviderRateLimitQPSWrite": ${CLOUDPROVIDER_RATELIMIT_QPS_WRITE}, - "cloudProviderRateLimitBucketWrite": ${CLOUDPROVIDER_RATELIMIT_BUCKET_WRITE}, - "useManagedIdentityExtension": ${USE_MANAGED_IDENTITY_EXTENSION}, - "userAssignedIdentityID": "${USER_ASSIGNED_IDENTITY_ID}", - "useInstanceMetadata": ${USE_INSTANCE_METADATA}, - "loadBalancerSku": "${LOAD_BALANCER_SKU}", - "disableOutboundSNAT": ${LOAD_BALANCER_DISABLE_OUTBOUND_SNAT}, - "excludeMasterFromStandardLB": ${EXCLUDE_MASTER_FROM_STANDARD_LB}, - "providerVaultName": "${KMS_PROVIDER_VAULT_NAME}", - "maximumLoadBalancerRuleCount": ${MAXIMUM_LOADBALANCER_RULE_COUNT}, - "providerKeyName": "k8s", - "providerKeyVersion": "" -} -EOF - set -x - if [[ "${CLOUDPROVIDER_BACKOFF_MODE}" = "v2" ]]; then - sed -i "/cloudProviderBackoffExponent/d" /etc/kubernetes/azure.json - sed -i "/cloudProviderBackoffJitter/d" /etc/kubernetes/azure.json - fi - - configureKubeletServerCert - if [ "${IS_CUSTOM_CLOUD}" == "true" ]; then - set +x - AKS_CUSTOM_CLOUD_JSON_PATH="/etc/kubernetes/${TARGET_ENVIRONMENT}.json" - touch "${AKS_CUSTOM_CLOUD_JSON_PATH}" - chmod 0600 "${AKS_CUSTOM_CLOUD_JSON_PATH}" - chown root:root "${AKS_CUSTOM_CLOUD_JSON_PATH}" - - echo "${CUSTOM_ENV_JSON}" | base64 -d > "${AKS_CUSTOM_CLOUD_JSON_PATH}" - set -x - fi - - if [ "${KUBELET_CONFIG_FILE_ENABLED}" == "true" ]; then - set +x - KUBELET_CONFIG_JSON_PATH="/etc/default/kubeletconfig.json" - touch "${KUBELET_CONFIG_JSON_PATH}" - chmod 0600 "${KUBELET_CONFIG_JSON_PATH}" - chown root:root "${KUBELET_CONFIG_JSON_PATH}" - echo "${KUBELET_CONFIG_FILE_CONTENT}" | base64 -d > "${KUBELET_CONFIG_JSON_PATH}" - set -x - KUBELET_CONFIG_DROP_IN="/etc/systemd/system/kubelet.service.d/10-componentconfig.conf" - touch "${KUBELET_CONFIG_DROP_IN}" - chmod 0600 "${KUBELET_CONFIG_DROP_IN}" - tee "${KUBELET_CONFIG_DROP_IN}" > /dev/null < /etc/modules-load.d/br_netfilter.conf - configureCNIIPTables -} - -configureCNIIPTables() { - if [[ "${NETWORK_PLUGIN}" = "azure" ]]; then - mv $CNI_BIN_DIR/10-azure.conflist $CNI_CONFIG_DIR/ - chmod 600 $CNI_CONFIG_DIR/10-azure.conflist - if [[ "${NETWORK_POLICY}" == "calico" ]]; then - sed -i 's#"mode":"bridge"#"mode":"transparent"#g' $CNI_CONFIG_DIR/10-azure.conflist - elif [[ "${NETWORK_POLICY}" == "" || "${NETWORK_POLICY}" == "none" ]] && [[ "${NETWORK_MODE}" == "transparent" ]]; then - sed -i 's#"mode":"bridge"#"mode":"transparent"#g' $CNI_CONFIG_DIR/10-azure.conflist - fi - /sbin/ebtables -t nat --list - fi -} - -disableSystemdResolved() { - ls -ltr /etc/resolv.conf - cat /etc/resolv.conf - UBUNTU_RELEASE=$(lsb_release -r -s) - if [[ "${UBUNTU_RELEASE}" == "18.04" || "${UBUNTU_RELEASE}" == "20.04" || "${UBUNTU_RELEASE}" == "22.04" ]]; then - echo "Ingorings systemd-resolved query service but using its resolv.conf file" - echo "This is the simplest approach to workaround resolved issues without completely uninstall it" - [ -f /run/systemd/resolve/resolv.conf ] && sudo ln -sf /run/systemd/resolve/resolv.conf /etc/resolv.conf - ls -ltr /etc/resolv.conf - cat /etc/resolv.conf - fi -} - -ensureContainerd() { - if [ "${TELEPORT_ENABLED}" == "true" ]; then - ensureTeleportd - fi - mkdir -p "/etc/systemd/system/containerd.service.d" - tee "/etc/systemd/system/containerd.service.d/exec_start.conf" > /dev/null < /etc/containerd/config.toml || exit $ERR_FILE_WATCH_TIMEOUT - else - echo "Generating containerd config..." - echo "${CONTAINERD_CONFIG_CONTENT}" | base64 -d > /etc/containerd/config.toml || exit $ERR_FILE_WATCH_TIMEOUT - fi - - tee "/etc/sysctl.d/99-force-bridge-forward.conf" > /dev/null <= 1.29 kubelet no longer sets node internalIP when using external cloud provider - # https://github.com/kubernetes/kubernetes/pull/121028 - # This regresses node startup performance in Azure CNI Overlay and Podsubnet clusters, which require the node to be - # assigned an internal IP before configuring pod networking. - # To improve node startup performance, explicitly set `--node-ip` to the IP returned from IMDS so kubelet sets - # the internal IP when it registers the node. - # If this fails, skip setting --node-ip, which is safe because cloud-node-manager will assign it later anyway. - if semverCompare ${KUBERNETES_VERSION:-"0.0.0"} "1.29.0"; then - logs_to_events "AKS.CSE.ensureKubelet.setKubeletNodeIPFlag" setKubeletNodeIPFlag - fi - - echo "KUBELET_FLAGS=${KUBELET_FLAGS}" > "${KUBELET_DEFAULT_FILE}" - echo "KUBELET_REGISTER_SCHEDULABLE=true" >> "${KUBELET_DEFAULT_FILE}" - echo "NETWORK_POLICY=${NETWORK_POLICY}" >> "${KUBELET_DEFAULT_FILE}" - echo "KUBELET_IMAGE=${KUBELET_IMAGE}" >> "${KUBELET_DEFAULT_FILE}" - echo "KUBELET_NODE_LABELS=${KUBELET_NODE_LABELS}" >> "${KUBELET_DEFAULT_FILE}" - if [ -n "${AZURE_ENVIRONMENT_FILEPATH}" ]; then - echo "AZURE_ENVIRONMENT_FILEPATH=${AZURE_ENVIRONMENT_FILEPATH}" >> "${KUBELET_DEFAULT_FILE}" - fi - - KUBE_CA_FILE="/etc/kubernetes/certs/ca.crt" - mkdir -p "$(dirname "${KUBE_CA_FILE}")" - echo "${KUBE_CA_CRT}" | base64 -d > "${KUBE_CA_FILE}" - chmod 0600 "${KUBE_CA_FILE}" - - if [ "${ENABLE_TLS_BOOTSTRAPPING}" == "true" ]; then - KUBELET_TLS_DROP_IN="/etc/systemd/system/kubelet.service.d/10-tlsbootstrap.conf" - mkdir -p "$(dirname "${KUBELET_TLS_DROP_IN}")" - touch "${KUBELET_TLS_DROP_IN}" - chmod 0600 "${KUBELET_TLS_DROP_IN}" - tee "${KUBELET_TLS_DROP_IN}" > /dev/null < /dev/null < /dev/null < /dev/null < /dev/null < "${SYSCTL_CONFIG_FILE}" - retrycmd_if_failure 24 5 25 sysctl --system -} - -ensureK8sControlPlane() { - if $REBOOTREQUIRED || [ "$NO_OUTBOUND" = "true" ]; then - return - fi - retrycmd_if_failure 120 5 25 $KUBECTL 2>/dev/null cluster-info || exit $ERR_K8S_RUNNING_TIMEOUT -} - -createKubeManifestDir() { - KUBEMANIFESTDIR=/etc/kubernetes/manifests - mkdir -p $KUBEMANIFESTDIR -} - -writeKubeConfig() { - KUBECONFIGDIR=/home/$ADMINUSER/.kube - KUBECONFIGFILE=$KUBECONFIGDIR/config - mkdir -p $KUBECONFIGDIR - touch $KUBECONFIGFILE - chown $ADMINUSER:$ADMINUSER $KUBECONFIGDIR - chown $ADMINUSER:$ADMINUSER $KUBECONFIGFILE - chmod 700 $KUBECONFIGDIR - chmod 600 $KUBECONFIGFILE - set +x - echo " ---- -apiVersion: v1 -clusters: -- cluster: - certificate-authority-data: \"$CA_CERTIFICATE\" - server: $KUBECONFIG_SERVER - name: \"$MASTER_FQDN\" -contexts: -- context: - cluster: \"$MASTER_FQDN\" - user: \"$MASTER_FQDN-admin\" - name: \"$MASTER_FQDN\" -current-context: \"$MASTER_FQDN\" -kind: Config -users: -- name: \"$MASTER_FQDN-admin\" - user: - client-certificate-data: \"$KUBECONFIG_CERTIFICATE\" - client-key-data: \"$KUBECONFIG_KEY\" -" > $KUBECONFIGFILE - set -x -} - -configClusterAutoscalerAddon() { - CLUSTER_AUTOSCALER_ADDON_FILE=/etc/kubernetes/addons/cluster-autoscaler-deployment.yaml - sed -i "s||$(echo $SERVICE_PRINCIPAL_CLIENT_ID | base64)|g" $CLUSTER_AUTOSCALER_ADDON_FILE - sed -i "s||$(echo $SERVICE_PRINCIPAL_CLIENT_SECRET | base64)|g" $CLUSTER_AUTOSCALER_ADDON_FILE - sed -i "s||$(echo $SUBSCRIPTION_ID | base64)|g" $CLUSTER_AUTOSCALER_ADDON_FILE - sed -i "s||$(echo $TENANT_ID | base64)|g" $CLUSTER_AUTOSCALER_ADDON_FILE - sed -i "s||$(echo $RESOURCE_GROUP | base64)|g" $CLUSTER_AUTOSCALER_ADDON_FILE -} - -configACIConnectorAddon() { - ACI_CONNECTOR_CREDENTIALS=$(printf "{\"clientId\": \"%s\", \"clientSecret\": \"%s\", \"tenantId\": \"%s\", \"subscriptionId\": \"%s\", \"activeDirectoryEndpointUrl\": \"https://login.microsoftonline.com\",\"resourceManagerEndpointUrl\": \"https://management.azure.com/\", \"activeDirectoryGraphResourceId\": \"https://graph.windows.net/\", \"sqlManagementEndpointUrl\": \"https://management.core.windows.net:8443/\", \"galleryEndpointUrl\": \"https://gallery.azure.com/\", \"managementEndpointUrl\": \"https://management.core.windows.net/\"}" "$SERVICE_PRINCIPAL_CLIENT_ID" "$SERVICE_PRINCIPAL_CLIENT_SECRET" "$TENANT_ID" "$SUBSCRIPTION_ID" | base64 -w 0) - - openssl req -newkey rsa:4096 -new -nodes -x509 -days 3650 -keyout /etc/kubernetes/certs/aci-connector-key.pem -out /etc/kubernetes/certs/aci-connector-cert.pem -subj "/C=US/ST=CA/L=virtualkubelet/O=virtualkubelet/OU=virtualkubelet/CN=virtualkubelet" - ACI_CONNECTOR_KEY=$(base64 /etc/kubernetes/certs/aci-connector-key.pem -w0) - ACI_CONNECTOR_CERT=$(base64 /etc/kubernetes/certs/aci-connector-cert.pem -w0) - - ACI_CONNECTOR_ADDON_FILE=/etc/kubernetes/addons/aci-connector-deployment.yaml - sed -i "s||$ACI_CONNECTOR_CREDENTIALS|g" $ACI_CONNECTOR_ADDON_FILE - sed -i "s||$RESOURCE_GROUP|g" $ACI_CONNECTOR_ADDON_FILE - sed -i "s||$ACI_CONNECTOR_CERT|g" $ACI_CONNECTOR_ADDON_FILE - sed -i "s||$ACI_CONNECTOR_KEY|g" $ACI_CONNECTOR_ADDON_FILE -} - -configAzurePolicyAddon() { - AZURE_POLICY_ADDON_FILE=/etc/kubernetes/addons/azure-policy-deployment.yaml - sed -i "s||/subscriptions/$SUBSCRIPTION_ID/resourceGroups/$RESOURCE_GROUP|g" $AZURE_POLICY_ADDON_FILE -} - -configGPUDrivers() { - # install gpu driver - if [[ $OS == $UBUNTU_OS_NAME ]]; then - mkdir -p /opt/{actions,gpu} - if [[ "${CONTAINER_RUNTIME}" == "containerd" ]]; then - ctr image pull $NVIDIA_DRIVER_IMAGE:$NVIDIA_DRIVER_IMAGE_TAG - retrycmd_if_failure 5 10 600 bash -c "$CTR_GPU_INSTALL_CMD $NVIDIA_DRIVER_IMAGE:$NVIDIA_DRIVER_IMAGE_TAG gpuinstall /entrypoint.sh install" - ret=$? - if [[ "$ret" != "0" ]]; then - echo "Failed to install GPU driver, exiting..." - exit $ERR_GPU_DRIVERS_START_FAIL - fi - ctr images rm --sync $NVIDIA_DRIVER_IMAGE:$NVIDIA_DRIVER_IMAGE_TAG - else - bash -c "$DOCKER_GPU_INSTALL_CMD $NVIDIA_DRIVER_IMAGE:$NVIDIA_DRIVER_IMAGE_TAG install" - ret=$? - if [[ "$ret" != "0" ]]; then - echo "Failed to install GPU driver, exiting..." - exit $ERR_GPU_DRIVERS_START_FAIL - fi - docker rmi $NVIDIA_DRIVER_IMAGE:$NVIDIA_DRIVER_IMAGE_TAG - fi - elif isMarinerOrAzureLinux "$OS"; then - downloadGPUDrivers - installNvidiaContainerToolkit - enableNvidiaPersistenceMode - else - echo "os $OS not supported at this time. skipping configGPUDrivers" - exit 1 - fi - - retrycmd_if_failure 120 5 25 nvidia-modprobe -u -c0 || exit $ERR_GPU_DRIVERS_START_FAIL - retrycmd_if_failure 120 5 300 nvidia-smi || exit $ERR_GPU_DRIVERS_START_FAIL - retrycmd_if_failure 120 5 25 ldconfig || exit $ERR_GPU_DRIVERS_START_FAIL - - # Fix the NVIDIA /dev/char link issue - if isMarinerOrAzureLinux "$OS"; then - createNvidiaSymlinkToAllDeviceNodes - fi - - # reload containerd/dockerd - if [[ "${CONTAINER_RUNTIME}" == "containerd" ]]; then - retrycmd_if_failure 120 5 25 pkill -SIGHUP containerd || exit $ERR_GPU_DRIVERS_INSTALL_TIMEOUT - else - retrycmd_if_failure 120 5 25 pkill -SIGHUP dockerd || exit $ERR_GPU_DRIVERS_INSTALL_TIMEOUT - fi -} - -validateGPUDrivers() { - if [[ $(isARM64) == 1 ]]; then - # no GPU on ARM64 - return - fi - - retrycmd_if_failure 24 5 25 nvidia-modprobe -u -c0 && echo "gpu driver loaded" || configGPUDrivers || exit $ERR_GPU_DRIVERS_START_FAIL - which nvidia-smi - if [[ $? == 0 ]]; then - SMI_RESULT=$(retrycmd_if_failure 24 5 300 nvidia-smi) - else - SMI_RESULT=$(retrycmd_if_failure 24 5 300 $GPU_DEST/bin/nvidia-smi) - fi - SMI_STATUS=$? - if [[ $SMI_STATUS != 0 ]]; then - if [[ $SMI_RESULT == *"infoROM is corrupted"* ]]; then - exit $ERR_GPU_INFO_ROM_CORRUPTED - else - exit $ERR_GPU_DRIVERS_START_FAIL - fi - else - echo "gpu driver working fine" - fi -} - -ensureGPUDrivers() { - if [[ $(isARM64) == 1 ]]; then - # no GPU on ARM64 - return - fi - - if [[ "${CONFIG_GPU_DRIVER_IF_NEEDED}" = true ]]; then - logs_to_events "AKS.CSE.ensureGPUDrivers.configGPUDrivers" configGPUDrivers - else - logs_to_events "AKS.CSE.ensureGPUDrivers.validateGPUDrivers" validateGPUDrivers - fi - if [[ $OS == $UBUNTU_OS_NAME ]]; then - logs_to_events "AKS.CSE.ensureGPUDrivers.nvidia-modprobe" "systemctlEnableAndStart nvidia-modprobe" || exit $ERR_GPU_DRIVERS_START_FAIL - fi -} - -disableSSH() { - systemctlDisableAndStop ssh || exit $ERR_DISABLE_SSH -} - -setKubeletNodeIPFlag() { - imdsOutput=$(curl -s -H Metadata:true --noproxy "*" --max-time 5 "http://169.254.169.254/metadata/instance/network/interface?api-version=2021-02-01" 2> /dev/null) - if [[ $? -eq 0 ]]; then - nodeIPAddrs=() - ipv4Addr=$(echo $imdsOutput | jq -r '.[0].ipv4.ipAddress[0].privateIpAddress // ""') - [ -n "$ipv4Addr" ] && nodeIPAddrs+=("$ipv4Addr") - ipv6Addr=$(echo $imdsOutput | jq -r '.[0].ipv6.ipAddress[0].privateIpAddress // ""') - [ -n "$ipv6Addr" ] && nodeIPAddrs+=("$ipv6Addr") - nodeIPArg=$(IFS=, ; echo "${nodeIPAddrs[*]}") # join, comma-separated - if [ -n "$nodeIPArg" ]; then - echo "Adding --node-ip=$nodeIPArg to kubelet flags" - KUBELET_FLAGS="$KUBELET_FLAGS --node-ip=$nodeIPArg" - fi - fi -} - -#EOF diff --git a/self-contained/bootstrap_helpers.sh b/self-contained/bootstrap_helpers.sh deleted file mode 100755 index 3eab790c551..00000000000 --- a/self-contained/bootstrap_helpers.sh +++ /dev/null @@ -1,400 +0,0 @@ -#!/bin/bash -# ERR_SYSTEMCTL_ENABLE_FAIL=3 Service could not be enabled by systemctl -- DEPRECATED -ERR_SYSTEMCTL_START_FAIL=4 # Service could not be started or enabled by systemctl -ERR_CLOUD_INIT_TIMEOUT=5 # Timeout waiting for cloud-init runcmd to complete -ERR_FILE_WATCH_TIMEOUT=6 # Timeout waiting for a file -ERR_HOLD_WALINUXAGENT=7 # Unable to place walinuxagent apt package on hold during install -ERR_RELEASE_HOLD_WALINUXAGENT=8 # Unable to release hold on walinuxagent apt package after install -ERR_APT_INSTALL_TIMEOUT=9 # Timeout installing required apt packages -ERR_DOCKER_INSTALL_TIMEOUT=20 # Timeout waiting for docker install -ERR_DOCKER_DOWNLOAD_TIMEOUT=21 # Timout waiting for docker downloads -ERR_DOCKER_KEY_DOWNLOAD_TIMEOUT=22 # Timeout waiting to download docker repo key -ERR_DOCKER_APT_KEY_TIMEOUT=23 # Timeout waiting for docker apt-key -ERR_DOCKER_START_FAIL=24 # Docker could not be started by systemctl -ERR_MOBY_APT_LIST_TIMEOUT=25 # Timeout waiting for moby apt sources -ERR_MS_GPG_KEY_DOWNLOAD_TIMEOUT=26 # Timeout waiting for MS GPG key download -ERR_MOBY_INSTALL_TIMEOUT=27 # Timeout waiting for moby-docker install -ERR_CONTAINERD_INSTALL_TIMEOUT=28 # Timeout waiting for moby-containerd install -ERR_RUNC_INSTALL_TIMEOUT=29 # Timeout waiting for moby-runc install -ERR_K8S_RUNNING_TIMEOUT=30 # Timeout waiting for k8s cluster to be healthy -ERR_K8S_DOWNLOAD_TIMEOUT=31 # Timeout waiting for Kubernetes downloads -ERR_KUBECTL_NOT_FOUND=32 # kubectl client binary not found on local disk -ERR_IMG_DOWNLOAD_TIMEOUT=33 # Timeout waiting for img download -ERR_KUBELET_START_FAIL=34 # kubelet could not be started by systemctl -ERR_DOCKER_IMG_PULL_TIMEOUT=35 # Timeout trying to pull a Docker image -ERR_CONTAINERD_CTR_IMG_PULL_TIMEOUT=36 # Timeout trying to pull a containerd image via cli tool ctr -ERR_CONTAINERD_CRICTL_IMG_PULL_TIMEOUT=37 # Timeout trying to pull a containerd image via cli tool crictl -ERR_CONTAINERD_INSTALL_FILE_NOT_FOUND=38 # Unable to locate containerd debian pkg file -ERR_CNI_DOWNLOAD_TIMEOUT=41 # Timeout waiting for CNI downloads -ERR_MS_PROD_DEB_DOWNLOAD_TIMEOUT=42 # Timeout waiting for https://packages.microsoft.com/config/ubuntu/16.04/packages-microsoft-prod.deb -ERR_MS_PROD_DEB_PKG_ADD_FAIL=43 # Failed to add repo pkg file -# ERR_FLEXVOLUME_DOWNLOAD_TIMEOUT=44 Failed to add repo pkg file -- DEPRECATED -ERR_ORAS_DOWNLOAD_ERROR=45 # Unable to install oras -ERR_SYSTEMD_INSTALL_FAIL=48 # Unable to install required systemd version -ERR_MODPROBE_FAIL=49 # Unable to load a kernel module using modprobe -ERR_OUTBOUND_CONN_FAIL=50 # Unable to establish outbound connection -ERR_K8S_API_SERVER_CONN_FAIL=51 # Unable to establish connection to k8s api serve -ERR_K8S_API_SERVER_DNS_LOOKUP_FAIL=52 # Unable to resolve k8s api server name -ERR_K8S_API_SERVER_AZURE_DNS_LOOKUP_FAIL=53 # Unable to resolve k8s api server name due to Azure DNS issue -ERR_KATA_KEY_DOWNLOAD_TIMEOUT=60 # Timeout waiting to download kata repo key -ERR_KATA_APT_KEY_TIMEOUT=61 # Timeout waiting for kata apt-key -ERR_KATA_INSTALL_TIMEOUT=62 # Timeout waiting for kata install -ERR_VHD_FILE_NOT_FOUND=65 # VHD log file not found on VM built from VHD distro (previously classified as exit code 124) -ERR_CONTAINERD_DOWNLOAD_TIMEOUT=70 # Timeout waiting for containerd downloads -ERR_RUNC_DOWNLOAD_TIMEOUT=71 # Timeout waiting for runc downloads -ERR_CUSTOM_SEARCH_DOMAINS_FAIL=80 # Unable to configure custom search domains -ERR_GPU_DOWNLOAD_TIMEOUT=83 # Timeout waiting for GPU driver download -ERR_GPU_DRIVERS_START_FAIL=84 # nvidia-modprobe could not be started by systemctl -ERR_GPU_DRIVERS_INSTALL_TIMEOUT=85 # Timeout waiting for GPU drivers install -ERR_GPU_DEVICE_PLUGIN_START_FAIL=86 # nvidia device plugin could not be started by systemctl -ERR_GPU_INFO_ROM_CORRUPTED=87 # info ROM corrupted error when executing nvidia-smi -ERR_SGX_DRIVERS_INSTALL_TIMEOUT=90 # Timeout waiting for SGX prereqs to download -ERR_SGX_DRIVERS_START_FAIL=91 # Failed to execute SGX driver binary -ERR_APT_DAILY_TIMEOUT=98 # Timeout waiting for apt daily updates -ERR_APT_UPDATE_TIMEOUT=99 # Timeout waiting for apt-get update to complete -ERR_CSE_PROVISION_SCRIPT_NOT_READY_TIMEOUT=100 # Timeout waiting for cloud-init to place this script on the vm -ERR_APT_DIST_UPGRADE_TIMEOUT=101 # Timeout waiting for apt-get dist-upgrade to complete -ERR_APT_PURGE_FAIL=102 # Error purging distro packages -ERR_SYSCTL_RELOAD=103 # Error reloading sysctl config -ERR_CIS_ASSIGN_ROOT_PW=111 # Error assigning root password in CIS enforcement -ERR_CIS_ASSIGN_FILE_PERMISSION=112 # Error assigning permission to a file in CIS enforcement -ERR_PACKER_COPY_FILE=113 # Error writing a file to disk during VHD CI -ERR_CIS_APPLY_PASSWORD_CONFIG=115 # Error applying CIS-recommended passwd configuration -ERR_SYSTEMD_DOCKER_STOP_FAIL=116 # Error stopping dockerd -ERR_CRICTL_DOWNLOAD_TIMEOUT=117 # Timeout waiting for crictl downloads -ERR_CRICTL_OPERATION_ERROR=118 # Error executing a crictl operation -ERR_CTR_OPERATION_ERROR=119 # Error executing a ctr containerd cli operation - -# Azure Stack specific errors -ERR_AZURE_STACK_GET_ARM_TOKEN=120 # Error generating a token to use with Azure Resource Manager -ERR_AZURE_STACK_GET_NETWORK_CONFIGURATION=121 # Error fetching the network configuration for the node -ERR_AZURE_STACK_GET_SUBNET_PREFIX=122 # Error fetching the subnet address prefix for a subnet ID - -# Error code 124 is returned when a `timeout` command times out, and --preserve-status is not specified: https://man7.org/linux/man-pages/man1/timeout.1.html -ERR_VHD_BUILD_ERROR=125 # Reserved for VHD CI exit conditions - -ERR_SWAP_CREATE_FAIL=130 # Error allocating swap file -ERR_SWAP_CREATE_INSUFFICIENT_DISK_SPACE=131 # Error insufficient disk space for swap file creation - -ERR_TELEPORTD_DOWNLOAD_ERR=150 # Error downloading teleportd binary -ERR_TELEPORTD_INSTALL_ERR=151 # Error installing teleportd binary -ERR_ARTIFACT_STREAMING_DOWNLOAD=152 # Error downloading mirror proxy and overlaybd components -ERR_ARTIFACT_STREAMING_INSTALL=153 # Error installing mirror proxy and overlaybd components - -ERR_HTTP_PROXY_CA_CONVERT=160 # Error converting http proxy ca cert from pem to crt format -ERR_UPDATE_CA_CERTS=161 # Error updating ca certs to include user-provided certificates - -ERR_DISBALE_IPTABLES=170 # Error disabling iptables service - -ERR_KRUSTLET_DOWNLOAD_TIMEOUT=171 # Timeout waiting for krustlet downloads -ERR_DISABLE_SSH=172 # Error disabling ssh service - -ERR_VHD_REBOOT_REQUIRED=200 # Reserved for VHD reboot required exit condition -ERR_NO_PACKAGES_FOUND=201 # Reserved for no security packages found exit condition - -ERR_SYSTEMCTL_MASK_FAIL=2 # Service could not be masked by systemctl - -OS=$(sort -r /etc/*-release | gawk 'match($0, /^(ID_LIKE=(coreos)|ID=(.*))$/, a) { print toupper(a[2] a[3]); exit }') -OS_VERSION=$(sort -r /etc/*-release | gawk 'match($0, /^(VERSION_ID=(.*))$/, a) { print toupper(a[2] a[3]); exit }' | tr -d '"') -UBUNTU_OS_NAME="UBUNTU" -MARINER_OS_NAME="MARINER" -AZURELINUX_OS_NAME="AZURELINUX" -KUBECTL=/usr/local/bin/kubectl -DOCKER=/usr/bin/docker -# this will be empty during VHD build -# but vhd build runs with `set -o nounset` -# so needs a default value -# prefer empty string to avoid potential "it works but did something weird" scenarios -export GPU_DV="${GPU_DRIVER_VERSION:=}" -export GPU_DEST=/usr/local/nvidia -NVIDIA_DOCKER_VERSION=2.8.0-1 -DOCKER_VERSION=1.13.1-1 -NVIDIA_CONTAINER_RUNTIME_VERSION="3.6.0" -export NVIDIA_DRIVER_IMAGE_SHA="sha-e8873b" -export NVIDIA_DRIVER_IMAGE_TAG="${GPU_DV}-${NVIDIA_DRIVER_IMAGE_SHA}" -export NVIDIA_DRIVER_IMAGE="mcr.microsoft.com/aks/aks-gpu" -export CTR_GPU_INSTALL_CMD="ctr run --privileged --rm --net-host --with-ns pid:/proc/1/ns/pid --mount type=bind,src=/opt/gpu,dst=/mnt/gpu,options=rbind --mount type=bind,src=/opt/actions,dst=/mnt/actions,options=rbind" -export DOCKER_GPU_INSTALL_CMD="docker run --privileged --net=host --pid=host -v /opt/gpu:/mnt/gpu -v /opt/actions:/mnt/actions --rm" -APT_CACHE_DIR=/var/cache/apt/archives/ -PERMANENT_CACHE_DIR=/root/aptcache/ -EVENTS_LOGGING_DIR=/var/log/azure/Microsoft.Azure.Extensions.CustomScript/events/ -CURL_OUTPUT=/tmp/curl_verbose.out - -retrycmd_if_failure() { - retries=$1; wait_sleep=$2; timeout=$3; shift && shift && shift - for i in $(seq 1 $retries); do - timeout $timeout "${@}" && break || \ - if [ $i -eq $retries ]; then - echo Executed \"$@\" $i times; - return 1 - else - sleep $wait_sleep - fi - done - echo Executed \"$@\" $i times; -} -retrycmd_if_failure_no_stats() { - retries=$1; wait_sleep=$2; timeout=$3; shift && shift && shift - for i in $(seq 1 $retries); do - timeout $timeout ${@} && break || \ - if [ $i -eq $retries ]; then - return 1 - else - sleep $wait_sleep - fi - done -} -retrycmd_get_tarball() { - tar_retries=$1; wait_sleep=$2; tarball=$3; url=$4 - echo "${tar_retries} retries" - for i in $(seq 1 $tar_retries); do - tar -tzf $tarball && break || \ - if [ $i -eq $tar_retries ]; then - return 1 - else - timeout 60 curl -fsSLv $url -o $tarball > $CURL_OUTPUT 2>&1 - if [[ $? != 0 ]]; then - cat $CURL_OUTPUT - fi - sleep $wait_sleep - fi - done -} -retrycmd_get_tarball_from_registry_with_oras() { - tar_retries=$1; wait_sleep=$2; tarball=$3; url=$4 - tar_folder=$(dirname "$tarball") - echo "${tar_retries} retries" - for i in $(seq 1 $tar_retries); do - tar -tzf $tarball && break || \ - if [ $i -eq $tar_retries ]; then - return 1 - else - # TODO: support private acr via kubelet identity - timeout 60 oras pull $url -o $tar_folder --registry-config ${ORAS_REGISTRY_CONFIG_FILE} > $ORAS_OUTPUT 2>&1 - if [[ $? != 0 ]]; then - cat $ORAS_OUTPUT - fi - sleep $wait_sleep - fi - done -} -retrycmd_get_binary_from_registry_with_oras() { - binary_retries=$1; wait_sleep=$2; binary_path=$3; url=$4 - binary_folder=$(dirname "$binary_path") - echo "${binary_retries} retries" - - for i in $(seq 1 $binary_retries); do - if [ -f "$binary_path" ]; then - break - else - if [ $i -eq $binary_retries ]; then - return 1 - else - # TODO: support private acr via kubelet identity - timeout 60 oras pull $url -o $binary_folder --registry-config ${ORAS_REGISTRY_CONFIG_FILE} > $ORAS_OUTPUT 2>&1 - if [[ $? != 0 ]]; then - cat $ORAS_OUTPUT - fi - sleep $wait_sleep - fi - fi - done -} -retrycmd_curl_file() { - curl_retries=$1; wait_sleep=$2; timeout=$3; filepath=$4; url=$5 - echo "${curl_retries} retries" - for i in $(seq 1 $curl_retries); do - [[ -f $filepath ]] && break - if [ $i -eq $curl_retries ]; then - return 1 - else - timeout $timeout curl -fsSLv $url -o $filepath 2>&1 | tee $CURL_OUTPUT >/dev/null - if [[ $? != 0 ]]; then - cat $CURL_OUTPUT - fi - sleep $wait_sleep - fi - done -} -wait_for_file() { - retries=$1; wait_sleep=$2; filepath=$3 - paved=/opt/azure/cloud-init-files.paved - grep -Fq "${filepath}" $paved && return 0 - for i in $(seq 1 $retries); do - grep -Fq '#EOF' $filepath && break - if [ $i -eq $retries ]; then - return 1 - else - sleep $wait_sleep - fi - done - sed -i "/#EOF/d" $filepath - echo $filepath >> $paved -} -systemctl_restart() { - retries=$1; wait_sleep=$2; timeout=$3 svcname=$4 - for i in $(seq 1 $retries); do - timeout $timeout systemctl daemon-reload - timeout $timeout systemctl restart $svcname && break || \ - if [ $i -eq $retries ]; then - return 1 - else - systemctl status $svcname --no-pager -l - journalctl -u $svcname - sleep $wait_sleep - fi - done -} -systemctl_stop() { - retries=$1; wait_sleep=$2; timeout=$3 svcname=$4 - for i in $(seq 1 $retries); do - timeout $timeout systemctl daemon-reload - timeout $timeout systemctl stop $svcname && break || \ - if [ $i -eq $retries ]; then - return 1 - else - sleep $wait_sleep - fi - done -} -systemctl_disable() { - retries=$1; wait_sleep=$2; timeout=$3 svcname=$4 - for i in $(seq 1 $retries); do - timeout $timeout systemctl daemon-reload - timeout $timeout systemctl disable $svcname && break || \ - if [ $i -eq $retries ]; then - return 1 - else - sleep $wait_sleep - fi - done -} -sysctl_reload() { - retries=$1; wait_sleep=$2; timeout=$3 - for i in $(seq 1 $retries); do - timeout $timeout sysctl --system && break || \ - if [ $i -eq $retries ]; then - return 1 - else - sleep $wait_sleep - fi - done -} -version_gte() { - test "$(printf '%s\n' "$@" | sort -rV | head -n 1)" == "$1" -} - -systemctlEnableAndStart() { - systemctl_restart 100 5 30 $1 - RESTART_STATUS=$? - systemctl status $1 --no-pager -l > /var/log/azure/$1-status.log - if [ $RESTART_STATUS -ne 0 ]; then - echo "$1 could not be started" - return 1 - fi - if ! retrycmd_if_failure 120 5 25 systemctl enable $1; then - echo "$1 could not be enabled by systemctl" - return 1 - fi -} - -systemctlDisableAndStop() { - if systemctl list-units --full --all | grep -q "$1.service"; then - systemctl_stop 20 5 25 $1 || echo "$1 could not be stopped" - systemctl_disable 20 5 25 $1 || echo "$1 could not be disabled" - fi -} - -# return true if a >= b -semverCompare() { - VERSION_A=$(echo $1 | cut -d "+" -f 1) - VERSION_B=$(echo $2 | cut -d "+" -f 1) - [[ "${VERSION_A}" == "${VERSION_B}" ]] && return 0 - sorted=$(echo ${VERSION_A} ${VERSION_B} | tr ' ' '\n' | sort -V ) - highestVersion=$(IFS= echo "${sorted}" | cut -d$'\n' -f2) - [[ "${VERSION_A}" == ${highestVersion} ]] && return 0 - return 1 -} -downloadDebPkgToFile() { - PKG_NAME=$1 - PKG_VERSION=$2 - PKG_DIRECTORY=$3 - mkdir -p $PKG_DIRECTORY - # shellcheck disable=SC2164 - pushd ${PKG_DIRECTORY} - retrycmd_if_failure 10 5 600 apt-get download ${PKG_NAME}=${PKG_VERSION}* - # shellcheck disable=SC2164 - popd -} -apt_get_download() { - retries=$1; wait_sleep=$2; shift && shift; - local ret=0 - pushd $APT_CACHE_DIR || return 1 - for i in $(seq 1 $retries); do - dpkg --configure -a --force-confdef - wait_for_apt_locks - apt-get -o Dpkg::Options::=--force-confold download -y "${@}" && break - if [ $i -eq $retries ]; then ret=1; else sleep $wait_sleep; fi - done - popd || return 1 - return $ret -} -getCPUArch() { - arch=$(uname -m) - if [[ ${arch,,} == "aarch64" || ${arch,,} == "arm64" ]]; then - echo "arm64" - else - echo "amd64" - fi -} -isARM64() { - if [[ $(getCPUArch) == "arm64" ]]; then - echo 1 - else - echo 0 - fi -} - -logs_to_events() { - # local vars here allow for nested function tracking - # installContainerRuntime for example - local task=$1; shift - local eventsFileName=$(date +%s%3N) - - local startTime=$(date +"%F %T.%3N") - ${@} - ret=$? - local endTime=$(date +"%F %T.%3N") - - # arg names are defined by GA and all these are required to be correctly read by GA - # EventPid, EventTid are required to be int. No use case for them at this point. - json_string=$( jq -n \ - --arg Timestamp "${startTime}" \ - --arg OperationId "${endTime}" \ - --arg Version "1.23" \ - --arg TaskName "${task}" \ - --arg EventLevel "Informational" \ - --arg Message "Completed: ${@}" \ - --arg EventPid "0" \ - --arg EventTid "0" \ - '{Timestamp: $Timestamp, OperationId: $OperationId, Version: $Version, TaskName: $TaskName, EventLevel: $EventLevel, Message: $Message, EventPid: $EventPid, EventTid: $EventTid}' - ) - echo ${json_string} > ${EVENTS_LOGGING_DIR}${eventsFileName}.json - - # this allows an error from the command at ${@} to be returned and correct code assigned in cse_main - if [ "$ret" != "0" ]; then - return $ret - fi -} - -should_skip_nvidia_drivers() { - set -x - body=$(curl -fsSL -H "Metadata: true" --noproxy "*" "http://169.254.169.254/metadata/instance?api-version=2021-02-01") - ret=$? - if [ "$ret" != "0" ]; then - return $ret - fi - should_skip=$(echo "$body" | jq -e '.compute.tagsList | map(select(.name | test("SkipGpuDriverInstall"; "i")))[0].value // "false" | test("true"; "i")') - echo "$should_skip" # true or false -} -#HELPERSEOF diff --git a/self-contained/bootstrap_install.sh b/self-contained/bootstrap_install.sh deleted file mode 100755 index c4b4042f618..00000000000 --- a/self-contained/bootstrap_install.sh +++ /dev/null @@ -1,531 +0,0 @@ -#!/bin/bash - -CC_SERVICE_IN_TMP=/opt/azure/containers/cc-proxy.service.in -CC_SOCKET_IN_TMP=/opt/azure/containers/cc-proxy.socket.in -CNI_CONFIG_DIR="/etc/cni/net.d" -CNI_BIN_DIR="/opt/cni/bin" -CNI_DOWNLOADS_DIR="/opt/cni/downloads" -CRICTL_DOWNLOAD_DIR="/opt/crictl/downloads" -CRICTL_BIN_DIR="/usr/local/bin" -CONTAINERD_DOWNLOADS_DIR="/opt/containerd/downloads" -RUNC_DOWNLOADS_DIR="/opt/runc/downloads" -K8S_DOWNLOADS_DIR="/opt/kubernetes/downloads" -UBUNTU_RELEASE=$(lsb_release -r -s) -OS=$(sort -r /etc/*-release | gawk 'match($0, /^(ID_LIKE=(coreos)|ID=(.*))$/, a) { print toupper(a[2] a[3]); exit }') -TELEPORTD_PLUGIN_DOWNLOAD_DIR="/opt/teleportd/downloads" -TELEPORTD_PLUGIN_BIN_DIR="/usr/local/bin" -CONTAINERD_WASM_VERSIONS="v0.3.0 v0.5.1 v0.8.0" -SPIN_KUBE_VERSIONS="v0.15.1" -MANIFEST_FILEPATH="/opt/azure/manifest.json" -MAN_DB_AUTO_UPDATE_FLAG_FILEPATH="/var/lib/man-db/auto-update" -CURL_OUTPUT=/tmp/curl_verbose.out - -removeManDbAutoUpdateFlagFile() { - rm -f $MAN_DB_AUTO_UPDATE_FLAG_FILEPATH -} - -createManDbAutoUpdateFlagFile() { - touch $MAN_DB_AUTO_UPDATE_FLAG_FILEPATH -} - -cleanupContainerdDlFiles() { - rm -rf $CONTAINERD_DOWNLOADS_DIR -} - -installContainerRuntime() { - if [ "${NEEDS_CONTAINERD}" == "true" ]; then - echo "in installContainerRuntime - KUBERNETES_VERSION = ${KUBERNETES_VERSION}" - local containerd_version - if [ -f "$MANIFEST_FILEPATH" ]; then - containerd_version="$(jq -r .containerd.edge "$MANIFEST_FILEPATH")" - if [ "${UBUNTU_RELEASE}" == "18.04" ]; then - containerd_version="$(jq -r '.containerd.pinned."1804"' "$MANIFEST_FILEPATH")" - fi - else - echo "WARNING: containerd version not found in manifest, defaulting to hardcoded." - fi - - containerd_patch_version="$(echo "$containerd_version" | cut -d- -f1)" - containerd_revision="$(echo "$containerd_version" | cut -d- -f2)" - if [ -z "$containerd_patch_version" ] || [ "$containerd_patch_version" == "null" ] || [ "$containerd_revision" == "null" ]; then - echo "invalid container version: $containerd_version" - exit $ERR_CONTAINERD_INSTALL_TIMEOUT - fi - - logs_to_events "AKS.CSE.installContainerRuntime.installStandaloneContainerd" "installStandaloneContainerd ${containerd_patch_version} ${containerd_revision}" - echo "in installContainerRuntime - CONTAINERD_VERION = ${containerd_patch_version}" - else - installMoby - fi -} - -installNetworkPlugin() { - if [[ "${NETWORK_PLUGIN}" = "azure" ]]; then - installAzureCNI - fi - installCNI #reference plugins. Mostly for kubenet but loop back used by contaierd until containerd 2 - rm -rf $CNI_DOWNLOADS_DIR & -} - -wasmFilesExist() { - local containerd_wasm_filepath=${1} - local shim_version=${2} - local version_suffix=${3} - local shims_to_download=("${@:4}") # Capture all arguments starting from the fourth indx - - local binary_version="$(echo "${shim_version}" | tr . -)" - for shim in "${shims_to_download[@]}"; do - if [ ! -f "${containerd_wasm_filepath}/containerd-shim-${shim}-${binary_version}-${version_suffix}" ]; then - return 1 # file is missing - fi - done - echo "all wasm files exist for ${containerd_wasm_filepath}/containerd-shim-*-${binary_version}-${version_suffix}" - return 0 -} - -# Install, download, update wasm must all be run from the same function call -# in order to ensure WASMSHIMPIDS persists correctly since in bash a new -# function call from install-dependnecies will create a new shell process. -installContainerdWasmShims(){ - local download_location=${1} - PACKAGE_DOWNLOAD_URL=${2} - local package_versions=("${@:3}") # Capture all arguments starting from the third indx - - for version in "${package_versions[@]}"; do - local shims_to_download=("spin" "slight") - if [[ "$version" == "0.8.0" ]]; then - shims_to_download+=("wws") - fi - containerd_wasm_url=$(evalPackageDownloadURL ${PACKAGE_DOWNLOAD_URL}) - downloadContainerdWasmShims $download_location $containerd_wasm_url "v$version" "${shims_to_download[@]}" # adding v to version for simplicity - done - # wait for file downloads to complete before updating file permissions - wait ${WASMSHIMPIDS[@]} - for version in "${package_versions[@]}"; do - local shims_to_download=("spin" "slight") - if [[ "$version" == "0.8.0" ]]; then - shims_to_download+=("wws") - fi - updateContainerdWasmShimsPermissions $download_location "v$version" "${shims_to_download[@]}" - done -} - -downloadContainerdWasmShims() { - local containerd_wasm_filepath=${1} - local containerd_wasm_url=${2} - local shim_version=${3} - local shims_to_download=("${@:4}") # Capture all arguments starting from the fourth indx - - local binary_version="$(echo "${shim_version}" | tr . -)" # replaces . with - == 1.2.3 -> 1-2-3 - - if wasmFilesExist "$containerd_wasm_filepath" "$shim_version" "-v1" "${shims_to_download[@]}"; then - echo "containerd-wasm-shims already exists in $containerd_wasm_filepath, will not be downloading." - return - fi - - # Oras download for WASM for Network Isolated Clusters - BOOTSTRAP_PROFILE_CONTAINER_REGISTRY_SERVER="${BOOTSTRAP_PROFILE_CONTAINER_REGISTRY_SERVER:=}" - if [[ ! -z ${BOOTSTRAP_PROFILE_CONTAINER_REGISTRY_SERVER} ]]; then - local registry_url="${BOOTSTRAP_PROFILE_CONTAINER_REGISTRY_SERVER}/oss/binaries/deislabs/containerd-wasm-shims:${shim_version}-linux-${CPU_ARCH}" - local wasm_shims_tgz_tmp=$containerd_wasm_filepath/containerd-wasm-shims-linux-${CPU_ARCH}.tar.gz - - retrycmd_get_tarball_from_registry_with_oras 120 5 "${wasm_shims_tgz_tmp}" ${registry_url} || exit $ERR_ORAS_PULL_CONTAINERD_WASM - tar -zxf "$wasm_shims_tgz_tmp" -C $containerd_wasm_filepath - mv "$containerd_wasm_filepath/containerd-shim-*-${shim_version}-v1" "$containerd_wasm_filepath/containerd-shim-*-${binary_version}-v1" - rm -f "$wasm_shims_tgz_tmp" - return - fi - - for shim in "${shims_to_download[@]}"; do - retrycmd_if_failure 30 5 60 curl -fSLv -o "$containerd_wasm_filepath/containerd-shim-${shim}-${binary_version}-v1" "$containerd_wasm_url/containerd-shim-${shim}-v1" 2>&1 | tee $CURL_OUTPUT >/dev/null | grep -E "^(curl:.*)|([eE]rr.*)$" && (cat $CURL_OUTPUT && exit $ERR_KRUSTLET_DOWNLOAD_TIMEOUT) & - WASMSHIMPIDS+=($!) - done -} - -updateContainerdWasmShimsPermissions() { - local containerd_wasm_filepath=${1} - local shim_version=${2} - local shims_to_download=("${@:3}") # Capture all arguments starting from the third indx - - local binary_version="$(echo "${shim_version}" | tr . -)" - - for shim in "${shims_to_download[@]}"; do - chmod 755 "$containerd_wasm_filepath/containerd-shim-${shim}-${binary_version}-v1" - done -} - -installSpinKube(){ - local download_location=${1} - PACKAGE_DOWNLOAD_URL=${2} - local package_versions=("${@:3}") # Capture all arguments starting from the third indx - - for version in "${package_versions[@]}"; do - containerd_spinkube_url=$(evalPackageDownloadURL ${PACKAGE_DOWNLOAD_URL}) - downloadSpinKube $download_location $containerd_spinkube_url "v$version" # adding v to version for simplicity - done - wait ${SPINKUBEPIDS[@]} - for version in "${package_versions[@]}"; do - chmod 755 "$download_location/containerd-shim-spin-v2" - done -} - -downloadSpinKube(){ - local containerd_spinkube_filepath=${1} - local containerd_spinkube_url=${2} - local shim_version=${3} - local shims_to_download=("${@:4}") # Capture all arguments starting from the fourth indx - - if [ -f "$containerd_spinkube_filepath/containerd-shim-spin-v2" ]; then - echo "containerd-shim-spin-v2 already exists in $containerd_spinkube_filepath, will not be downloading." - return - fi - - BOOTSTRAP_PROFILE_CONTAINER_REGISTRY_SERVER="${BOOTSTRAP_PROFILE_CONTAINER_REGISTRY_SERVER:=}" - if [[ ! -z ${BOOTSTRAP_PROFILE_CONTAINER_REGISTRY_SERVER} ]]; then - local registry_url="${BOOTSTRAP_PROFILE_CONTAINER_REGISTRY_SERVER}/oss/binaries/spinkube/containerd-shim-spin:${shim_version}-linux-${CPU_ARCH}" - local wasm_shims_tgz_tmp="${containerd_spinkube_filepath}/containerd-shim-spin-v2" - retrycmd_get_binary_from_registry_with_oras 120 5 "${wasm_shims_tgz_tmp}" "${registry_url}" || exit $ERR_ORAS_PULL_CONTAINERD_WASM - rm -f "$wasm_shims_tgz_tmp" - return - fi - - retrycmd_if_failure 30 5 60 curl -fSLv -o "$containerd_spinkube_filepath/containerd-shim-spin-v2" "$containerd_spinkube_url/containerd-shim-spin-v2" 2>&1 | tee $CURL_OUTPUT >/dev/null | grep -E "^(curl:.*)|([eE]rr.*)$" && (cat $CURL_OUTPUT && exit $ERR_KRUSTLET_DOWNLOAD_TIMEOUT) & - SPINKUBEPIDS+=($!) -} - -downloadAzureCNI() { - mkdir -p $CNI_DOWNLOADS_DIR - CNI_TGZ_TMP=${VNET_CNI_PLUGINS_URL##*/} # Use bash builtin ## to remove all chars ("*") up to the final "/" - retrycmd_get_tarball 120 5 "$CNI_DOWNLOADS_DIR/${CNI_TGZ_TMP}" ${VNET_CNI_PLUGINS_URL} || exit $ERR_CNI_DOWNLOAD_TIMEOUT -} - -downloadCrictl() { - CRICTL_VERSION=$1 - CPU_ARCH=$(getCPUArch) #amd64 or arm64 - mkdir -p $CRICTL_DOWNLOAD_DIR - CRICTL_DOWNLOAD_URL="https://acs-mirror.azureedge.net/cri-tools/v${CRICTL_VERSION}/binaries/crictl-v${CRICTL_VERSION}-linux-${CPU_ARCH}.tar.gz" - CRICTL_TGZ_TEMP=${CRICTL_DOWNLOAD_URL##*/} - retrycmd_curl_file 10 5 60 "$CRICTL_DOWNLOAD_DIR/${CRICTL_TGZ_TEMP}" ${CRICTL_DOWNLOAD_URL} -} - -installCrictl() { - CPU_ARCH=$(getCPUArch) #amd64 or arm64 - currentVersion=$(crictl --version 2>/dev/null | sed 's/crictl version //g') - if [[ "${currentVersion}" != "" ]]; then - echo "version ${currentVersion} of crictl already installed. skipping installCrictl of target version ${KUBERNETES_VERSION%.*}.0" - else - # this is only called during cse. VHDs should have crictl binaries pre-cached so no need to download. - # if the vhd does not have crictl pre-baked, return early - CRICTL_TGZ_TEMP="crictl-v${CRICTL_VERSION}-linux-${CPU_ARCH}.tar.gz" - if [[ ! -f "$CRICTL_DOWNLOAD_DIR/${CRICTL_TGZ_TEMP}" ]]; then - rm -rf ${CRICTL_DOWNLOAD_DIR} - echo "pre-cached crictl not found: skipping installCrictl" - return 1 - fi - echo "Unpacking crictl into ${CRICTL_BIN_DIR}" - tar zxvf "$CRICTL_DOWNLOAD_DIR/${CRICTL_TGZ_TEMP}" -C ${CRICTL_BIN_DIR} - chown root:root $CRICTL_BIN_DIR/crictl - chmod 755 $CRICTL_BIN_DIR/crictl - fi -} - -downloadTeleportdPlugin() { - DOWNLOAD_URL=$1 - TELEPORTD_VERSION=$2 - if [[ $(isARM64) == 1 ]]; then - # no arm64 teleport binaries according to owner - return - fi - - if [[ -z ${DOWNLOAD_URL} ]]; then - echo "download url parameter for downloadTeleportdPlugin was not given" - exit $ERR_TELEPORTD_DOWNLOAD_ERR - fi - if [[ -z ${TELEPORTD_VERSION} ]]; then - echo "teleportd version not given" - exit $ERR_TELEPORTD_DOWNLOAD_ERR - fi - mkdir -p $TELEPORTD_PLUGIN_DOWNLOAD_DIR - retrycmd_curl_file 10 5 60 "${TELEPORTD_PLUGIN_DOWNLOAD_DIR}/teleportd-v${TELEPORTD_VERSION}" "${DOWNLOAD_URL}/v${TELEPORTD_VERSION}/teleportd" || exit ${ERR_TELEPORTD_DOWNLOAD_ERR} -} - -installTeleportdPlugin() { - if [[ $(isARM64) == 1 ]]; then - # no arm64 teleport binaries according to owner - return - fi - - CURRENT_VERSION=$(teleportd --version 2>/dev/null | sed 's/teleportd version v//g') - local TARGET_VERSION="0.8.0" - if semverCompare ${CURRENT_VERSION:-"0.0.0"} ${TARGET_VERSION}; then - echo "currently installed teleportd version ${CURRENT_VERSION} is greater than (or equal to) target base version ${TARGET_VERSION}. skipping installTeleportdPlugin." - else - downloadTeleportdPlugin ${TELEPORTD_PLUGIN_DOWNLOAD_URL} ${TARGET_VERSION} - mv "${TELEPORTD_PLUGIN_DOWNLOAD_DIR}/teleportd-v${TELEPORTD_VERSION}" "${TELEPORTD_PLUGIN_BIN_DIR}/teleportd" || exit ${ERR_TELEPORTD_INSTALL_ERR} - chmod 755 "${TELEPORTD_PLUGIN_BIN_DIR}/teleportd" || exit ${ERR_TELEPORTD_INSTALL_ERR} - fi - rm -rf ${TELEPORTD_PLUGIN_DOWNLOAD_DIR} -} - -setupCNIDirs() { - mkdir -p $CNI_BIN_DIR - chown -R root:root $CNI_BIN_DIR - chmod -R 755 $CNI_BIN_DIR - - mkdir -p $CNI_CONFIG_DIR - chown -R root:root $CNI_CONFIG_DIR - chmod 755 $CNI_CONFIG_DIR -} - -# Reference CNI plugins is used by kubenet and the loopback plugin used by containerd 1.0 (dependency gone in 2.0) -# The version used to be deteremined by RP/toggle but are now just hadcoded in vhd as they rarely change and require a node image upgrade anyways -# Latest VHD should have the untar, older should have the tgz. And who knows will have neither. -installCNI() { - #always just use what is listed in components.json so we don't have to sync. - cniPackage=$(jq ".Packages" "$COMPONENTS_FILEPATH" | jq ".[] | select(.name == \"cni-plugins\")") || exit $ERR_CNI_VERSION_INVALID - - #CNI doesn't really care about this but wanted to reuse updatePackageVersions which requires it. - os=${UBUNTU_OS_NAME} - if [[ -z "$UBUNTU_RELEASE" ]]; then - os=${OS} - os_version="current" - fi - os_version="${UBUNTU_RELEASE}" - PACKAGE_VERSIONS=() - updatePackageVersions "${cniPackage}" "${os}" "${os_version}" - - #should change to ne - if [[ ${#PACKAGE_VERSIONS[@]} -gt 1 ]]; then - echo "WARNING: containerd package versions array has more than one element. Installing the last element in the array." - exit $ERR_CONTAINERD_VERSION_INVALID - fi - packageVersion=${PACKAGE_VERSIONS[0]} - - # Is there a ${arch} variable I can use instead of the iff - if [[ $(isARM64) == 1 ]]; then - CNI_DIR_TMP="cni-plugins-linux-arm64-v${packageVersion}" - else - CNI_DIR_TMP="cni-plugins-linux-amd64-v${packageVersion}" - fi - - if [[ -d "$CNI_DOWNLOADS_DIR/${CNI_DIR_TMP}" ]]; then - #not clear to me when this would ever happen. assume its related to the line above Latest VHD should have the untar, older should have the tgz. - mv ${CNI_DOWNLOADS_DIR}/${CNI_DIR_TMP}/* $CNI_BIN_DIR - else - echo "CNI tarball should already be unzipped by components.json" - exit $ERR_CNI_VERSION_INVALID - fi - - chown -R root:root $CNI_BIN_DIR -} - -installAzureCNI() { - CNI_TGZ_TMP=${VNET_CNI_PLUGINS_URL##*/} # Use bash builtin ## to remove all chars ("*") up to the final "/" - CNI_DIR_TMP=${CNI_TGZ_TMP%.tgz} # Use bash builtin % to remove the .tgz to look for a folder rather than tgz - - # We want to use the untar azurecni reference first. And if that doesn't exist on the vhd does the tgz? - # And if tgz is already on the vhd then just untar into CNI_BIN_DIR - # Latest VHD should have the untar, older should have the tgz. And who knows will have neither. - if [[ -d "$CNI_DOWNLOADS_DIR/${CNI_DIR_TMP}" ]]; then - mv ${CNI_DOWNLOADS_DIR}/${CNI_DIR_TMP}/* $CNI_BIN_DIR - else - if [[ ! -f "$CNI_DOWNLOADS_DIR/${CNI_TGZ_TMP}" ]]; then - logs_to_events "AKS.CSE.installAzureCNI.downloadAzureCNI" downloadAzureCNI - fi - - tar -xzf "$CNI_DOWNLOADS_DIR/${CNI_TGZ_TMP}" -C $CNI_BIN_DIR - fi - - chown -R root:root $CNI_BIN_DIR -} - -extractKubeBinaries() { - K8S_VERSION=$1 - KUBE_BINARY_URL=$2 - - mkdir -p ${K8S_DOWNLOADS_DIR} - K8S_TGZ_TMP=${KUBE_BINARY_URL##*/} - retrycmd_get_tarball 120 5 "$K8S_DOWNLOADS_DIR/${K8S_TGZ_TMP}" ${KUBE_BINARY_URL} || exit $ERR_K8S_DOWNLOAD_TIMEOUT - tar --transform="s|.*|&-${K8S_VERSION}|" --show-transformed-names -xzvf "$K8S_DOWNLOADS_DIR/${K8S_TGZ_TMP}" \ - --strip-components=3 -C /usr/local/bin kubernetes/node/bin/kubelet kubernetes/node/bin/kubectl - rm -f "$K8S_DOWNLOADS_DIR/${K8S_TGZ_TMP}" -} - -installKubeletKubectlAndKubeProxy() { - - CUSTOM_KUBE_BINARY_DOWNLOAD_URL="${CUSTOM_KUBE_BINARY_URL:=}" - if [[ ! -z ${CUSTOM_KUBE_BINARY_DOWNLOAD_URL} ]]; then - # remove the kubelet binaries to make sure the only binary left is from the CUSTOM_KUBE_BINARY_DOWNLOAD_URL - rm -rf /usr/local/bin/kubelet-* /usr/local/bin/kubectl-* - - # NOTE(mainred): we expect kubelet binary to be under `kubernetes/node/bin`. This suits the current setting of - # kube binaries used by AKS and Kubernetes upstream. - # TODO(mainred): let's see if necessary to auto-detect the path of kubelet - logs_to_events "AKS.CSE.installKubeletKubectlAndKubeProxy.extractKubeBinaries" extractKubeBinaries ${KUBERNETES_VERSION} ${CUSTOM_KUBE_BINARY_DOWNLOAD_URL} - - else - if [[ ! -f "/usr/local/bin/kubectl-${KUBERNETES_VERSION}" ]]; then - #TODO: remove the condition check on KUBE_BINARY_URL once RP change is released - if (($(echo ${KUBERNETES_VERSION} | cut -d"." -f2) >= 17)) && [ -n "${KUBE_BINARY_URL}" ]; then - logs_to_events "AKS.CSE.installKubeletKubectlAndKubeProxy.extractKubeBinaries" extractKubeBinaries ${KUBERNETES_VERSION} ${KUBE_BINARY_URL} - fi - fi - fi - mv "/usr/local/bin/kubelet-${KUBERNETES_VERSION}" "/usr/local/bin/kubelet" - mv "/usr/local/bin/kubectl-${KUBERNETES_VERSION}" "/usr/local/bin/kubectl" - - chmod a+x /usr/local/bin/kubelet /usr/local/bin/kubectl - rm -rf /usr/local/bin/kubelet-* /usr/local/bin/kubectl-* /home/hyperkube-downloads & -} - -pullContainerImage() { - CLI_TOOL=$1 - CONTAINER_IMAGE_URL=$2 - echo "pulling the image ${CONTAINER_IMAGE_URL} using ${CLI_TOOL}" - if [[ ${CLI_TOOL} == "ctr" ]]; then - logs_to_events "AKS.CSE.imagepullctr.${CONTAINER_IMAGE_URL}" "retrycmd_if_failure 2 1 120 ctr --namespace k8s.io image pull $CONTAINER_IMAGE_URL" || (echo "timed out pulling image ${CONTAINER_IMAGE_URL} via ctr" && exit $ERR_CONTAINERD_CTR_IMG_PULL_TIMEOUT) - elif [[ ${CLI_TOOL} == "crictl" ]]; then - logs_to_events "AKS.CSE.imagepullcrictl.${CONTAINER_IMAGE_URL}" "retrycmd_if_failure 2 1 120 crictl pull $CONTAINER_IMAGE_URL" || (echo "timed out pulling image ${CONTAINER_IMAGE_URL} via crictl" && exit $ERR_CONTAINERD_CRICTL_IMG_PULL_TIMEOUT) - else - logs_to_events "AKS.CSE.imagepull.${CONTAINER_IMAGE_URL}" "retrycmd_if_failure 2 1 120 docker pull $CONTAINER_IMAGE_URL" || (echo "timed out pulling image ${CONTAINER_IMAGE_URL} via docker" && exit $ERR_DOCKER_IMG_PULL_TIMEOUT) - fi -} - -retagContainerImage() { - CLI_TOOL=$1 - CONTAINER_IMAGE_URL=$2 - RETAG_IMAGE_URL=$3 - echo "retaging from ${CONTAINER_IMAGE_URL} to ${RETAG_IMAGE_URL} using ${CLI_TOOL}" - if [[ ${CLI_TOOL} == "ctr" ]]; then - ctr --namespace k8s.io image tag $CONTAINER_IMAGE_URL $RETAG_IMAGE_URL - elif [[ ${CLI_TOOL} == "crictl" ]]; then - crictl image tag $CONTAINER_IMAGE_URL $RETAG_IMAGE_URL - else - docker image tag $CONTAINER_IMAGE_URL $RETAG_IMAGE_URL - fi -} - -retagMCRImagesForChina() { - # retag all the mcr for mooncake - if [[ "${CONTAINER_RUNTIME}" == "containerd" ]]; then - # shellcheck disable=SC2016 - allMCRImages=($(ctr --namespace k8s.io images list | grep '^mcr.microsoft.com/' | awk '{print $1}')) - else - # shellcheck disable=SC2016 - allMCRImages=($(docker images | grep '^mcr.microsoft.com/' | awk '{str = sprintf("%s:%s", $1, $2)} {print str}')) - fi - if [[ "${allMCRImages}" == "" ]]; then - echo "failed to find mcr images for retag" - return - fi - for mcrImage in ${allMCRImages[@]+"${allMCRImages[@]}"}; do - # in mooncake, the mcr endpoint is: mcr.azk8s.cn - # shellcheck disable=SC2001 - retagMCRImage=$(echo ${mcrImage} | sed -e 's/^mcr.microsoft.com/mcr.azk8s.cn/g') - # can't use CLI_TOOL because crictl doesn't support retagging. - if [[ "${CONTAINER_RUNTIME}" == "containerd" ]]; then - retagContainerImage "ctr" ${mcrImage} ${retagMCRImage} - else - retagContainerImage "docker" ${mcrImage} ${retagMCRImage} - fi - done -} - -removeContainerImage() { - CLI_TOOL=$1 - CONTAINER_IMAGE_URL=$2 - if [[ "${CLI_TOOL}" == "docker" ]]; then - docker image rm $CONTAINER_IMAGE_URL - else - # crictl should always be present - crictl rmi $CONTAINER_IMAGE_URL - fi -} - -cleanUpImages() { - local targetImage=$1 - export targetImage - function cleanupImagesRun() { - if [ "${NEEDS_CONTAINERD}" == "true" ]; then - if [[ "${CLI_TOOL}" == "crictl" ]]; then - images_to_delete=$(crictl images | awk '{print $1":"$2}' | grep -vE "${KUBERNETES_VERSION}$|${KUBERNETES_VERSION}.[0-9]+$|${KUBERNETES_VERSION}-|${KUBERNETES_VERSION}_" | grep ${targetImage} | tr ' ' '\n') - else - images_to_delete=$(ctr --namespace k8s.io images list | awk '{print $1}' | grep -vE "${KUBERNETES_VERSION}$|${KUBERNETES_VERSION}.[0-9]+$|${KUBERNETES_VERSION}-|${KUBERNETES_VERSION}_" | grep ${targetImage} | tr ' ' '\n') - fi - else - images_to_delete=$(docker images --format '{{OpenBraces}}.Repository{{CloseBraces}}:{{OpenBraces}}.Tag{{CloseBraces}}' | grep -vE "${KUBERNETES_VERSION}$|${KUBERNETES_VERSION}.[0-9]+$|${KUBERNETES_VERSION}-|${KUBERNETES_VERSION}_" | grep ${targetImage} | tr ' ' '\n') - fi - local exit_code=$? - if [[ $exit_code != 0 ]]; then - exit $exit_code - elif [[ "${images_to_delete}" != "" ]]; then - echo "${images_to_delete}" | while read image; do - if [ "${NEEDS_CONTAINERD}" == "true" ]; then - removeContainerImage ${CLI_TOOL} ${image} - else - removeContainerImage "docker" ${image} - fi - done - fi - } - export -f cleanupImagesRun - retrycmd_if_failure 10 5 120 bash -c cleanupImagesRun -} - -cleanUpKubeProxyImages() { - echo $(date),$(hostname), startCleanUpKubeProxyImages - cleanUpImages "kube-proxy" - echo $(date),$(hostname), endCleanUpKubeProxyImages -} - -cleanupRetaggedImages() { - if [[ "${TARGET_CLOUD}" != "AzureChinaCloud" ]]; then - if [ "${NEEDS_CONTAINERD}" == "true" ]; then - if [[ "${CLI_TOOL}" == "crictl" ]]; then - images_to_delete=$(crictl images | awk '{print $1":"$2}' | grep '^mcr.azk8s.cn/' | tr ' ' '\n') - else - images_to_delete=$(ctr --namespace k8s.io images list | awk '{print $1}' | grep '^mcr.azk8s.cn/' | tr ' ' '\n') - fi - else - images_to_delete=$(docker images --format '{{OpenBraces}}.Repository{{CloseBraces}}:{{OpenBraces}}.Tag{{CloseBraces}}' | grep '^mcr.azk8s.cn/' | tr ' ' '\n') - fi - if [[ "${images_to_delete}" != "" ]]; then - echo "${images_to_delete}" | while read image; do - if [ "${NEEDS_CONTAINERD}" == "true" ]; then - # always use ctr, even if crictl is installed. - # crictl will remove *ALL* references to a given imageID (SHA), which removes too much. - removeContainerImage "ctr" ${image} - else - removeContainerImage "docker" ${image} - fi - done - fi - else - echo "skipping container cleanup for AzureChinaCloud" - fi -} - -cleanUpContainerImages() { - export KUBERNETES_VERSION - export CLI_TOOL - export -f retrycmd_if_failure - export -f removeContainerImage - export -f cleanUpImages - export -f cleanUpKubeProxyImages - bash -c cleanUpKubeProxyImages & -} - -cleanUpContainerd() { - rm -Rf $CONTAINERD_DOWNLOADS_DIR -} - -overrideNetworkConfig() { - CONFIG_FILEPATH="/etc/cloud/cloud.cfg.d/80_azure_net_config.cfg" - touch ${CONFIG_FILEPATH} - cat <>${CONFIG_FILEPATH} -datasource: - Azure: - apply_network_config: false -EOF -} -#EOF diff --git a/self-contained/bootstrap_main.sh b/self-contained/bootstrap_main.sh deleted file mode 100755 index 13425b27a87..00000000000 --- a/self-contained/bootstrap_main.sh +++ /dev/null @@ -1,424 +0,0 @@ -#!/bin/bash -# Timeout waiting for a file -ERR_FILE_WATCH_TIMEOUT=6 -set -x -if [ -f /opt/azure/containers/provision.complete ]; then - echo "Already ran to success exiting..." - exit 0 -fi - -aptmarkWALinuxAgent hold & - -# Setup logs for upload to host -LOG_DIR=/var/log/azure/aks -mkdir -p ${LOG_DIR} -ln -s /var/log/azure/cluster-provision.log \ - /var/log/azure/cluster-provision-cse-output.log \ - /opt/azure/*.json \ - /opt/azure/cloud-init-files.paved \ - /opt/azure/vhd-install.complete \ - ${LOG_DIR}/ - -# Redact the necessary secrets from cloud-config.txt so we don't expose any sensitive information -# when cloud-config.txt gets included within log bundles -python3 /opt/azure/containers/provision_redact_cloud_config.py \ - --cloud-config-path /var/lib/cloud/instance/cloud-config.txt \ - --output-path ${LOG_DIR}/cloud-config.txt - -UBUNTU_RELEASE=$(lsb_release -r -s) -if [[ ${UBUNTU_RELEASE} == "16.04" ]]; then - sudo apt-get -y autoremove chrony - echo $? - sudo systemctl restart systemd-timesyncd -fi - -echo $(date),$(hostname), startcustomscript>>/opt/m - -for i in $(seq 1 3600); do - if [ -s "${CSE_HELPERS_FILEPATH}" ]; then - grep -Fq '#HELPERSEOF' "${CSE_HELPERS_FILEPATH}" && break - fi - if [ $i -eq 3600 ]; then - exit $ERR_FILE_WATCH_TIMEOUT - else - sleep 1 - fi -done -sed -i "/#HELPERSEOF/d" "${CSE_HELPERS_FILEPATH}" -source "${CSE_HELPERS_FILEPATH}" - -source "${CSE_DISTRO_HELPERS_FILEPATH}" -source "${CSE_INSTALL_FILEPATH}" -source "${CSE_DISTRO_INSTALL_FILEPATH}" -source "${CSE_CONFIG_FILEPATH}" - -if [[ "${DISABLE_SSH}" == "true" ]]; then - disableSSH || exit $ERR_DISABLE_SSH -fi - -# This involes using proxy, log the config before fetching packages -echo "private egress proxy address is '${PRIVATE_EGRESS_PROXY_ADDRESS}'" -# TODO update to use proxy - -if [[ "${SHOULD_CONFIGURE_HTTP_PROXY}" == "true" ]]; then - if [[ "${SHOULD_CONFIGURE_HTTP_PROXY_CA}" == "true" ]]; then - configureHTTPProxyCA || exit $ERR_UPDATE_CA_CERTS - fi - configureEtcEnvironment -fi - - -if [[ "${SHOULD_CONFIGURE_CUSTOM_CA_TRUST}" == "true" ]]; then - configureCustomCaCertificate || exit $ERR_UPDATE_CA_CERTS -fi - -if [[ -n "${OUTBOUND_COMMAND}" ]]; then - if [[ -n "${PROXY_VARS}" ]]; then - eval $PROXY_VARS - fi - retrycmd_if_failure 50 1 5 $OUTBOUND_COMMAND >> /var/log/azure/cluster-provision-cse-output.log 2>&1 || exit $ERR_OUTBOUND_CONN_FAIL; -fi - -# Bring in OS-related vars -source /etc/os-release - -# Mandb is not currently available on MarinerV1 -if [[ ${ID} != "mariner" ]] && [[ ${ID} != "azurelinux" ]]; then - echo "Removing man-db auto-update flag file..." - logs_to_events "AKS.CSE.removeManDbAutoUpdateFlagFile" removeManDbAutoUpdateFlagFile -fi - -export -f should_skip_nvidia_drivers -skip_nvidia_driver_install=$(retrycmd_if_failure_no_stats 10 1 10 bash -cx should_skip_nvidia_drivers) -ret=$? -if [[ "$ret" != "0" ]]; then - echo "Failed to determine if nvidia driver install should be skipped" - exit $ERR_NVIDIA_DRIVER_INSTALL -fi - -if [[ "${GPU_NODE}" != "true" ]] || [[ "${skip_nvidia_driver_install}" == "true" ]]; then - logs_to_events "AKS.CSE.cleanUpGPUDrivers" cleanUpGPUDrivers -fi - -logs_to_events "AKS.CSE.disableSystemdResolved" disableSystemdResolved - -logs_to_events "AKS.CSE.configureAdminUser" configureAdminUser - -VHD_LOGS_FILEPATH=/opt/azure/vhd-install.complete -if [ -f $VHD_LOGS_FILEPATH ]; then - echo "detected golden image pre-install" - logs_to_events "AKS.CSE.cleanUpContainerImages" cleanUpContainerImages - FULL_INSTALL_REQUIRED=false -else - if [[ "${IS_VHD}" = true ]]; then - echo "Using VHD distro but file $VHD_LOGS_FILEPATH not found" - exit $ERR_VHD_FILE_NOT_FOUND - fi - FULL_INSTALL_REQUIRED=true -fi - -if [[ $OS == $UBUNTU_OS_NAME ]] && [ "$FULL_INSTALL_REQUIRED" = "true" ]; then - logs_to_events "AKS.CSE.installDeps" installDeps -else - echo "Golden image; skipping dependencies installation" -fi - -logs_to_events "AKS.CSE.installContainerRuntime" installContainerRuntime -if [ "${NEEDS_CONTAINERD}" == "true" ] && [ "${TELEPORT_ENABLED}" == "true" ]; then - logs_to_events "AKS.CSE.installTeleportdPlugin" installTeleportdPlugin -fi - -setupCNIDirs - -logs_to_events "AKS.CSE.installNetworkPlugin" installNetworkPlugin - -if [ "${IS_KRUSTLET}" == "true" ]; then - local versionsWasm=$(jq -r '.Packages[] | select(.name == "containerd-wasm-shims") | .downloadURIs.default.current.versionsV2[].latestVersion' "$COMPONENTS_FILEPATH") - local downloadLocationWasm=$(jq -r '.Packages[] | select(.name == "containerd-wasm-shims") | .downloadLocation' "$COMPONENTS_FILEPATH") - local downloadURLWasm=$(jq -r '.Packages[] | select(.name == "containerd-wasm-shims") | .downloadURIs.default.current.downloadURL' "$COMPONENTS_FILEPATH") - logs_to_events "AKS.CSE.installContainerdWasmShims" installContainerdWasmShims "$downloadLocationWasm" "$downloadURLWasm" "$versionsWasm" - - local versionsSpinKube=$(jq -r '.Packages[] | select(.name == spinkube") | .downloadURIs.default.current.versionsV2[].latestVersion' "$COMPONENTS_FILEPATH") - local downloadLocationSpinKube=$(jq -r '.Packages[] | select(.name == "spinkube) | .downloadLocation' "$COMPONENTS_FILEPATH") - local downloadURLSpinKube=$(jq -r '.Packages[] | select(.name == "spinkube") | .downloadURIs.default.current.downloadURL' "$COMPONENTS_FILEPATH") - logs_to_events "AKS.CSE.installSpinKube" installSpinKube "$downloadURSpinKube" "$downloadLocationSpinKube" "$versionsSpinKube" -fi - -# By default, never reboot new nodes. -REBOOTREQUIRED=false - -echo $(date),$(hostname), "Start configuring GPU drivers" -if [[ "${GPU_NODE}" = true ]] && [[ "${skip_nvidia_driver_install}" != "true" ]]; then - logs_to_events "AKS.CSE.ensureGPUDrivers" ensureGPUDrivers - if [[ "${ENABLE_GPU_DEVICE_PLUGIN_IF_NEEDED}" = true ]]; then - if [[ "${MIG_NODE}" == "true" ]] && [[ -f "/etc/systemd/system/nvidia-device-plugin.service" ]]; then - mkdir -p "/etc/systemd/system/nvidia-device-plugin.service.d" - tee "/etc/systemd/system/nvidia-device-plugin.service.d/10-mig_strategy.conf" > /dev/null <<'EOF' -[Service] -Environment="MIG_STRATEGY=--mig-strategy single" -ExecStart= -ExecStart=/usr/local/nvidia/bin/nvidia-device-plugin $MIG_STRATEGY -EOF - fi - logs_to_events "AKS.CSE.start.nvidia-device-plugin" "systemctlEnableAndStart nvidia-device-plugin" || exit $ERR_GPU_DEVICE_PLUGIN_START_FAIL - else - logs_to_events "AKS.CSE.stop.nvidia-device-plugin" "systemctlDisableAndStop nvidia-device-plugin" - fi - - if [[ "${GPU_NEEDS_FABRIC_MANAGER}" == "true" ]]; then - # fabric manager trains nvlink connections between multi instance gpus. - # it appears this is only necessary for systems with *multiple cards*. - # i.e., an A100 can be partitioned a maximum of 7 ways. - # An NC24ads_A100_v4 has one A100. - # An ND96asr_v4 has eight A100, for a maximum of 56 partitions. - # ND96 seems to require fabric manager *even when not using mig partitions* - # while it fails to install on NC24. - if isMarinerOrAzureLinux "$OS"; then - logs_to_events "AKS.CSE.installNvidiaFabricManager" installNvidiaFabricManager - fi - logs_to_events "AKS.CSE.nvidia-fabricmanager" "systemctlEnableAndStart nvidia-fabricmanager" || exit $ERR_GPU_DRIVERS_START_FAIL - fi - - # This will only be true for multi-instance capable VM sizes - # for which the user has specified a partitioning profile. - # it is valid to use mig-capable gpus without a partitioning profile. - if [[ "${MIG_NODE}" == "true" ]]; then - # A100 GPU has a bit in the physical card (infoROM) to enable mig mode. - # Changing this bit in either direction requires a VM reboot on Azure (hypervisor/plaform stuff). - # Commands such as `nvidia-smi --gpu-reset` may succeed, - # while commands such as `nvidia-smi -q` will show mismatched current/pending mig mode. - # this will not be required per nvidia for next gen H100. - REBOOTREQUIRED=true - - # this service applies the partitioning scheme with nvidia-smi. - # we should consider moving to mig-parted which is simpler/newer. - # we couldn't because of old drivers but that has long been fixed. - logs_to_events "AKS.CSE.ensureMigPartition" ensureMigPartition - fi -fi - -echo $(date),$(hostname), "End configuring GPU drivers" - -logs_to_events "AKS.CSE.installKubeletKubectlAndKubeProxy" installKubeletKubectlAndKubeProxy - -createKubeManifestDir - -if [ "${HAS_CUSTOM_SEARCH_DOMAIN}" == "true" ]; then - "${CUSTOM_SEARCH_DOMAIN_FILEPATH}" > /opt/azure/containers/setup-custom-search-domain.log 2>&1 || exit $ERR_CUSTOM_SEARCH_DOMAINS_FAIL -fi - - -# for drop ins, so they don't all have to check/create the dir -mkdir -p "/etc/systemd/system/kubelet.service.d" - -logs_to_events "AKS.CSE.configureK8s" configureK8s - -logs_to_events "AKS.CSE.configureCNI" configureCNI - -# configure and enable dhcpv6 for dual stack feature -if [ "${IPV6_DUAL_STACK_ENABLED}" == "true" ]; then - logs_to_events "AKS.CSE.ensureDHCPv6" ensureDHCPv6 -fi - -if [ "${NEEDS_CONTAINERD}" == "true" ]; then - # containerd should not be configured until cni has been configured first - logs_to_events "AKS.CSE.ensureContainerd" ensureContainerd -else - logs_to_events "AKS.CSE.ensureDocker" ensureDocker -fi - -if [[ "${MESSAGE_OF_THE_DAY}" != "" ]]; then - echo "${MESSAGE_OF_THE_DAY}" | base64 -d > /etc/motd -fi - -# must run before kubelet starts to avoid race in container status using wrong image -# https://github.com/kubernetes/kubernetes/issues/51017 -# can remove when fixed -if [[ "${TARGET_CLOUD}" == "AzureChinaCloud" ]]; then - retagMCRImagesForChina -fi - -if [[ "${ENABLE_HOSTS_CONFIG_AGENT}" == "true" ]]; then - logs_to_events "AKS.CSE.configPrivateClusterHosts" configPrivateClusterHosts -fi - -if [ "${SHOULD_CONFIG_TRANSPARENT_HUGE_PAGE}" == "true" ]; then - logs_to_events "AKS.CSE.configureTransparentHugePage" configureTransparentHugePage -fi - -if [ "${SHOULD_CONFIG_SWAP_FILE}" == "true" ]; then - logs_to_events "AKS.CSE.configureSwapFile" configureSwapFile -fi - -if [ "${NEEDS_CGROUPV2}" == "true" ]; then - tee "/etc/systemd/system/kubelet.service.d/10-cgroupv2.conf" > /dev/null < /etc/containerd/kubenet_template.conf - - # In k8s 1.27, the flag --container-runtime was removed. - # We now have 2 drop-in's, one with the still valid flags that will be applied to all k8s versions, - # the flags are --runtime-request-timeout, --container-runtime-endpoint, --runtime-cgroups - # For k8s >= 1.27, the flag --container-runtime will not be passed. - tee "/etc/systemd/system/kubelet.service.d/10-containerd-base-flag.conf" > /dev/null <<'EOF' -[Service] -Environment="KUBELET_CONTAINERD_FLAGS=--runtime-request-timeout=15m --container-runtime-endpoint=unix:///run/containerd/containerd.sock --runtime-cgroups=/system.slice/containerd.service" -EOF - - # if k8s version < 1.27.0, add the drop in for --container-runtime flag - if ! semverCompare ${KUBERNETES_VERSION:-"0.0.0"} "1.27.0"; then - tee "/etc/systemd/system/kubelet.service.d/10-container-runtime-flag.conf" > /dev/null <<'EOF' -[Service] -Environment="KUBELET_CONTAINER_RUNTIME_FLAG=--container-runtime=remote" -EOF - fi -fi - -if [ "${HAS_KUBELET_DISK_TYPE}" == "true" ]; then - tee "/etc/systemd/system/kubelet.service.d/10-bindmount.conf" > /dev/null < /sys/bus/vmbus/drivers/hv_util/unbind - sed -i "13i\echo 2dd1ce17-079e-403c-b352-a1921ee207ee > /sys/bus/vmbus/drivers/hv_util/unbind\n" /etc/rc.local - fi -fi - -VALIDATION_ERR=0 - -# TODO(djsly): Look at leveraging the `aks-check-network.sh` script for this validation instead of duplicating the logic here - -# Edge case scenarios: -# high retry times to wait for new API server DNS record to replicate (e.g. stop and start cluster) -# high timeout to address high latency for private dns server to forward request to Azure DNS -# dns check will be done only if we use FQDN for API_SERVER_NAME -API_SERVER_CONN_RETRIES=50 -if [[ $API_SERVER_NAME == *.privatelink.* ]]; then - API_SERVER_CONN_RETRIES=100 -fi -if ! [[ ${API_SERVER_NAME} =~ ^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$ ]]; then - API_SERVER_DNS_RETRIES=100 - if [[ $API_SERVER_NAME == *.privatelink.* ]]; then - API_SERVER_DNS_RETRIES=200 - fi - if [[ "${ENABLE_HOSTS_CONFIG_AGENT}" != "true" ]]; then - RES=$(logs_to_events "AKS.CSE.apiserverNslookup" "retrycmd_if_failure ${API_SERVER_DNS_RETRIES} 1 20 nslookup -timeout=5 -retry=0 ${API_SERVER_NAME}") - STS=$? - else - STS=0 - fi - if [[ $STS != 0 ]]; then - time nslookup ${API_SERVER_NAME} - if [[ $RES == *"168.63.129.16"* ]]; then - VALIDATION_ERR=$ERR_K8S_API_SERVER_AZURE_DNS_LOOKUP_FAIL - else - VALIDATION_ERR=$ERR_K8S_API_SERVER_DNS_LOOKUP_FAIL - fi - else - if [ "${UBUNTU_RELEASE}" == "18.04" ]; then - #TODO (djsly): remove this once 18.04 isn't supported anymore - logs_to_events "AKS.CSE.apiserverNC" "retrycmd_if_failure ${API_SERVER_CONN_RETRIES} 1 10 nc -vz ${API_SERVER_NAME} 443" || time nc -vz ${API_SERVER_NAME} 443 || VALIDATION_ERR=$ERR_K8S_API_SERVER_CONN_FAIL - else - logs_to_events "AKS.CSE.apiserverCurl" "retrycmd_if_failure ${API_SERVER_CONN_RETRIES} 1 10 curl -v --cacert /etc/kubernetes/certs/ca.crt https://${API_SERVER_NAME}:443" || time curl -v --cacert /etc/kubernetes/certs/ca.crt "https://${API_SERVER_NAME}:443" || VALIDATION_ERR=$ERR_K8S_API_SERVER_CONN_FAIL - fi - fi -else - if [ "${UBUNTU_RELEASE}" == "18.04" ]; then - #TODO (djsly): remove this once 18.04 isn't supported anymore - logs_to_events "AKS.CSE.apiserverNC" "retrycmd_if_failure ${API_SERVER_CONN_RETRIES} 1 10 nc -vz ${API_SERVER_NAME} 443" || time nc -vz ${API_SERVER_NAME} 443 || VALIDATION_ERR=$ERR_K8S_API_SERVER_CONN_FAIL - else - logs_to_events "AKS.CSE.apiserverCurl" "retrycmd_if_failure ${API_SERVER_CONN_RETRIES} 1 10 curl -v --cacert /etc/kubernetes/certs/ca.crt https://${API_SERVER_NAME}:443" || time curl -v --cacert /etc/kubernetes/certs/ca.crt "https://${API_SERVER_NAME}:443" || VALIDATION_ERR=$ERR_K8S_API_SERVER_CONN_FAIL - fi -fi - -if [[ ${ID} != "mariner" ]] && [[ ${ID} != "azurelinux" ]]; then - echo "Recreating man-db auto-update flag file and kicking off man-db update process at $(date)" - createManDbAutoUpdateFlagFile - /usr/bin/mandb && echo "man-db finished updates at $(date)" & -fi - -if $REBOOTREQUIRED; then - echo 'reboot required, rebooting node in 1 minute' - /bin/bash -c "shutdown -r 1 &" - if [[ $OS == $UBUNTU_OS_NAME ]]; then - # logs_to_events should not be run on & commands - aptmarkWALinuxAgent unhold & - fi -else - if [[ $OS == $UBUNTU_OS_NAME ]]; then - # logs_to_events should not be run on & commands - if [ "${ENABLE_UNATTENDED_UPGRADES}" == "true" ]; then - UU_CONFIG_DIR="/etc/apt/apt.conf.d/99periodic" - mkdir -p "$(dirname "${UU_CONFIG_DIR}")" - touch "${UU_CONFIG_DIR}" - chmod 0644 "${UU_CONFIG_DIR}" - echo 'APT::Periodic::Update-Package-Lists "1";' >> "${UU_CONFIG_DIR}" - echo 'APT::Periodic::Unattended-Upgrade "1";' >> "${UU_CONFIG_DIR}" - systemctl unmask apt-daily.service apt-daily-upgrade.service - systemctl enable apt-daily.service apt-daily-upgrade.service - systemctl enable apt-daily.timer apt-daily-upgrade.timer - systemctl restart --no-block apt-daily.timer apt-daily-upgrade.timer - # this is the DOWNLOAD service - # meaning we are wasting IO without even triggering an upgrade - # -________________- - systemctl restart --no-block apt-daily.service - - fi - aptmarkWALinuxAgent unhold & - elif isMarinerOrAzureLinux "$OS"; then - if [ "${ENABLE_UNATTENDED_UPGRADES}" == "true" ]; then - if [ "${IS_KATA}" == "true" ]; then - # Currently kata packages must be updated as a unit (including the kernel which requires a reboot). This can - # only be done reliably via image updates as of now so never enable automatic updates. - echo 'EnableUnattendedUpgrade is not supported by kata images, will not be enabled' - else - # By default the dnf-automatic is service is notify only in Mariner/AzureLinux. - # Enable the automatic install timer and the check-restart timer. - # Stop the notify only dnf timer since we've enabled the auto install one. - # systemctlDisableAndStop adds .service to the end which doesn't work on timers. - systemctl disable dnf-automatic-notifyonly.timer - systemctl stop dnf-automatic-notifyonly.timer - # At 6:00:00 UTC (1 hour random fuzz) download and install package updates. - systemctl unmask dnf-automatic-install.service || exit $ERR_SYSTEMCTL_START_FAIL - systemctl unmask dnf-automatic-install.timer || exit $ERR_SYSTEMCTL_START_FAIL - systemctlEnableAndStart dnf-automatic-install.timer || exit $ERR_SYSTEMCTL_START_FAIL - # The check-restart service which will inform kured of required restarts should already be running - fi - fi - fi -fi - -echo "Custom script finished. API server connection check code:" $VALIDATION_ERR -echo $(date),$(hostname), endcustomscript>>/opt/m -mkdir -p /opt/azure/containers && touch /opt/azure/containers/provision.complete - -exit $VALIDATION_ERR - - -#EOF diff --git a/self-contained/bootstrap_start.sh b/self-contained/bootstrap_start.sh deleted file mode 100644 index 6e5eea936d0..00000000000 --- a/self-contained/bootstrap_start.sh +++ /dev/null @@ -1,96 +0,0 @@ -CSE_STARTTIME=$(date) -CSE_STARTTIME_FORMATTED=$(date +"%F %T.%3N") -timeout -k5s 15m /bin/bash /opt/azure/containers/provision.sh >> /var/log/azure/cluster-provision.log 2>&1 -EXIT_CODE=$? -systemctl --no-pager -l status kubelet >> /var/log/azure/cluster-provision-cse-output.log 2>&1 -OUTPUT=$(tail -c 3000 "/var/log/azure/cluster-provision.log") -KERNEL_STARTTIME=$(systemctl show -p KernelTimestamp | sed -e "s/KernelTimestamp=//g" || true) -KERNEL_STARTTIME_FORMATTED=$(date -d "${KERNEL_STARTTIME}" +"%F %T.%3N" ) -CLOUDINITLOCAL_STARTTIME=$(systemctl show cloud-init-local -p ExecMainStartTimestamp | sed -e "s/ExecMainStartTimestamp=//g" || true) -CLOUDINITLOCAL_STARTTIME_FORMATTED=$(date -d "${CLOUDINITLOCAL_STARTTIME}" +"%F %T.%3N" ) -CLOUDINIT_STARTTIME=$(systemctl show cloud-init -p ExecMainStartTimestamp | sed -e "s/ExecMainStartTimestamp=//g" || true) -CLOUDINIT_STARTTIME_FORMATTED=$(date -d "${CLOUDINIT_STARTTIME}" +"%F %T.%3N" ) -CLOUDINITFINAL_STARTTIME=$(systemctl show cloud-final -p ExecMainStartTimestamp | sed -e "s/ExecMainStartTimestamp=//g" || true) -CLOUDINITFINAL_STARTTIME_FORMATTED=$(date -d "${CLOUDINITFINAL_STARTTIME}" +"%F %T.%3N" ) -NETWORKD_STARTTIME=$(systemctl show systemd-networkd -p ExecMainStartTimestamp | sed -e "s/ExecMainStartTimestamp=//g" || true) -NETWORKD_STARTTIME_FORMATTED=$(date -d "${NETWORKD_STARTTIME}" +"%F %T.%3N" ) -GUEST_AGENT_STARTTIME=$(systemctl show walinuxagent.service -p ExecMainStartTimestamp | sed -e "s/ExecMainStartTimestamp=//g" || true) -GUEST_AGENT_STARTTIME_FORMATTED=$(date -d "${GUEST_AGENT_STARTTIME}" +"%F %T.%3N" ) -KUBELET_START_TIME=$(systemctl show kubelet.service -p ExecMainStartTimestamp | sed -e "s/ExecMainStartTimestamp=//g" || true) -KUBELET_START_TIME_FORMATTED=$(date -d "${KUBELET_START_TIME}" +"%F %T.%3N" ) -KUBELET_READY_TIME_FORMATTED="$(date -d "$(journalctl -u kubelet | grep NodeReady | cut -d' ' -f1-3)" +"%F %T.%3N")" -SYSTEMD_SUMMARY=$(systemd-analyze || true) -CSE_ENDTIME_FORMATTED=$(date +"%F %T.%3N") -EVENTS_LOGGING_DIR=/var/log/azure/Microsoft.Azure.Extensions.CustomScript/events/ -EVENTS_FILE_NAME=$(date +%s%3N) -EXECUTION_DURATION=$(echo $(($(date +%s) - $(date -d "$CSE_STARTTIME" +%s)))) - -JSON_STRING=$( jq -n \ - --arg ec "$EXIT_CODE" \ - --arg op "$OUTPUT" \ - --arg er "" \ - --arg ed "$EXECUTION_DURATION" \ - --arg ks "$KERNEL_STARTTIME" \ - --arg cinitl "$CLOUDINITLOCAL_STARTTIME" \ - --arg cinit "$CLOUDINIT_STARTTIME" \ - --arg cf "$CLOUDINITFINAL_STARTTIME" \ - --arg ns "$NETWORKD_STARTTIME" \ - --arg cse "$CSE_STARTTIME" \ - --arg ga "$GUEST_AGENT_STARTTIME" \ - --arg ss "$SYSTEMD_SUMMARY" \ - --arg kubelet "$KUBELET_START_TIME" \ - '{ExitCode: $ec, Output: $op, Error: $er, ExecDuration: $ed, KernelStartTime: $ks, CloudInitLocalStartTime: $cinitl, CloudInitStartTime: $cinit, CloudFinalStartTime: $cf, NetworkdStartTime: $ns, CSEStartTime: $cse, GuestAgentStartTime: $ga, SystemdSummary: $ss, BootDatapoints: { KernelStartTime: $ks, CSEStartTime: $cse, GuestAgentStartTime: $ga, KubeletStartTime: $kubelet }}' ) -mkdir -p /var/log/azure/aks -echo $JSON_STRING | tee /var/log/azure/aks/provision.json - -# messsage_string is here because GA only accepts strings in Message. -message_string=$( jq -n \ ---arg EXECUTION_DURATION "${EXECUTION_DURATION}" \ ---arg EXIT_CODE "${EXIT_CODE}" \ ---arg KERNEL_STARTTIME_FORMATTED "${KERNEL_STARTTIME_FORMATTED}" \ ---arg CLOUDINITLOCAL_STARTTIME_FORMATTED "${CLOUDINITLOCAL_STARTTIME_FORMATTED}" \ ---arg CLOUDINIT_STARTTIME_FORMATTED "${CLOUDINIT_STARTTIME_FORMATTED}" \ ---arg CLOUDINITFINAL_STARTTIME_FORMATTED "${CLOUDINITFINAL_STARTTIME_FORMATTED}" \ ---arg NETWORKD_STARTTIME_FORMATTED "${NETWORKD_STARTTIME_FORMATTED}" \ ---arg GUEST_AGENT_STARTTIME_FORMATTED "${GUEST_AGENT_STARTTIME_FORMATTED}" \ ---arg KUBELET_START_TIME_FORMATTED "${KUBELET_START_TIME_FORMATTED}" \ ---arg KUBELET_READY_TIME_FORMATTED "${KUBELET_READY_TIME_FORMATTED}" \ -'{ExitCode: $EXIT_CODE, E2E: $EXECUTION_DURATION, KernelStartTime: $KERNEL_STARTTIME_FORMATTED, CloudInitLocalStartTime: $CLOUDINITLOCAL_STARTTIME_FORMATTED, CloudInitStartTime: $CLOUDINIT_STARTTIME_FORMATTED, CloudFinalStartTime: $CLOUDINITFINAL_STARTTIME_FORMATTED, NetworkdStartTime: $NETWORKD_STARTTIME_FORMATTED, GuestAgentStartTime: $GUEST_AGENT_STARTTIME_FORMATTED, KubeletStartTime: $KUBELET_START_TIME_FORMATTED, KubeletReadyTime: $KUBELET_READY_TIME_FORMATTED } | tostring' -) -# this clean up brings me no joy, but removing extra "\" and then removing quotes at the end of the string -# allows parsing to happening without additional manipulation -message_string=$(echo $message_string | sed 's/\\//g' | sed 's/^.\(.*\).$/\1/') - -# arg names are defined by GA and all these are required to be correctly read by GA -# EventPid, EventTid are required to be int. No use case for them at this point. -EVENT_JSON=$( jq -n \ - --arg Timestamp "${CSE_STARTTIME_FORMATTED}" \ - --arg OperationId "${CSE_ENDTIME_FORMATTED}" \ - --arg Version "1.23" \ - --arg TaskName "AKS.CSE.cse_start" \ - --arg EventLevel "${eventlevel}" \ - --arg Message "${message_string}" \ - --arg EventPid "0" \ - --arg EventTid "0" \ - '{Timestamp: $Timestamp, OperationId: $OperationId, Version: $Version, TaskName: $TaskName, EventLevel: $EventLevel, Message: $Message, EventPid: $EventPid, EventTid: $EventTid}' -) -echo ${EVENT_JSON} > ${EVENTS_LOGGING_DIR}${EVENTS_FILE_NAME}.json - -# force a log upload to the host after the provisioning script finishes -# if we failed, wait for the upload to complete so that we don't remove -# the VM before it finishes. if we succeeded, upload in the background -# so that the provisioning script returns success more quickly -upload_logs() { - # find the most recent version of WALinuxAgent and use it to collect logs per - # https://supportability.visualstudio.com/AzureIaaSVM/_wiki/wikis/AzureIaaSVM/495009/Log-Collection_AGEX?anchor=manually-collect-logs - PYTHONPATH=$(find /var/lib/waagent -name WALinuxAgent\*.egg | sort -rV | head -n1) - python3 $PYTHONPATH -collect-logs -full >/dev/null 2>&1 - python3 /opt/azure/containers/provision_send_logs.py >/dev/null 2>&1 -} -if [ $EXIT_CODE -ne 0 ]; then - upload_logs -else - upload_logs & -fi - -exit $EXIT_CODE \ No newline at end of file