-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathinstall.sh
executable file
·316 lines (259 loc) · 13.5 KB
/
install.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
#!/bin/bash
# Strict mode: stop on a failing command, on expansion of an unset variable,
# and when any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# Absolute path of the directory that contains this script
MYDIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Files and directories located next to the script
CONFIG_FILE="${MYDIR}/config.yaml"
TEMPLATES_PATH="${MYDIR}/templates"
# Print the usage summary (description, synopsis and the list of supported
# options) on stdout, framed by blank lines.
help() {
  printf '%s\n' \
    '' \
    'Deploys and configure Azure infrastrucure for CycleCloud cluster' \
    '' \
    'USAGE: install.sh [OPTION]' \
    'Options:' \
    '-a Run only ansible configuration' \
    '-b Run only bastion deployment' \
    '-s Run post-deployment routine for deployment to adhere to Microsoft security policies' \
    '-h Prints this help' \
    ''
}
# Escape a string so it can be used verbatim as the replacement part of a
# sed `s|...|...|` command: backslash, `&` (whole match) and the `|` delimiter
# are prefixed with a backslash. Values containing newlines are not supported.
_sed_escape_replacement() {
  printf '%s' "$1" | sed -e 's/[\\&|]/\\&/g'
}

#######################################
# Generate the Bastion helper scripts for a set of VMs.
#
# For each of the `bastion_ssh` and `bastion_tunnel` templates located in
# TEMPLATES_PATH, the placeholders <RESOURCEGROUP>, <USERNAME>, <SSHKEYPATH>,
# <BASTIONNAME>, <VMRESOURCEIDS> and <SUBNAME> are substituted and the result
# is written as an executable `<template>_<target>.sh` in the current working
# directory.
#
# Globals (read only):
#   MYDIR, USERNAME   - used to build the SSH private key path
#   RESOURCE_GROUP, SUBSCRIPTION, TEMPLATES_PATH
# Arguments:
#   $1   - target name, used as suffix of the generated script names
#   $2   - Bicep deployment output JSON file (provides the admin user name
#          and the bastion name)
#   $3.. - resource IDs of the target VMs (joined with single spaces)
#######################################
create_bastion_scripts() {
  local target_name=$1
  local bicep_output_file=$2
  local admin_user bastion_name ssh_key_path vm_resource_ids
  local template_root output_script

  # All working variables are local: in particular the admin user name must
  # not overwrite the USER environment variable inherited by child processes.
  admin_user=$(jq -r '.globalVars.value.cycleserverAdmin' "${bicep_output_file}")
  bastion_name=$(jq -r '.globalVars.value.bastionName' "${bicep_output_file}")
  ssh_key_path="${MYDIR}/${USERNAME}_id_rsa"

  # Space-separated list of every argument from the third one onwards.
  # It does not depend on the template, so it is computed once.
  vm_resource_ids=$(IFS=' '; printf '%s' "${*:3}")

  # Substituted values are escaped so that characters that are special in a
  # sed replacement (e.g. `&` in a subscription name) are inserted literally.
  local -a sed_args=(
    -e "s|<RESOURCEGROUP>|$(_sed_escape_replacement "${RESOURCE_GROUP}")|g"
    -e "s|<USERNAME>|$(_sed_escape_replacement "${admin_user}")|g"
    -e "s|<SSHKEYPATH>|$(_sed_escape_replacement "${ssh_key_path}")|g"
    -e "s|<BASTIONNAME>|$(_sed_escape_replacement "${bastion_name}")|g"
    -e "s|<VMRESOURCEIDS>|$(_sed_escape_replacement "${vm_resource_ids}")|g"
    -e "s|<SUBNAME>|$(_sed_escape_replacement "${SUBSCRIPTION}")|g"
  )

  for template_root in bastion_ssh bastion_tunnel; do
    output_script="${template_root}_${target_name}.sh"
    # Paths are quoted so that a script directory containing spaces works
    sed "${sed_args[@]}" "${TEMPLATES_PATH}/${template_root}.template" > "${output_script}"
    chmod +x "${output_script}"
  done
}
# Defaults: the Bicep deployment and the Ansible configuration both run,
# the security post-deployment routine only runs when requested with -s.
RUN_BICEP=true
RUN_ANSIBLE=true
RUN_SECURE=false

# The leading ':' in the option string selects silent error reporting, so an
# unknown option is delivered to the '?' branch below.
while getopts ":absh" OPT; do
  case "${OPT}" in
    a)
      # Ansible only: skip the Bicep deployment
      RUN_BICEP=false
      ;;
    b)
      # Bicep only: skip the Ansible configuration
      RUN_ANSIBLE=false
      ;;
    s)
      RUN_SECURE=true
      ;;
    h)
      help
      exit 0
      ;;
    \?)
      help
      exit 1
      ;;
  esac
done
##############
### CHECKS ###
##############
# Abort the script with exit code 1 and a message on stderr when the given
# command cannot be resolved by `command -v`; return 0 otherwise.
cmd_exists() {
  if ! command -v "$@" > /dev/null 2>&1; then
    echo "$@ is required but not installed. Aborting." >&2
    exit 1
  fi
}
# Abort early when one of the required command-line tools is missing
for REQUIRED_CMD in az jq yq perl rsync; do
  cmd_exists "${REQUIRED_CMD}"
done

# Check that config file is valid.
# The validation script is sourced, so it runs in the current shell; its path
# is relative to the current working directory.
source ./scripts/validate_config.sh

# Make sure submodules are also cloned
git submodule update --init --recursive
#############
### BICEP ###
#############
# Those variables must be exported to be visible from Ansible.
# Assignment and export are separate statements on purpose: the status of
# `export VAR=$(cmd)` is the status of `export`, which would hide a failing
# `yq` from `set -e` and let the script continue with empty values.
RESOURCE_GROUP=$(yq -r '.resource_group_name' "${CONFIG_FILE}")
SUBSCRIPTION=$(yq -r '.subscription_name' "${CONFIG_FILE}")
REGION=$(yq -r '.region' "${CONFIG_FILE}")
export RESOURCE_GROUP SUBSCRIPTION REGION

# Admin user name: the single-quoted value found on the cycleAdminUsername
# line of the Bicep parameter file
USERNAME=$(grep cycleAdminUsername bicep/params.bicepparam | cut -d"'" -f 2)
# SSH private key file name derived from the admin user name
KEYFILE="${USERNAME}_id_rsa"
# File holding the generated MySQL admin password
MYSQL_PWD_FILE="mysql_admin_pwd.txt"
# Rewrite the JSON file $1 in place with the output of jq invoked with the
# remaining arguments (options and filter) on that file.
# Returns jq's status without touching the target file when jq fails, so that
# `set -e` at the call site aborts the script. (With the `jq ... > tmp && mv`
# form a jq failure is exempt from `set -e` and goes unnoticed.)
_jq_update_file() {
  local json_file=$1
  shift
  jq "$@" "${json_file}" > temp.json || return
  mv temp.json "${json_file}"
}

if [ "${RUN_BICEP}" == true ]; then
  DEPLOYMENT_NAME=bicepdeploy-$(date +%Y%m%d%H%M%S)
  DEPLOYMENT_OUTPUT="${RESOURCE_GROUP}_${DEPLOYMENT_NAME}.json"

  # Generate the SSH keypair of the admin user unless one already exists
  if [ ! -f "./${KEYFILE}" ]; then
    echo "Generating new keypair for ${USERNAME}"
    ssh-keygen -m PEM -t rsa -b 4096 -f "./${KEYFILE}" -N ''
    # Remove newline after public key to avoid issues when using it as parameter json files
    perl -pi -e 'chomp if eof' "./${KEYFILE}.pub"
  fi

  # Generate password for MySQL database unless one already exists
  if [ ! -f "./${MYSQL_PWD_FILE}" ]; then
    echo "Generating new password for MySQL database"
    openssl rand -base64 16 | tr -d '\n' > "./${MYSQL_PWD_FILE}"
    chmod 400 "./${MYSQL_PWD_FILE}"
  fi

  # Make sure we are using the correct subscription
  az account set --subscription "${SUBSCRIPTION}"

  # Accept Azure Marketplace terms for CycleCloud image
  az vm image terms accept --publisher azurecyclecloud \
    --offer azure-cyclecloud \
    --plan cyclecloud8-gen2

  # Required to grant access to key vault secrets.
  # Assigned before being exported so that a failing `az` call is not masked
  # by the exit status of `export`.
  USER_OBJECTID=$(az ad signed-in-user show --query id --output tsv)
  export USER_OBJECTID

  # Create JSON files for additional bicep variables.
  # Both tag sets are optional: when yq fails (e.g. the key is absent and -e
  # makes it return non-zero) an empty JSON object is written instead.
  if ! yq -ej '.resource_group_tags' "${CONFIG_FILE}" > bicep/rg_tags.json; then
    echo '{}' > bicep/rg_tags.json
  fi
  if ! yq -ej '.monitor_tags' "${CONFIG_FILE}" > bicep/monitor_tags.json; then
    echo '{}' > bicep/monitor_tags.json
  fi

  # If running in secure mode, add security tags to resource group tags
  if [ "${RUN_SECURE}" == true ]; then
    _jq_update_file bicep/rg_tags.json '. += {"AzSecPackAutoConfigReady": "true"}'
  fi

  # Start deployment
  az deployment sub create --template-file bicep/main.bicep \
    --parameters bicep/params.bicepparam \
    --location "${REGION}" \
    --name "${DEPLOYMENT_NAME}"

  # Collect deployment output
  az deployment sub show --name "${DEPLOYMENT_NAME}" \
    --query properties.outputs \
    > "${DEPLOYMENT_OUTPUT}"

  # Assign Metrics Publisher role to Prometheus VM identity
  # Cannot be done in previous bicep deployment as explained here:
  # https://github.com/Azure/bicep/discussions/13352
  ROLE_ASSIGNMENT_OUTPUT_FILE=metrics_publisher_assignment.json
  VM_PRINCIPAL_ID=$(jq -r '.globalVars.value.prometheusVmPrincipalId' "${DEPLOYMENT_OUTPUT}")
  ROLE_SCOPE=$(jq -r '.globalVars.value.dataCollectionRuleId' "${DEPLOYMENT_OUTPUT}")
  az role assignment create --role 'Monitoring Metrics Publisher' \
    --assignee "${VM_PRINCIPAL_ID}" \
    --scope "${ROLE_SCOPE}" > "${ROLE_ASSIGNMENT_OUTPUT_FILE}"

  # Add the system managed identity application ID to the deployment output file
  APP_ID=$(jq -r '.principalName' "${ROLE_ASSIGNMENT_OUTPUT_FILE}")
  # Sometimes the application ID is not immediately available, so we try again
  # every 5 seconds until the value is at least 36 characters long (the length
  # of a GUID).
  # NOTE(review): this loop has no upper bound on the number of attempts.
  while [ "${#APP_ID}" -lt 36 ]; do
    sleep 5
    APP_ID=$(az role assignment list --role 'Monitoring Metrics Publisher' --assignee "${VM_PRINCIPAL_ID}" --scope "${ROLE_SCOPE}" --query '[].principalName' --output tsv)
  done

  # Propagate env vars to Ansible via the deployment output file.
  # NOTE(review): HPC_SKU, HPC_MAX_CORE_COUNT and HPC_MAX_NUM_VMS are not
  # assigned in this file; presumably they come from the sourced
  # validate_config.sh or from the environment - confirm.
  _jq_update_file "${DEPLOYMENT_OUTPUT}" --arg appId "${APP_ID}" '.globalVars.value.prometheusMetricsPubAppId = $appId'
  _jq_update_file "${DEPLOYMENT_OUTPUT}" --arg hpcSku "${HPC_SKU}" '.globalVars.value.hpcSku = $hpcSku'
  _jq_update_file "${DEPLOYMENT_OUTPUT}" --arg hpcMaxCoreCount "${HPC_MAX_CORE_COUNT}" '.globalVars.value.hpcMaxCoreCount = $hpcMaxCoreCount'
  _jq_update_file "${DEPLOYMENT_OUTPUT}" --arg hpcMaxNumVMs "${HPC_MAX_NUM_VMS}" '.globalVars.value.hpcMaxNumVMs = $hpcMaxNumVMs'

  # Add fields removed from deployment output to be ingested by Ansible.
  # The files are the ones generated above (KEYFILE / MYSQL_PWD_FILE) rather
  # than hard-coded names, so a non-default admin user name works. They are
  # read into variables first so that a missing file aborts the script.
  CYCLESERVER_ADMIN_PUB_KEY=$(cat "./${KEYFILE}.pub")
  MYSQL_ADMIN_PWD=$(cat "./${MYSQL_PWD_FILE}")
  _jq_update_file "${DEPLOYMENT_OUTPUT}" --arg cycleserverAdminPubKey "${CYCLESERVER_ADMIN_PUB_KEY}" '.globalVars.value.cycleserverAdminPubKey = $cycleserverAdminPubKey'
  _jq_update_file "${DEPLOYMENT_OUTPUT}" --arg mySqlPwd "${MYSQL_ADMIN_PWD}" '.globalVars.value.mySqlPwd = $mySqlPwd'

  rm -f "${ROLE_ASSIGNMENT_OUTPUT_FILE}"
fi
# Use the latest available Bicep deployment output (most recently modified
# file matching the resource group prefix)
DEPLOYMENT_OUTPUT=$(ls -t "${RESOURCE_GROUP}"_bicepdeploy-*.json | head -n 1)

# Generate bastion scripts for cycleserver and prometheus VMs.
# Each entry is "<target name>:<key of the VM resource ID under globalVars.value>".
for BASTION_TARGET in cycleserver:cycleserverId prometheus:prometheusVmId; do
  VM_ID=$(jq -r ".globalVars.value.${BASTION_TARGET#*:}" "${DEPLOYMENT_OUTPUT}")
  create_bastion_scripts "${BASTION_TARGET%%:*}" "${DEPLOYMENT_OUTPUT}" "${VM_ID}"
done
###############
### ANSIBLE ###
###############
if [ "${RUN_ANSIBLE}" == true ]; then
  # Install Ansible in conda environment, unless ./miniconda already exists.
  # The installer output is captured in ansible_install.log.
  if [ ! -d ./miniconda ]; then
    ./ansible/install/install_ansible.sh > "${MYDIR}/ansible_install.log" 2>&1
  fi

  # The special variable @ must be set to empty before activating the conda
  # environment as the conda activate script appends it to the conda command
  # causing it to fail if still containing the install script options
  set --

  # Activate conda environment
  source "${MYDIR}/miniconda/bin/activate"

  # Create inventory file with the appropriate variable to execute through jump host
  ANSIBLE_INVENTORY="${MYDIR}/ansible/inventory.json"
  sed "s|ROOT_DIR|${MYDIR}|g" ansible/templates/ssh_jumphost_vars.json.tmpl > ansible/templates/ssh_jumphost_vars.json
  jq -s '.[0].ansible_inventory.value * {"all": .[1]}' "${DEPLOYMENT_OUTPUT}" ansible/templates/ssh_jumphost_vars.json > "${ANSIBLE_INVENTORY}"

  # Create global variables file
  mkdir -p ansible/group_vars/all
  jq -s '.[].globalVars.value' "${DEPLOYMENT_OUTPUT}" > ansible/group_vars/all/global_vars.json

  # Open SSH tunnel through bastion (runs in the background)
  ./bastion_tunnel_cycleserver.sh 22 10022 &
  sleep 5

  # Kill tunnel processes on exit.
  # NOTE(review): the PID list contains every process whose `ps aux` line
  # mentions "bastion", not only the tunnel started above.
  TUNNEL_PIDS=$(ps aux | grep bastion | grep -v grep | awk '{print $2}')
  trap 'kill $(echo $TUNNEL_PIDS)' EXIT

  # Run Ansible playbooks
  export ANSIBLE_CONFIG="${MYDIR}/ansible/ansible.cfg"
  ansible-playbook -i "${ANSIBLE_INVENTORY}" ansible/playbooks/cyclecloud.yml
  sleep 15 # Necessary for SSH control persist to expire (see ansible.cfg)
  ansible-playbook -i "${ANSIBLE_INVENTORY}" ansible/playbooks/prometheus.yml

  # Create Bastion connection scripts for scheduler VM.
  # The VM is looked up by its Name tag, up to 20 times at 5 second intervals.
  SCHEDULER_VM_ID=''
  for i in {1..20}; do
    SCHEDULER_VM_ID=$(az resource list -g "${RESOURCE_GROUP}" --resource-type 'Microsoft.Compute/virtualMachines' --query "[?tags.Name == 'scheduler'].id" -o tsv)
    if [ -n "${SCHEDULER_VM_ID}" ]; then
      break
    fi
    # If scheduler VM is not yet created, wait and try again
    echo "Scheduler VM not yet allocated. Retrying bastion scripts generation in 5 seconds..."
    sleep 5
  done
  if [ -n "${SCHEDULER_VM_ID}" ]; then
    # Left unquoted on purpose: the query result is split on whitespace into
    # one argument per resource ID.
    # shellcheck disable=SC2086
    create_bastion_scripts 'scheduler' "${DEPLOYMENT_OUTPUT}" ${SCHEDULER_VM_ID}
  else
    # The script still continues, but no longer gives up silently
    echo "WARNING: scheduler VM not found after 20 attempts, bastion scripts for the scheduler were not generated" >&2
  fi

  # Create Bastion connection scripts for login VMs
  NUMBER_OF_LOGIN_VMS=$(jq -r '.globalVars.value.loginNicsCount' "${DEPLOYMENT_OUTPUT}")
  if ! [[ "${NUMBER_OF_LOGIN_VMS}" =~ ^[0-9]+$ ]]; then
    echo "Invalid number of login VMs in ${DEPLOYMENT_OUTPUT}: '${NUMBER_OF_LOGIN_VMS}'" >&2
    exit 1
  fi
  LOGIN_VM_IDS=()
  # Arithmetic loop instead of `seq 1 N`: some seq implementations count
  # down (1 0) when N is 0, which would trigger lookups for non-existent VMs.
  for ((LOGIN_VM_IDX = 1; LOGIN_VM_IDX <= NUMBER_OF_LOGIN_VMS; LOGIN_VM_IDX++)); do
    for i in {1..20}; do
      LOGIN_VM_ID=$(az resource list -g "${RESOURCE_GROUP}" --resource-type 'Microsoft.Compute/virtualMachines' --query "[?tags.Name == 'login${LOGIN_VM_IDX}'].id" -o tsv)
      if [ -n "${LOGIN_VM_ID}" ]; then
        # Unquoted on purpose: one array element per resource ID returned
        # shellcheck disable=SC2206
        LOGIN_VM_IDS+=(${LOGIN_VM_ID})
        break
      fi
      # If login VM is not yet created, wait and try again
      echo "Login VM ${LOGIN_VM_IDX} not yet allocated. Retrying bastion scripts generation in 5 seconds..."
      sleep 5
    done
  done

  # Create scripts only if all VM resource IDs have been collected
  if [ "${#LOGIN_VM_IDS[@]}" -eq "${NUMBER_OF_LOGIN_VMS}" ]; then
    create_bastion_scripts 'login' "${DEPLOYMENT_OUTPUT}" "${LOGIN_VM_IDS[@]}"
  else
    echo "Could not retreive all login VMs resource ID"
    exit 1
  fi
fi
# Apply security post-deployment routine if requested
if [ "${RUN_SECURE}" == true ]; then
  echo "Applying post-deployment security configuration"

  # Remove NRMS Rule 103 and Rule 104 which are applied by policy if they exist in the NSG
  # These rules are automatically added to engineering accounts, but then violate some other security policies
  NSG_NAMES=$(az network nsg list --resource-group "${RESOURCE_GROUP}" --query "[].name" -o tsv)
  for NSG_NAME in ${NSG_NAMES}; do
    echo "Removing NRMS rules 103 and 104 from NSG ${NSG_NAME}"
    for NRMS_RULE in NRMS-Rule-103 NRMS-Rule-104; do
      az network nsg rule delete --name "${NRMS_RULE}" --nsg-name "${NSG_NAME}" --resource-group "${RESOURCE_GROUP}" --output none
    done
  done

  # Associate NSG from CycleServer to all the subnets in the VNET
  # (every subnet except AzureBastionSubnet)
  VNET_NAME=$(az network vnet list --resource-group "${RESOURCE_GROUP}" --query "[].name" -o tsv)
  SUBNET_NAMES=$(
    az network vnet subnet list --resource-group "${RESOURCE_GROUP}" --vnet-name "${VNET_NAME}" --query "[].name" -o tsv \
      | grep -v AzureBastionSubnet
  )
  CYCLESERVER_NSG_NAME='cycleserverNSG'
  for SUBNET in ${SUBNET_NAMES}; do
    az network vnet subnet update \
      --resource-group "${RESOURCE_GROUP}" \
      --vnet-name "${VNET_NAME}" \
      --name "${SUBNET}" \
      --network-security-group "${CYCLESERVER_NSG_NAME}"
  done

  # Remove public access of KeyVault
  KV_NAME=$(jq -r '.globalVars.value.keyVaultName' "${DEPLOYMENT_OUTPUT}")
  echo "Updating KeyVault"
  az keyvault update --name "${KV_NAME}" --default-action Deny --bypass AzureServices --output none

  # Remove public access of Blob
  BLOB_NAME=$(jq -r '.globalVars.value.lockerAccountName' "${DEPLOYMENT_OUTPUT}")
  echo "Updating Blob access"
  az storage account update --name "${BLOB_NAME}" --allow-blob-public-access false --output none

  # Add AAD extension and enable autopatching of deployed VMs
  echo "Updating VM config"
  VMs=$(az vm list --resource-group "${RESOURCE_GROUP}" --query "[].name" -o tsv)
  echo "${VMs}"
  for VM in ${VMs}; do
    az vm extension set \
      --resource-group "${RESOURCE_GROUP}" --vm-name "${VM}" \
      --name AADSSHLoginForLinux --publisher Microsoft.Azure.ActiveDirectory
    az vm extension set \
      --resource-group "${RESOURCE_GROUP}" --vm-name "${VM}" \
      --name ConfigurationforLinux --publisher Microsoft.GuestConfiguration
    az vm identity assign --resource-group "${RESOURCE_GROUP}" --name "${VM}"
    az vm update \
      --resource-group "${RESOURCE_GROUP}" --name "${VM}" \
      --set osProfile.linuxConfiguration.patchSettings.patchMode=ImageDefault
  done
fi