-
Notifications
You must be signed in to change notification settings - Fork 54
/
Copy pathconfig.tpl.yml
508 lines (478 loc) · 19.8 KB
/
config.tpl.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
---
# yaml-language-server: $schema=config.schema.json
# General deployment settings: cluster name, Azure location and target resource group.
# Name of the cluster
project_name: az-hop
# Azure location name, as returned by the command: az account list-locations -o table
location: westeurope
# Name of the resource group in which all resources are created
resource_group: azhop
# Set to true when using an existing resource group. Default is false.
# When using an existing resource group, make sure the location matches that of the existing resource group.
use_existing_rg: false
# If set to true, disables telemetry for azhop. See https://azure.github.io/az-hop/deploy/telemetry.html.
# optout_telemetry: true
# Create a Log Analytics workspace to enable monitoring and alerting
log_analytics:
  create: true

# Option to install the monitoring agent on static infra VMs.
# Can be disabled if the agent is installed by policy.
monitoring:
  install_agent: true
# If enabled, creates the alert rules associated with az-hop.
# Enabling alerting requires an admin email address to send alerts to.
alerting:
  enabled: true
  # Placeholder address - replace with the mailbox that should receive alerts
  admin_email: admin.mail@contoso.com
  local_volume_threshold: 80
# Additional tags to be added on the Resource Group (free-form key/value pairs)
tags:
  env: dev
  project: azhop
# Define an Azure NetApp Files (ANF) account with a single pool and volume.
# If not present, assume that there is an existing NFS share for the users' home directory.
anf:
  create: true
  # Size of the ANF pool and unique volume (min: 4TB, max: 100TB)
  homefs_size_tb: 4
  # Service level of the ANF volume, can be: Standard, Premium, Ultra
  homefs_service_level: Standard
  # Dual protocol: true to enable SMB support. false by default
  dual_protocol: false
  # If alerting is enabled, alert when the ANF volume reaches this threshold
  alert_threshold: 80
# For small deployments you can use Azure Files instead of ANF for the home directory
azurefiles:
  create: false
  size_gb: 1024
# These mounts will be listed in the Files menu of the OnDemand portal and
# automatically mounted on all compute nodes and remote desktop nodes
mounts:
  # Mount settings for the user home directory. The `home` name can't be changed.
  home:
    # anf or azurefiles, default to anf. One of the two should be defined in order to mount the home directory
    type: anf
    # /sharedhome for example
    mountpoint: /anfhome
    # Specify an existing NFS server name or IP; when using the ANF built in use '{{anf_home_ip}}'
    server: '{{anf_home_ip}}'
    # Specify an existing NFS export directory; when using the ANF built in use '{{anf_home_path}}'
    export: '{{anf_home_path}}'
    # Specify the mount options. Default to rw,hard,rsize=262144,wsize=262144,vers=3,tcp,_netdev
    options: '{{anf_home_opts}}'
  # Example of an additional mount:
  # mount1:
  #   mountpoint: /mount1
  #   server: a.b.c.d  # Specify an existing NFS server name or IP
  #   export: myexport1  # Specify an existing NFS export name
  #   options: my_options  # Specify the mount options.
# Name of the admin account
admin_user: hpcadmin
# Optional: list of identities (object ids) to grant read access to the az-hop key vault
# key_vault_readers:
# Network: virtual network, subnets and optional ASG mapping / peering definitions
network:
  # Create Network and Application Security Rules, true by default, false when using an existing VNET if not specified
  create_nsg: true
  vnet:
    name: hpcvnet  # Optional - default to hpcvnet
    # id:  # If a vnet id is set then no network will be created and the provided vnet will be used
    address_space: "10.0.0.0/23"
    # Special VNET Tags
    # tags:
    #   key1: value1
    # When using an existing VNET, only the subnet names will be used and not the address_prefixes.
    # All subnets are optional.
    # name values can be used to rename the defaults to specific names, address_prefixes to change the IP ranges to be used.
    # All values below are the default values.
    subnets:
      frontend:
        name: frontend
        address_prefixes: "10.0.0.0/29"
        # Create the subnet if true. Default to true when not specified,
        # default to false if using an existing VNET when not specified
        create: true
      admin:
        name: admin
        address_prefixes: "10.0.0.16/28"
        create: true
      netapp:
        name: netapp
        address_prefixes: "10.0.0.32/28"
        create: true
      # The outbounddns subnet is optional and only needed when deploying an Azure Private DNS Resolver
      # outbounddns:
      #   name: outbounddns
      #   address_prefixes: "10.0.0.48/28"
      #   create: true
      ad:
        name: ad
        address_prefixes: "10.0.0.8/29"
        create: true
      # Bastion and Gateway subnets are optional and can be added if a Bastion or a VPN need to be created in the environment
      # bastion:  # Bastion subnet name is always fixed to AzureBastionSubnet
      #   address_prefixes: "10.0.0.64/26"  # CIDR minimal range must be /26
      #   create: true
      # gateway:  # Gateway subnet name is always fixed to GatewaySubnet
      #   address_prefixes: "10.0.0.128/27"  # Recommendation is to use /27 or /26 network
      #   create: true
      compute:
        name: compute
        address_prefixes: "10.0.1.0/24"
        create: true
  # Specify the Application Security Groups mapping if already existing
  # asg:
  #   resource_group:  # name of the resource group containing the ASG. Default to the resource group containing azhop resources
  #   names:  # list of ASG names mapping to the one defined in az-hop
  #     asg-ssh: asg-ssh
  #     asg-rdp: asg-rdp
  #     asg-jumpbox: asg-jumpbox
  #     asg-ad: asg-ad
  #     asg-ad-client: asg-ad-client
  #     asg-lustre: asg-lustre
  #     asg-lustre-client: asg-lustre-client
  #     asg-pbs: asg-pbs
  #     asg-pbs-client: asg-pbs-client
  #     asg-cyclecloud: asg-cyclecloud
  #     asg-cyclecloud-client: asg-cyclecloud-client
  #     asg-nfs-client: asg-nfs-client
  #     asg-telegraf: asg-telegraf
  #     asg-grafana: asg-grafana
  #     asg-robinhood: asg-robinhood
  #     asg-ondemand: asg-ondemand
  #     asg-deployer: asg-deployer
  #     asg-guacamole: asg-guacamole
  #     asg-mariadb-client: asg-mariadb-client
  # peering:  # This list is optional, and can be used to create VNet Peerings in the same subscription.
  #   - vnet_name:  # "VNET Name to Peer to"
  #     vnet_resource_group:  # "Resource Group of the VNET to peer to"
  #     vnet_allow_gateway: false  # optional: allow gateway transit (default: true)
# Specify DNS forwarders available in the network
# dns:
#   forwarders:
#     - { name: foo.com, ips: "10.2.0.4, 10.2.0.5" }
# Settings for working in a locked-down network; adjust and uncomment the entries below as needed
locked_down_network:
  enforce: false
  # Array of CIDR to grant access from, see
  # https://docs.microsoft.com/en-us/azure/storage/common/storage-network-security?tabs=azure-portal#grant-access-from-an-internet-ip-range
  # grant_access_from: [a.b.c.d]
  # Enable public IP creation for Jumpbox, OnDemand and create images. Default to true
  public_ip: true
# Base image configuration. Each image can be either an image reference
# (publisher:offer:sku:version), an image_id from the image registry, or a custom managed image.
linux_base_image: "OpenLogic:CentOS:7_9-gen2:latest"
# Linux image plan if required, format is publisher:product:name (unset here)
linux_base_plan: null
windows_base_image: "MicrosoftWindowsServer:WindowsServer:2019-Datacenter-smalldisk:latest"
lustre_base_image: "azhpc:azurehpc-lustre:azurehpc-lustre-2_12:latest"
# The lustre plan to use (publisher:product:name). Only needed when using the default
# lustre image from the marketplace. Use "::" for an empty plan.
lustre_base_plan: "azhpc:azurehpc-lustre:azurehpc-lustre-2_12"
# Jumpbox VM configuration, only needed when deploying through a public IP and without a configured deployer VM
jumpbox:
  vm_size: Standard_B2ms
  # SSH port under which the jumpbox SSH server listens on the public IP. Default to 22.
  # Change this to, e.g., 2222, if security policies (like "zero trust") in your tenant
  # automatically block access to port 22 from the internet
  # ssh_port: 2222
# Active Directory VM configuration
ad:
  vm_size: Standard_B2ms
  # Enable hybrid benefit for AD, default to false
  hybrid_benefit: false
  # Build AD in High Availability mode (2 Domain Controllers) - default to false
  high_availability: false
# OnDemand VM configuration
ondemand:
  vm_size: Standard_D4s_v5
  # When provided, the fqdn will be used for the certificate server name
  # fqdn: azhop.foo.com
  # Generate an SSL certificate for the OnDemand portal. Default to true
  generate_certificate: true
# Grafana VM configuration
grafana:
  vm_size: Standard_B2ms

# Guacamole VM configuration
guacamole:
  vm_size: Standard_B2ms

# Scheduler VM configuration
scheduler:
  vm_size: Standard_B2ms

# CycleCloud VM configuration
cyclecloud:
  vm_size: Standard_B2ms
  # To pin a specific version, see https://packages.microsoft.com/yumrepos/cyclecloud/
  # version: 8.3.0-3062
# Lustre cluster is optional and can be used to create a Lustre cluster in the environment.
lustre:
  create: true  # true or false to create a lustre cluster
  rbh_sku: "Standard_D8d_v4"
  mds_sku: "Standard_D8d_v4"
  oss_sku: "Standard_D32d_v4"
  oss_count: 2
  hsm_max_requests: 8
  mdt_device: "/dev/sdb"
  ost_device: "/dev/sdb"
  # Optional: use existing storage for the archive.
  # If not included, the azhop storage account that is created will be used.
  # hsm:
  #   storage_account:  # existing_storage_account_name
  #   storage_container:  # only_used_with_existing_storage_account
# List of users to be created on this environment
users:
  # Fields available for each entry:
  #   name: username - must be less than 20 characters
  #   uid: uniqueid
  #   shell: /bin/bash  # default to /bin/bash
  #   home: /anfhome/<user_name>  # default to /homedir_mountpoint/user_name
  #   groups: list of groups the user belongs to
  - name: clusteradmin
    uid: 10001
    groups: [5001, 5002]
  - name: hpcuser
    uid: 10002
  # - { name: user1, uid: 10003, groups: [6000] }
  # - { name: user2, uid: 10004, groups: [6001] }
# Groups that the users above can be assigned to through their gid
usergroups:
  # These groups can't be changed
  - name: Domain Users  # All users will be added to this one by default
    gid: 5000
  - name: az-hop-admins
    gid: 5001
    description: "For users with azhop admin privileges"
  - name: az-hop-localadmins
    gid: 5002
    description: "For users with sudo right or local admin right on nodes"
  # For custom groups use gid >= 6000
  # - name: project1  # For project1 users
  #   gid: 6000
  # - name: project2  # For project2 users
  #   gid: 6001
# Enable cvmfs-eessi - disabled by default
cvmfs_eessi:
  enabled: false
# Scheduler to be installed and configured (openpbs, slurm)
queue_manager: openpbs

# Specific SLURM configuration
slurm:
  # Enable SLURM accounting, this will create a SLURM accounting database in a managed MariaDB server instance
  accounting_enabled: false
  # SLURM version to install. Currently supported: only 20.11.9 and 22.05.3.
  # Other versions can be installed by building from source (see build_rpms setting in the slurmserver role)
  slurm_version: 20.11.9
  # Name of the SLURM cluster for accounting (optional, default to 'slurm')
  # WARNING: changing this value on a running cluster will cause slurmctld to fail to start. This is a
  # safety check to prevent accounting errors. To override, remove /var/spool/slurmd/clustername
  cluster_name: slurm_azhop
# Enroot version setting.
# NOTE(review): treated as a top-level key; the original indentation was lost, so confirm
# against config.schema.json that `enroot` is not meant to be nested under `slurm`.
enroot:
  enroot_version: 3.4.1
# If using an existing managed MariaDB instance for SLURM accounting and/or Guacamole, specify these values
# database:
#   # Admin user of the database, for which the password will be retrieved from the azhop keyvault
#   user: sqladmin
#   # FQDN of the managed instance
#   fqdn:
#   # IP of the managed private endpoint if the FQDN is not registered in a private DNS
#   ip:
# Create a Bastion in the bastion subnet when defined
bastion:
  create: false

# Create a VPN Gateway in the gateway subnet when specified
vpn_gateway:
  create: false
# Authentication configuration for accessing the az-hop portal.
# Default is basic authentication. For oidc authentication you have to specify the values below.
# The OIDC client secret needs to be stored as a secret named <oidc-client-id>-password in the keyvault used by az-hop
authentication:
  httpd_auth: basic  # oidc or basic
  # User mapping https://osc.github.io/ood-documentation/latest/reference/files/ood-portal-yml.html#ood-portal-generator-user-map-match
  # You can specify either a map_match or a user_map_cmd
  # Domain users are mapped to az-hop users with the same name and without the domain name
  # user_map_match: '^([^@]+)@mydomain.foo$'
  # If using a custom mapping script, update it from the ./playbooks/files directory before running the playbook
  # user_map_cmd: /opt/ood/ood_auth_map/bin/custom_mapping.sh
  # ood_auth_openidc:
  #   OIDCProviderMetadataURL:  # for AAD use 'https://sts.windows.net/{{tenant_id}}/.well-known/openid-configuration'
  #   OIDCClientID: 'XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX'
  #   OIDCRemoteUserClaim:  # for AAD use 'upn'
  #   OIDCScope:  # for AAD use 'openid profile email groups'
  #   OIDCPassIDTokenAs:  # for AAD use 'serialized'
  #   OIDCPassRefreshToken:  # for AAD use 'On'
  #   OIDCPassClaimsAs:  # for AAD use 'environment'
# Shared image gallery used to store custom images
image_gallery:
  create: true  # Create the shared image gallery to store custom images
# List of images to be defined
# NOTE(review): `version` values are unquoted, so YAML loads them as numbers (an unquoted
# 7.10 would load as 7.1). Quote them if the consumer accepts strings - confirm against config.schema.json.
images:
  # Template for an entry:
  # - name: image_definition_name  # Should match the packer configuration file name, one per packer file
  #   publisher: azhop
  #   offer: CentOS
  #   sku: 7_9-gen2
  #   hyper_v: V2  # V1 or V2 (V1 is the default)
  #   os_type: Linux  # Linux or Windows
  #   version: 7.9  # Version of the image to create the image definition in SIG. Pattern is major.minor where minor is mandatory
  # Pre-defined images
  - name: azhop-compute-almalinux-8_7
    publisher: azhop
    offer: almalinux
    sku: 8_7-hpc-gen2
    hyper_v: V2
    os_type: Linux
    version: 8.7
  - name: azhop-centos79-v2-rdma-gpgpu
    publisher: azhop
    offer: CentOS
    sku: 7.9-gen2
    hyper_v: V2
    os_type: Linux
    version: 7.9
  # Image definition when using a custom image to build compute nodes images
  - name: azhop-centos79-v2-rdma-ci
    publisher: azhop
    offer: CentOS
    sku: 7.9-gen2-ci
    hyper_v: V2
    os_type: Linux
    version: 7.9
  # Image definition when using a custom image to build remote viz nodes images
  - name: azhop-centos79-desktop3d-ci
    publisher: azhop
    offer: CentOS
    sku: 7.9-gen2-desktop3d-ci
    hyper_v: V2
    os_type: Linux
    version: 7.9
  - name: azhop-centos79-desktop3d
    publisher: azhop
    offer: CentOS
    sku: 7.9-gen2-desktop3d
    hyper_v: V2
    os_type: Linux
    version: 7.9
  - name: azhop-compute-centos-7_9
    publisher: azhpc
    offer: azhop-compute
    sku: centos-7_9
    hyper_v: V2
    os_type: Linux
    version: 7.9
  - name: azhop-desktop-centos-7_9
    publisher: azhpc
    offer: azhop-desktop
    sku: centos-7_9
    hyper_v: V2
    os_type: Linux
    version: 7.9
  - name: azhop-compute-ubuntu-1804
    publisher: azhpc
    offer: azhop-compute
    sku: ubuntu-1804
    hyper_v: V2
    os_type: Linux
    version: 18.04
  - name: azhop-win10
    publisher: azhop
    offer: Windows-10
    sku: 21h1-pron
    hyper_v: V1
    os_type: Windows
    version: 10.19043
  # Base image when building your own HPC image and not using the HPC marketplace images
  - name: base-centos79-v2-rdma
    publisher: azhop
    offer: CentOS
    sku: 7.9-gen2-rdma-nogpu
    hyper_v: V2
    os_type: Linux
    version: 7.9
# Autoscale default settings for all queues, can be overridden on each queue depending on the VM type if needed
autoscale:
  # Idle time in seconds before shutting down VMs - default to 1800 like in CycleCloud
  idle_timeout: 1800
# List of queues (node arrays in Cycle) to be defined.
# Don't use queue names longer than 8 characters in order to leave space for the node suffix,
# as hostnames are limited to 15 chars due to domain join and NETBIOS constraints.
queues:
  - name: execute  # name of the Cycle Cloud node array
    # Azure VM Instance type
    vm_size: Standard_F2s_v2
    # Maximum number of cores that can be instantiated
    max_core_count: 1024
    # Use the pre-built azhop image from the marketplace
    image: azhpc:azhop-compute:centos-7_9:latest
    # Use this image ID when building your own custom images
    # image: /subscriptions/{{subscription_id}}/resourceGroups/{{resource_group}}/providers/Microsoft.Compute/galleries/{{sig_name}}/images/azhop-centos79-v2-rdma-gpgpu/latest
    # Image plan specification (when needed for the image). Terms must be accepted prior to deployment
    # plan: publisher:product:name
    # Set to true if AccelNet needs to be enabled. false is the default value
    EnableAcceleratedNetworking: false
    # Spot instance support. Default is false
    spot: false
    # Set to false to disable creation of placement groups (for SLURM only). Default is true
    ColocateNodes: false
    # Specific idle time in seconds before shutting down VMs, make sure it's lower than autoscale.idle_timeout
    idle_timeout: 300
    # Set the max number of VMs in a VMSS; requires additional limit raise through support ticket for >100;
    # 100 is the default value; lower numbers will improve scaling for single node jobs or jobs with a small number of nodes
    MaxScaleSetSize: 100
  - name: hc44rs
    vm_size: Standard_HC44rs
    max_core_count: 440
    image: azhpc:azhop-compute:centos-7_9:latest
    spot: true
    EnableAcceleratedNetworking: true
  - name: hb120v2
    vm_size: Standard_HB120rs_v2
    max_core_count: 1200
    image: azhpc:azhop-compute:centos-7_9:latest
    spot: true
    EnableAcceleratedNetworking: true
  - name: hb120v3
    vm_size: Standard_HB120rs_v3
    max_core_count: 1200
    image: azhpc:azhop-compute:centos-7_9:latest
    spot: true
    EnableAcceleratedNetworking: true
  # Queue dedicated to GPU remote viz nodes. This name is fixed and can't be changed
  - name: viz3d
    vm_size: Standard_NV12s_v3
    max_core_count: 48
    # Use the pre-built azhop image from the marketplace
    image: azhpc:azhop-desktop:centos-7_9:latest
    # Use this image ID when building your own custom images
    # image: /subscriptions/{{subscription_id}}/resourceGroups/{{resource_group}}/providers/Microsoft.Compute/galleries/{{sig_name}}/images/azhop-centos79-desktop3d/latest
    ColocateNodes: false
    spot: false
    EnableAcceleratedNetworking: true
    max_hours: 12  # Maximum session duration
    min_hours: 1  # Minimum session duration - 0 is infinite
  # Queue dedicated to share GPU remote viz nodes. This name is fixed and can't be changed
  - name: largeviz3d
    vm_size: Standard_NV48s_v3
    max_core_count: 96
    image: azhpc:azhop-desktop:centos-7_9:latest
    ColocateNodes: false
    EnableAcceleratedNetworking: true
    spot: false
    max_hours: 12
    min_hours: 1
  # Queue dedicated to non GPU remote viz nodes. This name is fixed and can't be changed
  - name: viz
    vm_size: Standard_D8s_v5
    max_core_count: 200
    image: azhpc:azhop-desktop:centos-7_9:latest
    ColocateNodes: false
    spot: false
    EnableAcceleratedNetworking: true
    max_hours: 12
    min_hours: 1
# Remote Visualization definitions
# Set to true to enable windows remote visualization
enable_remote_winviz: false
remoteviz:
  - name: winviz  # This name is fixed and can't be changed
    # Only NVsv3 and NVsV4 are supported (e.g. Standard_NV8as_v4)
    vm_size: Standard_NV12s_v3
    max_core_count: 48
    image: "MicrosoftWindowsDesktop:Windows-10:21h1-pron:latest"
    ColocateNodes: false
    spot: false
    EnableAcceleratedNetworking: true
# Application settings: one entry per application, each toggled through its own `enabled` flag
applications:
  bc_codeserver:
    enabled: true
  bc_jupyter:
    enabled: true
  bc_amlsdk:
    enabled: false
  bc_rstudio:
    enabled: false
  bc_ansys_workbench:
    enabled: false
  bc_vmd:
    enabled: false
  bc_paraview:
    enabled: false
  bc_vizer:
    enabled: false
  cryosparc:
    enabled: false
    # Replace the placeholder with your license id
    license_id: <LICENSE-ID>
    admin_user: adminuser
    master_vm_size: Standard_D8s_v5
    master_vm_image: azhpc:azhop-compute:centos-7_9:latest
    master_hostname: cryosparc-master
    master_data_disk_size: 256
    master_data_disk_type: Premium_LRS
    # NOTE(review): nc24v3 is not among the queues defined in this file - presumably it
    # must be added to `queues` before enabling cryosparc; confirm.
    target_queues:
      - nc24v3
      - hb120v3
      - hc44rs