From d8336c885504c8efb6e872db7f73bee154ec836e Mon Sep 17 00:00:00 2001 From: Nick Stroud Date: Tue, 20 Jun 2023 22:07:01 -0700 Subject: [PATCH 001/144] Make execute command ansible script generic so it can be reused --- .../modules/scripts/ramble-execute/main.tf | 7 +++--- .../templates/ramble_execute.yml.tpl | 23 ++++++++----------- 2 files changed, 13 insertions(+), 17 deletions(-) diff --git a/community/modules/scripts/ramble-execute/main.tf b/community/modules/scripts/ramble-execute/main.tf index 3eb176a64a..b58efa2d04 100644 --- a/community/modules/scripts/ramble-execute/main.tf +++ b/community/modules/scripts/ramble-execute/main.tf @@ -20,10 +20,9 @@ locals { execute_contents = templatefile( "${path.module}/templates/ramble_execute.yml.tpl", { - spack_path = var.spack_path - ramble_path = var.ramble_path - log_file = var.log_file - commands = local.commands_content + pre_script = ". ${var.spack_path}/share/spack/setup-env.sh && . ${var.ramble_path}/share/ramble/setup-env.sh" + log_file = var.log_file + commands = local.commands_content } ) diff --git a/community/modules/scripts/ramble-execute/templates/ramble_execute.yml.tpl b/community/modules/scripts/ramble-execute/templates/ramble_execute.yml.tpl index 07790a76f8..a3cf73e097 100644 --- a/community/modules/scripts/ramble-execute/templates/ramble_execute.yml.tpl +++ b/community/modules/scripts/ramble-execute/templates/ramble_execute.yml.tpl @@ -12,38 +12,35 @@ # See the License for the specific language governing permissions and # limitations under the License. -- name: Execute Ramble Commands +- name: Execute Commands hosts: localhost vars: - spack_path: ${spack_path} - ramble_path: ${ramble_path} + pre_script: ${pre_script} log_file: ${log_file} commands: ${commands} tasks: - name: Execute command block block: - - name: Print Ramble commands to be executed + - name: Print commands to be executed ansible.builtin.debug: msg: "{{ commands.split('\n') }}" - - name: Execute ramble commands + - name: Execute commands ansible.builtin.shell: | set -eo pipefail { - . {{ spack_path }}/share/spack/setup-env.sh - . 
{{ ramble_path }}/share/ramble/setup-env.sh - - echo " === Starting ramble commands ===" + {{ pre_script }} + echo " === Starting commands ===" {{ commands }} - echo " === Finished ramble commands ===" + echo " === Finished commands ===" } | tee -a {{ log_file }} - register: ramble_output + register: output always: - name: Print commands output to stderr ansible.builtin.debug: - var: ramble_output.stderr_lines + var: output.stderr_lines - name: Print commands output to stdout ansible.builtin.debug: - var: ramble_output.stdout_lines + var: output.stdout_lines From 5d56f590dbf0a43bbdf04389d412b10e80ee1801 Mon Sep 17 00:00:00 2001 From: Nick Stroud Date: Wed, 21 Jun 2023 15:05:07 -0700 Subject: [PATCH 002/144] Add commands feature to spack-install to allow execution of arbitrary commands --- .../modules/scripts/spack-install/README.md | 20 +++++-- .../modules/scripts/spack-install/main.tf | 52 +++++++++++++++++++ .../modules/scripts/spack-install/outputs.tf | 6 +-- .../templates/execute_commands.yml.tpl | 46 ++++++++++++++++ .../scripts/spack-install/variables.tf | 51 +++++++++++++----- .../modules/scripts/spack-install/versions.tf | 8 ++- tools/duplicate-diff.py | 4 ++ 7 files changed, 167 insertions(+), 20 deletions(-) create mode 100644 community/modules/scripts/spack-install/templates/execute_commands.yml.tpl diff --git a/community/modules/scripts/spack-install/README.md b/community/modules/scripts/spack-install/README.md index 2f909bf7a3..0500a491af 100644 --- a/community/modules/scripts/spack-install/README.md +++ b/community/modules/scripts/spack-install/README.md @@ -189,36 +189,48 @@ limitations under the License. | Name | Version | |------|---------| -| [terraform](#requirement\_terraform) | >= 0.14.0 | +| [terraform](#requirement\_terraform) | >= 1.0.0 | +| [local](#requirement\_local) | >= 2.0.0 | ## Providers -No providers. +| Name | Version | +|------|---------| +| [local](#provider\_local) | >= 2.0.0 | ## Modules -No modules. +| Name | Source | Version | +|------|--------|---------| +| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.19.1 | ## Resources -No resources. +| Name | Type | +|------|------| +| [local_file.debug_file_ansible_execute](https://registry.terraform.io/providers/hashicorp/local/latest/docs/resources/file) | resource | +| [local_file.debug_file_shell_install](https://registry.terraform.io/providers/hashicorp/local/latest/docs/resources/file) | resource | ## Inputs | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| | [caches\_to\_populate](#input\_caches\_to\_populate) | Defines caches which will be populated with the installed packages.
Each cache must specify a type (either directory, or mirror).
Each cache must also specify a path. For directory caches, this path
must be on a local file system (i.e. file:///path/to/cache). For
mirror paths, this can be any valid URL that spack accepts.

NOTE: GPG Keys should be installed before trying to populate a cache
with packages.

NOTE: The gpg\_keys variable can be used to install existing GPG keys
and create new GPG keys, both of which are acceptable for populating a
cache. | `list(map(any))` | `[]` | no | +| [commands](#input\_commands) | String of commands to run within this module | `string` | `null` | no | | [compilers](#input\_compilers) | Defines compilers for spack to install before installing packages. | `list(string)` | `[]` | no | | [concretize\_flags](#input\_concretize\_flags) | Defines the flags to pass into `spack concretize` | `string` | `""` | no | | [configs](#input\_configs) | List of configuration options to set within spack.
Configs can be of type 'single-config' or 'file'.
All configs must specify content, and
a scope. | `list(map(any))` | `[]` | no | +| [deployment\_name](#input\_deployment\_name) | Name of deployment, used to name bucket containing startup script. | `string` | n/a | yes | | [environments](#input\_environments) | Defines spack environments to configure, given as a list.
Each environment must define a name.
Additional optional attributes are 'content' and 'packages'.
'content' must be a string, defining the content of the Spack Environment YAML file.
'packages' must be a list of strings, defining the spack specs to install.
If both 'content' and 'packages' are defined, 'content' is processed first. | `any` | `[]` | no | | [gpg\_keys](#input\_gpg\_keys) | GPG Keys to trust within spack.
Each key must define a type. Valid types are 'file' and 'new'.
Keys of type 'file' must define a path to the key that
should be trusted.
Keys of type 'new' must define a 'name' and 'email' to create
the key with. | `list(map(any))` | `[]` | no | | [install\_dir](#input\_install\_dir) | Directory to install spack into. | `string` | `"/sw/spack"` | no | | [install\_flags](#input\_install\_flags) | Defines the flags to pass into `spack install` | `string` | `""` | no | +| [labels](#input\_labels) | Key-value pairs of labels to be added to created resources. | `map(string)` | n/a | yes | | [licenses](#input\_licenses) | List of software licenses to install within spack. |
list(object({
source = string
dest = string
}))
| `null` | no | | [log\_file](#input\_log\_file) | Defines the logfile that script output will be written to | `string` | `"/var/log/spack.log"` | no | | [packages](#input\_packages) | Defines root packages for spack to install (in order). | `list(string)` | `[]` | no | | [project\_id](#input\_project\_id) | Project in which the HPC deployment will be created. | `string` | n/a | yes | +| [region](#input\_region) | Region to place bucket containing startup script. | `string` | n/a | yes | | [spack\_cache\_url](#input\_spack\_cache\_url) | List of buildcaches for spack. |
list(object({
mirror_name = string
mirror_url = string
}))
| `null` | no | | [spack\_ref](#input\_spack\_ref) | Git ref to checkout for spack. | `string` | `"v0.20.0"` | no | | [spack\_url](#input\_spack\_url) | URL to clone the spack repo from. | `string` | `"https://github.com/spack/spack"` | no | diff --git a/community/modules/scripts/spack-install/main.tf b/community/modules/scripts/spack-install/main.tf index b8217c05bc..76391ffe3f 100644 --- a/community/modules/scripts/spack-install/main.tf +++ b/community/modules/scripts/spack-install/main.tf @@ -14,6 +14,11 @@ * limitations under the License. */ +locals { + # This label allows for billing report tracking based on module. + labels = merge(var.labels, { ghpc_module = "spack-install" }) +} + locals { env = [ for e in var.environments : { @@ -56,3 +61,50 @@ locals { "destination" = "install_spack.sh" } } + +locals { + commands_content = var.commands == null ? "echo 'no spack commands provided'" : indent(4, yamlencode(var.commands)) + + execute_contents = templatefile( + "${path.module}/templates/execute_commands.yml.tpl", + { + pre_script = ". ${var.install_dir}/share/spack/setup-env.sh" + log_file = var.log_file + commands = local.commands_content + } + ) + + execute_md5 = substr(md5(local.execute_contents), 0, 4) + execute_runner = { + "type" = "ansible-local" + "content" = local.execute_contents + "destination" = "spack_execute_${local.execute_md5}.yml" + } + + combined_md5 = substr(md5(module.startup_script.startup_script), 0, 4) + combined_install_execute_runner = { + "type" = "shell" + "content" = module.startup_script.startup_script + "destination" = "combined_install_spack_${local.combined_md5}.sh" + } +} + +module "startup_script" { + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.19.1" + + labels = local.labels + project_id = var.project_id + deployment_name = var.deployment_name + region = var.region + runners = [local.install_spack_runner, local.execute_runner] +} + +resource "local_file" "debug_file_shell_install" { + content = local.script_content + filename = "${path.module}/debug_install.sh" +} + +resource "local_file" "debug_file_ansible_execute" { + content = local.execute_contents + filename = "${path.module}/debug_execute_${local.execute_md5}.yml" +} diff --git a/community/modules/scripts/spack-install/outputs.tf b/community/modules/scripts/spack-install/outputs.tf index 2f85c4d0ee..e02dbf227b 100644 --- a/community/modules/scripts/spack-install/outputs.tf +++ b/community/modules/scripts/spack-install/outputs.tf @@ -16,12 +16,12 @@ output "startup_script" { description = "Path to the Spack installation script." - value = local.script_content + value = module.startup_script.startup_script } output "controller_startup_script" { description = "Path to the Spack installation script, duplicate for SLURM controller." 
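+  # Both outputs now surface the fully rendered script from the
+  # startup-script module, so the Spack install and any user-supplied
+  # `commands` execute as a single unit. A consumer might wire it up as
+  # (module name hypothetical):
+  #   metadata_startup_script = module.spack.controller_startup_script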
- value = local.script_content + value = module.startup_script.startup_script } output "install_spack_deps_runner" { @@ -40,7 +40,7 @@ output "install_spack_deps_runner" { output "install_spack_runner" { description = "Runner to install Spack using the startup-script module" - value = local.install_spack_runner + value = local.combined_install_execute_runner } output "setup_spack_runner" { diff --git a/community/modules/scripts/spack-install/templates/execute_commands.yml.tpl b/community/modules/scripts/spack-install/templates/execute_commands.yml.tpl new file mode 100644 index 0000000000..a3cf73e097 --- /dev/null +++ b/community/modules/scripts/spack-install/templates/execute_commands.yml.tpl @@ -0,0 +1,46 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +- name: Execute Commands + hosts: localhost + vars: + pre_script: ${pre_script} + log_file: ${log_file} + commands: ${commands} + tasks: + - name: Execute command block + block: + - name: Print commands to be executed + ansible.builtin.debug: + msg: "{{ commands.split('\n') }}" + + - name: Execute commands + ansible.builtin.shell: | + set -eo pipefail + { + {{ pre_script }} + echo " === Starting commands ===" + {{ commands }} + echo " === Finished commands ===" + } | tee -a {{ log_file }} + register: output + + always: + - name: Print commands output to stderr + ansible.builtin.debug: + var: output.stderr_lines + + - name: Print commands output to stdout + ansible.builtin.debug: + var: output.stdout_lines diff --git a/community/modules/scripts/spack-install/variables.tf b/community/modules/scripts/spack-install/variables.tf index 8a8dbb7650..409205739f 100644 --- a/community/modules/scripts/spack-install/variables.tf +++ b/community/modules/scripts/spack-install/variables.tf @@ -24,6 +24,8 @@ variable "project_id" { type = string } +# spack-setup variables + variable "install_dir" { description = "Directory to install spack into." type = string @@ -42,6 +44,43 @@ variable "spack_ref" { default = "v0.20.0" } +variable "spack_virtualenv_path" { + description = "Virtual environment path in which to install Spack Python interpreter and other dependencies" + default = "/usr/local/spack-python" + type = string +} + +# spack-build variables + +variable "log_file" { + description = "Defines the logfile that script output will be written to" + default = "/var/log/spack.log" + type = string +} + +variable "commands" { + description = "String of commands to run within this module" + default = null + type = string +} + +variable "deployment_name" { + description = "Name of deployment, used to name bucket containing startup script." + type = string +} + +variable "region" { + description = "Region to place bucket containing startup script." + type = string +} + +variable "labels" { + description = "Key-value pairs of labels to be added to created resources." + type = map(string) +} + +# variables to be deprecated + variable "spack_cache_url" { description = "List of buildcaches for spack." 
type = list(object({ @@ -221,15 +260,3 @@ EOT error_message = "The content attribute within environments is required to be a string." } } - -variable "log_file" { - description = "Defines the logfile that script output will be written to" - default = "/var/log/spack.log" - type = string -} - -variable "spack_virtualenv_path" { - description = "Virtual environment path in which to install Spack Python interpreter and other dependencies" - default = "/usr/local/spack-python" - type = string -} diff --git a/community/modules/scripts/spack-install/versions.tf b/community/modules/scripts/spack-install/versions.tf index d70527e5cf..b708682f88 100644 --- a/community/modules/scripts/spack-install/versions.tf +++ b/community/modules/scripts/spack-install/versions.tf @@ -15,5 +15,11 @@ */ terraform { - required_version = ">= 0.14.0" + required_version = ">= 1.0.0" + required_providers { + local = { + source = "hashicorp/local" + version = ">= 2.0.0" + } + } } diff --git a/tools/duplicate-diff.py b/tools/duplicate-diff.py index a0730fcaef..95cb688f2d 100644 --- a/tools/duplicate-diff.py +++ b/tools/duplicate-diff.py @@ -45,6 +45,10 @@ "community/modules/compute/gke-node-pool/threads_per_core_calc.tf", "modules/compute/vm-instance/threads_per_core_calc.tf" ], + [ + "community/modules/scripts/ramble-execute/templates/ramble_execute.yml.tpl", + "community/modules/scripts/spack-install/templates/execute_commands.yml.tpl", + ], ] for group in duplicates: From a591fca58b1fd648c9a4131279b89548b256030b Mon Sep 17 00:00:00 2001 From: Nick Stroud Date: Wed, 21 Jun 2023 16:31:15 -0700 Subject: [PATCH 003/144] Deprecate dedicated variable to set up spack cache in favor of commands --- .../modules/scripts/spack-install/README.md | 2 +- .../spack-install/templates/install_spack.tpl | 20 ------------ .../scripts/spack-install/variables.tf | 32 +++++++++---------- .../test_configs/spack-buildcache.yaml | 8 +++-- .../test_configs/spack-environments.yaml | 8 +++-- 5 files changed, 26 insertions(+), 44 deletions(-) diff --git a/community/modules/scripts/spack-install/README.md b/community/modules/scripts/spack-install/README.md index 0500a491af..05189b53ea 100644 --- a/community/modules/scripts/spack-install/README.md +++ b/community/modules/scripts/spack-install/README.md @@ -215,7 +215,7 @@ limitations under the License. | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| -| [caches\_to\_populate](#input\_caches\_to\_populate) | Defines caches which will be populated with the installed packages.
Each cache must specify a type (either directory, or mirror).
Each cache must also specify a path. For directory caches, this path
must be on a local file system (i.e. file:///path/to/cache). For
mirror paths, this can be any valid URL that spack accepts.

NOTE: GPG Keys should be installed before trying to populate a cache
with packages.

NOTE: The gpg\_keys variable can be used to install existing GPG keys
and create new GPG keys, both of which are acceptable for populating a
cache. | `list(map(any))` | `[]` | no | +| [caches\_to\_populate](#input\_caches\_to\_populate) | DEPRECATED

The following `commands` can be used to populate a cache:
MIRROR_URL=gs://my-bucket
spack buildcache create --mirror-url $MIRROR_URL -af \$(spack find --format /{hash});
spack gpg publish --mirror-url $MIRROR_URL;
spack buildcache update-index --mirror-url $MIRROR_URL --keys;
Defines caches which will be populated with the installed packages.

NOTE: GPG Keys should be installed before trying to populate a cache
with packages.

NOTE: The gpg\_keys variable can be used to install existing GPG keys
and create new GPG keys, both of which are acceptable for populating a
cache. | `list(map(any))` | `null` | no | | [commands](#input\_commands) | String of commands to run within this module | `string` | `null` | no | | [compilers](#input\_compilers) | Defines compilers for spack to install before installing packages. | `list(string)` | `[]` | no | | [concretize\_flags](#input\_concretize\_flags) | Defines the flags to pass into `spack concretize` | `string` | `""` | no | diff --git a/community/modules/scripts/spack-install/templates/install_spack.tpl b/community/modules/scripts/spack-install/templates/install_spack.tpl index c3f418f372..03b8ffba94 100755 --- a/community/modules/scripts/spack-install/templates/install_spack.tpl +++ b/community/modules/scripts/spack-install/templates/install_spack.tpl @@ -144,24 +144,4 @@ fi %{endfor ~} %{endif ~} -echo "$PREFIX Populating defined buildcaches" -%{for c in CACHES_TO_POPULATE ~} - %{if c.type == "directory" ~} - # shellcheck disable=SC2046 - { - spack buildcache create -d ${c.path} -af $(spack find --format /{hash}); - spack gpg publish -d ${c.path}; - spack buildcache update-index -d ${c.path} --keys; - } >> ${LOG_FILE} - %{endif ~} - %{if c.type == "mirror" ~} - # shellcheck disable=SC2046 - { - spack buildcache create --mirror-url ${c.path} -af $(spack find --format /{hash}); - spack gpg publish --mirror-url ${c.path}; - spack buildcache update-index --mirror-url ${c.path} --keys; - } >> ${LOG_FILE} - %{endif ~} -%{endfor ~} - echo "$PREFIX Setup complete..." diff --git a/community/modules/scripts/spack-install/variables.tf b/community/modules/scripts/spack-install/variables.tf index 409205739f..169b8c426b 100644 --- a/community/modules/scripts/spack-install/variables.tf +++ b/community/modules/scripts/spack-install/variables.tf @@ -196,12 +196,19 @@ EOT } variable "caches_to_populate" { - description = < + spack buildcache create --mirror-url $MIRROR_URL -af \$(spack find --format /{hash}); + spack gpg publish --mirror-url $MIRROR_URL; + spack buildcache update-index --mirror-url $MIRROR_URL --keys; - id: spack-startup source: modules/scripts/startup-script diff --git a/tools/validate_configs/test_configs/spack-environments.yaml b/tools/validate_configs/test_configs/spack-environments.yaml index b04fe0562d..028180b8d9 100644 --- a/tools/validate_configs/test_configs/spack-environments.yaml +++ b/tools/validate_configs/test_configs/spack-environments.yaml @@ -75,9 +75,11 @@ deployment_groups: gpg_keys: - type: 'file' path: '/tmp/spack_key.gpg' - caches_to_populate: - - type: 'mirror' - path: ## Add GCS bucket to populate here ## + commands: | + MIRROR_URL=gs:// + spack buildcache create --mirror-url $MIRROR_URL -af \$(spack find --format /{hash}); + spack gpg publish --mirror-url $MIRROR_URL; + spack buildcache update-index --mirror-url $MIRROR_URL --keys; - id: spack-startup source: modules/scripts/startup-script From bcf9966d85def6d367b21c89d0517f4b2887d831 Mon Sep 17 00:00:00 2001 From: Nick Stroud Date: Fri, 23 Jun 2023 00:06:49 -0700 Subject: [PATCH 004/144] Add functionality for Spack module to take data runners --- .../modules/scripts/spack-install/README.md | 1 + community/modules/scripts/spack-install/main.tf | 6 +++++- .../modules/scripts/spack-install/variables.tf | 16 +++++++++++++++- 3 files changed, 21 insertions(+), 2 deletions(-) diff --git a/community/modules/scripts/spack-install/README.md b/community/modules/scripts/spack-install/README.md index 05189b53ea..1a39a2a51d 100644 --- a/community/modules/scripts/spack-install/README.md +++ b/community/modules/scripts/spack-install/README.md @@ 
-220,6 +220,7 @@ limitations under the License. | [compilers](#input\_compilers) | Defines compilers for spack to install before installing packages. | `list(string)` | `[]` | no | | [concretize\_flags](#input\_concretize\_flags) | Defines the flags to pass into `spack concretize` | `string` | `""` | no | | [configs](#input\_configs) | List of configuration options to set within spack.
Configs can be of type 'single-config' or 'file'.
All configs must specify content, and
a scope. | `list(map(any))` | `[]` | no | +| [data\_files](#input\_data\_files) | A list of files to be transferred prior to running commands.
It must specify one of 'source' (absolute local file path) or 'content' (string).
It must specify a 'destination' with absolute path where file should be placed. | `list(map(string))` | `[]` | no | | [deployment\_name](#input\_deployment\_name) | Name of deployment, used to name bucket containing startup script. | `string` | n/a | yes | | [environments](#input\_environments) | Defines spack environments to configure, given as a list.
Each environment must define a name.
Additional optional attributes are 'content' and 'packages'.
'content' must be a string, defining the content of the Spack Environment YAML file.
'packages' must be a list of strings, defining the spack specs to install.
If both 'content' and 'packages' are defined, 'content' is processed first. | `any` | `[]` | no | | [gpg\_keys](#input\_gpg\_keys) | GPG Keys to trust within spack.
Each key must define a type. Valid types are 'file' and 'new'.
Keys of type 'file' must define a path to the key that
should be trusted.
Keys of type 'new' must define a 'name' and 'email' to create
the key with. | `list(map(any))` | `[]` | no | diff --git a/community/modules/scripts/spack-install/main.tf b/community/modules/scripts/spack-install/main.tf index 76391ffe3f..99a7717d61 100644 --- a/community/modules/scripts/spack-install/main.tf +++ b/community/modules/scripts/spack-install/main.tf @@ -74,6 +74,8 @@ locals { } ) + data_runners = [for data_file in var.data_files : merge(data_file, { type = "data" })] + execute_md5 = substr(md5(local.execute_contents), 0, 4) execute_runner = { "type" = "ansible-local" @@ -81,6 +83,8 @@ locals { "destination" = "spack_execute_${local.execute_md5}.yml" } + runners = concat([local.install_spack_runner], local.data_runners, [local.execute_runner]) + combined_md5 = substr(md5(module.startup_script.startup_script), 0, 4) combined_install_execute_runner = { "type" = "shell" @@ -96,7 +100,7 @@ module "startup_script" { project_id = var.project_id deployment_name = var.deployment_name region = var.region - runners = [local.install_spack_runner, local.execute_runner] + runners = local.runners } resource "local_file" "debug_file_shell_install" { diff --git a/community/modules/scripts/spack-install/variables.tf b/community/modules/scripts/spack-install/variables.tf index 169b8c426b..b0eee347e3 100644 --- a/community/modules/scripts/spack-install/variables.tf +++ b/community/modules/scripts/spack-install/variables.tf @@ -58,10 +58,24 @@ variable "log_file" { type = string } +variable "data_files" { + description = <<-EOT + A list of files to be transferred prior to running commands. + It must specify one of 'source' (absolute local file path) or 'content' (string). + It must specify a 'destination' with absolute path where file should be placed. + EOT + type = list(map(string)) + default = [] + validation { + condition = alltrue([for r in var.data_files : substr(r["destination"], 0, 1) == "/"]) + error_message = "All destinations must be absolute paths and start with '/'." + } +} + variable "commands" { description = "String of commands to run within this module" - default = null type = string + default = null } variable "deployment_name" { From b748d3f6dbe0b0ffa66d7b6269eb29de965356ee Mon Sep 17 00:00:00 2001 From: Nick Stroud Date: Fri, 23 Jun 2023 13:53:49 -0700 Subject: [PATCH 005/144] Address feedback in #1496: Add validation for XOR property of content and source --- .../modules/scripts/spack-install/variables.tf | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/community/modules/scripts/spack-install/variables.tf b/community/modules/scripts/spack-install/variables.tf index b0eee347e3..1879d24c1d 100644 --- a/community/modules/scripts/spack-install/variables.tf +++ b/community/modules/scripts/spack-install/variables.tf @@ -70,6 +70,20 @@ variable "data_files" { condition = alltrue([for r in var.data_files : substr(r["destination"], 0, 1) == "/"]) error_message = "All destinations must be absolute paths and start with '/'." } + validation { + condition = alltrue([ + for r in var.data_files : + can(r["content"]) != can(r["source"]) + ]) + error_message = "A runner must specify either 'content' or 'source', but never both." + } + validation { + condition = alltrue([ + for r in var.data_files : + lookup(r, "content", lookup(r, "source", null)) != null + ]) + error_message = "A runner must specify a non-null 'content' or 'source'." 
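+    # Illustrative entries (values hypothetical): the first passes both
+    # validations; the second fails the XOR check above because it sets
+    # both keys at once.
+    #   { destination = "/sw/env.yaml", content = "spack: ..." }
+    #   { destination = "/sw/env.yaml", content = "...", source = "/tmp/env.yaml" }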
+ } } variable "commands" { From 610387a8f81d3a76a6e7a304f6bd7c0499e9c637 Mon Sep 17 00:00:00 2001 From: Nick Stroud Date: Fri, 23 Jun 2023 13:46:23 -0700 Subject: [PATCH 006/144] Make spack output runner destination deterministic to fix known at apply error --- community/modules/scripts/spack-install/main.tf | 3 ++- community/modules/scripts/spack-install/variables.tf | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/community/modules/scripts/spack-install/main.tf b/community/modules/scripts/spack-install/main.tf index 99a7717d61..c3415b3c8e 100644 --- a/community/modules/scripts/spack-install/main.tf +++ b/community/modules/scripts/spack-install/main.tf @@ -85,7 +85,8 @@ locals { runners = concat([local.install_spack_runner], local.data_runners, [local.execute_runner]) - combined_md5 = substr(md5(module.startup_script.startup_script), 0, 4) + combined_unique_string = join("\n", [for runner in local.runners : try(runner["content"], runner["source"])]) + combined_md5 = substr(md5(local.combined_unique_string), 0, 4) combined_install_execute_runner = { "type" = "shell" "content" = module.startup_script.startup_script diff --git a/community/modules/scripts/spack-install/variables.tf b/community/modules/scripts/spack-install/variables.tf index 1879d24c1d..706ab65dbc 100644 --- a/community/modules/scripts/spack-install/variables.tf +++ b/community/modules/scripts/spack-install/variables.tf @@ -75,14 +75,14 @@ variable "data_files" { for r in var.data_files : can(r["content"]) != can(r["source"]) ]) - error_message = "A runner must specify either 'content' or 'source', but never both." + error_message = "A data_file must specify either 'content' or 'source', but never both." } validation { condition = alltrue([ for r in var.data_files : lookup(r, "content", lookup(r, "source", null)) != null ]) - error_message = "A runner must specify a non-null 'content' or 'source'." + error_message = "A data_file must specify a non-null 'content' or 'source'." } } From 6b61ea972b01d07ab691757a87c6052ddc4686e6 Mon Sep 17 00:00:00 2001 From: Nick Stroud Date: Fri, 23 Jun 2023 01:14:57 -0700 Subject: [PATCH 007/144] DEPRECATE spack-install.environments in favor of using data_files and commands --- community/examples/AMD/hpc-amd-slurm.yaml | 11 +++-- .../modules/scripts/spack-install/README.md | 2 +- .../modules/scripts/spack-install/main.tf | 8 ---- .../spack-install/templates/install_spack.tpl | 38 --------------- .../scripts/spack-install/variables.tf | 47 ++++++++----------- docs/tutorials/gromacs/spack-gromacs.yaml | 11 +++-- docs/tutorials/openfoam/spack-openfoam.yaml | 10 +++- docs/tutorials/wrfv3/spack-wrfv3.yaml | 11 +++-- .../hcls-blueprint.yaml | 11 +++-- examples/serverless-batch-mpi.yaml | 11 +++-- .../test_configs/spack-environments.yaml | 34 ++++++++++---- 11 files changed, 93 insertions(+), 101 deletions(-) diff --git a/community/examples/AMD/hpc-amd-slurm.yaml b/community/examples/AMD/hpc-amd-slurm.yaml index f48121c50d..3acaabd1fd 100644 --- a/community/examples/AMD/hpc-amd-slurm.yaml +++ b/community/examples/AMD/hpc-amd-slurm.yaml @@ -83,9 +83,8 @@ deployment_groups: # gcc 12.1.0 is known to have runtime failures with OpenFOAM 8 # gcc 10.3.0 is the earliest copy of gcc with Zen 3 support - gcc@10.3.0 %gcc@4.8.5 target=x86_64 - environments: - - name: openfoam - type: file + data_files: + - destination: /sw/spack/openfoam_env.yaml content: | spack: definitions: @@ -110,6 +109,12 @@ deployment_groups: - - $^mpis concretizer: unify: when_possible + commands: | + if ! 
spack env list | grep -q openfoam; then + spack env create openfoam /sw/spack/openfoam_env.yaml + spack env activate openfoam + spack concretize + fi - id: spack-startup source: modules/scripts/startup-script diff --git a/community/modules/scripts/spack-install/README.md b/community/modules/scripts/spack-install/README.md index 1a39a2a51d..7e1b1203b0 100644 --- a/community/modules/scripts/spack-install/README.md +++ b/community/modules/scripts/spack-install/README.md @@ -222,7 +222,7 @@ limitations under the License. | [configs](#input\_configs) | List of configuration options to set within spack.
Configs can be of type 'single-config' or 'file'.
All configs must specify content, and
a scope. | `list(map(any))` | `[]` | no | | [data\_files](#input\_data\_files) | A list of files to be transferred prior to running commands.
It must specify one of 'source' (absolute local file path) or 'content' (string).
It must specify a 'destination' with absolute path where file should be placed. | `list(map(string))` | `[]` | no | | [deployment\_name](#input\_deployment\_name) | Name of deployment, used to name bucket containing startup script. | `string` | n/a | yes | -| [environments](#input\_environments) | Defines spack environments to configure, given as a list.
Each environment must define a name.
Additional optional attributes are 'content' and 'packages'.
'content' must be a string, defining the content of the Spack Environment YAML file.
'packages' must be a list of strings, defining the spack specs to install.
If both 'content' and 'packages' are defined, 'content' is processed first. | `any` | `[]` | no | +| [environments](#input\_environments) | DEPRECATED

The following `commands` can be used to configure an environment:
if ! spack env list | grep -q my-env; then
spack env create my-env
fi
spack env activate my-env
spack add intel-mpi@2018.4.274 %gcc@10.3.0
spack concretize
Defines spack environments to configure.
For more information, see: https://spack.readthedocs.io/en/latest/environments.html. | `any` | `null` | no | | [gpg\_keys](#input\_gpg\_keys) | GPG Keys to trust within spack.
Each key must define a type. Valid types are 'file' and 'new'.
Keys of type 'file' must define a path to the key that
should be trusted.
Keys of type 'new' must define a 'name' and 'email' to create
the key with. | `list(map(any))` | `[]` | no | | [install\_dir](#input\_install\_dir) | Directory to install spack into. | `string` | `"/sw/spack"` | no | | [install\_flags](#input\_install\_flags) | Defines the flags to pass into `spack install` | `string` | `""` | no | diff --git a/community/modules/scripts/spack-install/main.tf b/community/modules/scripts/spack-install/main.tf index c3415b3c8e..3a0358eb88 100644 --- a/community/modules/scripts/spack-install/main.tf +++ b/community/modules/scripts/spack-install/main.tf @@ -20,13 +20,6 @@ locals { } locals { - env = [ - for e in var.environments : { - name = e.name - packages = contains(keys(e), "packages") ? e.packages : null - content = contains(keys(e), "content") ? e.content : null - } - ] script_content = templatefile( "${path.module}/templates/install_spack.tpl", { @@ -41,7 +34,6 @@ locals { PACKAGES = var.packages == null ? [] : var.packages INSTALL_FLAGS = var.install_flags == null ? "" : var.install_flags CONCRETIZE_FLAGS = var.concretize_flags == null ? "" : var.concretize_flags - ENVIRONMENTS = local.env MIRRORS = var.spack_cache_url == null ? [] : var.spack_cache_url GPG_KEYS = var.gpg_keys == null ? [] : var.gpg_keys CACHES_TO_POPULATE = var.caches_to_populate == null ? [] : var.caches_to_populate diff --git a/community/modules/scripts/spack-install/templates/install_spack.tpl b/community/modules/scripts/spack-install/templates/install_spack.tpl index 03b8ffba94..a8096033b7 100755 --- a/community/modules/scripts/spack-install/templates/install_spack.tpl +++ b/community/modules/scripts/spack-install/templates/install_spack.tpl @@ -106,42 +106,4 @@ echo "$PREFIX Installing root spack specs..." spack clean -s %{endfor ~} -echo "$PREFIX Configuring spack environments" -%{if ENVIRONMENTS != null ~} -%{for e in ENVIRONMENTS ~} -if [ ! -d ${INSTALL_DIR}/var/spack/environments/${e.name} ]; then - %{if e.content != null} - { - cat << 'EOF' > ${INSTALL_DIR}/spack_env.yaml -${e.content} -EOF - spack env create ${e.name} ${INSTALL_DIR}/spack_env.yaml - rm -f ${INSTALL_DIR}/spack_env.yaml - } &>> ${LOG_FILE} - %{else ~} - spack env create ${e.name} >> ${LOG_FILE} 2>&1 - %{endif ~} - - spack env activate ${e.name} >> ${LOG_FILE} 2>&1 - - %{if e.packages != null} - echo "$PREFIX Configuring spack environment ${e.name}" - %{for p in e.packages ~} - spack add ${p} >> ${LOG_FILE} 2>&1 - %{endfor ~} - %{endif ~} - - echo "$PREFIX Concretizing spack environment ${e.name}" - spack concretize ${CONCRETIZE_FLAGS} >> ${LOG_FILE} 2>&1 - - echo "$PREFIX Installing packages for spack environment ${e.name}" - # shellcheck disable=SC2129 - spack install ${INSTALL_FLAGS} >> ${LOG_FILE} 2>&1 - - spack env deactivate >> ${LOG_FILE} 2>&1 - spack clean -s >> ${LOG_FILE} 2>&1 -fi -%{endfor ~} -%{endif ~} - echo "$PREFIX Setup complete..." diff --git a/community/modules/scripts/spack-install/variables.tf b/community/modules/scripts/spack-install/variables.tf index 706ab65dbc..32d327ebc3 100644 --- a/community/modules/scripts/spack-install/variables.tf +++ b/community/modules/scripts/spack-install/variables.tf @@ -249,40 +249,33 @@ EOT type = list(map(any)) validation { condition = var.caches_to_populate == null - error_message = "caches_to_populate is deprecated. Use commands instead. See variable documentation for proposed alterantive commands." + error_message = "caches_to_populate is deprecated. Use commands instead. See variable documentation for proposed alternative commands." 
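+    # A migrated blueprint drops this variable and passes the equivalent
+    # buildcache steps through `commands`, e.g. (bucket URL illustrative):
+    #   commands = "spack buildcache create --mirror-url gs://my-cache -af $(spack find --format /{hash})"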
} } variable "environments" { - description = < spack buildcache create --mirror-url $MIRROR_URL -af \$(spack find --format /{hash}); spack gpg publish --mirror-url $MIRROR_URL; From 2fc326fdca34999c69668802a2bda9db009e893c Mon Sep 17 00:00:00 2001 From: Nick Stroud Date: Tue, 27 Jun 2023 15:26:47 -0700 Subject: [PATCH 008/144] Switch Spack commands pre_script to also set SPACK_PYTHON var --- community/modules/scripts/spack-install/main.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/community/modules/scripts/spack-install/main.tf b/community/modules/scripts/spack-install/main.tf index 3a0358eb88..61b7f5d0e5 100644 --- a/community/modules/scripts/spack-install/main.tf +++ b/community/modules/scripts/spack-install/main.tf @@ -60,7 +60,7 @@ locals { execute_contents = templatefile( "${path.module}/templates/execute_commands.yml.tpl", { - pre_script = ". ${var.install_dir}/share/spack/setup-env.sh" + pre_script = ". /etc/profile.d/spack.sh" log_file = var.log_file commands = local.commands_content } From 7a1efe7d0e543a0d47c6bd617aa434a25d369daa Mon Sep 17 00:00:00 2001 From: Nick Stroud Date: Tue, 27 Jun 2023 22:32:25 -0700 Subject: [PATCH 009/144] Fix: Add 'spack install' at end of environment setup --- community/examples/AMD/hpc-amd-slurm.yaml | 1 + community/modules/scripts/spack-install/README.md | 2 +- community/modules/scripts/spack-install/variables.tf | 1 + docs/tutorials/gromacs/spack-gromacs.yaml | 1 + docs/tutorials/openfoam/spack-openfoam.yaml | 1 + docs/tutorials/wrfv3/spack-wrfv3.yaml | 1 + docs/videos/healthcare-and-life-sciences/hcls-blueprint.yaml | 1 + examples/serverless-batch-mpi.yaml | 1 + tools/validate_configs/test_configs/spack-environments.yaml | 3 +++ 9 files changed, 11 insertions(+), 1 deletion(-) diff --git a/community/examples/AMD/hpc-amd-slurm.yaml b/community/examples/AMD/hpc-amd-slurm.yaml index 3acaabd1fd..064df8353c 100644 --- a/community/examples/AMD/hpc-amd-slurm.yaml +++ b/community/examples/AMD/hpc-amd-slurm.yaml @@ -114,6 +114,7 @@ deployment_groups: spack env create openfoam /sw/spack/openfoam_env.yaml spack env activate openfoam spack concretize + spack install fi - id: spack-startup diff --git a/community/modules/scripts/spack-install/README.md b/community/modules/scripts/spack-install/README.md index 7e1b1203b0..a198075466 100644 --- a/community/modules/scripts/spack-install/README.md +++ b/community/modules/scripts/spack-install/README.md @@ -222,7 +222,7 @@ limitations under the License. | [configs](#input\_configs) | List of configuration options to set within spack.
Configs can be of type 'single-config' or 'file'.
All configs must specify content, and
a scope. | `list(map(any))` | `[]` | no | | [data\_files](#input\_data\_files) | A list of files to be transferred prior to running commands.
It must specify one of 'source' (absolute local file path) or 'content' (string).
It must specify a 'destination' with absolute path where file should be placed. | `list(map(string))` | `[]` | no | | [deployment\_name](#input\_deployment\_name) | Name of deployment, used to name bucket containing startup script. | `string` | n/a | yes | -| [environments](#input\_environments) | DEPRECATED

The following `commands` can be used to configure an environment:
if ! spack env list | grep -q my-env; then
spack env create my-env
fi
spack env activate my-env
spack add intel-mpi@2018.4.274 %gcc@10.3.0
spack concretize
Defines spack environments to configure.
For more information, see: https://spack.readthedocs.io/en/latest/environments.html. | `any` | `null` | no | +| [environments](#input\_environments) | DEPRECATED

The following `commands` can be used to configure an environment:
if ! spack env list | grep -q my-env; then
spack env create my-env
fi
spack env activate my-env
spack add intel-mpi@2018.4.274 %gcc@10.3.0
spack concretize
spack install
Defines spack environments to configure.
For more information, see: https://spack.readthedocs.io/en/latest/environments.html. | `any` | `null` | no | | [gpg\_keys](#input\_gpg\_keys) | GPG Keys to trust within spack.
Each key must define a type. Valid types are 'file' and 'new'.
Keys of type 'file' must define a path to the key that
should be trusted.
Keys of type 'new' must define a 'name' and 'email' to create
the key with. | `list(map(any))` | `[]` | no | | [install\_dir](#input\_install\_dir) | Directory to install spack into. | `string` | `"/sw/spack"` | no | | [install\_flags](#input\_install\_flags) | Defines the flags to pass into `spack install` | `string` | `""` | no | diff --git a/community/modules/scripts/spack-install/variables.tf b/community/modules/scripts/spack-install/variables.tf index 32d327ebc3..d60f4751b6 100644 --- a/community/modules/scripts/spack-install/variables.tf +++ b/community/modules/scripts/spack-install/variables.tf @@ -266,6 +266,7 @@ variable "environments" { spack env activate my-env spack add intel-mpi@2018.4.274 %gcc@10.3.0 spack concretize + spack install ``` Defines spack environments to configure. diff --git a/docs/tutorials/gromacs/spack-gromacs.yaml b/docs/tutorials/gromacs/spack-gromacs.yaml index 936dade9ef..f8d4d9801e 100644 --- a/docs/tutorials/gromacs/spack-gromacs.yaml +++ b/docs/tutorials/gromacs/spack-gromacs.yaml @@ -91,6 +91,7 @@ deployment_groups: spack env create gromacs /share/spack/gromacs_env.yaml spack env activate gromacs spack concretize + spack install fi # Un-comment and update mirror_url to install from spack cache # spack_cache_url: diff --git a/docs/tutorials/openfoam/spack-openfoam.yaml b/docs/tutorials/openfoam/spack-openfoam.yaml index 01658f61d6..fc5c741c6b 100644 --- a/docs/tutorials/openfoam/spack-openfoam.yaml +++ b/docs/tutorials/openfoam/spack-openfoam.yaml @@ -99,6 +99,7 @@ deployment_groups: spack env create openfoam /share/spack/openfoam_env.yaml spack env activate openfoam spack concretize + spack install fi # Un-comment and update mirror_url to install from spack cache # spack_cache_url: diff --git a/docs/tutorials/wrfv3/spack-wrfv3.yaml b/docs/tutorials/wrfv3/spack-wrfv3.yaml index 7b2716d8e0..4e192ab96f 100644 --- a/docs/tutorials/wrfv3/spack-wrfv3.yaml +++ b/docs/tutorials/wrfv3/spack-wrfv3.yaml @@ -91,6 +91,7 @@ deployment_groups: spack env create wrfv3 /share/spack/wrfv3_env.yaml spack env activate wrfv3 spack concretize + spack install fi # Un-comment and update mirror_url to install from spack cache # spack_cache_url: diff --git a/docs/videos/healthcare-and-life-sciences/hcls-blueprint.yaml b/docs/videos/healthcare-and-life-sciences/hcls-blueprint.yaml index f391331641..4102b211d0 100644 --- a/docs/videos/healthcare-and-life-sciences/hcls-blueprint.yaml +++ b/docs/videos/healthcare-and-life-sciences/hcls-blueprint.yaml @@ -180,6 +180,7 @@ deployment_groups: spack env create gromacs /share/spack/gromacs_env.yaml spack env activate gromacs spack concretize + spack install fi - id: spack-builder-startup diff --git a/examples/serverless-batch-mpi.yaml b/examples/serverless-batch-mpi.yaml index bdc5f9a378..52e2772b34 100644 --- a/examples/serverless-batch-mpi.yaml +++ b/examples/serverless-batch-mpi.yaml @@ -84,6 +84,7 @@ deployment_groups: spack env create wrfv3 /share/spack/wrfv3_env.yaml spack env activate wrfv3 spack concretize + spack install fi # Un-comment and update mirror_url to install from spack cache # spack_cache_url: diff --git a/tools/validate_configs/test_configs/spack-environments.yaml b/tools/validate_configs/test_configs/spack-environments.yaml index ad9b900730..06f505bd7a 100644 --- a/tools/validate_configs/test_configs/spack-environments.yaml +++ b/tools/validate_configs/test_configs/spack-environments.yaml @@ -73,6 +73,7 @@ deployment_groups: spack env create test-file /apps/spack/env_file.yaml spack env activate test-file spack concretize + spack install fi if ! 
spack env list | grep -q test-package; then @@ -82,6 +83,7 @@ deployment_groups: spack add intel-mpi@2018.4.274 %gcc@10.3.0 spack add gromacs@2021.2 %gcc@10.3.0 ^intel-mpi@2018.4.274 spack concretize + spack install if ! spack env list | grep -q test-mixed; then spack env create test-mixed /apps/spack/mixed_env_file.yaml @@ -89,6 +91,7 @@ deployment_groups: spack env activate test-mixed spack add gromacs@2021.2 %gcc@10.3.0 ^intel-mpi@2018.4.274 spack concretize + spack install MIRROR_URL=gs:// spack buildcache create --mirror-url $MIRROR_URL -af \$(spack find --format /{hash}); From d7d508f26733b79bcc511c5bc96934ad2a90e30e Mon Sep 17 00:00:00 2001 From: Nick Stroud Date: Fri, 23 Jun 2023 11:59:59 -0700 Subject: [PATCH 010/144] DEPRECATE spack-install.packages in favor of using commands --- community/examples/hpc-slurm-gromacs.yaml | 6 +++--- .../modules/scripts/spack-install/README.md | 2 +- .../modules/scripts/spack-install/main.tf | 1 - .../spack-install/templates/install_spack.tpl | 6 ------ .../modules/scripts/spack-install/variables.tf | 18 ++++++++++++++++-- .../test_configs/centos8-ss.yaml | 3 +-- .../test_configs/debian-ss.yaml | 3 +-- .../test_configs/hpc-centos-ss.yaml | 4 ++-- .../test_configs/rocky-ss.yaml | 3 +-- .../test_configs/spack-buildcache.yaml | 6 +++--- .../test_configs/ubuntu-ss.yaml | 12 +++++------- 11 files changed, 33 insertions(+), 31 deletions(-) diff --git a/community/examples/hpc-slurm-gromacs.yaml b/community/examples/hpc-slurm-gromacs.yaml index 919af2497c..aad6b77d45 100644 --- a/community/examples/hpc-slurm-gromacs.yaml +++ b/community/examples/hpc-slurm-gromacs.yaml @@ -68,9 +68,9 @@ deployment_groups: all: '{name}/{version}-{compiler.name}-{compiler.version}' compilers: - gcc@10.3.0 target=x86_64 - packages: - - intel-mpi@2018.4.274%gcc@10.3.0 - - gromacs@2023.1 %gcc@10.3.0 ^intel-mpi@2018.4.274 ^cmake@3.26.3 %gcc@4.8.5 + commands: | + spack install intel-mpi@2018.4.274%gcc@10.3.0 + spack install gromacs@2023.1 %gcc@10.3.0 ^intel-mpi@2018.4.274 ^cmake@3.26.3 %gcc@4.8.5 # Uncomment and update the name and path to add a shared or personal Spack # cache location to speed up future deployments. # spack_cache_url: diff --git a/community/modules/scripts/spack-install/README.md b/community/modules/scripts/spack-install/README.md index a198075466..a07fff5e2c 100644 --- a/community/modules/scripts/spack-install/README.md +++ b/community/modules/scripts/spack-install/README.md @@ -229,7 +229,7 @@ limitations under the License. | [labels](#input\_labels) | Key-value pairs of labels to be added to created resources. | `map(string)` | n/a | yes | | [licenses](#input\_licenses) | List of software licenses to install within spack. |
list(object({
source = string
dest = string
}))
| `null` | no | | [log\_file](#input\_log\_file) | Defines the logfile that script output will be written to | `string` | `"/var/log/spack.log"` | no | -| [packages](#input\_packages) | Defines root packages for spack to install (in order). | `list(string)` | `[]` | no | +| [packages](#input\_packages) | DEPRECATED

The following `commands` can be used to install a package:
spack install intel-mpi@2018.4.274 %gcc@10.3.0
Defines root packages for spack to install. | `list(string)` | `null` | no | | [project\_id](#input\_project\_id) | Project in which the HPC deployment will be created. | `string` | n/a | yes | | [region](#input\_region) | Region to place bucket containing startup script. | `string` | n/a | yes | | [spack\_cache\_url](#input\_spack\_cache\_url) | List of buildcaches for spack. |
list(object({
mirror_name = string
mirror_url = string
}))
| `null` | no | diff --git a/community/modules/scripts/spack-install/main.tf b/community/modules/scripts/spack-install/main.tf index 61b7f5d0e5..309b24cf60 100644 --- a/community/modules/scripts/spack-install/main.tf +++ b/community/modules/scripts/spack-install/main.tf @@ -31,7 +31,6 @@ locals { COMPILERS = var.compilers == null ? [] : var.compilers CONFIGS = var.configs == null ? [] : var.configs LICENSES = var.licenses == null ? [] : var.licenses - PACKAGES = var.packages == null ? [] : var.packages INSTALL_FLAGS = var.install_flags == null ? "" : var.install_flags CONCRETIZE_FLAGS = var.concretize_flags == null ? "" : var.concretize_flags MIRRORS = var.spack_cache_url == null ? [] : var.spack_cache_url diff --git a/community/modules/scripts/spack-install/templates/install_spack.tpl b/community/modules/scripts/spack-install/templates/install_spack.tpl index a8096033b7..83d92cf5c1 100755 --- a/community/modules/scripts/spack-install/templates/install_spack.tpl +++ b/community/modules/scripts/spack-install/templates/install_spack.tpl @@ -100,10 +100,4 @@ echo "$PREFIX Installing compilers..." spack compiler find --scope site >> ${LOG_FILE} 2>&1 -echo "$PREFIX Installing root spack specs..." -%{for p in PACKAGES ~} - spack install ${INSTALL_FLAGS} ${p} >> ${LOG_FILE} 2>&1 - spack clean -s -%{endfor ~} - echo "$PREFIX Setup complete..." diff --git a/community/modules/scripts/spack-install/variables.tf b/community/modules/scripts/spack-install/variables.tf index d60f4751b6..af59572595 100644 --- a/community/modules/scripts/spack-install/variables.tf +++ b/community/modules/scripts/spack-install/variables.tf @@ -169,9 +169,23 @@ variable "licenses" { } variable "packages" { - description = "Defines root packages for spack to install (in order)." - default = [] + description = <<-EOT + DEPRECATED + + The following `commands` can be used to install a package: + + ``` + spack install intel-mpi@2018.4.274 %gcc@10.3.0 + ``` + + Defines root packages for spack to install. + EOT type = list(string) + default = null + validation { + condition = var.packages == null + error_message = "packages is deprecated. Use commands instead. See variable documentation for proposed alternative commands." 
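+    # e.g. packages = ["cmake%gcc@10.3.0"] maps one-to-one onto
+    #   commands = "spack install cmake%gcc@10.3.0"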
+ } } variable "install_flags" { diff --git a/tools/validate_configs/test_configs/centos8-ss.yaml b/tools/validate_configs/test_configs/centos8-ss.yaml index 8b6feec717..bc9b17a1aa 100644 --- a/tools/validate_configs/test_configs/centos8-ss.yaml +++ b/tools/validate_configs/test_configs/centos8-ss.yaml @@ -47,8 +47,7 @@ deployment_groups: install_dir: /apps/spack compilers: - gcc@10.3.0 target=x86_64 - packages: - - cmake%gcc@10.3.0 target=x86_64 + commands: spack install cmake%gcc@10.3.0 target=x86_64 - id: startup source: ./modules/scripts/startup-script diff --git a/tools/validate_configs/test_configs/debian-ss.yaml b/tools/validate_configs/test_configs/debian-ss.yaml index fba3bfe406..30aaf7771a 100644 --- a/tools/validate_configs/test_configs/debian-ss.yaml +++ b/tools/validate_configs/test_configs/debian-ss.yaml @@ -47,8 +47,7 @@ deployment_groups: install_dir: /apps/spack compilers: - gcc@10.3.0 target=x86_64 - packages: - - cmake%gcc@10.3.0 target=x86_64 + commands: spack install cmake%gcc@10.3.0 target=x86_64 - id: startup source: ./modules/scripts/startup-script diff --git a/tools/validate_configs/test_configs/hpc-centos-ss.yaml b/tools/validate_configs/test_configs/hpc-centos-ss.yaml index 18d6fd3840..2408f373a5 100644 --- a/tools/validate_configs/test_configs/hpc-centos-ss.yaml +++ b/tools/validate_configs/test_configs/hpc-centos-ss.yaml @@ -47,8 +47,8 @@ deployment_groups: install_dir: /apps/spack compilers: - gcc@10.3.0 target=x86_64 - packages: - - cmake%gcc@10.3.0 target=x86_64 + commands: | + spack install cmake%gcc@10.3.0 target=x86_64 - id: startup source: ./modules/scripts/startup-script diff --git a/tools/validate_configs/test_configs/rocky-ss.yaml b/tools/validate_configs/test_configs/rocky-ss.yaml index a631a89e86..31306820eb 100644 --- a/tools/validate_configs/test_configs/rocky-ss.yaml +++ b/tools/validate_configs/test_configs/rocky-ss.yaml @@ -49,8 +49,7 @@ deployment_groups: spack_cache_url: compilers: - gcc@10.3.0 target=x86_64 - packages: - - cmake%gcc@10.3.0 target=x86_64 + commands: spack install cmake%gcc@10.3.0 target=x86_64 - id: startup source: ./modules/scripts/startup-script diff --git a/tools/validate_configs/test_configs/spack-buildcache.yaml b/tools/validate_configs/test_configs/spack-buildcache.yaml index 9a2bd7ab7f..8b232a1ed7 100644 --- a/tools/validate_configs/test_configs/spack-buildcache.yaml +++ b/tools/validate_configs/test_configs/spack-buildcache.yaml @@ -39,13 +39,13 @@ deployment_groups: content: 'config:install_tree:padded_length:128' compilers: - gcc@10.3.0 target=x86_64 - packages: - - intel-mpi@2018.4.274%gcc@10.3.0 - - gromacs@2021.2 %gcc@10.3.0 ^intel-mpi@2018.4.274 gpg_keys: - type: 'file' path: '/tmp/spack_key.gpg' commands: | + spack install intel-mpi@2018.4.274%gcc@10.3.0 + spack install gromacs@2021.2 %gcc@10.3.0 ^intel-mpi@2018.4.274 + MIRROR_URL=gs:// spack buildcache create --mirror-url $MIRROR_URL -af \$(spack find --format /{hash}); spack gpg publish --mirror-url $MIRROR_URL; diff --git a/tools/validate_configs/test_configs/ubuntu-ss.yaml b/tools/validate_configs/test_configs/ubuntu-ss.yaml index 9a9a922b04..75c6da57a9 100644 --- a/tools/validate_configs/test_configs/ubuntu-ss.yaml +++ b/tools/validate_configs/test_configs/ubuntu-ss.yaml @@ -48,13 +48,11 @@ deployment_groups: spack_cache_url: compilers: - gcc@10.3.0 target=x86_64 - packages: - - cmake%gcc@10.3.0 target=x86_64 - - intel-mkl%gcc@10.3.0 target=skylake - - intel-mpi@2018.4.274%gcc@10.3.0 target=skylake - - >- - fftw%intel@18.0.5 target=skylake 
^intel-mpi@2018.4.274%intel@18.0.5 - target=x86_64 + commands: | + spack install cmake%gcc@10.3.0 target=x86_64 + spack install intel-mkl%gcc@10.3.0 target=skylake + spack install intel-mpi@2018.4.274%gcc@10.3.0 target=skylake + spack install fftw%intel@18.0.5 target=skylake ^intel-mpi@2018.4.274%intel@18.0.5 target=x86_64 - id: startup source: ./modules/scripts/startup-script From 155cb4e2aab3448d49677ebb895cab5eb44c6fbd Mon Sep 17 00:00:00 2001 From: Nick Stroud Date: Tue, 27 Jun 2023 23:03:59 -0700 Subject: [PATCH 011/144] DEPRECATE spack-install.licenses in favor of using commands --- .../modules/scripts/spack-install/README.md | 4 ++-- .../modules/scripts/spack-install/main.tf | 1 - .../spack-install/templates/install_spack.tpl | 5 ---- .../scripts/spack-install/variables.tf | 24 ++++++++++++++++--- 4 files changed, 23 insertions(+), 11 deletions(-) diff --git a/community/modules/scripts/spack-install/README.md b/community/modules/scripts/spack-install/README.md index a07fff5e2c..d71f3eefea 100644 --- a/community/modules/scripts/spack-install/README.md +++ b/community/modules/scripts/spack-install/README.md @@ -222,12 +222,12 @@ limitations under the License. | [configs](#input\_configs) | List of configuration options to set within spack.
Configs can be of type 'single-config' or 'file'.
All configs must specify content, and
a scope. | `list(map(any))` | `[]` | no | | [data\_files](#input\_data\_files) | A list of files to be transferred prior to running commands.
It must specify one of 'source' (absolute local file path) or 'content' (string).
It must specify a 'destination' with absolute path where file should be placed. | `list(map(string))` | `[]` | no | | [deployment\_name](#input\_deployment\_name) | Name of deployment, used to name bucket containing startup script. | `string` | n/a | yes | -| [environments](#input\_environments) | DEPRECATED

The following `commands` can be used to configure an environment:
if ! spack env list | grep -q my-env; then
spack env create my-env
fi
spack env activate my-env
spack add intel-mpi@2018.4.274 %gcc@10.3.0
spack concretize
spack install
Defines spack environments to configure.
For more information, see: https://spack.readthedocs.io/en/latest/environments.html. | `any` | `null` | no | +| [environments](#input\_environments) | DEPRECATED

The following `commands` can be used to configure an environment:
if ! spack env list \| grep -q my-env; then
spack env create my-env
fi
spack env activate my-env
spack add intel-mpi@2018.4.274 %gcc@10.3.0
spack concretize
spack install
Defines spack environments to configure.
For more information, see: https://spack.readthedocs.io/en/latest/environments.html. | `any` | `null` | no | | [gpg\_keys](#input\_gpg\_keys) | GPG Keys to trust within spack.
Each key must define a type. Valid types are 'file' and 'new'.
Keys of type 'file' must define a path to the key that
should be trusted.
Keys of type 'new' must define a 'name' and 'email' to create
the key with. | `list(map(any))` | `[]` | no | | [install\_dir](#input\_install\_dir) | Directory to install spack into. | `string` | `"/sw/spack"` | no | | [install\_flags](#input\_install\_flags) | Defines the flags to pass into `spack install` | `string` | `""` | no | | [labels](#input\_labels) | Key-value pairs of labels to be added to created resources. | `map(string)` | n/a | yes | -| [licenses](#input\_licenses) | List of software licenses to install within spack. |
list(object({
source = string
dest = string
}))
| `null` | no | +| [licenses](#input\_licenses) | DEPRECATED

Use `data_files` variable to install license files:
data_files = [{
source = "/abs/path/on/deployment/machine/license.lic"
destination = "/sw/spack/etc/spack/licenses/license.lic"
}]
List of software licenses to install within spack. |
list(object({
source = string
dest = string
}))
| `null` | no | | [log\_file](#input\_log\_file) | Defines the logfile that script output will be written to | `string` | `"/var/log/spack.log"` | no | | [packages](#input\_packages) | DEPRECATED
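
Expressed as blueprint settings, the `licenses` replacement is just a `data_files` entry; a sketch using the placeholder paths from the description above:

```yaml
settings:
  data_files:
  - source: /abs/path/on/deployment/machine/license.lic
    destination: /sw/spack/etc/spack/licenses/license.lic
```
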

The following `commands` can be used to install a package:
spack install intel-mpi@2018.4.274 %gcc@10.3.0
Defines root packages for spack to install. | `list(string)` | `null` | no | | [project\_id](#input\_project\_id) | Project in which the HPC deployment will be created. | `string` | n/a | yes | diff --git a/community/modules/scripts/spack-install/main.tf b/community/modules/scripts/spack-install/main.tf index 309b24cf60..8d7dd9d5ce 100644 --- a/community/modules/scripts/spack-install/main.tf +++ b/community/modules/scripts/spack-install/main.tf @@ -30,7 +30,6 @@ locals { SPACK_REF = var.spack_ref COMPILERS = var.compilers == null ? [] : var.compilers CONFIGS = var.configs == null ? [] : var.configs - LICENSES = var.licenses == null ? [] : var.licenses INSTALL_FLAGS = var.install_flags == null ? "" : var.install_flags CONCRETIZE_FLAGS = var.concretize_flags == null ? "" : var.concretize_flags MIRRORS = var.spack_cache_url == null ? [] : var.spack_cache_url diff --git a/community/modules/scripts/spack-install/templates/install_spack.tpl b/community/modules/scripts/spack-install/templates/install_spack.tpl index 83d92cf5c1..9300f42836 100755 --- a/community/modules/scripts/spack-install/templates/install_spack.tpl +++ b/community/modules/scripts/spack-install/templates/install_spack.tpl @@ -84,11 +84,6 @@ else source ${INSTALL_DIR}/share/spack/setup-env.sh >> ${LOG_FILE} 2>&1 fi -echo "$PREFIX Installing licenses..." -%{for lic in LICENSES ~} - gsutil cp ${lic.source} ${lic.dest} >> ${LOG_FILE} 2>&1 -%{endfor ~} - echo "$PREFIX Installing compilers..." %{for c in COMPILERS ~} { diff --git a/community/modules/scripts/spack-install/variables.tf b/community/modules/scripts/spack-install/variables.tf index af59572595..252b6934e8 100644 --- a/community/modules/scripts/spack-install/variables.tf +++ b/community/modules/scripts/spack-install/variables.tf @@ -160,12 +160,30 @@ variable "compilers" { } variable "licenses" { - description = "List of software licenses to install within spack." - default = null + description = <<-EOT + DEPRECATED + + Use `data_files` variable to install license files: + + ``` + data_files = [{ + source = "/abs/path/on/deployment/machine/license.lic" + destination = "/sw/spack/etc/spack/licenses/license.lic" + }] + ``` + + List of software licenses to install within spack. + EOT + + default = null type = list(object({ source = string dest = string })) + validation { + condition = var.licenses == null + error_message = "licenses is deprecated. Use commands instead. See variable documentation for proposed alternative commands." + } } variable "packages" { @@ -274,7 +292,7 @@ variable "environments" { The following `commands` can be used to configure an environment: ``` - if ! spack env list | grep -q my-env; then + if ! 
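
Because `install_flags` is folded into the same migration later in this series, flags now ride along on each install line; a hedged sketch (the `-j 8` parallelism flag is illustrative, not taken from this patch):

```yaml
settings:
  commands: |
    # One install line per root package; former install_flags go inline.
    spack install -j 8 intel-mpi@2018.4.274 %gcc@10.3.0
```
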
spack env list \| grep -q my-env; then spack env create my-env fi spack env activate my-env From eb2e5cd7da606bd8460520e6d36330b9c902b3a1 Mon Sep 17 00:00:00 2001 From: Nick Stroud Date: Fri, 23 Jun 2023 15:13:24 -0700 Subject: [PATCH 012/144] DEPRECATE spack-install.compilers in favor of using commands --- community/examples/AMD/hpc-amd-slurm.yaml | 10 +++++---- community/examples/hpc-slurm-gromacs.yaml | 6 +++-- .../modules/scripts/spack-install/README.md | 2 +- .../modules/scripts/spack-install/main.tf | 1 - .../spack-install/templates/install_spack.tpl | 11 ---------- .../scripts/spack-install/variables.tf | 22 +++++++++++++++++-- examples/serverless-batch-mpi.yaml | 6 +++-- .../test_configs/centos8-ss.yaml | 9 +++++--- .../test_configs/debian-ss.yaml | 9 +++++--- .../test_configs/hpc-centos-ss.yaml | 6 +++-- .../test_configs/rocky-ss.yaml | 9 +++++--- .../test_configs/spack-buildcache.yaml | 6 +++-- .../test_configs/spack-environments.yaml | 6 +++-- .../test_configs/ubuntu-ss.yaml | 6 +++-- 14 files changed, 69 insertions(+), 40 deletions(-) diff --git a/community/examples/AMD/hpc-amd-slurm.yaml b/community/examples/AMD/hpc-amd-slurm.yaml index 064df8353c..d17562532f 100644 --- a/community/examples/AMD/hpc-amd-slurm.yaml +++ b/community/examples/AMD/hpc-amd-slurm.yaml @@ -79,10 +79,6 @@ deployment_groups: concretizer: targets: host_compatible: false - compilers: - # gcc 12.1.0 is known to have runtime failures with OpenFOAM 8 - # gcc 10.3.0 is the earliest copy of gcc with Zen 3 support - - gcc@10.3.0 %gcc@4.8.5 target=x86_64 data_files: - destination: /sw/spack/openfoam_env.yaml content: | @@ -110,6 +106,12 @@ deployment_groups: concretizer: unify: when_possible commands: | + # gcc 12.1.0 is known to have runtime failures with OpenFOAM 8 + # gcc 10.3.0 is the earliest copy of gcc with Zen 3 support + spack install gcc@10.3.0 %gcc@4.8.5 target=x86_64 + spack load gcc@10.3.0 %gcc@4.8.5 target=x86_64 + spack compiler find --scope site + if ! spack env list | grep -q openfoam; then spack env create openfoam /sw/spack/openfoam_env.yaml spack env activate openfoam diff --git a/community/examples/hpc-slurm-gromacs.yaml b/community/examples/hpc-slurm-gromacs.yaml index aad6b77d45..a11a341c94 100644 --- a/community/examples/hpc-slurm-gromacs.yaml +++ b/community/examples/hpc-slurm-gromacs.yaml @@ -66,9 +66,11 @@ deployment_groups: - '{name}' projections: all: '{name}/{version}-{compiler.name}-{compiler.version}' - compilers: - - gcc@10.3.0 target=x86_64 commands: | + spack install gcc@10.3.0 target=x86_64 + spack load gcc@10.3.0 target=x86_64 + spack compiler find --scope site + spack install intel-mpi@2018.4.274%gcc@10.3.0 spack install gromacs@2023.1 %gcc@10.3.0 ^intel-mpi@2018.4.274 ^cmake@3.26.3 %gcc@4.8.5 # Uncomment and update the name and path to add a shared or personal Spack diff --git a/community/modules/scripts/spack-install/README.md b/community/modules/scripts/spack-install/README.md index d71f3eefea..7edee5212a 100644 --- a/community/modules/scripts/spack-install/README.md +++ b/community/modules/scripts/spack-install/README.md @@ -217,7 +217,7 @@ limitations under the License. |------|-------------|------|---------|:--------:| | [caches\_to\_populate](#input\_caches\_to\_populate) | DEPRECATED

The following `commands` can be used to populate a cache:
MIRROR_URL=gs://my-bucket
spack buildcache create --mirror-url $MIRROR_URL -af \$(spack find --format /{hash});
spack gpg publish --mirror-url $MIRROR_URL;
spack buildcache update-index --mirror-url $MIRROR_URL --keys;
Defines caches which will be populated with the installed packages.

NOTE: GPG Keys should be installed before trying to populate a cache
with packages.

NOTE: The gpg\_keys variable can be used to install existing GPG keys
and create new GPG keys, both of which are acceptable for populating a
cache. | `list(map(any))` | `null` | no | | [commands](#input\_commands) | String of commands to run within this module | `string` | `null` | no | -| [compilers](#input\_compilers) | Defines compilers for spack to install before installing packages. | `list(string)` | `[]` | no | +| [compilers](#input\_compilers) | DEPRECATED
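
As a blueprint fragment, cache population becomes ordinary shell in `commands`; the bucket name is a placeholder, and the `\$` in the table above is only markdown escaping for `$`:

```yaml
settings:
  commands: |
    # Push everything already installed into the build cache.
    MIRROR_URL=gs://my-bucket
    spack buildcache create --mirror-url $MIRROR_URL -af $(spack find --format /{hash})
    spack gpg publish --mirror-url $MIRROR_URL
    spack buildcache update-index --mirror-url $MIRROR_URL --keys
```
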

The following `commands` can be used to install compilers:
spack install gcc@10.3.0 target=x86_64
spack load gcc@10.3.0 target=x86_64
spack compiler find --scope site
spack clean -s
spack unload gcc@10.3.0
Defines compilers for spack to install before installing packages. | `list(string)` | `null` | no | | [concretize\_flags](#input\_concretize\_flags) | Defines the flags to pass into `spack concretize` | `string` | `""` | no | | [configs](#input\_configs) | List of configuration options to set within spack.
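
The same five commands as a blueprint fragment; the compiler spec is the example one from the description above:

```yaml
settings:
  commands: |
    # Build the compiler with the system toolchain, register it, then tidy up.
    spack install gcc@10.3.0 target=x86_64
    spack load gcc@10.3.0 target=x86_64
    spack compiler find --scope site
    spack clean -s
    spack unload gcc@10.3.0
```
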
Configs can be of type 'single-config' or 'file'.
All configs must specify content and a scope. | `list(map(any))` | `[]` | no |
It must specify one of 'source' (absolute local file path) or 'content' (string).
It must specify a 'destination' with absolute path where file should be placed. | `list(map(string))` | `[]` | no | diff --git a/community/modules/scripts/spack-install/main.tf b/community/modules/scripts/spack-install/main.tf index 8d7dd9d5ce..eed320e4d2 100644 --- a/community/modules/scripts/spack-install/main.tf +++ b/community/modules/scripts/spack-install/main.tf @@ -28,7 +28,6 @@ locals { INSTALL_DIR = var.install_dir SPACK_URL = var.spack_url SPACK_REF = var.spack_ref - COMPILERS = var.compilers == null ? [] : var.compilers CONFIGS = var.configs == null ? [] : var.configs INSTALL_FLAGS = var.install_flags == null ? "" : var.install_flags CONCRETIZE_FLAGS = var.concretize_flags == null ? "" : var.concretize_flags diff --git a/community/modules/scripts/spack-install/templates/install_spack.tpl b/community/modules/scripts/spack-install/templates/install_spack.tpl index 9300f42836..420aa60e61 100755 --- a/community/modules/scripts/spack-install/templates/install_spack.tpl +++ b/community/modules/scripts/spack-install/templates/install_spack.tpl @@ -84,15 +84,4 @@ else source ${INSTALL_DIR}/share/spack/setup-env.sh >> ${LOG_FILE} 2>&1 fi -echo "$PREFIX Installing compilers..." -%{for c in COMPILERS ~} - { - spack install ${INSTALL_FLAGS} ${c}; - spack load ${c}; - spack clean -s - } &>> ${LOG_FILE} -%{endfor ~} - -spack compiler find --scope site >> ${LOG_FILE} 2>&1 - echo "$PREFIX Setup complete..." diff --git a/community/modules/scripts/spack-install/variables.tf b/community/modules/scripts/spack-install/variables.tf index 252b6934e8..5a4f33a5f1 100644 --- a/community/modules/scripts/spack-install/variables.tf +++ b/community/modules/scripts/spack-install/variables.tf @@ -154,9 +154,27 @@ EOT } variable "compilers" { - description = "Defines compilers for spack to install before installing packages." - default = [] + description = <<-EOT + DEPRECATED + + The following `commands` can be used to install compilers: + + ``` + spack install gcc@10.3.0 target=x86_64 + spack load gcc@10.3.0 target=x86_64 + spack compiler find --scope site + spack clean -s + spack unload gcc@10.3.0 + ``` + + Defines compilers for spack to install before installing packages. + EOT type = list(string) + default = null + validation { + condition = var.compilers == null + error_message = "compilers is deprecated. Use commands instead. See variable documentation for proposed alternative commands." + } } variable "licenses" { diff --git a/examples/serverless-batch-mpi.yaml b/examples/serverless-batch-mpi.yaml index 52e2772b34..c4711dd49b 100644 --- a/examples/serverless-batch-mpi.yaml +++ b/examples/serverless-batch-mpi.yaml @@ -58,8 +58,6 @@ deployment_groups: - '{name}' projections: all: '{name}/{version}-{compiler.name}-{compiler.version}' - compilers: - - gcc@8.2.0 target=x86_64 data_files: - destination: /share/spack/wrfv3_env.yaml content: | @@ -80,6 +78,10 @@ deployment_groups: - - $%compilers - - $^mpis commands: | + spack install gcc@8.2.0 target=x86_64 + spack load gcc@8.2.0 target=x86_64 + spack compiler find --scope site + if ! 
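
A sketch showing both accepted shapes of a `data_files` entry; the paths and the Slurm snippet are illustrative, echoing the blueprint updates later in this series:

```yaml
settings:
  data_files:
  - source: /abs/path/on/deployment/machine/projections.yaml  # 'source': copy a local file
    destination: /tmp/projections-config.yaml
  - destination: /tmp/slurm-external-config.yaml              # 'content': inline the file
    content: |
      packages:
        slurm:
          externals:
          - spec: slurm@22-05-8
            prefix: /usr/local
          buildable: False
```
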
spack env list | grep -q wrfv3; then spack env create wrfv3 /share/spack/wrfv3_env.yaml spack env activate wrfv3 diff --git a/tools/validate_configs/test_configs/centos8-ss.yaml b/tools/validate_configs/test_configs/centos8-ss.yaml index bc9b17a1aa..58bd6f769b 100644 --- a/tools/validate_configs/test_configs/centos8-ss.yaml +++ b/tools/validate_configs/test_configs/centos8-ss.yaml @@ -45,9 +45,12 @@ deployment_groups: source: ./community//modules/scripts/spack-install settings: install_dir: /apps/spack - compilers: - - gcc@10.3.0 target=x86_64 - commands: spack install cmake%gcc@10.3.0 target=x86_64 + commands: | + spack install gcc@10.3.0 target=x86_64 + spack load gcc@10.3.0 target=x86_64 + spack compiler find --scope site + + spack install cmake%gcc@10.3.0 target=x86_64 - id: startup source: ./modules/scripts/startup-script diff --git a/tools/validate_configs/test_configs/debian-ss.yaml b/tools/validate_configs/test_configs/debian-ss.yaml index 30aaf7771a..7d02949510 100644 --- a/tools/validate_configs/test_configs/debian-ss.yaml +++ b/tools/validate_configs/test_configs/debian-ss.yaml @@ -45,9 +45,12 @@ deployment_groups: source: ./community//modules/scripts/spack-install settings: install_dir: /apps/spack - compilers: - - gcc@10.3.0 target=x86_64 - commands: spack install cmake%gcc@10.3.0 target=x86_64 + commands: | + spack install gcc@10.3.0 target=x86_64 + spack load gcc@10.3.0 target=x86_64 + spack compiler find --scope site + + spack install cmake%gcc@10.3.0 target=x86_64 - id: startup source: ./modules/scripts/startup-script diff --git a/tools/validate_configs/test_configs/hpc-centos-ss.yaml b/tools/validate_configs/test_configs/hpc-centos-ss.yaml index 2408f373a5..e364b3528a 100644 --- a/tools/validate_configs/test_configs/hpc-centos-ss.yaml +++ b/tools/validate_configs/test_configs/hpc-centos-ss.yaml @@ -45,9 +45,11 @@ deployment_groups: source: ./community//modules/scripts/spack-install settings: install_dir: /apps/spack - compilers: - - gcc@10.3.0 target=x86_64 commands: | + spack install gcc@10.3.0 target=x86_64 + spack load gcc@10.3.0 target=x86_64 + spack compiler find --scope site + spack install cmake%gcc@10.3.0 target=x86_64 - id: startup diff --git a/tools/validate_configs/test_configs/rocky-ss.yaml b/tools/validate_configs/test_configs/rocky-ss.yaml index 31306820eb..2c12a0d3c4 100644 --- a/tools/validate_configs/test_configs/rocky-ss.yaml +++ b/tools/validate_configs/test_configs/rocky-ss.yaml @@ -47,9 +47,12 @@ deployment_groups: settings: install_dir: /apps/spack spack_cache_url: - compilers: - - gcc@10.3.0 target=x86_64 - commands: spack install cmake%gcc@10.3.0 target=x86_64 + commands: | + spack install gcc@10.3.0 target=x86_64 + spack load gcc@10.3.0 target=x86_64 + spack compiler find --scope site + + spack install cmake%gcc@10.3.0 target=x86_64 - id: startup source: ./modules/scripts/startup-script diff --git a/tools/validate_configs/test_configs/spack-buildcache.yaml b/tools/validate_configs/test_configs/spack-buildcache.yaml index 8b232a1ed7..bb5b049df5 100644 --- a/tools/validate_configs/test_configs/spack-buildcache.yaml +++ b/tools/validate_configs/test_configs/spack-buildcache.yaml @@ -37,12 +37,14 @@ deployment_groups: - type: 'single-config' scope: 'site' content: 'config:install_tree:padded_length:128' - compilers: - - gcc@10.3.0 target=x86_64 gpg_keys: - type: 'file' path: '/tmp/spack_key.gpg' commands: | + spack install gcc@10.3.0 target=x86_64 + spack load gcc@10.3.0 target=x86_64 + spack compiler find --scope site + spack install 
intel-mpi@2018.4.274%gcc@10.3.0 spack install gromacs@2021.2 %gcc@10.3.0 ^intel-mpi@2018.4.274 diff --git a/tools/validate_configs/test_configs/spack-environments.yaml b/tools/validate_configs/test_configs/spack-environments.yaml index 06f505bd7a..2e92cba50b 100644 --- a/tools/validate_configs/test_configs/spack-environments.yaml +++ b/tools/validate_configs/test_configs/spack-environments.yaml @@ -39,8 +39,6 @@ deployment_groups: - type: 'single-config' scope: 'site' content: 'config:install_tree:padded_length:128' - compilers: - - gcc@10.3.0 target=x86_64 data_files: - destination: /apps/spack/env_file.yaml content: | @@ -69,6 +67,10 @@ deployment_groups: - type: 'file' path: '/tmp/spack_key.gpg' commands: | + spack install gcc@10.3.0 target=x86_64 + spack load gcc@10.3.0 target=x86_64 + spack compiler find --scope site + if ! spack env list | grep -q test-file; then spack env create test-file /apps/spack/env_file.yaml spack env activate test-file diff --git a/tools/validate_configs/test_configs/ubuntu-ss.yaml b/tools/validate_configs/test_configs/ubuntu-ss.yaml index 75c6da57a9..7d858a7a29 100644 --- a/tools/validate_configs/test_configs/ubuntu-ss.yaml +++ b/tools/validate_configs/test_configs/ubuntu-ss.yaml @@ -46,9 +46,11 @@ deployment_groups: settings: install_dir: /apps/spack spack_cache_url: - compilers: - - gcc@10.3.0 target=x86_64 commands: | + spack install gcc@10.3.0 target=x86_64 + spack load gcc@10.3.0 target=x86_64 + spack compiler find --scope site + spack install cmake%gcc@10.3.0 target=x86_64 spack install intel-mkl%gcc@10.3.0 target=skylake spack install intel-mpi@2018.4.274%gcc@10.3.0 target=skylake From 38991021695b906836ee2ec6c0c15f04ec71742d Mon Sep 17 00:00:00 2001 From: Nick Stroud Date: Thu, 29 Jun 2023 00:24:51 -0700 Subject: [PATCH 013/144] DEPRECATE install_flags and concretize_flags in spack-install module --- .../modules/scripts/spack-install/README.md | 4 ++-- community/modules/scripts/spack-install/main.tf | 2 -- .../modules/scripts/spack-install/variables.tf | 16 ++++++++++++---- 3 files changed, 14 insertions(+), 8 deletions(-) diff --git a/community/modules/scripts/spack-install/README.md b/community/modules/scripts/spack-install/README.md index 7edee5212a..d620b1e488 100644 --- a/community/modules/scripts/spack-install/README.md +++ b/community/modules/scripts/spack-install/README.md @@ -218,14 +218,14 @@ limitations under the License. | [caches\_to\_populate](#input\_caches\_to\_populate) | DEPRECATED

The following `commands` can be used to populate a cache:
MIRROR_URL=gs://my-bucket
spack buildcache create --mirror-url $MIRROR_URL -af \$(spack find --format /{hash});
spack gpg publish --mirror-url $MIRROR_URL;
spack buildcache update-index --mirror-url $MIRROR_URL --keys;
Defines caches which will be populated with the installed packages.

NOTE: GPG Keys should be installed before trying to populate a cache
with packages.

NOTE: The gpg\_keys variable can be used to install existing GPG keys
and create new GPG keys, both of which are acceptable for populating a
cache. | `list(map(any))` | `null` | no | | [commands](#input\_commands) | String of commands to run within this module | `string` | `null` | no | | [compilers](#input\_compilers) | DEPRECATED

The following `commands` can be used to install compilers:
spack install gcc@10.3.0 target=x86_64
spack load gcc@10.3.0 target=x86_64
spack compiler find --scope site
spack clean -s
spack unload gcc@10.3.0
Defines compilers for spack to install before installing packages. | `list(string)` | `null` | no | -| [concretize\_flags](#input\_concretize\_flags) | Defines the flags to pass into `spack concretize` | `string` | `""` | no | +| [concretize\_flags](#input\_concretize\_flags) | DEPRECATED - spack concretize is now performed using the `commands` variable. | `string` | `null` | no | | [configs](#input\_configs) | List of configuration options to set within spack.
Configs can be of type 'single-config' or 'file'.
All configs must specify content and a scope. | `list(map(any))` | `[]` | no |
It must specify one of 'source' (absolute local file path) or 'content' (string).
It must specify a 'destination' with absolute path where file should be placed. | `list(map(string))` | `[]` | no | | [deployment\_name](#input\_deployment\_name) | Name of deployment, used to name bucket containing startup script. | `string` | n/a | yes | | [environments](#input\_environments) | DEPRECATED

The following `commands` can be used to configure an environment:
if ! spack env list \| grep -q my-env; then
spack env create my-env
fi
spack env activate my-env
spack add intel-mpi@2018.4.274 %gcc@10.3.0
spack concretize
spack install
Defines spack environments to configure.
For more information, see: https://spack.readthedocs.io/en/latest/environments.html. | `any` | `null` | no | | [gpg\_keys](#input\_gpg\_keys) | GPG Keys to trust within spack.
Each key must define a type. Valid types are 'file' and 'new'.
Keys of type 'file' must define a path to the key that
should be trusted.
Keys of type 'new' must define a 'name' and 'email' to create
the key with. | `list(map(any))` | `[]` | no | | [install\_dir](#input\_install\_dir) | Directory to install spack into. | `string` | `"/sw/spack"` | no | -| [install\_flags](#input\_install\_flags) | Defines the flags to pass into `spack install` | `string` | `""` | no | +| [install\_flags](#input\_install\_flags) | DEPRECATED - spack install is now performed using the `commands` variable. | `string` | `null` | no | | [labels](#input\_labels) | Key-value pairs of labels to be added to created resources. | `map(string)` | n/a | yes | | [licenses](#input\_licenses) | DEPRECATED

Use `data_files` variable to install license files:
data_files = [{
source = "/abs/path/on/deployment/machine/license.lic"
destination = "/sw/spack/etc/spack/licenses/license.lic"
}]
List of software licenses to install within spack. |
list(object({
source = string
dest = string
}))
| `null` | no | | [log\_file](#input\_log\_file) | Defines the logfile that script output will be written to | `string` | `"/var/log/spack.log"` | no | diff --git a/community/modules/scripts/spack-install/main.tf b/community/modules/scripts/spack-install/main.tf index eed320e4d2..d99886cd82 100644 --- a/community/modules/scripts/spack-install/main.tf +++ b/community/modules/scripts/spack-install/main.tf @@ -29,8 +29,6 @@ locals { SPACK_URL = var.spack_url SPACK_REF = var.spack_ref CONFIGS = var.configs == null ? [] : var.configs - INSTALL_FLAGS = var.install_flags == null ? "" : var.install_flags - CONCRETIZE_FLAGS = var.concretize_flags == null ? "" : var.concretize_flags MIRRORS = var.spack_cache_url == null ? [] : var.spack_cache_url GPG_KEYS = var.gpg_keys == null ? [] : var.gpg_keys CACHES_TO_POPULATE = var.caches_to_populate == null ? [] : var.caches_to_populate diff --git a/community/modules/scripts/spack-install/variables.tf b/community/modules/scripts/spack-install/variables.tf index 5a4f33a5f1..9d4ddcb5af 100644 --- a/community/modules/scripts/spack-install/variables.tf +++ b/community/modules/scripts/spack-install/variables.tf @@ -225,15 +225,23 @@ variable "packages" { } variable "install_flags" { - description = "Defines the flags to pass into `spack install`" - default = "" + description = "DEPRECATED - spack install is now performed using the `commands` variable." + default = null type = string + validation { + condition = var.install_flags == null + error_message = "install_flags is deprecated. Add install flags to the relevant line in commands." + } } variable "concretize_flags" { - description = "Defines the flags to pass into `spack concretize`" - default = "" + description = "DEPRECATED - spack concretize is now performed using the `commands` variable." + default = null type = string + validation { + condition = var.concretize_flags == null + error_message = "concretize_flags is deprecated. Add concretize flags to the relevant line in commands." + } } variable "gpg_keys" { From 1cb1a0c9d214c55a1f786849f2dcc17e13cd917a Mon Sep 17 00:00:00 2001 From: Nick Stroud Date: Wed, 28 Jun 2023 23:34:09 -0700 Subject: [PATCH 014/144] DEPRECATE spack-install.gpg_keys in favor of using commands --- .../modules/scripts/spack-install/README.md | 2 +- .../modules/scripts/spack-install/main.tf | 1 - .../spack-install/templates/install_spack.tpl | 12 ------ .../scripts/spack-install/variables.tf | 42 +++++++------------ .../test_configs/spack-buildcache.yaml | 9 ++-- .../test_configs/spack-environments.yaml | 6 +-- 6 files changed, 24 insertions(+), 48 deletions(-) diff --git a/community/modules/scripts/spack-install/README.md b/community/modules/scripts/spack-install/README.md index d620b1e488..79f56b1953 100644 --- a/community/modules/scripts/spack-install/README.md +++ b/community/modules/scripts/spack-install/README.md @@ -223,7 +223,7 @@ limitations under the License. | [data\_files](#input\_data\_files) | A list of files to be transferred prior to running commands.
It must specify one of 'source' (absolute local file path) or 'content' (string).
It must specify a 'destination' with absolute path where file should be placed. | `list(map(string))` | `[]` | no | | [deployment\_name](#input\_deployment\_name) | Name of deployment, used to name bucket containing startup script. | `string` | n/a | yes | | [environments](#input\_environments) | DEPRECATED

The following `commands` can be used to configure an environment:
if ! spack env list \| grep -q my-env; then
spack env create my-env
fi
spack env activate my-env
spack add intel-mpi@2018.4.274 %gcc@10.3.0
spack concretize
spack install
Defines spack environments to configure.
For more information, see: https://spack.readthedocs.io/en/latest/environments.html. | `any` | `null` | no | -| [gpg\_keys](#input\_gpg\_keys) | GPG Keys to trust within spack.
Each key must define a type. Valid types are 'file' and 'new'.
Keys of type 'file' must define a path to the key that
should be trusted.
Keys of type 'new' must define a 'name' and 'email' to create
the key with. | `list(map(any))` | `[]` | no | +| [gpg\_keys](#input\_gpg\_keys) | DEPRECATED

The following `commands` can be used to create a new GPG key:
spack gpg init
spack gpg create <name> <email>
Alternatively, `data_files` can be used to transfer an existing GPG key. Then use `spack gpg trust <file>` to add the key to the keyring.

GPG Keys to trust within spack. | `list(map(any))` | `null` | no | | [install\_dir](#input\_install\_dir) | Directory to install spack into. | `string` | `"/sw/spack"` | no | | [install\_flags](#input\_install\_flags) | DEPRECATED - spack install is now performed using the `commands` variable. | `string` | `null` | no | | [labels](#input\_labels) | Key-value pairs of labels to be added to created resources. | `map(string)` | n/a | yes | diff --git a/community/modules/scripts/spack-install/main.tf b/community/modules/scripts/spack-install/main.tf index d99886cd82..c4fdf6618c 100644 --- a/community/modules/scripts/spack-install/main.tf +++ b/community/modules/scripts/spack-install/main.tf @@ -30,7 +30,6 @@ locals { SPACK_REF = var.spack_ref CONFIGS = var.configs == null ? [] : var.configs MIRRORS = var.spack_cache_url == null ? [] : var.spack_cache_url - GPG_KEYS = var.gpg_keys == null ? [] : var.gpg_keys CACHES_TO_POPULATE = var.caches_to_populate == null ? [] : var.caches_to_populate LOG_FILE = var.log_file == null ? "/dev/null" : var.log_file SPACK_PYTHON_VENV = var.spack_virtualenv_path diff --git a/community/modules/scripts/spack-install/templates/install_spack.tpl b/community/modules/scripts/spack-install/templates/install_spack.tpl index 420aa60e61..4f30c0c4f4 100755 --- a/community/modules/scripts/spack-install/templates/install_spack.tpl +++ b/community/modules/scripts/spack-install/templates/install_spack.tpl @@ -67,18 +67,6 @@ EOF spack mirror add --scope site ${m.mirror_name} ${m.mirror_url} >> ${LOG_FILE} 2>&1 %{endfor ~} - echo "$PREFIX Installing GPG keys" - spack gpg init >> ${LOG_FILE} 2>&1 - %{for k in GPG_KEYS ~} - %{if k.type == "file" ~} - spack gpg trust ${k.path} - %{endif ~} - - %{if k.type == "new" ~} - spack gpg create "${k.name}" ${k.email} - %{endif ~} - %{endfor ~} - spack buildcache keys --install --trust >> ${LOG_FILE} 2>&1 else source ${INSTALL_DIR}/share/spack/setup-env.sh >> ${LOG_FILE} 2>&1 diff --git a/community/modules/scripts/spack-install/variables.tf b/community/modules/scripts/spack-install/variables.tf index 9d4ddcb5af..d905a47851 100644 --- a/community/modules/scripts/spack-install/variables.tf +++ b/community/modules/scripts/spack-install/variables.tf @@ -246,38 +246,24 @@ variable "concretize_flags" { variable "gpg_keys" { description = < + ``` + + Alternatively, `data_files` can be used to transfer an existing GPG key. Then use `spack gpg trust ` to add the key to the keyring. + GPG Keys to trust within spack. - Each key must define a type. Valid types are 'file' and 'new'. - Keys of type 'file' must define a path to the key that - should be trusted. - Keys of type 'new' must define a 'name' and 'email' to create - the key with. EOT - default = [] + default = null type = list(map(any)) validation { - condition = alltrue([ - for k in var.gpg_keys : contains(keys(k), "type") - ]) - error_message = "Each gpg_key must define a type." - } - validation { - condition = alltrue([ - for k in var.gpg_keys : (k["type"] == "file" || k["type"] == "new") - ]) - error_message = "Valid types for gpg_keys are 'file' and 'new'." - } - validation { - condition = alltrue([ - for k in var.gpg_keys : ((k["type"] == "file" && contains(keys(k), "path")) || (k["type"] == "new")) - ]) - error_message = "Each gpg_key of type file must define a path." 
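
Both routes in one blueprint fragment: transfer and trust an existing key via `data_files`, or mint a new one; the paths and identity are placeholders, and the trust pattern matches the spack-buildcache test config updated below:

```yaml
settings:
  data_files:
  - source: /path/to/local/spack_gpg_key.pub
    destination: /apps/spack_gpg_key.pub
  commands: |
    spack gpg init
    spack gpg trust /apps/spack_gpg_key.pub
    # ...or create a fresh key instead:
    # spack gpg create "My Name" user@example.com
```
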
- } - validation { - condition = alltrue([ - for k in var.gpg_keys : (k["type"] == "file" || ((k["type"] == "new") && contains(keys(k), "name") && contains(keys(k), "email"))) - ]) - error_message = "Each gpg_key of type new must define a name and email." + condition = var.gpg_keys == null + error_message = "gpg_keys is deprecated. Use commands instead. See variable documentation for proposed alternative commands." } } diff --git a/tools/validate_configs/test_configs/spack-buildcache.yaml b/tools/validate_configs/test_configs/spack-buildcache.yaml index bb5b049df5..a01d17eea1 100644 --- a/tools/validate_configs/test_configs/spack-buildcache.yaml +++ b/tools/validate_configs/test_configs/spack-buildcache.yaml @@ -37,10 +37,13 @@ deployment_groups: - type: 'single-config' scope: 'site' content: 'config:install_tree:padded_length:128' - gpg_keys: - - type: 'file' - path: '/tmp/spack_key.gpg' + data_files: + - source: /path/to/local/spack_gpg_key.pub + destination: /apps/spack_gpg_key.pub commands: | + spack gpg init + spack gpg trust /apps/spack_gpg_key.pub + spack install gcc@10.3.0 target=x86_64 spack load gcc@10.3.0 target=x86_64 spack compiler find --scope site diff --git a/tools/validate_configs/test_configs/spack-environments.yaml b/tools/validate_configs/test_configs/spack-environments.yaml index 2e92cba50b..5828479dbe 100644 --- a/tools/validate_configs/test_configs/spack-environments.yaml +++ b/tools/validate_configs/test_configs/spack-environments.yaml @@ -63,10 +63,10 @@ deployment_groups: spack: specs: intel-mpi@20184.274 %gcc@10.3.0 - gpg_keys: - - type: 'file' - path: '/tmp/spack_key.gpg' commands: | + spack gpg init + spack gpg create "Robert Pirsig" pirsig@zamm.org + spack install gcc@10.3.0 target=x86_64 spack load gcc@10.3.0 target=x86_64 spack compiler find --scope site From be0ea0e8fa76d5eda4708450f5f0dc6fef76aa05 Mon Sep 17 00:00:00 2001 From: Nick Stroud Date: Thu, 29 Jun 2023 00:10:12 -0700 Subject: [PATCH 015/144] DEPRECATE spack-install.spack_cache_url in favor of commands --- community/examples/hpc-slurm-gromacs.yaml | 9 ++++----- .../examples/hpc-slurm-ramble-gromacs.yaml | 6 +++--- .../modules/scripts/spack-install/README.md | 2 +- community/modules/scripts/spack-install/main.tf | 1 - .../spack-install/templates/install_spack.tpl | 6 +----- .../modules/scripts/spack-install/variables.tf | 17 ++++++++++++++++- docs/tutorials/gromacs/spack-gromacs.yaml | 8 ++++---- docs/tutorials/openfoam/spack-openfoam.yaml | 8 ++++---- docs/tutorials/wrfv3/spack-wrfv3.yaml | 8 ++++---- examples/serverless-batch-mpi.yaml | 8 ++++---- .../daily-tests/builds/batch-mpi.yaml | 5 ++--- .../daily-tests/builds/spack-gromacs.yaml | 5 ++--- .../validate_configs/test_configs/rocky-ss.yaml | 1 - .../test_configs/ubuntu-ss.yaml | 1 - 14 files changed, 45 insertions(+), 40 deletions(-) diff --git a/community/examples/hpc-slurm-gromacs.yaml b/community/examples/hpc-slurm-gromacs.yaml index a11a341c94..ca0f743736 100644 --- a/community/examples/hpc-slurm-gromacs.yaml +++ b/community/examples/hpc-slurm-gromacs.yaml @@ -67,17 +67,16 @@ deployment_groups: projections: all: '{name}/{version}-{compiler.name}-{compiler.version}' commands: | + # Un-comment and update mirror_url to install from spack cache + # spack mirror add --scope site gcs_cache gs://optionally_set_spack_cache_bucket + # spack buildcache keys --install --trust + spack install gcc@10.3.0 target=x86_64 spack load gcc@10.3.0 target=x86_64 spack compiler find --scope site spack install intel-mpi@2018.4.274%gcc@10.3.0 spack install 
gromacs@2023.1 %gcc@10.3.0 ^intel-mpi@2018.4.274 ^cmake@3.26.3 %gcc@4.8.5 - # Uncomment and update the name and path to add a shared or personal Spack - # cache location to speed up future deployments. - # spack_cache_url: - # - mirror_name: gcs_cache - # mirror_url: gs://bucket-name/... - id: spack-startup source: modules/scripts/startup-script diff --git a/community/examples/hpc-slurm-ramble-gromacs.yaml b/community/examples/hpc-slurm-ramble-gromacs.yaml index d268861092..2fdbcbab51 100644 --- a/community/examples/hpc-slurm-ramble-gromacs.yaml +++ b/community/examples/hpc-slurm-ramble-gromacs.yaml @@ -49,9 +49,9 @@ deployment_groups: content: "concretizer:targets:host_compatible:false" # Uncomment and update the name and path to add a shared or personal Spack # cache location to speed up future deployments. - # spack_cache_url: - # - mirror_name: gcs_cache - # mirror_url: gs://bucket-name/... + # commands: | + # spack mirror add --scope site gcs_cache gs://bucket-name/... + # spack buildcache keys --install --trust - id: ramble-setup source: community/modules/scripts/ramble-setup diff --git a/community/modules/scripts/spack-install/README.md b/community/modules/scripts/spack-install/README.md index 79f56b1953..d2d25bf164 100644 --- a/community/modules/scripts/spack-install/README.md +++ b/community/modules/scripts/spack-install/README.md @@ -232,7 +232,7 @@ limitations under the License. | [packages](#input\_packages) | DEPRECATED

The following `commands` can be used to install a package:
spack install intel-mpi@2018.4.274 %gcc@10.3.0
Defines root packages for spack to install. | `list(string)` | `null` | no | | [project\_id](#input\_project\_id) | Project in which the HPC deployment will be created. | `string` | n/a | yes | | [region](#input\_region) | Region to place bucket containing startup script. | `string` | n/a | yes | -| [spack\_cache\_url](#input\_spack\_cache\_url) | List of buildcaches for spack. |
list(object({
mirror_name = string
mirror_url = string
}))
| `null` | no | +| [spack\_cache\_url](#input\_spack\_cache\_url) | DEPRECATED

The following `commands` can be used to add a build cache:
spack mirror add --scope site <mirror name> gs://my-build-cache
spack buildcache keys --install --trust
List of build caches for Spack. |
list(object({
mirror_name = string
mirror_url = string
}))
| `null` | no | | [spack\_ref](#input\_spack\_ref) | Git ref to checkout for spack. | `string` | `"v0.20.0"` | no | | [spack\_url](#input\_spack\_url) | URL to clone the spack repo from. | `string` | `"https://github.com/spack/spack"` | no | | [spack\_virtualenv\_path](#input\_spack\_virtualenv\_path) | Virtual environment path in which to install Spack Python interpreter and other dependencies | `string` | `"/usr/local/spack-python"` | no | diff --git a/community/modules/scripts/spack-install/main.tf b/community/modules/scripts/spack-install/main.tf index c4fdf6618c..d60f15b417 100644 --- a/community/modules/scripts/spack-install/main.tf +++ b/community/modules/scripts/spack-install/main.tf @@ -29,7 +29,6 @@ locals { SPACK_URL = var.spack_url SPACK_REF = var.spack_ref CONFIGS = var.configs == null ? [] : var.configs - MIRRORS = var.spack_cache_url == null ? [] : var.spack_cache_url CACHES_TO_POPULATE = var.caches_to_populate == null ? [] : var.caches_to_populate LOG_FILE = var.log_file == null ? "/dev/null" : var.log_file SPACK_PYTHON_VENV = var.spack_virtualenv_path diff --git a/community/modules/scripts/spack-install/templates/install_spack.tpl b/community/modules/scripts/spack-install/templates/install_spack.tpl index 4f30c0c4f4..d21ade5c2e 100755 --- a/community/modules/scripts/spack-install/templates/install_spack.tpl +++ b/community/modules/scripts/spack-install/templates/install_spack.tpl @@ -62,12 +62,8 @@ EOF %{endif ~} %{endfor ~} - echo "$PREFIX Setting up spack mirrors..." - %{for m in MIRRORS ~} - spack mirror add --scope site ${m.mirror_name} ${m.mirror_url} >> ${LOG_FILE} 2>&1 - %{endfor ~} + spack gpg init - spack buildcache keys --install --trust >> ${LOG_FILE} 2>&1 else source ${INSTALL_DIR}/share/spack/setup-env.sh >> ${LOG_FILE} 2>&1 fi diff --git a/community/modules/scripts/spack-install/variables.tf b/community/modules/scripts/spack-install/variables.tf index d905a47851..8280423748 100644 --- a/community/modules/scripts/spack-install/variables.tf +++ b/community/modules/scripts/spack-install/variables.tf @@ -110,12 +110,27 @@ variable "labels" { # variables to be deprecated variable "spack_cache_url" { - description = "List of buildcaches for spack." + description = <<-EOT + DEPRECATED + + The following `commands` can be used to add a build cache: + + ``` + spack mirror add --scope site gs://my-build-cache + spack buildcache keys --install --trust + ``` + + List of build caches for Spack. + EOT type = list(object({ mirror_name = string mirror_url = string })) default = null + validation { + condition = var.spack_cache_url == null + error_message = "spack_cache_url is deprecated. Use commands instead. See variable documentation for proposed alternative commands." + } } variable "configs" { diff --git a/docs/tutorials/gromacs/spack-gromacs.yaml b/docs/tutorials/gromacs/spack-gromacs.yaml index f8d4d9801e..83cb01cce0 100644 --- a/docs/tutorials/gromacs/spack-gromacs.yaml +++ b/docs/tutorials/gromacs/spack-gromacs.yaml @@ -87,16 +87,16 @@ deployment_groups: - - $%compilers - - $^mpis commands: | + # Un-comment and update mirror_url to install from spack cache + # spack mirror add --scope site gcs_cache gs://optionally_set_spack_cache_bucket + # spack buildcache keys --install --trust + if ! 
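
In blueprint form the cache hookup is two lines of `commands`; the mirror name and bucket are placeholders, mirroring the commented-out hints added to the examples below:

```yaml
settings:
  commands: |
    spack mirror add --scope site gcs_cache gs://my-build-cache
    spack buildcache keys --install --trust
```
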
spack env list | grep -q gromacs; then spack env create gromacs /share/spack/gromacs_env.yaml spack env activate gromacs spack concretize spack install fi - # Un-comment and update mirror_url to install from spack cache - # spack_cache_url: - # - mirror_name: gcs_cache - # mirror_url: gs://optionally_set_spack_cache_bucket - id: controller-setup source: modules/scripts/startup-script diff --git a/docs/tutorials/openfoam/spack-openfoam.yaml b/docs/tutorials/openfoam/spack-openfoam.yaml index fc5c741c6b..1c4779bd2c 100644 --- a/docs/tutorials/openfoam/spack-openfoam.yaml +++ b/docs/tutorials/openfoam/spack-openfoam.yaml @@ -95,16 +95,16 @@ deployment_groups: concretizer: unify: when_possible commands: | + # Un-comment and update mirror_url to install from spack cache + # spack mirror add --scope site gcs_cache gs://optionally_set_spack_cache_bucket + # spack buildcache keys --install --trust + if ! spack env list | grep -q openfoam; then spack env create openfoam /share/spack/openfoam_env.yaml spack env activate openfoam spack concretize spack install fi - # Un-comment and update mirror_url to install from spack cache - # spack_cache_url: - # - mirror_name: gcs_cache - # mirror_url: gs://optionally_set_spack_cache_bucket - id: controller-setup source: modules/scripts/startup-script diff --git a/docs/tutorials/wrfv3/spack-wrfv3.yaml b/docs/tutorials/wrfv3/spack-wrfv3.yaml index 4e192ab96f..d78b3763eb 100644 --- a/docs/tutorials/wrfv3/spack-wrfv3.yaml +++ b/docs/tutorials/wrfv3/spack-wrfv3.yaml @@ -87,16 +87,16 @@ deployment_groups: - - $%compilers - - $^mpis commands: | + # Un-comment and update mirror_url to install from spack cache + # spack mirror add --scope site gcs_cache gs://optionally_set_spack_cache_bucket + # spack buildcache keys --install --trust + if ! 
spack env list | grep -q wrfv3; then spack env create wrfv3 /share/spack/wrfv3_env.yaml spack env activate wrfv3 spack concretize spack install fi - # Un-comment and update mirror_url to install from spack cache - # spack_cache_url: - # - mirror_name: gcs_cache - # mirror_url: gs://optionally_set_spack_cache_bucket - id: controller-setup source: modules/scripts/startup-script diff --git a/examples/serverless-batch-mpi.yaml b/examples/serverless-batch-mpi.yaml index c4711dd49b..554144bce1 100644 --- a/examples/serverless-batch-mpi.yaml +++ b/examples/serverless-batch-mpi.yaml @@ -78,6 +78,10 @@ deployment_groups: - - $%compilers - - $^mpis commands: | + # Un-comment and update mirror_url to install from spack cache + # spack mirror add --scope site gcs_cache gs://optionally_set_spack_cache_bucket + # spack buildcache keys --install --trust + spack install gcc@8.2.0 target=x86_64 spack load gcc@8.2.0 target=x86_64 spack compiler find --scope site @@ -88,10 +92,6 @@ deployment_groups: spack concretize spack install fi - # Un-comment and update mirror_url to install from spack cache - # spack_cache_url: - # - mirror_name: gcs_cache - # mirror_url: gs://optionally_set_spack_cache_bucket - id: spack-build-startup source: modules/scripts/startup-script diff --git a/tools/cloud-build/daily-tests/builds/batch-mpi.yaml b/tools/cloud-build/daily-tests/builds/batch-mpi.yaml index c2f248c4d1..a762813496 100644 --- a/tools/cloud-build/daily-tests/builds/batch-mpi.yaml +++ b/tools/cloud-build/daily-tests/builds/batch-mpi.yaml @@ -56,9 +56,8 @@ steps: BUILD_ID_SHORT=$${BUILD_ID_FULL:0:6} SG_EXAMPLE=examples/serverless-batch-mpi.yaml - sed -i "s/# spack_cache_url:/spack_cache_url:/" $${SG_EXAMPLE} - sed -i "s/# - mirror_name: gcs_cache/- mirror_name: gcs_cache/" $${SG_EXAMPLE} - sed -i "s/# mirror_url: .*/ mirror_url: $${SPACK_CACHE_WRF//\//\\\/}/" $${SG_EXAMPLE} + sed -i "s/# spack mirror add .*/spack mirror add --scope site gcs_cache $${SPACK_CACHE_WRF//\//\\\/}/" $${SG_EXAMPLE} + sed -i "s/# spack buildcache keys .*/spack buildcache keys --install --trust/" $${SG_EXAMPLE} echo ' - id: wait' >> $${SG_EXAMPLE} echo ' source: community/modules/scripts/wait-for-startup' >> $${SG_EXAMPLE} diff --git a/tools/cloud-build/daily-tests/builds/spack-gromacs.yaml b/tools/cloud-build/daily-tests/builds/spack-gromacs.yaml index eeaed6e84a..64a422e71b 100644 --- a/tools/cloud-build/daily-tests/builds/spack-gromacs.yaml +++ b/tools/cloud-build/daily-tests/builds/spack-gromacs.yaml @@ -55,9 +55,8 @@ steps: BUILD_ID_SHORT=$${BUILD_ID_FULL:0:6} SG_EXAMPLE=community/examples/hpc-slurm-gromacs.yaml - sed -i "s/# spack_cache_url:/spack_cache_url:/" $${SG_EXAMPLE} - sed -i "s/# - mirror_name: gcs_cache/- mirror_name: gcs_cache/" $${SG_EXAMPLE} - sed -i "s/# mirror_url: .*/ mirror_url: $${SPACK_CACHE//\//\\\/}/" $${SG_EXAMPLE} + sed -i "s/# spack mirror add .*/spack mirror add --scope site gcs_cache $${SPACK_CACHE_WRF//\//\\\/}/" $${SG_EXAMPLE} + sed -i "s/# spack buildcache keys .*/spack buildcache keys --install --trust/" $${SG_EXAMPLE} ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml \ --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ diff --git a/tools/validate_configs/test_configs/rocky-ss.yaml b/tools/validate_configs/test_configs/rocky-ss.yaml index 2c12a0d3c4..8234e720d8 100644 --- a/tools/validate_configs/test_configs/rocky-ss.yaml +++ b/tools/validate_configs/test_configs/rocky-ss.yaml @@ -46,7 +46,6 @@ deployment_groups: source: 
./community//modules/scripts/spack-install settings: install_dir: /apps/spack - spack_cache_url: commands: | spack install gcc@10.3.0 target=x86_64 spack load gcc@10.3.0 target=x86_64 diff --git a/tools/validate_configs/test_configs/ubuntu-ss.yaml b/tools/validate_configs/test_configs/ubuntu-ss.yaml index 7d858a7a29..ca16c98f6f 100644 --- a/tools/validate_configs/test_configs/ubuntu-ss.yaml +++ b/tools/validate_configs/test_configs/ubuntu-ss.yaml @@ -45,7 +45,6 @@ deployment_groups: source: ./community//modules/scripts/spack-install settings: install_dir: /apps/spack - spack_cache_url: commands: | spack install gcc@10.3.0 target=x86_64 spack load gcc@10.3.0 target=x86_64 From 0d438ab0d074cd8118b1707af7f65d432b9015af Mon Sep 17 00:00:00 2001 From: Nick Stroud Date: Thu, 29 Jun 2023 11:18:29 -0700 Subject: [PATCH 016/144] DEPRECATE spack-install.configs in favor of using data_files and commands --- community/examples/AMD/hpc-amd-slurm.yaml | 26 ++++------- community/examples/hpc-slurm-gromacs.yaml | 11 +++-- .../examples/hpc-slurm-ramble-gromacs.yaml | 19 +++----- .../modules/scripts/spack-install/README.md | 2 +- .../modules/scripts/spack-install/main.tf | 1 - .../spack-install/templates/install_spack.tpl | 18 -------- .../scripts/spack-install/variables.tf | 45 +++++++------------ docs/tutorials/gromacs/spack-gromacs.yaml | 23 ++++------ docs/tutorials/openfoam/spack-openfoam.yaml | 23 ++++------ docs/tutorials/wrfv3/spack-wrfv3.yaml | 23 ++++------ .../hcls-blueprint.yaml | 21 ++++----- examples/serverless-batch-mpi.yaml | 15 +++---- .../test_configs/spack-buildcache.yaml | 6 +-- .../test_configs/spack-environments.yaml | 6 +-- 14 files changed, 81 insertions(+), 158 deletions(-) diff --git a/community/examples/AMD/hpc-amd-slurm.yaml b/community/examples/AMD/hpc-amd-slurm.yaml index d17562532f..710aec046e 100644 --- a/community/examples/AMD/hpc-amd-slurm.yaml +++ b/community/examples/AMD/hpc-amd-slurm.yaml @@ -45,15 +45,8 @@ deployment_groups: install_dir: /sw/spack spack_ref: v0.18.1 log_file: /var/log/spack.log - configs: - - type: file - scope: defaults - content: | - config: - build_stage: - - /opt/spack_build_stage - - type: file - scope: defaults + data_files: + - destination: /tmp/projections-config.yaml content: | modules: default: @@ -64,8 +57,7 @@ deployment_groups: - '{name}' projections: all: '{name}/{version}-{compiler.name}-{compiler.version}' - - type: file - scope: site + - destination: /tmp/slurm-external-config.yaml content: | packages: slurm: @@ -73,13 +65,6 @@ deployment_groups: - spec: slurm@22-05-8 prefix: /usr/local buildable: False - - type: file - scope: site - content: | - concretizer: - targets: - host_compatible: false - data_files: - destination: /sw/spack/openfoam_env.yaml content: | spack: @@ -106,6 +91,11 @@ deployment_groups: concretizer: unify: when_possible commands: | + spack config --scope defaults add config:build_stage:/opt/spack_build_stage + spack config --scope defaults add -f /tmp/projections-config.yaml + spack config --scope site add -f /tmp/slurm-external-config.yaml + spack config --scope site add concretizer:targets:host_compatible:false + # gcc 12.1.0 is known to have runtime failures with OpenFOAM 8 # gcc 10.3.0 is the earliest copy of gcc with Zen 3 support spack install gcc@10.3.0 %gcc@4.8.5 target=x86_64 diff --git a/community/examples/hpc-slurm-gromacs.yaml b/community/examples/hpc-slurm-gromacs.yaml index ca0f743736..b284915c46 100644 --- a/community/examples/hpc-slurm-gromacs.yaml +++ b/community/examples/hpc-slurm-gromacs.yaml @@ 
-50,12 +50,8 @@ deployment_groups: settings: install_dir: /sw/spack log_file: /var/log/spack.log - configs: - - type: single-config - scope: defaults - content: "config:build_stage:/sw/spack/spack-stage" - - type: file - scope: defaults + data_files: + - destination: /tmp/projections-config.yaml content: | modules: default: @@ -71,6 +67,9 @@ deployment_groups: # spack mirror add --scope site gcs_cache gs://optionally_set_spack_cache_bucket # spack buildcache keys --install --trust + spack config --scope defaults add config:build_stage:/sw/spack/spack-stage + spack config --scope defaults add -f /tmp/projections-config.yaml + spack install gcc@10.3.0 target=x86_64 spack load gcc@10.3.0 target=x86_64 spack compiler find --scope site diff --git a/community/examples/hpc-slurm-ramble-gromacs.yaml b/community/examples/hpc-slurm-ramble-gromacs.yaml index 2fdbcbab51..306a2de624 100644 --- a/community/examples/hpc-slurm-ramble-gromacs.yaml +++ b/community/examples/hpc-slurm-ramble-gromacs.yaml @@ -40,18 +40,13 @@ deployment_groups: settings: install_dir: /opt/apps/spack log_file: /var/log/spack.log - configs: - - type: single-config - scope: defaults - content: "config:build_stage:/opt/apps/spack/spack-stage" - - type: single-config - scope: defaults - content: "concretizer:targets:host_compatible:false" - # Uncomment and update the name and path to add a shared or personal Spack - # cache location to speed up future deployments. - # commands: | - # spack mirror add --scope site gcs_cache gs://bucket-name/... - # spack buildcache keys --install --trust + commands: | + # Un-comment and update mirror_url to install from spack cache + # spack mirror add --scope site gcs_cache gs://optionally_set_spack_cache_bucket + # spack buildcache keys --install --trust + + spack config --scope defaults add config:build_stage:/opt/apps/spack/spack-stage + spack config --scope defaults add concretizer:targets:host_compatible:false - id: ramble-setup source: community/modules/scripts/ramble-setup diff --git a/community/modules/scripts/spack-install/README.md b/community/modules/scripts/spack-install/README.md index d2d25bf164..d4e3ba314d 100644 --- a/community/modules/scripts/spack-install/README.md +++ b/community/modules/scripts/spack-install/README.md @@ -219,7 +219,7 @@ limitations under the License. | [commands](#input\_commands) | String of commands to run within this module | `string` | `null` | no | | [compilers](#input\_compilers) | DEPRECATED

The following `commands` can be used to install compilers:
spack install gcc@10.3.0 target=x86_64
spack load gcc@10.3.0 target=x86_64
spack compiler find --scope site
spack clean -s
spack unload gcc@10.3.0
Defines compilers for spack to install before installing packages. | `list(string)` | `null` | no | | [concretize\_flags](#input\_concretize\_flags) | DEPRECATED - spack concretize is now performed using the `commands` variable. | `string` | `null` | no | -| [configs](#input\_configs) | List of configuration options to set within spack.
Configs can be of type 'single-config' or 'file'.
All configs must specify content and a scope. | `list(map(any))` | `[]` | no |

The following `commands` can be used to add a single config:
spack config --scope defaults add config:default:true
Alternatively, use `data_files` to transfer a config file and use the `spack config add -f <config file>` command to add the config.

List of configuration options to set within spack. | `list(map(any))` | `null` | no | | [data\_files](#input\_data\_files) | A list of files to be transferred prior to running commands.
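
Both config styles in one sketch: a one-line `spack config add` for single values, and `data_files` plus `spack config add -f` for whole files; the contents are illustrative, taken from the blueprint updates below:

```yaml
settings:
  data_files:
  - destination: /tmp/projections-config.yaml
    content: |
      modules:
        default:
          tcl:
            all:
              conflict:
              - '{name}'
  commands: |
    spack config --scope defaults add config:build_stage:/sw/spack/spack-stage
    spack config --scope defaults add -f /tmp/projections-config.yaml
```
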
It must specify one of 'source' (absolute local file path) or 'content' (string).
It must specify a 'destination' with absolute path where file should be placed. | `list(map(string))` | `[]` | no | | [deployment\_name](#input\_deployment\_name) | Name of deployment, used to name bucket containing startup script. | `string` | n/a | yes | | [environments](#input\_environments) | DEPRECATED

The following `commands` can be used to configure an environment:
if ! spack env list \| grep -q my-env; then
spack env create my-env
fi
spack env activate my-env
spack add intel-mpi@2018.4.274 %gcc@10.3.0
spack concretize
spack install
Defines spack environments to configure.
For more information, see: https://spack.readthedocs.io/en/latest/environments.html. | `any` | `null` | no | diff --git a/community/modules/scripts/spack-install/main.tf b/community/modules/scripts/spack-install/main.tf index d60f15b417..771bcc241b 100644 --- a/community/modules/scripts/spack-install/main.tf +++ b/community/modules/scripts/spack-install/main.tf @@ -28,7 +28,6 @@ locals { INSTALL_DIR = var.install_dir SPACK_URL = var.spack_url SPACK_REF = var.spack_ref - CONFIGS = var.configs == null ? [] : var.configs CACHES_TO_POPULATE = var.caches_to_populate == null ? [] : var.caches_to_populate LOG_FILE = var.log_file == null ? "/dev/null" : var.log_file SPACK_PYTHON_VENV = var.spack_virtualenv_path diff --git a/community/modules/scripts/spack-install/templates/install_spack.tpl b/community/modules/scripts/spack-install/templates/install_spack.tpl index d21ade5c2e..22cd3a3981 100755 --- a/community/modules/scripts/spack-install/templates/install_spack.tpl +++ b/community/modules/scripts/spack-install/templates/install_spack.tpl @@ -44,24 +44,6 @@ if [ ! -d ${INSTALL_DIR} ]; then spack compiler find --scope site } &>> ${LOG_FILE} 2>&1 - echo "$PREFIX Configuring spack..." - %{for c in CONFIGS ~} - %{if c.type == "single-config" ~} - spack config --scope=${c.scope} add "${c.content}" >> ${LOG_FILE} 2>&1 - %{endif ~} - - %{if c.type == "file" ~} - { - cat << 'EOF' > ${INSTALL_DIR}/spack_conf.yaml -${c.content} -EOF - - spack config --scope=${c.scope} add -f ${INSTALL_DIR}/spack_conf.yaml - rm -f ${INSTALL_DIR}/spack_conf.yaml - } &>> ${LOG_FILE} 2>&1 - %{endif ~} - %{endfor ~} - spack gpg init else diff --git a/community/modules/scripts/spack-install/variables.tf b/community/modules/scripts/spack-install/variables.tf index 8280423748..f45d12b23f 100644 --- a/community/modules/scripts/spack-install/variables.tf +++ b/community/modules/scripts/spack-install/variables.tf @@ -134,37 +134,24 @@ variable "spack_cache_url" { } variable "configs" { - description = <` command to add the config. + + List of configuration options to set within spack. + EOT + default = null type = list(map(any)) validation { - condition = alltrue([ - for c in var.configs : contains(keys(c), "type") - ]) - error_message = "All configs must declare a type." - } - validation { - condition = alltrue([ - for c in var.configs : contains(keys(c), "scope") - ]) - error_message = "All configs must declare a scope." - } - validation { - condition = alltrue([ - for c in var.configs : contains(keys(c), "content") - ]) - error_message = "All configs must declare a content." - } - validation { - condition = alltrue([ - for c in var.configs : (c["type"] == "single-config" || c["type"] == "file") - ]) - error_message = "The 'type' must be 'single-config' or 'file'." + condition = var.configs == null + error_message = "configs is deprecated. Use commands instead. See variable documentation for proposed alternative commands." 
} } diff --git a/docs/tutorials/gromacs/spack-gromacs.yaml b/docs/tutorials/gromacs/spack-gromacs.yaml index 83cb01cce0..f834d833b4 100644 --- a/docs/tutorials/gromacs/spack-gromacs.yaml +++ b/docs/tutorials/gromacs/spack-gromacs.yaml @@ -37,15 +37,10 @@ deployment_groups: settings: install_dir: /apps/spack log_file: /var/log/spack.log - configs: - - type: file - scope: defaults - content: | - config: - build_stage: - - /apps/spack/spack-stage - - type: file - scope: defaults + compilers: + - gcc@9.3.0 target=x86_64 + data_files: + - destination: /tmp/projections-config.yaml content: | modules: default: @@ -56,8 +51,7 @@ deployment_groups: - '{name}' projections: all: '{name}/{version}-{compiler.name}-{compiler.version}' - - type: 'file' - scope: 'site' + - destination: /tmp/slurm-external-config.yaml content: | packages: slurm: @@ -65,9 +59,6 @@ deployment_groups: - spec: slurm@21-08-8-2 prefix: /usr/local buildable: False - compilers: - - gcc@9.3.0 target=x86_64 - data_files: - destination: /share/spack/gromacs_env.yaml content: | spack: @@ -91,6 +82,10 @@ deployment_groups: # spack mirror add --scope site gcs_cache gs://optionally_set_spack_cache_bucket # spack buildcache keys --install --trust + spack config --scope defaults add config:build_stage:/apps/spack/spack-stage + spack config --scope defaults add -f /tmp/projections-config.yaml + spack config --scope site add -f /tmp/slurm-external-config.yaml + if ! spack env list | grep -q gromacs; then spack env create gromacs /share/spack/gromacs_env.yaml spack env activate gromacs diff --git a/docs/tutorials/openfoam/spack-openfoam.yaml b/docs/tutorials/openfoam/spack-openfoam.yaml index 1c4779bd2c..31215c6952 100644 --- a/docs/tutorials/openfoam/spack-openfoam.yaml +++ b/docs/tutorials/openfoam/spack-openfoam.yaml @@ -37,15 +37,10 @@ deployment_groups: settings: install_dir: /apps/spack log_file: /var/log/spack.log - configs: - - type: file - scope: defaults - content: | - config: - build_stage: - - /apps/spack/spack-stage - - type: file - scope: defaults + compilers: + - gcc@9.3.0 %gcc@4.8.5 target=x86_64 + data_files: + - destination: /tmp/projections-config.yaml content: | modules: default: @@ -56,8 +51,7 @@ deployment_groups: - '{name}' projections: all: '{name}/{version}-{compiler.name}-{compiler.version}' - - type: 'file' - scope: 'site' + - destination: /tmp/slurm-external-config.yaml content: | packages: slurm: @@ -65,9 +59,6 @@ deployment_groups: - spec: slurm@21-08-8-2 prefix: /usr/local buildable: False - compilers: - - gcc@9.3.0 %gcc@4.8.5 target=x86_64 - data_files: - destination: /share/spack/openfoam_env.yaml - name: openfoam content: | @@ -99,6 +90,10 @@ deployment_groups: # spack mirror add --scope site gcs_cache gs://optionally_set_spack_cache_bucket # spack buildcache keys --install --trust + spack config --scope defaults add config:build_stage:/apps/spack/spack-stage + spack config --scope defaults add -f /tmp/projections-config.yaml + spack config --scope site add -f /tmp/slurm-external-config.yaml + if ! 
spack env list | grep -q openfoam; then spack env create openfoam /share/spack/openfoam_env.yaml spack env activate openfoam diff --git a/docs/tutorials/wrfv3/spack-wrfv3.yaml b/docs/tutorials/wrfv3/spack-wrfv3.yaml index d78b3763eb..6323dedc33 100644 --- a/docs/tutorials/wrfv3/spack-wrfv3.yaml +++ b/docs/tutorials/wrfv3/spack-wrfv3.yaml @@ -37,15 +37,10 @@ deployment_groups: settings: install_dir: /apps/spack log_file: /var/log/spack.log - configs: - - type: file - scope: defaults - content: | - config: - build_stage: - - /apps/spack/spack-stage - - type: file - scope: defaults + compilers: + - gcc@8.2.0 target=x86_64 + data_files: + - destination: /tmp/projections-config.yaml content: | modules: default: @@ -56,8 +51,7 @@ deployment_groups: - '{name}' projections: all: '{name}/{version}-{compiler.name}-{compiler.version}' - - type: 'file' - scope: 'site' + - destination: /tmp/slurm-external-config.yaml content: | packages: slurm: @@ -65,9 +59,6 @@ deployment_groups: - spec: slurm@21-08-8-2 prefix: /usr/local buildable: False - compilers: - - gcc@8.2.0 target=x86_64 - data_files: - destination: /share/spack/wrfv3_env.yaml content: | spack: @@ -91,6 +82,10 @@ deployment_groups: # spack mirror add --scope site gcs_cache gs://optionally_set_spack_cache_bucket # spack buildcache keys --install --trust + spack config --scope defaults add config:build_stage:/apps/spack/spack-stage + spack config --scope defaults add -f /tmp/projections-config.yaml + spack config --scope site add -f /tmp/slurm-external-config.yaml + if ! spack env list | grep -q wrfv3; then spack env create wrfv3 /share/spack/wrfv3_env.yaml spack env activate wrfv3 diff --git a/docs/videos/healthcare-and-life-sciences/hcls-blueprint.yaml b/docs/videos/healthcare-and-life-sciences/hcls-blueprint.yaml index 4102b211d0..f4a7284d59 100644 --- a/docs/videos/healthcare-and-life-sciences/hcls-blueprint.yaml +++ b/docs/videos/healthcare-and-life-sciences/hcls-blueprint.yaml @@ -117,15 +117,9 @@ deployment_groups: source: community/modules/scripts/spack-install settings: install_dir: /apps/spack - configs: - - type: file - scope: defaults - content: | - config: - build_stage: - - /apps/spack/spack-stage - - type: file - scope: defaults + compilers: [gcc@11.3.0 target=x86_64] + data_files: + - destination: /tmp/projections-config.yaml content: | modules: default: @@ -136,8 +130,7 @@ deployment_groups: - '{name}' projections: all: '{name}/{version}-{compiler.name}-{compiler.version}' - - type: file - scope: site + - destination: /tmp/slurm-external-config.yaml content: | packages: slurm: @@ -145,8 +138,6 @@ deployment_groups: - spec: slurm@21-08-8-2 prefix: /usr/local buildable: False - compilers: [gcc@11.3.0 target=x86_64] - data_files: - destination: /share/spack/gromacs_env.yaml content: | spack: @@ -176,6 +167,10 @@ deployment_groups: - [$%compilers] - [target=skylake] commands: | + spack config --scope defaults add config:build_stage:/apps/spack/spack-stage + spack config --scope defaults add -f /tmp/projections-config.yaml + spack config --scope site add -f /tmp/slurm-external-config.yaml + if ! 
spack env list | grep -q gromacs; then spack env create gromacs /share/spack/gromacs_env.yaml spack env activate gromacs diff --git a/examples/serverless-batch-mpi.yaml b/examples/serverless-batch-mpi.yaml index 554144bce1..36a6d63bae 100644 --- a/examples/serverless-batch-mpi.yaml +++ b/examples/serverless-batch-mpi.yaml @@ -39,15 +39,8 @@ deployment_groups: settings: spack_ref: v0.19.0 install_dir: /share/spack - configs: - - type: file - scope: defaults - content: | - config: - build_stage: - - /share/spack/spack-stage - - type: file - scope: defaults + data_files: + - destination: /tmp/projections-config.yaml content: | modules: default: @@ -58,7 +51,6 @@ deployment_groups: - '{name}' projections: all: '{name}/{version}-{compiler.name}-{compiler.version}' - data_files: - destination: /share/spack/wrfv3_env.yaml content: | spack: @@ -82,6 +74,9 @@ deployment_groups: # spack mirror add --scope site gcs_cache gs://optionally_set_spack_cache_bucket # spack buildcache keys --install --trust + spack config --scope defaults add config:build_stage:/share/spack/spack-stage + spack config --scope defaults add -f /tmp/projections-config.yaml + spack install gcc@8.2.0 target=x86_64 spack load gcc@8.2.0 target=x86_64 spack compiler find --scope site diff --git a/tools/validate_configs/test_configs/spack-buildcache.yaml b/tools/validate_configs/test_configs/spack-buildcache.yaml index a01d17eea1..f70f0ee7f2 100644 --- a/tools/validate_configs/test_configs/spack-buildcache.yaml +++ b/tools/validate_configs/test_configs/spack-buildcache.yaml @@ -33,14 +33,12 @@ deployment_groups: settings: install_dir: /apps/spack log_file: /var/log/spack.log - configs: - - type: 'single-config' - scope: 'site' - content: 'config:install_tree:padded_length:128' data_files: - source: /path/to/local/spack_gpg_key.pub destination: /apps/spack_gpg_key.pub commands: | + spack config --scope site add config:install_tree:padded_length:128 + spack gpg init spack gpg trust /apps/spack_gpg_key.pub diff --git a/tools/validate_configs/test_configs/spack-environments.yaml b/tools/validate_configs/test_configs/spack-environments.yaml index 5828479dbe..8ca29f1fed 100644 --- a/tools/validate_configs/test_configs/spack-environments.yaml +++ b/tools/validate_configs/test_configs/spack-environments.yaml @@ -35,10 +35,6 @@ deployment_groups: spack_url: https://github.com/spack/spack spack_ref: v0.17.1 log_file: /var/log/spack.log - configs: - - type: 'single-config' - scope: 'site' - content: 'config:install_tree:padded_length:128' data_files: - destination: /apps/spack/env_file.yaml content: | @@ -64,6 +60,8 @@ deployment_groups: specs: intel-mpi@20184.274 %gcc@10.3.0 commands: | + spack config --scope site add config:install_tree:padded_length:128 + spack gpg init spack gpg create "Robert Pirsig" pirsig@zamm.org From 4743528ccf3b569a490c57a5d1da88fe058bfaa6 Mon Sep 17 00:00:00 2001 From: Nick Stroud Date: Thu, 29 Jun 2023 11:23:54 -0700 Subject: [PATCH 017/144] Update missed usage of spack-install.compilers --- docs/tutorials/gromacs/spack-gromacs.yaml | 6 ++++-- docs/tutorials/openfoam/spack-openfoam.yaml | 7 ++++--- docs/tutorials/wrfv3/spack-wrfv3.yaml | 6 ++++-- .../healthcare-and-life-sciences/hcls-blueprint.yaml | 5 ++++- 4 files changed, 16 insertions(+), 8 deletions(-) diff --git a/docs/tutorials/gromacs/spack-gromacs.yaml b/docs/tutorials/gromacs/spack-gromacs.yaml index f834d833b4..85c6b93fc0 100644 --- a/docs/tutorials/gromacs/spack-gromacs.yaml +++ b/docs/tutorials/gromacs/spack-gromacs.yaml @@ -37,8 +37,6 @@ 
deployment_groups: settings: install_dir: /apps/spack log_file: /var/log/spack.log - compilers: - - gcc@9.3.0 target=x86_64 data_files: - destination: /tmp/projections-config.yaml content: | @@ -86,6 +84,10 @@ deployment_groups: spack config --scope defaults add -f /tmp/projections-config.yaml spack config --scope site add -f /tmp/slurm-external-config.yaml + spack install gcc@9.3.0 target=x86_64 + spack load gcc@9.3.0 target=x86_64 + spack compiler find --scope site + if ! spack env list | grep -q gromacs; then spack env create gromacs /share/spack/gromacs_env.yaml spack env activate gromacs diff --git a/docs/tutorials/openfoam/spack-openfoam.yaml b/docs/tutorials/openfoam/spack-openfoam.yaml index 31215c6952..1cd694f9f0 100644 --- a/docs/tutorials/openfoam/spack-openfoam.yaml +++ b/docs/tutorials/openfoam/spack-openfoam.yaml @@ -37,8 +37,6 @@ deployment_groups: settings: install_dir: /apps/spack log_file: /var/log/spack.log - compilers: - - gcc@9.3.0 %gcc@4.8.5 target=x86_64 data_files: - destination: /tmp/projections-config.yaml content: | @@ -60,7 +58,6 @@ deployment_groups: prefix: /usr/local buildable: False - destination: /share/spack/openfoam_env.yaml - - name: openfoam content: | spack: definitions: @@ -94,6 +91,10 @@ deployment_groups: spack config --scope defaults add -f /tmp/projections-config.yaml spack config --scope site add -f /tmp/slurm-external-config.yaml + spack install gcc@9.3.0 %gcc@4.8.5 target=x86_64 + spack load gcc@9.3.0 %gcc@4.8.5 target=x86_64 + spack compiler find --scope site + if ! spack env list | grep -q openfoam; then spack env create openfoam /share/spack/openfoam_env.yaml spack env activate openfoam diff --git a/docs/tutorials/wrfv3/spack-wrfv3.yaml b/docs/tutorials/wrfv3/spack-wrfv3.yaml index 6323dedc33..15944153ab 100644 --- a/docs/tutorials/wrfv3/spack-wrfv3.yaml +++ b/docs/tutorials/wrfv3/spack-wrfv3.yaml @@ -37,8 +37,6 @@ deployment_groups: settings: install_dir: /apps/spack log_file: /var/log/spack.log - compilers: - - gcc@8.2.0 target=x86_64 data_files: - destination: /tmp/projections-config.yaml content: | @@ -86,6 +84,10 @@ deployment_groups: spack config --scope defaults add -f /tmp/projections-config.yaml spack config --scope site add -f /tmp/slurm-external-config.yaml + spack install gcc@8.2.0 target=x86_64 + spack load gcc@8.2.0 target=x86_64 + spack compiler find --scope site + if ! spack env list | grep -q wrfv3; then spack env create wrfv3 /share/spack/wrfv3_env.yaml spack env activate wrfv3 diff --git a/docs/videos/healthcare-and-life-sciences/hcls-blueprint.yaml b/docs/videos/healthcare-and-life-sciences/hcls-blueprint.yaml index f4a7284d59..7e8b8a86e5 100644 --- a/docs/videos/healthcare-and-life-sciences/hcls-blueprint.yaml +++ b/docs/videos/healthcare-and-life-sciences/hcls-blueprint.yaml @@ -117,7 +117,6 @@ deployment_groups: source: community/modules/scripts/spack-install settings: install_dir: /apps/spack - compilers: [gcc@11.3.0 target=x86_64] data_files: - destination: /tmp/projections-config.yaml content: | @@ -171,6 +170,10 @@ deployment_groups: spack config --scope defaults add -f /tmp/projections-config.yaml spack config --scope site add -f /tmp/slurm-external-config.yaml + spack install gcc@11.3.0 target=x86_64 + spack load gcc@11.3.0 target=x86_64 + spack compiler find --scope site + if ! 
spack env list | grep -q gromacs; then spack env create gromacs /share/spack/gromacs_env.yaml spack env activate gromacs From 14e2210833bf466a7013701991d381f00abef6c3 Mon Sep 17 00:00:00 2001 From: Nick Stroud Date: Thu, 29 Jun 2023 11:56:58 -0700 Subject: [PATCH 018/144] Add blueprint validation to tutorials and video blueprints --- tools/validate_configs/validate_configs.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/validate_configs/validate_configs.sh b/tools/validate_configs/validate_configs.sh index d94c491a91..0f8ca10814 100755 --- a/tools/validate_configs/validate_configs.sh +++ b/tools/validate_configs/validate_configs.sh @@ -116,7 +116,7 @@ check_background() { fi } -CONFIGS=$(find examples/ community/examples/ tools/validate_configs/test_configs/ -name "*.yaml" -type f) +CONFIGS=$(find examples/ community/examples/ tools/validate_configs/test_configs/ docs/tutorials/ docs/videos/build-your-own-blueprint/ -name "*.yaml" -type f) cwd=$(pwd) NPROCS=${NPROCS:-$(nproc)} echo "Running tests in $NPROCS processes" From 3a27497cd87ceebcaa6da71e551d2eecbfa32a07 Mon Sep 17 00:00:00 2001 From: Nick Stroud Date: Thu, 29 Jun 2023 15:01:47 -0700 Subject: [PATCH 019/144] Convert spack-install script to ansible --- .../modules/scripts/spack-install/README.md | 4 +- .../modules/scripts/spack-install/main.tf | 21 +++-- .../scripts/install_spack_deps.yml | 3 + .../spack-install/templates/install_spack.tpl | 53 ------------- .../templates/spack_setup.yml.tpl | 79 +++++++++++++++++++ .../scripts/spack-install/variables.tf | 27 +++++-- 6 files changed, 117 insertions(+), 70 deletions(-) delete mode 100755 community/modules/scripts/spack-install/templates/install_spack.tpl create mode 100644 community/modules/scripts/spack-install/templates/spack_setup.yml.tpl diff --git a/community/modules/scripts/spack-install/README.md b/community/modules/scripts/spack-install/README.md index d4e3ba314d..f47b872879 100644 --- a/community/modules/scripts/spack-install/README.md +++ b/community/modules/scripts/spack-install/README.md @@ -216,6 +216,9 @@ limitations under the License. | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| | [caches\_to\_populate](#input\_caches\_to\_populate) | DEPRECATED

The following `commands` can be used to populate a cache:<br>MIRROR_URL=gs://my-bucket<br>spack buildcache create --mirror-url $MIRROR_URL -af \$(spack find --format /{hash});<br>spack gpg publish --mirror-url $MIRROR_URL;<br>spack buildcache update-index --mirror-url $MIRROR_URL --keys;<br>Defines caches which will be populated with the installed packages.<br><br>NOTE: GPG Keys should be installed before trying to populate a cache with packages.<br><br>NOTE: The gpg\_keys variable can be used to install existing GPG keys and create new GPG keys, both of which are acceptable for populating a cache. | `list(map(any))` | `null` | no |
+| [chgrp\_group](#input\_chgrp\_group) | Group to chgrp the Spack clone to. Default will not modify the clone. | `string` | `null` | no |
+| [chmod\_mode](#input\_chmod\_mode) | Mode to chmod the Spack clone to. Defaults to null (i.e. do not modify).<br>For usage information see:<br>https://docs.ansible.com/ansible/latest/collections/ansible/builtin/file_module.html#parameter-mode | `string` | `"a+rwxs"` | no |
+| [chown\_owner](#input\_chown\_owner) | Owner to chown the Spack clone to. Default will not modify the clone. | `string` | `null` | no |
| [commands](#input\_commands) | String of commands to run within this module | `string` | `null` | no |
| [compilers](#input\_compilers) | DEPRECATED<br><br>The following `commands` can be used to install compilers:<br>spack install gcc@10.3.0 target=x86_64<br>spack load gcc@10.3.0 target=x86_64<br>spack compiler find --scope site<br>spack clean -s<br>spack unload gcc@10.3.0<br>
Defines compilers for spack to install before installing packages. | `list(string)` | `null` | no | | [concretize\_flags](#input\_concretize\_flags) | DEPRECATED - spack concretize is now performed using the `commands` variable. | `string` | `null` | no | @@ -236,7 +239,6 @@ limitations under the License. | [spack\_ref](#input\_spack\_ref) | Git ref to checkout for spack. | `string` | `"v0.20.0"` | no | | [spack\_url](#input\_spack\_url) | URL to clone the spack repo from. | `string` | `"https://github.com/spack/spack"` | no | | [spack\_virtualenv\_path](#input\_spack\_virtualenv\_path) | Virtual environment path in which to install Spack Python interpreter and other dependencies | `string` | `"/usr/local/spack-python"` | no | -| [zone](#input\_zone) | The GCP zone where the instance is running. | `string` | n/a | yes | ## Outputs diff --git a/community/modules/scripts/spack-install/main.tf b/community/modules/scripts/spack-install/main.tf index 771bcc241b..fa0ebbc7de 100644 --- a/community/modules/scripts/spack-install/main.tf +++ b/community/modules/scripts/spack-install/main.tf @@ -21,16 +21,15 @@ locals { locals { script_content = templatefile( - "${path.module}/templates/install_spack.tpl", + "${path.module}/templates/spack_setup.yml.tpl", { - ZONE = var.zone - PROJECT_ID = var.project_id - INSTALL_DIR = var.install_dir - SPACK_URL = var.spack_url - SPACK_REF = var.spack_ref - CACHES_TO_POPULATE = var.caches_to_populate == null ? [] : var.caches_to_populate - LOG_FILE = var.log_file == null ? "/dev/null" : var.log_file - SPACK_PYTHON_VENV = var.spack_virtualenv_path + install_dir = var.install_dir + git_url = var.spack_url + git_ref = var.spack_ref + chown_owner = var.chown_owner == null ? "" : var.chown_owner + chgrp_group = var.chgrp_group == null ? "" : var.chgrp_group + chmod_mode = var.chmod_mode == null ? "" : var.chmod_mode + spack_python_env = var.spack_virtualenv_path } ) install_spack_deps_runner = { @@ -40,9 +39,9 @@ locals { "args" = "-e spack_virtualenv_path=${var.spack_virtualenv_path}" } install_spack_runner = { - "type" = "shell" + "type" = "ansible-local" "content" = local.script_content - "destination" = "install_spack.sh" + "destination" = "install_spack.yml" } } diff --git a/community/modules/scripts/spack-install/scripts/install_spack_deps.yml b/community/modules/scripts/spack-install/scripts/install_spack_deps.yml index 71625cab9f..d962113527 100644 --- a/community/modules/scripts/spack-install/scripts/install_spack_deps.yml +++ b/community/modules/scripts/spack-install/scripts/install_spack_deps.yml @@ -23,6 +23,7 @@ - name: Install pip3 and git ansible.builtin.package: name: + - python - python3-pip - git register: package @@ -30,6 +31,7 @@ retries: 5 delay: 10 until: package is success + - name: Create virtualenv for Spack # Python 3.6 is minimum we wish to support due to ease of installation on # CentOS 7 and Rocky Linux 8. 
pip 21.3.1 is the *maximum* version of pip @@ -39,6 +41,7 @@ ansible.builtin.pip: name: pip>=21.3.1 virtualenv: "{{ spack_virtualenv_path }}" + - name: Add google-cloud-storage to Spack virtualenv ansible.builtin.pip: name: google-cloud-storage diff --git a/community/modules/scripts/spack-install/templates/install_spack.tpl b/community/modules/scripts/spack-install/templates/install_spack.tpl deleted file mode 100755 index 22cd3a3981..0000000000 --- a/community/modules/scripts/spack-install/templates/install_spack.tpl +++ /dev/null @@ -1,53 +0,0 @@ -#!/bin/bash - -set -e -o pipefail - -SPACK_PYTHON=${SPACK_PYTHON_VENV}/bin/python3 - -PREFIX="spack:" - -echo "$PREFIX Beginning setup..." -if [[ $EUID -ne 0 ]]; then - echo "$PREFIX This script must be run as root" - exit 1 -fi - -# create an /etc/profile.d file that sources the Spack environment; it safely -# skips sourcing when Spack has not yet been installed -if [ ! -f /etc/profile.d/spack.sh ]; then - cat < /etc/profile.d/spack.sh -SPACK_PYTHON=${SPACK_PYTHON_VENV}/bin/python3 -if [ -f ${INSTALL_DIR}/share/spack/setup-env.sh ]; then - . ${INSTALL_DIR}/share/spack/setup-env.sh -fi -EOF - chmod 0644 /etc/profile.d/spack.sh -fi - -# Only install and configure spack if ${INSTALL_DIR} doesn't exist -if [ ! -d ${INSTALL_DIR} ]; then - - # Install spack - echo "$PREFIX Installing spack from ${SPACK_URL}..." - { - mkdir -p ${INSTALL_DIR}; - chmod a+rwx ${INSTALL_DIR}; - chmod a+s ${INSTALL_DIR}; - cd ${INSTALL_DIR}; - git clone --no-checkout ${SPACK_URL} . - } &>> ${LOG_FILE} - echo "$PREFIX Checking out ${SPACK_REF}..." - git checkout ${SPACK_REF} >> ${LOG_FILE} 2>&1 - - { - source ${INSTALL_DIR}/share/spack/setup-env.sh; - spack compiler find --scope site - } &>> ${LOG_FILE} 2>&1 - - spack gpg init - -else - source ${INSTALL_DIR}/share/spack/setup-env.sh >> ${LOG_FILE} 2>&1 -fi - -echo "$PREFIX Setup complete..." diff --git a/community/modules/scripts/spack-install/templates/spack_setup.yml.tpl b/community/modules/scripts/spack-install/templates/spack_setup.yml.tpl new file mode 100644 index 0000000000..224d6859b2 --- /dev/null +++ b/community/modules/scripts/spack-install/templates/spack_setup.yml.tpl @@ -0,0 +1,79 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +- name: Install Spack + hosts: localhost + vars: + install_dir: ${install_dir} + git_url: ${git_url} + git_ref: ${git_ref} + chmod_mode: ${chmod_mode} + chown_owner: ${chown_owner} + chgrp_group: ${chgrp_group} + spack_python_env: ${spack_python_env} + tasks: + - name: Add to profile + ansible.builtin.copy: + dest: /etc/profile.d/spack.sh + mode: '0644' + content: | + SPACK_PYTHON={{ spack_python_env }}/bin/python3 + if [ -f {{ install_dir }}/share/spack/setup-env.sh ]; then + . 
{{ install_dir }}/share/spack/setup-env.sh + fi + + - name: Create parent of install directory + ansible.builtin.file: + path: "{{ install_dir | dirname }}" + state: directory + + - name: Acquire lock + ansible.builtin.command: + mkdir "{{ install_dir | dirname }}/.install_lock" + register: lock_out + changed_when: lock_out.rc == 0 + failed_when: false + + - name: Clones into installation directory + ansible.builtin.command: git clone --branch {{ git_ref }} {{ git_url }} {{ install_dir }} + when: lock_out.rc == 0 + + - name: chgrp on installation + ansible.builtin.file: + path: "{{ install_dir }}" + group: "{{ chgrp_group }}" + recurse: true + when: chgrp_group != "" and lock_out.rc == 0 + + - name: chown on installation + ansible.builtin.file: + path: "{{ install_dir }}" + owner: "{{ chown_owner }}" + recurse: true + when: chown_owner != "" and lock_out.rc == 0 + + - name: chmod on installation + ansible.builtin.file: + path: "{{ install_dir }}" + mode: "{{ chmod_mode }}" + recurse: true + when: chmod_mode != "" and lock_out.rc == 0 + + - name: Final Spack Setup - gpg init and compiler find + ansible.builtin.shell: | + set -e + source /etc/profile.d/spack.sh + spack gpg init + spack compiler find --scope site + when: lock_out.rc == 0 diff --git a/community/modules/scripts/spack-install/variables.tf b/community/modules/scripts/spack-install/variables.tf index f45d12b23f..e4768e27fd 100644 --- a/community/modules/scripts/spack-install/variables.tf +++ b/community/modules/scripts/spack-install/variables.tf @@ -14,11 +14,6 @@ * limitations under the License. */ -variable "zone" { - description = "The GCP zone where the instance is running." - type = string -} - variable "project_id" { description = "Project in which the HPC deployment will be created." type = string @@ -44,6 +39,28 @@ variable "spack_ref" { default = "v0.20.0" } +variable "chown_owner" { + description = "Owner to chown the Spack clone to. Default will not modify the clone." + default = null + type = string +} + +variable "chgrp_group" { + description = "Group to chgrp the Spack clone to. Default will not modify the clone." + default = null + type = string +} + +variable "chmod_mode" { + description = <<-EOT + Mode to chmod the Spack clone to. Defaults to null (i.e. do not modify). 
+ For usage information see: + https://docs.ansible.com/ansible/latest/collections/ansible/builtin/file_module.html#parameter-mode + EOT + default = "a+rwxs" + type = string +} + variable "spack_virtualenv_path" { description = "Virtual environment path in which to install Spack Python interpreter and other dependencies" default = "/usr/local/spack-python" From 9a658b24efc814ba9197b9b22aff5f168b6c26c1 Mon Sep 17 00:00:00 2001 From: Nick Stroud Date: Thu, 29 Jun 2023 15:55:06 -0700 Subject: [PATCH 020/144] Refactor ramble and spack install to share code --- .../modules/scripts/ramble-setup/main.tf | 15 +++--- .../templates/ramble_setup.yml.tpl | 47 ++++++++++--------- .../modules/scripts/spack-install/main.tf | 32 +++++++++---- .../templates/spack_setup.yml.tpl | 30 ++++++------ tools/duplicate-diff.py | 4 ++ 5 files changed, 76 insertions(+), 52 deletions(-) diff --git a/community/modules/scripts/ramble-setup/main.tf b/community/modules/scripts/ramble-setup/main.tf index 3278fc0e00..f88cecdf1f 100644 --- a/community/modules/scripts/ramble-setup/main.tf +++ b/community/modules/scripts/ramble-setup/main.tf @@ -18,12 +18,15 @@ locals { setup_file = templatefile( "${path.module}/templates/ramble_setup.yml.tpl", { - install_dir = var.install_dir - ramble_url = var.ramble_url - ramble_ref = var.ramble_ref - chown_owner = var.chown_owner == null ? "" : var.chown_owner - chgrp_group = var.chgrp_group == null ? "" : var.chgrp_group - chmod_mode = var.chmod_mode == null ? "" : var.chmod_mode + app_name = "ramble" + profile_script = ". {{ install_dir }}/share/ramble/setup-env.sh" + install_dir = var.install_dir + git_url = var.ramble_url + git_ref = var.ramble_ref + chown_owner = var.chown_owner == null ? "" : var.chown_owner + chgrp_group = var.chgrp_group == null ? "" : var.chgrp_group + chmod_mode = var.chmod_mode == null ? "" : var.chmod_mode + finalize_setup_script = "echo 'no finalize setup script'" } ) diff --git a/community/modules/scripts/ramble-setup/templates/ramble_setup.yml.tpl b/community/modules/scripts/ramble-setup/templates/ramble_setup.yml.tpl index cdda2f526f..0e6d6d8116 100644 --- a/community/modules/scripts/ramble-setup/templates/ramble_setup.yml.tpl +++ b/community/modules/scripts/ramble-setup/templates/ramble_setup.yml.tpl @@ -12,16 +12,29 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-- name: Install Ramble +- name: Install Application hosts: localhost vars: + app_name: ${app_name} + profile_script: ${profile_script} install_dir: ${install_dir} - ramble_url: ${ramble_url} - ramble_ref: ${ramble_ref} + git_url: ${git_url} + git_ref: ${git_ref} chmod_mode: ${chmod_mode} chown_owner: ${chown_owner} chgrp_group: ${chgrp_group} + finalize_setup_script: ${finalize_setup_script} tasks: + - name: Print Application Name + ansible.builtin.debug: + msg: "Running installation for application: {{app_name}}" + + - name: Add profile script for application + ansible.builtin.copy: + dest: /etc/profile.d/{{ app_name }}.sh + mode: '0644' + content: "{{ profile_script }}" + - name: Create parent of install directory ansible.builtin.file: path: "{{ install_dir | dirname }}" @@ -29,46 +42,36 @@ - name: Acquire lock ansible.builtin.command: - mkdir "{{ install_dir | dirname }}/.ramble_lock" + mkdir "{{ install_dir | dirname }}/.install_{{ app_name }}_lock" register: lock_out changed_when: lock_out.rc == 0 failed_when: false - - name: Clones ramble into installation directory - ansible.builtin.git: - repo: "{{ ramble_url }}" - dest: "{{ install_dir }}" - version: "{{ ramble_ref }}" + - name: Clones into installation directory + ansible.builtin.command: git clone --branch {{ git_ref }} {{ git_url }} {{ install_dir }} when: lock_out.rc == 0 - - name: chgrp ramble installation + - name: chgrp on installation ansible.builtin.file: path: "{{ install_dir }}" group: "{{ chgrp_group }}" recurse: true when: chgrp_group != "" and lock_out.rc == 0 - - name: chown ramble installation + - name: chown on installation ansible.builtin.file: path: "{{ install_dir }}" owner: "{{ chown_owner }}" recurse: true when: chown_owner != "" and lock_out.rc == 0 - - name: chmod ramble installation + - name: chmod on installation ansible.builtin.file: path: "{{ install_dir }}" mode: "{{ chmod_mode }}" recurse: true when: chmod_mode != "" and lock_out.rc == 0 - - name: Check if ramble profile exists - ansible.builtin.stat: - path: /etc/profile.d/ramble.sh - register: profile_check - - - name: Add ramble to profile - ansible.builtin.copy: - dest: /etc/profile.d/ramble.sh - content: ". {{ install_dir }}/share/ramble/setup-env.sh" - when: not profile_check.stat.exists + - name: Finalize Setup + ansible.builtin.shell: "{{ finalize_setup_script }}" + when: lock_out.rc == 0 diff --git a/community/modules/scripts/spack-install/main.tf b/community/modules/scripts/spack-install/main.tf index fa0ebbc7de..fe8b4ca77f 100644 --- a/community/modules/scripts/spack-install/main.tf +++ b/community/modules/scripts/spack-install/main.tf @@ -20,16 +20,32 @@ locals { } locals { + profile_script = <<-EOF + SPACK_PYTHON=${var.spack_virtualenv_path}/bin/python3 + if [ -f ${var.install_dir}/share/spack/setup-env.sh ]; then + . ${var.install_dir}/share/spack/setup-env.sh + fi + EOF + + finalize_setup_script = <<-EOF + set -e + source /etc/profile.d/spack.sh + spack gpg init + spack compiler find --scope site + EOF + script_content = templatefile( "${path.module}/templates/spack_setup.yml.tpl", { - install_dir = var.install_dir - git_url = var.spack_url - git_ref = var.spack_ref - chown_owner = var.chown_owner == null ? "" : var.chown_owner - chgrp_group = var.chgrp_group == null ? "" : var.chgrp_group - chmod_mode = var.chmod_mode == null ? 
"" : var.chmod_mode - spack_python_env = var.spack_virtualenv_path + app_name = "spack" + profile_script = indent(4, yamlencode(local.profile_script)) + install_dir = var.install_dir + git_url = var.spack_url + git_ref = var.spack_ref + chown_owner = var.chown_owner == null ? "" : var.chown_owner + chgrp_group = var.chgrp_group == null ? "" : var.chgrp_group + chmod_mode = var.chmod_mode == null ? "" : var.chmod_mode + finalize_setup_script = indent(4, yamlencode(local.finalize_setup_script)) } ) install_spack_deps_runner = { @@ -89,7 +105,7 @@ module "startup_script" { resource "local_file" "debug_file_shell_install" { content = local.script_content - filename = "${path.module}/debug_install.sh" + filename = "${path.module}/debug_install.yml" } resource "local_file" "debug_file_ansible_execute" { diff --git a/community/modules/scripts/spack-install/templates/spack_setup.yml.tpl b/community/modules/scripts/spack-install/templates/spack_setup.yml.tpl index 224d6859b2..0e6d6d8116 100644 --- a/community/modules/scripts/spack-install/templates/spack_setup.yml.tpl +++ b/community/modules/scripts/spack-install/templates/spack_setup.yml.tpl @@ -12,26 +12,28 @@ # See the License for the specific language governing permissions and # limitations under the License. -- name: Install Spack +- name: Install Application hosts: localhost vars: + app_name: ${app_name} + profile_script: ${profile_script} install_dir: ${install_dir} git_url: ${git_url} git_ref: ${git_ref} chmod_mode: ${chmod_mode} chown_owner: ${chown_owner} chgrp_group: ${chgrp_group} - spack_python_env: ${spack_python_env} + finalize_setup_script: ${finalize_setup_script} tasks: - - name: Add to profile + - name: Print Application Name + ansible.builtin.debug: + msg: "Running installation for application: {{app_name}}" + + - name: Add profile script for application ansible.builtin.copy: - dest: /etc/profile.d/spack.sh + dest: /etc/profile.d/{{ app_name }}.sh mode: '0644' - content: | - SPACK_PYTHON={{ spack_python_env }}/bin/python3 - if [ -f {{ install_dir }}/share/spack/setup-env.sh ]; then - . 
{{ install_dir }}/share/spack/setup-env.sh - fi + content: "{{ profile_script }}" - name: Create parent of install directory ansible.builtin.file: @@ -40,7 +42,7 @@ - name: Acquire lock ansible.builtin.command: - mkdir "{{ install_dir | dirname }}/.install_lock" + mkdir "{{ install_dir | dirname }}/.install_{{ app_name }}_lock" register: lock_out changed_when: lock_out.rc == 0 failed_when: false @@ -70,10 +72,6 @@ recurse: true when: chmod_mode != "" and lock_out.rc == 0 - - name: Final Spack Setup - gpg init and compiler find - ansible.builtin.shell: | - set -e - source /etc/profile.d/spack.sh - spack gpg init - spack compiler find --scope site + - name: Finalize Setup + ansible.builtin.shell: "{{ finalize_setup_script }}" when: lock_out.rc == 0 diff --git a/tools/duplicate-diff.py b/tools/duplicate-diff.py index 95cb688f2d..84566fc5ff 100644 --- a/tools/duplicate-diff.py +++ b/tools/duplicate-diff.py @@ -49,6 +49,10 @@ "community/modules/scripts/ramble-execute/templates/ramble_execute.yml.tpl", "community/modules/scripts/spack-install/templates/execute_commands.yml.tpl", ], + [ + "community/modules/scripts/spack-install/templates/spack_setup.yml.tpl", + "community/modules/scripts/ramble-setup/templates/ramble_setup.yml.tpl", + ], ] for group in duplicates: From 81885b94a9f5e179ff0eec424248bf9ce9455efc Mon Sep 17 00:00:00 2001 From: Nick Stroud Date: Fri, 30 Jun 2023 12:09:26 -0700 Subject: [PATCH 021/144] Feedback #1531: no string compare, file rename, conditional profile, var naming --- .pre-commit-config.yaml | 2 +- .../modules/scripts/ramble-setup/main.tf | 12 ++++-- ...e_setup.yml.tpl => ramble_setup.yml.tftpl} | 39 +++++++------------ .../modules/scripts/spack-install/main.tf | 6 +-- ...ck_setup.yml.tpl => spack_setup.yml.tftpl} | 39 +++++++------------ tools/duplicate-diff.py | 4 +- 6 files changed, 43 insertions(+), 59 deletions(-) rename community/modules/scripts/ramble-setup/templates/{ramble_setup.yml.tpl => ramble_setup.yml.tftpl} (66%) rename community/modules/scripts/spack-install/templates/{spack_setup.yml.tpl => spack_setup.yml.tftpl} (66%) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 7a0ef7d201..63cffedf1f 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -46,7 +46,7 @@ repos: entry: python3 tools/duplicate-diff.py language: python language_version: python3 - files: '.*(\.sh|\.tf)$' + files: '.*(\.sh|\.tf|\.tftpl)$' pass_filenames: true require_serial: true - id: module-label-check diff --git a/community/modules/scripts/ramble-setup/main.tf b/community/modules/scripts/ramble-setup/main.tf index f88cecdf1f..44bc02d9b9 100644 --- a/community/modules/scripts/ramble-setup/main.tf +++ b/community/modules/scripts/ramble-setup/main.tf @@ -15,11 +15,17 @@ */ locals { + profile_script = <<-EOF + if [ -f ${var.install_dir}/share/ramble/setup-env.sh ]; then + . ${var.install_dir}/share/ramble/setup-env.sh + fi + EOF + setup_file = templatefile( - "${path.module}/templates/ramble_setup.yml.tpl", + "${path.module}/templates/ramble_setup.yml.tftpl", { - app_name = "ramble" - profile_script = ". 
{{ install_dir }}/share/ramble/setup-env.sh" + sw_name = "ramble" + profile_script = indent(4, yamlencode(local.profile_script)) install_dir = var.install_dir git_url = var.ramble_url git_ref = var.ramble_ref diff --git a/community/modules/scripts/ramble-setup/templates/ramble_setup.yml.tpl b/community/modules/scripts/ramble-setup/templates/ramble_setup.yml.tftpl similarity index 66% rename from community/modules/scripts/ramble-setup/templates/ramble_setup.yml.tpl rename to community/modules/scripts/ramble-setup/templates/ramble_setup.yml.tftpl index 0e6d6d8116..9b80783fcf 100644 --- a/community/modules/scripts/ramble-setup/templates/ramble_setup.yml.tpl +++ b/community/modules/scripts/ramble-setup/templates/ramble_setup.yml.tftpl @@ -12,10 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -- name: Install Application +- name: Install Software hosts: localhost vars: - app_name: ${app_name} + sw_name: ${sw_name} profile_script: ${profile_script} install_dir: ${install_dir} git_url: ${git_url} @@ -25,15 +25,16 @@ chgrp_group: ${chgrp_group} finalize_setup_script: ${finalize_setup_script} tasks: - - name: Print Application Name + - name: Print Software Name ansible.builtin.debug: - msg: "Running installation for application: {{app_name}}" + msg: "Running installation for software: {{ sw_name }}" - - name: Add profile script for application + - name: Add profile script for software ansible.builtin.copy: - dest: /etc/profile.d/{{ app_name }}.sh + dest: /etc/profile.d/{{ sw_name }}.sh mode: '0644' content: "{{ profile_script }}" + when: profile_script - name: Create parent of install directory ansible.builtin.file: @@ -42,7 +43,7 @@ - name: Acquire lock ansible.builtin.command: - mkdir "{{ install_dir | dirname }}/.install_{{ app_name }}_lock" + mkdir "{{ install_dir | dirname }}/.install_{{ sw_name }}_lock" register: lock_out changed_when: lock_out.rc == 0 failed_when: false @@ -51,27 +52,15 @@ ansible.builtin.command: git clone --branch {{ git_ref }} {{ git_url }} {{ install_dir }} when: lock_out.rc == 0 - - name: chgrp on installation + - name: Set ownership and permissions ansible.builtin.file: path: "{{ install_dir }}" - group: "{{ chgrp_group }}" + mode: "{{ chmod_mode | default(omit, true) }}" + owner: "{{ chown_owner | default(omit, true) }}" + group: "{{ chgrp_group | default(omit, true) }}" recurse: true - when: chgrp_group != "" and lock_out.rc == 0 - - - name: chown on installation - ansible.builtin.file: - path: "{{ install_dir }}" - owner: "{{ chown_owner }}" - recurse: true - when: chown_owner != "" and lock_out.rc == 0 - - - name: chmod on installation - ansible.builtin.file: - path: "{{ install_dir }}" - mode: "{{ chmod_mode }}" - recurse: true - when: chmod_mode != "" and lock_out.rc == 0 + when: lock_out.rc == 0 - name: Finalize Setup ansible.builtin.shell: "{{ finalize_setup_script }}" - when: lock_out.rc == 0 + when: lock_out.rc == 0 and finalize_setup_script diff --git a/community/modules/scripts/spack-install/main.tf b/community/modules/scripts/spack-install/main.tf index fe8b4ca77f..c3ab3eae09 100644 --- a/community/modules/scripts/spack-install/main.tf +++ b/community/modules/scripts/spack-install/main.tf @@ -23,7 +23,7 @@ locals { profile_script = <<-EOF SPACK_PYTHON=${var.spack_virtualenv_path}/bin/python3 if [ -f ${var.install_dir}/share/spack/setup-env.sh ]; then - . ${var.install_dir}/share/spack/setup-env.sh + . 
${var.install_dir}/share/spack/setup-env.sh fi EOF @@ -35,9 +35,9 @@ locals { EOF script_content = templatefile( - "${path.module}/templates/spack_setup.yml.tpl", + "${path.module}/templates/spack_setup.yml.tftpl", { - app_name = "spack" + sw_name = "spack" profile_script = indent(4, yamlencode(local.profile_script)) install_dir = var.install_dir git_url = var.spack_url diff --git a/community/modules/scripts/spack-install/templates/spack_setup.yml.tpl b/community/modules/scripts/spack-install/templates/spack_setup.yml.tftpl similarity index 66% rename from community/modules/scripts/spack-install/templates/spack_setup.yml.tpl rename to community/modules/scripts/spack-install/templates/spack_setup.yml.tftpl index 0e6d6d8116..9b80783fcf 100644 --- a/community/modules/scripts/spack-install/templates/spack_setup.yml.tpl +++ b/community/modules/scripts/spack-install/templates/spack_setup.yml.tftpl @@ -12,10 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -- name: Install Application +- name: Install Software hosts: localhost vars: - app_name: ${app_name} + sw_name: ${sw_name} profile_script: ${profile_script} install_dir: ${install_dir} git_url: ${git_url} @@ -25,15 +25,16 @@ chgrp_group: ${chgrp_group} finalize_setup_script: ${finalize_setup_script} tasks: - - name: Print Application Name + - name: Print Software Name ansible.builtin.debug: - msg: "Running installation for application: {{app_name}}" + msg: "Running installation for software: {{ sw_name }}" - - name: Add profile script for application + - name: Add profile script for software ansible.builtin.copy: - dest: /etc/profile.d/{{ app_name }}.sh + dest: /etc/profile.d/{{ sw_name }}.sh mode: '0644' content: "{{ profile_script }}" + when: profile_script - name: Create parent of install directory ansible.builtin.file: @@ -42,7 +43,7 @@ - name: Acquire lock ansible.builtin.command: - mkdir "{{ install_dir | dirname }}/.install_{{ app_name }}_lock" + mkdir "{{ install_dir | dirname }}/.install_{{ sw_name }}_lock" register: lock_out changed_when: lock_out.rc == 0 failed_when: false @@ -51,27 +52,15 @@ ansible.builtin.command: git clone --branch {{ git_ref }} {{ git_url }} {{ install_dir }} when: lock_out.rc == 0 - - name: chgrp on installation + - name: Set ownership and permissions ansible.builtin.file: path: "{{ install_dir }}" - group: "{{ chgrp_group }}" + mode: "{{ chmod_mode | default(omit, true) }}" + owner: "{{ chown_owner | default(omit, true) }}" + group: "{{ chgrp_group | default(omit, true) }}" recurse: true - when: chgrp_group != "" and lock_out.rc == 0 - - - name: chown on installation - ansible.builtin.file: - path: "{{ install_dir }}" - owner: "{{ chown_owner }}" - recurse: true - when: chown_owner != "" and lock_out.rc == 0 - - - name: chmod on installation - ansible.builtin.file: - path: "{{ install_dir }}" - mode: "{{ chmod_mode }}" - recurse: true - when: chmod_mode != "" and lock_out.rc == 0 + when: lock_out.rc == 0 - name: Finalize Setup ansible.builtin.shell: "{{ finalize_setup_script }}" - when: lock_out.rc == 0 + when: lock_out.rc == 0 and finalize_setup_script diff --git a/tools/duplicate-diff.py b/tools/duplicate-diff.py index 84566fc5ff..cd7e2bfd95 100644 --- a/tools/duplicate-diff.py +++ b/tools/duplicate-diff.py @@ -50,8 +50,8 @@ "community/modules/scripts/spack-install/templates/execute_commands.yml.tpl", ], [ - "community/modules/scripts/spack-install/templates/spack_setup.yml.tpl", - 
"community/modules/scripts/ramble-setup/templates/ramble_setup.yml.tpl", + "community/modules/scripts/spack-install/templates/spack_setup.yml.tftpl", + "community/modules/scripts/ramble-setup/templates/ramble_setup.yml.tftpl", ], ] From 449af77040972d9bf87a795ef27cbf58406423aa Mon Sep 17 00:00:00 2001 From: Eimantas Kazakevicius Date: Fri, 16 Jun 2023 08:51:27 +0100 Subject: [PATCH 022/144] OFE Backend: adding custom image support for cluster backend --- .../ghpcfe/cluster_manager/clusterinfo.py | 118 +++++++++++------- 1 file changed, 74 insertions(+), 44 deletions(-) diff --git a/community/front-end/ofe/website/ghpcfe/cluster_manager/clusterinfo.py b/community/front-end/ofe/website/ghpcfe/cluster_manager/clusterinfo.py index ffa68d235f..a81030d136 100644 --- a/community/front-end/ofe/website/ghpcfe/cluster_manager/clusterinfo.py +++ b/community/front-end/ofe/website/ghpcfe/cluster_manager/clusterinfo.py @@ -165,13 +165,19 @@ def _prepare_ghpc_filesystems(self): refs.append(storage_id) return ("\n\n".join(yaml), refs) - + def _prepare_ghpc_partitions(self, part_uses): yaml = [] refs = [] uses_str = self._yaml_refs_to_uses(part_uses) for (count, part) in enumerate(self.cluster.partitions.all()): part_id = f"partition_{count}" + if part.image is not None: + instance_image_yaml = f"""instance_image: + family: {part.image.family} + project: {self.cluster.project_id}""" + else: + instance_image_yaml = "" yaml.append( f""" - source: community/modules/compute/schedmd-slurm-gcp-v5-partition @@ -193,6 +199,7 @@ def _prepare_ghpc_partitions(self, part_uses): enable_smt: {part.enable_hyperthreads} machine_type: {part.machine_type} node_count_dynamic_max: {part.max_node_count} + {instance_image_yaml} """ ) @@ -217,43 +224,61 @@ def _prepare_ghpc_partitions(self, part_uses): def _yaml_refs_to_uses(self, use_list): return "\n".join([f" - {x}" for x in use_list]) + def _prepare_ghpc_yaml(self): - yaml_file = self.cluster_dir / "cluster.yaml" - project_id = json.loads(self.cluster.cloud_credential.detail)[ - "project_id" - ] - - ( - filesystems_yaml, - filesystems_references, - ) = self._prepare_ghpc_filesystems() - ( - partitions_yaml, - partitions_references, - ) = self._prepare_ghpc_partitions( - ["hpc_network"] + filesystems_references - ) + try: + yaml_file = self.cluster_dir / "cluster.yaml" + project_id = json.loads(self.cluster.cloud_credential.detail)[ + "project_id" + ] + + ( + filesystems_yaml, + filesystems_references, + ) = self._prepare_ghpc_filesystems() + ( + partitions_yaml, + partitions_references, + ) = self._prepare_ghpc_partitions( + ["hpc_network"] + filesystems_references + ) - controller_uses = self._yaml_refs_to_uses( - ["hpc_network"] + partitions_references + filesystems_references - ) - login_uses = self._yaml_refs_to_uses( - ["hpc_network"] + filesystems_references - ) + controller_uses = self._yaml_refs_to_uses( + ["hpc_network"] + partitions_references + filesystems_references + ) + login_uses = self._yaml_refs_to_uses( + ["hpc_network"] + filesystems_references + ) - controller_sa = "sa" - # TODO: Determine if these all should be different, and if so, add to - # resource to be created. NOTE though, that at the moment, GHPC won't - # let us unpack output variables, so we can't index properly. 
- # for now, just use the singular access, and only create a single acct - # compute_sa = controller_sa - # login_sa = controller_sa - - # pylint: disable=line-too-long - startup_bucket = self.config["server"]["gcs_bucket"] - with yaml_file.open("w") as f: - f.write( + controller_sa = "sa" + # TODO: Determine if these all should be different, and if so, add to + # resource to be created. NOTE though, that at the moment, GHPC won't + # let us unpack output variables, so we can't index properly. + # for now, just use the singular access, and only create a single acct + # compute_sa = controller_sa + # login_sa = controller_sa + + # pylint: disable=line-too-long + startup_bucket = self.config["server"]["gcs_bucket"] + + if self.cluster.login_node_image is not None: + login_image_yaml = f"""instance_image: + family: {self.cluster.login_node_image.family} + project: {self.cluster.project_id}""" + else: + login_image_yaml = "" + + if self.cluster.controller_node_image is not None: + controller_image_yaml = f"""instance_image: + family: {self.cluster.controller_node_image.family} + project: {self.cluster.project_id} + """ + else: + controller_image_yaml = "" + + with yaml_file.open("w") as f: + f.write( f""" blueprint_name: {self.cluster.cloud_id} @@ -305,6 +330,7 @@ def _prepare_ghpc_yaml(self): machine_type: {self.cluster.controller_instance_type} disk_type: {self.cluster.controller_disk_type} disk_size_gb: {self.cluster.controller_disk_size} + {controller_image_yaml} service_account: email: $(hpc_service_account.service_account_email) scopes: @@ -313,13 +339,13 @@ def _prepare_ghpc_yaml(self): - https://www.googleapis.com/auth/logging.write - https://www.googleapis.com/auth/devstorage.read_write - https://www.googleapis.com/auth/pubsub - controller_startup_script: | - #!/bin/bash - echo "******************************************** CALLING CONTROLLER STARTUP" - gsutil cp gs://{startup_bucket}/clusters/{self.cluster.id}/bootstrap_controller.sh - | bash - compute_startup_script: | - #!/bin/bash - gsutil cp gs://{startup_bucket}/clusters/{self.cluster.id}/bootstrap_compute.sh - | bash + controller_startup_script: | + #!/bin/bash + echo "******************************************** CALLING CONTROLLER STARTUP" + gsutil cp gs://{startup_bucket}/clusters/{self.cluster.id}/bootstrap_controller.sh - | bash + compute_startup_script: | + #!/bin/bash + gsutil cp gs://{startup_bucket}/clusters/{self.cluster.id}/bootstrap_compute.sh - | bash #TODO: enable_cleanup_compute: True #TODO: enable_cleanup_subscriptions: True use: @@ -334,6 +360,7 @@ def _prepare_ghpc_yaml(self): machine_type: {self.cluster.login_node_instance_type} disk_type: {self.cluster.login_node_disk_type} disk_size_gb: {self.cluster.login_node_disk_size} + {login_image_yaml} service_account: email: $(hpc_service_account.service_account_email) scopes: @@ -349,10 +376,13 @@ def _prepare_ghpc_yaml(self): - slurm_controller {login_uses} -""" - ) + """ + ) + # pylint: enable=line-too-long + + except Exception as E: + logger.exception(f"Exception happened creating blueprint for cluster {self.cluster.name} - {E}") - # pylint: enable=line-too-long def _prepare_bootstrap_gcs(self): template_dir = ( From 7f3111fd9a1f36ade8f8e9633283cf43c0e183db Mon Sep 17 00:00:00 2001 From: Eimantas Kazakevicius Date: Fri, 16 Jun 2023 09:08:26 +0100 Subject: [PATCH 023/144] OFE Backend: custom image functionality backend handlers --- .../front-end/ofe/website/ghpcfe/admin.py | 2 + .../website/ghpcfe/cluster_manager/image.py | 272 ++++++++++++++++++ 
.../website/ghpcfe/cluster_manager/utils.py | 52 +++- .../front-end/ofe/website/website/settings.py | 4 +- 4 files changed, 327 insertions(+), 3 deletions(-) create mode 100644 community/front-end/ofe/website/ghpcfe/cluster_manager/image.py diff --git a/community/front-end/ofe/website/ghpcfe/admin.py b/community/front-end/ofe/website/ghpcfe/admin.py index c8a8d07483..2a47a02170 100644 --- a/community/front-end/ofe/website/ghpcfe/admin.py +++ b/community/front-end/ofe/website/ghpcfe/admin.py @@ -174,3 +174,5 @@ def get_name(self, obj): admin.site.register(WorkbenchPreset) admin.site.register(AuthorisedUser) admin.site.register(WorkbenchMountPoint) +admin.site.register(StartupScript) +admin.site.register(Image) diff --git a/community/front-end/ofe/website/ghpcfe/cluster_manager/image.py b/community/front-end/ofe/website/ghpcfe/cluster_manager/image.py new file mode 100644 index 0000000000..f1aa094078 --- /dev/null +++ b/community/front-end/ofe/website/ghpcfe/cluster_manager/image.py @@ -0,0 +1,272 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +''' +This is a backend part of custom image creation fuctionality. +Frontend views will talk with functions here to perform real actions. +''' + +import logging +from . import utils +import json +import subprocess +import os +from django.conf import settings +from google.api_core.exceptions import NotFound +from google.cloud import compute_v1 + +logger = logging.getLogger(__name__) + +class ImageBackend: + """Image configuration and management class""" + + def __init__(self, image): + self.config = utils.load_config() + self.ghpc_path = self.config["baseDir"].parent.parent / "ghpc" + + self.image = image + self.image_dir = ( + self.config["baseDir"] + / "images" + / f"image_{self.image.id}" + ) + self.blueprint_name = f"image_{self.image.id}" + self.credentials_file = self.image_dir / "cloud_credentials" + + + def prepare(self): + self._create_image_dir() + self._create_blueprint() + self._run_ghpc() + self._create_builder_env() + self._create_image() + self._destroy_builder_env() + + def update_image_status(self, new_status): + self.image.status = new_status + self.image.save() + + def _create_image_dir(self): + try: + self.image_dir.mkdir(parents=True, exist_ok=True) + creds = self.image.cloud_credential.detail + with self.credentials_file.open("w") as fp: + fp.write(creds) + except OSError as e: + self.update_image_status("e") + logger.error(f"Error occurred while creating the image directory: {e}") + except IOError as e: + self.update_image_status("e") + logger.error(f"Error occurred while writing to the credentials file: {e}") + + + def _create_blueprint(self): + """ + Create HPC Toolkit blueprint that will build the image. 
+ """ + try: + blueprint_file = self.image_dir / "image.yaml" + project_id = json.loads(self.image.cloud_credential.detail)["project_id"] + scripts = self.image.startup_script.all() + runners = "" + for script in scripts: + script_path = os.path.join(settings.MEDIA_ROOT, script.content.name) + print(script_path) + runners+=f""" + - type: {script.type} + destination: {script.name} + source: {script_path}""" + + with blueprint_file.open("w") as f: + f.write( + f"""blueprint_name: {self.blueprint_name} +vars: + project_id: {project_id} + deployment_name: {self.blueprint_name} + region: {self.image.cloud_region} + zone: {self.image.cloud_zone} + network_name: {"image-"+ str(self.image.id) + "-network"} + subnetwork_name: {"image" + str(self.image.id) + "-subnetwork"} + image_name: {"image-" + self.image.name} + image_family: {"image" + self.image.family} + tag: ofe-created + +deployment_groups: +- group: builder-env + modules: + - id: network1 + source: modules/network/vpc + settings: + network_name: $(vars.network_name) + + - id: scripts_for_image + source: modules/scripts/startup-script + settings: + runners:{runners} + outputs: [startup_script] + +- group: packer-image + modules: + - id: custom-image + source: modules/packer/custom-image + kind: packer + use: + - scripts_for_image + settings: + source_image_project_id: [{self.image.source_image_project}] + source_image_family: {self.image.source_image_family} + disk_size: 50 + image_family: $(vars.image_family) + state_timeout: 30m + zone: $(vars.zone) + subnetwork_name: $(vars.subnetwork_name) + image_storage_locations: ["{self.image.cloud_region}"] + metadata: + enable-oslogin: {self.image.enable_os_login} + block-project-ssh-keys: {self.image.block_project_ssh_keys} +""" + ) + except Exception as e: + self.update_image_status("e") + print(f"Error occurred while creating blueprint: {str(e)}") + + def _run_ghpc(self): + target_dir = self.image_dir + try: + logger.info(f"Invoking ghpc create for the image {self.image.id}") + log_out_fn = target_dir / "ghpc_create_log.stdout" + log_err_fn = target_dir / "ghpc_create_log.stderr" + with log_out_fn.open("wb") as log_out: + with log_err_fn.open("wb") as log_err: + subprocess.run( + [self.ghpc_path.as_posix(), "create", "image.yaml"], + cwd=target_dir, + stdout=log_out, + stderr=log_err, + check=True, + ) + except subprocess.CalledProcessError as cpe: + self.update_image_status("e") + logger.error(f"ghpc exec failed for image {self.image.id}", exc_info=cpe) + # No logs from stdout/err - get dumped to files + raise + + def _create_builder_env(self): + """Setup builder environment on GCP.""" + extra_env = { + "GOOGLE_APPLICATION_CREDENTIALS": self.credentials_file + } + try: + logger.info("Invoking Terraform Init for builder env.") + try: + terraform_dir = os.path.join(self.image_dir, f"{self.blueprint_name}/builder-env") + packer_dir = os.path.join(self.image_dir, f"{self.blueprint_name}/packer-image") + except OSError as e: + self.update_image_status("e") + print(f"Error occurred while constructing terraform_dir: {e}") + utils.run_terraform(terraform_dir, "init") + utils.run_terraform(terraform_dir, "validate", extra_env=extra_env) + logger.info("Invoking Terraform Plan for builder env.") + utils.run_terraform(terraform_dir, "plan", extra_env=extra_env) + logger.info("Invoking Terraform Apply for builder env.") + utils.run_terraform(terraform_dir, "apply", extra_env=extra_env) + logger.info("Exporting startup script from builder env.") + utils.run_terraform(terraform_dir, "output", 
extra_env=extra_env, + arguments=[ + "-raw", + "startup_script_scripts_for_image"], + ) + utils.copy_file(f"{terraform_dir}/terraform_output_log.stdout",f"{packer_dir}/custom-image/startup_script.sh") + except subprocess.CalledProcessError as cpe: + self.update_image_status("e") + logger.error(f"Terraform exec failed for builder env, image: {self.image.id}", exc_info=cpe) + if cpe.stdout: + logger.info(" STDOUT:\n%s\n", cpe.stdout.decode("utf-8")) + if cpe.stderr: + logger.info(" STDERR:\n%s\n", cpe.stderr.decode("utf-8")) + raise + + def _create_image(self): + """Create image on GCP.""" + extra_env = { + "GOOGLE_APPLICATION_CREDENTIALS": self.credentials_file + } + try: + logger.info("Invoking Packer Init for image.") + try: + packer_dir = os.path.join(self.image_dir, f"{self.blueprint_name}/packer-image/custom-image") + except OSError as e: + self.update_image_status("e") + logger.exception(f"Error occurred while constructing packer_dir: {e}") + utils.run_packer(packer_dir, "init", arguments=["."]) + utils.run_packer(packer_dir, "validate", extra_env=extra_env, + arguments=["-var", "startup_script_file=startup_script.sh", "."]) + logger.info("Invoking Packer build for the image") + utils.run_packer(packer_dir, "build", extra_env=extra_env, + arguments=["-var", "startup_script_file=startup_script.sh", "."]) + self.update_image_status("r") + + except subprocess.CalledProcessError as cpe: + self.update_image_status("e") + logger.exception(f"Packer image build failed for image: {self.image.id}", exc_info=cpe) + if cpe.stdout: + logger.info(" STDOUT:\n%s\n", cpe.stdout.decode("utf-8")) + if cpe.stderr: + logger.info(" STDERR:\n%s\n", cpe.stderr.decode("utf-8")) + raise + except Exception as e: + logger.exception(f"Unhandled error happened durring image {self.image.id} creation.") + + def _destroy_builder_env(self): + """Destroy builder environment on GCP.""" + extra_env = { + "GOOGLE_APPLICATION_CREDENTIALS": self.credentials_file + } + try: + logger.info("Invoking Terraform Destroy for builder env.") + try: + terraform_dir = os.path.join(self.image_dir, f"{self.blueprint_name}/builder-env") + except OSError as e: + self.update_image_status("e") + print(f"Error occurred while constructing terraform_dir: {e}") + logger.info("Invoking Terraform Destroy for builder env.") + utils.run_terraform(terraform_dir, "destroy", extra_env=extra_env) + except subprocess.CalledProcessError as cpe: + self.update_image_status("e") + logger.error(f"Terraform exec failed for destroying builder env, image: {self.image.id}", exc_info=cpe) + if cpe.stdout: + logger.info(" STDOUT:\n%s\n", cpe.stdout.decode("utf-8")) + if cpe.stderr: + logger.info(" STDERR:\n%s\n", cpe.stderr.decode("utf-8")) + raise + + def delete_image(self): + project_id = json.loads(self.image.cloud_credential.detail)["project_id"] + image_name = f"image-{self.image.name}" + zone = self.image.cloud_zone + + # Create a client + client = compute_v1.ImagesClient() + + try: + # Delete the image + operation = client.delete(project=project_id, image=image_name) + operation.result() + logger.info(f"Image '{image_name}' deleted successfully from project '{project_id}' in zone '{zone}'") + + except NotFound: + logger.error(f"Image '{image_name}' not found in project '{project_id}' or zone '{zone}'") + + except Exception as e: + logger.error(f"An error occurred while deleting the image: {e}") diff --git a/community/front-end/ofe/website/ghpcfe/cluster_manager/utils.py b/community/front-end/ofe/website/ghpcfe/cluster_manager/utils.py index 
2bb7651c75..d4a0f6f909 100644 --- a/community/front-end/ofe/website/ghpcfe/cluster_manager/utils.py +++ b/community/front-end/ofe/website/ghpcfe/cluster_manager/utils.py @@ -20,6 +20,7 @@ import os import subprocess from pathlib import Path +import shutil import yaml @@ -200,8 +201,8 @@ def run_terraform(target_dir, command, arguments=None, extra_env=None): if command in ["apply", "destroy"]: cmdline.append("-auto-approve") - log_out_fn = target_dir / f"terraform_{command}_log.stdout" - log_err_fn = target_dir / f"terraform_{command}_log.stderr" + log_out_fn = Path(target_dir) / f"terraform_{command}_log.stdout" + log_err_fn = Path(target_dir) / f"terraform_{command}_log.stderr" new_env = os.environ.copy() # Don't have terraform try to re-use any existing SSH agent @@ -222,3 +223,50 @@ def run_terraform(target_dir, command, arguments=None, extra_env=None): ) return (log_out_fn, log_err_fn) + +def run_packer(target_dir, command, arguments=None, extra_env=None): + arguments = arguments if arguments else [] + extra_env = extra_env if extra_env else {} + + # There is another binary called packer on the OS + # To make sure we using correct packer specify full path + cmdline = ["/usr/bin/packer", command] + cmdline.extend(arguments) + + log_out_fn = Path(target_dir) / f"packer_{command}_log.stdout" + log_err_fn = Path(target_dir) / f"packer_{command}_log.stderr" + + new_env = os.environ.copy() + if "SSH_AUTH_SOCK" in new_env: + del new_env["SSH_AUTH_SOCK"] + new_env.update(extra_env) + + try: + with log_out_fn.open("wb") as log_out: + with log_err_fn.open("wb") as log_err: + subprocess.run( + cmdline, + cwd=target_dir, + env=new_env, + stdout=log_out, + stderr=log_err, + check=True, + ) + except subprocess.CalledProcessError as e: + # Handle the error from Packer command execution + raise RuntimeError(f"Packer command failed: {e}") + except Exception as E: + # At this point catch any other exception as well. + raise RuntimeError(f"Packer command failed: {E}") + + return (log_out_fn, log_err_fn) + + +def copy_file(source_file, destination_file): + try: + shutil.copy(source_file, destination_file) + logger.info("File copied successfully.") + except shutil.Error as e: + logger.exception(f"Error occurred while copying the file: {e}") + except IOError as e: + logger.exception(f"Error occurred while reading or writing the file: {e}") diff --git a/community/front-end/ofe/website/website/settings.py b/community/front-end/ofe/website/website/settings.py index 1d90491c4d..68a67c5d78 100644 --- a/community/front-end/ofe/website/website/settings.py +++ b/community/front-end/ofe/website/website/settings.py @@ -34,7 +34,7 @@ def get_listen_hosts(): - ip_list = ["127.0.0.1"] # Start with localhost + ip_list = ["127.0.0.1", "localhost"] # Start with localhost try: # Try to get IP info from Google Metadata metadata_headers = {"Metadata-Flavor": "Google"} @@ -81,6 +81,8 @@ def get_site_name(): # Build paths inside the project like this: BASE_DIR / 'subdir'. BASE_DIR = Path(__file__).resolve().parent.parent +MEDIA_ROOT = "/opt/gcluster/hpc-toolkit/community/front-end/website/startup-scripts:" + # Quick-start development settings - unsuitable for production # See https://docs.djangoproject.com/en/3.1/howto/deployment/checklist/ From 766be456330ffdd4639837b216dbd20f85b2846d Mon Sep 17 00:00:00 2001 From: Eimantas Kazakevicius Date: Fri, 16 Jun 2023 08:58:05 +0100 Subject: [PATCH 024/144] OFE Frontend: adding url handlers for image funct. 
--- .../front-end/ofe/website/ghpcfe/urls.py | 37 +++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/community/front-end/ofe/website/ghpcfe/urls.py b/community/front-end/ofe/website/ghpcfe/urls.py index 23db777f82..00b600dda2 100644 --- a/community/front-end/ofe/website/ghpcfe/urls.py +++ b/community/front-end/ofe/website/ghpcfe/urls.py @@ -21,6 +21,7 @@ from rest_framework import routers from . import views from .views.credentials import * +from .views.images import * from .views.clusters import * from .views.applications import * from .views.jobs import * @@ -475,4 +476,40 @@ BackendUpdateWorkbench.as_view(), name="backend-update-workbench", ), + path( + "backend/image-create/", + BackendCreateImage.as_view(), + name="backend-create-image", + ), + path( + "backend/get-regions/", + BackendListRegions.as_view(), + name="backend-list-regions", + ), +] + +# Url paths that handle custom image views +urlpatterns += [ + path("images/", ImagesListView.as_view(), name="images"), + path( + "images/create/startup-script", StartupScriptCreateView.as_view(), name="startup-script-create" + ), + path( + "images/startup-script-view/", StartupScriptDetailView.as_view(), name="startup-script-view" + ), + path( + "images/startup-script-delete/", StartupScriptDeleteView.as_view(), name="startup-script-delete" + ), + path( + "images/create/image", ImageCreateView.as_view(), name="image-create" + ), + path( + "images/image-view/", ImageDetailView.as_view(), name="image-view" + ), + path( + "images/image-delete/", ImageDeleteView.as_view(), name="image-delete" + ), + path( + "images/image-status/", ImageStatusView.as_view(), name="image-status" + ), ] From 05267fe54d16b830f24dd016fb1560d8520d049e Mon Sep 17 00:00:00 2001 From: Eimantas Kazakevicius Date: Fri, 16 Jun 2023 08:59:12 +0100 Subject: [PATCH 025/144] OFE Frontend: forms for handing image creation and image selection for cluster --- .../front-end/ofe/website/ghpcfe/forms.py | 81 +++++++++++++++++++ 1 file changed, 81 insertions(+) diff --git a/community/front-end/ofe/website/ghpcfe/forms.py b/community/front-end/ofe/website/ghpcfe/forms.py index 31a4adfce6..4a5a2a5f24 100644 --- a/community/front-end/ofe/website/ghpcfe/forms.py +++ b/community/front-end/ofe/website/ghpcfe/forms.py @@ -176,6 +176,8 @@ class Meta: "login_node_instance_type", "login_node_disk_type", "login_node_disk_size", + "login_node_image", + "controller_node_image", ) widgets = { @@ -206,6 +208,14 @@ class Meta: "num_login_nodes": forms.NumberInput( attrs={"class": "form-control"} ), + "login_node_image": forms.Select(attrs={"class": "form-control", + "id": "login-node-image", + "name": "login_node_image", + "value": "",}), + "controller_node_image": forms.Select(attrs={"class": "form-control", + "id": "controller-node-image", + "name": "controller_node_image", + "value": "",}), } @@ -857,3 +867,74 @@ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) for field in self.fields: self.fields[field].widget.attrs.update({"class": "form-control"}) + +class StartupScriptForm(forms.ModelForm): + """Custom form for StartupScript model""" + + class Meta: + model = StartupScript + + fields = ( + "name", + "description", + "type", + "content", + ) + + widgets = { + "name": forms.TextInput(attrs={"class": "form-control"}), + "description": forms.Textarea(attrs={"class": "form-control"}), + "type": forms.Select(attrs={"class": "form-control"}), + "content": forms.ClearableFileInput(attrs={"class": "form-control"}), + } + +class ImageForm(forms.ModelForm): + 
"""Custom form for Image model""" + + class Meta: + model = Image + + fields = ( + "cloud_credential", + "name", + "family", + "cloud_region", + "cloud_zone", + "source_image_project", + "source_image_family", + "startup_script", + "enable_os_login", + "block_project_ssh_keys", + "authorised_users" + ) + + widgets = { + "cloud_credential": forms.Select(attrs={"class": "form-control"}), + "name": forms.TextInput(attrs={"class": "form-control"}), + "family": forms.TextInput(attrs={"class": "form-control"}), + "cloud_region": forms.Select(attrs={"class": "form-control"}), + "cloud_zone": forms.Select(attrs={"class": "form-control"}), + "source_image_project": forms.TextInput(attrs={"class": "form-control"}), + "source_image_family": forms.TextInput(attrs={"class": "form-control"}), + "startup_script": forms.SelectMultiple(attrs={"class": "form-control"}), + "enable_os_login": forms.RadioSelect(), + "block_project_ssh_keys": forms.RadioSelect(), + "authorised_users": forms.SelectMultiple(attrs={"class": "form-control"}), + } + + def __init__(self, *args, **kwargs): + user = kwargs.pop("user", None) + super().__init__(*args, **kwargs) + self.fields["startup_script"].queryset = self.get_startup_scripts(user) + + def get_startup_scripts(self, user): + # Retrieve startup scripts owned by the user + owned_scripts = StartupScript.objects.filter(owner=user) + + # Retrieve startup scripts authorized for the user + authorized_scripts = StartupScript.objects.filter(authorised_users=user) + + # Combine the owned and authorized scripts + startup_scripts = owned_scripts | authorized_scripts + + return startup_scripts From 16c04da7038c20091a3e45331eb760c7d6998446 Mon Sep 17 00:00:00 2001 From: Eimantas Kazakevicius Date: Fri, 16 Jun 2023 09:02:41 +0100 Subject: [PATCH 026/144] OFE Frontend: HTML templates for custom image functionality. 
--- .../ofe/website/ghpcfe/static/img/image.png | Bin 0 -> 9747 bytes .../ghpcfe/templates/base_generic.html | 3 + .../ghpcfe/templates/image/image-create.html | 110 +++++++++ .../ghpcfe/templates/image/image-view.html | 66 ++++++ .../website/ghpcfe/templates/image/list.html | 224 ++++++++++++++++++ .../image/startup-script-create.html | 44 ++++ .../templates/image/startup-script-view.html | 28 +++ 7 files changed, 475 insertions(+) create mode 100644 community/front-end/ofe/website/ghpcfe/static/img/image.png create mode 100644 community/front-end/ofe/website/ghpcfe/templates/image/image-create.html create mode 100644 community/front-end/ofe/website/ghpcfe/templates/image/image-view.html create mode 100644 community/front-end/ofe/website/ghpcfe/templates/image/list.html create mode 100644 community/front-end/ofe/website/ghpcfe/templates/image/startup-script-create.html create mode 100644 community/front-end/ofe/website/ghpcfe/templates/image/startup-script-view.html diff --git a/community/front-end/ofe/website/ghpcfe/static/img/image.png b/community/front-end/ofe/website/ghpcfe/static/img/image.png new file mode 100644 index 0000000000000000000000000000000000000000..b39e9285fef7ce6ed5b3324ab0065d8af3dc4f12 GIT binary patch literal 9747 zcmaiacQjmI)b`A%(TSQMqKlFkU6jFK^b&3K-g_s==rtHM7$OOx4n~Piv?xIkz4vI* zqx1T$@1O6VcfEJ5v(A3@e)eAb?0fEAXWhGFHPsb~37!xD003eo7*rboz+(DuU}HUK zxJ;iKJxthkGHNmaKy?z~jRnpF26|~L$^y{C^xF^Mkvk0T1pp8}{%-&Q*(llvC55J{ zj=ZXa$AkF)Uu>Y2)CY{?t*!nBAnZ9)(vJ530{}1ql%O&?{d#q8(uWxy=22C6VO#LGEAN-NBSn^q1KdzfF{Sg6l-Q7)A>j3vfCfhtV<*9Z^~1LO>9;-XQ<-5UOr~*IZ`6$=Kls`6D3L^o5w07tW(E`Y&~p00!MlCT$Z(EU3OY#cK4VCA zqTFh~c8H!CXo?ywjCf>p+i7(BGg!~hbLh*+o6WUhB$UQaYU8K|)|3)NzC>@~&Gt!1 z-)y=<-Kc#$oh4~8^;cQI*NRSWpO72Bmw&IeFe5+i8)pv8^1iAY4J}M0$%`%>MN)6u zer~Ry_dC0wY`pW$ZR?qcWYv-y)`)G6nL2k!_uM%~VvN$|kC`1@HZYOU?!h0|A^l2h z=AQj7Z);IB2A(R@98CsLowf9cKj219hooA4wu9jKzB%4^Cd`aRDQaCXSeu{3EN_TH zY=6(p(uaRvX5W9gPVD<=VSip^`L9i#At~b)-fg>HvNq~%@uKCod*kh$sZ!8MRlrRqJ7I;kmjw(Rdee z=Qu8ON*(-7^5?3>rK?74v+K`SJM|rBmZg*9DF5&*1T&kz3;Q(5#e(YxE|}zGF*2AYmDI6 zb*Xo>rReO-Mv5_te&W6);&1_V+kHJ5gRh$Gp0yWqLdhA*w=8Y^u5`pw4zW7nXZQR# zn&Hi%9OVJCt!t7QWPb_@DTp&xq&Bse3xo>QDBqQFhIU?)~!Mr&< z-S&683t24KOu`#u(l1dgg)?-(q~}Ty9@R+~8MnGvO5o)V9-Z^OLb#X=KNskjty!k< z$y<|5uBZje1&m%ylj~zlUw#F|Uc%%V1z*5FeHMf5Uc)JQSuHnU<{g`1J9n4jE_EZdAIVW9sucUmGf?IsYwuN@(ev7TNG;VugJ0nxw%5 z*rIrvZ%D8sYp|E-FGlsFOfm19_;QHwuM0au?P80I z2cH-Rx^M9m<*c%AmO^nEZ^TqQ_vBak1~vhm_jt-t7O@{_2Xo{M@KJ0R=`jiYPRMAy zjuB&_W(t0O2Ny7KWdt5JClvo7PqT5}d<1Uh9sSyttr;%fbtPmvgXP&%2K`I`wOb24 zJdF$0g(s2AJ>I6qem8t&eD8+ZIjhqo1a-qv%GNQQ$rg_o$u#D(F08e>3-Q142* z+Lariae%AkBLPYizp@V_`y%}z?vowspKYL_(}f6#M5^N|zx1d@ARYA@@H-qUep3@l z+2<+oD$iNeqj;AeHP{4TB0~we+mn1X4~?OC@Yd_Ls6B6~?BRV$W=DKFJcaEg+XN2n zfa|Y2uy&O@5CTEp;VgA06W=BDA`bxpVS;rhF8WMNs6vOzK>Wo9`nZ+Q@9?mHr7l&F zubv2V_lzQA?sS<)+y+ccZZ*g>3_4s)A{`q$tVT)*#A_otjr*SMsKfvBxt-CB`egKS zXd~t!(T@>l=_4XZk-+cE<)NijHtvqvw4`LiO`lVI7#s;yXz&T_-b3XsmK~^mU`5nm z50RNhO1NEuWL9|Oz!4(Pi%Uc&^rZwf)7D?D=zR9ZlGWZfae z-)32#!myc&AE+cU0RXO)m-YI|gB-YxTLCLP3#aHlGEr6&ax${v6t(?NkAA`ACCo(! 
zh3P-W0IsamAey31_I>bA2ic{dhwSu;#Re}JB1+@%i0rm)QEVWSrV>K@YZ~B4U!Y_u z!>vhidBTH9twMD0YD94gw>uO77XEBV&e^l$pYNVw-wg*YM_(ZxM3*jjXl3;oP-dn4 zQ|t)E9XnLG<6#BpY#2sI#op#TJrsPZR_&DukC1(mSRG1a6dL zhMJBTm5d}Fyq#-?W2()?%B*Y9EnJRIut4G?fyL26yjIU!)DOzS#dNK@9?wi;=hT#5 zM&eoa%5Ic<+FGBMq}nIpaoVK|wDoQ+;esA&I;aC~GtK7(GUZ>xbHsHTSnPOchber} zhS$O{MT-9jdRZ6hZ5nJAJijF`UikP~cNn}vu{o&*;XNC`B4^@63F*FSoAslJ!hfjL z7}c2lu{#cx)pz%d7atYIwhHmU4>8JaPOVRUa;y_@j#hj`I z^71lO(z02`$MyZmIwdfafS7(g|3TiSmqzxm;FWR=^XhA$95Ft=T{KkL^x(S83&P%z1ivT*KEzx=CvKRVAPH!!k?JDvDBS9aS)QllC=)m zA1gQvLVU-K=ylvKY}w}Gk7lIol;UXtB^9aoLpVeFm}sa=@O}Xq$tdXH#XF1x;};0c z(`err?8To>O(sAlJ4@Yn2ibRuE#zfanC*4sk@eBd1*X0AE+*%8@TB`g?fU0R{gdqB z65gPe+b-$29J_k;-EWoe48Vl1Pn-$2LNXr5_H(C|?gpsYfuav+#8#!EnuXI!Lkz<< zcUzh?<=rE%@|My(3@lIlovy5)p7F0;1$Z0trP%D$l5U5hiH;_-v?r1ZOXH2lAN$T6 z_-{82-Ty^a7IE+<&R3238oj&SkYu?wM%WD%Zxjd%Dvthy7&~m@rL=uJAd%f zMrw9B&zHa-y;pI2#>tGF-`sZ}840sgIOlA*Sn$v?hCs73VD&>!f-`x39UiibVW1e7 zj3eff2f+LQqKtEFZjHZ=9v;@$5UW5phabth=C>xa&=k6rQuNosnb$LO2Kxd6+FyD6 zkuPc))A>H{%*)c;=`AyX&S}Ysvbyk`8-O9j8Z9-M#YfJ2Zv^65{(NV36gg!$zTCBGuFZg_^Y|=!C|fRP zO;@Mjxb_m>S;S~VCKxUL_1_Nzyw6qmgMR%LsDlq{e^<2x4I(xg*Y$?=H+~n!+D$#` zwZMUuO(TK34!13QQ&*+zOj@@vXaZAhshJ@k$*69Zbb3>1ddN?O?L`gL%X zH$@L1%p#^a?2anK6eUkw1S+e_6MjoS!35|zsuf*ZAH)>j1(SJr-3ML7{P;RYNh>rV z!F`4}QL0-u)fCjdJJLyw@S`9srE5{L0#!3vIXjJ9Ppk)$6Hfl3g@RdJ|xuBP+c_qTe)=@j{#gmRa-w2|5+mHc;j=}6NS!E_| z%k8_vrHdG0MI1_CUII>3>7YT8arGk#awRYZ16^`-1nZ5x&-`|3S+*2Q#6@jJwK879 zXvLPX#6M#o(keH82-%epRI1`~V9eL%B|8)GSIliV!rt0nme7L_)7yDZDb+~|-S@tM7 zje1x0skhuY`SUqiNU=;V4-3n6*(A)9K^F+G19+s9-|=tq>RaqD4>bL!iuOZO>E|E8;Vd zdd1NamQbif8Os1;br#gRTahq8Q{N}zmCpS6gu4>H$KHgctH(_@V)?^6Tas<2V!~kE z=(liHYmZ`FS%xc3-NQyz45t@-^R!B z*LK)wU!C1Ot#zRmHCflT0gno{t^lkf%dW1rf&C6VBi{`&hz%V4YC0HRY}&p|@eY(p zvPs|3o;weF+n_M)D2T0A=?{P}Uw=!-!EPBY%sr&NGthi7`%W5muRijH0<@JtopK0Zl2{Cc#*mRL9Bn%Jk<#T;a}_C8veM!umBG~D)f;!o-{-@D z2SUM73kF-&2)l`qttY)Ni-QM%x=42prIW?C6Vcz1dt11}cs7VK37>m)#VtrJ9 zAJo%Lw)ycEH66d!yE*#W1Tk1rA{|c719J zq5e};@sS`Iiw-WNN)0+^0`#9@cCdL(OjCW7<h%V0?6hd~UNm$WMVrN@Nh7PWvxzZRuV zXX2&W9z9DoEosIN{&>_62o2$zpQFL!1dVCeeY{O{MpaP${*=wGl_UcI-O+;mz!BLA zLZT2`6Nv0}V9~9vnyi!KV0>$Hlo3}ft!KpB>v@E;CLg&+t0Hf9MaT;XE>okx^&nF>Y5<*sF38NcX$RDXCzgbh?}f=j}BVA*c!77F5mnezr3K$oidd z@x@@|Nh!%4IAlfM9#PB<(uM$!PQ0i(k7x;CUqv=ve(L$Ng=Z3BgWdEi4nfP=T6VLp z70r0L0~oW%pM|fi(Zs8byX41fis(G01or0#8>s7k{4T=6!@`HKzpMY8UeS6}RZx`x zIgu~KhvA{20gt#AViKbnI}pDy2tjkH?QXULPbcyd09ZiqxqQk>wo-f_5g*A)0DA7A zN@FTRSIzk^X-ZtyXqs2!zuhC32pS&s z0{Z$wQ82RY1y!;}Bu-%P8)f?7!#fgRgVJebC2e0j0=6ZR0+x@gP_M!ml z6qgy#)+=MZ99gto&OH{|ymQ!|Rt0xbCPp77p`B$aGJ91gWdal06|MsNz0|ADBhs&3bD;o zeK|ffs~H#m=}8-ntpOEO4q9^`QcxuHxW=61g?V*FK*^`At2eQ(8E3qBV5o+zHfT)TLq|-I?_#U}X<9P-TI_Sa*cxlaR6-7Do1L<* zx2ZMt3xWI$$&2ySA|66TszlTcI|cE}itg?|k=;=|LE;rpCz7D+j%J%!Plw*LThs?8mn>|dd3HGDS;;I@OH?*``CGHy8+#FqU!BK4onOLxTH$D%Dmco+);};xVYp(!&;wNis(rq#;%Qu#S^hobM zJchLJz7?IPCowOV-rY29?eE_{MZVCr#%}IN6=v%B`(k%K_V?`{_x(Qnr@o4HZK5jk z?k6Eu!m`ez%qChq46QRo6AC#7K$#$ApozNqVig-qr=*k0EGAXd*+gJ+I&m0uqcb95tVn_w%Wpk}G*AF{sTV zXgDX0gV_#M*IOr(KTLN2>`G?c+I#8&nX|3uYh7n{pVRcNg{^zhfDV`6YT)_rO@2$o?QSbm$&5Z( zd8;up12$Un%xqWqGt0-v)9eTw+j~+V#0ju$-smlyZoOC>Z>V?7N3*l!4*nepzF72n zE#50{;LGRfRV5Wv*^@NsL7q_&L+=!XUWw@*fLJFYiBLQ|HOc(K`JV-eL}x z#E|~}93@-f3=>D+A*U5p%0ikPn-d5TIEAznK~JN#mC^YAS&q#NGm$R;{(ye)#tSl& zl7;voOv&FD=VRI6)uyWO2m6-*W@a z*ihw{{t}EYeN3l1Q-?5bZk??CC63DlPA)KT8jBSn_BQb*D?gV>@$p!1xk}`U$2Z@` z=C@}yHE4x$64Tw6AN_;Y3&(Q(TV~43kUbR&ZS(6?NlC8@JTo_Y2Y-=O$wSO6kKd&T zUi4T=o9Utgc)K+e#RP-z8y8uSNk0m6<^z6QIOaimVVbi8983Q5E?CNVZ_kT%lBW+D zd;T>n;TF7jc2go=K;CeBy(82{1YADS8Yfh4+6@xY05p*xScycPvqQdc?yAv|O*`c7 
z-&#*?En$VOxvsD)BQw+X*(!J2`2!m7M>5TSRXGJ}SiaYLO4faIy}$MGWxIJmz~#wx zl4Nn2o$=Vh*HR}2{shJG*od!+h8AX9n3hrH(<0KCOKku0Ew)5AMRE}X<_f$bjyS|o z+0NZ{Ezy*JTRzDU=}0O1a7Ey#OzJC2JYIc5GO<(M(n8K9Khye3~DY4=or z(C|V2B5z&ur`oAl;8P5Yy0=6`_>qTidGl7uO7sq`Wkzx$$2fm0(Hk>vbf_+`)~v25 zGeyqr0<^E4=ux&soW19a9ZhBQS|-w6{H^IjjV1 zjd0coa6c#LG1A`OsUC)JL+2%IA0W_ASrd^Q!!AJ{8{W{ZS52q^z4HscB^G?<_Fzho=c}4p3jZAy@ca`ZFD3YB74gCHlG=E9<`|E zR_`^OZZJAAD`b&nYMN03%8nJ)L%NL*Hb;DqhsB`Ij$GMIQ!LmzwwBA~y>VU_4Ie%B z&~wYO_$9`!OwI+eERPBRswMP_5ONePovA7OwR3uGx1#Dc%-3nsw~$IER#rd%C|a$; zQm~CtY?s|+gk5267g=L#WRq~yBg{UGF{|h~{R`@;XC0QM?`slh6^XSRq_BHV>qJ%s z4M^+#h96dKJ?R=VdSNT`N{_9RPc5x?iqwv`K4)3H{*8h|&>(=8drsc4rt#-sh{Eu! zw{if%NsBQ8B{bKUci>f{`w)WYef0%1N(t5T-QdgbFFl=bHG_n@!BLM@9$G76OCMwC zjJeKMc~1Ja*kd2QLiLaFHMV6mmP2l-$?vX>QMz-q@qVMF-S9u-#0n{Ts;fME0X!?r zj1NhUSS`HMZE~)ti>*OFX;lzzNq-Q~qXP!5&J}FKO^!M90@T;PKC7T237*^1Fz!;t z)@VL2NQvM6QI{b)$Tu5)@u5y-hKyXcn{L;F^r0AF1(sn0(vOc-P&Q>FGT*V(JUl0t zttc!7wPn*cqkbvH;95y~>M`Ik^@bPNH7h&jtV-;@`lAw9GCozjUSV8-s+(pBOJh+( z;izsml2}Q)PohM3rDW6-+o(_;|({l4F`JP=N6rD365rdF;p7KImljNTKa;D zqm>^q+V0)rQ-^1+mCyAUDb)m$e=Q&M(0liMdv!lSQW;LOR=yu6+x;{|4Ta(BfuBV7 z4cKjgANZ@~W#@EfPWvTdP9)|&rJ^<_Q8hb&i>_|2U1n6Bu{YS8(l-wC$W2f4n=CSRT~I%iQb}ut+-pNGq~bD&pUf{cP^h=7a?&{ zu}KO0Tl*IMJi6;F0R%^mOFkk03}rv%@i3}I(35bSls{7lQ5$v}-426ZyYTmy3-0E$>p>c#C*?U=_IMQnifMhUd1r#81Ei0f%H)p9 zf8LnoqKqkHDBvUfkBc^wJv{SuqgUWWGYb)Y_mp9Fx$A#_Vr%_%pJjRI4RHCd7l65c zJ^h^f$8gj-g(kC=q>~$UOfy};drqX~D)jzEa&-QJ@9{znI0^#xiys8CQK;qoF~d2g ztZPV}%A0sK&_0(`LnsznA7Qso*VDUC{ER=gB#6*|GQ8`rTK1iQn`9X;lTvSq(#J@t zmb3DwTlPN#V)PIcSwE|oQ}hMiq@+961n3Qw+)^0YvXz|=ZNf#=3z4s)3s*f|s7C^; z(%3DDxQ~juEGfE_l{P6e$mW5{;2zkt<)my>6@{BPsxl3d z{2qO9*uVm{>gr1upsPB;72ben(A#uYfPf$Qi$q}dLM!+ zFcv;JdMvXwdG*&$q)=G2gCx$|$(euMyVYT`IFI+*C3l<8LTRXmxkP;M($NBh4i_6risTN`;@y7u#ao&YgQ1$cCwbqls{*M#2wUswh?!%GCAePt zbkuy)M4Zxgafm*|1b<)XG~?8iQf;T(?5)3=MJHk5zoqy1U4OMd;3t)kfXbxqpF`N~ zV)wvqU%(ijACA7p+$TA)%QIRWTG$qtaw^pq#wIxzCftN@P=8QjKGBWn#0y7WcHO>D zBjr~36FjL^r6W~Nm-SM)-d0?8E&N;}J>bQw=tRw{bY2#L;t2WZ`;&mDzAe#^R*h|j zmptN~>s{V`;=yZiiSsnZI0S#c>=tJ+sn$Q6*S~Hfpyt01$k`hDKr>6P@{fC8AIP%W zuF*bbE%3Y4*_6#*N)(Uw<6^~bnMlmmBz`?#Z@BsTEfj(ztBpM?Z-CCX{?z^&KFHe6)l*Ruu-7rfxo&Q zs3~lF%y%tOa!@(Y+akNNbX#Q26FIcvYQEtq?R*++n4Ph|64Gvb=UY9v5hC64f{Gwg i_LC+j<^Mk6_h_8x%MYq1ZVvs|rKKdV4n@nFhy4$CAc?8~ literal 0 HcmV?d00001 diff --git a/community/front-end/ofe/website/ghpcfe/templates/base_generic.html b/community/front-end/ofe/website/ghpcfe/templates/base_generic.html index 67ba7d3a11..088c981acb 100644 --- a/community/front-end/ofe/website/ghpcfe/templates/base_generic.html +++ b/community/front-end/ofe/website/ghpcfe/templates/base_generic.html @@ -118,6 +118,9 @@ + diff --git a/community/front-end/ofe/website/ghpcfe/templates/image/image-create.html b/community/front-end/ofe/website/ghpcfe/templates/image/image-create.html new file mode 100644 index 0000000000..38611a1ee4 --- /dev/null +++ b/community/front-end/ofe/website/ghpcfe/templates/image/image-create.html @@ -0,0 +1,110 @@ + + +{% extends "base_generic.html" %} + +{% block content %} +

Create a new image

+ + {% if form.non_field_errors %} + + {% endif %} + +
+ {% csrf_token %} + + {% for field in form.visible_fields %} +
+ {{ field.label_tag }} + {{ field.errors }} + {{ field }} + {% if field.help_text %} + {{ field.help_text }} + {% endif %} +
+ {% endfor %} + +
+ +
+ + + + +{% endblock %} diff --git a/community/front-end/ofe/website/ghpcfe/templates/image/image-view.html b/community/front-end/ofe/website/ghpcfe/templates/image/image-view.html new file mode 100644 index 0000000000..ba5ec10331 --- /dev/null +++ b/community/front-end/ofe/website/ghpcfe/templates/image/image-view.html @@ -0,0 +1,66 @@ + + +{% extends "base_generic.html" %} + +{% block content %} +

Image Detail

+

ID: {{ object.id }}

+

Owner: {{ object.owner }}

+

Name: {{ object.name }}

+

Family: {{ object.family }}

+

Source Image Project: {{ object.source_image_project }}

+

Source Image Family: {{ object.source_image_family }}

+

Enable OS Login: {{ object.enable_os_login }}

+

Block Project SSH Keys: {{ object.block_project_ssh_keys }}

+

Startup Scripts:

+ {% if object.startup_script.all %} +
+
+ + + + + + + + + + + {% for script in object.startup_script.all %} + + + + + + + {% empty %} + + + + {% endfor %} + +
NameDescriptionTypeFile
{{ script.name }}{{ script.description }}{{ script.type }}{{ script.content }}
No startup scripts associated with this image.
+
+ {% else %} +

No startup scripts associated with this image.

+ {% endif %} + + + +{% endblock %} diff --git a/community/front-end/ofe/website/ghpcfe/templates/image/list.html b/community/front-end/ofe/website/ghpcfe/templates/image/list.html new file mode 100644 index 0000000000..1f96087402 --- /dev/null +++ b/community/front-end/ofe/website/ghpcfe/templates/image/list.html @@ -0,0 +1,224 @@ + + +{% extends "base_generic.html" %} + +{% block meta %} + {% if loading == 1 %} + + {% endif %} +{% endblock %} + +{% block extrameta %} +{% load static %} + +{% endblock %} + +{% block content %} +

Startup scripts

+ {% if startupscripts %} +
+ + + + + + + + + + + + + {% for script in startupscripts %} + + + + + + + + + + {% endfor %} + +
#NameDescriptionTypeFile nameActions
{{ script.id }}{{ script.name }}{{ script.description }}{{ script.type }}{{ script.content }} + View + {% if admin_view == 1 %} + + {% endif %} +
+ {% else %} +

You have not created a startup script yet. Create one!

+ {% endif %} + +
+ Add startup script + +
+ +

Images

+ {% if images %} +
+ + + + + + + + + + + + + + + {% for image in images %} + + + + + + + + + + + {% endfor %} + +
#NameFamilySource projectSource familyStartup scriptsStatusActions
{{ image.id }}{{ image.name }}{{ image.family }}{{ image.source_image_project }}{{ image.source_image_family }} + {% for script in image.startup_script.all %} + {{ script.name }} +
+ {% endfor %} +
+ + + View + {% if admin_view == 1 and image.status == "r" %} + + {% endif %} +
+ {% else %} +

You have not created an image yet. Create one!

+ {% endif %} + +
+ Create an image +{% endblock %} + + + +{% block tailscript %} + + + +{% endblock %} diff --git a/community/front-end/ofe/website/ghpcfe/templates/image/startup-script-create.html b/community/front-end/ofe/website/ghpcfe/templates/image/startup-script-create.html new file mode 100644 index 0000000000..4e630eb969 --- /dev/null +++ b/community/front-end/ofe/website/ghpcfe/templates/image/startup-script-create.html @@ -0,0 +1,44 @@ + + +{% extends "base_generic.html" %} + +{% block content %} +

Create a new startup script

+ + {% if form.non_field_errors %} + + {% endif %} + +
+ {% csrf_token %} + + {% for field in form.visible_fields %} +
+ {{ field.label_tag }} + {{ field.errors }} + {{ field }} + {% if field.help_text %} + {{ field.help_text }} + {% endif %} +
+ {% endfor %} + +
+ +
+ +{% endblock %} diff --git a/community/front-end/ofe/website/ghpcfe/templates/image/startup-script-view.html b/community/front-end/ofe/website/ghpcfe/templates/image/startup-script-view.html new file mode 100644 index 0000000000..1891e0bb98 --- /dev/null +++ b/community/front-end/ofe/website/ghpcfe/templates/image/startup-script-view.html @@ -0,0 +1,28 @@ + + +{% extends "base_generic.html" %} + +{% block content %} +

Startup Script Detail

+

ID: {{ object.id }}

+

Name: {{ object.name }}

+

Description: {{ object.description }}

+

Type: {{ object.type }}

+

File Contents:

+
{{ file_contents }}
+ Back to list
+{% endblock %}

From 548b4504098ea3a1ce5dea0ee332bc2d0a6fced2 Mon Sep 17 00:00:00 2001
From: Eimantas Kazakevicius
Date: Fri, 16 Jun 2023 09:09:44 +0100
Subject: [PATCH 027/144] OFE Frontend: classes that implement frontend object for custom images

---
 .../front-end/ofe/website/ghpcfe/models.py | 144 +++++++++++++++++-
 1 file changed, 141 insertions(+), 3 deletions(-)

diff --git a/community/front-end/ofe/website/ghpcfe/models.py b/community/front-end/ofe/website/ghpcfe/models.py
index a039ab6915..21e3e5b722 100644
--- a/community/front-end/ofe/website/ghpcfe/models.py
+++ b/community/front-end/ofe/website/ghpcfe/models.py
@@ -534,6 +534,122 @@ def mount_source(self):
     def __str__(self):
         return f"{self.mount_path} on {self.cluster}"

+class StartupScript(models.Model):
+    """Model representing a startup script for custom image build."""
+
+    def __str__(self):
+        return self.name
+
+    name = models.CharField(
+        max_length=30,
+        help_text="Enter a startup script name",
+    )
+    description = models.TextField(
+        max_length=4000,
+        help_text="(Optional) description of this startup script",
+        blank=True,
+        null=True,
+    )
+    STARTUP_SCRIPT_TYPES = (
+        ("shell", "Shell script"),
+        ("ansible-local", "Ansible playbook"),
+    )
+    type = models.CharField(
+        max_length=13,
+        choices=STARTUP_SCRIPT_TYPES,
+        blank=False,
+        default="shell",
+        help_text="Type of this startup script",
+    )
+    content = models.FileField(
+        upload_to="startup-scripts/",
+        help_text="Startup script file."
+    )
+    owner = models.ForeignKey(
+        User,
+        related_name="startup_script_owner",
+        help_text="Who owns this startup script?",
+        on_delete=models.RESTRICT,
+    )
+    authorised_users = models.ManyToManyField(
+        User,
+        related_name="startup_script_authorised_users",
+        help_text="Select other users authorised to use this startup script",
+    )
+
+class Image(CloudResource):
+    """Model representing a custom node image."""
+
+    name = models.CharField(
+        max_length=30,
+        help_text="Enter an image name",
+        unique=True,
+    )
+
+    family = models.CharField(
+        max_length=30,
+        help_text="Enter your new image family",
+        unique=True,
+    )
+
+    source_image_project = models.CharField(
+        max_length=60,
+        help_text="Enter a source image project",
+        blank=False,
+        default="schedmd-slurm-public",
+    )
+
+    source_image_family = models.CharField(
+        max_length=60,
+        help_text="Enter a source image family",
+        blank=False,
+        default="schedmd-v5-slurm-22-05-8-rocky-linux-8",
+    )
+
+    startup_script = models.ManyToManyField(
+        StartupScript,
+        help_text="Which startup scripts to use?",
+    )
+
+    enable_os_login = models.CharField(
+        max_length=5,
+        help_text="Enable OS Login during the image creation?",
+        choices=(("TRUE", "TRUE"),("FALSE", "FALSE")),
+        default="TRUE",
+    )
+
+    block_project_ssh_keys = models.CharField(
+        max_length=5,
+        help_text="Don't use SSH keys in project metadata to create users?",
+        choices=(("TRUE", "TRUE"),("FALSE", "FALSE")),
+        default="TRUE",
+    )
+    owner = models.ForeignKey(
+        User,
+        related_name="image_owner",
+        help_text="Who owns this image?",
+        on_delete=models.RESTRICT,
+    )
+    authorised_users = models.ManyToManyField(
+        User,
+        related_name="image_authorised_users",
+        help_text="Select other users authorised to use this image",
+    )
+    IMAGE_STATUS = (
+        ("n", "Image is being newly configured by user"),
+        ("c", "Image is being created"),
+        ("r", "Image is ready"),
+        ("e", "Image creation has failed"),
+    )
+    status = models.CharField(
+        max_length=1,
+        choices=IMAGE_STATUS,
+        default="n",
+        help_text="Status of this
image", + ) + + def __str__(self): + return self.name class Cluster(CloudResource): """Model representing a cluster""" @@ -654,6 +770,24 @@ class Cluster(CloudResource): null=True, blank=True, ) + login_node_image = models.ForeignKey( + Image, + related_name="login_node_image", + help_text="Select login node image", + blank=True, + null=True, + default=None, + on_delete=models.RESTRICT, + ) + controller_node_image = models.ForeignKey( + Image, + related_name="controller_node_image", + help_text="Select controller node image", + blank=True, + null=True, + default=None, + on_delete=models.RESTRICT + ) def get_access_key(self): return Token.objects.get(user=self.owner) @@ -731,10 +865,14 @@ class ClusterPartition(models.Model): max_length=40, help_text="GCP Instance Type name", ) - image = models.CharField( - max_length=4096, - help_text="OS Image path", + image = models.ForeignKey( + Image, + related_name="compute_node_image", + help_text="Select compute node image", blank=True, + null=True, + default=None, + on_delete=models.RESTRICT, ) max_node_count = models.PositiveIntegerField( validators=[MinValueValidator(1)], From 02d2b78a685bd80027370b229827dc927213f24f Mon Sep 17 00:00:00 2001 From: Eimantas Kazakevicius Date: Fri, 16 Jun 2023 09:10:30 +0100 Subject: [PATCH 028/144] OFE Frontend: views that resolve front end templates and calls to backend functions. --- .../ofe/website/ghpcfe/views/images.py | 289 ++++++++++++++++++ 1 file changed, 289 insertions(+) create mode 100644 community/front-end/ofe/website/ghpcfe/views/images.py diff --git a/community/front-end/ofe/website/ghpcfe/views/images.py b/community/front-end/ofe/website/ghpcfe/views/images.py new file mode 100644 index 0000000000..fe906bb858 --- /dev/null +++ b/community/front-end/ofe/website/ghpcfe/views/images.py @@ -0,0 +1,289 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +""" clusters.py """ + +import os +from asgiref.sync import sync_to_async +from django.shortcuts import get_object_or_404 +from django.contrib.auth.mixins import LoginRequiredMixin +from django.contrib.auth.views import redirect_to_login +from django.contrib.auth.mixins import UserPassesTestMixin +from django.core.exceptions import PermissionDenied +from django.urls import reverse_lazy +from django.conf import settings +from django.http import ( + HttpResponseRedirect, + JsonResponse, +) +from django.urls import reverse +from django.views import generic +from django.views.generic.edit import CreateView +from ..models import StartupScript, Image, Credential +from ..forms import StartupScriptForm, ImageForm +from ..cluster_manager.image import ImageBackend +from ..cluster_manager.cloud_info import get_region_zone_info +from ..views.asyncview import BackendAsyncView + +import logging + +logger = logging.getLogger(__name__) + + +class ImagesListView(LoginRequiredMixin, generic.ListView): + """Custom ListView for StartupScript and Images model""" + + model = StartupScript + template_name = "image/list.html" + + def get_queryset(self): + # If user is admin, return all objects. + if self.request.user.has_admin_role(): + startup_scripts = StartupScript.objects.all() + images = Image.objects.all() + return startup_scripts, images + else: + # Retrieve startup scripts and images owned by the user + startup_scripts = StartupScript.objects.filter(owner=self.request.user) + images = Image.objects.filter(owner=self.request.user) + + # Retrieve startup scripts and images authorized for the user + authorized_startup_scripts = StartupScript.objects.filter(authorised_users=self.request.user) + authorized_images = Image.objects.filter(authorised_users=self.request.user) + + # Combine the owned and authorized objects + startup_scripts = startup_scripts | authorized_startup_scripts + images = images | authorized_images + + return startup_scripts, images + + def get_context_data(self, *args, **kwargs): + loading = 0 + admin_view = 0 + if self.request.user.has_admin_role(): + admin_view = 1 + context = super().get_context_data(*args, **kwargs) + context["loading"] = loading + context["admin_view"] = admin_view + context["navtab"] = "image" + + startup_scripts, images = self.get_queryset() + context["startupscripts"] = startup_scripts + context["images"] = images + + return context + +class StartupScriptDetailView(LoginRequiredMixin, generic.DetailView): + """Custom DetailView for StartupScript model""" + + model = StartupScript + template_name = "image/startup-script-view.html" + + def is_admin_or_authorized_user(self, startup_script): + user = self.request.user + return ( + user.has_admin_role() + or user == startup_script.owner + or user in startup_script.authorised_users.all() + ) + + def get_context_data(self, **kwargs): + context = super().get_context_data(**kwargs) + startup_script = self.get_object() + + # Check if the user is an admin, the owner, or authorized for the startup script + if self.is_admin_or_authorized_user(startup_script): + file_path = os.path.join(settings.MEDIA_ROOT, startup_script.content.name) + try: + with open(file_path, 'r') as file: + try: + context["file_contents"] = file.read() + except UnicodeDecodeError: + context["file_contents"] = "Error: Unable to decode file" + except IOError: + context["file_contents"] = "Error: Unable to read file" + else: + raise PermissionDenied() + + context["navtab"] = "image" + return context + +class StartupScriptCreateView(LoginRequiredMixin, 
generic.CreateView): + """Custom CreateView for StartupScript model""" + + success_url = reverse_lazy("images") + form_class = StartupScriptForm + template_name = "image/startup-script-create.html" + + def get_context_data(self, **kwargs): + context = super().get_context_data(**kwargs) + context["navtab"] = "image" + return context + + # Set currently logged-in user as owner. + def form_valid(self, form): + form.instance.owner = self.request.user + return super().form_valid(form) + + +class StartupScriptDeleteView(UserPassesTestMixin, generic.View): + """Custom view for deleting StartupScript objects""" + + def test_func(self): + return self.request.user.is_superuser + + def post(self, request, *args, **kwargs): + startup_script = StartupScript.objects.get(pk=self.kwargs['pk']) + file_path = os.path.join(settings.MEDIA_ROOT, startup_script.content.name) + try: + os.remove(file_path) + logger.info("File deleted successfully.") + except FileNotFoundError: + logger.error("Error: File not found.") + except PermissionError: + logger.error("Error: Permission denied.") + except Exception as e: + logger.exception(f"Error: {str(e)}") + + startup_script.delete() + response = {'success': True} + return JsonResponse(response) + +class ImageCreateView(LoginRequiredMixin, CreateView): + """Custom CreateView for Image model""" + + form_class = ImageForm + template_name = "image/image-create.html" + + def get_success_url(self): + image = self.object + success_url = reverse("backend-create-image", kwargs={"pk": image.pk}) + return success_url + + def get_form_kwargs(self): + kwargs = super().get_form_kwargs() + kwargs["user"] = self.request.user + return kwargs + + def get_context_data(self, **kwargs): + context = super().get_context_data(**kwargs) + context["navtab"] = "image" + return context + + # Set currently logged-in user as owner. 
+ def form_valid(self, form): + form.instance.owner = self.request.user + return super().form_valid(form) + +class ImageDetailView(LoginRequiredMixin, generic.DetailView): + """Custom DetailView for Image model""" + + model = Image + template_name = "image/image-view.html" + + def is_admin_or_authorized_user(self, image): + user = self.request.user + return ( + user.has_admin_role() + or user == image.owner + or user in image.authorised_users.all() + ) + + def get_context_data(self, **kwargs): + context = super().get_context_data(**kwargs) + image = self.get_object() + startup_scripts = image.startup_script.all() + context["startup_scripts"] = startup_scripts + + # Check if the user is an admin, the owner, or authorized for the image + if self.is_admin_or_authorized_user(image): + context["navtab"] = "image" + return context + else: + raise PermissionDenied() + + +class ImageDeleteView(UserPassesTestMixin, generic.View): + """Custom view for deleting Image objects""" + + def test_func(self): + return self.request.user.is_superuser + + def post(self, request, *args, **kwargs): + image = Image.objects.get(pk=self.kwargs['pk']) + img_backend = ImageBackend(image) + img_backend.delete_image() + image.delete() + response = {'success': True} + return JsonResponse(response) + +class ImageStatusView(LoginRequiredMixin, generic.View): + """Custom view for Image model that returns Image status""" + + def is_admin_or_authorized_user(self, image): + user = self.request.user + return ( + user.has_admin_role() + or user == image.owner + or user in image.authorised_users.all() + ) + + def get(self, request, pk, *args, **kwargs): + image = get_object_or_404(Image, pk=pk) + + # Check if the user is an admin, the owner, or authorized for the image + if self.is_admin_or_authorized_user(image): + response = {'status': image.status} + return JsonResponse(response) + + else: + raise PermissionDenied() + +class BackendCreateImage(BackendAsyncView): + """A view to make async call to create a new image""" + + @sync_to_async + def get_orm(self, image_id): + image = Image.objects.get(pk=image_id) + creds = image.cloud_credential + return (image, creds) + + def cmd(self, unused_task_id, unused_token, image, creds): + img_backend = ImageBackend(image) + img_backend.prepare() + + async def get(self, request, pk): + """this will invoke the background tasks and return immediately""" + # Mixins don't yet work with Async views + if not await sync_to_async(lambda: request.user.is_authenticated)(): + return redirect_to_login(request.get_full_path) + await self.test_user_is_cluster_admin(request.user) + + args = await self.get_orm(pk) + await self.create_task("Create Image", *args) + return HttpResponseRedirect( + reverse("images") + ) + +class BackendListRegions(LoginRequiredMixin, generic.View): + """Custom view that returns json of available GCP regions.""" + + def get(self, request, pk, *args, **kwargs): + credentials = get_object_or_404(Credential, pk=pk) + regions = get_region_zone_info("GCP", credentials.detail) + return JsonResponse(regions) + + + + From e987757af24681d37ba27306132120494d4c9512 Mon Sep 17 00:00:00 2001 From: Eimantas Kazakevicius Date: Fri, 16 Jun 2023 09:11:19 +0100 Subject: [PATCH 029/144] OFE Frontend: allow clusters to use custom images in the front end --- .../ghpcfe/templates/cluster/create_form.html | 34 +++++++++++++++++++ .../ghpcfe/templates/cluster/detail.html | 12 +++++++ 2 files changed, 46 insertions(+) diff --git 
a/community/front-end/ofe/website/ghpcfe/templates/cluster/create_form.html b/community/front-end/ofe/website/ghpcfe/templates/cluster/create_form.html index 4072dc9ca5..b95e9c3578 100644 --- a/community/front-end/ofe/website/ghpcfe/templates/cluster/create_form.html +++ b/community/front-end/ofe/website/ghpcfe/templates/cluster/create_form.html @@ -117,6 +117,40 @@

Create a new cluster

{{ form.spackdir.help_text }}
+ + +
+ +
+
+ + +
+
+ +
+ + +
+ {{ form.login_node_image.errors }} + {{ form.login_node_image.label_tag }} + {{ form.login_node_image }} + {{ form.login_node_image.help_text }} +
+ +
+ {{ form.controller_node_image.errors }} + {{ form.controller_node_image.label_tag }} + {{ form.controller_node_image }} + {{ form.controller_node_image.help_text }} +
+
+
+
diff --git a/community/front-end/ofe/website/ghpcfe/templates/cluster/detail.html b/community/front-end/ofe/website/ghpcfe/templates/cluster/detail.html index 28eab544a6..a771936ad2 100644 --- a/community/front-end/ofe/website/ghpcfe/templates/cluster/detail.html +++ b/community/front-end/ofe/website/ghpcfe/templates/cluster/detail.html @@ -71,6 +71,18 @@

Cluster Detail

{% endif %}

+
+
+ + +
+
+ +
+

Login node image: {{ object.login_node_image.name }}

+

Controller node image: {{ object.controller_node_image.name }}

+
+

Partitions

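A note on the access-control convention used throughout this series: images and startup scripts are visible to admins, to their owner, and to users listed in `authorised_users`. Expressed as a standalone queryset helper it looks roughly like the sketch below (the `visible_to` function is illustrative, not part of any patch; it assumes the `has_admin_role()` user method used in the views above):

```python
from django.db.models import Q


def visible_to(model, user):
    """Rows of `model` the user owns, is authorised to use, or sees as admin.

    Sketch only: mirrors the filtering done in ImagesListView.get_queryset
    and ImageForm.get_startup_scripts.
    """
    if user.has_admin_role():
        return model.objects.all()
    # distinct() avoids duplicate rows from the many-to-many join
    return model.objects.filter(
        Q(owner=user) | Q(authorised_users=user)
    ).distinct()
```

For example, `visible_to(Image, request.user)` would reproduce the image queryset assembled in `ImagesListView.get_queryset`.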
From fecb10156068dffd61039e0d86a41684dad682ef Mon Sep 17 00:00:00 2001 From: Eimantas Kazakevicius Date: Fri, 16 Jun 2023 09:12:09 +0100 Subject: [PATCH 030/144] OFE dependency that allows deleting custom images from GCP --- community/front-end/ofe/requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/community/front-end/ofe/requirements.txt b/community/front-end/ofe/requirements.txt index ba99fa8c1e..cebf623c7a 100644 --- a/community/front-end/ofe/requirements.txt +++ b/community/front-end/ofe/requirements.txt @@ -28,6 +28,7 @@ google-auth-httplib2==0.1.0 google-cloud-billing==1.11.0 google-cloud-core==2.3.2 google-cloud-pubsub==2.17.1 +google-cloud-compute google-cloud-storage==2.9.0 google-crc32c==1.5.0 google-resumable-media==2.5.0 From cad6bfb21b61dcf709b33e5be2fa4e33fc98a15c Mon Sep 17 00:00:00 2001 From: Eimantas Kazakevicius Date: Fri, 16 Jun 2023 09:12:44 +0100 Subject: [PATCH 031/144] Trying to avoid pushing deployment config file to the repo --- community/front-end/.gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/community/front-end/.gitignore b/community/front-end/.gitignore index 87a0d0b5c4..d4a8ac9b42 100644 --- a/community/front-end/.gitignore +++ b/community/front-end/.gitignore @@ -25,3 +25,4 @@ terraform.tfvars tf/.tkfe.lock tf/tfapply.log tf/tfdestroy.log +config.yaml From 055b470c2aea9ab350c621caedfb3a3ad616d87b Mon Sep 17 00:00:00 2001 From: Eimantas Kazakevicius Date: Fri, 14 Jul 2023 07:21:36 +0100 Subject: [PATCH 032/144] OFE Backend: updating custom image naming convention --- .../ofe/website/ghpcfe/cluster_manager/clusterinfo.py | 6 +++--- .../front-end/ofe/website/ghpcfe/cluster_manager/image.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/community/front-end/ofe/website/ghpcfe/cluster_manager/clusterinfo.py b/community/front-end/ofe/website/ghpcfe/cluster_manager/clusterinfo.py index a81030d136..361ff9e317 100644 --- a/community/front-end/ofe/website/ghpcfe/cluster_manager/clusterinfo.py +++ b/community/front-end/ofe/website/ghpcfe/cluster_manager/clusterinfo.py @@ -174,7 +174,7 @@ def _prepare_ghpc_partitions(self, part_uses): part_id = f"partition_{count}" if part.image is not None: instance_image_yaml = f"""instance_image: - family: {part.image.family} + family: image-{part.image.family} project: {self.cluster.project_id}""" else: instance_image_yaml = "" @@ -264,14 +264,14 @@ def _prepare_ghpc_yaml(self): if self.cluster.login_node_image is not None: login_image_yaml = f"""instance_image: - family: {self.cluster.login_node_image.family} + family: image-{self.cluster.login_node_image.family} project: {self.cluster.project_id}""" else: login_image_yaml = "" if self.cluster.controller_node_image is not None: controller_image_yaml = f"""instance_image: - family: {self.cluster.controller_node_image.family} + family: image-{self.cluster.controller_node_image.family} project: {self.cluster.project_id} """ else: diff --git a/community/front-end/ofe/website/ghpcfe/cluster_manager/image.py b/community/front-end/ofe/website/ghpcfe/cluster_manager/image.py index f1aa094078..c183219ab1 100644 --- a/community/front-end/ofe/website/ghpcfe/cluster_manager/image.py +++ b/community/front-end/ofe/website/ghpcfe/cluster_manager/image.py @@ -99,7 +99,7 @@ def _create_blueprint(self): network_name: {"image-"+ str(self.image.id) + "-network"} subnetwork_name: {"image" + str(self.image.id) + "-subnetwork"} image_name: {"image-" + self.image.name} - image_family: {"image" + self.image.family} + image_family: {"image-" + 
self.image.family} tag: ofe-created deployment_groups: From 5da364e7f8f486b4159baf1ff56bebf7e9534664 Mon Sep 17 00:00:00 2001 From: Eimantas Kazakevicius Date: Fri, 14 Jul 2023 14:29:01 +0100 Subject: [PATCH 033/144] OFE Backend: Additional dependencies for Rocky HPC custom image support --- .../roles/c2_daemon/tasks/main.yaml | 24 +++++++++++++++++++ .../roles/common/tasks/main.yaml | 4 ++-- .../roles/dev_env/tasks/main.yaml | 4 ++-- .../roles/spack_setup/tasks/main.yaml | 12 +++++++++- .../ghpcfe/cluster_manager/clusterinfo.py | 14 +++++------ .../ofe/website/ghpcfe/views/images.py | 1 + 6 files changed, 47 insertions(+), 12 deletions(-) diff --git a/community/front-end/ofe/infrastructure_files/gcs_bucket/clusters/ansible_setup/roles/c2_daemon/tasks/main.yaml b/community/front-end/ofe/infrastructure_files/gcs_bucket/clusters/ansible_setup/roles/c2_daemon/tasks/main.yaml index 92dd0ae1c2..6ffe2cf2ae 100644 --- a/community/front-end/ofe/infrastructure_files/gcs_bucket/clusters/ansible_setup/roles/c2_daemon/tasks/main.yaml +++ b/community/front-end/ofe/infrastructure_files/gcs_bucket/clusters/ansible_setup/roles/c2_daemon/tasks/main.yaml @@ -13,6 +13,26 @@ # limitations under the License. --- +- name: Set most recent Python version as default + ansible.builtin.shell: + cmd: | + latest_version=$(ls -1 /usr/bin/python3* | awk -F/ '{print $NF}' | grep -E 'python[0-9]+\.[0-9]+$' | sort -V | tail -1) + alternatives --set python3 /usr/bin/$latest_version + when: ansible_distribution == 'Rocky' + +- name: Install pip3 + ansible.builtin.package: + name: python3-pip + state: present + become: true + when: ansible_distribution == 'Rocky' + +- name: Install setuptools for Python 3.11 + ansible.builtin.command: + cmd: /usr/bin/python3.11 -m ensurepip --upgrade + become: true + when: ansible_distribution == 'Rocky' + - name: Upgrade PIP3 ansible.builtin.pip: executable: pip3 @@ -29,6 +49,10 @@ - pexpect - google-cloud-storage - google-cloud-pubsub + - addict + - google-api-python-client + - google-cloud-secret-manager + - prometheus_client state: present - name: Install FE C&C Daemon diff --git a/community/front-end/ofe/infrastructure_files/gcs_bucket/clusters/ansible_setup/roles/common/tasks/main.yaml b/community/front-end/ofe/infrastructure_files/gcs_bucket/clusters/ansible_setup/roles/common/tasks/main.yaml index 649306ad6b..42e112cd94 100644 --- a/community/front-end/ofe/infrastructure_files/gcs_bucket/clusters/ansible_setup/roles/common/tasks/main.yaml +++ b/community/front-end/ofe/infrastructure_files/gcs_bucket/clusters/ansible_setup/roles/common/tasks/main.yaml @@ -17,7 +17,7 @@ ansible.builtin.yum: name: - environment-modules - when: ansible_distribution == 'CentOS' + when: ansible_distribution in ['CentOS', 'Rocky'] - name: Create Modules directory ansible.builtin.file: @@ -33,7 +33,7 @@ owner: root mode: 0755 force: False - when: ansible_distribution == 'CentOS' + when: ansible_distribution in ['CentOS', 'Rocky'] - name: Enable su to OS Login ansible.builtin.lineinfile: diff --git a/community/front-end/ofe/infrastructure_files/gcs_bucket/clusters/ansible_setup/roles/dev_env/tasks/main.yaml b/community/front-end/ofe/infrastructure_files/gcs_bucket/clusters/ansible_setup/roles/dev_env/tasks/main.yaml index deffbf9c6a..ddc70959bc 100644 --- a/community/front-end/ofe/infrastructure_files/gcs_bucket/clusters/ansible_setup/roles/dev_env/tasks/main.yaml +++ b/community/front-end/ofe/infrastructure_files/gcs_bucket/clusters/ansible_setup/roles/dev_env/tasks/main.yaml @@ -26,7 +26,7 @@ - cmake - 
python2-devel
 - python36-devel
-  when: ansible_distribution == 'CentOS'
+  when: ansible_distribution in ['CentOS', 'Rocky']

- name: Add DevTools to default shells
@@ -37,7 +37,7 @@
     owner: root
     mode: 0755
     force: False
-  when: ansible_distribution == 'CentOS'
+  when: ansible_distribution in ['CentOS', 'Rocky']

- name: Install Debian Dev tools
  ansible.builtin.apt:
diff --git a/community/front-end/ofe/infrastructure_files/gcs_bucket/clusters/ansible_setup/roles/spack_setup/tasks/main.yaml b/community/front-end/ofe/infrastructure_files/gcs_bucket/clusters/ansible_setup/roles/spack_setup/tasks/main.yaml
index dac5225c02..54dfa4070d 100644
--- a/community/front-end/ofe/infrastructure_files/gcs_bucket/clusters/ansible_setup/roles/spack_setup/tasks/main.yaml
+++ b/community/front-end/ofe/infrastructure_files/gcs_bucket/clusters/ansible_setup/roles/spack_setup/tasks/main.yaml
@@ -13,9 +13,19 @@
 # limitations under the License.

 ---
+- name: Install OpenMPI
+  ansible.builtin.yum:
+    name: openmpi
+    state: present
+  when: ansible_distribution == 'Rocky'
+
- name: Load openmpi module
  ansible.builtin.shell: ". /usr/share/Modules/init/sh;module load openmpi"
-  when: True
+  when: ansible_distribution == 'CentOS'
+
+- name: Load openmpi module
+  ansible.builtin.shell: ". /usr/share/Modules/init/sh;module load mpi"
+  when: ansible_distribution == 'Rocky'

- name: Spack find system modules
  ansible.builtin.command: "{{ spack_dir }}/bin/spack external find --scope=system --not-buildable"
diff --git a/community/front-end/ofe/website/ghpcfe/cluster_manager/clusterinfo.py b/community/front-end/ofe/website/ghpcfe/cluster_manager/clusterinfo.py
index 361ff9e317..e171364c8a 100644
--- a/community/front-end/ofe/website/ghpcfe/cluster_manager/clusterinfo.py
+++ b/community/front-end/ofe/website/ghpcfe/cluster_manager/clusterinfo.py
@@ -339,13 +339,13 @@ def _prepare_ghpc_yaml(self):
       - https://www.googleapis.com/auth/logging.write
       - https://www.googleapis.com/auth/devstorage.read_write
       - https://www.googleapis.com/auth/pubsub
-      controller_startup_script: |
-        #!/bin/bash
-        echo "******************************************** CALLING CONTROLLER STARTUP"
-        gsutil cp gs://{startup_bucket}/clusters/{self.cluster.id}/bootstrap_controller.sh - | bash
-      compute_startup_script: |
-        #!/bin/bash
-        gsutil cp gs://{startup_bucket}/clusters/{self.cluster.id}/bootstrap_compute.sh - | bash
+    controller_startup_script: |
+      #!/bin/bash
+      echo "******************************************** CALLING CONTROLLER STARTUP"
+      gsutil cp gs://{startup_bucket}/clusters/{self.cluster.id}/bootstrap_controller.sh - | bash
+    compute_startup_script: |
+      #!/bin/bash
+      gsutil cp gs://{startup_bucket}/clusters/{self.cluster.id}/bootstrap_compute.sh - | bash
     #TODO: enable_cleanup_compute: True
     #TODO: enable_cleanup_subscriptions: True
     use:
diff --git a/community/front-end/ofe/website/ghpcfe/views/images.py b/community/front-end/ofe/website/ghpcfe/views/images.py
index fe906bb858..03e54440e2 100644
--- a/community/front-end/ofe/website/ghpcfe/views/images.py
+++ b/community/front-end/ofe/website/ghpcfe/views/images.py
@@ -228,6 +228,7 @@ def post(self, request, *args, **kwargs):
         response = {'success': True}
         return JsonResponse(response)

+
 class ImageStatusView(LoginRequiredMixin, generic.View):
     """Custom view for Image model that returns Image status"""

From 45870ba529a29619d3252b8febaed8aa1821a4c7 Mon Sep 17 00:00:00 2001
From: Nick Stroud
Date: Thu, 13 Jul 2023 15:57:27 -0700
Subject: [PATCH 034/144] Split spack functionality into setup and execute modules

---
.../modules/scripts/spack-execute/README.md | 63 +++++++++++++ .../modules/scripts/spack-execute/main.tf | 69 ++++++++++++++ .../modules/scripts/spack-execute/outputs.tf | 35 ++++++++ .../templates/execute_commands.yml.tpl | 0 .../scripts/spack-execute/variables.tf | 89 +++++++++++++++++++ .../modules/scripts/spack-execute/versions.tf | 25 ++++++ .../modules/scripts/spack-install/README.md | 16 ++-- .../modules/scripts/spack-install/main.tf | 47 ++++------ .../modules/scripts/spack-install/outputs.tf | 12 ++- .../scripts/spack-install/variables.tf | 63 ++++--------- .../modules/scripts/spack-install/versions.tf | 5 ++ modules/scripts/startup-script/README.md | 2 +- modules/scripts/startup-script/variables.tf | 2 +- tools/duplicate-diff.py | 2 +- 14 files changed, 342 insertions(+), 88 deletions(-) create mode 100644 community/modules/scripts/spack-execute/README.md create mode 100644 community/modules/scripts/spack-execute/main.tf create mode 100644 community/modules/scripts/spack-execute/outputs.tf rename community/modules/scripts/{spack-install => spack-execute}/templates/execute_commands.yml.tpl (100%) create mode 100644 community/modules/scripts/spack-execute/variables.tf create mode 100644 community/modules/scripts/spack-execute/versions.tf diff --git a/community/modules/scripts/spack-execute/README.md b/community/modules/scripts/spack-execute/README.md new file mode 100644 index 0000000000..e516796f9a --- /dev/null +++ b/community/modules/scripts/spack-execute/README.md @@ -0,0 +1,63 @@ + +Copyright 2023 Google LLC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +## Requirements + +| Name | Version | +|------|---------| +| [terraform](#requirement\_terraform) | >= 1.0.0 | +| [local](#requirement\_local) | ~> 2.0.0 | + +## Providers + +| Name | Version | +|------|---------| +| [local](#provider\_local) | ~> 2.0.0 | + +## Modules + +| Name | Source | Version | +|------|--------|---------| +| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.20.0 | + +## Resources + +| Name | Type | +|------|------| +| [local_file.debug_file_ansible_execute](https://registry.terraform.io/providers/hashicorp/local/latest/docs/resources/file) | resource | + +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [commands](#input\_commands) | String of commands to run within this module | `string` | `null` | no | +| [data\_files](#input\_data\_files) | A list of files to be transferred prior to running commands.
It must specify one of 'source' (absolute local file path) or 'content' (string).
It must specify a 'destination' with absolute path where file should be placed. | `list(map(string))` | `[]` | no | +| [deployment\_name](#input\_deployment\_name) | Name of deployment, used to name bucket containing spack scripts. | `string` | n/a | yes | +| [gcs\_bucket\_path](#input\_gcs\_bucket\_path) | The GCS path for storage bucket and the object, starting with `gs://`. | `string` | n/a | yes | +| [labels](#input\_labels) | Key-value pairs of labels to be added to created resources. | `map(string)` | n/a | yes | +| [log\_file](#input\_log\_file) | Defines the logfile that script output will be written to | `string` | `"/var/log/spack.log"` | no | +| [project\_id](#input\_project\_id) | Project in which the HPC deployment will be created. | `string` | n/a | yes | +| [region](#input\_region) | Region to place bucket containing spack scripts. | `string` | n/a | yes | +| [spack\_runner](#input\_spack\_runner) | Runner from previous spack-install or spack-execute to be chained with scripts generated by this module. |
object({
type = string
content = string
destination = string
})
| n/a | yes | + +## Outputs + +| Name | Description | +|------|-------------| +| [controller\_startup\_script](#output\_controller\_startup\_script) | Path to the Spack installation script, duplicate for SLURM controller. | +| [gcs\_bucket\_path](#output\_gcs\_bucket\_path) | Bucket containing the startup scripts for spack, to be reused by spack-execute module. | +| [spack\_runner](#output\_spack\_runner) | Single runner that combines scripts from this module and any previously chained spack-execute or spack-install modules. | +| [startup\_script](#output\_startup\_script) | Path to the Spack installation script. | + diff --git a/community/modules/scripts/spack-execute/main.tf b/community/modules/scripts/spack-execute/main.tf new file mode 100644 index 0000000000..9838ac76e5 --- /dev/null +++ b/community/modules/scripts/spack-execute/main.tf @@ -0,0 +1,69 @@ +/** + * Copyright 2023 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +locals { + # This label allows for billing report tracking based on module. + labels = merge(var.labels, { ghpc_module = "spack-execute" }) +} + +locals { + commands_content = var.commands == null ? "echo 'no spack commands provided'" : indent(4, yamlencode(var.commands)) + + execute_contents = templatefile( + "${path.module}/templates/execute_commands.yml.tpl", + { + pre_script = ". 
/etc/profile.d/spack.sh" + log_file = var.log_file + commands = local.commands_content + } + ) + + data_runners = [for data_file in var.data_files : merge(data_file, { type = "data" })] + + execute_md5 = substr(md5(local.execute_contents), 0, 4) + execute_runner = { + type = "ansible-local" + content = local.execute_contents + destination = "spack_execute_${local.execute_md5}.yml" + } + + runners = concat([var.spack_runner], local.data_runners, [local.execute_runner]) + + # Destinations should be unique while also being known at time of apply + combined_unique_string = join("\n", [for runner in local.runners : runner["destination"]]) + combined_md5 = substr(md5(local.combined_unique_string), 0, 4) + combined_runner = { + type = "shell" + content = module.startup_script.startup_script + destination = "combined_install_spack_${local.combined_md5}.sh" + } +} + +module "startup_script" { + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.20.0" + + labels = local.labels + project_id = var.project_id + deployment_name = var.deployment_name + region = var.region + runners = local.runners + gcs_bucket_path = var.gcs_bucket_path +} + +resource "local_file" "debug_file_ansible_execute" { + content = local.execute_contents + filename = "${path.module}/debug_execute_${local.execute_md5}.yml" +} diff --git a/community/modules/scripts/spack-execute/outputs.tf b/community/modules/scripts/spack-execute/outputs.tf new file mode 100644 index 0000000000..65fae325b7 --- /dev/null +++ b/community/modules/scripts/spack-execute/outputs.tf @@ -0,0 +1,35 @@ +/** + * Copyright 2023 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +output "startup_script" { + description = "Path to the Spack installation script." + value = module.startup_script.startup_script +} + +output "controller_startup_script" { + description = "Path to the Spack installation script, duplicate for SLURM controller." + value = module.startup_script.startup_script +} + +output "spack_runner" { + description = "Single runner that combines scripts from this module and any previously chained spack-execute or spack-install modules." + value = local.combined_runner +} + +output "gcs_bucket_path" { + description = "Bucket containing the startup scripts for spack, to be reused by spack-execute module." 
+ value = var.gcs_bucket_path +} diff --git a/community/modules/scripts/spack-install/templates/execute_commands.yml.tpl b/community/modules/scripts/spack-execute/templates/execute_commands.yml.tpl similarity index 100% rename from community/modules/scripts/spack-install/templates/execute_commands.yml.tpl rename to community/modules/scripts/spack-execute/templates/execute_commands.yml.tpl diff --git a/community/modules/scripts/spack-execute/variables.tf b/community/modules/scripts/spack-execute/variables.tf new file mode 100644 index 0000000000..03a5d94d4b --- /dev/null +++ b/community/modules/scripts/spack-execute/variables.tf @@ -0,0 +1,89 @@ +/** + * Copyright 2023 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +variable "project_id" { + description = "Project in which the HPC deployment will be created." + type = string +} + +variable "deployment_name" { + description = "Name of deployment, used to name bucket containing spack scripts." + type = string +} + +variable "region" { + description = "Region to place bucket containing spack scripts." + type = string +} + +variable "labels" { + description = "Key-value pairs of labels to be added to created resources." + type = map(string) +} + +variable "log_file" { + description = "Defines the logfile that script output will be written to" + default = "/var/log/spack.log" + type = string +} + +variable "data_files" { + description = <<-EOT + A list of files to be transferred prior to running commands. + It must specify one of 'source' (absolute local file path) or 'content' (string). + It must specify a 'destination' with absolute path where file should be placed. + EOT + type = list(map(string)) + default = [] + validation { + condition = alltrue([for r in var.data_files : substr(r["destination"], 0, 1) == "/"]) + error_message = "All destinations must be absolute paths and start with '/'." + } + validation { + condition = alltrue([ + for r in var.data_files : + can(r["content"]) != can(r["source"]) + ]) + error_message = "A data_file must specify either 'content' or 'source', but never both." + } + validation { + condition = alltrue([ + for r in var.data_files : + lookup(r, "content", lookup(r, "source", null)) != null + ]) + error_message = "A data_file must specify a non-null 'content' or 'source'." + } +} + +variable "commands" { + description = "String of commands to run within this module" + type = string + default = null +} + +variable "spack_runner" { + description = "Runner from previous spack-install or spack-execute to be chained with scripts generated by this module." + type = object({ + type = string + content = string + destination = string + }) +} + +variable "gcs_bucket_path" { + description = "The GCS path for storage bucket and the object, starting with `gs://`." 
+ type = string +} diff --git a/community/modules/scripts/spack-execute/versions.tf b/community/modules/scripts/spack-execute/versions.tf new file mode 100644 index 0000000000..c9927a3d15 --- /dev/null +++ b/community/modules/scripts/spack-execute/versions.tf @@ -0,0 +1,25 @@ +/** + * Copyright 2023 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +terraform { + required_version = ">= 1.0.0" + required_providers { + local = { + source = "hashicorp/local" + version = "~> 2.0.0" + } + } +} diff --git a/community/modules/scripts/spack-install/README.md b/community/modules/scripts/spack-install/README.md index f47b872879..f9f56d2114 100644 --- a/community/modules/scripts/spack-install/README.md +++ b/community/modules/scripts/spack-install/README.md @@ -190,12 +190,14 @@ limitations under the License. | Name | Version | |------|---------| | [terraform](#requirement\_terraform) | >= 1.0.0 | +| [google](#requirement\_google) | >= 4.42 | | [local](#requirement\_local) | >= 2.0.0 | ## Providers | Name | Version | |------|---------| +| [google](#provider\_google) | >= 4.42 | | [local](#provider\_local) | >= 2.0.0 | ## Modules @@ -208,7 +210,7 @@ limitations under the License. | Name | Type | |------|------| -| [local_file.debug_file_ansible_execute](https://registry.terraform.io/providers/hashicorp/local/latest/docs/resources/file) | resource | +| [google_storage_bucket.bucket](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/storage_bucket) | resource | | [local_file.debug_file_shell_install](https://registry.terraform.io/providers/hashicorp/local/latest/docs/resources/file) | resource | ## Inputs @@ -219,19 +221,17 @@ limitations under the License. | [chgrp\_group](#input\_chgrp\_group) | Group to chgrp the Spack clone to. Default will not modify the clone. | `string` | `null` | no | | [chmod\_mode](#input\_chmod\_mode) | Mode to chmod the Spack clone to. Defaults to null (i.e. do not modify).
For usage information see:
https://docs.ansible.com/ansible/latest/collections/ansible/builtin/file_module.html#parameter-mode | `string` | `"a+rwxs"` | no | | [chown\_owner](#input\_chown\_owner) | Owner to chown the Spack clone to. Default will not modify the clone. | `string` | `null` | no | -| [commands](#input\_commands) | String of commands to run within this module | `string` | `null` | no | | [compilers](#input\_compilers) | DEPRECATED

The following `commands` can be used to install compilers:
spack install gcc@10.3.0 target=x86_64
spack load gcc@10.3.0 target=x86_64
spack compiler find --scope site
spack clean -s
spack unload gcc@10.3.0
Defines compilers for spack to install before installing packages. | `list(string)` | `null` | no | -| [concretize\_flags](#input\_concretize\_flags) | DEPRECATED - spack concretize is now performed using the `commands` variable. | `string` | `null` | no | +| [concretize\_flags](#input\_concretize\_flags) | DEPRECATED - spack concretize is now performed using the [spack-execute](../spack-execute/) module `commands` variable. | `string` | `null` | no | | [configs](#input\_configs) | DEPRECATED

The following `commands` can be used to add a single config:
spack config --scope defaults add config:default:true
Alternatively, use `data_files` to transfer a config file and use the `spack config add -f <config file>` command to add the config.

List of configuration options to set within spack. | `list(map(any))` | `null` | no | -| [data\_files](#input\_data\_files) | A list of files to be transferred prior to running commands.
It must specify one of 'source' (absolute local file path) or 'content' (string).
It must specify a 'destination' with absolute path where file should be placed. | `list(map(string))` | `[]` | no | | [deployment\_name](#input\_deployment\_name) | Name of deployment, used to name bucket containing startup script. | `string` | n/a | yes | | [environments](#input\_environments) | DEPRECATED

The following `commands` can be used to configure an environment:
if ! spack env list \| grep -q my-env; then
spack env create my-env
fi
spack env activate my-env
spack add intel-mpi@2018.4.274 %gcc@10.3.0
spack concretize
spack install
Defines spack environments to configure.
For more information, see: https://spack.readthedocs.io/en/latest/environments.html. | `any` | `null` | no | | [gpg\_keys](#input\_gpg\_keys) | DEPRECATED

The following `commands` can be used to create a new GPG key:
spack gpg init
spack gpg create
Alternatively, `data_files` can be used to transfer an existing GPG key. Then use `spack gpg trust <path to key>` to add the key to the keyring.

GPG Keys to trust within spack. | `list(map(any))` | `null` | no | | [install\_dir](#input\_install\_dir) | Directory to install spack into. | `string` | `"/sw/spack"` | no | -| [install\_flags](#input\_install\_flags) | DEPRECATED - spack install is now performed using the `commands` variable. | `string` | `null` | no | +| [install\_flags](#input\_install\_flags) | DEPRECATED - spack install is now performed using the [spack-execute](../spack-execute/) module `commands` variable. | `string` | `null` | no | | [labels](#input\_labels) | Key-value pairs of labels to be added to created resources. | `map(string)` | n/a | yes | -| [licenses](#input\_licenses) | DEPRECATED

Use `data_files` variable to install license files:
data_files = [{
source = "/abs/path/on/deployment/machine/license.lic"
destination = "/sw/spack/etc/spack/licenses/license.lic"
}]
List of software licenses to install within spack. |
list(object({
source = string
dest = string
}))
| `null` | no | -| [log\_file](#input\_log\_file) | Defines the logfile that script output will be written to | `string` | `"/var/log/spack.log"` | no | +| [licenses](#input\_licenses) | DEPRECATED

Use the [spack-execute](../spack-execute/) module with the `data_files` variable to install license files:
data_files = [{
source = "/abs/path/on/deployment/machine/license.lic"
destination = "/sw/spack/etc/spack/licenses/license.lic"
}]
List of software licenses to install within spack. |
list(object({
source = string
dest = string
}))
| `null` | no | +| [log\_file](#input\_log\_file) | DEPRECATED

All install logs are printed to stdout/stderr.
Execution log\_file location can be set on the spack-execute module. | `string` | `null` | no |

The following `commands` can be used to install a package:
spack install intel-mpi@2018.4.274 %gcc@10.3.0
Defines root packages for spack to install. | `list(string)` | `null` | no | | [project\_id](#input\_project\_id) | Project in which the HPC deployment will be created. | `string` | n/a | yes | | [region](#input\_region) | Region to place bucket containing startup script. | `string` | n/a | yes | @@ -245,9 +245,11 @@ limitations under the License. | Name | Description | |------|-------------| | [controller\_startup\_script](#output\_controller\_startup\_script) | Path to the Spack installation script, duplicate for SLURM controller. | +| [gcs\_bucket\_path](#output\_gcs\_bucket\_path) | Bucket containing the startup scripts for spack, to be reused by spack-execute module. | | [install\_spack\_deps\_runner](#output\_install\_spack\_deps\_runner) | Runner to install dependencies for spack using an ansible playbook. The
startup-script module will automatically handle installation of ansible.
- id: example-startup-script
source: modules/scripts/startup-script
settings:
runners:
- $(your-spack-id.install\_spack\_deps\_runner)
... | | [install\_spack\_runner](#output\_install\_spack\_runner) | Runner to install Spack using the startup-script module | | [setup\_spack\_runner](#output\_setup\_spack\_runner) | Adds Spack setup-env.sh script to /etc/profile.d so that it is called at shell startup. Among other things this adds Spack binary to user PATH. | | [spack\_path](#output\_spack\_path) | Path to the root of the spack installation | +| [spack\_runner](#output\_spack\_runner) | Runner to install Spack using the startup-script module | | [startup\_script](#output\_startup\_script) | Path to the Spack installation script. | diff --git a/community/modules/scripts/spack-install/main.tf b/community/modules/scripts/spack-install/main.tf index c3ab3eae09..4b8ede607a 100644 --- a/community/modules/scripts/spack-install/main.tf +++ b/community/modules/scripts/spack-install/main.tf @@ -59,40 +59,27 @@ locals { "content" = local.script_content "destination" = "install_spack.yml" } -} - -locals { - commands_content = var.commands == null ? "echo 'no spack commands provided'" : indent(4, yamlencode(var.commands)) - - execute_contents = templatefile( - "${path.module}/templates/execute_commands.yml.tpl", - { - pre_script = ". /etc/profile.d/spack.sh" - log_file = var.log_file - commands = local.commands_content - } - ) - data_runners = [for data_file in var.data_files : merge(data_file, { type = "data" })] + bucket_md5 = substr(md5("${var.project_id}.${var.deployment_name}"), 0, 4) + bucket_name = "spack-scripts-${local.bucket_md5}" + runners = [local.install_spack_deps_runner, local.install_spack_runner] - execute_md5 = substr(md5(local.execute_contents), 0, 4) - execute_runner = { - "type" = "ansible-local" - "content" = local.execute_contents - "destination" = "spack_execute_${local.execute_md5}.yml" - } - - runners = concat([local.install_spack_runner], local.data_runners, [local.execute_runner]) - - combined_unique_string = join("\n", [for runner in local.runners : try(runner["content"], runner["source"])]) - combined_md5 = substr(md5(local.combined_unique_string), 0, 4) - combined_install_execute_runner = { + combined_runner = { "type" = "shell" "content" = module.startup_script.startup_script - "destination" = "combined_install_spack_${local.combined_md5}.sh" + "destination" = "spack-install-and-setup.sh" } } +resource "google_storage_bucket" "bucket" { + project = var.project_id + name = local.bucket_name + uniform_bucket_level_access = true + location = var.region + storage_class = "REGIONAL" + labels = local.labels +} + module "startup_script" { source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.19.1" @@ -101,14 +88,10 @@ module "startup_script" { deployment_name = var.deployment_name region = var.region runners = local.runners + gcs_bucket_path = "gs://${google_storage_bucket.bucket.name}" } resource "local_file" "debug_file_shell_install" { content = local.script_content filename = "${path.module}/debug_install.yml" } - -resource "local_file" "debug_file_ansible_execute" { - content = local.execute_contents - filename = "${path.module}/debug_execute_${local.execute_md5}.yml" -} diff --git a/community/modules/scripts/spack-install/outputs.tf b/community/modules/scripts/spack-install/outputs.tf index e02dbf227b..ea68b9d37c 100644 --- a/community/modules/scripts/spack-install/outputs.tf +++ b/community/modules/scripts/spack-install/outputs.tf @@ -40,7 +40,7 @@ output "install_spack_deps_runner" { output "install_spack_runner" { description = "Runner to install Spack using the 
startup-script module" - value = local.combined_install_execute_runner + value = local.combined_runner } output "setup_spack_runner" { @@ -61,3 +61,13 @@ output "spack_path" { description = "Path to the root of the spack installation" value = var.install_dir } + +output "spack_runner" { + description = "Runner to install Spack using the startup-script module" + value = local.combined_runner +} + +output "gcs_bucket_path" { + description = "Bucket containing the startup scripts for spack, to be reused by spack-execute module." + value = "gs://${google_storage_bucket.bucket.name}" +} diff --git a/community/modules/scripts/spack-install/variables.tf b/community/modules/scripts/spack-install/variables.tf index e4768e27fd..b152c77584 100644 --- a/community/modules/scripts/spack-install/variables.tf +++ b/community/modules/scripts/spack-install/variables.tf @@ -67,48 +67,6 @@ variable "spack_virtualenv_path" { type = string } -# spack-build variables - -variable "log_file" { - description = "Defines the logfile that script output will be written to" - default = "/var/log/spack.log" - type = string -} - -variable "data_files" { - description = <<-EOT - A list of files to be transferred prior to running commands. - It must specify one of 'source' (absolute local file path) or 'content' (string). - It must specify a 'destination' with absolute path where file should be placed. - EOT - type = list(map(string)) - default = [] - validation { - condition = alltrue([for r in var.data_files : substr(r["destination"], 0, 1) == "/"]) - error_message = "All destinations must be absolute paths and start with '/'." - } - validation { - condition = alltrue([ - for r in var.data_files : - can(r["content"]) != can(r["source"]) - ]) - error_message = "A data_file must specify either 'content' or 'source', but never both." - } - validation { - condition = alltrue([ - for r in var.data_files : - lookup(r, "content", lookup(r, "source", null)) != null - ]) - error_message = "A data_file must specify a non-null 'content' or 'source'." - } -} - -variable "commands" { - description = "String of commands to run within this module" - type = string - default = null -} - variable "deployment_name" { description = "Name of deployment, used to name bucket containing startup script." type = string @@ -126,6 +84,21 @@ variable "labels" { # variables to be deprecated +variable "log_file" { + description = <<-EOT + DEPRECATED + + All install logs are printed to stdout/stderr. + Execution log_file location can be set on spack-execute module. + EOT + default = null + type = string + validation { + condition = var.log_file == null + error_message = "log_file is deprecated. See spack-execute module for similar functionality." + } +} + variable "spack_cache_url" { description = <<-EOT DEPRECATED @@ -200,7 +173,7 @@ variable "licenses" { description = <<-EOT DEPRECATED - Use `data_files` variable to install license files: + Use [spack-execute](../spack-execute/) module with `data_files` variable to install license files: ``` data_files = [{ @@ -244,7 +217,7 @@ variable "packages" { } variable "install_flags" { - description = "DEPRECATED - spack install is now performed using the `commands` variable." + description = "DEPRECATED - spack install is now performed using the [spack-execute](../spack-execute/) module `commands` variable." 
default = null type = string validation { @@ -254,7 +227,7 @@ variable "install_flags" { } variable "concretize_flags" { - description = "DEPRECATED - spack concretize is now performed using the `commands` variable." + description = "DEPRECATED - spack concretize is now performed using the [spack-execute](../spack-execute/) module `commands` variable." default = null type = string validation { diff --git a/community/modules/scripts/spack-install/versions.tf b/community/modules/scripts/spack-install/versions.tf index b708682f88..ff1180fc1b 100644 --- a/community/modules/scripts/spack-install/versions.tf +++ b/community/modules/scripts/spack-install/versions.tf @@ -17,6 +17,11 @@ terraform { required_version = ">= 1.0.0" required_providers { + google = { + source = "hashicorp/google" + version = ">= 4.42" + } + local = { source = "hashicorp/local" version = ">= 2.0.0" diff --git a/modules/scripts/startup-script/README.md b/modules/scripts/startup-script/README.md index 9863f5ad3a..09b052ceb5 100644 --- a/modules/scripts/startup-script/README.md +++ b/modules/scripts/startup-script/README.md @@ -235,7 +235,7 @@ No modules. | [configure\_ssh\_host\_patterns](#input\_configure\_ssh\_host\_patterns) | If specified, it will automate ssh configuration by:
- Defining a Host block for every element of this variable and setting StrictHostKeyChecking to 'No'.
Ex: "hpc*", "hpc01*", "ml*"
- The first time users log in, it will create ssh keys that are added to the authorized keys list
This requires a shared /home filesystem and relies on specifying the right prefix. | `list(string)` | `[]` | no | | [debug\_file](#input\_debug\_file) | Path to an optional local to be written with 'startup\_script'. | `string` | `null` | no | | [deployment\_name](#input\_deployment\_name) | Name of the HPC deployment, used to name GCS bucket for startup scripts. | `string` | n/a | yes | -| [gcs\_bucket\_path](#input\_gcs\_bucket\_path) | The GCS path for storage bucket and the object. | `string` | `null` | no | +| [gcs\_bucket\_path](#input\_gcs\_bucket\_path) | The GCS path for storage bucket and the object, starting with `gs://`. | `string` | `null` | no | | [install\_ansible](#input\_install\_ansible) | Run Ansible installation script if either set to true or unset and runner of type 'ansible-local' are used. | `bool` | `null` | no | | [install\_cloud\_ops\_agent](#input\_install\_cloud\_ops\_agent) | Run Google Ops Agent installation script if set to true. | `bool` | `false` | no | | [labels](#input\_labels) | Labels for the created GCS bucket. Key-value pairs. | `map(string)` | n/a | yes | diff --git a/modules/scripts/startup-script/variables.tf b/modules/scripts/startup-script/variables.tf index 157f1da54e..ca3f774fb2 100644 --- a/modules/scripts/startup-script/variables.tf +++ b/modules/scripts/startup-script/variables.tf @@ -30,7 +30,7 @@ variable "region" { } variable "gcs_bucket_path" { - description = "The GCS path for storage bucket and the object." + description = "The GCS path for storage bucket and the object, starting with `gs://`." type = string default = null } diff --git a/tools/duplicate-diff.py b/tools/duplicate-diff.py index cd7e2bfd95..9130e956a6 100644 --- a/tools/duplicate-diff.py +++ b/tools/duplicate-diff.py @@ -47,7 +47,7 @@ ], [ "community/modules/scripts/ramble-execute/templates/ramble_execute.yml.tpl", - "community/modules/scripts/spack-install/templates/execute_commands.yml.tpl", + "community/modules/scripts/spack-execute/templates/execute_commands.yml.tpl", ], [ "community/modules/scripts/spack-install/templates/spack_setup.yml.tftpl", From fcf1377462cc91343fe619bdf63da56b650a2a34 Mon Sep 17 00:00:00 2001 From: Nick Stroud Date: Thu, 13 Jul 2023 16:04:55 -0700 Subject: [PATCH 035/144] Update deprecated variable documentation to point to spack-execute module --- .../modules/scripts/spack-install/README.md | 14 ++++----- .../scripts/spack-install/variables.tf | 30 +++++++++---------- 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/community/modules/scripts/spack-install/README.md b/community/modules/scripts/spack-install/README.md index f9f56d2114..49fcf49a6c 100644 --- a/community/modules/scripts/spack-install/README.md +++ b/community/modules/scripts/spack-install/README.md @@ -217,25 +217,25 @@ limitations under the License. | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| -| [caches\_to\_populate](#input\_caches\_to\_populate) | DEPRECATED

The following `commands` can be used to populate a cache:
MIRROR_URL=gs://my-bucket
spack buildcache create --mirror-url $MIRROR_URL -af \$(spack find --format /{hash});
spack gpg publish --mirror-url $MIRROR_URL;
spack buildcache update-index --mirror-url $MIRROR_URL --keys;
Defines caches which will be populated with the installed packages.

NOTE: GPG Keys should be installed before trying to populate a cache
with packages.

NOTE: The gpg\_keys variable can be used to install existing GPG keys
and create new GPG keys, both of which are acceptable for populating a
cache. | `list(map(any))` | `null` | no | +| [caches\_to\_populate](#input\_caches\_to\_populate) | DEPRECATED

Use the [spack-execute](../spack-execute/) module with the following `commands` to populate a cache:
MIRROR_URL=gs://my-bucket
spack buildcache create --mirror-url $MIRROR_URL -af \$(spack find --format /{hash});
spack gpg publish --mirror-url $MIRROR_URL;
spack buildcache update-index --mirror-url $MIRROR_URL --keys;
Defines caches which will be populated with the installed packages.

NOTE: GPG Keys should be installed before trying to populate a cache
with packages.

NOTE: The gpg\_keys variable can be used to install existing GPG keys
and create new GPG keys, both of which are acceptable for populating a
cache. | `list(map(any))` | `null` | no | | [chgrp\_group](#input\_chgrp\_group) | Group to chgrp the Spack clone to. Default will not modify the clone. | `string` | `null` | no | | [chmod\_mode](#input\_chmod\_mode) | Mode to chmod the Spack clone to. Defaults to null (i.e. do not modify).
For usage information see:
https://docs.ansible.com/ansible/latest/collections/ansible/builtin/file_module.html#parameter-mode | `string` | `"a+rwxs"` | no | | [chown\_owner](#input\_chown\_owner) | Owner to chown the Spack clone to. Default will not modify the clone. | `string` | `null` | no | -| [compilers](#input\_compilers) | DEPRECATED

The following `commands` can be used to install compilers:
spack install gcc@10.3.0 target=x86_64
spack load gcc@10.3.0 target=x86_64
spack compiler find --scope site
spack clean -s
spack unload gcc@10.3.0
Defines compilers for spack to install before installing packages. | `list(string)` | `null` | no | +| [compilers](#input\_compilers) | DEPRECATED

Use the [spack-execute](../spack-execute/) module with the following `commands` to install compilers:
spack install gcc@10.3.0 target=x86_64
spack load gcc@10.3.0 target=x86_64
spack compiler find --scope site
spack clean -s
spack unload gcc@10.3.0
Defines compilers for spack to install before installing packages. | `list(string)` | `null` | no | | [concretize\_flags](#input\_concretize\_flags) | DEPRECATED - spack concretize is now performed using the [spack-execute](../spack-execute/) module `commands` variable. | `string` | `null` | no | -| [configs](#input\_configs) | DEPRECATED

The following `commands` can be used to add a single config:
spack config --scope defaults add config:default:true
Alternatively, use `data_files` to transfer a config file and use the `spack config add -f <config file>` command to add the config.

List of configuration options to set within spack. | `list(map(any))` | `null` | no | +| [configs](#input\_configs) | DEPRECATED

Use the [spack-execute](../spack-execute/) module with the following `commands` to add a single config:
spack config --scope defaults add config:default:true
Alternatively, use `data_files` to transfer a config file and use the `spack config add -f <config file>` command to add the config.

List of configuration options to set within spack. | `list(map(any))` | `null` | no | | [deployment\_name](#input\_deployment\_name) | Name of deployment, used to name bucket containing startup script. | `string` | n/a | yes | -| [environments](#input\_environments) | DEPRECATED

The following `commands` can be used to configure an environment:
if ! spack env list \| grep -q my-env; then
spack env create my-env
fi
spack env activate my-env
spack add intel-mpi@2018.4.274 %gcc@10.3.0
spack concretize
spack install
Defines spack environments to configure.
For more information, see: https://spack.readthedocs.io/en/latest/environments.html. | `any` | `null` | no | -| [gpg\_keys](#input\_gpg\_keys) | DEPRECATED

The following `commands` can be used to create a new GPG key:
spack gpg init
spack gpg create
Alternatively, `data_files` can be used to transfer an existing GPG key. Then use `spack gpg trust <path to key>` to add the key to the keyring.

GPG Keys to trust within spack. | `list(map(any))` | `null` | no | +| [environments](#input\_environments) | DEPRECATED

Use the [spack-execute](../spack-execute/) module with the following `commands` to configure an environment:
if ! spack env list \| grep -q my-env; then
spack env create my-env
fi
spack env activate my-env
spack add intel-mpi@2018.4.274 %gcc@10.3.0
spack concretize
spack install
Defines spack environments to configure.
For more information, see: https://spack.readthedocs.io/en/latest/environments.html. | `any` | `null` | no | +| [gpg\_keys](#input\_gpg\_keys) | DEPRECATED

Use the [spack-execute](../spack-execute/) module with the following `commands` to create a new GPG key:
spack gpg init
spack gpg create
Alternatively, `data_files` can be used to transfer an existing GPG key. Then use `spack gpg trust <path to key>` to add the key to the keyring.

GPG Keys to trust within spack. | `list(map(any))` | `null` | no | | [install\_dir](#input\_install\_dir) | Directory to install spack into. | `string` | `"/sw/spack"` | no | | [install\_flags](#input\_install\_flags) | DEPRECATED - spack install is now performed using the [spack-execute](../spack-execute/) module `commands` variable. | `string` | `null` | no | | [labels](#input\_labels) | Key-value pairs of labels to be added to created resources. | `map(string)` | n/a | yes | | [licenses](#input\_licenses) | DEPRECATED

Use the [spack-execute](../spack-execute/) module with the `data_files` variable to install license files:
data_files = [{
source = "/abs/path/on/deployment/machine/license.lic"
destination = "/sw/spack/etc/spack/licenses/license.lic"
}]
List of software licenses to install within spack. |
list(object({
source = string
dest = string
}))
| `null` | no | | [log\_file](#input\_log\_file) | DEPRECATED

All install logs are printed to stdout/stderr.
Execution log\_file location can be set on the spack-execute module. | `string` | `null` | no |

The following `commands` can be used to install a package:
spack install intel-mpi@2018.4.274 %gcc@10.3.0
Defines root packages for spack to install. | `list(string)` | `null` | no | +| [packages](#input\_packages) | DEPRECATED

Use the [spack-execute](../spack-execute/) module with the following `commands` to install a package:
spack install intel-mpi@2018.4.274 %gcc@10.3.0
Defines root packages for spack to install. | `list(string)` | `null` | no | | [project\_id](#input\_project\_id) | Project in which the HPC deployment will be created. | `string` | n/a | yes | | [region](#input\_region) | Region to place bucket containing startup script. | `string` | n/a | yes | -| [spack\_cache\_url](#input\_spack\_cache\_url) | DEPRECATED

The following `commands` can be used to add a build cache:
spack mirror add --scope site <mirror name> gs://my-build-cache
spack buildcache keys --install --trust
List of build caches for Spack. |
list(object({
mirror_name = string
mirror_url = string
}))
| `null` | no | +| [spack\_cache\_url](#input\_spack\_cache\_url) | DEPRECATED

Use the [spack-execute](../spack-execute/) module with the following `commands` to add a build cache:
spack mirror add --scope site <mirror name> gs://my-build-cache
spack buildcache keys --install --trust
List of build caches for Spack. |
list(object({
mirror_name = string
mirror_url = string
}))
| `null` | no | | [spack\_ref](#input\_spack\_ref) | Git ref to checkout for spack. | `string` | `"v0.20.0"` | no | | [spack\_url](#input\_spack\_url) | URL to clone the spack repo from. | `string` | `"https://github.com/spack/spack"` | no | | [spack\_virtualenv\_path](#input\_spack\_virtualenv\_path) | Virtual environment path in which to install Spack Python interpreter and other dependencies | `string` | `"/usr/local/spack-python"` | no | diff --git a/community/modules/scripts/spack-install/variables.tf b/community/modules/scripts/spack-install/variables.tf index b152c77584..e2c0ca5920 100644 --- a/community/modules/scripts/spack-install/variables.tf +++ b/community/modules/scripts/spack-install/variables.tf @@ -103,7 +103,7 @@ variable "spack_cache_url" { description = <<-EOT DEPRECATED - The following `commands` can be used to add a build cache: + Use [spack-execute](../spack-execute/) module with the following `commands` can be used to add a build cache: ``` spack mirror add --scope site gs://my-build-cache @@ -119,7 +119,7 @@ variable "spack_cache_url" { default = null validation { condition = var.spack_cache_url == null - error_message = "spack_cache_url is deprecated. Use commands instead. See variable documentation for proposed alternative commands." + error_message = "spack_cache_url is deprecated. Use spack-execute.commands instead. See variable documentation for proposed alternative commands." } } @@ -127,7 +127,7 @@ variable "configs" { description = <<-EOT DEPRECATED - The following `commands` can be used to add a single config: + Use [spack-execute](../spack-execute/) module with the following `commands` can be used to add a single config: ``` spack config --scope defaults add config:default:true @@ -141,7 +141,7 @@ variable "configs" { type = list(map(any)) validation { condition = var.configs == null - error_message = "configs is deprecated. Use commands instead. See variable documentation for proposed alternative commands." + error_message = "configs is deprecated. Use spack-execute.commands instead. See variable documentation for proposed alternative commands." } } @@ -149,7 +149,7 @@ variable "compilers" { description = <<-EOT DEPRECATED - The following `commands` can be used to install compilers: + Use [spack-execute](../spack-execute/) module with the following `commands` can be used to install compilers: ``` spack install gcc@10.3.0 target=x86_64 @@ -165,7 +165,7 @@ variable "compilers" { default = null validation { condition = var.compilers == null - error_message = "compilers is deprecated. Use commands instead. See variable documentation for proposed alternative commands." + error_message = "compilers is deprecated. Use spack-execute.commands instead. See variable documentation for proposed alternative commands." } } @@ -192,7 +192,7 @@ variable "licenses" { })) validation { condition = var.licenses == null - error_message = "licenses is deprecated. Use commands instead. See variable documentation for proposed alternative commands." + error_message = "licenses is deprecated. Use spack-execute.commands instead. See variable documentation for proposed alternative commands." 
} } @@ -200,7 +200,7 @@ variable "packages" { description = <<-EOT DEPRECATED - The following `commands` can be used to install a package: + Use [spack-execute](../spack-execute/) module with the following `commands` can be used to install a package: ``` spack install intel-mpi@2018.4.274 %gcc@10.3.0 @@ -212,7 +212,7 @@ variable "packages" { default = null validation { condition = var.packages == null - error_message = "packages is deprecated. Use commands instead. See variable documentation for proposed alternative commands." + error_message = "packages is deprecated. Use spack-execute.commands instead. See variable documentation for proposed alternative commands." } } @@ -240,7 +240,7 @@ variable "gpg_keys" { description = < Date: Thu, 13 Jul 2023 23:46:52 -0700 Subject: [PATCH 036/144] Update blueprints to reflect spack module split --- community/examples/AMD/hpc-amd-slurm.yaml | 12 ++++++++---- community/examples/hpc-slurm-gromacs.yaml | 16 +++++++--------- community/examples/hpc-slurm-ramble-gromacs.yaml | 12 ++++++++---- docs/tutorials/gromacs/spack-gromacs.yaml | 10 +++++++--- docs/tutorials/openfoam/spack-openfoam.yaml | 10 +++++++--- docs/tutorials/wrfv3/spack-wrfv3.yaml | 10 +++++++--- .../hcls-blueprint.yaml | 9 +++++++-- examples/serverless-batch-mpi.yaml | 12 ++++++++---- tools/validate_configs/ramble.yaml | 3 +-- .../test_configs/centos8-ss.yaml | 12 +++++++----- .../validate_configs/test_configs/debian-ss.yaml | 12 +++++++----- .../test_configs/hpc-centos-ss.yaml | 12 +++++++----- .../validate_configs/test_configs/rocky-ss.yaml | 12 +++++++----- .../test_configs/spack-buildcache.yaml | 11 +++++++---- .../test_configs/spack-environments.yaml | 11 +++++++---- .../validate_configs/test_configs/ubuntu-ss.yaml | 12 +++++++----- 16 files changed, 109 insertions(+), 67 deletions(-) diff --git a/community/examples/AMD/hpc-amd-slurm.yaml b/community/examples/AMD/hpc-amd-slurm.yaml index 710aec046e..ca17bc65b7 100644 --- a/community/examples/AMD/hpc-amd-slurm.yaml +++ b/community/examples/AMD/hpc-amd-slurm.yaml @@ -39,11 +39,16 @@ deployment_groups: settings: local_mount: /sw - - id: spack + - id: spack-setup source: community/modules/scripts/spack-install settings: install_dir: /sw/spack spack_ref: v0.18.1 + + - id: spack-execute + source: community/modules/scripts/spack-execute + use: [spack-setup] + settings: log_file: /var/log/spack.log data_files: - destination: /tmp/projections-config.yaml @@ -113,8 +118,7 @@ deployment_groups: source: modules/scripts/startup-script settings: runners: - - $(spack.install_spack_deps_runner) - - $(spack.install_spack_runner) + - $(spack-execute.spack_runner) - type: shell destination: shutdown.sh content: | @@ -128,7 +132,7 @@ deployment_groups: source: modules/scripts/startup-script settings: runners: - - $(spack.setup_spack_runner) + - $(spack-setup.setup_spack_runner) # the following installation of AOCC may be automated in the future # with a clear direction to the user to read the EULA at # https://developer.amd.com/aocc-compiler-eula/ diff --git a/community/examples/hpc-slurm-gromacs.yaml b/community/examples/hpc-slurm-gromacs.yaml index b284915c46..3f87fb02a8 100644 --- a/community/examples/hpc-slurm-gromacs.yaml +++ b/community/examples/hpc-slurm-gromacs.yaml @@ -45,10 +45,15 @@ deployment_groups: local_mount: /home ## Install Scripts - - id: spack + - id: spack-setup source: community/modules/scripts/spack-install settings: install_dir: /sw/spack + + - id: spack-execute + source: community/modules/scripts/spack-execute + use: 
[spack-setup] + settings: log_file: /var/log/spack.log data_files: - destination: /tmp/projections-config.yaml @@ -77,13 +82,6 @@ deployment_groups: spack install intel-mpi@2018.4.274%gcc@10.3.0 spack install gromacs@2023.1 %gcc@10.3.0 ^intel-mpi@2018.4.274 ^cmake@3.26.3 %gcc@4.8.5 - - id: spack-startup - source: modules/scripts/startup-script - settings: - runners: - - $(spack.install_spack_deps_runner) - - $(spack.install_spack_runner) - - id: compute_partition source: community/modules/compute/SchedMD-slurm-on-gcp-partition use: @@ -111,7 +109,7 @@ deployment_groups: - homefs - appsfs - slurm_controller - - spack-startup + - spack-execute settings: login_machine_type: c2-standard-4 login_scopes: diff --git a/community/examples/hpc-slurm-ramble-gromacs.yaml b/community/examples/hpc-slurm-ramble-gromacs.yaml index 306a2de624..d66523a394 100644 --- a/community/examples/hpc-slurm-ramble-gromacs.yaml +++ b/community/examples/hpc-slurm-ramble-gromacs.yaml @@ -35,10 +35,15 @@ deployment_groups: source: modules/network/vpc ## Install Scripts - - id: spack + - id: spack-install source: community/modules/scripts/spack-install settings: install_dir: /opt/apps/spack + + - id: spack-execute + source: community/modules/scripts/spack-execute + use: [spack-install] + settings: log_file: /var/log/spack.log commands: | # Un-comment and update mirror_url to install from spack cache @@ -55,7 +60,7 @@ deployment_groups: - id: ramble-execute source: community/modules/scripts/ramble-execute - use: [spack, ramble-setup] + use: [spack-install, ramble-setup] settings: commands: | ramble workspace create gromacs -c /opt/apps/gromacs_scaling.yaml -t /opt/apps/execute_experiment.tpl @@ -66,8 +71,7 @@ deployment_groups: source: modules/scripts/startup-script settings: runners: - - $(spack.install_spack_deps_runner) - - $(spack.install_spack_runner) + - $(spack-execute.spack_runner) - type: data destination: /opt/apps/gromacs_scaling.yaml content: | diff --git a/docs/tutorials/gromacs/spack-gromacs.yaml b/docs/tutorials/gromacs/spack-gromacs.yaml index 85c6b93fc0..b1036d5bb6 100644 --- a/docs/tutorials/gromacs/spack-gromacs.yaml +++ b/docs/tutorials/gromacs/spack-gromacs.yaml @@ -32,10 +32,15 @@ deployment_groups: source: modules/monitoring/dashboard ## Install Scripts - - id: spack + - id: spack-setup source: community/modules/scripts/spack-install settings: install_dir: /apps/spack + + - id: spack-execute + source: community/modules/scripts/spack-execute + use: [spack-setup] + settings: log_file: /var/log/spack.log data_files: - destination: /tmp/projections-config.yaml @@ -99,8 +104,7 @@ deployment_groups: source: modules/scripts/startup-script settings: runners: - - $(spack.install_spack_deps_runner) - - $(spack.install_spack_runner) + - $(spack-execute.spack_runner) - type: shell destination: setup_gromacs.sh content: | diff --git a/docs/tutorials/openfoam/spack-openfoam.yaml b/docs/tutorials/openfoam/spack-openfoam.yaml index 1cd694f9f0..973692e926 100644 --- a/docs/tutorials/openfoam/spack-openfoam.yaml +++ b/docs/tutorials/openfoam/spack-openfoam.yaml @@ -32,10 +32,15 @@ deployment_groups: source: modules/monitoring/dashboard ## Install Scripts - - id: spack + - id: spack-setup source: community/modules/scripts/spack-install settings: install_dir: /apps/spack + + - id: spack-execute + source: community/modules/scripts/spack-execute + use: [spack-setup] + settings: log_file: /var/log/spack.log data_files: - destination: /tmp/projections-config.yaml @@ -106,8 +111,7 @@ deployment_groups: source: 
modules/scripts/startup-script settings: runners: - - $(spack.install_spack_deps_runner) - - $(spack.install_spack_runner) + - $(spack-execute.spack_runner) - type: shell destination: setup_openfoam.sh content: | diff --git a/docs/tutorials/wrfv3/spack-wrfv3.yaml b/docs/tutorials/wrfv3/spack-wrfv3.yaml index 15944153ab..9bb7571054 100644 --- a/docs/tutorials/wrfv3/spack-wrfv3.yaml +++ b/docs/tutorials/wrfv3/spack-wrfv3.yaml @@ -32,10 +32,15 @@ deployment_groups: source: modules/monitoring/dashboard ## Install Scripts - - id: spack + - id: spack-setup source: community/modules/scripts/spack-install settings: install_dir: /apps/spack + + - id: spack-execute + source: community/modules/scripts/spack-execute + use: [spack-setup] + settings: log_file: /var/log/spack.log data_files: - destination: /tmp/projections-config.yaml @@ -99,8 +104,7 @@ deployment_groups: source: modules/scripts/startup-script settings: runners: - - $(spack.install_spack_deps_runner) - - $(spack.install_spack_runner) + - $(spack-execute.spack_runner) - type: shell destination: wrfv3_setup.sh content: | diff --git a/docs/videos/healthcare-and-life-sciences/hcls-blueprint.yaml b/docs/videos/healthcare-and-life-sciences/hcls-blueprint.yaml index 7e8b8a86e5..64ff214639 100644 --- a/docs/videos/healthcare-and-life-sciences/hcls-blueprint.yaml +++ b/docs/videos/healthcare-and-life-sciences/hcls-blueprint.yaml @@ -113,10 +113,15 @@ deployment_groups: ### Software ### - - id: spack + - id: spack-setup source: community/modules/scripts/spack-install settings: install_dir: /apps/spack + + - id: spack-execute + source: community/modules/scripts/spack-execute + use: [spack-setup] + settings: data_files: - destination: /tmp/projections-config.yaml content: | @@ -185,7 +190,7 @@ deployment_groups: source: modules/scripts/startup-script settings: runners: - - $(spack.install_spack_deps_runner) + - $(spack-execute.spack_runner) - type: data destination: /tmp/install_spack.sh diff --git a/examples/serverless-batch-mpi.yaml b/examples/serverless-batch-mpi.yaml index 36a6d63bae..aa0766a146 100644 --- a/examples/serverless-batch-mpi.yaml +++ b/examples/serverless-batch-mpi.yaml @@ -34,11 +34,16 @@ deployment_groups: settings: local_mount: /share - - id: spack + - id: spack-setup source: community/modules/scripts/spack-install settings: spack_ref: v0.19.0 install_dir: /share/spack + + - id: spack-execute + source: community/modules/scripts/spack-execute + use: [spack-setup] + settings: data_files: - destination: /tmp/projections-config.yaml content: | @@ -92,8 +97,7 @@ deployment_groups: source: modules/scripts/startup-script settings: runners: - - $(spack.install_spack_deps_runner) - - $(spack.install_spack_runner) + - $(spack-execute.spack_runner) - type: shell destination: wrfv3_setup.sh content: | @@ -153,7 +157,7 @@ deployment_groups: source: modules/scripts/startup-script settings: runners: - - $(spack.setup_spack_runner) + - $(spack-setup.setup_spack_runner) - id: batch-login source: modules/scheduler/batch-login-node diff --git a/tools/validate_configs/ramble.yaml b/tools/validate_configs/ramble.yaml index e0de6ed2a8..3e4b26897f 100644 --- a/tools/validate_configs/ramble.yaml +++ b/tools/validate_configs/ramble.yaml @@ -49,8 +49,7 @@ deployment_groups: source: modules/scripts/startup-script settings: runners: - - $(spack.install_spack_deps_runner) - - $(spack.install_spack_runner) + - $(spack.spack_runner) - $(ramble-execute.ramble_runner) - id: ramble-vm diff --git a/tools/validate_configs/test_configs/centos8-ss.yaml 
b/tools/validate_configs/test_configs/centos8-ss.yaml index 58bd6f769b..b6f5722bf4 100644 --- a/tools/validate_configs/test_configs/centos8-ss.yaml +++ b/tools/validate_configs/test_configs/centos8-ss.yaml @@ -41,10 +41,15 @@ deployment_groups: settings: auto_delete_disk: true - - id: spack + - id: spack-setup source: ./community//modules/scripts/spack-install settings: install_dir: /apps/spack + + - id: spack-execute + source: community/modules/scripts/spack-execute + use: [spack-setup] + settings: commands: | spack install gcc@10.3.0 target=x86_64 spack load gcc@10.3.0 target=x86_64 @@ -66,10 +71,7 @@ deployment_groups: echo $2 tar zxvf /tmp/$1 -C / args: "foo.tgz 'Expanding the file'" - - $(spack.install_spack_deps_runner) - - type: shell - content: $(spack.startup_script) - destination: "/apps/spack-install.sh" + - $(spack-execute.spack_runner) - id: instance source: ./modules/compute/vm-instance diff --git a/tools/validate_configs/test_configs/debian-ss.yaml b/tools/validate_configs/test_configs/debian-ss.yaml index 7d02949510..15fe0d40e7 100644 --- a/tools/validate_configs/test_configs/debian-ss.yaml +++ b/tools/validate_configs/test_configs/debian-ss.yaml @@ -41,10 +41,15 @@ deployment_groups: settings: auto_delete_disk: true - - id: spack + - id: spack-setup source: ./community//modules/scripts/spack-install settings: install_dir: /apps/spack + + - id: spack-execute + source: community/modules/scripts/spack-execute + use: [spack-setup] + settings: commands: | spack install gcc@10.3.0 target=x86_64 spack load gcc@10.3.0 target=x86_64 @@ -66,10 +71,7 @@ deployment_groups: echo $2 tar zxvf /tmp/$1 -C / args: "foo.tgz 'Expanding the file'" - - $(spack.install_spack_deps_runner) - - type: shell - content: $(spack.startup_script) - destination: "/apps/spack-install.sh" + - $(spack-execute.spack_runner) - id: instance source: ./modules/compute/vm-instance diff --git a/tools/validate_configs/test_configs/hpc-centos-ss.yaml b/tools/validate_configs/test_configs/hpc-centos-ss.yaml index e364b3528a..385b399c89 100644 --- a/tools/validate_configs/test_configs/hpc-centos-ss.yaml +++ b/tools/validate_configs/test_configs/hpc-centos-ss.yaml @@ -41,10 +41,15 @@ deployment_groups: settings: auto_delete_disk: true - - id: spack + - id: spack-setup source: ./community//modules/scripts/spack-install settings: install_dir: /apps/spack + + - id: spack-execute + source: community/modules/scripts/spack-execute + use: [spack-setup] + settings: commands: | spack install gcc@10.3.0 target=x86_64 spack load gcc@10.3.0 target=x86_64 @@ -66,10 +71,7 @@ deployment_groups: echo $2 tar zxvf /tmp/$1 -C / args: "foo.tgz 'Expanding the file'" - - $(spack.install_spack_deps_runner) - - type: shell - content: $(spack.startup_script) - destination: "/apps/spack-install.sh" + - $(spack-execute.spack_runner) - id: instance source: ./modules/compute/vm-instance diff --git a/tools/validate_configs/test_configs/rocky-ss.yaml b/tools/validate_configs/test_configs/rocky-ss.yaml index 8234e720d8..9852a9ee95 100644 --- a/tools/validate_configs/test_configs/rocky-ss.yaml +++ b/tools/validate_configs/test_configs/rocky-ss.yaml @@ -42,10 +42,15 @@ deployment_groups: image: rocky-linux-cloud/rocky-linux-8 auto_delete_disk: true - - id: spack + - id: spack-setup source: ./community//modules/scripts/spack-install settings: install_dir: /apps/spack + + - id: spack-execute + source: community/modules/scripts/spack-execute + use: [spack-setup] + settings: commands: | spack install gcc@10.3.0 target=x86_64 spack load gcc@10.3.0 
target=x86_64 @@ -67,10 +72,7 @@ deployment_groups: echo $2 tar zxvf /tmp/$1 -C / args: "foo.tgz 'Expanding the file'" - - $(spack.install_spack_deps_runner) - - type: shell - content: $(spack.startup_script) - destination: "/apps/spack-install.sh" + - $(spack-execute.spack_runner) - id: instance source: ./modules/compute/vm-instance diff --git a/tools/validate_configs/test_configs/spack-buildcache.yaml b/tools/validate_configs/test_configs/spack-buildcache.yaml index f70f0ee7f2..d11d108e7b 100644 --- a/tools/validate_configs/test_configs/spack-buildcache.yaml +++ b/tools/validate_configs/test_configs/spack-buildcache.yaml @@ -28,10 +28,15 @@ deployment_groups: - id: network1 source: modules/network/pre-existing-vpc - - id: spack + - id: spack-setup source: ./community/modules/scripts/spack-install settings: install_dir: /apps/spack + + - id: spack-execute + source: community/modules/scripts/spack-execute + use: [spack-setup] + settings: log_file: /var/log/spack.log data_files: - source: /path/to/local/spack_gpg_key.pub @@ -70,9 +75,7 @@ deployment_groups: - type: ansible-local source: modules/spack-install/scripts/install_spack_deps.yml destination: install_spack_deps.yml - - type: shell - content: $(spack.startup_script) - destination: install_spack.sh + - $(spack-execute.spack_runner) - type: shell destination: shutdown.sh content: shutdown -h diff --git a/tools/validate_configs/test_configs/spack-environments.yaml b/tools/validate_configs/test_configs/spack-environments.yaml index 8ca29f1fed..bcdd731d3e 100644 --- a/tools/validate_configs/test_configs/spack-environments.yaml +++ b/tools/validate_configs/test_configs/spack-environments.yaml @@ -28,12 +28,17 @@ deployment_groups: - id: network1 source: modules/network/pre-existing-vpc - - id: spack + - id: spack-setup source: ./community/modules/scripts/spack-install settings: install_dir: /apps/spack spack_url: https://github.com/spack/spack spack_ref: v0.17.1 + + - id: spack-execute + source: community/modules/scripts/spack-execute + use: [spack-setup] + settings: log_file: /var/log/spack.log data_files: - destination: /apps/spack/env_file.yaml @@ -114,9 +119,7 @@ deployment_groups: - type: ansible-local source: modules/spack-install/scripts/install_spack_deps.yml destination: install_spack_deps.yml - - type: shell - content: $(spack.startup_script) - destination: install_spack.sh + - $(spack-execute.spack_runner) - type: shell destination: shutdown.sh content: shutdown -h diff --git a/tools/validate_configs/test_configs/ubuntu-ss.yaml b/tools/validate_configs/test_configs/ubuntu-ss.yaml index ca16c98f6f..761c0d71c5 100644 --- a/tools/validate_configs/test_configs/ubuntu-ss.yaml +++ b/tools/validate_configs/test_configs/ubuntu-ss.yaml @@ -41,10 +41,15 @@ deployment_groups: settings: auto_delete_disk: true - - id: spack + - id: spack-setup source: ./community//modules/scripts/spack-install settings: install_dir: /apps/spack + + - id: spack-execute + source: community/modules/scripts/spack-execute + use: [spack-setup] + settings: commands: | spack install gcc@10.3.0 target=x86_64 spack load gcc@10.3.0 target=x86_64 @@ -70,10 +75,7 @@ deployment_groups: echo $2 tar zxvf /tmp/$1 -C / args: "foo.tgz 'Expanding the file'" - - $(spack.install_spack_deps_runner) - - type: shell - content: $(spack.startup_script) - destination: "/apps/spack-install.sh" + - $(spack-execute.spack_runner) - id: instance source: ./modules/compute/vm-instance From f30dfe4adc4bee234c72088e0e26f8c5bd7bf6b7 Mon Sep 17 00:00:00 2001 From: Eimantas Kazakevicius Date: 
Mon, 17 Jul 2023 07:33:36 +0100 Subject: [PATCH 037/144] OFE Backend: letting custom images to stay after cluster that is using it is deleted --- community/front-end/ofe/website/ghpcfe/models.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/community/front-end/ofe/website/ghpcfe/models.py b/community/front-end/ofe/website/ghpcfe/models.py index 21e3e5b722..f31d7427eb 100644 --- a/community/front-end/ofe/website/ghpcfe/models.py +++ b/community/front-end/ofe/website/ghpcfe/models.py @@ -777,7 +777,7 @@ class Cluster(CloudResource): blank=True, null=True, default=None, - on_delete=models.RESTRICT, + on_delete=models.SET_NULL ) controller_node_image = models.ForeignKey( Image, @@ -786,7 +786,7 @@ class Cluster(CloudResource): blank=True, null=True, default=None, - on_delete=models.RESTRICT + on_delete=models.SET_NULL, ) def get_access_key(self): @@ -872,7 +872,7 @@ class ClusterPartition(models.Model): blank=True, null=True, default=None, - on_delete=models.RESTRICT, + on_delete=models.SET_NULL, ) max_node_count = models.PositiveIntegerField( validators=[MinValueValidator(1)], From 59fdd331cff2b0ff56e7100892ad5f8d83dc8372 Mon Sep 17 00:00:00 2001 From: Eimantas Kazakevicius Date: Mon, 17 Jul 2023 07:39:37 +0100 Subject: [PATCH 038/144] OFE Backend: locking google compute api package version --- community/front-end/ofe/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/community/front-end/ofe/requirements.txt b/community/front-end/ofe/requirements.txt index cebf623c7a..b6e6810cd2 100644 --- a/community/front-end/ofe/requirements.txt +++ b/community/front-end/ofe/requirements.txt @@ -28,7 +28,7 @@ google-auth-httplib2==0.1.0 google-cloud-billing==1.11.0 google-cloud-core==2.3.2 google-cloud-pubsub==2.17.1 -google-cloud-compute +google-cloud-compute==1.13.0 google-cloud-storage==2.9.0 google-crc32c==1.5.0 google-resumable-media==2.5.0 From dd10c0d42c43580cea75820a08671da204dac96a Mon Sep 17 00:00:00 2001 From: Eimantas Kazakevicius Date: Mon, 17 Jul 2023 11:06:34 +0100 Subject: [PATCH 039/144] OFE Backend: syntax error fixes in cluster bluperint yaml --- .../ofe/website/ghpcfe/cluster_manager/clusterinfo.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/community/front-end/ofe/website/ghpcfe/cluster_manager/clusterinfo.py b/community/front-end/ofe/website/ghpcfe/cluster_manager/clusterinfo.py index e171364c8a..ef40c31c92 100644 --- a/community/front-end/ofe/website/ghpcfe/cluster_manager/clusterinfo.py +++ b/community/front-end/ofe/website/ghpcfe/cluster_manager/clusterinfo.py @@ -206,7 +206,6 @@ def _prepare_ghpc_partitions(self, part_uses): if part.image: yaml[-1] += ( f"""\ - source_image: {part.image} """ ) @@ -343,7 +342,7 @@ def _prepare_ghpc_yaml(self): #!/bin/bash echo "******************************************** CALLING CONTROLLER STARTUP" gsutil cp gs://{startup_bucket}/clusters/{self.cluster.id}/bootstrap_controller.sh - | bash - compute_startup_script: | + compute_startup_script: | #!/bin/bash gsutil cp gs://{startup_bucket}/clusters/{self.cluster.id}/bootstrap_compute.sh - | bash #TODO: enable_cleanup_compute: True From b24a0f8983fc728829206e446de08f40a8c96fe3 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Tue, 18 Jul 2023 15:44:44 -0700 Subject: [PATCH 040/144] Refactor `validateConfig` (#1589) * Move `validateModule` before "expand"; * Collect all errors to output; ```yaml vars: region: # do not forget to set deployment_groups: - group: primary modules: - id: arbuz source: ./arbus_store - id: nw2 
source: modules/network/vpc settings: kill.switch: $(wut.this) green: $(arbuz.gold) orange: $(vars.yarn) outputs: [gold, silver] ``` ```shell **************** building ghpc ************************ deployment_name input error, cause: Could not find source of variable Error: blueprint_name input error, cause: value is an empty string 16: vars: ^ Error: deployment variable region was not set 17: region: # do not forget to set ^ Error: failed to get info using tfconfig for terraform module at ./arbus_store: Source to module does not exist: ./arbus_store 23: source: ./arbus_store ^ Error: a setting was added that is not found in the module 30: green: $(arbuz.gold) ^ Error: a setting name contains a period, which is not supported; variable subfields cannot be set independently in a blueprint. 29: kill.switch: $(wut.this) ^ Error: a setting was added that is not found in the module 31: orange: $(vars.yarn) ^ Error: requested output was not found in the module, module: nw2 output: gold 32: outputs: [gold, silver] ^ Error: requested output was not found in the module, module: nw2 output: silver 32: outputs: [gold, silver] ^ Error: invalid module reference: wut 29: kill.switch: $(wut.this) ^ Error: module "nw2" references unknown global variable "yarn" 31: orange: $(vars.yarn) ^ Error: failed to get info using tfconfig for terraform module at ./arbus_store: Source to module does not exist: ./arbus_store 30: green: $(arbuz.gold) ^ ``` --- pkg/config/config.go | 151 ++++++--------------- pkg/config/config_test.go | 249 ++++++++++++++++++----------------- pkg/config/errors.go | 16 ++- pkg/config/expand.go | 17 ++- pkg/config/validate.go | 59 ++++----- pkg/config/validator_test.go | 30 ++--- 6 files changed, 234 insertions(+), 288 deletions(-) diff --git a/pkg/config/config.go b/pkg/config/config.go index 2202208d36..a9e674c455 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -120,7 +120,7 @@ func (g DeploymentGroup) Kind() ModuleKind { // Module return the module with the given ID func (bp *Blueprint) Module(id ModuleID) (*Module, error) { var mod *Module - bp.WalkModules(func(_ modulePath, m *Module) error { + bp.WalkModules(func(m *Module) error { if m.ID == id { mod = m } @@ -332,18 +332,16 @@ type DeploymentConfig struct { // ExpandConfig expands the yaml config in place func (dc *DeploymentConfig) ExpandConfig() error { - if err := dc.Config.checkMovedModules(); err != nil { - return err - } dc.Config.setGlobalLabels() dc.Config.addKindToModules() - if err := dc.validateConfig(); err != nil { + + if err := validateBlueprint(dc.Config); err != nil { return err } if err := dc.expand(); err != nil { return err } - if err := dc.validate(); err != nil { + if err := dc.executeValidators(); err != nil { return err } return nil @@ -395,7 +393,7 @@ func (dc *DeploymentConfig) listUnusedDeploymentVariables() []string { "deployment_name": true, } - dc.Config.WalkModules(func(_ modulePath, m *Module) error { + dc.Config.WalkModules(func(m *Module) error { for _, v := range GetUsedDeploymentVars(m.Settings.AsObject()) { usedVars[v] = true } @@ -412,20 +410,13 @@ func (dc *DeploymentConfig) listUnusedDeploymentVariables() []string { return unusedVars } -func (bp Blueprint) checkMovedModules() error { - errs := Errors{} - - bp.WalkModules(func(p modulePath, m *Module) error { - if replacement, ok := movedModules[strings.Trim(m.Source, "./")]; ok { - err := fmt.Errorf( - "a module has moved. %s has been replaced with %s. 
Please update the source in your blueprint and try again", - m.Source, replacement) - errs.At(p.Source, err) - } - return nil - }) - - return errs.OrNil() +func checkMovedModule(source string) error { + if replacement, ok := movedModules[strings.Trim(source, "./")]; ok { + return fmt.Errorf( + "a module has moved. %s has been replaced with %s. Please update the source in your blueprint and try again", + source, replacement) + } + return nil } // NewDeploymentConfig is a constructor for DeploymentConfig @@ -468,7 +459,7 @@ func (dc DeploymentConfig) ExportBlueprint(outputFilename string) error { // addKindToModules sets the kind to 'terraform' when empty. func (bp *Blueprint) addKindToModules() { - bp.WalkModules(func(_ modulePath, m *Module) error { + bp.WalkModules(func(m *Module) error { if m.Kind == UnknownKind { m.Kind = TerraformKind } @@ -476,40 +467,19 @@ func (bp *Blueprint) addKindToModules() { }) } -// checkModulesInfo ensures each module in the blueprint has known detailed -// metadata (inputs, outputs) -func (bp *Blueprint) checkModulesInfo() error { - errs := Errors{} - bp.WalkModules(func(p modulePath, m *Module) error { - _, err := modulereader.GetModuleInfo(m.Source, m.Kind.String()) - errs.At(p.Source, err) - return nil - }) - return errs.OrNil() -} - -// checkModulesAndGroups ensures: -// - all module IDs are unique across all groups -// - if deployment group kind is unknown (not explicit in blueprint), then it is -// set to th kind of the first module that has a known kind (a prior func sets -// module kind to Terraform if unset) -// - all modules must be of the same kind and all modules must be of the same -// kind as the group -// - all group names are unique and do not have illegal characters -func checkModulesAndGroups(groups []DeploymentGroup) error { +func checkModulesAndGroups(bp Blueprint) error { seenMod := map[ModuleID]bool{} - seenGroups := map[GroupName]bool{} + seenGrp := map[GroupName]bool{} errs := Errors{} - for ig := range groups { + for ig, grp := range bp.DeploymentGroups { pg := Root.Groups.At(ig) - grp := &groups[ig] errs.At(pg.Name, grp.Name.Validate()) - if seenGroups[grp.Name] { + if seenGrp[grp.Name] { errs.At(pg.Name, fmt.Errorf("%s: %s used more than once", errorMessages["duplicateGroup"], grp.Name)) } - seenGroups[grp.Name] = true + seenGrp[grp.Name] = true if len(grp.Modules) == 0 { errs.At(pg.Modules, errors.New("deployment group must have at least one module")) @@ -523,21 +493,19 @@ func checkModulesAndGroups(groups []DeploymentGroup) error { errs.At(pm.ID, fmt.Errorf("%s: %s used more than once", errorMessages["duplicateID"], mod.ID)) } seenMod[mod.ID] = true + errs.Add(validateModule(pm, mod, bp)) } } return errs.OrNil() } -// checkUsedModuleNames verifies that any used modules have valid names and +// validateModuleUseReferences verifies that any used modules exist and // are in the correct group -func checkUsedModuleNames(bp Blueprint) error { +func validateModuleUseReferences(p modulePath, mod Module, bp Blueprint) error { errs := Errors{} - bp.WalkModules(func(p modulePath, mod *Module) error { - for iu, used := range mod.Use { - errs.At(p.Use.At(iu), validateModuleReference(bp, *mod, used)) - } - return nil - }) + for iu, used := range mod.Use { + errs.At(p.Use.At(iu), validateModuleReference(bp, mod, used)) + } return errs.OrNil() } @@ -567,47 +535,18 @@ func checkBackends(bp Blueprint) error { return errs.OrNil() } -// validateConfig runs a set of simple early checks on the imported input YAML -func (dc *DeploymentConfig) 
validateConfig() error { - - if _, err := dc.Config.DeploymentName(); err != nil { - return err - } - - if err := dc.Config.checkBlueprintName(); err != nil { - return err - } - - if err := dc.validateVars(); err != nil { - return err - } - - if err := dc.Config.checkModulesInfo(); err != nil { - return err - } - - if err := checkModulesAndGroups(dc.Config.DeploymentGroups); err != nil { - return err - } - - // checkPackerGroups must come after checkModulesAndGroups, in which group - // Kind is set and aligned with module Kinds - if err := checkPackerGroups(dc.Config.DeploymentGroups); err != nil { - return err - } - - if err := checkUsedModuleNames(dc.Config); err != nil { - return err - } - - if err := checkBackends(dc.Config); err != nil { - return err - } +// validateBlueprint runs a set of simple early checks on the imported input YAML +func validateBlueprint(bp Blueprint) error { + errs := Errors{} - if err := checkModuleSettings(dc.Config); err != nil { - return err - } - return nil + _, err := bp.DeploymentName() + return errs.Add(err). + Add(bp.checkBlueprintName()). + Add(validateVars(bp.Vars)). + Add(checkModulesAndGroups(bp)). + Add(checkPackerGroups(bp.DeploymentGroups)). + Add(checkBackends(bp)). + OrNil() } // SkipValidator marks validator(s) as skipped, @@ -745,13 +684,12 @@ func IsProductOfModuleUse(v cty.Value) []ModuleID { } // WalkModules walks all modules in the blueprint and calls the walker function -func (bp *Blueprint) WalkModules(walker func(modulePath, *Module) error) error { +func (bp *Blueprint) WalkModules(walker func(*Module) error) error { for ig := range bp.DeploymentGroups { g := &bp.DeploymentGroups[ig] for im := range g.Modules { - p := Root.Groups.At(ig).Modules.At(im) m := &g.Modules[im] - if err := walker(p, m); err != nil { + if err := walker(m); err != nil { return err } } @@ -760,17 +698,14 @@ func (bp *Blueprint) WalkModules(walker func(modulePath, *Module) error) error { } // validate every module setting in the blueprint containing a reference -func checkModuleSettings(bp Blueprint) error { +func validateModuleSettingReferences(p modulePath, m Module, bp Blueprint) error { errs := Errors{} - bp.WalkModules(func(p modulePath, m *Module) error { - for k, v := range m.Settings.Items() { - for _, r := range valueReferences(v) { - // TODO: add a cty.Path suffix to the errors path for better location - errs.At(p.Settings.Dot(k), validateModuleSettingReference(bp, *m, r)) - } + for k, v := range m.Settings.Items() { + for _, r := range valueReferences(v) { + // TODO: add a cty.Path suffix to the errors path for better location + errs.At(p.Settings.Dot(k), validateModuleSettingReference(bp, m, r)) } - return nil - }) + } return errs.OrNil() } diff --git a/pkg/config/config_test.go b/pkg/config/config_test.go index 4697e6ee4c..bd1a0728d9 100644 --- a/pkg/config/config_test.go +++ b/pkg/config/config_test.go @@ -387,33 +387,37 @@ func (s *MySuite) TestExpandConfig(c *C) { } func (s *MySuite) TestCheckModulesAndGroups(c *C) { - { // Duplicate module name same group - g := DeploymentGroup{Name: "ice", Modules: []Module{ - {ID: "pony", Kind: PackerKind}, - {ID: "pony", Kind: PackerKind}, - }} - err := checkModulesAndGroups([]DeploymentGroup{g}) + pony := Module{ID: "pony", Kind: TerraformKind, Source: "./ponyshop"} + zebra := Module{ID: "zebra", Kind: PackerKind, Source: "./zebrashop"} + + setTestModuleInfo(pony, modulereader.ModuleInfo{}) + setTestModuleInfo(zebra, modulereader.ModuleInfo{}) + + { // Duplicate module id same group + g := 
DeploymentGroup{Name: "ice", Modules: []Module{pony, pony}} + err := checkModulesAndGroups(Blueprint{DeploymentGroups: []DeploymentGroup{g}}) c.Check(err, ErrorMatches, ".*pony used more than once") } - { // Duplicate module name different groups - ice := DeploymentGroup{Name: "ice", Modules: []Module{ - {ID: "pony", Kind: PackerKind}}} - fire := DeploymentGroup{Name: "fire", Modules: []Module{ - {ID: "pony", Kind: PackerKind}}} - err := checkModulesAndGroups([]DeploymentGroup{ice, fire}) + { // Duplicate module id different groups + ice := DeploymentGroup{Name: "ice", Modules: []Module{pony}} + fire := DeploymentGroup{Name: "fire", Modules: []Module{pony}} + err := checkModulesAndGroups(Blueprint{DeploymentGroups: []DeploymentGroup{ice, fire}}) c.Check(err, ErrorMatches, ".*pony used more than once") } + { // Duplicate group name + ice := DeploymentGroup{Name: "ice", Modules: []Module{pony}} + ice9 := DeploymentGroup{Name: "ice", Modules: []Module{zebra}} + err := checkModulesAndGroups(Blueprint{DeploymentGroups: []DeploymentGroup{ice, ice9}}) + c.Check(err, ErrorMatches, ".*ice used more than once") + } { // Mixing module kinds - g := DeploymentGroup{Name: "ice", Modules: []Module{ - {ID: "pony", Kind: PackerKind}, - {ID: "zebra", Kind: TerraformKind}, - }} - err := checkModulesAndGroups([]DeploymentGroup{g}) + g := DeploymentGroup{Name: "ice", Modules: []Module{pony, zebra}} + err := checkModulesAndGroups(Blueprint{DeploymentGroups: []DeploymentGroup{g}}) c.Check(err, NotNil) } { // Empty group g := DeploymentGroup{Name: "ice"} - err := checkModulesAndGroups([]DeploymentGroup{g}) + err := checkModulesAndGroups(Blueprint{DeploymentGroups: []DeploymentGroup{g}}) c.Check(err, NotNil) } } @@ -619,8 +623,7 @@ func (s *MySuite) TestImportBlueprint(c *C) { Equals, expectedSimpleBlueprint.DeploymentGroups[0].Modules[0].ID) } -func (s *MySuite) TestImportBlueprint_LabelValidation(c *C) { - dc := getDeploymentConfigForTest() +func (s *MySuite) TestValidateGlobalLabels(c *C) { labelName := "my_test_label_name" labelValue := "my-valid-label-value" @@ -629,87 +632,106 @@ func (s *MySuite) TestImportBlueprint_LabelValidation(c *C) { maxLabels := 64 - var err error - - // Simple success case - dc.Config.Vars.Set("labels", cty.MapVal(map[string]cty.Value{ - labelName: cty.StringVal(labelValue), - })) - err = dc.validateVars() - c.Assert(err, Equals, nil) - - // Succeed on empty value - dc.Config.Vars.Set("labels", cty.MapVal(map[string]cty.Value{ - labelName: cty.StringVal(""), - })) - err = dc.validateVars() - c.Assert(err, Equals, nil) - - // Succeed on lowercase international character - dc.Config.Vars.Set("labels", cty.MapVal(map[string]cty.Value{ - "ñ" + labelName: cty.StringVal("ñ"), - })) - err = dc.validateVars() - c.Assert(err, Equals, nil) - - // Succeed on case-less international character - dc.Config.Vars.Set("labels", cty.MapVal(map[string]cty.Value{ - "ƿ" + labelName: cty.StringVal("ƿ"), // Unicode 01BF, latin character "wynn" - })) - err = dc.validateVars() - c.Assert(err, Equals, nil) - - // Succeed on max number of labels - largeLabelsMap := map[string]cty.Value{} - for i := 0; i < maxLabels; i++ { - largeLabelsMap[labelName+"_"+fmt.Sprint(i)] = cty.StringVal(labelValue) - } - dc.Config.Vars.Set("labels", cty.MapVal(largeLabelsMap)) - c.Check(dc.validateVars(), IsNil) - - // Invalid label name - dc.Config.Vars.Set("labels", cty.MapVal(map[string]cty.Value{ - invalidLabelName: cty.StringVal(labelValue), - })) - err = dc.validateVars() - c.Assert(err, ErrorMatches, 
fmt.Sprintf(`.*name.*'%s: %s'.*`, - regexp.QuoteMeta(invalidLabelName), - regexp.QuoteMeta(labelValue))) - - // Invalid label value - dc.Config.Vars.Set("labels", cty.MapVal(map[string]cty.Value{ - labelName: cty.StringVal(invalidLabelValue), - })) - err = dc.validateVars() - c.Assert(err, ErrorMatches, fmt.Sprintf(`.*value.*'%s: %s'.*`, - regexp.QuoteMeta(labelName), - regexp.QuoteMeta(invalidLabelValue))) - - // Too many labels - tooManyLabelsMap := map[string]cty.Value{} - for i := 0; i < maxLabels+1; i++ { - tooManyLabelsMap[labelName+"_"+fmt.Sprint(i)] = cty.StringVal(labelValue) - } - dc.Config.Vars.Set("labels", cty.MapVal(tooManyLabelsMap)) - c.Check(dc.validateVars(), NotNil) - - // Fail on uppercase international character - dc.Config.Vars.Set("labels", cty.MapVal(map[string]cty.Value{ - labelName: cty.StringVal("Ñ"), - })) - err = dc.validateVars() - c.Assert(err, ErrorMatches, fmt.Sprintf(`.*value.*'%s: %s'.*`, - regexp.QuoteMeta(labelName), - regexp.QuoteMeta("Ñ"))) - - // Fail on empty name - dc.Config.Vars.Set("labels", cty.MapVal(map[string]cty.Value{ - "": cty.StringVal(labelValue), - })) - err = dc.validateVars() - c.Assert(err, ErrorMatches, fmt.Sprintf(`.*name.*'%s: %s'.*`, - "", - regexp.QuoteMeta(labelValue))) + { // No labels + vars := Dict{} + c.Check(validateGlobalLabels(vars), IsNil) + } + + { // Simple success case + vars := Dict{} + vars.Set("labels", cty.MapVal(map[string]cty.Value{ + labelName: cty.StringVal(labelValue), + })) + c.Check(validateGlobalLabels(vars), IsNil) + } + + { // Succeed on empty value + vars := Dict{} + vars.Set("labels", cty.MapVal(map[string]cty.Value{ + labelName: cty.StringVal(""), + })) + c.Check(validateGlobalLabels(vars), IsNil) + } + + { // Succeed on lowercase international character + vars := Dict{} + vars.Set("labels", cty.MapVal(map[string]cty.Value{ + "ñ" + labelName: cty.StringVal("ñ"), + })) + c.Check(validateGlobalLabels(vars), IsNil) + } + + { // Succeed on case-less international character + vars := Dict{} + vars.Set("labels", cty.MapVal(map[string]cty.Value{ + "ƿ" + labelName: cty.StringVal("ƿ"), // Unicode 01BF, latin character "wynn" + })) + c.Check(validateGlobalLabels(vars), IsNil) + } + + { // Succeed on max number of labels + vars := Dict{} + largeLabelsMap := map[string]cty.Value{} + for i := 0; i < maxLabels; i++ { + largeLabelsMap[labelName+"_"+fmt.Sprint(i)] = cty.StringVal(labelValue) + } + vars.Set("labels", cty.MapVal(largeLabelsMap)) + c.Check(validateGlobalLabels(vars), IsNil) + } + + { // Invalid label name + vars := Dict{} + vars.Set("labels", cty.MapVal(map[string]cty.Value{ + invalidLabelName: cty.StringVal(labelValue), + })) + err := validateGlobalLabels(vars) + c.Check(err, ErrorMatches, fmt.Sprintf(`.*name.*'%s: %s'.*`, + regexp.QuoteMeta(invalidLabelName), + regexp.QuoteMeta(labelValue))) + } + + { // Invalid label value + vars := Dict{} + vars.Set("labels", cty.MapVal(map[string]cty.Value{ + labelName: cty.StringVal(invalidLabelValue), + })) + err := validateGlobalLabels(vars) + c.Check(err, ErrorMatches, fmt.Sprintf(`.*value.*'%s: %s'.*`, + regexp.QuoteMeta(labelName), + regexp.QuoteMeta(invalidLabelValue))) + } + + { // Too many labels + vars := Dict{} + tooManyLabelsMap := map[string]cty.Value{} + for i := 0; i < maxLabels+1; i++ { + tooManyLabelsMap[labelName+"_"+fmt.Sprint(i)] = cty.StringVal(labelValue) + } + vars.Set("labels", cty.MapVal(tooManyLabelsMap)) + c.Check(validateGlobalLabels(vars), NotNil) + } + + { // Fail on uppercase international character + vars := Dict{} + 
vars.Set("labels", cty.MapVal(map[string]cty.Value{ + labelName: cty.StringVal("Ñ"), + })) + err := validateGlobalLabels(vars) + c.Check(err, ErrorMatches, fmt.Sprintf(`.*value.*'%s: %s'.*`, + regexp.QuoteMeta(labelName), + regexp.QuoteMeta("Ñ"))) + } + + { // Fail on empty name + vars := Dict{} + vars.Set("labels", cty.MapVal(map[string]cty.Value{ + "": cty.StringVal(labelValue), + })) + err := validateGlobalLabels(vars) + c.Check(err, ErrorMatches, fmt.Sprintf(`.*name.*'%s: %s'.*`, + "", + regexp.QuoteMeta(labelValue))) + } } func (s *MySuite) TestImportBlueprint_ExtraField_ThrowsError(c *C) { @@ -756,21 +778,14 @@ func (s *MySuite) TestValidationLevels(c *C) { } func (s *MySuite) TestCheckMovedModules(c *C) { - bp := Blueprint{ - DeploymentGroups: []DeploymentGroup{ - {Modules: []Module{ - {Source: "some/module/that/has/not/moved"}}}}} - // base case should not err - c.Assert(bp.checkMovedModules(), IsNil) + c.Check(checkMovedModule("some/module/that/has/not/moved"), IsNil) // embedded moved - bp.DeploymentGroups[0].Modules[0].Source = "community/modules/scheduler/cloud-batch-job" - c.Assert(bp.checkMovedModules(), NotNil) + c.Check(checkMovedModule("community/modules/scheduler/cloud-batch-job"), NotNil) // local moved - bp.DeploymentGroups[0].Modules[0].Source = "./community/modules/scheduler/cloud-batch-job" - c.Assert(bp.checkMovedModules(), NotNil) + c.Assert(checkMovedModule("./community/modules/scheduler/cloud-batch-job"), NotNil) } func (s *MySuite) TestValidatorConfigCheck(c *C) { @@ -1022,18 +1037,16 @@ func (s *MySuite) TestValidateModuleSettingReference(c *C) { c.Check(vld(bp, mod21, ModuleRef("pkr", "outPkr")), NotNil) } -func (s *MySuite) TestCheckModuleSettings(c *C) { +func (s *MySuite) TestValidateModuleSettingReferences(c *C) { m := Module{ID: "m"} m.Settings.Set("white", GlobalRef("zebra").AsExpression().AsValue()) - bp := Blueprint{ - DeploymentGroups: []DeploymentGroup{ - {Name: "g", Modules: []Module{m}}, - }} + bp := Blueprint{} + p := Root.Groups.At(0).Modules.At(0) - c.Check(checkModuleSettings(bp), NotNil) + c.Check(validateModuleSettingReferences(p, m, bp), NotNil) bp.Vars.Set("zebra", cty.StringVal("stripes")) - c.Check(checkModuleSettings(bp), IsNil) + c.Check(validateModuleSettingReferences(p, m, bp), IsNil) } func (s *MySuite) TestGroupNameValidate(c *C) { diff --git a/pkg/config/errors.go b/pkg/config/errors.go index ebe3785716..35775dad42 100644 --- a/pkg/config/errors.go +++ b/pkg/config/errors.go @@ -68,15 +68,27 @@ func (e Errors) OrNil() error { } } +func (e *Errors) addDedup(err error) { + msg := err.Error() // Do message comparison + for _, e := range e.Errors { + if msg == e.Error() { + return + } + } + e.Errors = append(e.Errors, err) +} + // Add adds an error to the Errors and returns itself func (e *Errors) Add(err error) *Errors { if err == nil { return e } if multi, ok := err.(*Errors); ok { - e.Errors = append(e.Errors, multi.Errors...) 
+ for _, c := range multi.Errors { + e.addDedup(c) + } } else { - e.Errors = append(e.Errors, err) + e.addDedup(err) } return e } diff --git a/pkg/config/expand.go b/pkg/config/expand.go index 38884fce00..c82069db09 100644 --- a/pkg/config/expand.go +++ b/pkg/config/expand.go @@ -173,7 +173,7 @@ func useModule(mod *Module, use Module) { // applyUseModules applies variables from modules listed in the "use" field // when/if applicable func (dc *DeploymentConfig) applyUseModules() error { - return dc.Config.WalkModules(func(_ modulePath, m *Module) error { + return dc.Config.WalkModules(func(m *Module) error { for _, u := range m.Use { used, err := dc.Config.Module(u) if err != nil { // should never happen @@ -223,7 +223,7 @@ func (dc *DeploymentConfig) combineLabels() { gl := mergeMaps(defaults, vars.Get(labels).AsValueMap()) vars.Set(labels, cty.ObjectVal(gl)) - dc.Config.WalkModules(func(_ modulePath, mod *Module) error { + dc.Config.WalkModules(func(mod *Module) error { combineModuleLabels(mod, *dc) return nil }) @@ -291,7 +291,7 @@ func (bp Blueprint) applyGlobalVarsInModule(mod *Module) error { // applyGlobalVariables takes any variables defined at the global level and // applies them to module settings if not already set. func (dc *DeploymentConfig) applyGlobalVariables() error { - return dc.Config.WalkModules(func(_ modulePath, mod *Module) error { + return dc.Config.WalkModules(func(mod *Module) error { return dc.Config.applyGlobalVarsInModule(mod) }) } @@ -341,8 +341,11 @@ func validateModuleSettingReference(bp Blueprint, mod Module, r Reference) error if err := validateModuleReference(bp, mod, r.Module); err != nil { return err } - tm, _ := bp.Module(r.Module) - mi := tm.InfoOrDie() + tm, _ := bp.Module(r.Module) // Shouldn't error if validateModuleReference didn't + mi, err := modulereader.GetModuleInfo(tm.Source, tm.Kind.String()) + if err != nil { + return err + } found := slices.ContainsFunc(mi.Outputs, func(o modulereader.OutputInfo) bool { return o.Name == r.Name }) if !found { return fmt.Errorf("%s: module %s did not have output %s", errorMessages["noOutput"], tm.ID, r.Name) @@ -465,7 +468,7 @@ func FindIntergroupReferences(v cty.Value, mod Module, bp Blueprint) []Reference // find all intergroup references and add them to source Module.Outputs func (bp *Blueprint) populateOutputs() { refs := map[Reference]bool{} - bp.WalkModules(func(_ modulePath, m *Module) error { + bp.WalkModules(func(m *Module) error { rs := FindIntergroupReferences(m.Settings.AsObject(), *m, *bp) for _, r := range rs { refs[r] = true @@ -473,7 +476,7 @@ func (bp *Blueprint) populateOutputs() { return nil }) - bp.WalkModules(func(_ modulePath, m *Module) error { + bp.WalkModules(func(m *Module) error { for r := range refs { if r.Module != m.ID { continue // find IGC references pointing to this module diff --git a/pkg/config/validate.go b/pkg/config/validate.go index 08326ca44f..7acca9f788 100644 --- a/pkg/config/validate.go +++ b/pkg/config/validate.go @@ -37,14 +37,6 @@ const ( maxLabels = 64 ) -// validate is the top-level function for running the validation suite. -func (dc DeploymentConfig) validate() error { - errs := Errors{} - return errs. - Add(dc.executeValidators()). 
- Add(dc.validateModules()).OrNil() -} - // performs validation of global variables func (dc DeploymentConfig) executeValidators() error { var errored, warned bool @@ -112,12 +104,12 @@ func (dc DeploymentConfig) executeValidators() error { return nil } -func validateGlobalLabels(bp Blueprint) error { - if !bp.Vars.Has("labels") { +func validateGlobalLabels(vars Dict) error { + if !vars.Has("labels") { return nil } p := Root.Vars.Dot("labels") - labels := bp.Vars.Get("labels") + labels := vars.Get("labels") ty := labels.Type() if !ty.IsObjectType() && !ty.IsMapType() { @@ -151,11 +143,11 @@ func validateGlobalLabels(bp Blueprint) error { } // validateVars checks the global variables for viable types -func (dc DeploymentConfig) validateVars() error { +func validateVars(vars Dict) error { errs := Errors{} - errs.Add(validateGlobalLabels(dc.Config)) + errs.Add(validateGlobalLabels(vars)) // Check for any nil values - for key, val := range dc.Config.Vars.Items() { + for key, val := range vars.Items() { if val.IsNull() { errs.At(Root.Vars.Dot(key), fmt.Errorf("deployment variable %s was not set", key)) } @@ -163,26 +155,31 @@ func (dc DeploymentConfig) validateVars() error { return errs.OrNil() } -func validateModule(p modulePath, m Module) error { - errs := Errors{} - if m.ID == "" { - errs.At(p.ID, fmt.Errorf(errorMessages["emptyID"])) - } +func validateModule(p modulePath, m Module, bp Blueprint) error { + // Source/Kind validations are required to pass to perform other validations if m.Source == "" { - errs.At(p.Source, fmt.Errorf(errorMessages["emptySource"])) + return BpError{p.Source, fmt.Errorf(errorMessages["emptySource"])} + } + if err := checkMovedModule(m.Source); err != nil { + return BpError{p.Source, err} } if !IsValidModuleKind(m.Kind.String()) { - errs.At(p.Kind, fmt.Errorf(errorMessages["wrongKind"])) + return BpError{p.Kind, fmt.Errorf(errorMessages["wrongKind"])} } - info, err := modulereader.GetModuleInfo(m.Source, m.Kind.kind) - if err != nil { // Can not proceed with other validations - return errs.Add(err) + if err != nil { + return BpError{p.Source, err} } + errs := Errors{} + if m.ID == "" { + errs.At(p.ID, fmt.Errorf(errorMessages["emptyID"])) + } return errs. Add(validateSettings(p, m, info)). Add(validateOutputs(p, m, info)). + Add(validateModuleUseReferences(p, m, bp)). + Add(validateModuleSettingReferences(p, m, bp)). OrNil() } @@ -200,16 +197,6 @@ func validateOutputs(p modulePath, mod Module, info modulereader.ModuleInfo) err return errs.OrNil() } -// validateModules ensures parameters set in modules are set correctly. 
-func (dc DeploymentConfig) validateModules() error { - errs := Errors{} - dc.Config.WalkModules(func(p modulePath, m *Module) error { - errs.Add(validateModule(p, *m)) - return nil - }) - return errs.OrNil() -} - type moduleVariables struct { Inputs map[string]bool Outputs map[string]bool @@ -278,7 +265,7 @@ func (dc *DeploymentConfig) testApisEnabled(c validatorConfig) error { } apis := map[string]bool{} - dc.Config.WalkModules(func(_ modulePath, m *Module) error { + dc.Config.WalkModules(func(m *Module) error { for _, api := range m.InfoOrDie().RequiredApis { apis[api] = true } @@ -378,7 +365,7 @@ func (dc *DeploymentConfig) testModuleNotUsed(c validatorConfig) error { } acc := map[string][]string{} - dc.Config.WalkModules(func(_ modulePath, m *Module) error { + dc.Config.WalkModules(func(m *Module) error { ids := m.listUnusedModules() sids := make([]string, len(ids)) for i, id := range ids { diff --git a/pkg/config/validator_test.go b/pkg/config/validator_test.go index c3dc70b99d..7355e9ca93 100644 --- a/pkg/config/validator_test.go +++ b/pkg/config/validator_test.go @@ -28,27 +28,22 @@ const ( undefinedGlobalVariableRegex = ".* was not defined$" ) -func (s *MySuite) TestValidateModules(c *C) { - dc := getDeploymentConfigForTest() - dc.validateModules() -} - func (s *MySuite) TestValidateVars(c *C) { { // Success - dc := DeploymentConfig{} - c.Check(dc.validateVars(), IsNil) + vars := Dict{} + c.Check(validateVars(vars), IsNil) } { // Fail: Nil value - dc := DeploymentConfig{} - dc.Config.Vars.Set("fork", cty.NilVal) - c.Check(dc.validateVars(), NotNil) + vars := Dict{} + vars.Set("fork", cty.NilVal) + c.Check(validateVars(vars), NotNil) } { // Fail: labels not a map - dc := DeploymentConfig{} - dc.Config.Vars.Set("labels", cty.StringVal("a_string")) - c.Check(dc.validateVars(), NotNil) + vars := Dict{} + vars.Set("labels", cty.StringVal("a_string")) + c.Check(validateVars(vars), NotNil) } } @@ -98,14 +93,15 @@ func (s *MySuite) TestValidateSettings(c *C) { func (s *MySuite) TestValidateModule(c *C) { p := Root.Groups.At(2).Modules.At(1) + dummyBp := Blueprint{} { // Catch no ID - err := validateModule(p, Module{Source: "green"}) + err := validateModule(p, Module{Source: "green"}, dummyBp) c.Check(err, NotNil) } { // Catch no Source - err := validateModule(p, Module{ID: "bond"}) + err := validateModule(p, Module{ID: "bond"}, dummyBp) c.Check(err, NotNil) } @@ -114,7 +110,7 @@ func (s *MySuite) TestValidateModule(c *C) { ID: "bond", Source: "green", Kind: ModuleKind{kind: "mean"}, - }) + }, dummyBp) c.Check(err, NotNil) } @@ -125,7 +121,7 @@ func (s *MySuite) TestValidateModule(c *C) { Kind: TerraformKind, } modulereader.SetModuleInfo(mod.Source, mod.Kind.String(), modulereader.ModuleInfo{}) - err := validateModule(p, mod) + err := validateModule(p, mod, dummyBp) c.Check(err, IsNil) } } From d16f4409a32e05047b9b2a84b20de721122ae873 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Wed, 19 Jul 2023 14:29:26 -0500 Subject: [PATCH 041/144] Add explicit zone to batch-login-node module --- modules/scheduler/batch-login-node/README.md | 1 + modules/scheduler/batch-login-node/main.tf | 1 + modules/scheduler/batch-login-node/variables.tf | 5 +++++ 3 files changed, 7 insertions(+) diff --git a/modules/scheduler/batch-login-node/README.md b/modules/scheduler/batch-login-node/README.md index 8be40bd1cb..c279309d67 100644 --- a/modules/scheduler/batch-login-node/README.md +++ b/modules/scheduler/batch-login-node/README.md @@ -116,6 +116,7 @@ limitations under the License. 
| [project\_id](#input\_project\_id) | Project in which the HPC deployment will be created | `string` | n/a | yes | | [region](#input\_region) | The region in which to create the login node | `string` | n/a | yes | | [startup\_script](#input\_startup\_script) | Startup script run before Google Cloud Batch job starts. Typically supplied by a batch-job-template module. | `string` | `null` | no | +| [zone](#input\_zone) | The zone in which to create the login node | `string` | n/a | yes | ## Outputs diff --git a/modules/scheduler/batch-login-node/main.tf b/modules/scheduler/batch-login-node/main.tf index c3fe8d5326..896a026708 100644 --- a/modules/scheduler/batch-login-node/main.tf +++ b/modules/scheduler/batch-login-node/main.tf @@ -118,6 +118,7 @@ resource "google_compute_instance_from_template" "batch_login" { name = "${var.deployment_name}-batch-login" source_instance_template = var.instance_template project = var.project_id + zone = var.zone metadata = local.login_metadata service_account { diff --git a/modules/scheduler/batch-login-node/variables.tf b/modules/scheduler/batch-login-node/variables.tf index 1794fda786..b81e844e19 100644 --- a/modules/scheduler/batch-login-node/variables.tf +++ b/modules/scheduler/batch-login-node/variables.tf @@ -29,6 +29,11 @@ variable "region" { type = string } +variable "zone" { + description = "The zone in which to create the login node" + type = string +} + variable "labels" { description = "Labels to add to the login node. Key-value pairs" type = map(string) From 9ff112fa8e52a9a7d14cb871d8d60d7e7177ac85 Mon Sep 17 00:00:00 2001 From: Rohit Ramu Date: Wed, 19 Jul 2023 20:23:21 +0000 Subject: [PATCH 042/144] Use PR labels to auto generate release notes (#1578) --- .github/release.yml | 44 +++++++++++++++++++++ .github/workflows/pr-label-validation.yml | 48 +++++++++++++++++++++++ 2 files changed, 92 insertions(+) create mode 100644 .github/release.yml create mode 100644 .github/workflows/pr-label-validation.yml diff --git a/.github/release.yml b/.github/release.yml new file mode 100644 index 0000000000..c791664188 --- /dev/null +++ b/.github/release.yml @@ -0,0 +1,44 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# For more info, see: +# https://docs.github.com/en/repositories/releasing-projects-on-github/automatically-generated-release-notes#configuration-options + +changelog: + exclude: + labels: + - chore + authors: [] + categories: + - title: Key New Features 🎉 + labels: + - key-new-features + - title: New Modules 🧱 + labels: + - new-modules + - title: Module Improvements 🛠 + labels: + - module-improvements + - title: Improvements + labels: + - improvements + - title: Deprecations + labels: + - deprecations + - title: Version Updates + labels: + - version-updates + - title: Other changes + labels: + - "*" diff --git a/.github/workflows/pr-label-validation.yml b/.github/workflows/pr-label-validation.yml new file mode 100644 index 0000000000..2c6d6a47fd --- /dev/null +++ b/.github/workflows/pr-label-validation.yml @@ -0,0 +1,48 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Dependency Review Action +# +# This Action will ensure that a label exists on a PR. +name: 'Ensure PR label exists' + +on: + pull_request: + types: + - opened + - labeled + - unlabeled + branches: + - develop + +jobs: + label: + runs-on: ubuntu-latest + permissions: + pull-requests: read + steps: + - id: check-labels + uses: mheap/github-action-required-labels@v5 + with: + mode: minimum + count: 1 + labels: "chore, key-new-features, new-modules, module-improvements, improvements, deprecations, version-updates" + message: "This PR is being prevented from merging because it is not labeled. Please add a label to this PR. Accepted labels: chore, key-new-features, new-modules, module-improvements, improvements, deprecations, version-updates" + - id: print-labels + run: | + echo "Current PR labels:" + for f in $(echo "{{steps.check-labels.outputs.labels}}" | sed "s/,/ /g") + do + echo "$f" + done From edadd4d01e1500982a1d0c38c2b16e6a59a9eea8 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Wed, 19 Jul 2023 17:06:46 -0500 Subject: [PATCH 043/144] Fix links in image building docs --- docs/image-building.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/image-building.md b/docs/image-building.md index bdea141b4b..7dd29e8297 100644 --- a/docs/image-building.md +++ b/docs/image-building.md @@ -141,8 +141,8 @@ The Toolkit supports Packer modules developed by 3rd parties -- including ones that you have developed! -- hosted via git or GitHub. 
We recommend reading the module documentation on: -- [GitHub-hosted modules and packages](https://github.com/tpdownes/hpc-toolkit/blob/packer_documentation/modules/README.md#github-hosted-modules-and-packages) -- [GitHub-hosted Packer modules](https://github.com/tpdownes/hpc-toolkit/blob/packer_documentation/modules/README.md#github-hosted-packer-modules) +- [GitHub-hosted modules and packages](https://github.com/GoogleCloudPlatform/hpc-toolkit/blob/main/modules/README.md#github-hosted-modules-and-packages) +- [GitHub-hosted Packer modules](https://github.com/GoogleCloudPlatform/hpc-toolkit/blob/main/modules/README.md#github-hosted-packer-modules) In particular, the Toolkit recommends using double-slash (`//`) notation to identify the root of the git repository. Doing so will ensure that the Packer From c056e5fe3a70a25ef7368703b00198d6374c181c Mon Sep 17 00:00:00 2001 From: Nick Stroud Date: Thu, 20 Jul 2023 14:52:53 -0700 Subject: [PATCH 044/144] Check for container api when using GKE modules --- pkg/modulereader/resreader.go | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pkg/modulereader/resreader.go b/pkg/modulereader/resreader.go index d21d200284..b49fef5c41 100644 --- a/pkg/modulereader/resreader.go +++ b/pkg/modulereader/resreader.go @@ -256,6 +256,12 @@ func defaultAPIList(source string) []string { "community/modules/scheduler/SchedMD-slurm-on-gcp-login-node": { "compute.googleapis.com", }, + "community/modules/compute/gke-node-pool": { + "container.googleapis.com", + }, + "community/modules/scheduler/gke-cluster": { + "container.googleapis.com", + }, "modules/scheduler/batch-job-template": { "batch.googleapis.com", "compute.googleapis.com", From c6781fa873706e6abd56b7119123eba1aaad3066 Mon Sep 17 00:00:00 2001 From: Nick Stroud Date: Thu, 13 Jul 2023 15:41:38 -0700 Subject: [PATCH 045/144] Make output of execute_commands more user friendly --- .../ramble-execute/templates/ramble_execute.yml.tpl | 10 +++------- .../spack-execute/templates/execute_commands.yml.tpl | 10 +++------- 2 files changed, 6 insertions(+), 14 deletions(-) diff --git a/community/modules/scripts/ramble-execute/templates/ramble_execute.yml.tpl b/community/modules/scripts/ramble-execute/templates/ramble_execute.yml.tpl index a3cf73e097..b0fac689f9 100644 --- a/community/modules/scripts/ramble-execute/templates/ramble_execute.yml.tpl +++ b/community/modules/scripts/ramble-execute/templates/ramble_execute.yml.tpl @@ -23,7 +23,7 @@ block: - name: Print commands to be executed ansible.builtin.debug: - msg: "{{ commands.split('\n') }}" + msg: "{{ commands.split('\n') | ansible.builtin.to_nice_yaml }}" - name: Execute commands ansible.builtin.shell: | @@ -33,14 +33,10 @@ echo " === Starting commands ===" {{ commands }} echo " === Finished commands ===" - } | tee -a {{ log_file }} + } 2>&1 | tee -a {{ log_file }} register: output always: - - name: Print commands output to stderr - ansible.builtin.debug: - var: output.stderr_lines - - - name: Print commands output to stdout + - name: Print commands output ansible.builtin.debug: var: output.stdout_lines diff --git a/community/modules/scripts/spack-execute/templates/execute_commands.yml.tpl b/community/modules/scripts/spack-execute/templates/execute_commands.yml.tpl index a3cf73e097..b0fac689f9 100644 --- a/community/modules/scripts/spack-execute/templates/execute_commands.yml.tpl +++ b/community/modules/scripts/spack-execute/templates/execute_commands.yml.tpl @@ -23,7 +23,7 @@ block: - name: Print commands to be executed ansible.builtin.debug: - msg: "{{ 
commands.split('\n') }}" + msg: "{{ commands.split('\n') | ansible.builtin.to_nice_yaml }}" - name: Execute commands ansible.builtin.shell: | @@ -33,14 +33,10 @@ echo " === Starting commands ===" {{ commands }} echo " === Finished commands ===" - } | tee -a {{ log_file }} + } 2>&1 | tee -a {{ log_file }} register: output always: - - name: Print commands output to stderr - ansible.builtin.debug: - var: output.stderr_lines - - - name: Print commands output to stdout + - name: Print commands output ansible.builtin.debug: var: output.stdout_lines From 7f232a02881754251e6c4e28e798f69b3e37890e Mon Sep 17 00:00:00 2001 From: Eimantas Kazakevicius Date: Fri, 21 Jul 2023 10:03:24 +0100 Subject: [PATCH 046/144] Bumping version of cryptography package for OFE --- community/front-end/ofe/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/community/front-end/ofe/requirements.txt b/community/front-end/ofe/requirements.txt index b6e6810cd2..9e879290eb 100644 --- a/community/front-end/ofe/requirements.txt +++ b/community/front-end/ofe/requirements.txt @@ -9,7 +9,7 @@ cffi==1.15.1 cfgv==3.3.1 charset-normalizer==3.1.0 click==8.1.3 -cryptography==41.0.1 +cryptography==41.0.2 decorator==5.1.1 defusedxml==0.7.1 dill==0.3.6 From 3b416a2f1369c7dc8166b1719fc166a616c9ba53 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 21 Jul 2023 10:06:10 -0500 Subject: [PATCH 047/144] Bump cryptography from 41.0.1 to 41.0.2 in /community/front-end/ofe (#1607) Bumps [cryptography](https://github.com/pyca/cryptography) from 41.0.1 to 41.0.2. - [Changelog](https://github.com/pyca/cryptography/blob/main/CHANGELOG.rst) - [Commits](https://github.com/pyca/cryptography/compare/41.0.1...41.0.2) --- updated-dependencies: - dependency-name: cryptography dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- community/front-end/ofe/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/community/front-end/ofe/requirements.txt b/community/front-end/ofe/requirements.txt index ba99fa8c1e..a93326f98c 100644 --- a/community/front-end/ofe/requirements.txt +++ b/community/front-end/ofe/requirements.txt @@ -9,7 +9,7 @@ cffi==1.15.1 cfgv==3.3.1 charset-normalizer==3.1.0 click==8.1.3 -cryptography==41.0.1 +cryptography==41.0.2 decorator==5.1.1 defusedxml==0.7.1 dill==0.3.6 From 38f365e62d61a3898a17075c1b41409824d55872 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Tue, 11 Jul 2023 16:30:46 -0500 Subject: [PATCH 048/144] Initial commit of HTCondor windows install script --- .../scripts/htcondor-install/README.md | 1 + .../modules/scripts/htcondor-install/main.tf | 5 +++ .../scripts/htcondor-install/outputs.tf | 5 +++ .../templates/install-htcondor.ps1.tftpl | 34 +++++++++++++++++++ 4 files changed, 45 insertions(+) create mode 100644 community/modules/scripts/htcondor-install/templates/install-htcondor.ps1.tftpl diff --git a/community/modules/scripts/htcondor-install/README.md b/community/modules/scripts/htcondor-install/README.md index 7ba8d16fd1..775ec6211b 100644 --- a/community/modules/scripts/htcondor-install/README.md +++ b/community/modules/scripts/htcondor-install/README.md @@ -137,4 +137,5 @@ No resources. 
| [install\_autoscaler\_deps\_runner](#output\_install\_autoscaler\_deps\_runner) | Toolkit Runner to install HTCondor autoscaler dependencies | | [install\_autoscaler\_runner](#output\_install\_autoscaler\_runner) | Toolkit Runner to install HTCondor autoscaler | | [install\_htcondor\_runner](#output\_install\_htcondor\_runner) | Runner to install HTCondor using startup-scripts | +| [windows\_startup\_ps1](#output\_windows\_startup\_ps1) | Windows PowerShell script to install HTCondor | diff --git a/community/modules/scripts/htcondor-install/main.tf b/community/modules/scripts/htcondor-install/main.tf index afd7430dd9..8648351ab2 100644 --- a/community/modules/scripts/htcondor-install/main.tf +++ b/community/modules/scripts/htcondor-install/main.tf @@ -25,6 +25,11 @@ locals { ]) } + install_htcondor_ps1 = templatefile( + "${path.module}/templates/install-htcondor.ps1.tftpl", { + condor_version = var.condor_version + }) + runner_install_autoscaler_deps = { "type" = "ansible-local" "content" = file("${path.module}/files/install-htcondor-autoscaler-deps.yml") diff --git a/community/modules/scripts/htcondor-install/outputs.tf b/community/modules/scripts/htcondor-install/outputs.tf index 3c62ef6b3b..1a0d55b7bf 100644 --- a/community/modules/scripts/htcondor-install/outputs.tf +++ b/community/modules/scripts/htcondor-install/outputs.tf @@ -19,6 +19,11 @@ output "install_htcondor_runner" { value = local.runner_install_htcondor } +output "windows_startup_ps1" { + description = "Windows PowerShell script to install HTCondor" + value = local.install_htcondor_ps1 +} + output "install_autoscaler_deps_runner" { description = "Toolkit Runner to install HTCondor autoscaler dependencies" value = local.runner_install_autoscaler_deps diff --git a/community/modules/scripts/htcondor-install/templates/install-htcondor.ps1.tftpl b/community/modules/scripts/htcondor-install/templates/install-htcondor.ps1.tftpl new file mode 100644 index 0000000000..0c6b239209 --- /dev/null +++ b/community/modules/scripts/htcondor-install/templates/install-htcondor.ps1.tftpl @@ -0,0 +1,34 @@ +Set-StrictMode -Version latest +$ErrorActionPreference = 'Stop' + +# Windows 2016 defaults to old TLS protocols, override it +[Net.ServicePointManager]::SecurityProtocol = 'Tls12' + +# do not show progress bar when running Invoke-WebRequest +$ProgressPreference = 'SilentlyContinue' + +# download C Runtime DLL necessary for HTCondor installer +Invoke-WebRequest https://aka.ms/vs/17/release/vc_redist.x64.exe -OutFile C:\vc_redist.x64.exe +Start-Process C:\vc_redist.x64.exe -Wait -ArgumentList "/norestart /quiet /log c:\vc_redist_log.txt" +Remove-Item C:\vc_redist.x64.exe + +# download HTCondor installer +%{ if condor_version == "10.*" } +Invoke-WebRequest https://research.cs.wisc.edu/htcondor/tarball/current/current/condor-Windows-x64.msi -OutFile C:\htcondor.msi +%{ else ~} +Invoke-WebRequest https://research.cs.wisc.edu/htcondor/tarball/current/${condor_version}/release/condor-${condor_version}-Windows-x64.msi -OutFile C:\htcondor.msi +%{ endif ~} +$args='/qn /l* condor-install-log.txt /i' +$args=$args + ' C:\htcondor.msi' +$args=$args + ' NEWPOOL="N"' +$args=$args + ' RUNJOBS="N"' +$args=$args + ' SUBMITJOBS="N"' +$args=$args + ' INSTALLDIR="C:\Condor"' +Start-Process msiexec.exe -Wait -ArgumentList $args +Remove-Item C:\htcondor.msi + +# remove settings from condor_config that we want to override in configuration step +Set-Content -Path "C:\Condor\condor_config" -Value (Get-Content -Path "C:\Condor\condor_config" | Select-String -Pattern 
'^CONDOR_HOST' -NotMatch) +Set-Content -Path "C:\Condor\condor_config" -Value (Get-Content -Path "C:\Condor\condor_config" | Select-String -Pattern '^INSTALL_USER' -NotMatch) +Set-Content -Path "C:\Condor\condor_config" -Value (Get-Content -Path "C:\Condor\condor_config" | Select-String -Pattern '^DAEMON_LIST' -NotMatch) +Set-Content -Path "C:\Condor\condor_config" -Value (Get-Content -Path "C:\Condor\condor_config" | Select-String -Pattern '^use SECURITY' -NotMatch) From 9885bc760bfac07ee25f206e3b731292e6b6df28 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Tue, 11 Jul 2023 16:30:47 -0500 Subject: [PATCH 049/144] Refactor condor_config generation - template using Terraform language into a single file, rather than a series of files using Ansible jinja templates - store file as a Cloud Storage object (anticipating need to generate a simple Windows startup script to download it, instead of Ansible) --- .../scheduler/htcondor-configure/README.md | 10 +- .../files/htcondor_configure.yml | 153 ++++-------------- .../scheduler/htcondor-configure/main.tf | 93 +++++++++-- .../scheduler/htcondor-configure/outputs.tf | 6 +- .../templates/htcondor.tftpl | 100 ++++++++++++ 5 files changed, 217 insertions(+), 145 deletions(-) create mode 100644 community/modules/scheduler/htcondor-configure/templates/htcondor.tftpl diff --git a/community/modules/scheduler/htcondor-configure/README.md b/community/modules/scheduler/htcondor-configure/README.md index 9c0cab51d1..10c9971448 100644 --- a/community/modules/scheduler/htcondor-configure/README.md +++ b/community/modules/scheduler/htcondor-configure/README.md @@ -214,11 +214,12 @@ limitations under the License. | Name | Source | Version | |------|--------|---------| -| [access\_point\_service\_account](#module\_access\_point\_service\_account) | terraform-google-modules/service-accounts/google | ~> 4.1 | +| [access\_point\_service\_account](#module\_access\_point\_service\_account) | terraform-google-modules/service-accounts/google | ~> 4.2 | | [address](#module\_address) | terraform-google-modules/address/google | ~> 3.0 | -| [central\_manager\_service\_account](#module\_central\_manager\_service\_account) | terraform-google-modules/service-accounts/google | ~> 4.1 | -| [execute\_point\_service\_account](#module\_execute\_point\_service\_account) | terraform-google-modules/service-accounts/google | ~> 4.1 | +| [central\_manager\_service\_account](#module\_central\_manager\_service\_account) | terraform-google-modules/service-accounts/google | ~> 4.2 | +| [execute\_point\_service\_account](#module\_execute\_point\_service\_account) | terraform-google-modules/service-accounts/google | ~> 4.2 | | [health\_check\_firewall\_rule](#module\_health\_check\_firewall\_rule) | terraform-google-modules/network/google//modules/firewall-rules | ~> 6.0 | +| [htcondor\_bucket](#module\_htcondor\_bucket) | terraform-google-modules/cloud-storage/google | ~> 4.0 | ## Resources @@ -229,6 +230,9 @@ limitations under the License. 
| [google_secret_manager_secret_iam_member.central_manager](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/secret_manager_secret_iam_member) | resource | | [google_secret_manager_secret_iam_member.execute_point](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/secret_manager_secret_iam_member) | resource | | [google_secret_manager_secret_version.pool_password](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/secret_manager_secret_version) | resource | +| [google_storage_bucket_object.ap_config](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/storage_bucket_object) | resource | +| [google_storage_bucket_object.cm_config](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/storage_bucket_object) | resource | +| [google_storage_bucket_object.execute_config](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/storage_bucket_object) | resource | | [random_password.pool](https://registry.terraform.io/providers/hashicorp/random/latest/docs/resources/password) | resource | | [google_compute_subnetwork.htcondor](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_subnetwork) | data source | diff --git a/community/modules/scheduler/htcondor-configure/files/htcondor_configure.yml b/community/modules/scheduler/htcondor-configure/files/htcondor_configure.yml index faa8b348a7..8e083bcfcd 100644 --- a/community/modules/scheduler/htcondor-configure/files/htcondor_configure.yml +++ b/community/modules/scheduler/htcondor-configure/files/htcondor_configure.yml @@ -20,24 +20,17 @@ job_queue_ha: false spool_dir: /var/lib/condor/spool condor_config_root: /etc/condor - role_file: 00-role - pool_file: 01-pool - cm_config_file: 02-central-manager - cm_ha_config_file: 02-central-manager-high-availability - schedd_config_file: 02-schedd - schedd_ha_config_file: 02-schedd-high-availability + ghpc_config_file: 50-ghpc-managed + schedd_ha_config_file: 51-ghpc-schedd-high-availability execute_config_file: 02-execute tasks: - - name: User must supply HTCondor role + - name: Ensure necessary variables are set ansible.builtin.assert: that: - - htcondor_central_manager_ips is defined - htcondor_role is defined - password_id is defined - - project_id is defined - - name: Set Trust Domain - ansible.builtin.set_fact: - trust_domain: c.{{ project_id }}.internal + - trust_domain is defined + - config_object is defined - name: Set HTCondor Pool password (token signing key) ansible.builtin.shell: | set -e -o pipefail @@ -45,30 +38,35 @@ echo -n "$POOL_PASSWORD" | sh -c "condor_store_cred add -c -i -" args: creates: "{{ condor_config_root }}/passwords.d/POOL" + executable: /bin/bash - name: Remove default HTCondor configuration ansible.builtin.file: path: "{{ condor_config_root }}/config.d/00-htcondor-9.0.config" state: absent notify: - Reload HTCondor - - name: Set HTCondor role on all hosts - ansible.builtin.copy: - dest: "{{ condor_config_root }}/config.d/{{ role_file }}" - mode: 0644 - content: | - use role:{{ htcondor_role }} - notify: - - Reload HTCondor - - name: Set HTCondor Central Manager and trust domain on all hosts - ansible.builtin.copy: - dest: "{{ condor_config_root }}/config.d/{{ pool_file }}" - mode: 0644 - content: | - CONDOR_HOST={{ htcondor_central_manager_ips }} - UID_DOMAIN={{ trust_domain }} - TRUST_DOMAIN={{ trust_domain }} - notify: - - Reload HTCondor + - name: Create Toolkit configuration file 
+ register: config_update + changed_when: config_update.rc == 137 + failed_when: config_update.rc != 0 and config_update.rc != 137 + ansible.builtin.shell: | + set -e -o pipefail + REMOTE_HASH=$(gcloud --format="value(md5_hash)" storage hash {{ config_object }}) + + CONFIG_FILE="{{ condor_config_root }}/config.d/{{ ghpc_config_file }}" + if [ -f "${CONFIG_FILE}" ]; then + LOCAL_HASH=$(gcloud --format="value(md5_hash)" storage hash "${CONFIG_FILE}") + else + LOCAL_HASH="INVALID-HASH" + fi + + if [ "${REMOTE_HASH}" != "${LOCAL_HASH}" ]; then + gcloud storage cp {{ config_object }} "${CONFIG_FILE}" + chmod 0644 "${CONFIG_FILE}" + exit 137 + fi + args: + executable: /bin/bash - name: Configure HTCondor Central Manager when: htcondor_role == 'get_htcondor_central_manager' block: @@ -81,53 +79,6 @@ -token condor@{{ trust_domain }} args: creates: "{{ condor_config_root }}/tokens.d/condor@{{ trust_domain }}" - - name: Generate list of Central Managers - ansible.builtin.set_fact: - central_manager_list: "{{ htcondor_central_manager_ips | split(',') }}" - - name: Create Central Manager standard configuration file - ansible.builtin.copy: - dest: "{{ condor_config_root }}/config.d/{{ cm_config_file }}" - mode: 0644 - content: | - COLLECTOR_UPDATE_INTERVAL=30 - NEGOTIATOR_UPDATE_INTERVAL=30 - NEGOTIATOR_DEPTH_FIRST=True - NEGOTIATOR_UPDATE_AFTER_CYCLE=True - notify: - - Reload HTCondor - - name: Create Central Manager HA configuration file - when: central_manager_list | length > 1 - ansible.builtin.copy: - dest: "{{ condor_config_root }}/config.d/{{ cm_ha_config_file }}" - mode: 0644 - content: | - # following https://htcondor.readthedocs.io/en/latest/admin-manual/high-availability.html#high-availability-of-the-central-manager - CM_LIST = \ - {{ central_manager_list[0] }}:$(SHARED_PORT_PORT), \ - {{ central_manager_list[1] }}:$(SHARED_PORT_PORT) - - HAD_USE_SHARED_PORT=True - HAD_LIST=$(CM_LIST) - - REPLICATION_USE_SHARED_PORT=True - REPLICATION_LIST=$(CM_LIST) - - HAD_USE_PRIMARY=True - HAD_CONTROLLEE=NEGOTIATOR - MASTER_NEGOTIATOR_CONTROLLER=HAD - - DAEMON_LIST=$(DAEMON_LIST), HAD, REPLICATION - HAD_USE_REPLICATION=True - MASTER_HAD_BACKOFF_CONSTANT=360 - notify: - - Restart HTCondor - - name: Remove Central Manager HA configuration file - when: central_manager_list | length == 1 - ansible.builtin.file: - path: "{{ condor_config_root }}/config.d/{{ cm_ha_config_file }}" - state: absent - notify: - - Restart HTCondor - name: Configure HTCondor SchedD when: htcondor_role == 'get_htcondor_submit' block: @@ -138,41 +89,6 @@ owner: condor group: condor mode: 0755 - - name: Create SchedD configuration file - ansible.builtin.copy: - dest: "{{ condor_config_root }}/config.d/{{ schedd_config_file }}" - mode: 0644 - content: | - SCHEDD_INTERVAL=30 - TRUST_UID_DOMAIN=True - SUBMIT_ATTRS=RunAsOwner - RunAsOwner=True - use feature:JobsHaveInstanceIDs - SYSTEM_JOB_MACHINE_ATTRS=$(SYSTEM_JOB_MACHINE_ATTRS) \ - CloudVMType CloudZone CloudInterruptible - SYSTEM_JOB_MACHINE_ATTRS_HISTORY_LENGTH=10 - SPOOL={{ spool_dir }} - use feature:ScheddCronOneShot(cloud, $(LIBEXEC)/common-cloud-attributes-google.py) - SCHEDD_CRON_cloud_PREFIX=Cloud - # the sequence of job transforms and submit requirements below set - # a default job attribute RequireSpot to False but allow the user to - # specify *only* a boolean value with +RequireSpot = True in their job - # submit file; the requirements of the job are transformed to filter - # on +RequireSpot unless job has explicit CloudInterruptible requirements - JOB_TRANSFORM_NAMES = 
SPOT_DEFAULT, SPOT_REQS - JOB_TRANSFORM_SPOT_DEFAULT @=end - DEFAULT RequireSpot False - @end - # Unless explicit, set CloudInterruptible requirements to job RequireSpot attribute - JOB_TRANSFORM_SPOT_REQS @=end - REQUIREMENTS ! unresolved(Requirements, "^CloudInterruptible$") - SET Requirements $(MY.Requirements) && (CloudInterruptible is My.RequireSpot) - @end - SUBMIT_REQUIREMENT_NAMES = REQSPOT - SUBMIT_REQUIREMENT_REQSPOT = isBoolean(RequireSpot) - SUBMIT_REQUIREMENT_REQSPOT_REASON = "Jobs must set +RequireSpot to either True or False" - notify: - - Reload HTCondor - name: Create IDTOKEN to advertise access point ansible.builtin.shell: | umask 0077 @@ -237,19 +153,6 @@ - name: Configure HTCondor StartD when: htcondor_role == 'get_htcondor_execute' block: - - name: Create StartD configuration file - ansible.builtin.copy: - dest: "{{ condor_config_root }}/config.d/{{ execute_config_file }}" - mode: 0644 - content: | - use feature:PartitionableSlot - use feature:CommonCloudAttributesGoogle("-c created-by") - UPDATE_INTERVAL=30 - TRUST_UID_DOMAIN=True - STARTER_ALLOW_RUNAS_OWNER=True - RUNBENCHMARKS=False - notify: - - Reload HTCondor - name: Create IDTOKEN to advertise execute point ansible.builtin.shell: | umask 0077 diff --git a/community/modules/scheduler/htcondor-configure/main.tf b/community/modules/scheduler/htcondor-configure/main.tf index d4d13fa2ea..ae06618713 100644 --- a/community/modules/scheduler/htcondor-configure/main.tf +++ b/community/modules/scheduler/htcondor-configure/main.tf @@ -30,50 +30,115 @@ locals { central_manager_count = var.central_manager_high_availability ? 2 : 1 central_manager_ip_names = [for i in range(local.central_manager_count) : "${var.deployment_name}-cm-ip-${i}"] - pool_password = var.pool_password == null ? 
random_password.pool.result : var.pool_password + pool_password = coalesce(var.pool_password, random_password.pool.result) + trust_domain = "c.${var.project_id}.internal" - runner_cm_role = { + cm_config = templatefile("${path.module}/templates/htcondor.tftpl", { + htcondor_role = "get_htcondor_central_manager", + central_manager_ips = module.address.addresses, + trust_domain = local.trust_domain, + spool_dir = "${var.spool_parent_dir}/spool", + }) + + execute_config = templatefile("${path.module}/templates/htcondor.tftpl", { + htcondor_role = "get_htcondor_execute", + central_manager_ips = module.address.addresses, + trust_domain = local.trust_domain, + spool_dir = "${var.spool_parent_dir}/spool", + }) + + ap_config = templatefile("${path.module}/templates/htcondor.tftpl", { + htcondor_role = "get_htcondor_submit", + central_manager_ips = module.address.addresses, + trust_domain = local.trust_domain, + spool_dir = "${var.spool_parent_dir}/spool", + }) + + + cm_object = "gs://${module.htcondor_bucket.name}/${google_storage_bucket_object.cm_config.output_name}" + runner_cm = { "type" = "ansible-local" "content" = file("${path.module}/files/htcondor_configure.yml") "destination" = "htcondor_configure.yml" "args" = join(" ", [ "-e htcondor_role=get_htcondor_central_manager", - "-e htcondor_central_manager_ips=${join(",", module.address.addresses)}", + "-e config_object=${local.cm_object}", "-e password_id=${google_secret_manager_secret.pool_password.secret_id}", - "-e project_id=${var.project_id}", + "-e trust_domain=${local.trust_domain}", ]) } - runner_access_role = { + ap_object = "gs://${module.htcondor_bucket.name}/${google_storage_bucket_object.ap_config.output_name}" + runner_access = { "type" = "ansible-local" "content" = file("${path.module}/files/htcondor_configure.yml") "destination" = "htcondor_configure.yml" "args" = join(" ", [ "-e htcondor_role=get_htcondor_submit", - "-e htcondor_central_manager_ips=${join(",", module.address.addresses)}", + "-e config_object=${local.ap_object}", + "-e password_id=${google_secret_manager_secret.pool_password.secret_id}", + "-e trust_domain=${local.trust_domain}", "-e job_queue_ha=${var.job_queue_high_availability}", "-e spool_dir=${var.spool_parent_dir}/spool", - "-e password_id=${google_secret_manager_secret.pool_password.secret_id}", - "-e project_id=${var.project_id}", ]) } - runner_execute_role = { + execute_object = "gs://${module.htcondor_bucket.name}/${google_storage_bucket_object.execute_config.output_name}" + runner_execute = { "type" = "ansible-local" "content" = file("${path.module}/files/htcondor_configure.yml") "destination" = "htcondor_configure.yml" "args" = join(" ", [ "-e htcondor_role=get_htcondor_execute", - "-e htcondor_central_manager_ips=${join(",", module.address.addresses)}", + "-e config_object=${local.execute_object}", "-e password_id=${google_secret_manager_secret.pool_password.secret_id}", - "-e project_id=${var.project_id}", + "-e trust_domain=${local.trust_domain}", + ]) + } +} + +module "htcondor_bucket" { + source = "terraform-google-modules/cloud-storage/google" + version = "~> 4.0" + + project_id = var.project_id + location = var.region + prefix = var.deployment_name + names = ["htcondor-config"] + randomize_suffix = true + labels = local.labels + + bucket_viewers = { + "htcondor-config" = join(",", [ + module.access_point_service_account.iam_email, + module.central_manager_service_account.iam_email, + module.execute_point_service_account.iam_email, ]) } + set_viewer_roles = true +} + +resource 
"google_storage_bucket_object" "cm_config" { + name = "${var.deployment_name}-cm-config" + content = local.cm_config + bucket = module.htcondor_bucket.name +} + +resource "google_storage_bucket_object" "execute_config" { + name = "${var.deployment_name}-execute-config" + content = local.execute_config + bucket = module.htcondor_bucket.name +} + +resource "google_storage_bucket_object" "ap_config" { + name = "${var.deployment_name}-ap-config" + content = local.ap_config + bucket = module.htcondor_bucket.name } module "access_point_service_account" { source = "terraform-google-modules/service-accounts/google" - version = "~> 4.1" + version = "~> 4.2" project_id = var.project_id prefix = var.deployment_name @@ -84,7 +149,7 @@ module "access_point_service_account" { module "execute_point_service_account" { source = "terraform-google-modules/service-accounts/google" - version = "~> 4.1" + version = "~> 4.2" project_id = var.project_id prefix = var.deployment_name @@ -95,7 +160,7 @@ module "execute_point_service_account" { module "central_manager_service_account" { source = "terraform-google-modules/service-accounts/google" - version = "~> 4.1" + version = "~> 4.2" project_id = var.project_id prefix = var.deployment_name diff --git a/community/modules/scheduler/htcondor-configure/outputs.tf b/community/modules/scheduler/htcondor-configure/outputs.tf index 91e31f46cc..5689f6e2e8 100644 --- a/community/modules/scheduler/htcondor-configure/outputs.tf +++ b/community/modules/scheduler/htcondor-configure/outputs.tf @@ -49,7 +49,7 @@ output "pool_password_secret_id" { output "central_manager_runner" { description = "Toolkit Runner to configure an HTCondor Central Manager" - value = local.runner_cm_role + value = local.runner_cm depends_on = [ google_secret_manager_secret_version.pool_password ] @@ -57,7 +57,7 @@ output "central_manager_runner" { output "access_point_runner" { description = "Toolkit Runner to configure an HTCondor Access Point" - value = local.runner_access_role + value = local.runner_access depends_on = [ google_secret_manager_secret_version.pool_password ] @@ -65,7 +65,7 @@ output "access_point_runner" { output "execute_point_runner" { description = "Toolkit Runner to configure an HTCondor Execute Point" - value = local.runner_execute_role + value = local.runner_execute depends_on = [ google_secret_manager_secret_version.pool_password ] diff --git a/community/modules/scheduler/htcondor-configure/templates/htcondor.tftpl b/community/modules/scheduler/htcondor-configure/templates/htcondor.tftpl new file mode 100644 index 0000000000..c5ae0175ec --- /dev/null +++ b/community/modules/scheduler/htcondor-configure/templates/htcondor.tftpl @@ -0,0 +1,100 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+# this file is managed by the HPC Toolkit; do not edit it manually
+# override settings with a higher priority (last lexically) named file
+# https://htcondor.readthedocs.io/en/latest/admin-manual/introduction-to-configuration.html?#ordered-evaluation-to-set-the-configuration
+
+use role:${htcondor_role}
+CONDOR_HOST = ${join(",", central_manager_ips)}
+UID_DOMAIN = ${trust_domain}
+TRUST_DOMAIN = ${trust_domain}
+
+%{ if htcondor_role == "get_htcondor_central_manager" ~}
+# Central Manager configuration settings
+COLLECTOR_UPDATE_INTERVAL = 30
+NEGOTIATOR_UPDATE_INTERVAL = 30
+NEGOTIATOR_DEPTH_FIRST = True
+NEGOTIATOR_UPDATE_AFTER_CYCLE = True
+%{ endif ~}
+
+%{ if htcondor_role == "get_htcondor_central_manager" && length(central_manager_ips) > 1 ~}
+# Central Manager high availability configuration settings
+# following https://htcondor.readthedocs.io/en/latest/admin-manual/high-availability.html#high-availability-of-the-central-manager
+CM_LIST = \
+  ${central_manager_ips[0]}:$(SHARED_PORT_PORT), \
+  ${central_manager_ips[1]}:$(SHARED_PORT_PORT)
+
+HAD_USE_SHARED_PORT = True
+HAD_LIST = $(CM_LIST)
+
+REPLICATION_USE_SHARED_PORT = True
+REPLICATION_LIST = $(CM_LIST)
+
+HAD_USE_PRIMARY = True
+HAD_CONTROLLEE = NEGOTIATOR
+MASTER_NEGOTIATOR_CONTROLLER = HAD
+
+DAEMON_LIST = $(DAEMON_LIST), HAD, REPLICATION
+HAD_USE_REPLICATION = True
+MASTER_HAD_BACKOFF_CONSTANT = 360
+%{ endif ~}
+
+%{ if htcondor_role == "get_htcondor_submit" ~}
+# SchedD configuration settings
+SPOOL = ${spool_dir}
+SCHEDD_INTERVAL = 30
+TRUST_UID_DOMAIN = True
+SUBMIT_ATTRS = RunAsOwner
+RunAsOwner = True
+
+# When a job matches to a machine, add machine attributes to the job for
+# condor_history (e.g. VM Instance ID)
+use feature:JobsHaveInstanceIDs
+SYSTEM_JOB_MACHINE_ATTRS = $(SYSTEM_JOB_MACHINE_ATTRS) \
+  CloudVMType CloudZone CloudInterruptible
+SYSTEM_JOB_MACHINE_ATTRS_HISTORY_LENGTH = 10
+
+# Add Cloud attributes to SchedD ClassAd
+use feature:ScheddCronOneShot(cloud, $(LIBEXEC)/common-cloud-attributes-google.py)
+SCHEDD_CRON_cloud_PREFIX = Cloud
+
+# the sequence of job transforms and submit requirements below sets
+# a default job attribute RequireSpot to False but allows the user to
+# specify *only* a boolean value with +RequireSpot = True in their job
+# submit file; the requirements of the job are transformed to filter
+# on +RequireSpot unless the job has explicit CloudInterruptible requirements
+JOB_TRANSFORM_NAMES = SPOT_DEFAULT, SPOT_REQS
+JOB_TRANSFORM_SPOT_DEFAULT @=end
+  DEFAULT RequireSpot False
+@end
+# Unless explicit, set CloudInterruptible requirements to job RequireSpot attribute
+JOB_TRANSFORM_SPOT_REQS @=end
+  REQUIREMENTS !
unresolved(Requirements, "^CloudInterruptible$") + SET Requirements $(MY.Requirements) && (CloudInterruptible is My.RequireSpot) +@end +SUBMIT_REQUIREMENT_NAMES = REQSPOT +SUBMIT_REQUIREMENT_REQSPOT = isBoolean(RequireSpot) +SUBMIT_REQUIREMENT_REQSPOT_REASON = "Jobs must set +RequireSpot to either True or False" +%{ endif ~} + +%{ if htcondor_role == "get_htcondor_execute" ~} +# StartD configuration settings +use feature:PartitionableSlot +use feature:CommonCloudAttributesGoogle("-c created-by") +UPDATE_INTERVAL = 30 +TRUST_UID_DOMAIN = True +STARTER_ALLOW_RUNAS_OWNER = True +RUNBENCHMARKS = False +%{ endif ~} From 263725d247b292eb894ec9efe1f5df22f5170eee Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Tue, 11 Jul 2023 16:30:47 -0500 Subject: [PATCH 050/144] Cleanup unparsable HTCondor example job --- community/examples/htc-htcondor.yaml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/community/examples/htc-htcondor.yaml b/community/examples/htc-htcondor.yaml index 3a6b735f6b..c02654b6bf 100644 --- a/community/examples/htc-htcondor.yaml +++ b/community/examples/htc-htcondor.yaml @@ -151,14 +151,15 @@ deployment_groups: destination: /var/tmp/helloworld.sub content: | universe = vanilla - executable = /bin/echo - arguments = "Hello, World!" + executable = /bin/sleep + arguments = 1000 output = out.\$(ClusterId).\$(ProcId) error = err.\$(ClusterId).\$(ProcId) log = log.\$(ClusterId).\$(ProcId) request_cpus = 1 request_memory = 100MB - +RequireSpot = true # if unset, defaults to false + # if unset, defaults to false + +RequireSpot = true queue - id: htcondor_access From 9110b0284c38002d5efe8944867b21a85bf4a3f6 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Tue, 11 Jul 2023 16:30:47 -0500 Subject: [PATCH 051/144] Rename condor_config template file --- community/modules/scheduler/htcondor-configure/main.tf | 6 +++--- .../templates/{htcondor.tftpl => condor_config.tftpl} | 0 2 files changed, 3 insertions(+), 3 deletions(-) rename community/modules/scheduler/htcondor-configure/templates/{htcondor.tftpl => condor_config.tftpl} (100%) diff --git a/community/modules/scheduler/htcondor-configure/main.tf b/community/modules/scheduler/htcondor-configure/main.tf index ae06618713..82f8e2c721 100644 --- a/community/modules/scheduler/htcondor-configure/main.tf +++ b/community/modules/scheduler/htcondor-configure/main.tf @@ -33,21 +33,21 @@ locals { pool_password = coalesce(var.pool_password, random_password.pool.result) trust_domain = "c.${var.project_id}.internal" - cm_config = templatefile("${path.module}/templates/htcondor.tftpl", { + cm_config = templatefile("${path.module}/templates/condor_config.tftpl", { htcondor_role = "get_htcondor_central_manager", central_manager_ips = module.address.addresses, trust_domain = local.trust_domain, spool_dir = "${var.spool_parent_dir}/spool", }) - execute_config = templatefile("${path.module}/templates/htcondor.tftpl", { + execute_config = templatefile("${path.module}/templates/condor_config.tftpl", { htcondor_role = "get_htcondor_execute", central_manager_ips = module.address.addresses, trust_domain = local.trust_domain, spool_dir = "${var.spool_parent_dir}/spool", }) - ap_config = templatefile("${path.module}/templates/htcondor.tftpl", { + ap_config = templatefile("${path.module}/templates/condor_config.tftpl", { htcondor_role = "get_htcondor_submit", central_manager_ips = module.address.addresses, trust_domain = local.trust_domain, diff --git a/community/modules/scheduler/htcondor-configure/templates/htcondor.tftpl 
b/community/modules/scheduler/htcondor-configure/templates/condor_config.tftpl similarity index 100% rename from community/modules/scheduler/htcondor-configure/templates/htcondor.tftpl rename to community/modules/scheduler/htcondor-configure/templates/condor_config.tftpl From b00e20f85282377d99f3cf6331600f4a2fc7a1d1 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Tue, 11 Jul 2023 16:30:47 -0500 Subject: [PATCH 052/144] Add boot-time Windows config script for HTCondor This script will download the execute point config object and reload it if necessary. --- .../scheduler/htcondor-configure/README.md | 1 + .../modules/scheduler/htcondor-configure/main.tf | 6 ++++++ .../scheduler/htcondor-configure/outputs.tf | 5 +++++ .../templates/download-condor-config.ps1.tftpl | 15 +++++++++++++++ 4 files changed, 27 insertions(+) create mode 100644 community/modules/scheduler/htcondor-configure/templates/download-condor-config.ps1.tftpl diff --git a/community/modules/scheduler/htcondor-configure/README.md b/community/modules/scheduler/htcondor-configure/README.md index 10c9971448..0827ab4d55 100644 --- a/community/modules/scheduler/htcondor-configure/README.md +++ b/community/modules/scheduler/htcondor-configure/README.md @@ -266,4 +266,5 @@ limitations under the License. | [execute\_point\_runner](#output\_execute\_point\_runner) | Toolkit Runner to configure an HTCondor Execute Point | | [execute\_point\_service\_account](#output\_execute\_point\_service\_account) | HTCondor Execute Point Service Account (e-mail format) | | [pool\_password\_secret\_id](#output\_pool\_password\_secret\_id) | Google Cloud Secret Manager ID containing HTCondor Pool Password | +| [windows\_startup\_ps1](#output\_windows\_startup\_ps1) | Windows PowerShell script to update HTCondor configuration file | diff --git a/community/modules/scheduler/htcondor-configure/main.tf b/community/modules/scheduler/htcondor-configure/main.tf index 82f8e2c721..e047ee207a 100644 --- a/community/modules/scheduler/htcondor-configure/main.tf +++ b/community/modules/scheduler/htcondor-configure/main.tf @@ -95,6 +95,12 @@ locals { "-e trust_domain=${local.trust_domain}", ]) } + windows_startup_ps1 = templatefile( + "${path.module}/templates/download-condor-config.ps1.tftpl", + { + config_object = local.execute_object + } + ) } module "htcondor_bucket" { diff --git a/community/modules/scheduler/htcondor-configure/outputs.tf b/community/modules/scheduler/htcondor-configure/outputs.tf index 5689f6e2e8..3d19b229f9 100644 --- a/community/modules/scheduler/htcondor-configure/outputs.tf +++ b/community/modules/scheduler/htcondor-configure/outputs.tf @@ -80,3 +80,8 @@ output "central_manager_secondary_internal_ip" { description = "Reserved internal IP address for use by failover Central Manager" value = try(module.address.addresses[1], null) } + +output "windows_startup_ps1" { + description = "Windows PowerShell script to update HTCondor configuration file" + value = local.windows_startup_ps1 +} diff --git a/community/modules/scheduler/htcondor-configure/templates/download-condor-config.ps1.tftpl b/community/modules/scheduler/htcondor-configure/templates/download-condor-config.ps1.tftpl new file mode 100644 index 0000000000..d6320f9112 --- /dev/null +++ b/community/modules/scheduler/htcondor-configure/templates/download-condor-config.ps1.tftpl @@ -0,0 +1,15 @@ +$remote_hash = gcloud --format="value(md5_hash)" storage hash ${config_object} + +# custom install location are not supported in Windows +$config_file = "C:\condor\condor_config.local" +if 
(Test-Path -Path $config_file -PathType Leaf) { + $local_hash = gcloud --format="value(md5_hash)" storage hash $config_file +} else { + $local_hash = "INVALID-HASH" +} + +if ($local_hash -cne $remote_hash) { + Write-Output "Updating condor configuration" + gcloud storage cp ${config_object} $config_file + condor_reconfig.exe +} From b3eb2d101763b7f39368e1c08cdaf5d267796d36 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Tue, 11 Jul 2023 16:30:47 -0500 Subject: [PATCH 053/144] Add support for Windows to htcondor-execute-point --- .../compute/htcondor-execute-point/README.md | 14 +++++++--- .../compute/htcondor-execute-point/main.tf | 27 +++++++++++++------ .../htcondor-execute-point/variables.tf | 8 +++++- .../htcondor-execute-point/versions.tf | 13 ++++++++- 4 files changed, 48 insertions(+), 14 deletions(-) diff --git a/community/modules/compute/htcondor-execute-point/README.md b/community/modules/compute/htcondor-execute-point/README.md index 68d327df4f..98f9453099 100644 --- a/community/modules/compute/htcondor-execute-point/README.md +++ b/community/modules/compute/htcondor-execute-point/README.md @@ -158,11 +158,14 @@ limitations under the License. | Name | Version | |------|---------| -| [terraform](#requirement\_terraform) | >= 0.13.0 | +| [terraform](#requirement\_terraform) | >= 1.0 | +| [google](#requirement\_google) | >= 4.0 | ## Providers -No providers. +| Name | Version | +|------|---------| +| [google](#provider\_google) | >= 4.0 | ## Modules @@ -173,7 +176,9 @@ No providers. ## Resources -No resources. +| Name | Type | +|------|------| +| [google_compute_image.htcondor](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_image) | data source | ## Inputs @@ -194,9 +199,10 @@ No resources. | [region](#input\_region) | The region in which HTCondor execute points will be created | `string` | n/a | yes | | [service\_account](#input\_service\_account) | Service account to attach to HTCondor execute points |
<pre>object({<br>    email = string,<br>    scopes = set(string)<br>  })</pre> | <pre>{<br>  "email": null,<br>  "scopes": [<br>    "https://www.googleapis.com/auth/cloud-platform"<br>  ]<br>}</pre>
| no | | [spot](#input\_spot) | Provision VMs using discounted Spot pricing, allowing for preemption | `bool` | `false` | no | -| [startup\_script](#input\_startup\_script) | Startup script to run at boot-time for HTCondor execute points | `string` | `null` | no | +| [startup\_script](#input\_startup\_script) | Startup script to run at boot-time for Linux HTCondor execute points | `string` | `null` | no | | [subnetwork\_self\_link](#input\_subnetwork\_self\_link) | The self link of the subnetwork HTCondor execute points will join | `string` | `null` | no | | [target\_size](#input\_target\_size) | Initial size of the HTCondor execute point pool; set to null (default) to avoid Terraform management of size. | `number` | `null` | no | +| [windows\_startup\_ps1](#input\_windows\_startup\_ps1) | Startup script to run at boot-time for Windows-based HTCondor execute points | `string` | `null` | no | | [zone](#input\_zone) | The default zone in which resources will be created | `string` | n/a | yes | ## Outputs diff --git a/community/modules/compute/htcondor-execute-point/main.tf b/community/modules/compute/htcondor-execute-point/main.tf index 091e329a8f..497b278e52 100644 --- a/community/modules/compute/htcondor-execute-point/main.tf +++ b/community/modules/compute/htcondor-execute-point/main.tf @@ -28,7 +28,12 @@ locals { } enable_oslogin = var.enable_oslogin == "INHERIT" ? {} : { enable-oslogin = lookup(local.oslogin_api_values, var.enable_oslogin, "") } - metadata = merge(var.metadata, local.network_storage_metadata, local.enable_oslogin) + is_windows_image = anytrue([for l in data.google_compute_image.htcondor.licenses : length(regexall("windows-cloud", l)) > 0]) + windows_startup_metadata = local.is_windows_image && var.windows_startup_ps1 != null ? { + windows-startup-script-ps1 = var.windows_startup_ps1 + } : {} + + metadata = merge(local.windows_startup_metadata, local.network_storage_metadata, local.enable_oslogin, var.metadata) configure_autoscaler_role = { "type" = "ansible-local" @@ -47,6 +52,13 @@ locals { hostnames = var.spot ? "${var.deployment_name}-spot-xp" : "${var.deployment_name}-xp" } + +data "google_compute_image" "htcondor" { + family = var.instance_image.family + project = var.instance_image.project +} + + module "execute_point_instance_template" { source = "terraform-google-modules/vm/google//modules/instance_template" version = "~> 8.0" @@ -58,13 +70,12 @@ module "execute_point_instance_template" { service_account = var.service_account labels = local.labels - machine_type = var.machine_type - disk_size_gb = var.disk_size_gb - preemptible = var.spot - startup_script = var.startup_script - metadata = local.metadata - source_image_family = var.instance_image.family - source_image_project = var.instance_image.project + machine_type = var.machine_type + disk_size_gb = var.disk_size_gb + preemptible = var.spot + startup_script = local.is_windows_image ? 
null : var.startup_script + metadata = local.metadata + source_image = data.google_compute_image.htcondor.self_link } module "mig" { diff --git a/community/modules/compute/htcondor-execute-point/variables.tf b/community/modules/compute/htcondor-execute-point/variables.tf index f1a21bb2c7..2ac6b4addd 100644 --- a/community/modules/compute/htcondor-execute-point/variables.tf +++ b/community/modules/compute/htcondor-execute-point/variables.tf @@ -46,7 +46,7 @@ variable "machine_type" { } variable "startup_script" { - description = "Startup script to run at boot-time for HTCondor execute points" + description = "Startup script to run at boot-time for Linux HTCondor execute points" type = string default = null } @@ -152,3 +152,9 @@ variable "disk_size_gb" { type = number default = 100 } + +variable "windows_startup_ps1" { + description = "Startup script to run at boot-time for Windows-based HTCondor execute points" + type = string + default = null +} diff --git a/community/modules/compute/htcondor-execute-point/versions.tf b/community/modules/compute/htcondor-execute-point/versions.tf index 79b6fbde47..275c483863 100644 --- a/community/modules/compute/htcondor-execute-point/versions.tf +++ b/community/modules/compute/htcondor-execute-point/versions.tf @@ -15,5 +15,16 @@ */ terraform { - required_version = ">= 0.13.0" + required_version = ">= 1.0" + + required_providers { + google = { + source = "hashicorp/google" + version = ">= 4.0" + } + } + + provider_meta "google" { + module_name = "blueprints/terraform/hpc-toolkit:htcondor-execute-point/v1.19.1" + } } From e51fa550b53387eba52a64d1d4270c3d04615608 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Tue, 11 Jul 2023 16:30:47 -0500 Subject: [PATCH 054/144] Ensure stability of HTCondor execute points Disable instance redistribution across zones during MIG updates. This ensures that the MIG manager does not proactively delete VMs with running jobs in order to create more balance throughout the region. --- .../compute/htcondor-execute-point/README.md | 1 + .../compute/htcondor-execute-point/main.tf | 20 +++++++++++++++++-- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/community/modules/compute/htcondor-execute-point/README.md b/community/modules/compute/htcondor-execute-point/README.md index 98f9453099..e521e4eaf9 100644 --- a/community/modules/compute/htcondor-execute-point/README.md +++ b/community/modules/compute/htcondor-execute-point/README.md @@ -179,6 +179,7 @@ limitations under the License. | Name | Type | |------|------| | [google_compute_image.htcondor](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_image) | data source | +| [google_compute_zones.available](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_zones) | data source | ## Inputs diff --git a/community/modules/compute/htcondor-execute-point/main.tf b/community/modules/compute/htcondor-execute-point/main.tf index 497b278e52..5c775eb7bd 100644 --- a/community/modules/compute/htcondor-execute-point/main.tf +++ b/community/modules/compute/htcondor-execute-point/main.tf @@ -52,13 +52,11 @@ locals { hostnames = var.spot ? 
"${var.deployment_name}-spot-xp" : "${var.deployment_name}-xp" } - data "google_compute_image" "htcondor" { family = var.instance_image.family project = var.instance_image.project } - module "execute_point_instance_template" { source = "terraform-google-modules/vm/google//modules/instance_template" version = "~> 8.0" @@ -78,6 +76,11 @@ module "execute_point_instance_template" { source_image = data.google_compute_image.htcondor.self_link } +data "google_compute_zones" "available" { + project = var.project_id + region = var.region +} + module "mig" { source = "terraform-google-modules/vm/google//modules/mig" version = "~> 8.0" @@ -103,4 +106,17 @@ module "mig" { host = "" enable_logging = true } + + update_policy = [{ + instance_redistribution_type = "NONE" + replacement_method = "SUBSTITUTE" + max_surge_fixed = length(data.google_compute_zones.available.names) + max_unavailable_fixed = length(data.google_compute_zones.available.names) + max_surge_percent = null + max_unavailable_percent = null + min_ready_sec = 300 + minimal_action = "REPLACE" + type = "OPPORTUNISTIC" + }] + } From eb18821bbee204565f2b6266876ccf618d5d9465 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Tue, 11 Jul 2023 16:30:47 -0500 Subject: [PATCH 055/144] Modify execute point IDTOKEN creation - Central Manager will now update a secret with an IDTOKEN scoped for advertising the StartD daemon on an execute point - Linux Execute Points no longer download the pool password and generate a token from it. Instead they directly download the IDTOKEN. Windows execute points will have this functionality added in a future commit. --- .../scheduler/htcondor-configure/README.md | 4 +- .../files/htcondor_configure.yml | 49 ++++++++++++++----- .../scheduler/htcondor-configure/main.tf | 23 ++++++++- .../scheduler/htcondor-configure/outputs.tf | 3 +- 4 files changed, 62 insertions(+), 17 deletions(-) diff --git a/community/modules/scheduler/htcondor-configure/README.md b/community/modules/scheduler/htcondor-configure/README.md index 0827ab4d55..aef3a162cd 100644 --- a/community/modules/scheduler/htcondor-configure/README.md +++ b/community/modules/scheduler/htcondor-configure/README.md @@ -225,9 +225,11 @@ limitations under the License. 
| Name | Type | |------|------| +| [google_secret_manager_secret.execute_point_idtoken](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/secret_manager_secret) | resource | | [google_secret_manager_secret.pool_password](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/secret_manager_secret) | resource | | [google_secret_manager_secret_iam_member.access_point](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/secret_manager_secret_iam_member) | resource | -| [google_secret_manager_secret_iam_member.central_manager](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/secret_manager_secret_iam_member) | resource | +| [google_secret_manager_secret_iam_member.central_manager_idtoken](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/secret_manager_secret_iam_member) | resource | +| [google_secret_manager_secret_iam_member.central_manager_password](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/secret_manager_secret_iam_member) | resource | | [google_secret_manager_secret_iam_member.execute_point](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/secret_manager_secret_iam_member) | resource | | [google_secret_manager_secret_version.pool_password](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/secret_manager_secret_version) | resource | | [google_storage_bucket_object.ap_config](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/storage_bucket_object) | resource | diff --git a/community/modules/scheduler/htcondor-configure/files/htcondor_configure.yml b/community/modules/scheduler/htcondor-configure/files/htcondor_configure.yml index 8e083bcfcd..0a3cd7da7f 100644 --- a/community/modules/scheduler/htcondor-configure/files/htcondor_configure.yml +++ b/community/modules/scheduler/htcondor-configure/files/htcondor_configure.yml @@ -32,10 +32,13 @@ - trust_domain is defined - config_object is defined - name: Set HTCondor Pool password (token signing key) + when: htcondor_role != 'get_htcondor_execute' ansible.builtin.shell: | set -e -o pipefail - POOL_PASSWORD=$(gcloud secrets versions access latest --secret={{ password_id }}) - echo -n "$POOL_PASSWORD" | sh -c "condor_store_cred add -c -i -" + TMPFILE=$(mktemp) + gcloud secrets versions access latest --out-file "$TMPFILE" --secret={{ password_id }} + condor_store_cred add -c -i "$TMPFILE" + rm -f "$TMPFILE" args: creates: "{{ condor_config_root }}/passwords.d/POOL" executable: /bin/bash @@ -79,6 +82,17 @@ -token condor@{{ trust_domain }} args: creates: "{{ condor_config_root }}/tokens.d/condor@{{ trust_domain }}" + - name: Create IDTOKEN secret for Execute Points + changed_when: true + ansible.builtin.shell: | + umask 0077 + TRUST_DOMAIN=$(condor_config_val TRUST_DOMAIN) + TMPFILE=$(mktemp) + condor_token_create -authz READ -authz ADVERTISE_MASTER \ + -authz ADVERTISE_STARTD -identity condor@{{ trust_domain }} > "$TMPFILE" + gcloud secrets versions add --data-file "$TMPFILE" {{ xp_idtoken_secret_id }} + rm -f "$TMPFILE" + when: xp_idtoken_secret_id | length > 0 - name: Configure HTCondor SchedD when: htcondor_role == 'get_htcondor_submit' block: @@ -134,7 +148,7 @@ [Unit] RequiresMountsFor={{ spool_dir }} notify: - - Reload HTCondor SystemD unit + - Reload SystemD - name: Disable SchedD high availability when: not job_queue_ha | bool block: @@ -149,20 +163,29 @@ path: 
/etc/systemd/system/condor.service.d/mount-spool.conf state: absent notify: - - Reload HTCondor SystemD unit + - Reload SystemD - name: Configure HTCondor StartD when: htcondor_role == 'get_htcondor_execute' block: - - name: Create IDTOKEN to advertise execute point - ansible.builtin.shell: | - umask 0077 - condor_token_create -authz READ -authz ADVERTISE_MASTER \ - -authz ADVERTISE_STARTD -identity condor@{{ trust_domain }} \ - -token condor@{{ trust_domain }} - args: - creates: "{{ condor_config_root }}/tokens.d/condor@{{ trust_domain }}" + - name: Create SystemD override directory for HTCondor Execute Point + ansible.builtin.file: + path: /etc/systemd/system/condor.service.d + state: directory + owner: root + group: root + mode: 0755 + - name: Fetch IDTOKEN to advertise execute point + ansible.builtin.copy: + dest: "/etc/systemd/system/condor.service.d/htcondor-token-fetcher.conf" + mode: 0644 + content: | + [Service] + ExecStartPre=gcloud secrets versions access latest --secret {{ xp_idtoken_secret_id }} \ + --out-file {{ condor_config_root }}/tokens.d/condor@{{ trust_domain }} + notify: + - Reload SystemD handlers: - - name: Reload HTCondor SystemD unit + - name: Reload SystemD ansible.builtin.systemd: daemon_reload: true - name: Restart HTCondor diff --git a/community/modules/scheduler/htcondor-configure/main.tf b/community/modules/scheduler/htcondor-configure/main.tf index e047ee207a..e161f03c0c 100644 --- a/community/modules/scheduler/htcondor-configure/main.tf +++ b/community/modules/scheduler/htcondor-configure/main.tf @@ -64,6 +64,7 @@ locals { "-e htcondor_role=get_htcondor_central_manager", "-e config_object=${local.cm_object}", "-e password_id=${google_secret_manager_secret.pool_password.secret_id}", + "-e xp_idtoken_secret_id=${google_secret_manager_secret.execute_point_idtoken.secret_id}", "-e trust_domain=${local.trust_domain}", ]) } @@ -92,6 +93,7 @@ locals { "-e htcondor_role=get_htcondor_execute", "-e config_object=${local.execute_object}", "-e password_id=${google_secret_manager_secret.pool_password.secret_id}", + "-e xp_idtoken_secret_id=${google_secret_manager_secret.execute_point_idtoken.secret_id}", "-e trust_domain=${local.trust_domain}", ]) } @@ -196,12 +198,29 @@ resource "google_secret_manager_secret_version" "pool_password" { secret_data = local.pool_password } -resource "google_secret_manager_secret_iam_member" "central_manager" { +# this secret will be populated by the Central Manager +resource "google_secret_manager_secret" "execute_point_idtoken" { + secret_id = "${var.deployment_name}-execute-point-idtoken" + + labels = local.labels + + replication { + automatic = true + } +} + +resource "google_secret_manager_secret_iam_member" "central_manager_password" { secret_id = google_secret_manager_secret.pool_password.id role = "roles/secretmanager.secretAccessor" member = module.central_manager_service_account.iam_email } +resource "google_secret_manager_secret_iam_member" "central_manager_idtoken" { + secret_id = google_secret_manager_secret.execute_point_idtoken.id + role = "roles/secretmanager.secretVersionManager" + member = module.central_manager_service_account.iam_email +} + resource "google_secret_manager_secret_iam_member" "access_point" { secret_id = google_secret_manager_secret.pool_password.id role = "roles/secretmanager.secretAccessor" @@ -209,7 +228,7 @@ resource "google_secret_manager_secret_iam_member" "access_point" { } resource "google_secret_manager_secret_iam_member" "execute_point" { - secret_id = 
google_secret_manager_secret.pool_password.id + secret_id = google_secret_manager_secret.execute_point_idtoken.id role = "roles/secretmanager.secretAccessor" member = module.execute_point_service_account.iam_email } diff --git a/community/modules/scheduler/htcondor-configure/outputs.tf b/community/modules/scheduler/htcondor-configure/outputs.tf index 3d19b229f9..6b86e1b03a 100644 --- a/community/modules/scheduler/htcondor-configure/outputs.tf +++ b/community/modules/scheduler/htcondor-configure/outputs.tf @@ -27,7 +27,8 @@ output "central_manager_service_account" { description = "HTCondor Central Manager Service Account (e-mail format)" value = module.central_manager_service_account.email depends_on = [ - google_secret_manager_secret_iam_member.central_manager, + google_secret_manager_secret_iam_member.central_manager_idtoken, + google_secret_manager_secret_iam_member.central_manager_password, module.central_manager_service_account ] } From 19645294c921577cb60b25f63fb307d41faf585c Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Tue, 11 Jul 2023 16:30:48 -0500 Subject: [PATCH 056/144] Include GCESysPrep in Windows image shutdown script --- modules/packer/custom-image/image.pkr.hcl | 5 ++++- .../expectations/igc_pkr/one/image/image.pkr.hcl | 5 ++++- .../expectations/text_escape/zero/lime/image.pkr.hcl | 5 ++++- 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/modules/packer/custom-image/image.pkr.hcl b/modules/packer/custom-image/image.pkr.hcl index fa9a372cc5..c37ff0b941 100644 --- a/modules/packer/custom-image/image.pkr.hcl +++ b/modules/packer/custom-image/image.pkr.hcl @@ -47,7 +47,10 @@ locals { windows_packer_user = "packer_user" windows_user_metadata = { sysprep-specialize-script-cmd = "winrm quickconfig -quiet & net user /add ${local.windows_packer_user} & net localgroup administrators ${local.windows_packer_user} /add & winrm set winrm/config/service/auth @{Basic=\\\"true\\\"}" - windows-shutdown-script-cmd = "net user /delete ${local.windows_packer_user}" + windows-shutdown-script-cmd = <<-EOT + net user /delete ${local.windows_packer_user} + GCESysprep -no_shutdown + EOT } user_metadata = local.communicator == "winrm" ? local.windows_user_metadata : local.linux_user_metadata diff --git a/tools/validate_configs/golden_copies/expectations/igc_pkr/one/image/image.pkr.hcl b/tools/validate_configs/golden_copies/expectations/igc_pkr/one/image/image.pkr.hcl index fa9a372cc5..c37ff0b941 100644 --- a/tools/validate_configs/golden_copies/expectations/igc_pkr/one/image/image.pkr.hcl +++ b/tools/validate_configs/golden_copies/expectations/igc_pkr/one/image/image.pkr.hcl @@ -47,7 +47,10 @@ locals { windows_packer_user = "packer_user" windows_user_metadata = { sysprep-specialize-script-cmd = "winrm quickconfig -quiet & net user /add ${local.windows_packer_user} & net localgroup administrators ${local.windows_packer_user} /add & winrm set winrm/config/service/auth @{Basic=\\\"true\\\"}" - windows-shutdown-script-cmd = "net user /delete ${local.windows_packer_user}" + windows-shutdown-script-cmd = <<-EOT + net user /delete ${local.windows_packer_user} + GCESysprep -no_shutdown + EOT } user_metadata = local.communicator == "winrm" ? 
local.windows_user_metadata : local.linux_user_metadata diff --git a/tools/validate_configs/golden_copies/expectations/text_escape/zero/lime/image.pkr.hcl b/tools/validate_configs/golden_copies/expectations/text_escape/zero/lime/image.pkr.hcl index fa9a372cc5..c37ff0b941 100644 --- a/tools/validate_configs/golden_copies/expectations/text_escape/zero/lime/image.pkr.hcl +++ b/tools/validate_configs/golden_copies/expectations/text_escape/zero/lime/image.pkr.hcl @@ -47,7 +47,10 @@ locals { windows_packer_user = "packer_user" windows_user_metadata = { sysprep-specialize-script-cmd = "winrm quickconfig -quiet & net user /add ${local.windows_packer_user} & net localgroup administrators ${local.windows_packer_user} /add & winrm set winrm/config/service/auth @{Basic=\\\"true\\\"}" - windows-shutdown-script-cmd = "net user /delete ${local.windows_packer_user}" + windows-shutdown-script-cmd = <<-EOT + net user /delete ${local.windows_packer_user} + GCESysprep -no_shutdown + EOT } user_metadata = local.communicator == "winrm" ? local.windows_user_metadata : local.linux_user_metadata From 23f536346a010e1bea3eaf868530661583307188 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Tue, 11 Jul 2023 16:30:48 -0500 Subject: [PATCH 057/144] Add boot-time IDTOKEN support to Windows-based HTCondor execute points --- .../modules/scheduler/htcondor-configure/main.tf | 4 +++- .../templates/download-condor-config.ps1.tftpl | 13 ++++++++----- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/community/modules/scheduler/htcondor-configure/main.tf b/community/modules/scheduler/htcondor-configure/main.tf index e161f03c0c..72d6fa6650 100644 --- a/community/modules/scheduler/htcondor-configure/main.tf +++ b/community/modules/scheduler/htcondor-configure/main.tf @@ -100,7 +100,9 @@ locals { windows_startup_ps1 = templatefile( "${path.module}/templates/download-condor-config.ps1.tftpl", { - config_object = local.execute_object + config_object = local.execute_object, + trust_domain = local.trust_domain, + xp_idtoken_secret_id = google_secret_manager_secret.execute_point_idtoken.secret_id, } ) } diff --git a/community/modules/scheduler/htcondor-configure/templates/download-condor-config.ps1.tftpl b/community/modules/scheduler/htcondor-configure/templates/download-condor-config.ps1.tftpl index d6320f9112..3c5359f3ee 100644 --- a/community/modules/scheduler/htcondor-configure/templates/download-condor-config.ps1.tftpl +++ b/community/modules/scheduler/htcondor-configure/templates/download-condor-config.ps1.tftpl @@ -1,15 +1,18 @@ +gcloud secrets versions access latest --secret ${xp_idtoken_secret_id} ` + --out-file C:\condor\tokens.d\condor@${trust_domain} + $remote_hash = gcloud --format="value(md5_hash)" storage hash ${config_object} # custom install location are not supported in Windows $config_file = "C:\condor\condor_config.local" if (Test-Path -Path $config_file -PathType Leaf) { - $local_hash = gcloud --format="value(md5_hash)" storage hash $config_file + $local_hash = gcloud --format="value(md5_hash)" storage hash $config_file } else { - $local_hash = "INVALID-HASH" + $local_hash = "INVALID-HASH" } if ($local_hash -cne $remote_hash) { - Write-Output "Updating condor configuration" - gcloud storage cp ${config_object} $config_file - condor_reconfig.exe + Write-Output "Updating condor configuration" + gcloud storage cp ${config_object} $config_file + Restart-Service condor } From c40d08928d8699c879ab38c4fc4379966abfbe00 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Tue, 11 Jul 2023 16:30:48 -0500 Subject: 
[PATCH 058/144] Add Cloud hook configuration to Windows-based HTCondor execute points --- .../download-condor-config.ps1.tftpl | 12 ++++++--- .../templates/install-htcondor.ps1.tftpl | 25 +++++++++++++------ 2 files changed, 26 insertions(+), 11 deletions(-) diff --git a/community/modules/scheduler/htcondor-configure/templates/download-condor-config.ps1.tftpl b/community/modules/scheduler/htcondor-configure/templates/download-condor-config.ps1.tftpl index 3c5359f3ee..2328d41a24 100644 --- a/community/modules/scheduler/htcondor-configure/templates/download-condor-config.ps1.tftpl +++ b/community/modules/scheduler/htcondor-configure/templates/download-condor-config.ps1.tftpl @@ -1,16 +1,22 @@ +# obtain IDTOKEN for authentication by StartD to Central Manager gcloud secrets versions access latest --secret ${xp_idtoken_secret_id} ` --out-file C:\condor\tokens.d\condor@${trust_domain} -$remote_hash = gcloud --format="value(md5_hash)" storage hash ${config_object} +# create directory for local condor_config customizations +$config_dir = 'C:\Condor\config' +if(!(test-path -PathType container -Path $config_dir)) { + New-Item -ItemType Directory -Path $config_dir +} -# custom install location are not supported in Windows -$config_file = "C:\condor\condor_config.local" +# update local condor_config if blueprint has changed +$config_file = "$config_dir\50-ghpc-managed" if (Test-Path -Path $config_file -PathType Leaf) { $local_hash = gcloud --format="value(md5_hash)" storage hash $config_file } else { $local_hash = "INVALID-HASH" } +$remote_hash = gcloud --format="value(md5_hash)" storage hash ${config_object} if ($local_hash -cne $remote_hash) { Write-Output "Updating condor configuration" gcloud storage cp ${config_object} $config_file diff --git a/community/modules/scripts/htcondor-install/templates/install-htcondor.ps1.tftpl b/community/modules/scripts/htcondor-install/templates/install-htcondor.ps1.tftpl index 0c6b239209..822a9c5ae8 100644 --- a/community/modules/scripts/htcondor-install/templates/install-htcondor.ps1.tftpl +++ b/community/modules/scripts/htcondor-install/templates/install-htcondor.ps1.tftpl @@ -8,27 +8,36 @@ $ErrorActionPreference = 'Stop' $ProgressPreference = 'SilentlyContinue' # download C Runtime DLL necessary for HTCondor installer -Invoke-WebRequest https://aka.ms/vs/17/release/vc_redist.x64.exe -OutFile C:\vc_redist.x64.exe -Start-Process C:\vc_redist.x64.exe -Wait -ArgumentList "/norestart /quiet /log c:\vc_redist_log.txt" -Remove-Item C:\vc_redist.x64.exe +$runtime_installer = 'C:\vc_redist.x64.exe' +Invoke-WebRequest https://aka.ms/vs/17/release/vc_redist.x64.exe -OutFile "$runtime_installer" +Start-Process -FilePath "$runtime_installer" -Wait -ArgumentList "/norestart /quiet /log c:\vc_redist_log.txt" +Remove-Item "$runtime_installer" # download HTCondor installer +$htcondor_installer = 'C:\htcondor.msi' %{ if condor_version == "10.*" } -Invoke-WebRequest https://research.cs.wisc.edu/htcondor/tarball/current/current/condor-Windows-x64.msi -OutFile C:\htcondor.msi +Invoke-WebRequest https://research.cs.wisc.edu/htcondor/tarball/current/current/condor-Windows-x64.msi -OutFile "$htcondor_installer" %{ else ~} -Invoke-WebRequest https://research.cs.wisc.edu/htcondor/tarball/current/${condor_version}/release/condor-${condor_version}-Windows-x64.msi -OutFile C:\htcondor.msi +Invoke-WebRequest https://research.cs.wisc.edu/htcondor/tarball/current/${condor_version}/release/condor-${condor_version}-Windows-x64.msi -OutFile "$htcondor_installer" %{ endif ~} $args='/qn /l* 
condor-install-log.txt /i' -$args=$args + ' C:\htcondor.msi' +$args=$args + " $htcondor_installer" $args=$args + ' NEWPOOL="N"' $args=$args + ' RUNJOBS="N"' $args=$args + ' SUBMITJOBS="N"' $args=$args + ' INSTALLDIR="C:\Condor"' -Start-Process msiexec.exe -Wait -ArgumentList $args -Remove-Item C:\htcondor.msi +Start-Process "msiexec.exe" -Wait -ArgumentList "$args" +Remove-Item "$htcondor_installer" # remove settings from condor_config that we want to override in configuration step Set-Content -Path "C:\Condor\condor_config" -Value (Get-Content -Path "C:\Condor\condor_config" | Select-String -Pattern '^CONDOR_HOST' -NotMatch) Set-Content -Path "C:\Condor\condor_config" -Value (Get-Content -Path "C:\Condor\condor_config" | Select-String -Pattern '^INSTALL_USER' -NotMatch) Set-Content -Path "C:\Condor\condor_config" -Value (Get-Content -Path "C:\Condor\condor_config" | Select-String -Pattern '^DAEMON_LIST' -NotMatch) Set-Content -Path "C:\Condor\condor_config" -Value (Get-Content -Path "C:\Condor\condor_config" | Select-String -Pattern '^use SECURITY' -NotMatch) + +$python_installer = 'C:\python-installer.exe' +Invoke-WebRequest -Uri "https://www.python.org/ftp/python/3.11.4/python-3.11.4-amd64.exe" -OutFile "$python_installer" +Start-Process -FilePath "$python_installer" -Wait -ArgumentList '/quiet InstallAllUsers=1 PrependPath=1 Include_test=0' +Start-Process "py.exe" -Wait -ArgumentList "-3.11 -m pip install --no-warn-script-location requests" +Invoke-WebRequest -Uri "https://raw.githubusercontent.com/htcondor/htcondor/main/src/condor_scripts/common-cloud-attributes-google.py" -OutFile "C:\Condor\bin\common-cloud-attributes-google.py" +Remove-Item "$python_installer" From e4733869578a9a94b96652ede205e401b1de96f7 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Tue, 11 Jul 2023 16:30:48 -0500 Subject: [PATCH 059/144] Improve timestamp in HTCondor autoscaler logs --- community/modules/scripts/htcondor-install/files/autoscaler.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/community/modules/scripts/htcondor-install/files/autoscaler.py b/community/modules/scripts/htcondor-install/files/autoscaler.py index e1392dadf5..8e993ac1cb 100644 --- a/community/modules/scripts/htcondor-install/files/autoscaler.py +++ b/community/modules/scripts/htcondor-install/files/autoscaler.py @@ -21,6 +21,7 @@ from absl import app from absl import flags from collections import OrderedDict +from datetime import datetime from pprint import pprint from googleapiclient import discovery from oauth2client.client import GoogleCredentials @@ -224,7 +225,7 @@ def scale(self): print(f"The negotiator has not yet started a match cycle. 
Exiting auto-scaling.") exit() - print(f"Last negotiation cycle occurred at: {last_negotiation_cycle_time}") + print(f"Last negotiation cycle occurred at: {datetime.fromtimestamp(last_negotiation_cycle_time)}") idle_job_query = classad.ExprTree(f"JobStatus == 1 && QDate < {last_negotiation_cycle_time}") idle_job_ads = schedd.query(constraint=idle_job_query.and_(spot_query), projection=job_attributes) From ee15eb0146a9a48df668d03fd6c00f4f964cc935 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Tue, 11 Jul 2023 16:30:48 -0500 Subject: [PATCH 060/144] Select HTCondor node name attribute in Windows and Linux compatible fashion --- .../modules/scripts/htcondor-install/files/autoscaler.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/community/modules/scripts/htcondor-install/files/autoscaler.py b/community/modules/scripts/htcondor-install/files/autoscaler.py index 8e993ac1cb..462b45e52d 100644 --- a/community/modules/scripts/htcondor-install/files/autoscaler.py +++ b/community/modules/scripts/htcondor-install/files/autoscaler.py @@ -276,11 +276,11 @@ def scale(self): constraint=filter_idle_vms.and_(filter_mig), projection=["Machine", "CloudZone"]) - NODENAME_ATTRIBUTE = "UtsnameNodename" + NODENAME_ATTRIBUTE = "Machine" claimed_node_ads = coll.query(htcondor.AdTypes.Startd, constraint=filter_claimed_vms.and_(filter_mig), projection=[NODENAME_ATTRIBUTE]) - claimed_nodes = [ ad[NODENAME_ATTRIBUTE] for ad in claimed_node_ads] + claimed_nodes = [ ad[NODENAME_ATTRIBUTE].split(".")[0] for ad in claimed_node_ads] # treat OrderedDict as a set by ignoring key values; this set will # contain VMs we would consider deleting, in inverse order of From 1362947a81fdc4e9cd3e1594102a87e30f03652b Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Tue, 11 Jul 2023 16:31:40 -0500 Subject: [PATCH 061/144] Rename htcondor-configure to htcondor-base --- community/examples/htc-htcondor.yaml | 2 +- community/modules/compute/htcondor-execute-point/README.md | 4 ++-- .../{htcondor-configure => htcondor-base}/README.md | 6 +++--- .../files/htcondor_configure.yml | 0 .../scheduler/{htcondor-configure => htcondor-base}/main.tf | 2 +- .../{htcondor-configure => htcondor-base}/outputs.tf | 0 .../templates/condor_config.tftpl | 0 .../templates/download-condor-config.ps1.tftpl | 0 .../{htcondor-configure => htcondor-base}/variables.tf | 0 .../{htcondor-configure => htcondor-base}/versions.tf | 2 +- community/modules/scripts/htcondor-install/README.md | 6 +++--- docs/gpu-support.md | 4 ++-- docs/vm-images.md | 2 +- modules/README.md | 6 +++--- pkg/modulereader/resreader.go | 2 +- 15 files changed, 18 insertions(+), 18 deletions(-) rename community/modules/scheduler/{htcondor-configure => htcondor-base}/README.md (98%) rename community/modules/scheduler/{htcondor-configure => htcondor-base}/files/htcondor_configure.yml (100%) rename community/modules/scheduler/{htcondor-configure => htcondor-base}/main.tf (99%) rename community/modules/scheduler/{htcondor-configure => htcondor-base}/outputs.tf (100%) rename community/modules/scheduler/{htcondor-configure => htcondor-base}/templates/condor_config.tftpl (100%) rename community/modules/scheduler/{htcondor-configure => htcondor-base}/templates/download-condor-config.ps1.tftpl (100%) rename community/modules/scheduler/{htcondor-configure => htcondor-base}/variables.tf (100%) rename community/modules/scheduler/{htcondor-configure => htcondor-base}/versions.tf (91%) diff --git a/community/examples/htc-htcondor.yaml b/community/examples/htc-htcondor.yaml index 
c02654b6bf..5128f2ad2b 100644 --- a/community/examples/htc-htcondor.yaml +++ b/community/examples/htc-htcondor.yaml @@ -60,7 +60,7 @@ deployment_groups: - group: pool modules: - id: htcondor_configure - source: community/modules/scheduler/htcondor-configure + source: community/modules/scheduler/htcondor-base use: - network1 diff --git a/community/modules/compute/htcondor-execute-point/README.md b/community/modules/compute/htcondor-execute-point/README.md index e521e4eaf9..897556bc82 100644 --- a/community/modules/compute/htcondor-execute-point/README.md +++ b/community/modules/compute/htcondor-execute-point/README.md @@ -6,11 +6,11 @@ This module performs the following tasks: - create a managed instance group (MIG) for execute points - create a Toolkit runner to configure the autoscaler to scale the MIG -It is expected to be used with the [htcondor-install] and [htcondor-configure] +It is expected to be used with the [htcondor-install] and [htcondor-base] modules. [htcondor-install]: ../../scripts/htcondor-install/README.md -[htcondor-configure]: ../../scheduler/htcondor-configure/README.md +[htcondor-base]: ../../scheduler/htcondor-configure/README.md ### Known limitations diff --git a/community/modules/scheduler/htcondor-configure/README.md b/community/modules/scheduler/htcondor-base/README.md similarity index 98% rename from community/modules/scheduler/htcondor-configure/README.md rename to community/modules/scheduler/htcondor-base/README.md index aef3a162cd..fe09bb52fa 100644 --- a/community/modules/scheduler/htcondor-configure/README.md +++ b/community/modules/scheduler/htcondor-base/README.md @@ -12,7 +12,7 @@ It is expected to be used with the [htcondor-install] and [htcondor-execute-point] modules. [hpcvmimage]: https://cloud.google.com/compute/docs/instances/create-hpc-vm -[htcondor-install]: ../../scripts/htcondor-configure/README.md +[htcondor-install]: ../../scripts/htcondor-base/README.md [htcondor-execute-point]: ../../compute/htcondor-execute-point/README.md [htcrole]: https://htcondor.readthedocs.io/en/latest/getting-htcondor/admin-quick-start.html#what-get-htcondor-does-to-configure-a-role @@ -33,7 +33,7 @@ example can be found in the [examples README][htc-example]. source: community/modules/scripts/htcondor-install - id: htcondor_configure - source: community/modules/scheduler/htcondor-configure + source: community/modules/scheduler/htcondor-base use: - network1 @@ -94,7 +94,7 @@ vars: zone_secondary: us-central1-f - id: htcondor_configure - source: community/modules/scheduler/htcondor-configure + source: community/modules/scheduler/htcondor-base use: - network1 settings: diff --git a/community/modules/scheduler/htcondor-configure/files/htcondor_configure.yml b/community/modules/scheduler/htcondor-base/files/htcondor_configure.yml similarity index 100% rename from community/modules/scheduler/htcondor-configure/files/htcondor_configure.yml rename to community/modules/scheduler/htcondor-base/files/htcondor_configure.yml diff --git a/community/modules/scheduler/htcondor-configure/main.tf b/community/modules/scheduler/htcondor-base/main.tf similarity index 99% rename from community/modules/scheduler/htcondor-configure/main.tf rename to community/modules/scheduler/htcondor-base/main.tf index 72d6fa6650..452ef489f7 100644 --- a/community/modules/scheduler/htcondor-configure/main.tf +++ b/community/modules/scheduler/htcondor-base/main.tf @@ -16,7 +16,7 @@ locals { # This label allows for billing report tracking based on module. 
- labels = merge(var.labels, { ghpc_module = "htcondor-configure" }) + labels = merge(var.labels, { ghpc_module = "htcondor-base" }) } locals { diff --git a/community/modules/scheduler/htcondor-configure/outputs.tf b/community/modules/scheduler/htcondor-base/outputs.tf similarity index 100% rename from community/modules/scheduler/htcondor-configure/outputs.tf rename to community/modules/scheduler/htcondor-base/outputs.tf diff --git a/community/modules/scheduler/htcondor-configure/templates/condor_config.tftpl b/community/modules/scheduler/htcondor-base/templates/condor_config.tftpl similarity index 100% rename from community/modules/scheduler/htcondor-configure/templates/condor_config.tftpl rename to community/modules/scheduler/htcondor-base/templates/condor_config.tftpl diff --git a/community/modules/scheduler/htcondor-configure/templates/download-condor-config.ps1.tftpl b/community/modules/scheduler/htcondor-base/templates/download-condor-config.ps1.tftpl similarity index 100% rename from community/modules/scheduler/htcondor-configure/templates/download-condor-config.ps1.tftpl rename to community/modules/scheduler/htcondor-base/templates/download-condor-config.ps1.tftpl diff --git a/community/modules/scheduler/htcondor-configure/variables.tf b/community/modules/scheduler/htcondor-base/variables.tf similarity index 100% rename from community/modules/scheduler/htcondor-configure/variables.tf rename to community/modules/scheduler/htcondor-base/variables.tf diff --git a/community/modules/scheduler/htcondor-configure/versions.tf b/community/modules/scheduler/htcondor-base/versions.tf similarity index 91% rename from community/modules/scheduler/htcondor-configure/versions.tf rename to community/modules/scheduler/htcondor-base/versions.tf index 6827118448..e69e9b8847 100644 --- a/community/modules/scheduler/htcondor-configure/versions.tf +++ b/community/modules/scheduler/htcondor-base/versions.tf @@ -26,7 +26,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:htcondor-configure/v1.20.0" + module_name = "blueprints/terraform/hpc-toolkit:htcondor-base/v1.20.0" } required_version = ">= 0.13.0" diff --git a/community/modules/scripts/htcondor-install/README.md b/community/modules/scripts/htcondor-install/README.md index 775ec6211b..c6d988971c 100644 --- a/community/modules/scripts/htcondor-install/README.md +++ b/community/modules/scripts/htcondor-install/README.md @@ -13,18 +13,18 @@ Debian or Ubuntu distributions. It also exports a list of Google Cloud APIs which must be enabled prior to provisioning an HTCondor Pool. -It is expected to be used with the [htcondor-configure] and +It is expected to be used with the [htcondor-base] and [htcondor-execute-point] modules. [hpcvmimage]: https://cloud.google.com/compute/docs/instances/create-hpc-vm -[htcondor-configure]: ../../scheduler/htcondor-configure/README.md +[htcondor-base]: ../../scheduler/htcondor-configure/README.md [htcondor-execute-point]: ../../compute/htcondor-execute-point/README.md ### Example The following code snippet uses this module to create startup scripts that install the HTCondor software and adds custom configurations using -[htcondor-configure] and [htcondor-execute-point]. +[htcondor-base] and [htcondor-execute-point]. 
```yaml - id: htcondor_install diff --git a/docs/gpu-support.md b/docs/gpu-support.md index d220ab02fb..d7722f6eb1 100644 --- a/docs/gpu-support.md +++ b/docs/gpu-support.md @@ -3,7 +3,7 @@ ## Supported modules * [vm-instance] and therefore any module that relies on `vm-instance` including: - * HTCondor modules including [htcondor-install], [htcondor-configure] and + * HTCondor modules including [htcondor-install], [htcondor-base] and [htcondor-execute-point]. * [omnia-install] * Slurm on GCP modules where applicable, both version 4 and version 5 @@ -47,7 +47,7 @@ cannot be determined automatically like with `a2`. [login]: https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/main/community/modules/scheduler/schedmd-slurm-gcp-v5-login [omnia-install]: https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/main/community/modules/scripts/omnia-install [htcondor-install]: https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/main/community/modules/scripts/htcondor-install -[htcondor-configure]: https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/main/community/modules/scheduler/htcondor-configure +[htcondor-base]: https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/main/community/modules/scheduler/htcondor-configure [htcondor-execute-point]: https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/main/community/modules/compute/htcondor-execute-point ## Troubleshooting and tips diff --git a/docs/vm-images.md b/docs/vm-images.md index b0b5edc315..813c4b744f 100644 --- a/docs/vm-images.md +++ b/docs/vm-images.md @@ -153,7 +153,7 @@ a description of our support for Windows images. HTCondor - ✓ + ✓✓ diff --git a/modules/README.md b/modules/README.md index 99c404f242..2524588ca5 100644 --- a/modules/README.md +++ b/modules/README.md @@ -45,7 +45,7 @@ Modules that are still in development and less stable are labeled with the Kubernetes job file to be used with a [gke-node-pool]. * **[htcondor-execute-point]** ![community-badge] ![experimental-badge] : Manages a group of execute points for use in an [HTCondor - pool][htcondor-configure]. + pool][htcondor-base]. * **[pbspro-execution]** ![community-badge] ![experimental-badge] : Creates execution hosts for use in a PBS Professional cluster. * **[SchedMD-slurm-on-gcp-partition]** ![community-badge] ![deprecated-badge] : Creates a partition @@ -153,7 +153,7 @@ Modules that are still in development and less stable are labeled with the Creates a Slurm login node using [slurm-gcp-version-5]. * **[schedmd-slurm-gcp-v5-hybrid]** ![community-badge] ![experimental-badge] : Creates hybrid Slurm partition configuration files using [slurm-gcp-version-5]. -* **[htcondor-configure]** ![community-badge] ![experimental-badge] : Creates +* **[htcondor-base]** ![community-badge] ![experimental-badge] : Creates Toolkit runners and service accounts to configure an HTCondor pool. * **[pbspro-client]** ![community-badge] ![experimental-badge] : Creates a client host for submitting jobs to a PBS Professional cluster. 
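[Editor's note: patches 052 and 058 above implement a boot-time refresh loop in PowerShell — hash the local HTCondor configuration file, compare it against the MD5 hash of the Cloud Storage object rendered by `htcondor-base`, and re-download plus reconfigure only on mismatch. The following is a minimal Python sketch of the same pattern, illustrative only and not part of the patch series; the `google-cloud-storage` client usage, function name, and argument names are assumptions.]

```python
import base64
import hashlib
from pathlib import Path

from google.cloud import storage  # assumed dependency: pip install google-cloud-storage


def refresh_condor_config(bucket_name: str, object_name: str, local_path: str) -> bool:
    """Download the rendered condor_config only when the Cloud Storage copy changed.

    Returns True when the local file was replaced, signalling the caller to run
    condor_reconfig (Linux) or restart the condor service (Windows).
    """
    blob = storage.Client().bucket(bucket_name).get_blob(object_name)
    if blob is None:
        raise FileNotFoundError(f"gs://{bucket_name}/{object_name} not found")

    # Cloud Storage reports a base64-encoded MD5 digest; compute the same form
    # locally, falling back to the sentinel used by the PowerShell template.
    local = Path(local_path)
    if local.is_file():
        local_hash = base64.b64encode(hashlib.md5(local.read_bytes()).digest()).decode()
    else:
        local_hash = "INVALID-HASH"

    if local_hash != blob.md5_hash:
        blob.download_to_filename(local_path)
        return True
    return False
```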
@@ -167,7 +167,7 @@ Modules that are still in development and less stable are labeled with the [batch-job-template]: ../modules/scheduler/batch-job-template/README.md [batch-login-node]: ../modules/scheduler/batch-login-node/README.md [gke-cluster]: ../community/modules/scheduler/gke-cluster/README.md -[htcondor-configure]: ../community/modules/scheduler/htcondor-configure/README.md +[htcondor-base]: ../community/modules/scheduler/htcondor-configure/README.md [schedmd-slurm-gcp-v5-controller]: ../community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md [schedmd-slurm-gcp-v5-login]: ../community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md [schedmd-slurm-gcp-v5-hybrid]: ../community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md diff --git a/pkg/modulereader/resreader.go b/pkg/modulereader/resreader.go index d21d200284..5d9904bf56 100644 --- a/pkg/modulereader/resreader.go +++ b/pkg/modulereader/resreader.go @@ -265,7 +265,7 @@ func defaultAPIList(source string) []string { "compute.googleapis.com", "storage.googleapis.com", }, - "community/modules/scheduler/htcondor-configure": { + "community/modules/scheduler/htcondor-base": { "iam.googleapis.com", "secretmanager.googleapis.com", }, From c07eefada7f95d037c3224a594d6f2676e60d40d Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Tue, 11 Jul 2023 16:31:40 -0500 Subject: [PATCH 062/144] Fix htcondor-base API requirements --- pkg/modulereader/resreader.go | 1 + 1 file changed, 1 insertion(+) diff --git a/pkg/modulereader/resreader.go b/pkg/modulereader/resreader.go index 5d9904bf56..4c4805ef93 100644 --- a/pkg/modulereader/resreader.go +++ b/pkg/modulereader/resreader.go @@ -268,6 +268,7 @@ func defaultAPIList(source string) []string { "community/modules/scheduler/htcondor-base": { "iam.googleapis.com", "secretmanager.googleapis.com", + "storage.googleapis.com", }, "community/modules/scheduler/pbspro-client": { "compute.googleapis.com", From 94fd8dafee4eda813e499a78796fb3922dbb32b7 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Tue, 11 Jul 2023 16:31:40 -0500 Subject: [PATCH 063/144] Rename htcondor-configure to htcondor-base --- community/examples/htc-htcondor.yaml | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/community/examples/htc-htcondor.yaml b/community/examples/htc-htcondor.yaml index 5128f2ad2b..9e6378c53e 100644 --- a/community/examples/htc-htcondor.yaml +++ b/community/examples/htc-htcondor.yaml @@ -59,7 +59,7 @@ deployment_groups: - group: pool modules: - - id: htcondor_configure + - id: htcondor_base source: community/modules/scheduler/htcondor-base use: - network1 @@ -68,7 +68,7 @@ deployment_groups: source: modules/scripts/startup-script settings: runners: - - $(htcondor_configure.central_manager_runner) + - $(htcondor_base.central_manager_runner) - id: htcondor_cm source: modules/compute/vm-instance @@ -83,14 +83,14 @@ deployment_groups: machine_type: c2-standard-4 disable_public_ips: true service_account: - email: $(htcondor_configure.central_manager_service_account) + email: $(htcondor_base.central_manager_service_account) scopes: - cloud-platform network_interfaces: - network: null subnetwork: $(network1.subnetwork_self_link) subnetwork_project: $(vars.project_id) - network_ip: $(htcondor_configure.central_manager_internal_ip) + network_ip: $(htcondor_base.central_manager_internal_ip) stack_type: null access_config: [] ipv6_access_config: [] @@ -104,7 +104,7 @@ deployment_groups: source: modules/scripts/startup-script settings: runners: - - 
$(htcondor_configure.execute_point_runner) + - $(htcondor_base.execute_point_runner) # the HTCondor modules support up to 2 execute points per blueprint # if using 1, it may use Spot or On-demand pricing @@ -120,7 +120,7 @@ deployment_groups: family: $(vars.new_image_family) min_idle: 2 service_account: - email: $(htcondor_configure.execute_point_service_account) + email: $(htcondor_base.execute_point_service_account) scopes: - cloud-platform @@ -135,7 +135,7 @@ deployment_groups: project: $(vars.project_id) family: $(vars.new_image_family) service_account: - email: $(htcondor_configure.execute_point_service_account) + email: $(htcondor_base.execute_point_service_account) scopes: - cloud-platform @@ -144,7 +144,7 @@ deployment_groups: settings: runners: - $(htcondor_install.install_autoscaler_runner) - - $(htcondor_configure.access_point_runner) + - $(htcondor_base.access_point_runner) - $(htcondor_execute_point.configure_autoscaler_runner) - $(htcondor_execute_point_spot.configure_autoscaler_runner) - type: data @@ -175,7 +175,7 @@ deployment_groups: add_deployment_name_before_prefix: true machine_type: c2-standard-4 service_account: - email: $(htcondor_configure.access_point_service_account) + email: $(htcondor_base.access_point_service_account) scopes: - cloud-platform outputs: From b430ab31fb59884eb7f6d35169ab29c95c133b47 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Tue, 11 Jul 2023 16:31:40 -0500 Subject: [PATCH 064/144] Refactor HTCondor Pool secret management - add dedicated module for handling HTCondor pool secrets and other identity / authorization configuration settings - responsible for setting TRUST_DOMAIN, UID_DOMAIN and creating runners that fetch the secrets necessary for each node within the pool --- community/examples/htc-htcondor.yaml | 16 +- .../modules/scheduler/htcondor-base/README.md | 18 +- .../files/htcondor_configure.yml | 67 ------- .../modules/scheduler/htcondor-base/main.tf | 75 +------- .../scheduler/htcondor-base/outputs.tf | 25 +-- .../templates/condor_config.tftpl | 2 - .../download-condor-config.ps1.tftpl | 4 - .../scheduler/htcondor-base/variables.tf | 7 - .../scheduler/htcondor-base/versions.tf | 4 - .../scheduler/htcondor-pool-secrets/README.md | 171 ++++++++++++++++++ .../files/htcondor_secrets.yml | 98 ++++++++++ .../scheduler/htcondor-pool-secrets/main.tf | 127 +++++++++++++ .../htcondor-pool-secrets/outputs.tf | 50 +++++ .../templates/fetch-idtoken.ps1.tftpl | 17 ++ .../htcondor-pool-secrets/variables.tf | 58 ++++++ .../htcondor-pool-secrets/versions.tf | 33 ++++ modules/README.md | 7 +- 17 files changed, 578 insertions(+), 201 deletions(-) create mode 100644 community/modules/scheduler/htcondor-pool-secrets/README.md create mode 100644 community/modules/scheduler/htcondor-pool-secrets/files/htcondor_secrets.yml create mode 100644 community/modules/scheduler/htcondor-pool-secrets/main.tf create mode 100644 community/modules/scheduler/htcondor-pool-secrets/outputs.tf create mode 100644 community/modules/scheduler/htcondor-pool-secrets/templates/fetch-idtoken.ps1.tftpl create mode 100644 community/modules/scheduler/htcondor-pool-secrets/variables.tf create mode 100644 community/modules/scheduler/htcondor-pool-secrets/versions.tf diff --git a/community/examples/htc-htcondor.yaml b/community/examples/htc-htcondor.yaml index 9e6378c53e..1fb368d075 100644 --- a/community/examples/htc-htcondor.yaml +++ b/community/examples/htc-htcondor.yaml @@ -64,10 +64,16 @@ deployment_groups: use: - network1 + - id: htcondor_secrets + source: 
community/modules/scheduler/htcondor-pool-secrets + use: + - htcondor_base + - id: htcondor_startup_central_manager source: modules/scripts/startup-script settings: runners: + - $(htcondor_secrets.central_manager_runner) - $(htcondor_base.central_manager_runner) - id: htcondor_cm @@ -83,7 +89,7 @@ deployment_groups: machine_type: c2-standard-4 disable_public_ips: true service_account: - email: $(htcondor_base.central_manager_service_account) + email: $(htcondor_base.central_manager_service_account_email) scopes: - cloud-platform network_interfaces: @@ -104,6 +110,7 @@ deployment_groups: source: modules/scripts/startup-script settings: runners: + - $(htcondor_secrets.execute_point_runner) - $(htcondor_base.execute_point_runner) # the HTCondor modules support up to 2 execute points per blueprint @@ -120,7 +127,7 @@ deployment_groups: family: $(vars.new_image_family) min_idle: 2 service_account: - email: $(htcondor_base.execute_point_service_account) + email: $(htcondor_base.execute_point_service_account_email) scopes: - cloud-platform @@ -135,7 +142,7 @@ deployment_groups: project: $(vars.project_id) family: $(vars.new_image_family) service_account: - email: $(htcondor_base.execute_point_service_account) + email: $(htcondor_base.execute_point_service_account_email) scopes: - cloud-platform @@ -144,6 +151,7 @@ deployment_groups: settings: runners: - $(htcondor_install.install_autoscaler_runner) + - $(htcondor_secrets.access_point_runner) - $(htcondor_base.access_point_runner) - $(htcondor_execute_point.configure_autoscaler_runner) - $(htcondor_execute_point_spot.configure_autoscaler_runner) @@ -175,7 +183,7 @@ deployment_groups: add_deployment_name_before_prefix: true machine_type: c2-standard-4 service_account: - email: $(htcondor_base.access_point_service_account) + email: $(htcondor_base.access_point_service_account_email) scopes: - cloud-platform outputs: diff --git a/community/modules/scheduler/htcondor-base/README.md b/community/modules/scheduler/htcondor-base/README.md index fe09bb52fa..a3e74ae8e9 100644 --- a/community/modules/scheduler/htcondor-base/README.md +++ b/community/modules/scheduler/htcondor-base/README.md @@ -201,14 +201,12 @@ limitations under the License. |------|---------| | [terraform](#requirement\_terraform) | >= 0.13.0 | | [google](#requirement\_google) | >= 3.83 | -| [random](#requirement\_random) | >= 3.0 | ## Providers | Name | Version | |------|---------| | [google](#provider\_google) | >= 3.83 | -| [random](#provider\_random) | >= 3.0 | ## Modules @@ -225,17 +223,9 @@ limitations under the License. 
| Name | Type | |------|------| -| [google_secret_manager_secret.execute_point_idtoken](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/secret_manager_secret) | resource | -| [google_secret_manager_secret.pool_password](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/secret_manager_secret) | resource | -| [google_secret_manager_secret_iam_member.access_point](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/secret_manager_secret_iam_member) | resource | -| [google_secret_manager_secret_iam_member.central_manager_idtoken](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/secret_manager_secret_iam_member) | resource | -| [google_secret_manager_secret_iam_member.central_manager_password](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/secret_manager_secret_iam_member) | resource | -| [google_secret_manager_secret_iam_member.execute_point](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/secret_manager_secret_iam_member) | resource | -| [google_secret_manager_secret_version.pool_password](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/secret_manager_secret_version) | resource | | [google_storage_bucket_object.ap_config](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/storage_bucket_object) | resource | | [google_storage_bucket_object.cm_config](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/storage_bucket_object) | resource | | [google_storage_bucket_object.execute_config](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/storage_bucket_object) | resource | -| [random_password.pool](https://registry.terraform.io/providers/hashicorp/random/latest/docs/resources/password) | resource | | [google_compute_subnetwork.htcondor](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_subnetwork) | data source | ## Inputs @@ -249,7 +239,6 @@ limitations under the License. | [execute\_point\_roles](#input\_execute\_point\_roles) | Project-wide roles for HTCondor Execute Point service account | `list(string)` |
[ "roles/monitoring.metricWriter", "roles/logging.logWriter", "roles/storage.objectViewer" ]
| no | | [job\_queue\_high\_availability](#input\_job\_queue\_high\_availability) | Provision HTCondor access points in high availability mode (experimental: see README) | `bool` | `false` | no | | [labels](#input\_labels) | Labels to add to resources. List key, value pairs. | `map(string)` | n/a | yes | -| [pool\_password](#input\_pool\_password) | HTCondor Pool Password | `string` | `null` | no | | [project\_id](#input\_project\_id) | Project in which HTCondor pool will be created | `string` | n/a | yes | | [region](#input\_region) | Default region for creating resources | `string` | n/a | yes | | [spool\_parent\_dir](#input\_spool\_parent\_dir) | HTCondor access point configuration SPOOL will be set to subdirectory named "spool" | `string` | `"/var/lib/condor"` | no | @@ -260,13 +249,12 @@ limitations under the License. | Name | Description | |------|-------------| | [access\_point\_runner](#output\_access\_point\_runner) | Toolkit Runner to configure an HTCondor Access Point | -| [access\_point\_service\_account](#output\_access\_point\_service\_account) | HTCondor Access Point Service Account (e-mail format) | +| [access\_point\_service\_account\_email](#output\_access\_point\_service\_account\_email) | HTCondor Access Point Service Account (e-mail format) | | [central\_manager\_internal\_ip](#output\_central\_manager\_internal\_ip) | Reserved internal IP address for use by Central Manager | | [central\_manager\_runner](#output\_central\_manager\_runner) | Toolkit Runner to configure an HTCondor Central Manager | | [central\_manager\_secondary\_internal\_ip](#output\_central\_manager\_secondary\_internal\_ip) | Reserved internal IP address for use by failover Central Manager | -| [central\_manager\_service\_account](#output\_central\_manager\_service\_account) | HTCondor Central Manager Service Account (e-mail format) | +| [central\_manager\_service\_account\_email](#output\_central\_manager\_service\_account\_email) | HTCondor Central Manager Service Account (e-mail format) | | [execute\_point\_runner](#output\_execute\_point\_runner) | Toolkit Runner to configure an HTCondor Execute Point | -| [execute\_point\_service\_account](#output\_execute\_point\_service\_account) | HTCondor Execute Point Service Account (e-mail format) | -| [pool\_password\_secret\_id](#output\_pool\_password\_secret\_id) | Google Cloud Secret Manager ID containing HTCondor Pool Password | +| [execute\_point\_service\_account\_email](#output\_execute\_point\_service\_account\_email) | HTCondor Execute Point Service Account (e-mail format) | | [windows\_startup\_ps1](#output\_windows\_startup\_ps1) | Windows PowerShell script to update HTCondor configuration file | diff --git a/community/modules/scheduler/htcondor-base/files/htcondor_configure.yml b/community/modules/scheduler/htcondor-base/files/htcondor_configure.yml index 0a3cd7da7f..cf75113807 100644 --- a/community/modules/scheduler/htcondor-base/files/htcondor_configure.yml +++ b/community/modules/scheduler/htcondor-base/files/htcondor_configure.yml @@ -28,20 +28,7 @@ ansible.builtin.assert: that: - htcondor_role is defined - - password_id is defined - - trust_domain is defined - config_object is defined - - name: Set HTCondor Pool password (token signing key) - when: htcondor_role != 'get_htcondor_execute' - ansible.builtin.shell: | - set -e -o pipefail - TMPFILE=$(mktemp) - gcloud secrets versions access latest --out-file "$TMPFILE" --secret={{ password_id }} - condor_store_cred add -c -i "$TMPFILE" - rm -f "$TMPFILE" - args: - creates: "{{ 
condor_config_root }}/passwords.d/POOL" - executable: /bin/bash - name: Remove default HTCondor configuration ansible.builtin.file: path: "{{ condor_config_root }}/config.d/00-htcondor-9.0.config" @@ -70,29 +57,6 @@ fi args: executable: /bin/bash - - name: Configure HTCondor Central Manager - when: htcondor_role == 'get_htcondor_central_manager' - block: - - name: Create IDTOKEN for Central Manager - ansible.builtin.shell: | - umask 0077 - TRUST_DOMAIN=$(condor_config_val TRUST_DOMAIN) - # do not restrict Central Manager authz scopes! - condor_token_create -identity condor@{{ trust_domain }} \ - -token condor@{{ trust_domain }} - args: - creates: "{{ condor_config_root }}/tokens.d/condor@{{ trust_domain }}" - - name: Create IDTOKEN secret for Execute Points - changed_when: true - ansible.builtin.shell: | - umask 0077 - TRUST_DOMAIN=$(condor_config_val TRUST_DOMAIN) - TMPFILE=$(mktemp) - condor_token_create -authz READ -authz ADVERTISE_MASTER \ - -authz ADVERTISE_STARTD -identity condor@{{ trust_domain }} > "$TMPFILE" - gcloud secrets versions add --data-file "$TMPFILE" {{ xp_idtoken_secret_id }} - rm -f "$TMPFILE" - when: xp_idtoken_secret_id | length > 0 - name: Configure HTCondor SchedD when: htcondor_role == 'get_htcondor_submit' block: @@ -103,17 +67,6 @@ owner: condor group: condor mode: 0755 - - name: Create IDTOKEN to advertise access point - ansible.builtin.shell: | - umask 0077 - # DAEMON authorization can likely be removed in future when scopes - # needed to trigger a negotiation cycle are changed. Suggest review - # https://opensciencegrid.atlassian.net/jira/software/c/projects/HTCONDOR/issues/?filter=allissues - condor_token_create -authz READ -authz ADVERTISE_MASTER \ - -authz ADVERTISE_SCHEDD -authz DAEMON -identity condor@{{ trust_domain }} \ - -token condor@{{ trust_domain }} - args: - creates: "{{ condor_config_root }}/tokens.d/condor@{{ trust_domain }}" - name: Enable SchedD high availability when: job_queue_ha | bool block: @@ -164,26 +117,6 @@ state: absent notify: - Reload SystemD - - name: Configure HTCondor StartD - when: htcondor_role == 'get_htcondor_execute' - block: - - name: Create SystemD override directory for HTCondor Execute Point - ansible.builtin.file: - path: /etc/systemd/system/condor.service.d - state: directory - owner: root - group: root - mode: 0755 - - name: Fetch IDTOKEN to advertise execute point - ansible.builtin.copy: - dest: "/etc/systemd/system/condor.service.d/htcondor-token-fetcher.conf" - mode: 0644 - content: | - [Service] - ExecStartPre=gcloud secrets versions access latest --secret {{ xp_idtoken_secret_id }} \ - --out-file {{ condor_config_root }}/tokens.d/condor@{{ trust_domain }} - notify: - - Reload SystemD handlers: - name: Reload SystemD ansible.builtin.systemd: diff --git a/community/modules/scheduler/htcondor-base/main.tf b/community/modules/scheduler/htcondor-base/main.tf index 452ef489f7..fa8cdcd170 100644 --- a/community/modules/scheduler/htcondor-base/main.tf +++ b/community/modules/scheduler/htcondor-base/main.tf @@ -30,31 +30,24 @@ locals { central_manager_count = var.central_manager_high_availability ? 
2 : 1 central_manager_ip_names = [for i in range(local.central_manager_count) : "${var.deployment_name}-cm-ip-${i}"] - pool_password = coalesce(var.pool_password, random_password.pool.result) - trust_domain = "c.${var.project_id}.internal" - cm_config = templatefile("${path.module}/templates/condor_config.tftpl", { htcondor_role = "get_htcondor_central_manager", central_manager_ips = module.address.addresses, - trust_domain = local.trust_domain, spool_dir = "${var.spool_parent_dir}/spool", }) execute_config = templatefile("${path.module}/templates/condor_config.tftpl", { htcondor_role = "get_htcondor_execute", central_manager_ips = module.address.addresses, - trust_domain = local.trust_domain, spool_dir = "${var.spool_parent_dir}/spool", }) ap_config = templatefile("${path.module}/templates/condor_config.tftpl", { htcondor_role = "get_htcondor_submit", central_manager_ips = module.address.addresses, - trust_domain = local.trust_domain, spool_dir = "${var.spool_parent_dir}/spool", }) - cm_object = "gs://${module.htcondor_bucket.name}/${google_storage_bucket_object.cm_config.output_name}" runner_cm = { "type" = "ansible-local" @@ -63,9 +56,6 @@ locals { "args" = join(" ", [ "-e htcondor_role=get_htcondor_central_manager", "-e config_object=${local.cm_object}", - "-e password_id=${google_secret_manager_secret.pool_password.secret_id}", - "-e xp_idtoken_secret_id=${google_secret_manager_secret.execute_point_idtoken.secret_id}", - "-e trust_domain=${local.trust_domain}", ]) } @@ -77,8 +67,6 @@ locals { "args" = join(" ", [ "-e htcondor_role=get_htcondor_submit", "-e config_object=${local.ap_object}", - "-e password_id=${google_secret_manager_secret.pool_password.secret_id}", - "-e trust_domain=${local.trust_domain}", "-e job_queue_ha=${var.job_queue_high_availability}", "-e spool_dir=${var.spool_parent_dir}/spool", ]) @@ -92,17 +80,12 @@ locals { "args" = join(" ", [ "-e htcondor_role=get_htcondor_execute", "-e config_object=${local.execute_object}", - "-e password_id=${google_secret_manager_secret.pool_password.secret_id}", - "-e xp_idtoken_secret_id=${google_secret_manager_secret.execute_point_idtoken.secret_id}", - "-e trust_domain=${local.trust_domain}", ]) } windows_startup_ps1 = templatefile( "${path.module}/templates/download-condor-config.ps1.tftpl", { - config_object = local.execute_object, - trust_domain = local.trust_domain, - xp_idtoken_secret_id = google_secret_manager_secret.execute_point_idtoken.secret_id, + config_object = local.execute_object, } ) } @@ -179,62 +162,6 @@ module "central_manager_service_account" { project_roles = local.central_manager_roles } -resource "random_password" "pool" { - length = 24 - special = true - override_special = "_-#=." 
-} - -resource "google_secret_manager_secret" "pool_password" { - secret_id = "${var.deployment_name}-pool-password" - - labels = local.labels - - replication { - automatic = true - } -} - -resource "google_secret_manager_secret_version" "pool_password" { - secret = google_secret_manager_secret.pool_password.id - secret_data = local.pool_password -} - -# this secret will be populated by the Central Manager -resource "google_secret_manager_secret" "execute_point_idtoken" { - secret_id = "${var.deployment_name}-execute-point-idtoken" - - labels = local.labels - - replication { - automatic = true - } -} - -resource "google_secret_manager_secret_iam_member" "central_manager_password" { - secret_id = google_secret_manager_secret.pool_password.id - role = "roles/secretmanager.secretAccessor" - member = module.central_manager_service_account.iam_email -} - -resource "google_secret_manager_secret_iam_member" "central_manager_idtoken" { - secret_id = google_secret_manager_secret.execute_point_idtoken.id - role = "roles/secretmanager.secretVersionManager" - member = module.central_manager_service_account.iam_email -} - -resource "google_secret_manager_secret_iam_member" "access_point" { - secret_id = google_secret_manager_secret.pool_password.id - role = "roles/secretmanager.secretAccessor" - member = module.access_point_service_account.iam_email -} - -resource "google_secret_manager_secret_iam_member" "execute_point" { - secret_id = google_secret_manager_secret.execute_point_idtoken.id - role = "roles/secretmanager.secretAccessor" - member = module.execute_point_service_account.iam_email -} - module "address" { source = "terraform-google-modules/address/google" version = "~> 3.0" diff --git a/community/modules/scheduler/htcondor-base/outputs.tf b/community/modules/scheduler/htcondor-base/outputs.tf index 6b86e1b03a..a71c4456f7 100644 --- a/community/modules/scheduler/htcondor-base/outputs.tf +++ b/community/modules/scheduler/htcondor-base/outputs.tf @@ -14,62 +14,43 @@ * limitations under the License. 
*/

-output "access_point_service_account" {
+output "access_point_service_account_email" {
   description = "HTCondor Access Point Service Account (e-mail format)"
   value       = module.access_point_service_account.email

   depends_on = [
-    google_secret_manager_secret_iam_member.access_point,
     module.access_point_service_account
   ]
 }

-output "central_manager_service_account" {
+output "central_manager_service_account_email" {
   description = "HTCondor Central Manager Service Account (e-mail format)"
   value       = module.central_manager_service_account.email

   depends_on = [
-    google_secret_manager_secret_iam_member.central_manager_idtoken,
-    google_secret_manager_secret_iam_member.central_manager_password,
     module.central_manager_service_account
   ]
 }

-output "execute_point_service_account" {
+output "execute_point_service_account_email" {
   description = "HTCondor Execute Point Service Account (e-mail format)"
   value       = module.execute_point_service_account.email

   depends_on = [
-    google_secret_manager_secret_iam_member.execute_point,
     module.execute_point_service_account
   ]
 }

-output "pool_password_secret_id" {
-  description = "Google Cloud Secret Manager ID containing HTCondor Pool Password"
-  value       = google_secret_manager_secret.pool_password.secret_id
-  sensitive   = true
-}
-
 output "central_manager_runner" {
   description = "Toolkit Runner to configure an HTCondor Central Manager"
   value       = local.runner_cm
-  depends_on = [
-    google_secret_manager_secret_version.pool_password
-  ]
 }

 output "access_point_runner" {
   description = "Toolkit Runner to configure an HTCondor Access Point"
   value       = local.runner_access
-  depends_on = [
-    google_secret_manager_secret_version.pool_password
-  ]
 }

 output "execute_point_runner" {
   description = "Toolkit Runner to configure an HTCondor Execute Point"
   value       = local.runner_execute
-  depends_on = [
-    google_secret_manager_secret_version.pool_password
-  ]
 }

 output "central_manager_internal_ip" {
diff --git a/community/modules/scheduler/htcondor-base/templates/condor_config.tftpl b/community/modules/scheduler/htcondor-base/templates/condor_config.tftpl
index c5ae0175ec..a7c6b9fb36 100644
--- a/community/modules/scheduler/htcondor-base/templates/condor_config.tftpl
+++ b/community/modules/scheduler/htcondor-base/templates/condor_config.tftpl
@@ -18,8 +18,6 @@ use role:${htcondor_role}

 CONDOR_HOST = ${join(",", central_manager_ips)}

-UID_DOMAIN = ${trust_domain}
-TRUST_DOMAIN = ${trust_domain}

 %{ if htcondor_role == "get_htcondor_central_manager" ~}
 # Central Manager configuration settings
diff --git a/community/modules/scheduler/htcondor-base/templates/download-condor-config.ps1.tftpl b/community/modules/scheduler/htcondor-base/templates/download-condor-config.ps1.tftpl
index 2328d41a24..d31bdf2faa 100644
--- a/community/modules/scheduler/htcondor-base/templates/download-condor-config.ps1.tftpl
+++ b/community/modules/scheduler/htcondor-base/templates/download-condor-config.ps1.tftpl
@@ -1,7 +1,3 @@
-# obtain IDTOKEN for authentication by StartD to Central Manager
-gcloud secrets versions access latest --secret ${xp_idtoken_secret_id} `
-    --out-file C:\condor\tokens.d\condor@${trust_domain}
-
 # create directory for local condor_config customizations
 $config_dir = 'C:\Condor\config'
 if(!(test-path -PathType container -Path $config_dir)) {
diff --git a/community/modules/scheduler/htcondor-base/variables.tf b/community/modules/scheduler/htcondor-base/variables.tf
index 05538f4341..695b3feac0 100644
--- a/community/modules/scheduler/htcondor-base/variables.tf
+++ b/community/modules/scheduler/htcondor-base/variables.tf
@@ -70,13 +70,6 @@ variable "execute_point_roles" {
   ]
 }

-variable "pool_password" {
-  description = "HTCondor Pool Password"
-  type        = string
-  sensitive   = true
-  default     = null
-}
-
 variable "central_manager_high_availability" {
   description = "Provision HTCondor central manager in high availability mode"
   type        = bool
diff --git a/community/modules/scheduler/htcondor-base/versions.tf b/community/modules/scheduler/htcondor-base/versions.tf
index e69e9b8847..c3d459c250 100644
--- a/community/modules/scheduler/htcondor-base/versions.tf
+++ b/community/modules/scheduler/htcondor-base/versions.tf
@@ -20,10 +20,6 @@ terraform {
       source  = "hashicorp/google"
       version = ">= 3.83"
     }
-    random = {
-      source  = "hashicorp/random"
-      version = ">= 3.0"
-    }
   }
   provider_meta "google" {
     module_name = "blueprints/terraform/hpc-toolkit:htcondor-base/v1.20.0"
diff --git a/community/modules/scheduler/htcondor-pool-secrets/README.md b/community/modules/scheduler/htcondor-pool-secrets/README.md
new file mode 100644
index 0000000000..01ade8698b
--- /dev/null
+++ b/community/modules/scheduler/htcondor-pool-secrets/README.md
@@ -0,0 +1,171 @@
+## Description
+
+This module is responsible for the following actions:
+
+- store an HTCondor Pool password in Google Cloud Secret Manager
+  - will generate a new password if one is not supplied
+- create a secret in Google Cloud Secret Manager in which the HTCondor central
+  manager can place IDTOKENs (JWT Authorizations) for execute points to download
+- create a Toolkit runner for the central manager
+  - download the POOL password / signing key
+  - create a local IDTOKEN for itself
+  - upload the execute point IDTOKEN secret
+- create a Toolkit runner for access points
+  - download the POOL password / signing key
+  - create a local IDTOKEN for itself
+- create a Toolkit runner for execute points
+  - fetch the IDTOKEN secret generated by the central manager
+
+It is expected to be used with the [htcondor-install] and
+[htcondor-execute-point] modules.
+
+[hpcvmimage]: https://cloud.google.com/compute/docs/instances/create-hpc-vm
+[htcondor-install]: ../../scripts/htcondor-install/README.md
+[htcondor-execute-point]: ../../compute/htcondor-execute-point/README.md
+
+[htcrole]: https://htcondor.readthedocs.io/en/latest/getting-htcondor/admin-quick-start.html#what-get-htcondor-does-to-configure-a-role
+
+### Example
+
+The following code snippet uses this module to create a startup script that
+installs HTCondor software and configures an HTCondor Central Manager. A full
+example can be found in the [examples README][htc-example].
+
+[htc-example]: ../../../../examples/README.md#htc-htcondoryaml--
+
+```yaml
+- id: network1
+  source: modules/network/pre-existing-vpc
+
+- id: htcondor_install
+  source: community/modules/scripts/htcondor-install
+
+- id: htcondor_base
+  source: community/modules/scheduler/htcondor-base
+  use:
+  - network1
+
+- id: htcondor_secrets
+  source: community/modules/scheduler/htcondor-pool-secrets
+  use:
+  - htcondor_base
+
+- id: htcondor_startup_central_manager
+  source: modules/scripts/startup-script
+  settings:
+    runners:
+    - $(htcondor_install.install_htcondor_runner)
+    - $(htcondor_secrets.central_manager_runner)
+    - $(htcondor_base.central_manager_runner)
+
+- id: htcondor_cm
+  source: modules/compute/vm-instance
+  use:
+  - network1
+  - htcondor_startup_central_manager
+  settings:
+    name_prefix: cm0
+    machine_type: c2-standard-4
+    disable_public_ips: true
+    service_account:
+      email: $(htcondor_base.central_manager_service_account_email)
+      scopes:
+      - cloud-platform
+    network_interfaces:
+    - network: null
+      subnetwork: $(network1.subnetwork_self_link)
+      subnetwork_project: $(vars.project_id)
+      network_ip: $(htcondor_base.central_manager_internal_ip)
+      stack_type: null
+      access_config: []
+      ipv6_access_config: []
+      alias_ip_range: []
+      nic_type: VIRTIO_NET
+      queue_count: null
+  outputs:
+  - internal_ip
+```
+
+## Support
+
+HTCondor is maintained by the [Center for High Throughput Computing][chtc] at
+the University of Wisconsin-Madison. Support for HTCondor is available via:
+
+- [Discussion lists](https://htcondor.org/mail-lists/)
+- [HTCondor on GitHub](https://github.com/htcondor/htcondor/)
+- [HTCondor manual](https://htcondor.readthedocs.io/en/latest/)
+
+[chtc]: https://chtc.cs.wisc.edu/
+
+## License
+
+
+Copyright 2023 Google LLC
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+## Requirements
+
+| Name | Version |
+|------|---------|
+| [terraform](#requirement\_terraform) | >= 0.13.0 |
+| [google](#requirement\_google) | >= 3.83 |
+| [random](#requirement\_random) | >= 3.0 |
+
+## Providers
+
+| Name | Version |
+|------|---------|
+| [google](#provider\_google) | >= 3.83 |
+| [random](#provider\_random) | >= 3.0 |
+
+## Modules
+
+No modules.
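The example above wires up the central manager only; access points and execute
points follow the same pattern with their role-specific runners. A minimal
sketch of an execute point startup script (module IDs as in the example above;
not a complete blueprint):

```yaml
- id: htcondor_startup_execute_point
  source: modules/scripts/startup-script
  settings:
    runners:
    - $(htcondor_install.install_htcondor_runner)
    - $(htcondor_secrets.execute_point_runner)
    - $(htcondor_base.execute_point_runner)
```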
+
+## Resources
+
+| Name | Type |
+|------|------|
+| [google_secret_manager_secret.execute_point_idtoken](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/secret_manager_secret) | resource |
+| [google_secret_manager_secret.pool_password](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/secret_manager_secret) | resource |
+| [google_secret_manager_secret_iam_member.access_point](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/secret_manager_secret_iam_member) | resource |
+| [google_secret_manager_secret_iam_member.central_manager_idtoken](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/secret_manager_secret_iam_member) | resource |
+| [google_secret_manager_secret_iam_member.central_manager_password](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/secret_manager_secret_iam_member) | resource |
+| [google_secret_manager_secret_iam_member.execute_point](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/secret_manager_secret_iam_member) | resource |
+| [google_secret_manager_secret_version.pool_password](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/secret_manager_secret_version) | resource |
+| [random_password.pool](https://registry.terraform.io/providers/hashicorp/random/latest/docs/resources/password) | resource |
+
+## Inputs
+
+| Name | Description | Type | Default | Required |
+|------|-------------|------|---------|:--------:|
+| [access\_point\_service\_account\_email](#input\_access\_point\_service\_account\_email) | HTCondor access point service account e-mail | `string` | n/a | yes |
+| [central\_manager\_service\_account\_email](#input\_central\_manager\_service\_account\_email) | HTCondor central manager service account e-mail | `string` | n/a | yes |
+| [deployment\_name](#input\_deployment\_name) | HPC Toolkit deployment name. HTCondor cloud resource names will include this value. | `string` | n/a | yes |
+| [execute\_point\_service\_account\_email](#input\_execute\_point\_service\_account\_email) | HTCondor execute point service account e-mail | `string` | n/a | yes |
+| [labels](#input\_labels) | Labels to add to resources. List key, value pairs.
| `map(string)` | n/a | yes | +| [pool\_password](#input\_pool\_password) | HTCondor Pool Password | `string` | `null` | no | +| [project\_id](#input\_project\_id) | Project in which HTCondor pool will be created | `string` | n/a | yes | +| [trust\_domain](#input\_trust\_domain) | Trust domain for HTCondor pool (if not supplied, will be set based on project\_id) | `string` | `""` | no | + +## Outputs + +| Name | Description | +|------|-------------| +| [access\_point\_runner](#output\_access\_point\_runner) | Toolkit Runner to download pool secrets to an HTCondor access point | +| [central\_manager\_runner](#output\_central\_manager\_runner) | Toolkit Runner to download pool secrets to an HTCondor central manager | +| [execute\_point\_runner](#output\_execute\_point\_runner) | Toolkit Runner to download pool secrets to an HTCondor execute point | +| [pool\_password\_secret\_id](#output\_pool\_password\_secret\_id) | Google Cloud Secret Manager ID containing HTCondor Pool Password | +| [windows\_startup\_ps1](#output\_windows\_startup\_ps1) | PowerShell script to download pool secrets to an HTCondor execute point | + diff --git a/community/modules/scheduler/htcondor-pool-secrets/files/htcondor_secrets.yml b/community/modules/scheduler/htcondor-pool-secrets/files/htcondor_secrets.yml new file mode 100644 index 0000000000..83cf7c8b24 --- /dev/null +++ b/community/modules/scheduler/htcondor-pool-secrets/files/htcondor_secrets.yml @@ -0,0 +1,98 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
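# NOTE: this playbook is invoked by the Toolkit runners defined in this
# module's main.tf, which pass role-specific extra vars. A representative
# invocation for the central manager (the secret IDs and trust domain shown
# here are illustrative values, not defaults):
#
#   ansible-playbook htcondor_secrets.yml \
#     -e htcondor_role=get_htcondor_central_manager \
#     -e password_id=mydeploy-pool-password \
#     -e xp_idtoken_secret_id=mydeploy-execute-point-idtoken \
#     -e trust_domain=c.myproject.internal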
+ +--- +- name: Configure HTCondor Secrets + hosts: localhost + become: true + vars: + condor_config_root: /etc/condor + tasks: + - name: Ensure necessary variables are set + ansible.builtin.assert: + that: + - htcondor_role is defined + - password_id is defined + - trust_domain is defined + - name: Set Pool Trust Domain + ansible.builtin.copy: + dest: "{{ condor_config_root }}/config.d/51-ghpc-trust-domain" + mode: 0644 + content: | + # these lines must appear AFTER any "use role:" settings + UID_DOMAIN = {{ trust_domain }} + TRUST_DOMAIN = {{ trust_domain }} + - name: Get HTCondor Pool password (token signing key) + when: htcondor_role != 'get_htcondor_execute' + ansible.builtin.shell: | + set -e -o pipefail + TMPFILE=$(mktemp) + gcloud secrets versions access latest --out-file "$TMPFILE" --secret={{ password_id }} + condor_store_cred add -c -i "$TMPFILE" + rm -f "$TMPFILE" + args: + creates: "{{ condor_config_root }}/passwords.d/POOL" + executable: /bin/bash + - name: Configure HTCondor Central Manager + when: htcondor_role == 'get_htcondor_central_manager' + block: + - name: Create IDTOKEN for Central Manager + ansible.builtin.shell: | + umask 0077 + condor_token_create -identity condor@{{ trust_domain }} \ + -token condor@{{ trust_domain }} + args: + creates: "{{ condor_config_root }}/tokens.d/condor@{{ trust_domain }}" + - name: Create IDTOKEN secret for Execute Points + changed_when: true + ansible.builtin.shell: | + umask 0077 + TMPFILE=$(mktemp) + condor_token_create -authz READ -authz ADVERTISE_MASTER \ + -authz ADVERTISE_STARTD -identity condor@{{ trust_domain }} > "$TMPFILE" + gcloud secrets versions add --data-file "$TMPFILE" {{ xp_idtoken_secret_id }} + rm -f "$TMPFILE" + when: xp_idtoken_secret_id | length > 0 + - name: Configure HTCondor SchedD + when: htcondor_role == 'get_htcondor_submit' + block: + - name: Create IDTOKEN to advertise access point + ansible.builtin.shell: | + umask 0077 + # DAEMON authorization can likely be removed in future when scopes + # needed to trigger a negotiation cycle are changed. Suggest review + # https://opensciencegrid.atlassian.net/jira/software/c/projects/HTCONDOR/issues/?filter=allissues + condor_token_create -authz READ -authz ADVERTISE_MASTER \ + -authz ADVERTISE_SCHEDD -authz DAEMON -identity condor@{{ trust_domain }} \ + -token condor@{{ trust_domain }} + args: + creates: "{{ condor_config_root }}/tokens.d/condor@{{ trust_domain }}" + - name: Configure HTCondor StartD + when: htcondor_role == 'get_htcondor_execute' + block: + - name: Create SystemD override directory for HTCondor Execute Point + ansible.builtin.file: + path: /etc/systemd/system/condor.service.d + state: directory + owner: root + group: root + mode: 0755 + - name: Fetch IDTOKEN to advertise execute point + ansible.builtin.copy: + dest: "/etc/systemd/system/condor.service.d/htcondor-token-fetcher.conf" + mode: 0644 + content: | + [Service] + ExecStartPre=gcloud secrets versions access latest --secret {{ xp_idtoken_secret_id }} \ + --out-file {{ condor_config_root }}/tokens.d/condor@{{ trust_domain }} diff --git a/community/modules/scheduler/htcondor-pool-secrets/main.tf b/community/modules/scheduler/htcondor-pool-secrets/main.tf new file mode 100644 index 0000000000..51213fb448 --- /dev/null +++ b/community/modules/scheduler/htcondor-pool-secrets/main.tf @@ -0,0 +1,127 @@ +/** + * Copyright 2023 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +locals { + # This label allows for billing report tracking based on module. + labels = merge(var.labels, { ghpc_module = "htcondor-pool-secrets" }) +} + +locals { + pool_password = coalesce(var.pool_password, random_password.pool.result) + access_point_service_account_iam_email = "serviceAccount:${var.access_point_service_account_email}" + central_manager_service_account_iam_email = "serviceAccount:${var.central_manager_service_account_email}" + execute_point_service_account_iam_email = "serviceAccount:${var.execute_point_service_account_email}" + + trust_domain = coalesce(var.trust_domain, "c.${var.project_id}.internal") + + runner_cm = { + "type" = "ansible-local" + "content" = file("${path.module}/files/htcondor_secrets.yml") + "destination" = "htcondor_secrets.yml" + "args" = join(" ", [ + "-e htcondor_role=get_htcondor_central_manager", + "-e password_id=${google_secret_manager_secret.pool_password.secret_id}", + "-e xp_idtoken_secret_id=${google_secret_manager_secret.execute_point_idtoken.secret_id}", + "-e trust_domain=${local.trust_domain}", + ]) + } + + runner_access = { + "type" = "ansible-local" + "content" = file("${path.module}/files/htcondor_secrets.yml") + "destination" = "htcondor_secrets.yml" + "args" = join(" ", [ + "-e htcondor_role=get_htcondor_submit", + "-e password_id=${google_secret_manager_secret.pool_password.secret_id}", + "-e trust_domain=${local.trust_domain}", + ]) + } + + runner_execute = { + "type" = "ansible-local" + "content" = file("${path.module}/files/htcondor_secrets.yml") + "destination" = "htcondor_secrets.yml" + "args" = join(" ", [ + "-e htcondor_role=get_htcondor_execute", + "-e password_id=${google_secret_manager_secret.pool_password.secret_id}", + "-e xp_idtoken_secret_id=${google_secret_manager_secret.execute_point_idtoken.secret_id}", + "-e trust_domain=${local.trust_domain}", + ]) + } + windows_startup_ps1 = templatefile( + "${path.module}/templates/fetch-idtoken.ps1.tftpl", + { + trust_domain = local.trust_domain, + xp_idtoken_secret_id = google_secret_manager_secret.execute_point_idtoken.secret_id, + } + ) +} + +resource "random_password" "pool" { + length = 24 + special = true + override_special = "_-#=." 
+} + +resource "google_secret_manager_secret" "pool_password" { + secret_id = "${var.deployment_name}-pool-password" + + labels = local.labels + + replication { + automatic = true + } +} + +resource "google_secret_manager_secret_version" "pool_password" { + secret = google_secret_manager_secret.pool_password.id + secret_data = local.pool_password +} + +# this secret will be populated by the Central Manager +resource "google_secret_manager_secret" "execute_point_idtoken" { + secret_id = "${var.deployment_name}-execute-point-idtoken" + + labels = local.labels + + replication { + automatic = true + } +} + +resource "google_secret_manager_secret_iam_member" "central_manager_password" { + secret_id = google_secret_manager_secret.pool_password.id + role = "roles/secretmanager.secretAccessor" + member = local.central_manager_service_account_iam_email +} + +resource "google_secret_manager_secret_iam_member" "central_manager_idtoken" { + secret_id = google_secret_manager_secret.execute_point_idtoken.id + role = "roles/secretmanager.secretVersionManager" + member = local.central_manager_service_account_iam_email +} + +resource "google_secret_manager_secret_iam_member" "access_point" { + secret_id = google_secret_manager_secret.pool_password.id + role = "roles/secretmanager.secretAccessor" + member = local.access_point_service_account_iam_email +} + +resource "google_secret_manager_secret_iam_member" "execute_point" { + secret_id = google_secret_manager_secret.execute_point_idtoken.id + role = "roles/secretmanager.secretAccessor" + member = local.execute_point_service_account_iam_email +} diff --git a/community/modules/scheduler/htcondor-pool-secrets/outputs.tf b/community/modules/scheduler/htcondor-pool-secrets/outputs.tf new file mode 100644 index 0000000000..81c4986b16 --- /dev/null +++ b/community/modules/scheduler/htcondor-pool-secrets/outputs.tf @@ -0,0 +1,50 @@ +/** + * Copyright 2023 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +output "pool_password_secret_id" { + description = "Google Cloud Secret Manager ID containing HTCondor Pool Password" + value = google_secret_manager_secret.pool_password.secret_id + sensitive = true +} + +output "central_manager_runner" { + description = "Toolkit Runner to download pool secrets to an HTCondor central manager" + value = local.runner_cm + depends_on = [ + google_secret_manager_secret_version.pool_password + ] +} + +output "access_point_runner" { + description = "Toolkit Runner to download pool secrets to an HTCondor access point" + value = local.runner_access + depends_on = [ + google_secret_manager_secret_version.pool_password + ] +} + +output "execute_point_runner" { + description = "Toolkit Runner to download pool secrets to an HTCondor execute point" + value = local.runner_execute + depends_on = [ + google_secret_manager_secret_version.pool_password + ] +} + +output "windows_startup_ps1" { + description = "PowerShell script to download pool secrets to an HTCondor execute point" + value = local.windows_startup_ps1 +} diff --git a/community/modules/scheduler/htcondor-pool-secrets/templates/fetch-idtoken.ps1.tftpl b/community/modules/scheduler/htcondor-pool-secrets/templates/fetch-idtoken.ps1.tftpl new file mode 100644 index 0000000000..ec6773053c --- /dev/null +++ b/community/modules/scheduler/htcondor-pool-secrets/templates/fetch-idtoken.ps1.tftpl @@ -0,0 +1,17 @@ +$config_dir = 'C:\Condor\config' +if(!(test-path -PathType container -Path $config_dir)) { + New-Item -ItemType Directory -Path $config_dir +} +$config_file = "$config_dir\51-ghpc-trust-domain" + +$config_string = @' +# these lines must appear AFTER any "use role:" settings +UID_DOMAIN = ${trust_domain} +TRUST_DOMAIN = ${trust_domain} +'@ + +Set-Content -Path "$config_file" -Value "$config_string" + +# obtain IDTOKEN for authentication by StartD to Central Manager +gcloud secrets versions access latest --secret ${xp_idtoken_secret_id} ` + --out-file C:\condor\tokens.d\condor@${trust_domain} diff --git a/community/modules/scheduler/htcondor-pool-secrets/variables.tf b/community/modules/scheduler/htcondor-pool-secrets/variables.tf new file mode 100644 index 0000000000..edc377766a --- /dev/null +++ b/community/modules/scheduler/htcondor-pool-secrets/variables.tf @@ -0,0 +1,58 @@ +/** + * Copyright 2023 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +variable "project_id" { + description = "Project in which HTCondor pool will be created" + type = string +} + +variable "deployment_name" { + description = "HPC Toolkit deployment name. HTCondor cloud resource names will include this value." + type = string +} + +variable "labels" { + description = "Labels to add to resources. List key, value pairs." 
+  type        = map(string)
+}
+
+variable "access_point_service_account_email" {
+  description = "HTCondor access point service account e-mail"
+  type        = string
+}
+
+variable "central_manager_service_account_email" {
+  description = "HTCondor central manager service account e-mail"
+  type        = string
+}
+
+variable "execute_point_service_account_email" {
+  description = "HTCondor execute point service account e-mail"
+  type        = string
+}
+
+variable "pool_password" {
+  description = "HTCondor Pool Password"
+  type        = string
+  sensitive   = true
+  default     = null
+}
+
+variable "trust_domain" {
+  description = "Trust domain for HTCondor pool (if not supplied, will be set based on project_id)"
+  type        = string
+  default     = ""
+}
diff --git a/community/modules/scheduler/htcondor-pool-secrets/versions.tf b/community/modules/scheduler/htcondor-pool-secrets/versions.tf
new file mode 100644
index 0000000000..29f4dadff6
--- /dev/null
+++ b/community/modules/scheduler/htcondor-pool-secrets/versions.tf
@@ -0,0 +1,33 @@
+/**
+ * Copyright 2023 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+terraform {
+  required_providers {
+    google = {
+      source  = "hashicorp/google"
+      version = ">= 3.83"
+    }
+    random = {
+      source  = "hashicorp/random"
+      version = ">= 3.0"
+    }
+  }
+  provider_meta "google" {
+    module_name = "blueprints/terraform/hpc-toolkit:htcondor-pool-secrets/v1.19.1"
+  }
+
+  required_version = ">= 0.13.0"
+}
diff --git a/modules/README.md b/modules/README.md
index 2524588ca5..454a7886cd 100644
--- a/modules/README.md
+++ b/modules/README.md
@@ -153,8 +153,11 @@ Modules that are still in development and less stable are labeled with the
   Creates a Slurm login node using [slurm-gcp-version-5].
 * **[schedmd-slurm-gcp-v5-hybrid]** ![community-badge] ![experimental-badge] :
   Creates hybrid Slurm partition configuration files using [slurm-gcp-version-5].
-* **[htcondor-base]** ![community-badge] ![experimental-badge] : Creates
-  Toolkit runners and service accounts to configure an HTCondor pool.
+* **[htcondor-base]** ![community-badge] ![experimental-badge] : Creates the
+  base infrastructure for an HTCondor pool (service accounts and Cloud Storage bucket).
+* **[htcondor-pool-secrets]** ![community-badge] ![experimental-badge] : Creates
+  and manages access to the secrets necessary for secure operation of an
+  HTCondor pool.
 * **[pbspro-client]** ![community-badge] ![experimental-badge] : Creates a
   client host for submitting jobs to a PBS Professional cluster.
* **[pbspro-server]** ![community-badge] ![experimental-badge] : Creates From ae3f0b2a9e32dac418ee31bda45bcdcf1da40059 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Tue, 11 Jul 2023 16:31:40 -0500 Subject: [PATCH 065/144] Fix apparent bug in storing HTCondor pool password --- .../htcondor-pool-secrets/files/htcondor_secrets.yml | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/community/modules/scheduler/htcondor-pool-secrets/files/htcondor_secrets.yml b/community/modules/scheduler/htcondor-pool-secrets/files/htcondor_secrets.yml index 83cf7c8b24..39ba4095db 100644 --- a/community/modules/scheduler/htcondor-pool-secrets/files/htcondor_secrets.yml +++ b/community/modules/scheduler/htcondor-pool-secrets/files/htcondor_secrets.yml @@ -36,11 +36,9 @@ - name: Get HTCondor Pool password (token signing key) when: htcondor_role != 'get_htcondor_execute' ansible.builtin.shell: | - set -e -o pipefail - TMPFILE=$(mktemp) - gcloud secrets versions access latest --out-file "$TMPFILE" --secret={{ password_id }} - condor_store_cred add -c -i "$TMPFILE" - rm -f "$TMPFILE" + set -e -o pipefail +o history + POOL_PASSWORD=$(gcloud secrets versions access latest --secret={{ password_id }}) + echo -n "$POOL_PASSWORD" | sh -c "condor_store_cred add -c -i -" args: creates: "{{ condor_config_root }}/passwords.d/POOL" executable: /bin/bash From 104b14fbb8a291c0f8a0c50cd3b789985c51ec84 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Tue, 11 Jul 2023 16:31:41 -0500 Subject: [PATCH 066/144] Allow multiple PS1 scripts in succession on Windows execute points --- community/modules/compute/htcondor-execute-point/README.md | 4 ++-- community/modules/compute/htcondor-execute-point/main.tf | 7 +++++-- .../modules/compute/htcondor-execute-point/variables.tf | 5 +++-- .../modules/compute/htcondor-execute-point/versions.tf | 2 +- 4 files changed, 11 insertions(+), 7 deletions(-) diff --git a/community/modules/compute/htcondor-execute-point/README.md b/community/modules/compute/htcondor-execute-point/README.md index 897556bc82..a186539149 100644 --- a/community/modules/compute/htcondor-execute-point/README.md +++ b/community/modules/compute/htcondor-execute-point/README.md @@ -158,7 +158,7 @@ limitations under the License. | Name | Version | |------|---------| -| [terraform](#requirement\_terraform) | >= 1.0 | +| [terraform](#requirement\_terraform) | >= 1.1 | | [google](#requirement\_google) | >= 4.0 | ## Providers @@ -203,7 +203,7 @@ limitations under the License. | [startup\_script](#input\_startup\_script) | Startup script to run at boot-time for Linux HTCondor execute points | `string` | `null` | no | | [subnetwork\_self\_link](#input\_subnetwork\_self\_link) | The self link of the subnetwork HTCondor execute points will join | `string` | `null` | no | | [target\_size](#input\_target\_size) | Initial size of the HTCondor execute point pool; set to null (default) to avoid Terraform management of size. 
| `number` | `null` | no | -| [windows\_startup\_ps1](#input\_windows\_startup\_ps1) | Startup script to run at boot-time for Windows-based HTCondor execute points | `string` | `null` | no | +| [windows\_startup\_ps1](#input\_windows\_startup\_ps1) | Startup script to run at boot-time for Windows-based HTCondor execute points | `list(string)` | `[]` | no | | [zone](#input\_zone) | The default zone in which resources will be created | `string` | n/a | yes | ## Outputs diff --git a/community/modules/compute/htcondor-execute-point/main.tf b/community/modules/compute/htcondor-execute-point/main.tf index 5c775eb7bd..eadbbefe58 100644 --- a/community/modules/compute/htcondor-execute-point/main.tf +++ b/community/modules/compute/htcondor-execute-point/main.tf @@ -19,6 +19,7 @@ locals { labels = merge(var.labels, { ghpc_module = "htcondor-execute-point" }) } + locals { network_storage_metadata = var.network_storage == null ? {} : { network_storage = jsonencode(var.network_storage) } @@ -28,9 +29,11 @@ locals { } enable_oslogin = var.enable_oslogin == "INHERIT" ? {} : { enable-oslogin = lookup(local.oslogin_api_values, var.enable_oslogin, "") } + windows_startup_ps1 = join("\n\n", var.windows_startup_ps1) + is_windows_image = anytrue([for l in data.google_compute_image.htcondor.licenses : length(regexall("windows-cloud", l)) > 0]) - windows_startup_metadata = local.is_windows_image && var.windows_startup_ps1 != null ? { - windows-startup-script-ps1 = var.windows_startup_ps1 + windows_startup_metadata = local.is_windows_image && local.windows_startup_ps1 != "" ? { + windows-startup-script-ps1 = local.windows_startup_ps1 } : {} metadata = merge(local.windows_startup_metadata, local.network_storage_metadata, local.enable_oslogin, var.metadata) diff --git a/community/modules/compute/htcondor-execute-point/variables.tf b/community/modules/compute/htcondor-execute-point/variables.tf index 2ac6b4addd..a0413ceff5 100644 --- a/community/modules/compute/htcondor-execute-point/variables.tf +++ b/community/modules/compute/htcondor-execute-point/variables.tf @@ -155,6 +155,7 @@ variable "disk_size_gb" { variable "windows_startup_ps1" { description = "Startup script to run at boot-time for Windows-based HTCondor execute points" - type = string - default = null + type = list(string) + default = [] + nullable = false } diff --git a/community/modules/compute/htcondor-execute-point/versions.tf b/community/modules/compute/htcondor-execute-point/versions.tf index 275c483863..4e0c5180e3 100644 --- a/community/modules/compute/htcondor-execute-point/versions.tf +++ b/community/modules/compute/htcondor-execute-point/versions.tf @@ -15,7 +15,7 @@ */ terraform { - required_version = ">= 1.0" + required_version = ">= 1.1" required_providers { google = { From d9d7a0e0cd2f06ef545745a0790e7b9da6218506 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Tue, 11 Jul 2023 18:50:17 -0500 Subject: [PATCH 067/144] Fix version # of Toolkit metadata --- community/modules/compute/htcondor-execute-point/versions.tf | 2 +- community/modules/scheduler/htcondor-pool-secrets/versions.tf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/community/modules/compute/htcondor-execute-point/versions.tf b/community/modules/compute/htcondor-execute-point/versions.tf index 4e0c5180e3..f06d695674 100644 --- a/community/modules/compute/htcondor-execute-point/versions.tf +++ b/community/modules/compute/htcondor-execute-point/versions.tf @@ -25,6 +25,6 @@ terraform { } provider_meta "google" { - module_name = 
"blueprints/terraform/hpc-toolkit:htcondor-execute-point/v1.19.1" + module_name = "blueprints/terraform/hpc-toolkit:htcondor-execute-point/v1.20.0" } } diff --git a/community/modules/scheduler/htcondor-pool-secrets/versions.tf b/community/modules/scheduler/htcondor-pool-secrets/versions.tf index 29f4dadff6..56d1dbd47d 100644 --- a/community/modules/scheduler/htcondor-pool-secrets/versions.tf +++ b/community/modules/scheduler/htcondor-pool-secrets/versions.tf @@ -26,7 +26,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:htcondor-pool-secrets/v1.19.1" + module_name = "blueprints/terraform/hpc-toolkit:htcondor-pool-secrets/v1.20.0" } required_version = ">= 0.13.0" From ccf91cefd1888da535c980f83070686502594b72 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Tue, 11 Jul 2023 23:00:53 -0500 Subject: [PATCH 068/144] Simplify htcondor-install usage --- community/examples/htc-htcondor.yaml | 7 +--- .../scripts/htcondor-install/README.md | 4 +- .../modules/scripts/htcondor-install/main.tf | 42 +++++++++---------- .../scripts/htcondor-install/outputs.tf | 14 +------ 4 files changed, 26 insertions(+), 41 deletions(-) diff --git a/community/examples/htc-htcondor.yaml b/community/examples/htc-htcondor.yaml index 1fb368d075..a074364699 100644 --- a/community/examples/htc-htcondor.yaml +++ b/community/examples/htc-htcondor.yaml @@ -39,10 +39,8 @@ deployment_groups: - id: htcondor_install_script source: modules/scripts/startup-script - settings: - runners: - - $(htcondor_install.install_htcondor_runner) - - $(htcondor_install.install_autoscaler_deps_runner) + use: + - htcondor_install - group: packer modules: @@ -150,7 +148,6 @@ deployment_groups: source: modules/scripts/startup-script settings: runners: - - $(htcondor_install.install_autoscaler_runner) - $(htcondor_secrets.access_point_runner) - $(htcondor_base.access_point_runner) - $(htcondor_execute_point.configure_autoscaler_runner) diff --git a/community/modules/scripts/htcondor-install/README.md b/community/modules/scripts/htcondor-install/README.md index c6d988971c..7b2d07b9f5 100644 --- a/community/modules/scripts/htcondor-install/README.md +++ b/community/modules/scripts/htcondor-install/README.md @@ -134,8 +134,6 @@ No resources. 
| Name | Description | |------|-------------| | [gcp\_service\_list](#output\_gcp\_service\_list) | Google Cloud APIs required by HTCondor | -| [install\_autoscaler\_deps\_runner](#output\_install\_autoscaler\_deps\_runner) | Toolkit Runner to install HTCondor autoscaler dependencies | -| [install\_autoscaler\_runner](#output\_install\_autoscaler\_runner) | Toolkit Runner to install HTCondor autoscaler | -| [install\_htcondor\_runner](#output\_install\_htcondor\_runner) | Runner to install HTCondor using startup-scripts | +| [runners](#output\_runners) | Runner to install HTCondor using startup-scripts | | [windows\_startup\_ps1](#output\_windows\_startup\_ps1) | Windows PowerShell script to install HTCondor | diff --git a/community/modules/scripts/htcondor-install/main.tf b/community/modules/scripts/htcondor-install/main.tf index 8648351ab2..6c2a864494 100644 --- a/community/modules/scripts/htcondor-install/main.tf +++ b/community/modules/scripts/htcondor-install/main.tf @@ -15,33 +15,33 @@ */ locals { - runner_install_htcondor = { - "type" = "ansible-local" - "source" = "${path.module}/files/install-htcondor.yaml" - "destination" = "install-htcondor.yaml" - "args" = join(" ", [ - "-e enable_docker=${var.enable_docker}", - "-e condor_version=${var.condor_version}", - ]) - } + runners = [ + { + "type" = "ansible-local" + "source" = "${path.module}/files/install-htcondor.yaml" + "destination" = "install-htcondor.yaml" + "args" = join(" ", [ + "-e enable_docker=${var.enable_docker}", + "-e condor_version=${var.condor_version}", + ]) + }, + { + "type" = "ansible-local" + "content" = file("${path.module}/files/install-htcondor-autoscaler-deps.yml") + "destination" = "install-htcondor-autoscaler-deps.yml" + }, + { + "type" = "data" + "content" = file("${path.module}/files/autoscaler.py") + "destination" = "/usr/local/htcondor/bin/autoscaler.py" + }, + ] install_htcondor_ps1 = templatefile( "${path.module}/templates/install-htcondor.ps1.tftpl", { condor_version = var.condor_version }) - runner_install_autoscaler_deps = { - "type" = "ansible-local" - "content" = file("${path.module}/files/install-htcondor-autoscaler-deps.yml") - "destination" = "install-htcondor-autoscaler-deps.yml" - } - - runner_install_autoscaler = { - "type" = "data" - "content" = file("${path.module}/files/autoscaler.py") - "destination" = "/usr/local/htcondor/bin/autoscaler.py" - } - required_apis = [ "compute.googleapis.com", "secretmanager.googleapis.com", diff --git a/community/modules/scripts/htcondor-install/outputs.tf b/community/modules/scripts/htcondor-install/outputs.tf index 1a0d55b7bf..c7951737ff 100644 --- a/community/modules/scripts/htcondor-install/outputs.tf +++ b/community/modules/scripts/htcondor-install/outputs.tf @@ -14,9 +14,9 @@ * limitations under the License. 
*/ -output "install_htcondor_runner" { +output "runners" { description = "Runner to install HTCondor using startup-scripts" - value = local.runner_install_htcondor + value = local.runners } output "windows_startup_ps1" { @@ -24,16 +24,6 @@ output "windows_startup_ps1" { value = local.install_htcondor_ps1 } -output "install_autoscaler_deps_runner" { - description = "Toolkit Runner to install HTCondor autoscaler dependencies" - value = local.runner_install_autoscaler_deps -} - -output "install_autoscaler_runner" { - description = "Toolkit Runner to install HTCondor autoscaler" - value = local.runner_install_autoscaler -} - output "gcp_service_list" { description = "Google Cloud APIs required by HTCondor" value = local.required_apis From 31ecab51d2964af80dd84c3d45dc2be71c7e77b5 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Wed, 12 Jul 2023 15:27:09 -0500 Subject: [PATCH 069/144] Implement wrapper for HTCondor access points Refactor existing blueprint using vm-instance and startup-script modules using a wrapper that adds the additional functionality of managing highly available HTCondor access points using a managed instance group. --- community/examples/htc-htcondor.yaml | 43 +---- .../compute/htcondor-execute-point/README.md | 2 +- .../compute/htcondor-execute-point/main.tf | 2 +- .../compute/htcondor-execute-point/outputs.tf | 4 +- .../scheduler/htcondor-access-point/README.md | 106 +++++++++++ .../files/htcondor_configure.yml | 141 ++++++++++++++ .../scheduler/htcondor-access-point/main.tf | 178 ++++++++++++++++++ .../htcondor-access-point/outputs.tf | 20 ++ .../templates/condor_config.tftpl | 55 ++++++ .../htcondor-access-point/variables.tf | 151 +++++++++++++++ .../htcondor-access-point/versions.tf | 29 +++ .../modules/scheduler/htcondor-base/README.md | 8 +- .../files/htcondor_configure.yml | 64 +------ .../modules/scheduler/htcondor-base/main.tf | 31 +-- .../scheduler/htcondor-base/outputs.tf | 19 +- .../templates/condor_config.tftpl | 38 ---- .../scheduler/htcondor-base/variables.tf | 12 -- modules/README.md | 7 +- 18 files changed, 711 insertions(+), 199 deletions(-) create mode 100644 community/modules/scheduler/htcondor-access-point/README.md create mode 100644 community/modules/scheduler/htcondor-access-point/files/htcondor_configure.yml create mode 100644 community/modules/scheduler/htcondor-access-point/main.tf create mode 100644 community/modules/scheduler/htcondor-access-point/outputs.tf create mode 100644 community/modules/scheduler/htcondor-access-point/templates/condor_config.tftpl create mode 100644 community/modules/scheduler/htcondor-access-point/variables.tf create mode 100644 community/modules/scheduler/htcondor-access-point/versions.tf diff --git a/community/examples/htc-htcondor.yaml b/community/examples/htc-htcondor.yaml index a074364699..904c31f552 100644 --- a/community/examples/htc-htcondor.yaml +++ b/community/examples/htc-htcondor.yaml @@ -94,7 +94,7 @@ deployment_groups: - network: null subnetwork: $(network1.subnetwork_self_link) subnetwork_project: $(vars.project_id) - network_ip: $(htcondor_base.central_manager_internal_ip) + network_ip: $(htcondor_base.central_manager_ips[0]) stack_type: null access_config: [] ipv6_access_config: [] @@ -144,45 +144,18 @@ deployment_groups: scopes: - cloud-platform - - id: htcondor_startup_access_point - source: modules/scripts/startup-script - settings: - runners: - - $(htcondor_secrets.access_point_runner) - - $(htcondor_base.access_point_runner) - - $(htcondor_execute_point.configure_autoscaler_runner) - - 
$(htcondor_execute_point_spot.configure_autoscaler_runner) - - type: data - destination: /var/tmp/helloworld.sub - content: | - universe = vanilla - executable = /bin/sleep - arguments = 1000 - output = out.\$(ClusterId).\$(ProcId) - error = err.\$(ClusterId).\$(ProcId) - log = log.\$(ClusterId).\$(ProcId) - request_cpus = 1 - request_memory = 100MB - # if unset, defaults to false - +RequireSpot = true - queue - - id: htcondor_access - source: modules/compute/vm-instance + source: community/modules/scheduler/htcondor-access-point use: - network1 - - htcondor_startup_access_point + - htcondor_secrets + - htcondor_base + - htcondor_execute_point + - htcondor_execute_point_spot settings: - name_prefix: ap + enable_public_ips: true instance_image: project: $(vars.project_id) family: $(vars.new_image_family) - add_deployment_name_before_prefix: true - machine_type: c2-standard-4 - service_account: - email: $(htcondor_base.access_point_service_account_email) - scopes: - - cloud-platform outputs: - - internal_ip - - external_ip + - list_instances_command diff --git a/community/modules/compute/htcondor-execute-point/README.md b/community/modules/compute/htcondor-execute-point/README.md index a186539149..af3cd3d26b 100644 --- a/community/modules/compute/htcondor-execute-point/README.md +++ b/community/modules/compute/htcondor-execute-point/README.md @@ -210,5 +210,5 @@ limitations under the License. | Name | Description | |------|-------------| -| [configure\_autoscaler\_runner](#output\_configure\_autoscaler\_runner) | Toolkit runner to configure the HTCondor autoscaler | +| [autoscaler\_runner](#output\_autoscaler\_runner) | Toolkit runner to configure the HTCondor autoscaler | diff --git a/community/modules/compute/htcondor-execute-point/main.tf b/community/modules/compute/htcondor-execute-point/main.tf index eadbbefe58..0bdb1e9a58 100644 --- a/community/modules/compute/htcondor-execute-point/main.tf +++ b/community/modules/compute/htcondor-execute-point/main.tf @@ -38,7 +38,7 @@ locals { metadata = merge(local.windows_startup_metadata, local.network_storage_metadata, local.enable_oslogin, var.metadata) - configure_autoscaler_role = { + autoscaler_runner = { "type" = "ansible-local" "content" = file("${path.module}/files/htcondor_configure_autoscaler.yml") "destination" = "htcondor_configure_autoscaler_${module.mig.instance_group_manager.name}.yml" diff --git a/community/modules/compute/htcondor-execute-point/outputs.tf b/community/modules/compute/htcondor-execute-point/outputs.tf index d009c465cd..13192401c4 100644 --- a/community/modules/compute/htcondor-execute-point/outputs.tf +++ b/community/modules/compute/htcondor-execute-point/outputs.tf @@ -14,7 +14,7 @@ * limitations under the License. */ -output "configure_autoscaler_runner" { - value = local.configure_autoscaler_role +output "autoscaler_runner" { + value = local.autoscaler_runner description = "Toolkit runner to configure the HTCondor autoscaler" } diff --git a/community/modules/scheduler/htcondor-access-point/README.md b/community/modules/scheduler/htcondor-access-point/README.md new file mode 100644 index 0000000000..d58f633a96 --- /dev/null +++ b/community/modules/scheduler/htcondor-access-point/README.md @@ -0,0 +1,106 @@ +## Description + +This module provisions a highly available HTCondor access point using a [Managed +Instance Group (MIG)][mig] with auto-healing. 
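+
+As a usage sketch (the module ids `network1`, `htcondor_secrets`,
+`htcondor_base`, and `htcondor_execute_point` are assumed to be defined
+elsewhere in the blueprint, and `vars.new_image_family` to name a custom
+HTCondor image family, as in the htc-htcondor.yaml example), an access
+point can be instantiated as:
+
+```yaml
+  - id: htcondor_access
+    source: community/modules/scheduler/htcondor-access-point
+    use:
+    - network1
+    - htcondor_secrets
+    - htcondor_base
+    - htcondor_execute_point
+    settings:
+      enable_public_ips: true
+      instance_image:
+        project: $(vars.project_id)
+        family: $(vars.new_image_family)
+```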
+ +[mig]: https://cloud.google.com/compute/docs/instance-groups + +## Usage + +Although this provisions an HTCondor access point with standard configuration, +for a functioning node, you must supply Toolkit runners as described below: + +- [var.access_point_runner](#input_access_point_runner) + - Runner must download or otherwise create an [IDTOKEN] with ADVERTISE_MASTER, + ADVERTISE_SCHEDD, and DAEMON scopes +- [var.autoscaler_runner](#input_autoscaler_runner) + - 1 runner for each set of execute points to add to the pool + +Reference implementations for each are included in the Toolkit modules +[htcondor-pool-secrets] and [htcondor-execute-point]. You may substitute +implementations (e.g. alternative secret management) so long as they duplicate +the functionality in these references. Their usage is demonstrated in the +[HTCondor example][htc-example]. + +[htc-example]: ../../../../examples/README.md#htc-htcondoryaml-- +[htcondor-execute-point]: ../../compute/htcondor-execute-point/README.md +[htcondor-pool-secrets]: ../htcondor-pool-secrets/README.md +[IDTOKEN]: https://htcondor.readthedocs.io/en/latest/admin-manual/security.html#introducing-idtokens + + +Copyright 2023 Google LLC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +## Requirements + +| Name | Version | +|------|---------| +| [terraform](#requirement\_terraform) | >= 0.13.0 | +| [google](#requirement\_google) | >= 3.83 | + +## Providers + +| Name | Version | +|------|---------| +| [google](#provider\_google) | >= 3.83 | + +## Modules + +| Name | Source | Version | +|------|--------|---------| +| [access\_point\_instance\_template](#module\_access\_point\_instance\_template) | github.com/terraform-google-modules/terraform-google-vm//modules/instance_template | 84d7959 | +| [htcondor\_ap](#module\_htcondor\_ap) | github.com/terraform-google-modules/terraform-google-vm//modules/mig | 84d7959 | +| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.20.0&depth=1 | + +## Resources + +| Name | Type | +|------|------| +| [google_storage_bucket_object.ap_config](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/storage_bucket_object) | resource | +| [google_compute_image.htcondor](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_image) | data source | +| [google_compute_region_instance_group.ap](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_region_instance_group) | data source | +| [google_compute_zones.available](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_zones) | data source | + +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [access\_point\_runner](#input\_access\_point\_runner) | A list of Toolkit runners for configuring an HTCondor access point | `list(map(string))` | `[]` | no | +| 
[access\_point\_service\_account\_email](#input\_access\_point\_service\_account\_email) | Service account for access point (e-mail format) | `string` | n/a | yes | +| [autoscaler\_runner](#input\_autoscaler\_runner) | A list of Toolkit runners for configuring autoscaling daemons | `list(map(string))` | `[]` | no | +| [central\_manager\_ips](#input\_central\_manager\_ips) | List of IP addresses of HTCondor Central Managers | `list(string)` | n/a | yes | +| [deployment\_name](#input\_deployment\_name) | HPC Toolkit deployment name. HTCondor cloud resource names will include this value. | `string` | n/a | yes | +| [disk\_size\_gb](#input\_disk\_size\_gb) | Boot disk size in GB | `number` | `null` | no | +| [enable\_high\_availability](#input\_enable\_high\_availability) | Provision HTCondor access point in high availability mode | `bool` | `false` | no | +| [enable\_oslogin](#input\_enable\_oslogin) | Enable or Disable OS Login with "ENABLE" or "DISABLE". Set to "INHERIT" to inherit project OS Login setting. | `string` | `"ENABLE"` | no | +| [enable\_public\_ips](#input\_enable\_public\_ips) | Enable Public IPs on the access points | `bool` | `false` | no | +| [htcondor\_bucket\_name](#input\_htcondor\_bucket\_name) | Name of HTCondor configuration bucket | `string` | n/a | yes | +| [instance\_image](#input\_instance\_image) | Custom VM image with HTCondor and Toolkit support installed. |
object({
family = string,
project = string
})
| n/a | yes |
| [labels](#input\_labels) | Labels to add to resources. List key, value pairs. | `map(string)` | n/a | yes |
| [machine\_type](#input\_machine\_type) | Machine type to use for HTCondor access points | `string` | `"c2-standard-4"` | no |
| [metadata](#input\_metadata) | Metadata to add to HTCondor access points | `map(string)` | `{}` | no |
| [network\_self\_link](#input\_network\_self\_link) | The self link of the network in which the HTCondor access point will be created. | `string` | `null` | no |
| [network\_storage](#input\_network\_storage) | An array of network attached storage mounts to be configured |
list(object({
server_ip = string,
remote_mount = string,
local_mount = string,
fs_type = string,
mount_options = string,
client_install_runner = map(string)
mount_runner = map(string)
}))
| `[]` | no |
| [project\_id](#input\_project\_id) | Project in which HTCondor pool will be created | `string` | n/a | yes |
| [region](#input\_region) | Default region for creating resources | `string` | n/a | yes |
| [service\_account\_scopes](#input\_service\_account\_scopes) | Scopes by which to limit service account attached to the access point. | `set(string)` |
[
"https://www.googleapis.com/auth/cloud-platform"
]
| no | +| [spool\_parent\_dir](#input\_spool\_parent\_dir) | HTCondor access point configuration SPOOL will be set to subdirectory named "spool" | `string` | `"/var/lib/condor"` | no | +| [subnetwork\_self\_link](#input\_subnetwork\_self\_link) | The self link of the subnetwork in which the HTCondor central manager will be created. | `string` | `null` | no | + +## Outputs + +| Name | Description | +|------|-------------| +| [list\_instances\_command](#output\_list\_instances\_command) | Command to list Access Points provisioned by this module | + diff --git a/community/modules/scheduler/htcondor-access-point/files/htcondor_configure.yml b/community/modules/scheduler/htcondor-access-point/files/htcondor_configure.yml new file mode 100644 index 0000000000..93f3dfe8d4 --- /dev/null +++ b/community/modules/scheduler/htcondor-access-point/files/htcondor_configure.yml @@ -0,0 +1,141 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--- +- name: Configure HTCondor Access Point + hosts: localhost + become: true + vars: + job_queue_ha: false + spool_dir: /var/lib/condor/spool + condor_config_root: /etc/condor + ghpc_config_file: 50-ghpc-managed + schedd_ha_config_file: 51-ghpc-schedd-high-availability + tasks: + - name: Ensure necessary variables are set + ansible.builtin.assert: + that: + - htcondor_role is defined + - config_object is defined + - name: Remove default HTCondor configuration + ansible.builtin.file: + path: "{{ condor_config_root }}/config.d/00-htcondor-9.0.config" + state: absent + notify: + - Reload HTCondor + - name: Create Toolkit configuration file + register: config_update + changed_when: config_update.rc == 137 + failed_when: config_update.rc != 0 and config_update.rc != 137 + ansible.builtin.shell: | + set -e -o pipefail + REMOTE_HASH=$(gcloud --format="value(md5_hash)" storage hash {{ config_object }}) + + CONFIG_FILE="{{ condor_config_root }}/config.d/{{ ghpc_config_file }}" + if [ -f "${CONFIG_FILE}" ]; then + LOCAL_HASH=$(gcloud --format="value(md5_hash)" storage hash "${CONFIG_FILE}") + else + LOCAL_HASH="INVALID-HASH" + fi + + if [ "${REMOTE_HASH}" != "${LOCAL_HASH}" ]; then + gcloud storage cp {{ config_object }} "${CONFIG_FILE}" + chmod 0644 "${CONFIG_FILE}" + exit 137 + fi + args: + executable: /bin/bash + - name: Configure HTCondor SchedD + when: htcondor_role == 'get_htcondor_submit' + block: + - name: Setup Spool directory + ansible.builtin.file: + path: "{{ spool_dir }}" + state: directory + owner: condor + group: condor + mode: 0755 + - name: Enable SchedD high availability + when: job_queue_ha | bool + block: + - name: Set SchedD HA configuration (requires restart) + ansible.builtin.copy: + dest: "{{ condor_config_root }}/config.d/{{ schedd_ha_config_file }}" + mode: 0644 + content: | + MASTER_HA_LIST=SCHEDD + HA_LOCK_URL=file:{{ spool_dir }} + VALID_SPOOL_FILES=$(VALID_SPOOL_FILES), SCHEDD.lock + HA_POLL_PERIOD=30 + SCHEDD_NAME=had-schedd@ + notify: + - Restart HTCondor + # although HTCondor is guaranteed to start after mounting 
remote + # filesystems is *attempted*, it does not guarantee successful mounts; + # this additional SystemD setting will refuse to start HTCondor if the + # spool shared filesystem has not been mounted + - name: Create SystemD override directory for HTCondor + ansible.builtin.file: + path: /etc/systemd/system/condor.service.d + state: directory + owner: root + group: root + mode: 0755 + - name: Ensure HTCondor starts after shared filesystem is mounted + ansible.builtin.copy: + dest: /etc/systemd/system/condor.service.d/mount-spool.conf + mode: 0644 + content: | + [Unit] + RequiresMountsFor={{ spool_dir }} + notify: + - Reload SystemD + - name: Disable SchedD high availability + when: not job_queue_ha | bool + block: + - name: Remove SchedD HA configuration file + ansible.builtin.file: + path: "{{ condor_config_root }}/config.d/{{ schedd_ha_config_file }}" + state: absent + notify: + - Restart HTCondor + - name: Remove HTCondor SystemD override + ansible.builtin.file: + path: /etc/systemd/system/condor.service.d/mount-spool.conf + state: absent + notify: + - Reload SystemD + handlers: + - name: Reload SystemD + ansible.builtin.systemd: + daemon_reload: true + - name: Restart HTCondor + ansible.builtin.service: + name: condor + state: restarted + - name: Reload HTCondor + ansible.builtin.service: + name: condor + state: reloaded + post_tasks: + - name: Start HTCondor + ansible.builtin.service: + name: condor + state: started + enabled: true + - name: Inform users + changed_when: false + ansible.builtin.shell: | + set -e -o pipefail + wall "******* HTCondor system configuration complete ********" diff --git a/community/modules/scheduler/htcondor-access-point/main.tf b/community/modules/scheduler/htcondor-access-point/main.tf new file mode 100644 index 0000000000..13e28e13e6 --- /dev/null +++ b/community/modules/scheduler/htcondor-access-point/main.tf @@ -0,0 +1,178 @@ +/** + * Copyright 2023 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +locals { + # This label allows for billing report tracking based on module. + labels = merge(var.labels, { ghpc_module = "htcondor-access-point" }) +} + +locals { + network_storage_metadata = var.network_storage == null ? {} : { network_storage = jsonencode(var.network_storage) } + oslogin_api_values = { + "DISABLE" = "FALSE" + "ENABLE" = "TRUE" + } + enable_oslogin_metadata = var.enable_oslogin == "INHERIT" ? 
{} : { enable-oslogin = lookup(local.oslogin_api_values, var.enable_oslogin, "") } + metadata = merge(local.network_storage_metadata, local.enable_oslogin_metadata, var.metadata) + + name_prefix = "${var.deployment_name}-ap" + + example_runner = { + type = "data" + destination = "/var/tmp/helloworld.sub" + content = <<-EOT + universe = vanilla + executable = /bin/sleep + arguments = 1000 + output = out.\$(ClusterId).\$(ProcId) + error = err.\$(ClusterId).\$(ProcId) + log = log.\$(ClusterId).\$(ProcId) + request_cpus = 1 + request_memory = 100MB + # if unset, defaults to false + +RequireSpot = true + queue + EOT + } + + all_runners = concat( + var.access_point_runner, + [local.schedd_runner], + var.autoscaler_runner, + [local.example_runner] + ) + disk_size_gb = max(var.disk_size_gb, data.google_compute_image.htcondor.disk_size_gb) + + ap_config = templatefile("${path.module}/templates/condor_config.tftpl", { + htcondor_role = "get_htcondor_submit", + central_manager_ips = var.central_manager_ips + spool_dir = "${var.spool_parent_dir}/spool", + }) + + ap_object = "gs://${var.htcondor_bucket_name}/${google_storage_bucket_object.ap_config.output_name}" + schedd_runner = { + "type" = "ansible-local" + "content" = file("${path.module}/files/htcondor_configure.yml") + "destination" = "htcondor_configure.yml" + "args" = join(" ", [ + "-e htcondor_role=get_htcondor_submit", + "-e config_object=${local.ap_object}", + "-e job_queue_ha=${var.enable_high_availability}", + "-e spool_dir=${var.spool_parent_dir}/spool", + ]) + } + + list_instances_command = "gcloud compute instance-groups list-instances ${data.google_compute_region_instance_group.ap.name} --region ${var.region} --project ${var.project_id}" +} + +data "google_compute_image" "htcondor" { + family = var.instance_image.family + project = var.instance_image.project +} + +data "google_compute_zones" "available" { + project = var.project_id + region = var.region +} + +data "google_compute_region_instance_group" "ap" { + self_link = module.htcondor_ap.self_link +} + +resource "google_storage_bucket_object" "ap_config" { + name = "${var.deployment_name}-ap-config-${substr(md5(local.ap_config), 0, 4)}" + content = local.ap_config + bucket = var.htcondor_bucket_name +} + +module "startup_script" { + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.20.0&depth=1" + + project_id = var.project_id + region = var.region + labels = local.labels + deployment_name = var.deployment_name + + runners = local.all_runners +} + +module "access_point_instance_template" { + # tflint-ignore: terraform_module_pinned_source + source = "github.com/terraform-google-modules/terraform-google-vm//modules/instance_template?ref=84d7959" + + name_prefix = local.name_prefix + project_id = var.project_id + network = var.network_self_link + subnetwork = var.subnetwork_self_link + service_account = { + email = var.access_point_service_account_email + scopes = var.service_account_scopes + } + labels = local.labels + + machine_type = var.machine_type + disk_size_gb = local.disk_size_gb + preemptible = false + startup_script = module.startup_script.startup_script + metadata = local.metadata + source_image = data.google_compute_image.htcondor.self_link +} + +module "htcondor_ap" { + # tflint-ignore: terraform_module_pinned_source + source = "github.com/terraform-google-modules/terraform-google-vm//modules/mig?ref=84d7959" + + project_id = var.project_id + region = var.region + target_size = var.enable_high_availability ? 
2 : 1 + hostname = local.name_prefix + instance_template = module.access_point_instance_template.self_link + + health_check_name = "health-${local.name_prefix}" + health_check = { + type = "tcp" + initial_delay_sec = 600 + check_interval_sec = 20 + healthy_threshold = 2 + timeout_sec = 8 + unhealthy_threshold = 3 + response = "" + proxy_header = "NONE" + port = 9618 + request = "" + request_path = "" + host = "" + enable_logging = true + } + + update_policy = [{ + instance_redistribution_type = "NONE" + replacement_method = "SUBSTITUTE" + max_surge_fixed = length(data.google_compute_zones.available.names) + max_unavailable_fixed = length(data.google_compute_zones.available.names) + max_surge_percent = null + max_unavailable_percent = null + min_ready_sec = 300 + minimal_action = "REPLACE" + type = "OPPORTUNISTIC" + }] + + stateful_ips = [{ + interface_name = "nic0" + delete_rule = "ON_PERMANENT_INSTANCE_DELETION" + is_external = var.enable_public_ips + }] +} diff --git a/community/modules/scheduler/htcondor-access-point/outputs.tf b/community/modules/scheduler/htcondor-access-point/outputs.tf new file mode 100644 index 0000000000..9512cd8028 --- /dev/null +++ b/community/modules/scheduler/htcondor-access-point/outputs.tf @@ -0,0 +1,20 @@ +/** + * Copyright 2023 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +output "list_instances_command" { + description = "Command to list Access Points provisioned by this module" + value = local.list_instances_command +} diff --git a/community/modules/scheduler/htcondor-access-point/templates/condor_config.tftpl b/community/modules/scheduler/htcondor-access-point/templates/condor_config.tftpl new file mode 100644 index 0000000000..db7144e0f3 --- /dev/null +++ b/community/modules/scheduler/htcondor-access-point/templates/condor_config.tftpl @@ -0,0 +1,55 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# this file is managed by the HPC Toolkit; do not edit it manually +# override settings with a higher priority (last lexically) named file +# https://htcondor.readthedocs.io/en/latest/admin-manual/introduction-to-configuration.html?#ordered-evaluation-to-set-the-configuration + +use role:${htcondor_role} +CONDOR_HOST = ${join(",", central_manager_ips)} + +SPOOL = ${spool_dir} +SCHEDD_INTERVAL = 30 +TRUST_UID_DOMAIN = True +SUBMIT_ATTRS = RunAsOwner +RunAsOwner = True + +# When a job matches to a machine, add machine attributes to the job for +# condor_history (e.g. 
VM Instance ID) +use feature:JobsHaveInstanceIDs +SYSTEM_JOB_MACHINE_ATTRS = $(SYSTEM_JOB_MACHINE_ATTRS) \ + CloudVMType CloudZone CloudInterruptible +SYSTEM_JOB_MACHINE_ATTRS_HISTORY_LENGTH = 10 + +# Add Cloud attributes to SchedD ClassAd +use feature:ScheddCronOneShot(cloud, $(LIBEXEC)/common-cloud-attributes-google.py) +SCHEDD_CRON_cloud_PREFIX = Cloud + +# the sequence of job transforms and submit requirements below set +# a default job attribute RequireSpot to False but allow the user to +# specify *only* a boolean value with +RequireSpot = True in their job +# submit file; the requirements of the job are transformed to filter +# on +RequireSpot unless job has explicit CloudInterruptible requirements +JOB_TRANSFORM_NAMES = SPOT_DEFAULT, SPOT_REQS +JOB_TRANSFORM_SPOT_DEFAULT @=end + DEFAULT RequireSpot False +@end +# Unless explicit, set CloudInterruptible requirements to job RequireSpot attribute +JOB_TRANSFORM_SPOT_REQS @=end + REQUIREMENTS ! unresolved(Requirements, "^CloudInterruptible$") + SET Requirements $(MY.Requirements) && (CloudInterruptible is My.RequireSpot) +@end +SUBMIT_REQUIREMENT_NAMES = REQSPOT +SUBMIT_REQUIREMENT_REQSPOT = isBoolean(RequireSpot) +SUBMIT_REQUIREMENT_REQSPOT_REASON = "Jobs must set +RequireSpot to either True or False" diff --git a/community/modules/scheduler/htcondor-access-point/variables.tf b/community/modules/scheduler/htcondor-access-point/variables.tf new file mode 100644 index 0000000000..fed1f47d4c --- /dev/null +++ b/community/modules/scheduler/htcondor-access-point/variables.tf @@ -0,0 +1,151 @@ +/** + * Copyright 2023 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +variable "project_id" { + description = "Project in which HTCondor pool will be created" + type = string +} + +variable "deployment_name" { + description = "HPC Toolkit deployment name. HTCondor cloud resource names will include this value." + type = string +} + +variable "labels" { + description = "Labels to add to resources. List key, value pairs." + type = map(string) +} + +variable "region" { + description = "Default region for creating resources" + type = string +} + +variable "network_self_link" { + description = "The self link of the network in which the HTCondor central manager will be created." + type = string + default = null +} + +variable "access_point_service_account_email" { + description = "Service account for access point (e-mail format)" + type = string +} + +variable "service_account_scopes" { + description = "Scopes by which to limit service account attached to central manager." 
+ type = set(string) + default = [ + "https://www.googleapis.com/auth/cloud-platform", + ] +} + +variable "network_storage" { + description = "An array of network attached storage mounts to be configured" + type = list(object({ + server_ip = string, + remote_mount = string, + local_mount = string, + fs_type = string, + mount_options = string, + client_install_runner = map(string) + mount_runner = map(string) + })) + default = [] +} + +variable "disk_size_gb" { + description = "Boot disk size in GB" + type = number + default = null +} + +variable "metadata" { + description = "Metadata to add to HTCondor central managers" + type = map(string) + default = {} +} + +variable "enable_oslogin" { + description = "Enable or Disable OS Login with \"ENABLE\" or \"DISABLE\". Set to \"INHERIT\" to inherit project OS Login setting." + type = string + default = "ENABLE" + nullable = false + validation { + condition = contains(["ENABLE", "DISABLE", "INHERIT"], var.enable_oslogin) + error_message = "Allowed string values for var.enable_oslogin are \"ENABLE\", \"DISABLE\", or \"INHERIT\"." + } +} + +variable "subnetwork_self_link" { + description = "The self link of the subnetwork in which the HTCondor central manager will be created." + type = string + default = null +} + +variable "enable_high_availability" { + description = "Provision HTCondor access point in high availability mode" + type = bool + default = false +} + +variable "instance_image" { + description = "Custom VM image with HTCondor and Toolkit support installed." + type = object({ + family = string, + project = string + }) +} + +variable "machine_type" { + description = "Machine type to use for HTCondor central managers" + type = string + default = "c2-standard-4" +} + +variable "access_point_runner" { + description = "A list of Toolkit runners for configuring an HTCondor access point" + type = list(map(string)) + default = [] +} + +variable "autoscaler_runner" { + description = "A list of Toolkit runners for configuring autoscaling daemons" + type = list(map(string)) + default = [] +} + +variable "spool_parent_dir" { + description = "HTCondor access point configuration SPOOL will be set to subdirectory named \"spool\"" + type = string + default = "/var/lib/condor" +} + +variable "central_manager_ips" { + description = "List of IP addresses of HTCondor Central Managers" + type = list(string) +} + +variable "htcondor_bucket_name" { + description = "Name of HTCondor configuration bucket" + type = string +} + +variable "enable_public_ips" { + description = "Enable Public IPs on the access points" + type = bool + default = false +} diff --git a/community/modules/scheduler/htcondor-access-point/versions.tf b/community/modules/scheduler/htcondor-access-point/versions.tf new file mode 100644 index 0000000000..839d2ffcad --- /dev/null +++ b/community/modules/scheduler/htcondor-access-point/versions.tf @@ -0,0 +1,29 @@ +/** + * Copyright 2023 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +terraform { + required_providers { + google = { + source = "hashicorp/google" + version = ">= 3.83" + } + } + provider_meta "google" { + module_name = "blueprints/terraform/hpc-toolkit:htcondor-access-point/v1.20.0" + } + + required_version = ">= 0.13.0" +} diff --git a/community/modules/scheduler/htcondor-base/README.md b/community/modules/scheduler/htcondor-base/README.md index a3e74ae8e9..c429207099 100644 --- a/community/modules/scheduler/htcondor-base/README.md +++ b/community/modules/scheduler/htcondor-base/README.md @@ -223,7 +223,6 @@ limitations under the License. | Name | Type | |------|------| -| [google_storage_bucket_object.ap_config](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/storage_bucket_object) | resource | | [google_storage_bucket_object.cm_config](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/storage_bucket_object) | resource | | [google_storage_bucket_object.execute_config](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/storage_bucket_object) | resource | | [google_compute_subnetwork.htcondor](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_subnetwork) | data source | @@ -237,24 +236,21 @@ limitations under the License. | [central\_manager\_roles](#input\_central\_manager\_roles) | Project-wide roles for HTCondor Central Manager service account | `list(string)` |
[
"roles/monitoring.metricWriter",
"roles/logging.logWriter",
"roles/storage.objectViewer"
]
| no | | [deployment\_name](#input\_deployment\_name) | HPC Toolkit deployment name. HTCondor cloud resource names will include this value. | `string` | n/a | yes | | [execute\_point\_roles](#input\_execute\_point\_roles) | Project-wide roles for HTCondor Execute Point service account | `list(string)` |
[
"roles/monitoring.metricWriter",
"roles/logging.logWriter",
"roles/storage.objectViewer"
]
| no | -| [job\_queue\_high\_availability](#input\_job\_queue\_high\_availability) | Provision HTCondor access points in high availability mode (experimental: see README) | `bool` | `false` | no | | [labels](#input\_labels) | Labels to add to resources. List key, value pairs. | `map(string)` | n/a | yes | | [project\_id](#input\_project\_id) | Project in which HTCondor pool will be created | `string` | n/a | yes | | [region](#input\_region) | Default region for creating resources | `string` | n/a | yes | -| [spool\_parent\_dir](#input\_spool\_parent\_dir) | HTCondor access point configuration SPOOL will be set to subdirectory named "spool" | `string` | `"/var/lib/condor"` | no | | [subnetwork\_self\_link](#input\_subnetwork\_self\_link) | The self link of the subnetwork in which Central Managers will be placed. | `string` | n/a | yes | ## Outputs | Name | Description | |------|-------------| -| [access\_point\_runner](#output\_access\_point\_runner) | Toolkit Runner to configure an HTCondor Access Point | | [access\_point\_service\_account\_email](#output\_access\_point\_service\_account\_email) | HTCondor Access Point Service Account (e-mail format) | -| [central\_manager\_internal\_ip](#output\_central\_manager\_internal\_ip) | Reserved internal IP address for use by Central Manager | +| [central\_manager\_ips](#output\_central\_manager\_ips) | Reserved internal IP address for use by Central Manager | | [central\_manager\_runner](#output\_central\_manager\_runner) | Toolkit Runner to configure an HTCondor Central Manager | -| [central\_manager\_secondary\_internal\_ip](#output\_central\_manager\_secondary\_internal\_ip) | Reserved internal IP address for use by failover Central Manager | | [central\_manager\_service\_account\_email](#output\_central\_manager\_service\_account\_email) | HTCondor Central Manager Service Account (e-mail format) | | [execute\_point\_runner](#output\_execute\_point\_runner) | Toolkit Runner to configure an HTCondor Execute Point | | [execute\_point\_service\_account\_email](#output\_execute\_point\_service\_account\_email) | HTCondor Execute Point Service Account (e-mail format) | +| [htcondor\_bucket\_name](#output\_htcondor\_bucket\_name) | Name of the HTCondor configuration bucket | | [windows\_startup\_ps1](#output\_windows\_startup\_ps1) | Windows PowerShell script to update HTCondor configuration file | diff --git a/community/modules/scheduler/htcondor-base/files/htcondor_configure.yml b/community/modules/scheduler/htcondor-base/files/htcondor_configure.yml index cf75113807..c7477ca67d 100644 --- a/community/modules/scheduler/htcondor-base/files/htcondor_configure.yml +++ b/community/modules/scheduler/htcondor-base/files/htcondor_configure.yml @@ -21,8 +21,6 @@ spool_dir: /var/lib/condor/spool condor_config_root: /etc/condor ghpc_config_file: 50-ghpc-managed - schedd_ha_config_file: 51-ghpc-schedd-high-availability - execute_config_file: 02-execute tasks: - name: Ensure necessary variables are set ansible.builtin.assert: @@ -57,66 +55,8 @@ fi args: executable: /bin/bash - - name: Configure HTCondor SchedD - when: htcondor_role == 'get_htcondor_submit' - block: - - name: Setup Spool directory - ansible.builtin.file: - path: "{{ spool_dir }}" - state: directory - owner: condor - group: condor - mode: 0755 - - name: Enable SchedD high availability - when: job_queue_ha | bool - block: - - name: Set SchedD HA configuration (requires restart) - ansible.builtin.copy: - dest: "{{ condor_config_root }}/config.d/{{ schedd_ha_config_file }}" - mode: 0644 - 
content: | - MASTER_HA_LIST=SCHEDD - HA_LOCK_URL=file:{{ spool_dir }} - VALID_SPOOL_FILES=$(VALID_SPOOL_FILES), SCHEDD.lock - HA_POLL_PERIOD=30 - SCHEDD_NAME=had-schedd@ - notify: - - Restart HTCondor - # although HTCondor is guaranteed to start after mounting remote - # filesystems is *attempted*, it does not guarantee successful mounts; - # this additional SystemD setting will refuse to start HTCondor if the - # spool shared filesystem has not been mounted - - name: Create SystemD override directory for HTCondor - ansible.builtin.file: - path: /etc/systemd/system/condor.service.d - state: directory - owner: root - group: root - mode: 0755 - - name: Ensure HTCondor starts after shared filesystem is mounted - ansible.builtin.copy: - dest: /etc/systemd/system/condor.service.d/mount-spool.conf - mode: 0644 - content: | - [Unit] - RequiresMountsFor={{ spool_dir }} - notify: - - Reload SystemD - - name: Disable SchedD high availability - when: not job_queue_ha | bool - block: - - name: Remove SchedD HA configuration file - ansible.builtin.file: - path: "{{ condor_config_root }}/config.d/{{ schedd_ha_config_file }}" - state: absent - notify: - - Restart HTCondor - - name: Remove HTCondor SystemD override - ansible.builtin.file: - path: /etc/systemd/system/condor.service.d/mount-spool.conf - state: absent - notify: - - Reload SystemD + notify: + - Reload HTCondor handlers: - name: Reload SystemD ansible.builtin.systemd: diff --git a/community/modules/scheduler/htcondor-base/main.tf b/community/modules/scheduler/htcondor-base/main.tf index fa8cdcd170..65af6acf9b 100644 --- a/community/modules/scheduler/htcondor-base/main.tf +++ b/community/modules/scheduler/htcondor-base/main.tf @@ -33,19 +33,11 @@ locals { cm_config = templatefile("${path.module}/templates/condor_config.tftpl", { htcondor_role = "get_htcondor_central_manager", central_manager_ips = module.address.addresses, - spool_dir = "${var.spool_parent_dir}/spool", }) execute_config = templatefile("${path.module}/templates/condor_config.tftpl", { htcondor_role = "get_htcondor_execute", central_manager_ips = module.address.addresses, - spool_dir = "${var.spool_parent_dir}/spool", - }) - - ap_config = templatefile("${path.module}/templates/condor_config.tftpl", { - htcondor_role = "get_htcondor_submit", - central_manager_ips = module.address.addresses, - spool_dir = "${var.spool_parent_dir}/spool", }) cm_object = "gs://${module.htcondor_bucket.name}/${google_storage_bucket_object.cm_config.output_name}" @@ -59,19 +51,6 @@ locals { ]) } - ap_object = "gs://${module.htcondor_bucket.name}/${google_storage_bucket_object.ap_config.output_name}" - runner_access = { - "type" = "ansible-local" - "content" = file("${path.module}/files/htcondor_configure.yml") - "destination" = "htcondor_configure.yml" - "args" = join(" ", [ - "-e htcondor_role=get_htcondor_submit", - "-e config_object=${local.ap_object}", - "-e job_queue_ha=${var.job_queue_high_availability}", - "-e spool_dir=${var.spool_parent_dir}/spool", - ]) - } - execute_object = "gs://${module.htcondor_bucket.name}/${google_storage_bucket_object.execute_config.output_name}" runner_execute = { "type" = "ansible-local" @@ -112,23 +91,17 @@ module "htcondor_bucket" { } resource "google_storage_bucket_object" "cm_config" { - name = "${var.deployment_name}-cm-config" + name = "${var.deployment_name}-cm-config-${substr(md5(local.cm_config), 0, 4)}" content = local.cm_config bucket = module.htcondor_bucket.name } resource "google_storage_bucket_object" "execute_config" { - name = 
"${var.deployment_name}-execute-config" + name = "${var.deployment_name}-execute-config-${substr(md5(local.execute_config), 0, 4)}" content = local.execute_config bucket = module.htcondor_bucket.name } -resource "google_storage_bucket_object" "ap_config" { - name = "${var.deployment_name}-ap-config" - content = local.ap_config - bucket = module.htcondor_bucket.name -} - module "access_point_service_account" { source = "terraform-google-modules/service-accounts/google" version = "~> 4.2" diff --git a/community/modules/scheduler/htcondor-base/outputs.tf b/community/modules/scheduler/htcondor-base/outputs.tf index a71c4456f7..025749e879 100644 --- a/community/modules/scheduler/htcondor-base/outputs.tf +++ b/community/modules/scheduler/htcondor-base/outputs.tf @@ -43,27 +43,22 @@ output "central_manager_runner" { value = local.runner_cm } -output "access_point_runner" { - description = "Toolkit Runner to configure an HTCondor Access Point" - value = local.runner_access -} - output "execute_point_runner" { description = "Toolkit Runner to configure an HTCondor Execute Point" value = local.runner_execute } -output "central_manager_internal_ip" { +output "central_manager_ips" { description = "Reserved internal IP address for use by Central Manager" - value = try(module.address.addresses[0], null) -} - -output "central_manager_secondary_internal_ip" { - description = "Reserved internal IP address for use by failover Central Manager" - value = try(module.address.addresses[1], null) + value = module.address.addresses } output "windows_startup_ps1" { description = "Windows PowerShell script to update HTCondor configuration file" value = local.windows_startup_ps1 } + +output "htcondor_bucket_name" { + description = "Name of the HTCondor configuration bucket" + value = module.htcondor_bucket.name +} diff --git a/community/modules/scheduler/htcondor-base/templates/condor_config.tftpl b/community/modules/scheduler/htcondor-base/templates/condor_config.tftpl index a7c6b9fb36..fd3926f6f7 100644 --- a/community/modules/scheduler/htcondor-base/templates/condor_config.tftpl +++ b/community/modules/scheduler/htcondor-base/templates/condor_config.tftpl @@ -49,44 +49,6 @@ HAD_USE_REPLICATION = True MASTER_HAD_BACKOFF_CONSTANT = 360 %{ endif ~} -%{ if htcondor_role == "get_htcondor_submit" ~} -# SchedD configuration settings -SPOOL = ${spool_dir} -SCHEDD_INTERVAL = 30 -TRUST_UID_DOMAIN = True -SUBMIT_ATTRS = RunAsOwner -RunAsOwner = True - -# When a job matches to a machine, add machine attributes to the job for -# condor_history (e.g. VM Instance ID) -use feature:JobsHaveInstanceIDs -SYSTEM_JOB_MACHINE_ATTRS = $(SYSTEM_JOB_MACHINE_ATTRS) \ - CloudVMType CloudZone CloudInterruptible -SYSTEM_JOB_MACHINE_ATTRS_HISTORY_LENGTH = 10 - -# Add Cloud attributes to SchedD ClassAd -use feature:ScheddCronOneShot(cloud, $(LIBEXEC)/common-cloud-attributes-google.py) -SCHEDD_CRON_cloud_PREFIX = Cloud - -# the sequence of job transforms and submit requirements below set -# a default job attribute RequireSpot to False but allow the user to -# specify *only* a boolean value with +RequireSpot = True in their job -# submit file; the requirements of the job are transformed to filter -# on +RequireSpot unless job has explicit CloudInterruptible requirements -JOB_TRANSFORM_NAMES = SPOT_DEFAULT, SPOT_REQS -JOB_TRANSFORM_SPOT_DEFAULT @=end - DEFAULT RequireSpot False -@end -# Unless explicit, set CloudInterruptible requirements to job RequireSpot attribute -JOB_TRANSFORM_SPOT_REQS @=end - REQUIREMENTS ! 
unresolved(Requirements, "^CloudInterruptible$") - SET Requirements $(MY.Requirements) && (CloudInterruptible is My.RequireSpot) -@end -SUBMIT_REQUIREMENT_NAMES = REQSPOT -SUBMIT_REQUIREMENT_REQSPOT = isBoolean(RequireSpot) -SUBMIT_REQUIREMENT_REQSPOT_REASON = "Jobs must set +RequireSpot to either True or False" -%{ endif ~} - %{ if htcondor_role == "get_htcondor_execute" ~} # StartD configuration settings use feature:PartitionableSlot diff --git a/community/modules/scheduler/htcondor-base/variables.tf b/community/modules/scheduler/htcondor-base/variables.tf index 695b3feac0..823aa043fb 100644 --- a/community/modules/scheduler/htcondor-base/variables.tf +++ b/community/modules/scheduler/htcondor-base/variables.tf @@ -75,15 +75,3 @@ variable "central_manager_high_availability" { type = bool default = false } - -variable "job_queue_high_availability" { - description = "Provision HTCondor access points in high availability mode (experimental: see README)" - type = bool - default = false -} - -variable "spool_parent_dir" { - description = "HTCondor access point configuration SPOOL will be set to subdirectory named \"spool\"" - type = string - default = "/var/lib/condor" -} diff --git a/modules/README.md b/modules/README.md index 454a7886cd..f4ea466766 100644 --- a/modules/README.md +++ b/modules/README.md @@ -158,6 +158,9 @@ Modules that are still in development and less stable are labeled with the * **[htcondor-pool-secrets]** ![community-badge] ![experimental-badge] : Creates and manages access to the secrets necessary for secure operation of an HTCondor pool. +* **[htcondor-access-point]** ![community-badge] ![experimental-badge] : Creates + a regional instance group managing a highly available HTCondor access point + (login node). * **[pbspro-client]** ![community-badge] ![experimental-badge] : Creates a client host for submitting jobs to a PBS Professional cluster. 
* **[pbspro-server]** ![community-badge] ![experimental-badge] : Creates @@ -170,7 +173,9 @@ Modules that are still in development and less stable are labeled with the [batch-job-template]: ../modules/scheduler/batch-job-template/README.md [batch-login-node]: ../modules/scheduler/batch-login-node/README.md [gke-cluster]: ../community/modules/scheduler/gke-cluster/README.md -[htcondor-base]: ../community/modules/scheduler/htcondor-configure/README.md +[htcondor-base]: ../community/modules/scheduler/htcondor-base/README.md +[htcondor-pool-secrets]: ../community/modules/scheduler/htcondor-pool-secrets/README.md +[htcondor-access-point]: ../community/modules/scheduler/htcondor-access-point/README.md [schedmd-slurm-gcp-v5-controller]: ../community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md [schedmd-slurm-gcp-v5-login]: ../community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md [schedmd-slurm-gcp-v5-hybrid]: ../community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md From 232943e0340592333aa0c0d94bae98cea1d4440e Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Wed, 12 Jul 2023 16:14:54 -0500 Subject: [PATCH 070/144] Fix HTCondor integration test to work with new access point wrapper --- .../ansible_playbooks/htcondor-integration-test.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/cloud-build/daily-tests/ansible_playbooks/htcondor-integration-test.yml b/tools/cloud-build/daily-tests/ansible_playbooks/htcondor-integration-test.yml index e678b025a2..8ff8fd2408 100644 --- a/tools/cloud-build/daily-tests/ansible_playbooks/htcondor-integration-test.yml +++ b/tools/cloud-build/daily-tests/ansible_playbooks/htcondor-integration-test.yml @@ -52,7 +52,9 @@ executable: /bin/bash ansible.builtin.shell: | set -e -o pipefail - terraform output -json external_ip_htcondor_access | jq -r '.[0]' + sleep 60 + gcloud compute instances list --filter="labels.ghpc_deployment={{ deployment_name }} AND labels.ghpc_module=htcondor-access-point" \ + --format='csv[no-heading](networkInterfaces.accessConfigs[0].natIP)' --limit 1 - name: Add Login node as host ansible.builtin.add_host: hostname: "{{ access_ip.stdout }}" From 3a5ac520effe34ef9fb8f861a3e125e3ea8d11cf Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Wed, 12 Jul 2023 18:24:36 -0500 Subject: [PATCH 071/144] Address feedback from #1571 --- .../scheduler/htcondor-access-point/main.tf | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/community/modules/scheduler/htcondor-access-point/main.tf b/community/modules/scheduler/htcondor-access-point/main.tf index 13e28e13e6..42a780ab7f 100644 --- a/community/modules/scheduler/htcondor-access-point/main.tf +++ b/community/modules/scheduler/htcondor-access-point/main.tf @@ -54,7 +54,6 @@ locals { var.autoscaler_runner, [local.example_runner] ) - disk_size_gb = max(var.disk_size_gb, data.google_compute_image.htcondor.disk_size_gb) ap_config = templatefile("${path.module}/templates/condor_config.tftpl", { htcondor_role = "get_htcondor_submit", @@ -64,10 +63,10 @@ locals { ap_object = "gs://${var.htcondor_bucket_name}/${google_storage_bucket_object.ap_config.output_name}" schedd_runner = { - "type" = "ansible-local" - "content" = file("${path.module}/files/htcondor_configure.yml") - "destination" = "htcondor_configure.yml" - "args" = join(" ", [ + type = "ansible-local" + content = file("${path.module}/files/htcondor_configure.yml") + destination = "htcondor_configure.yml" + args = join(" ", [ "-e 
htcondor_role=get_htcondor_submit", "-e config_object=${local.ap_object}", "-e job_queue_ha=${var.enable_high_availability}", @@ -81,6 +80,13 @@ locals { data "google_compute_image" "htcondor" { family = var.instance_image.family project = var.instance_image.project + + lifecycle { + postcondition { + condition = self.disk_size_gb <= var.disk_size_gb + error_message = "var.disk_size_gb must be set to at least the size of the image (${self.disk_size_gb})" + } + } } data "google_compute_zones" "available" { @@ -124,7 +130,7 @@ module "access_point_instance_template" { labels = local.labels machine_type = var.machine_type - disk_size_gb = local.disk_size_gb + disk_size_gb = var.disk_size_gb preemptible = false startup_script = module.startup_script.startup_script metadata = local.metadata From ae3afd1841475211afbb70260462c9bb93b9763d Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Fri, 14 Jul 2023 12:02:10 -0500 Subject: [PATCH 072/144] Improve wrapper for HTCondor execute points Refactor existing modules to wrap usage of the startup-script module and increase adoption of `use` keyword in HTCondor blueprints. --- community/examples/htc-htcondor.yaml | 23 ++---- .../compute/htcondor-execute-point/README.md | 9 ++- .../files/htcondor_configure.yml | 75 +++++++++++++++++++ .../compute/htcondor-execute-point/main.tf | 66 ++++++++++++++-- .../templates/condor_config.tftpl | 28 +++++++ .../download-condor-config.ps1.tftpl | 0 .../htcondor-execute-point/variables.tf | 41 ++++++---- .../modules/scheduler/htcondor-base/README.md | 3 - .../files/htcondor_configure.yml | 7 -- .../modules/scheduler/htcondor-base/main.tf | 28 ------- .../scheduler/htcondor-base/outputs.tf | 10 --- .../templates/condor_config.tftpl | 10 --- .../files/htcondor_secrets.yml | 8 +- 13 files changed, 205 insertions(+), 103 deletions(-) create mode 100644 community/modules/compute/htcondor-execute-point/files/htcondor_configure.yml create mode 100644 community/modules/compute/htcondor-execute-point/templates/condor_config.tftpl rename community/modules/{scheduler/htcondor-base => compute/htcondor-execute-point}/templates/download-condor-config.ps1.tftpl (100%) diff --git a/community/examples/htc-htcondor.yaml b/community/examples/htc-htcondor.yaml index 904c31f552..1a695ea898 100644 --- a/community/examples/htc-htcondor.yaml +++ b/community/examples/htc-htcondor.yaml @@ -104,13 +104,6 @@ deployment_groups: outputs: - internal_ip - - id: htcondor_startup_execute_point - source: modules/scripts/startup-script - settings: - runners: - - $(htcondor_secrets.execute_point_runner) - - $(htcondor_base.execute_point_runner) - # the HTCondor modules support up to 2 execute points per blueprint # if using 1, it may use Spot or On-demand pricing # if using 2, one must use Spot and the other must use On-demand (default) @@ -118,31 +111,25 @@ deployment_groups: source: community/modules/compute/htcondor-execute-point use: - network1 - - htcondor_startup_execute_point + - htcondor_secrets # must be ordered before htcondor_base + - htcondor_base settings: instance_image: project: $(vars.project_id) family: $(vars.new_image_family) min_idle: 2 - service_account: - email: $(htcondor_base.execute_point_service_account_email) - scopes: - - cloud-platform - id: htcondor_execute_point_spot source: community/modules/compute/htcondor-execute-point use: - network1 - - htcondor_startup_execute_point + - htcondor_secrets # must be ordered before htcondor_base + - htcondor_base settings: - spot: true instance_image: project: $(vars.project_id) family: 
$(vars.new_image_family) - service_account: - email: $(htcondor_base.execute_point_service_account_email) - scopes: - - cloud-platform + spot: true - id: htcondor_access source: community/modules/scheduler/htcondor-access-point diff --git a/community/modules/compute/htcondor-execute-point/README.md b/community/modules/compute/htcondor-execute-point/README.md index af3cd3d26b..410f85d52b 100644 --- a/community/modules/compute/htcondor-execute-point/README.md +++ b/community/modules/compute/htcondor-execute-point/README.md @@ -173,11 +173,13 @@ limitations under the License. |------|--------|---------| | [execute\_point\_instance\_template](#module\_execute\_point\_instance\_template) | terraform-google-modules/vm/google//modules/instance_template | ~> 8.0 | | [mig](#module\_mig) | terraform-google-modules/vm/google//modules/mig | ~> 8.0 | +| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.20.0&depth=1 | ## Resources | Name | Type | |------|------| +| [google_storage_bucket_object.execute_config](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/storage_bucket_object) | resource | | [google_compute_image.htcondor](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_image) | data source | | [google_compute_zones.available](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_zones) | data source | @@ -185,9 +187,13 @@ limitations under the License. | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| +| [central\_manager\_ips](#input\_central\_manager\_ips) | List of IP addresses of HTCondor Central Managers | `list(string)` | n/a | yes | | [deployment\_name](#input\_deployment\_name) | HPC Toolkit deployment name. HTCondor cloud resource names will include this value. | `string` | n/a | yes | | [disk\_size\_gb](#input\_disk\_size\_gb) | Boot disk size in GB | `number` | `100` | no | | [enable\_oslogin](#input\_enable\_oslogin) | Enable or Disable OS Login with "ENABLE" or "DISABLE". Set to "INHERIT" to inherit project OS Login setting. | `string` | `"ENABLE"` | no | +| [execute\_point\_runner](#input\_execute\_point\_runner) | A list of Toolkit runners for configuring an HTCondor execute point | `list(map(string))` | `[]` | no | +| [execute\_point\_service\_account\_email](#input\_execute\_point\_service\_account\_email) | Service account for HTCondor execute point (e-mail format) | `string` | n/a | yes | +| [htcondor\_bucket\_name](#input\_htcondor\_bucket\_name) | Name of HTCondor configuration bucket | `string` | n/a | yes | | [instance\_image](#input\_instance\_image) | HTCondor execute point VM image |
object({
family = string,
project = string
})
|
{
"family": "hpc-rocky-linux-8",
"project": "cloud-hpc-image-public"
}
| no |
| [labels](#input\_labels) | Labels to add to HTCondor execute points | `map(string)` | n/a | yes |
| [machine\_type](#input\_machine\_type) | Machine type to use for HTCondor execute points | `string` | `"n2-standard-4"` | no |
@@ -198,9 +204,8 @@
| [network\_storage](#input\_network\_storage) | An array of network attached storage mounts to be configured |
list(object({
server_ip = string,
remote_mount = string,
local_mount = string,
fs_type = string,
mount_options = string,
client_install_runner = map(string)
mount_runner = map(string)
}))
| `[]` | no | | [project\_id](#input\_project\_id) | Project in which the HTCondor execute points will be created | `string` | n/a | yes | | [region](#input\_region) | The region in which HTCondor execute points will be created | `string` | n/a | yes | -| [service\_account](#input\_service\_account) | Service account to attach to HTCondor execute points |
object({
email = string,
scopes = set(string)
})
|
{
"email": null,
"scopes": [
"https://www.googleapis.com/auth/cloud-platform"
]
}
| no |
+| [service\_account\_scopes](#input\_service\_account\_scopes) | Scopes by which to limit service account attached to execute points. | `set(string)` |
[
"https://www.googleapis.com/auth/cloud-platform"
]
| no | | [spot](#input\_spot) | Provision VMs using discounted Spot pricing, allowing for preemption | `bool` | `false` | no | -| [startup\_script](#input\_startup\_script) | Startup script to run at boot-time for Linux HTCondor execute points | `string` | `null` | no | | [subnetwork\_self\_link](#input\_subnetwork\_self\_link) | The self link of the subnetwork HTCondor execute points will join | `string` | `null` | no | | [target\_size](#input\_target\_size) | Initial size of the HTCondor execute point pool; set to null (default) to avoid Terraform management of size. | `number` | `null` | no | | [windows\_startup\_ps1](#input\_windows\_startup\_ps1) | Startup script to run at boot-time for Windows-based HTCondor execute points | `list(string)` | `[]` | no | diff --git a/community/modules/compute/htcondor-execute-point/files/htcondor_configure.yml b/community/modules/compute/htcondor-execute-point/files/htcondor_configure.yml new file mode 100644 index 0000000000..44fb1e4833 --- /dev/null +++ b/community/modules/compute/htcondor-execute-point/files/htcondor_configure.yml @@ -0,0 +1,75 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--- +- name: Configure HTCondor Role + hosts: localhost + become: true + vars: + job_queue_ha: false + spool_dir: /var/lib/condor/spool + condor_config_root: /etc/condor + ghpc_config_file: 50-ghpc-managed + tasks: + - name: Ensure necessary variables are set + ansible.builtin.assert: + that: + - htcondor_role is defined + - config_object is defined + - name: Remove default HTCondor configuration + ansible.builtin.file: + path: "{{ condor_config_root }}/config.d/00-htcondor-9.0.config" + state: absent + notify: + - Reload HTCondor + - name: Create Toolkit configuration file + register: config_update + changed_when: config_update.rc == 137 + failed_when: config_update.rc != 0 and config_update.rc != 137 + ansible.builtin.shell: | + set -e -o pipefail + REMOTE_HASH=$(gcloud --format="value(md5_hash)" storage hash {{ config_object }}) + + CONFIG_FILE="{{ condor_config_root }}/config.d/{{ ghpc_config_file }}" + if [ -f "${CONFIG_FILE}" ]; then + LOCAL_HASH=$(gcloud --format="value(md5_hash)" storage hash "${CONFIG_FILE}") + else + LOCAL_HASH="INVALID-HASH" + fi + + if [ "${REMOTE_HASH}" != "${LOCAL_HASH}" ]; then + gcloud storage cp {{ config_object }} "${CONFIG_FILE}" + chmod 0644 "${CONFIG_FILE}" + exit 137 + fi + args: + executable: /bin/bash + notify: + - Reload HTCondor + handlers: + - name: Reload HTCondor + ansible.builtin.service: + name: condor + state: reloaded + post_tasks: + - name: Start HTCondor + ansible.builtin.service: + name: condor + state: started + enabled: true + - name: Inform users + changed_when: false + ansible.builtin.shell: | + set -e -o pipefail + wall "******* HTCondor system configuration complete ********" diff --git a/community/modules/compute/htcondor-execute-point/main.tf b/community/modules/compute/htcondor-execute-point/main.tf index 0bdb1e9a58..b8fc82f8f1 100644 --- 
a/community/modules/compute/htcondor-execute-point/main.tf +++ b/community/modules/compute/htcondor-execute-point/main.tf @@ -29,7 +29,7 @@ locals { } enable_oslogin = var.enable_oslogin == "INHERIT" ? {} : { enable-oslogin = lookup(local.oslogin_api_values, var.enable_oslogin, "") } - windows_startup_ps1 = join("\n\n", var.windows_startup_ps1) + windows_startup_ps1 = join("\n\n", flatten([var.windows_startup_ps1, local.execute_config_windows_startup_ps1])) is_windows_image = anytrue([for l in data.google_compute_image.htcondor.licenses : length(regexall("windows-cloud", l)) > 0]) windows_startup_metadata = local.is_windows_image && local.windows_startup_ps1 != "" ? { @@ -52,29 +52,79 @@ locals { ]) } + execute_config = templatefile("${path.module}/templates/condor_config.tftpl", { + htcondor_role = "get_htcondor_execute", + central_manager_ips = var.central_manager_ips + }) + + execute_object = "gs://${var.htcondor_bucket_name}/${google_storage_bucket_object.execute_config.output_name}" + execute_runner = { + type = "ansible-local" + content = file("${path.module}/files/htcondor_configure.yml") + destination = "htcondor_configure.yml" + args = join(" ", [ + "-e htcondor_role=get_htcondor_execute", + "-e config_object=${local.execute_object}", + ]) + } + + execute_config_windows_startup_ps1 = templatefile( + "${path.module}/templates/download-condor-config.ps1.tftpl", + { + config_object = local.execute_object, + } + ) + hostnames = var.spot ? "${var.deployment_name}-spot-xp" : "${var.deployment_name}-xp" } data "google_compute_image" "htcondor" { family = var.instance_image.family project = var.instance_image.project + + lifecycle { + postcondition { + condition = self.disk_size_gb <= var.disk_size_gb + error_message = "var.disk_size_gb must be set to at least the size of the image (${self.disk_size_gb})" + } + } +} + +resource "google_storage_bucket_object" "execute_config" { + name = "${var.deployment_name}-execute-config-${substr(md5(local.execute_config), 0, 4)}" + content = local.execute_config + bucket = var.htcondor_bucket_name +} + +module "startup_script" { + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.20.0&depth=1" + + project_id = var.project_id + region = var.region + labels = local.labels + deployment_name = var.deployment_name + + runners = flatten([var.execute_point_runner, local.execute_runner]) } module "execute_point_instance_template" { source = "terraform-google-modules/vm/google//modules/instance_template" version = "~> 8.0" - name_prefix = local.hostnames - project_id = var.project_id - network = var.network_self_link - subnetwork = var.subnetwork_self_link - service_account = var.service_account - labels = local.labels + name_prefix = local.hostnames + project_id = var.project_id + network = var.network_self_link + subnetwork = var.subnetwork_self_link + service_account = { + email = var.execute_point_service_account_email + scopes = var.service_account_scopes + } + labels = local.labels machine_type = var.machine_type disk_size_gb = var.disk_size_gb preemptible = var.spot - startup_script = local.is_windows_image ? null : var.startup_script + startup_script = local.is_windows_image ? 
null : module.startup_script.startup_script
   metadata       = local.metadata
   source_image   = data.google_compute_image.htcondor.self_link
 }
diff --git a/community/modules/compute/htcondor-execute-point/templates/condor_config.tftpl b/community/modules/compute/htcondor-execute-point/templates/condor_config.tftpl
new file mode 100644
index 0000000000..cfd801b729
--- /dev/null
+++ b/community/modules/compute/htcondor-execute-point/templates/condor_config.tftpl
@@ -0,0 +1,28 @@
+# Copyright 2023 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# this file is managed by the HPC Toolkit; do not edit it manually
+# override settings with a higher priority (last lexically) named file
+# https://htcondor.readthedocs.io/en/latest/admin-manual/introduction-to-configuration.html?#ordered-evaluation-to-set-the-configuration
+
+use role:${htcondor_role}
+CONDOR_HOST = ${join(",", central_manager_ips)}
+
+# StartD configuration settings
+use feature:PartitionableSlot
+use feature:CommonCloudAttributesGoogle("-c created-by")
+UPDATE_INTERVAL = 30
+TRUST_UID_DOMAIN = True
+STARTER_ALLOW_RUNAS_OWNER = True
+RUNBENCHMARKS = False
diff --git a/community/modules/scheduler/htcondor-base/templates/download-condor-config.ps1.tftpl b/community/modules/compute/htcondor-execute-point/templates/download-condor-config.ps1.tftpl
similarity index 100%
rename from community/modules/scheduler/htcondor-base/templates/download-condor-config.ps1.tftpl
rename to community/modules/compute/htcondor-execute-point/templates/download-condor-config.ps1.tftpl
diff --git a/community/modules/compute/htcondor-execute-point/variables.tf b/community/modules/compute/htcondor-execute-point/variables.tf
index a0413ceff5..cb97d87585 100644
--- a/community/modules/compute/htcondor-execute-point/variables.tf
+++ b/community/modules/compute/htcondor-execute-point/variables.tf
@@ -45,10 +45,10 @@ variable "machine_type" {
   default     = "n2-standard-4"
 }
 
-variable "startup_script" {
-  description = "Startup script to run at boot-time for Linux HTCondor execute points"
-  type        = string
-  default     = null
+variable "execute_point_runner" {
+  description = "A list of Toolkit runners for configuring an HTCondor execute point"
+  type        = list(map(string))
+  default     = []
 }
 
 variable "network_storage" {
@@ -77,18 +77,17 @@ variable "instance_image" {
   }
 }
 
-variable "service_account" {
-  description = "Service account to attach to HTCondor execute points"
-  type = object({
-    email  = string,
-    scopes = set(string)
-  })
-  default = {
-    email = null
-    scopes = [
-      "https://www.googleapis.com/auth/cloud-platform",
-    ]
-  }
+variable "execute_point_service_account_email" {
+  description = "Service account for HTCondor execute point (e-mail format)"
+  type        = string
+}
+
+variable "service_account_scopes" {
+  description = "Scopes by which to limit service account attached to execute points."
+ type = set(string) + default = [ + "https://www.googleapis.com/auth/cloud-platform", + ] } variable "network_self_link" { @@ -159,3 +158,13 @@ variable "windows_startup_ps1" { default = [] nullable = false } + +variable "central_manager_ips" { + description = "List of IP addresses of HTCondor Central Managers" + type = list(string) +} + +variable "htcondor_bucket_name" { + description = "Name of HTCondor configuration bucket" + type = string +} diff --git a/community/modules/scheduler/htcondor-base/README.md b/community/modules/scheduler/htcondor-base/README.md index c429207099..e61324c096 100644 --- a/community/modules/scheduler/htcondor-base/README.md +++ b/community/modules/scheduler/htcondor-base/README.md @@ -224,7 +224,6 @@ limitations under the License. | Name | Type | |------|------| | [google_storage_bucket_object.cm_config](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/storage_bucket_object) | resource | -| [google_storage_bucket_object.execute_config](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/storage_bucket_object) | resource | | [google_compute_subnetwork.htcondor](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_subnetwork) | data source | ## Inputs @@ -249,8 +248,6 @@ limitations under the License. | [central\_manager\_ips](#output\_central\_manager\_ips) | Reserved internal IP address for use by Central Manager | | [central\_manager\_runner](#output\_central\_manager\_runner) | Toolkit Runner to configure an HTCondor Central Manager | | [central\_manager\_service\_account\_email](#output\_central\_manager\_service\_account\_email) | HTCondor Central Manager Service Account (e-mail format) | -| [execute\_point\_runner](#output\_execute\_point\_runner) | Toolkit Runner to configure an HTCondor Execute Point | | [execute\_point\_service\_account\_email](#output\_execute\_point\_service\_account\_email) | HTCondor Execute Point Service Account (e-mail format) | | [htcondor\_bucket\_name](#output\_htcondor\_bucket\_name) | Name of the HTCondor configuration bucket | -| [windows\_startup\_ps1](#output\_windows\_startup\_ps1) | Windows PowerShell script to update HTCondor configuration file | diff --git a/community/modules/scheduler/htcondor-base/files/htcondor_configure.yml b/community/modules/scheduler/htcondor-base/files/htcondor_configure.yml index c7477ca67d..44fb1e4833 100644 --- a/community/modules/scheduler/htcondor-base/files/htcondor_configure.yml +++ b/community/modules/scheduler/htcondor-base/files/htcondor_configure.yml @@ -58,13 +58,6 @@ notify: - Reload HTCondor handlers: - - name: Reload SystemD - ansible.builtin.systemd: - daemon_reload: true - - name: Restart HTCondor - ansible.builtin.service: - name: condor - state: restarted - name: Reload HTCondor ansible.builtin.service: name: condor diff --git a/community/modules/scheduler/htcondor-base/main.tf b/community/modules/scheduler/htcondor-base/main.tf index 65af6acf9b..cc9bf66b9e 100644 --- a/community/modules/scheduler/htcondor-base/main.tf +++ b/community/modules/scheduler/htcondor-base/main.tf @@ -35,11 +35,6 @@ locals { central_manager_ips = module.address.addresses, }) - execute_config = templatefile("${path.module}/templates/condor_config.tftpl", { - htcondor_role = "get_htcondor_execute", - central_manager_ips = module.address.addresses, - }) - cm_object = "gs://${module.htcondor_bucket.name}/${google_storage_bucket_object.cm_config.output_name}" runner_cm = { "type" = "ansible-local" @@ -50,23 
+45,6 @@ locals { "-e config_object=${local.cm_object}", ]) } - - execute_object = "gs://${module.htcondor_bucket.name}/${google_storage_bucket_object.execute_config.output_name}" - runner_execute = { - "type" = "ansible-local" - "content" = file("${path.module}/files/htcondor_configure.yml") - "destination" = "htcondor_configure.yml" - "args" = join(" ", [ - "-e htcondor_role=get_htcondor_execute", - "-e config_object=${local.execute_object}", - ]) - } - windows_startup_ps1 = templatefile( - "${path.module}/templates/download-condor-config.ps1.tftpl", - { - config_object = local.execute_object, - } - ) } module "htcondor_bucket" { @@ -96,12 +74,6 @@ resource "google_storage_bucket_object" "cm_config" { bucket = module.htcondor_bucket.name } -resource "google_storage_bucket_object" "execute_config" { - name = "${var.deployment_name}-execute-config-${substr(md5(local.execute_config), 0, 4)}" - content = local.execute_config - bucket = module.htcondor_bucket.name -} - module "access_point_service_account" { source = "terraform-google-modules/service-accounts/google" version = "~> 4.2" diff --git a/community/modules/scheduler/htcondor-base/outputs.tf b/community/modules/scheduler/htcondor-base/outputs.tf index 025749e879..a39c1e7b4e 100644 --- a/community/modules/scheduler/htcondor-base/outputs.tf +++ b/community/modules/scheduler/htcondor-base/outputs.tf @@ -43,21 +43,11 @@ output "central_manager_runner" { value = local.runner_cm } -output "execute_point_runner" { - description = "Toolkit Runner to configure an HTCondor Execute Point" - value = local.runner_execute -} - output "central_manager_ips" { description = "Reserved internal IP address for use by Central Manager" value = module.address.addresses } -output "windows_startup_ps1" { - description = "Windows PowerShell script to update HTCondor configuration file" - value = local.windows_startup_ps1 -} - output "htcondor_bucket_name" { description = "Name of the HTCondor configuration bucket" value = module.htcondor_bucket.name diff --git a/community/modules/scheduler/htcondor-base/templates/condor_config.tftpl b/community/modules/scheduler/htcondor-base/templates/condor_config.tftpl index fd3926f6f7..d96950b4ed 100644 --- a/community/modules/scheduler/htcondor-base/templates/condor_config.tftpl +++ b/community/modules/scheduler/htcondor-base/templates/condor_config.tftpl @@ -48,13 +48,3 @@ DAEMON_LIST = $(DAEMON_LIST), HAD, REPLICATION HAD_USE_REPLICATION = True MASTER_HAD_BACKOFF_CONSTANT = 360 %{ endif ~} - -%{ if htcondor_role == "get_htcondor_execute" ~} -# StartD configuration settings -use feature:PartitionableSlot -use feature:CommonCloudAttributesGoogle("-c created-by") -UPDATE_INTERVAL = 30 -TRUST_UID_DOMAIN = True -STARTER_ALLOW_RUNAS_OWNER = True -RUNBENCHMARKS = False -%{ endif ~} diff --git a/community/modules/scheduler/htcondor-pool-secrets/files/htcondor_secrets.yml b/community/modules/scheduler/htcondor-pool-secrets/files/htcondor_secrets.yml index 39ba4095db..538c809c2a 100644 --- a/community/modules/scheduler/htcondor-pool-secrets/files/htcondor_secrets.yml +++ b/community/modules/scheduler/htcondor-pool-secrets/files/htcondor_secrets.yml @@ -53,6 +53,7 @@ args: creates: "{{ condor_config_root }}/tokens.d/condor@{{ trust_domain }}" - name: Create IDTOKEN secret for Execute Points + when: xp_idtoken_secret_id | length > 0 changed_when: true ansible.builtin.shell: | umask 0077 @@ -61,7 +62,6 @@ -authz ADVERTISE_STARTD -identity condor@{{ trust_domain }} > "$TMPFILE" gcloud secrets versions add --data-file "$TMPFILE" {{ 
xp_idtoken_secret_id }} rm -f "$TMPFILE" - when: xp_idtoken_secret_id | length > 0 - name: Configure HTCondor SchedD when: htcondor_role == 'get_htcondor_submit' block: @@ -94,3 +94,9 @@ [Service] ExecStartPre=gcloud secrets versions access latest --secret {{ xp_idtoken_secret_id }} \ --out-file {{ condor_config_root }}/tokens.d/condor@{{ trust_domain }} + notify: + - Reload SystemD + handlers: + - name: Reload SystemD + ansible.builtin.systemd: + daemon_reload: true From 6ecb056f960f3f590291595f146e0d9b1e27459f Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Fri, 14 Jul 2023 12:02:10 -0500 Subject: [PATCH 073/144] Address feedback from #1574 --- community/examples/htc-htcondor.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/community/examples/htc-htcondor.yaml b/community/examples/htc-htcondor.yaml index 1a695ea898..5dcb30adc0 100644 --- a/community/examples/htc-htcondor.yaml +++ b/community/examples/htc-htcondor.yaml @@ -111,7 +111,7 @@ deployment_groups: source: community/modules/compute/htcondor-execute-point use: - network1 - - htcondor_secrets # must be ordered before htcondor_base + - htcondor_secrets - htcondor_base settings: instance_image: @@ -123,7 +123,7 @@ deployment_groups: source: community/modules/compute/htcondor-execute-point use: - network1 - - htcondor_secrets # must be ordered before htcondor_base + - htcondor_secrets - htcondor_base settings: instance_image: From 3d91608988978e8632f1c5edbdc1d08ccbc7c3d5 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Fri, 14 Jul 2023 13:51:16 -0500 Subject: [PATCH 074/144] Fix HTCondor access point playbook --- .../htcondor-access-point/files/htcondor_configure.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/community/modules/scheduler/htcondor-access-point/files/htcondor_configure.yml b/community/modules/scheduler/htcondor-access-point/files/htcondor_configure.yml index 93f3dfe8d4..ab125e12ea 100644 --- a/community/modules/scheduler/htcondor-access-point/files/htcondor_configure.yml +++ b/community/modules/scheduler/htcondor-access-point/files/htcondor_configure.yml @@ -56,6 +56,8 @@ fi args: executable: /bin/bash + notify: + - Reload HTCondor - name: Configure HTCondor SchedD when: htcondor_role == 'get_htcondor_submit' block: From 34dbf87e6277fc64294179e35305227b61186634 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Fri, 14 Jul 2023 13:51:16 -0500 Subject: [PATCH 075/144] Add IP/name outputs to access point module --- community/examples/htc-htcondor.yaml | 3 ++- .../scheduler/htcondor-access-point/README.md | 7 ++++- .../scheduler/htcondor-access-point/main.tf | 26 ++++++++++++++++--- .../htcondor-access-point/outputs.tf | 11 +++++--- .../htcondor-access-point/versions.tf | 4 +++ 5 files changed, 43 insertions(+), 8 deletions(-) diff --git a/community/examples/htc-htcondor.yaml b/community/examples/htc-htcondor.yaml index 5dcb30adc0..ce1fa13f1c 100644 --- a/community/examples/htc-htcondor.yaml +++ b/community/examples/htc-htcondor.yaml @@ -145,4 +145,5 @@ deployment_groups: project: $(vars.project_id) family: $(vars.new_image_family) outputs: - - list_instances_command + - access_point_ips + - access_point_name diff --git a/community/modules/scheduler/htcondor-access-point/README.md b/community/modules/scheduler/htcondor-access-point/README.md index d58f633a96..dd57de5df1 100644 --- a/community/modules/scheduler/htcondor-access-point/README.md +++ b/community/modules/scheduler/htcondor-access-point/README.md @@ -48,12 +48,14 @@ limitations under the License. 
|------|---------| | [terraform](#requirement\_terraform) | >= 0.13.0 | | [google](#requirement\_google) | >= 3.83 | +| [time](#requirement\_time) | ~> 0.9 | ## Providers | Name | Version | |------|---------| | [google](#provider\_google) | >= 3.83 | +| [time](#provider\_time) | ~> 0.9 | ## Modules @@ -68,7 +70,9 @@ limitations under the License. | Name | Type | |------|------| | [google_storage_bucket_object.ap_config](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/storage_bucket_object) | resource | +| [time_sleep.mig_warmup](https://registry.terraform.io/providers/hashicorp/time/latest/docs/resources/sleep) | resource | | [google_compute_image.htcondor](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_image) | data source | +| [google_compute_instance.ap](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_instance) | data source | | [google_compute_region_instance_group.ap](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_region_instance_group) | data source | | [google_compute_zones.available](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_zones) | data source | @@ -102,5 +106,6 @@ limitations under the License. | Name | Description | |------|-------------| -| [list\_instances\_command](#output\_list\_instances\_command) | Command to list Access Points provisioned by this module | +| [access\_point\_ips](#output\_access\_point\_ips) | IP addresses of the access points provisioned by this module | +| [access\_point\_name](#output\_access\_point\_name) | Name of the access point provisioned by this module | diff --git a/community/modules/scheduler/htcondor-access-point/main.tf b/community/modules/scheduler/htcondor-access-point/main.tf index 42a780ab7f..603a755f36 100644 --- a/community/modules/scheduler/htcondor-access-point/main.tf +++ b/community/modules/scheduler/htcondor-access-point/main.tf @@ -28,6 +28,7 @@ locals { enable_oslogin_metadata = var.enable_oslogin == "INHERIT" ? {} : { enable-oslogin = lookup(local.oslogin_api_values, var.enable_oslogin, "") } metadata = merge(local.network_storage_metadata, local.enable_oslogin_metadata, var.metadata) + host_count = var.enable_high_availability ? 2 : 1 name_prefix = "${var.deployment_name}-ap" example_runner = { @@ -74,7 +75,8 @@ locals { ]) } - list_instances_command = "gcloud compute instance-groups list-instances ${data.google_compute_region_instance_group.ap.name} --region ${var.region} --project ${var.project_id}" + access_point_ips = [data.google_compute_instance.ap.network_interface[0].network_ip] + access_point_name = data.google_compute_instance.ap.name } data "google_compute_image" "htcondor" { @@ -95,7 +97,17 @@ data "google_compute_zones" "available" { } data "google_compute_region_instance_group" "ap" { - self_link = module.htcondor_ap.self_link + self_link = time_sleep.mig_warmup.triggers.self_link + lifecycle { + postcondition { + condition = length(self.instances) == local.host_count + error_message = "There should be ${local.host_count} access points found" + } + } +} + +data "google_compute_instance" "ap" { + self_link = data.google_compute_region_instance_group.ap.instances[0].instance } resource "google_storage_bucket_object" "ap_config" { @@ -143,7 +155,7 @@ module "htcondor_ap" { project_id = var.project_id region = var.region - target_size = var.enable_high_availability ? 
2 : 1 + target_size = local.host_count hostname = local.name_prefix instance_template = module.access_point_instance_template.self_link @@ -182,3 +194,11 @@ module "htcondor_ap" { is_external = var.enable_public_ips }] } + +resource "time_sleep" "mig_warmup" { + create_duration = "120s" + + triggers = { + self_link = module.htcondor_ap.self_link + } +} diff --git a/community/modules/scheduler/htcondor-access-point/outputs.tf b/community/modules/scheduler/htcondor-access-point/outputs.tf index 9512cd8028..f7424c6d5d 100644 --- a/community/modules/scheduler/htcondor-access-point/outputs.tf +++ b/community/modules/scheduler/htcondor-access-point/outputs.tf @@ -14,7 +14,12 @@ * limitations under the License. */ -output "list_instances_command" { - description = "Command to list Access Points provisioned by this module" - value = local.list_instances_command +output "access_point_ips" { + description = "IP addresses of the access points provisioned by this module" + value = local.access_point_ips +} + +output "access_point_name" { + description = "Name of the access point provisioned by this module" + value = local.access_point_name } diff --git a/community/modules/scheduler/htcondor-access-point/versions.tf b/community/modules/scheduler/htcondor-access-point/versions.tf index 839d2ffcad..6c3301b210 100644 --- a/community/modules/scheduler/htcondor-access-point/versions.tf +++ b/community/modules/scheduler/htcondor-access-point/versions.tf @@ -20,6 +20,10 @@ terraform { source = "hashicorp/google" version = ">= 3.83" } + time = { + source = "hashicorp/time" + version = "~> 0.9" + } } provider_meta "google" { module_name = "blueprints/terraform/hpc-toolkit:htcondor-access-point/v1.20.0" From ce9650ad1d9da58e33757a387b38af6120c123bc Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Mon, 17 Jul 2023 18:49:05 -0500 Subject: [PATCH 076/144] Address integration test non-failures Address 2 instances where tasks in our integration tests fail but Ansible does not ultimately exit with non-0 return code. 
- end of rescue blocks must fail, otherwise they "rescue" the failure
- the always block has a sequence of "ignore_errors" to ensure that all tasks
  are attempted; this commit adds a test to fail if any of the tasks fail
---
 .../ansible_playbooks/htcondor-integration-test.yml   | 3 +++
 .../ansible_playbooks/multigroup-integration-test.yml | 4 ++++
 2 files changed, 7 insertions(+)

diff --git a/tools/cloud-build/daily-tests/ansible_playbooks/htcondor-integration-test.yml b/tools/cloud-build/daily-tests/ansible_playbooks/htcondor-integration-test.yml
index 8ff8fd2408..6badc7ba44 100644
--- a/tools/cloud-build/daily-tests/ansible_playbooks/htcondor-integration-test.yml
+++ b/tools/cloud-build/daily-tests/ansible_playbooks/htcondor-integration-test.yml
@@ -131,6 +131,9 @@
       args:
         chdir: "{{ workspace }}/{{ deployment_name }}/packer/custom-image"
         executable: /bin/bash
+    - name: Trigger Cloud Build failure (rescue blocks otherwise revert failures)
+      ansible.builtin.fail:
+        msg: "Failed while setting up test infrastructure"

 - name: Run Integration Tests
   hosts: remote_host
diff --git a/tools/cloud-build/daily-tests/ansible_playbooks/multigroup-integration-test.yml b/tools/cloud-build/daily-tests/ansible_playbooks/multigroup-integration-test.yml
index bffd923d64..af3383d517 100644
--- a/tools/cloud-build/daily-tests/ansible_playbooks/multigroup-integration-test.yml
+++ b/tools/cloud-build/daily-tests/ansible_playbooks/multigroup-integration-test.yml
@@ -51,3 +51,7 @@
       args:
         chdir: "{{ workspace }}/{{ deployment_name }}/packer/custom-image"
         executable: /bin/bash
+    - name: Trigger Cloud Build failure
+      when: ghpc_destroy.failed or image_deletion.failed
+      ansible.builtin.fail:
+        msg: "Failed while setting up test infrastructure"

From 996fadaf1e46d183214bc8b4d9a62a4e56dea149 Mon Sep 17 00:00:00 2001
From: Tom Downes
Date: Sat, 15 Jul 2023 07:27:53 -0500
Subject: [PATCH 077/144] Implement wrapper for HTCondor central manager

Refactor the existing blueprint, replacing direct use of the vm-instance
and startup-script modules with a wrapper module that adds the ability
to provision a highly available HTCondor central manager backed by a
managed instance group.
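For reference, once deployed, the instances backing the central manager's
managed instance group can be listed with a command of roughly this form (a
sketch based on the module's list_instances_command output; the <<...>> values
are placeholders to substitute):

```text
gcloud compute instance-groups list-instances <<group-name>> \
    --region <<region>> --project <<project-id>>
```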
--- community/examples/htc-htcondor.yaml | 37 +--- .../modules/scheduler/htcondor-base/README.md | 5 - .../modules/scheduler/htcondor-base/main.tf | 34 ---- .../scheduler/htcondor-base/outputs.tf | 10 -- .../scheduler/htcondor-base/variables.tf | 6 - .../htcondor-central-manager/README.md | 103 +++++++++++ .../files/htcondor_configure.yml | 5 +- .../htcondor-central-manager/main.tf | 169 ++++++++++++++++++ .../htcondor-central-manager/outputs.tf | 30 ++++ .../templates/condor_config.tftpl | 30 +--- .../htcondor-central-manager/variables.tf | 122 +++++++++++++ .../htcondor-central-manager/versions.tf | 33 ++++ 12 files changed, 469 insertions(+), 115 deletions(-) create mode 100644 community/modules/scheduler/htcondor-central-manager/README.md rename community/modules/scheduler/{htcondor-base => htcondor-central-manager}/files/htcondor_configure.yml (94%) create mode 100644 community/modules/scheduler/htcondor-central-manager/main.tf create mode 100644 community/modules/scheduler/htcondor-central-manager/outputs.tf rename community/modules/scheduler/{htcondor-base => htcondor-central-manager}/templates/condor_config.tftpl (52%) create mode 100644 community/modules/scheduler/htcondor-central-manager/variables.tf create mode 100644 community/modules/scheduler/htcondor-central-manager/versions.tf diff --git a/community/examples/htc-htcondor.yaml b/community/examples/htc-htcondor.yaml index ce1fa13f1c..28f2f84f6f 100644 --- a/community/examples/htc-htcondor.yaml +++ b/community/examples/htc-htcondor.yaml @@ -67,42 +67,18 @@ deployment_groups: use: - htcondor_base - - id: htcondor_startup_central_manager - source: modules/scripts/startup-script - settings: - runners: - - $(htcondor_secrets.central_manager_runner) - - $(htcondor_base.central_manager_runner) - - id: htcondor_cm - source: modules/compute/vm-instance + source: community/modules/scheduler/htcondor-central-manager use: - - htcondor_startup_central_manager + - network1 + - htcondor_secrets + - htcondor_base settings: - name_prefix: cm instance_image: project: $(vars.project_id) family: $(vars.new_image_family) - add_deployment_name_before_prefix: true - machine_type: c2-standard-4 - disable_public_ips: true - service_account: - email: $(htcondor_base.central_manager_service_account_email) - scopes: - - cloud-platform - network_interfaces: - - network: null - subnetwork: $(network1.subnetwork_self_link) - subnetwork_project: $(vars.project_id) - network_ip: $(htcondor_base.central_manager_ips[0]) - stack_type: null - access_config: [] - ipv6_access_config: [] - alias_ip_range: [] - nic_type: VIRTIO_NET - queue_count: null outputs: - - internal_ip + - central_manager_name # the HTCondor modules support up to 2 execute points per blueprint # if using 1, it may use Spot or On-demand pricing @@ -113,6 +89,7 @@ deployment_groups: - network1 - htcondor_secrets - htcondor_base + - htcondor_cm settings: instance_image: project: $(vars.project_id) @@ -125,6 +102,7 @@ deployment_groups: - network1 - htcondor_secrets - htcondor_base + - htcondor_cm settings: instance_image: project: $(vars.project_id) @@ -137,6 +115,7 @@ deployment_groups: - network1 - htcondor_secrets - htcondor_base + - htcondor_cm - htcondor_execute_point - htcondor_execute_point_spot settings: diff --git a/community/modules/scheduler/htcondor-base/README.md b/community/modules/scheduler/htcondor-base/README.md index e61324c096..cc13c53a47 100644 --- a/community/modules/scheduler/htcondor-base/README.md +++ b/community/modules/scheduler/htcondor-base/README.md @@ -213,7 +213,6 
@@ limitations under the License. | Name | Source | Version | |------|--------|---------| | [access\_point\_service\_account](#module\_access\_point\_service\_account) | terraform-google-modules/service-accounts/google | ~> 4.2 | -| [address](#module\_address) | terraform-google-modules/address/google | ~> 3.0 | | [central\_manager\_service\_account](#module\_central\_manager\_service\_account) | terraform-google-modules/service-accounts/google | ~> 4.2 | | [execute\_point\_service\_account](#module\_execute\_point\_service\_account) | terraform-google-modules/service-accounts/google | ~> 4.2 | | [health\_check\_firewall\_rule](#module\_health\_check\_firewall\_rule) | terraform-google-modules/network/google//modules/firewall-rules | ~> 6.0 | @@ -223,7 +222,6 @@ limitations under the License. | Name | Type | |------|------| -| [google_storage_bucket_object.cm_config](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/storage_bucket_object) | resource | | [google_compute_subnetwork.htcondor](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_subnetwork) | data source | ## Inputs @@ -231,7 +229,6 @@ limitations under the License. | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| | [access\_point\_roles](#input\_access\_point\_roles) | Project-wide roles for HTCondor Access Point service account | `list(string)` |
[
"roles/compute.instanceAdmin",
"roles/monitoring.metricWriter",
"roles/logging.logWriter",
"roles/storage.objectViewer"
]
| no | -| [central\_manager\_high\_availability](#input\_central\_manager\_high\_availability) | Provision HTCondor central manager in high availability mode | `bool` | `false` | no | | [central\_manager\_roles](#input\_central\_manager\_roles) | Project-wide roles for HTCondor Central Manager service account | `list(string)` |
[
"roles/monitoring.metricWriter",
"roles/logging.logWriter",
"roles/storage.objectViewer"
]
| no | | [deployment\_name](#input\_deployment\_name) | HPC Toolkit deployment name. HTCondor cloud resource names will include this value. | `string` | n/a | yes | | [execute\_point\_roles](#input\_execute\_point\_roles) | Project-wide roles for HTCondor Execute Point service account | `list(string)` |
[
"roles/monitoring.metricWriter",
"roles/logging.logWriter",
"roles/storage.objectViewer"
]
| no | @@ -245,8 +242,6 @@ limitations under the License. | Name | Description | |------|-------------| | [access\_point\_service\_account\_email](#output\_access\_point\_service\_account\_email) | HTCondor Access Point Service Account (e-mail format) | -| [central\_manager\_ips](#output\_central\_manager\_ips) | Reserved internal IP address for use by Central Manager | -| [central\_manager\_runner](#output\_central\_manager\_runner) | Toolkit Runner to configure an HTCondor Central Manager | | [central\_manager\_service\_account\_email](#output\_central\_manager\_service\_account\_email) | HTCondor Central Manager Service Account (e-mail format) | | [execute\_point\_service\_account\_email](#output\_execute\_point\_service\_account\_email) | HTCondor Execute Point Service Account (e-mail format) | | [htcondor\_bucket\_name](#output\_htcondor\_bucket\_name) | Name of the HTCondor configuration bucket | diff --git a/community/modules/scheduler/htcondor-base/main.tf b/community/modules/scheduler/htcondor-base/main.tf index cc9bf66b9e..3572f729eb 100644 --- a/community/modules/scheduler/htcondor-base/main.tf +++ b/community/modules/scheduler/htcondor-base/main.tf @@ -26,25 +26,6 @@ locals { access_point_roles = [for role in var.access_point_roles : "${var.project_id}=>${role}"] central_manager_display_name = "HTCondor Central Manager (${var.deployment_name})" central_manager_roles = [for role in var.central_manager_roles : "${var.project_id}=>${role}"] - - central_manager_count = var.central_manager_high_availability ? 2 : 1 - central_manager_ip_names = [for i in range(local.central_manager_count) : "${var.deployment_name}-cm-ip-${i}"] - - cm_config = templatefile("${path.module}/templates/condor_config.tftpl", { - htcondor_role = "get_htcondor_central_manager", - central_manager_ips = module.address.addresses, - }) - - cm_object = "gs://${module.htcondor_bucket.name}/${google_storage_bucket_object.cm_config.output_name}" - runner_cm = { - "type" = "ansible-local" - "content" = file("${path.module}/files/htcondor_configure.yml") - "destination" = "htcondor_configure.yml" - "args" = join(" ", [ - "-e htcondor_role=get_htcondor_central_manager", - "-e config_object=${local.cm_object}", - ]) - } } module "htcondor_bucket" { @@ -68,12 +49,6 @@ module "htcondor_bucket" { set_viewer_roles = true } -resource "google_storage_bucket_object" "cm_config" { - name = "${var.deployment_name}-cm-config-${substr(md5(local.cm_config), 0, 4)}" - content = local.cm_config - bucket = module.htcondor_bucket.name -} - module "access_point_service_account" { source = "terraform-google-modules/service-accounts/google" version = "~> 4.2" @@ -107,15 +82,6 @@ module "central_manager_service_account" { project_roles = local.central_manager_roles } -module "address" { - source = "terraform-google-modules/address/google" - version = "~> 3.0" - project_id = var.project_id - region = var.region - subnetwork = var.subnetwork_self_link - names = local.central_manager_ip_names -} - data "google_compute_subnetwork" "htcondor" { self_link = var.subnetwork_self_link } diff --git a/community/modules/scheduler/htcondor-base/outputs.tf b/community/modules/scheduler/htcondor-base/outputs.tf index a39c1e7b4e..9441831e8b 100644 --- a/community/modules/scheduler/htcondor-base/outputs.tf +++ b/community/modules/scheduler/htcondor-base/outputs.tf @@ -38,16 +38,6 @@ output "execute_point_service_account_email" { ] } -output "central_manager_runner" { - description = "Toolkit Runner to configure an HTCondor Central Manager" - value = 
local.runner_cm -} - -output "central_manager_ips" { - description = "Reserved internal IP address for use by Central Manager" - value = module.address.addresses -} - output "htcondor_bucket_name" { description = "Name of the HTCondor configuration bucket" value = module.htcondor_bucket.name diff --git a/community/modules/scheduler/htcondor-base/variables.tf b/community/modules/scheduler/htcondor-base/variables.tf index 823aa043fb..763908012e 100644 --- a/community/modules/scheduler/htcondor-base/variables.tf +++ b/community/modules/scheduler/htcondor-base/variables.tf @@ -69,9 +69,3 @@ variable "execute_point_roles" { "roles/storage.objectViewer", ] } - -variable "central_manager_high_availability" { - description = "Provision HTCondor central manager in high availability mode" - type = bool - default = false -} diff --git a/community/modules/scheduler/htcondor-central-manager/README.md b/community/modules/scheduler/htcondor-central-manager/README.md new file mode 100644 index 0000000000..ce0d8e8c7d --- /dev/null +++ b/community/modules/scheduler/htcondor-central-manager/README.md @@ -0,0 +1,103 @@ +## Description + +This module provisions a highly available HTCondor central manager using a [Managed +Instance Group (MIG)][mig] with auto-healing. + +[mig]: https://cloud.google.com/compute/docs/instance-groups + +## Usage + +Although this provisions an HTCondor central manager with standard configuration, +for a functioning node, you must supply Toolkit runners as described below: + +- [var.central_manager_runner](#input_central_manager_runner) + - Runner must download a POOL password / signing key and create an [IDTOKEN] + with no scopes (full authorization). + +A reference implementation is included in the Toolkit module +[htcondor-pool-secrets]. You may substitute implementations so long as they +duplicate the functionality in the references. Usage is demonstrated in the +[HTCondor example][htc-example]. + +[htc-example]: ../../../../examples/README.md#htc-htcondoryaml-- +[htcondor-pool-secrets]: ../htcondor-pool-secrets/README.md +[IDTOKEN]: https://htcondor.readthedocs.io/en/latest/admin-manual/security.html#introducing-idtokens + + +Copyright 2023 Google LLC + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
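+
+For reference, a minimal sketch of wiring this module into a blueprint follows.
+It mirrors the htc-htcondor.yaml example in this series; the module IDs
+network1, htcondor_secrets, and htcondor_base are assumptions taken from that
+blueprint rather than requirements of this module:
+
+```yaml
+  - id: htcondor_cm
+    source: community/modules/scheduler/htcondor-central-manager
+    use:
+    - network1
+    - htcondor_secrets  # supplies central_manager_runner
+    - htcondor_base     # supplies htcondor_bucket_name and service account
+    settings:
+      instance_image:
+        project: $(vars.project_id)
+        family: $(vars.new_image_family)
+```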
+ +## Requirements + +| Name | Version | +|------|---------| +| [terraform](#requirement\_terraform) | >= 0.13.0 | +| [google](#requirement\_google) | >= 3.83 | +| [time](#requirement\_time) | ~> 0.9 | + +## Providers + +| Name | Version | +|------|---------| +| [google](#provider\_google) | >= 3.83 | +| [time](#provider\_time) | ~> 0.9 | + +## Modules + +| Name | Source | Version | +|------|--------|---------| +| [central\_manager\_instance\_template](#module\_central\_manager\_instance\_template) | github.com/terraform-google-modules/terraform-google-vm//modules/instance_template | 84d7959 | +| [htcondor\_cm](#module\_htcondor\_cm) | github.com/terraform-google-modules/terraform-google-vm//modules/mig | 84d7959 | +| [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.20.0&depth=1 | + +## Resources + +| Name | Type | +|------|------| +| [google_storage_bucket_object.cm_config](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/storage_bucket_object) | resource | +| [time_sleep.mig_warmup](https://registry.terraform.io/providers/hashicorp/time/latest/docs/resources/sleep) | resource | +| [google_compute_image.htcondor](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_image) | data source | +| [google_compute_instance.cm](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_instance) | data source | +| [google_compute_region_instance_group.cm](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_region_instance_group) | data source | +| [google_compute_zones.available](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_zones) | data source | + +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [central\_manager\_runner](#input\_central\_manager\_runner) | A list of Toolkit runners for configuring an HTCondor central manager | `list(map(string))` | `[]` | no | +| [central\_manager\_service\_account\_email](#input\_central\_manager\_service\_account\_email) | Service account for central manager (e-mail format) | `string` | n/a | yes | +| [deployment\_name](#input\_deployment\_name) | HPC Toolkit deployment name. HTCondor cloud resource names will include this value. | `string` | n/a | yes | +| [disk\_size\_gb](#input\_disk\_size\_gb) | Boot disk size in GB | `number` | `null` | no | +| [enable\_oslogin](#input\_enable\_oslogin) | Enable or Disable OS Login with "ENABLE" or "DISABLE". Set to "INHERIT" to inherit project OS Login setting. | `string` | `"ENABLE"` | no | +| [htcondor\_bucket\_name](#input\_htcondor\_bucket\_name) | Name of HTCondor configuration bucket | `string` | n/a | yes | +| [instance\_image](#input\_instance\_image) | Custom VM image with HTCondor and Toolkit support installed. |
object({
family = string,
project = string
})
| n/a | yes | +| [labels](#input\_labels) | Labels to add to resources. List key, value pairs. | `map(string)` | n/a | yes | +| [machine\_type](#input\_machine\_type) | Machine type to use for HTCondor central managers | `string` | `"c2-standard-4"` | no | +| [metadata](#input\_metadata) | Metadata to add to HTCondor central managers | `map(string)` | `{}` | no | +| [network\_self\_link](#input\_network\_self\_link) | The self link of the network in which the HTCondor central manager will be created. | `string` | `null` | no | +| [network\_storage](#input\_network\_storage) | An array of network attached storage mounts to be configured |
list(object({
server_ip = string,
remote_mount = string,
local_mount = string,
fs_type = string,
mount_options = string,
client_install_runner = map(string)
mount_runner = map(string)
}))
| `[]` | no | +| [project\_id](#input\_project\_id) | Project in which HTCondor pool will be created | `string` | n/a | yes | +| [region](#input\_region) | Default region for creating resources | `string` | n/a | yes | +| [service\_account\_scopes](#input\_service\_account\_scopes) | Scopes by which to limit service account attached to central manager. | `set(string)` |
[
"https://www.googleapis.com/auth/cloud-platform"
]
| no | +| [subnetwork\_self\_link](#input\_subnetwork\_self\_link) | The self link of the subnetwork in which the HTCondor central manager will be created. | `string` | `null` | no | + +## Outputs + +| Name | Description | +|------|-------------| +| [central\_manager\_ips](#output\_central\_manager\_ips) | IP addresses of the central managers provisioned by this module | +| [central\_manager\_name](#output\_central\_manager\_name) | Name of the central managers provisioned by this module | +| [list\_instances\_command](#output\_list\_instances\_command) | Command to list central managers provisioned by this module | + diff --git a/community/modules/scheduler/htcondor-base/files/htcondor_configure.yml b/community/modules/scheduler/htcondor-central-manager/files/htcondor_configure.yml similarity index 94% rename from community/modules/scheduler/htcondor-base/files/htcondor_configure.yml rename to community/modules/scheduler/htcondor-central-manager/files/htcondor_configure.yml index 44fb1e4833..7408af6370 100644 --- a/community/modules/scheduler/htcondor-base/files/htcondor_configure.yml +++ b/community/modules/scheduler/htcondor-central-manager/files/htcondor_configure.yml @@ -13,19 +13,16 @@ # limitations under the License. --- -- name: Configure HTCondor Role +- name: Configure HTCondor central manager hosts: localhost become: true vars: - job_queue_ha: false - spool_dir: /var/lib/condor/spool condor_config_root: /etc/condor ghpc_config_file: 50-ghpc-managed tasks: - name: Ensure necessary variables are set ansible.builtin.assert: that: - - htcondor_role is defined - config_object is defined - name: Remove default HTCondor configuration ansible.builtin.file: diff --git a/community/modules/scheduler/htcondor-central-manager/main.tf b/community/modules/scheduler/htcondor-central-manager/main.tf new file mode 100644 index 0000000000..9025bed984 --- /dev/null +++ b/community/modules/scheduler/htcondor-central-manager/main.tf @@ -0,0 +1,169 @@ +/** + * Copyright 2023 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +locals { + # This label allows for billing report tracking based on module. + labels = merge(var.labels, { ghpc_module = "htcondor-central-manager" }) +} + +locals { + network_storage_metadata = var.network_storage == null ? {} : { network_storage = jsonencode(var.network_storage) } + oslogin_api_values = { + "DISABLE" = "FALSE" + "ENABLE" = "TRUE" + } + enable_oslogin_metadata = var.enable_oslogin == "INHERIT" ? 
{} : { enable-oslogin = lookup(local.oslogin_api_values, var.enable_oslogin, "") } + metadata = merge(local.network_storage_metadata, local.enable_oslogin_metadata, var.metadata) + + name_prefix = "${var.deployment_name}-cm" + + all_runners = flatten([var.central_manager_runner, local.schedd_runner]) + + cm_config = templatefile("${path.module}/templates/condor_config.tftpl", {}) + + cm_object = "gs://${var.htcondor_bucket_name}/${google_storage_bucket_object.cm_config.output_name}" + schedd_runner = { + type = "ansible-local" + content = file("${path.module}/files/htcondor_configure.yml") + destination = "htcondor_configure.yml" + args = join(" ", [ + "-e config_object=${local.cm_object}", + ]) + } + + central_manager_ips = [data.google_compute_instance.cm.network_interface[0].network_ip] + central_manager_name = data.google_compute_instance.cm.name + + list_instances_command = "gcloud compute instance-groups list-instances ${data.google_compute_region_instance_group.cm.name} --region ${var.region} --project ${var.project_id}" +} + +data "google_compute_image" "htcondor" { + family = var.instance_image.family + project = var.instance_image.project + + lifecycle { + postcondition { + condition = self.disk_size_gb <= var.disk_size_gb + error_message = "var.disk_size_gb must be set to at least the size of the image (${self.disk_size_gb})" + } + } +} + +data "google_compute_zones" "available" { + project = var.project_id + region = var.region +} + +data "google_compute_region_instance_group" "cm" { + self_link = time_sleep.mig_warmup.triggers.self_link +} + +data "google_compute_instance" "cm" { + self_link = data.google_compute_region_instance_group.cm.instances[0].instance +} + +resource "google_storage_bucket_object" "cm_config" { + name = "${var.deployment_name}-cm-config-${substr(md5(local.cm_config), 0, 4)}" + content = local.cm_config + bucket = var.htcondor_bucket_name +} + +module "startup_script" { + source = "github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script?ref=v1.20.0&depth=1" + + project_id = var.project_id + region = var.region + labels = local.labels + deployment_name = var.deployment_name + + runners = local.all_runners +} + +module "central_manager_instance_template" { + # tflint-ignore: terraform_module_pinned_source + source = "github.com/terraform-google-modules/terraform-google-vm//modules/instance_template?ref=84d7959" + + name_prefix = local.name_prefix + project_id = var.project_id + network = var.network_self_link + subnetwork = var.subnetwork_self_link + service_account = { + email = var.central_manager_service_account_email + scopes = var.service_account_scopes + } + labels = local.labels + + machine_type = var.machine_type + disk_size_gb = var.disk_size_gb + preemptible = false + startup_script = module.startup_script.startup_script + metadata = local.metadata + source_image = data.google_compute_image.htcondor.self_link +} + +module "htcondor_cm" { + # tflint-ignore: terraform_module_pinned_source + source = "github.com/terraform-google-modules/terraform-google-vm//modules/mig?ref=84d7959" + + project_id = var.project_id + region = var.region + target_size = 1 + hostname = local.name_prefix + instance_template = module.central_manager_instance_template.self_link + + health_check_name = "health-${local.name_prefix}" + health_check = { + type = "tcp" + initial_delay_sec = 600 + check_interval_sec = 20 + healthy_threshold = 2 + timeout_sec = 8 + unhealthy_threshold = 3 + response = "" + proxy_header = "NONE" + port = 9618 + request = "" + 
request_path = "" + host = "" + enable_logging = true + } + + update_policy = [{ + instance_redistribution_type = "NONE" + replacement_method = "SUBSTITUTE" + max_surge_fixed = length(data.google_compute_zones.available.names) + max_unavailable_fixed = length(data.google_compute_zones.available.names) + max_surge_percent = null + max_unavailable_percent = null + min_ready_sec = 300 + minimal_action = "REPLACE" + type = "OPPORTUNISTIC" + }] + + stateful_ips = [{ + interface_name = "nic0" + delete_rule = "ON_PERMANENT_INSTANCE_DELETION" + is_external = false + }] +} + +resource "time_sleep" "mig_warmup" { + create_duration = "120s" + + triggers = { + self_link = module.htcondor_cm.self_link + } +} diff --git a/community/modules/scheduler/htcondor-central-manager/outputs.tf b/community/modules/scheduler/htcondor-central-manager/outputs.tf new file mode 100644 index 0000000000..a6272e7ca2 --- /dev/null +++ b/community/modules/scheduler/htcondor-central-manager/outputs.tf @@ -0,0 +1,30 @@ +/** + * Copyright 2023 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +output "list_instances_command" { + description = "Command to list central managers provisioned by this module" + value = local.list_instances_command +} + +output "central_manager_ips" { + description = "IP addresses of the central managers provisioned by this module" + value = local.central_manager_ips +} + +output "central_manager_name" { + description = "Name of the central managers provisioned by this module" + value = local.central_manager_name +} diff --git a/community/modules/scheduler/htcondor-base/templates/condor_config.tftpl b/community/modules/scheduler/htcondor-central-manager/templates/condor_config.tftpl similarity index 52% rename from community/modules/scheduler/htcondor-base/templates/condor_config.tftpl rename to community/modules/scheduler/htcondor-central-manager/templates/condor_config.tftpl index d96950b4ed..d6a6d451b5 100644 --- a/community/modules/scheduler/htcondor-base/templates/condor_config.tftpl +++ b/community/modules/scheduler/htcondor-central-manager/templates/condor_config.tftpl @@ -16,35 +16,11 @@ # override settings with a higher priority (last lexically) named file # https://htcondor.readthedocs.io/en/latest/admin-manual/introduction-to-configuration.html?#ordered-evaluation-to-set-the-configuration -use role:${htcondor_role} -CONDOR_HOST = ${join(",", central_manager_ips)} +use role:get_htcondor_central_manager +CONDOR_HOST = $(IPV4_ADDRESS) -%{ if htcondor_role == "get_htcondor_central_manager" ~} -# Central Manager configuraition settings +# Central Manager configuration settings COLLECTOR_UPDATE_INTERVAL = 30 NEGOTIATOR_UPDATE_INTERVAL = 30 NEGOTIATOR_DEPTH_FIRST = True NEGOTIATOR_UPDATE_AFTER_CYCLE = True -%{ endif ~} - -%{ if htcondor_role == "get_htcondor_central_manager" && length(central_manager_ips) > 1 ~} -# Central Manager high availability configuration settings -# following 
https://htcondor.readthedocs.io/en/latest/admin-manual/high-availability.html#high-availability-of-the-central-manager -CM_LIST = \ - ${central_manager_ips[0]}:$(SHARED_PORT_PORT), \ - ${central_manager_ips[1]}:$(SHARED_PORT_PORT) - -HAD_USE_SHARED_PORT = True -HAD_LIST = $(CM_LIST) - -REPLICATION_USE_SHARED_PORT = True -REPLICATION_LIST = $(CM_LIST) - -HAD_USE_PRIMARY = True -HAD_CONTROLLEE = NEGOTIATOR -MASTER_NEGOTIATOR_CONTROLLER = HAD - -DAEMON_LIST = $(DAEMON_LIST), HAD, REPLICATION -HAD_USE_REPLICATION = True -MASTER_HAD_BACKOFF_CONSTANT = 360 -%{ endif ~} diff --git a/community/modules/scheduler/htcondor-central-manager/variables.tf b/community/modules/scheduler/htcondor-central-manager/variables.tf new file mode 100644 index 0000000000..a49e3b16ea --- /dev/null +++ b/community/modules/scheduler/htcondor-central-manager/variables.tf @@ -0,0 +1,122 @@ +/** + * Copyright 2023 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +variable "project_id" { + description = "Project in which HTCondor pool will be created" + type = string +} + +variable "deployment_name" { + description = "HPC Toolkit deployment name. HTCondor cloud resource names will include this value." + type = string +} + +variable "labels" { + description = "Labels to add to resources. List key, value pairs." + type = map(string) +} + +variable "region" { + description = "Default region for creating resources" + type = string +} + +variable "network_self_link" { + description = "The self link of the network in which the HTCondor central manager will be created." + type = string + default = null +} + +variable "central_manager_service_account_email" { + description = "Service account for central manager (e-mail format)" + type = string +} + +variable "service_account_scopes" { + description = "Scopes by which to limit service account attached to central manager." + type = set(string) + default = [ + "https://www.googleapis.com/auth/cloud-platform", + ] +} + +variable "network_storage" { + description = "An array of network attached storage mounts to be configured" + type = list(object({ + server_ip = string, + remote_mount = string, + local_mount = string, + fs_type = string, + mount_options = string, + client_install_runner = map(string) + mount_runner = map(string) + })) + default = [] +} + +variable "disk_size_gb" { + description = "Boot disk size in GB" + type = number + default = null +} + +variable "metadata" { + description = "Metadata to add to HTCondor central managers" + type = map(string) + default = {} +} + +variable "enable_oslogin" { + description = "Enable or Disable OS Login with \"ENABLE\" or \"DISABLE\". Set to \"INHERIT\" to inherit project OS Login setting." + type = string + default = "ENABLE" + nullable = false + validation { + condition = contains(["ENABLE", "DISABLE", "INHERIT"], var.enable_oslogin) + error_message = "Allowed string values for var.enable_oslogin are \"ENABLE\", \"DISABLE\", or \"INHERIT\"." 
+ } +} + +variable "subnetwork_self_link" { + description = "The self link of the subnetwork in which the HTCondor central manager will be created." + type = string + default = null +} + +variable "instance_image" { + description = "Custom VM image with HTCondor and Toolkit support installed." + type = object({ + family = string, + project = string + }) +} + +variable "machine_type" { + description = "Machine type to use for HTCondor central managers" + type = string + default = "c2-standard-4" +} + +variable "central_manager_runner" { + description = "A list of Toolkit runners for configuring an HTCondor central manager" + type = list(map(string)) + default = [] +} + +variable "htcondor_bucket_name" { + description = "Name of HTCondor configuration bucket" + type = string +} diff --git a/community/modules/scheduler/htcondor-central-manager/versions.tf b/community/modules/scheduler/htcondor-central-manager/versions.tf new file mode 100644 index 0000000000..d9109a1ed8 --- /dev/null +++ b/community/modules/scheduler/htcondor-central-manager/versions.tf @@ -0,0 +1,33 @@ +/** + * Copyright 2023 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +terraform { + required_providers { + google = { + source = "hashicorp/google" + version = ">= 3.83" + } + time = { + source = "hashicorp/time" + version = "~> 0.9" + } + } + provider_meta "google" { + module_name = "blueprints/terraform/hpc-toolkit:htcondor-central-manager/v1.20.0" + } + + required_version = ">= 0.13.0" +} From c28d9b6ba3da0c3d7ff590712275393e7f9b5e13 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Tue, 18 Jul 2023 10:03:35 -0500 Subject: [PATCH 078/144] Address feedback from #1585 --- .../modules/scheduler/htcondor-base/README.md | 13 ++++- .../htcondor-central-manager/README.md | 55 ++++++++++++++++--- .../htcondor-central-manager/main.tf | 26 ++++++--- .../htcondor-central-manager/variables.tf | 16 ++++-- .../htcondor-central-manager/versions.tf | 2 +- pkg/config/config.go | 1 + 6 files changed, 89 insertions(+), 24 deletions(-) diff --git a/community/modules/scheduler/htcondor-base/README.md b/community/modules/scheduler/htcondor-base/README.md index cc13c53a47..aaca5a1abd 100644 --- a/community/modules/scheduler/htcondor-base/README.md +++ b/community/modules/scheduler/htcondor-base/README.md @@ -1,12 +1,19 @@ ## Description -This module performs the following tasks: +This module creates the basic security infrastructure of an HTCondor pool in +Google Cloud. + +> **_NOTE:_** This module was previously named htcondor-configure. The interface +> and responsibilities of this module have changed significantly. Please review +> the [example](#example) and modify your blueprints accordingly. 
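+
+As a minimal migration sketch (assuming a blueprint that still references the
+old module path), only the `source` field needs to change; the module `id` and
+the `use` entry are illustrative placeholders:
+
+```yaml
+  - id: htcondor_base
+    source: community/modules/scheduler/htcondor-base  # formerly htcondor-configure
+    use:
+    - network1
+```
+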
+ +## Security setup + +This module will take the following actions: - store an HTCondor Pool password in Google Cloud Secret Manager - will generate a new password if one is not supplied - create service accounts for an HTCondor Access Point and Central Manager -- create a Toolkit runner for an Access Point -- create a Toolkit runner for a Central Manager It is expected to be used with the [htcondor-install] and [htcondor-execute-point] modules. diff --git a/community/modules/scheduler/htcondor-central-manager/README.md b/community/modules/scheduler/htcondor-central-manager/README.md index ce0d8e8c7d..de88946480 100644 --- a/community/modules/scheduler/htcondor-central-manager/README.md +++ b/community/modules/scheduler/htcondor-central-manager/README.md @@ -7,8 +7,9 @@ Instance Group (MIG)][mig] with auto-healing. ## Usage -Although this provisions an HTCondor central manager with standard configuration, -for a functioning node, you must supply Toolkit runners as described below: +This module provisions an HTCondor central manager with a standard +configuration. For the node to function correctly, you must supply the input +variable described below: - [var.central_manager_runner](#input_central_manager_runner) - Runner must download a POOL password / signing key and create an [IDTOKEN] @@ -23,6 +24,45 @@ duplicate the functionality in the references. Usage is demonstrated in the [htcondor-pool-secrets]: ../htcondor-pool-secrets/README.md [IDTOKEN]: https://htcondor.readthedocs.io/en/latest/admin-manual/security.html#introducing-idtokens +## Behavior of Managed Instance Group (MIG) + +A regional [MIG][mig] is used to provision the central manager, although only +1 node will ever be active at a time. By default, the node will be provisioned +in any of the zones available in that region, however, it can be constrained to +run in fewer zones (or a single zone) using [var.zones](#input_zones). + +The VM replacement policy is set to [opportunistic]. In practice, this means +that an active VM will not be replaced by Terraform actions, but may be +replaced when either: + +- intentionally by issuing an update via Cloud Console or using gcloud (below) +- the VM becomes unhealthy or is otherwise automatically replaced (e.g. regular + Google Cloud maintenance) + +For example, to manually update all instances in a MIG: + +```text +gcloud compute instance-groups managed update-instances \ + <> --all-instances --region <> \ + --project <> --minimal-action replace +``` + +[opportunistic]: https://cloud.google.com/compute/docs/instance-groups/rolling-out-updates-to-managed-instance-groups#type + +## Limiting inter-zone egress + +Because all the elements of the HTCondor pool use regional MIGs, they may be +subject to [interzone egress fees][network-pricing]. The primary traffic between +nodes of an HTCondor pool running embarrassingly parallel jobs is expected to +be limited to API traffic for job scheduling and monitoring. Please review the +[network pricing][network-pricing] documentation and determine if this cost is +a concern. If it is, use [var.zones](#input_zones) to constrain each node within +your HTCondor pool to operate within a single zone. + +[network-pricing]: https://cloud.google.com/vpc/network-pricing + +## License + Copyright 2023 Google LLC @@ -42,7 +82,7 @@ limitations under the License. 
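+
+As a concrete sketch of the zone constraint described in the sections above
+(the module `id`, the `use` entry, and the zone names are illustrative
+assumptions; substitute zones that are valid in your region):
+
+```yaml
+  - id: htcondor_cm
+    source: community/modules/scheduler/htcondor-central-manager
+    use:
+    - network1
+    settings:
+      zones: [us-central1-a, us-central1-c]
+```
+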
| Name | Version | |------|---------| -| [terraform](#requirement\_terraform) | >= 0.13.0 | +| [terraform](#requirement\_terraform) | >= 1.1.0 | | [google](#requirement\_google) | >= 3.83 | | [time](#requirement\_time) | ~> 0.9 | @@ -77,21 +117,22 @@ limitations under the License. | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| | [central\_manager\_runner](#input\_central\_manager\_runner) | A list of Toolkit runners for configuring an HTCondor central manager | `list(map(string))` | `[]` | no | -| [central\_manager\_service\_account\_email](#input\_central\_manager\_service\_account\_email) | Service account for central manager (e-mail format) | `string` | n/a | yes | +| [central\_manager\_service\_account\_email](#input\_central\_manager\_service\_account\_email) | Service account e-mail for central manager (can be supplied by htcondor-base module) | `string` | n/a | yes | | [deployment\_name](#input\_deployment\_name) | HPC Toolkit deployment name. HTCondor cloud resource names will include this value. | `string` | n/a | yes | -| [disk\_size\_gb](#input\_disk\_size\_gb) | Boot disk size in GB | `number` | `null` | no | +| [disk\_size\_gb](#input\_disk\_size\_gb) | Boot disk size in GB | `number` | `20` | no | | [enable\_oslogin](#input\_enable\_oslogin) | Enable or Disable OS Login with "ENABLE" or "DISABLE". Set to "INHERIT" to inherit project OS Login setting. | `string` | `"ENABLE"` | no | | [htcondor\_bucket\_name](#input\_htcondor\_bucket\_name) | Name of HTCondor configuration bucket | `string` | n/a | yes | -| [instance\_image](#input\_instance\_image) | Custom VM image with HTCondor and Toolkit support installed. |
object({ family = string, project = string }) | n/a | yes |
+| [instance\_image](#input\_instance\_image) | Custom VM image with HTCondor installed using the htcondor-install module. | object({ family = string, project = string }) | n/a | yes |
 | [labels](#input\_labels) | Labels to add to resources. List key, value pairs. | `map(string)` | n/a | yes |
 | [machine\_type](#input\_machine\_type) | Machine type to use for HTCondor central managers | `string` | `"c2-standard-4"` | no |
 | [metadata](#input\_metadata) | Metadata to add to HTCondor central managers | `map(string)` | `{}` | no |
 | [network\_self\_link](#input\_network\_self\_link) | The self link of the network in which the HTCondor central manager will be created. | `string` | `null` | no |
 | [network\_storage](#input\_network\_storage) | An array of network attached storage mounts to be configured | list(object({ server_ip = string, remote_mount = string, local_mount = string, fs_type = string, mount_options = string, client_install_runner = map(string), mount_runner = map(string) })) | `[]` | no |
-| [project\_id](#input\_project\_id) | Project in which HTCondor pool will be created | `string` | n/a | yes |
+| [project\_id](#input\_project\_id) | Project in which HTCondor central manager will be created | `string` | n/a | yes |
 | [region](#input\_region) | Default region for creating resources | `string` | n/a | yes |
 | [service\_account\_scopes](#input\_service\_account\_scopes) | Scopes by which to limit service account attached to central manager. | `set(string)` | ["https://www.googleapis.com/auth/cloud-platform"]
| no | | [subnetwork\_self\_link](#input\_subnetwork\_self\_link) | The self link of the subnetwork in which the HTCondor central manager will be created. | `string` | `null` | no | +| [zones](#input\_zones) | Zone(s) in which central manager may be created. If not supplied, will default to all zones in var.region. | `list(string)` | `[]` | no | ## Outputs diff --git a/community/modules/scheduler/htcondor-central-manager/main.tf b/community/modules/scheduler/htcondor-central-manager/main.tf index 9025bed984..3f817fcc8b 100644 --- a/community/modules/scheduler/htcondor-central-manager/main.tf +++ b/community/modules/scheduler/htcondor-central-manager/main.tf @@ -30,8 +30,6 @@ locals { name_prefix = "${var.deployment_name}-cm" - all_runners = flatten([var.central_manager_runner, local.schedd_runner]) - cm_config = templatefile("${path.module}/templates/condor_config.tftpl", {}) cm_object = "gs://${var.htcondor_bucket_name}/${google_storage_bucket_object.cm_config.output_name}" @@ -43,11 +41,14 @@ locals { "-e config_object=${local.cm_object}", ]) } + all_runners = flatten([var.central_manager_runner, local.schedd_runner]) central_manager_ips = [data.google_compute_instance.cm.network_interface[0].network_ip] central_manager_name = data.google_compute_instance.cm.name list_instances_command = "gcloud compute instance-groups list-instances ${data.google_compute_region_instance_group.cm.name} --region ${var.region} --project ${var.project_id}" + + zones = coalescelist(var.zones, data.google_compute_zones.available.names) } data "google_compute_image" "htcondor" { @@ -69,6 +70,12 @@ data "google_compute_zones" "available" { data "google_compute_region_instance_group" "cm" { self_link = time_sleep.mig_warmup.triggers.self_link + lifecycle { + postcondition { + condition = length(self.instances) == 1 + error_message = "There should only be 1 central manager found" + } + } } data "google_compute_instance" "cm" { @@ -118,11 +125,12 @@ module "htcondor_cm" { # tflint-ignore: terraform_module_pinned_source source = "github.com/terraform-google-modules/terraform-google-vm//modules/mig?ref=84d7959" - project_id = var.project_id - region = var.region - target_size = 1 - hostname = local.name_prefix - instance_template = module.central_manager_instance_template.self_link + project_id = var.project_id + region = var.region + distribution_policy_zones = local.zones + target_size = 1 + hostname = local.name_prefix + instance_template = module.central_manager_instance_template.self_link health_check_name = "health-${local.name_prefix}" health_check = { @@ -144,8 +152,8 @@ module "htcondor_cm" { update_policy = [{ instance_redistribution_type = "NONE" replacement_method = "SUBSTITUTE" - max_surge_fixed = length(data.google_compute_zones.available.names) - max_unavailable_fixed = length(data.google_compute_zones.available.names) + max_surge_fixed = length(local.zones) + max_unavailable_fixed = length(local.zones) max_surge_percent = null max_unavailable_percent = null min_ready_sec = 300 diff --git a/community/modules/scheduler/htcondor-central-manager/variables.tf b/community/modules/scheduler/htcondor-central-manager/variables.tf index a49e3b16ea..b248cac9cf 100644 --- a/community/modules/scheduler/htcondor-central-manager/variables.tf +++ b/community/modules/scheduler/htcondor-central-manager/variables.tf @@ -15,7 +15,7 @@ */ variable "project_id" { - description = "Project in which HTCondor pool will be created" + description = "Project in which HTCondor central manager will be created" type = string } @@ 
-34,6 +34,13 @@ variable "region" { type = string } +variable "zones" { + description = "Zone(s) in which central manager may be created. If not supplied, will default to all zones in var.region." + type = list(string) + default = [] + nullable = false +} + variable "network_self_link" { description = "The self link of the network in which the HTCondor central manager will be created." type = string @@ -41,7 +48,7 @@ variable "network_self_link" { } variable "central_manager_service_account_email" { - description = "Service account for central manager (e-mail format)" + description = "Service account e-mail for central manager (can be supplied by htcondor-base module)" type = string } @@ -70,7 +77,8 @@ variable "network_storage" { variable "disk_size_gb" { description = "Boot disk size in GB" type = number - default = null + default = 20 + nullable = false } variable "metadata" { @@ -97,7 +105,7 @@ variable "subnetwork_self_link" { } variable "instance_image" { - description = "Custom VM image with HTCondor and Toolkit support installed." + description = "Custom VM image with HTCondor installed using the htcondor-install module." type = object({ family = string, project = string diff --git a/community/modules/scheduler/htcondor-central-manager/versions.tf b/community/modules/scheduler/htcondor-central-manager/versions.tf index d9109a1ed8..4f8fd8952a 100644 --- a/community/modules/scheduler/htcondor-central-manager/versions.tf +++ b/community/modules/scheduler/htcondor-central-manager/versions.tf @@ -29,5 +29,5 @@ terraform { module_name = "blueprints/terraform/hpc-toolkit:htcondor-central-manager/v1.20.0" } - required_version = ">= 0.13.0" + required_version = ">= 1.1.0" } diff --git a/pkg/config/config.go b/pkg/config/config.go index a9e674c455..0e17119aa5 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -75,6 +75,7 @@ var errorMessages = map[string]string{ var movedModules = map[string]string{ "community/modules/scheduler/cloud-batch-job": "modules/scheduler/batch-job-template", "community/modules/scheduler/cloud-batch-login-node": "modules/scheduler/batch-login-node", + "community/modules/scheduler/htcondor-configure": "community/modules/scheduler/htcondor-base", } // GroupName is the name of a deployment group From ff169a24b889e94c2c8ffac71175599d14c2af27 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Tue, 18 Jul 2023 19:44:03 -0500 Subject: [PATCH 079/144] Add GPU support to HTCondor execute points --- .../compute/htcondor-execute-point/README.md | 1 + .../htcondor-execute-point/gpu_definition.tf | 47 +++++++++++++++++++ .../compute/htcondor-execute-point/main.tf | 4 +- .../templates/condor_config.tftpl | 3 ++ .../htcondor-execute-point/variables.tf | 15 ++++++ tools/duplicate-diff.py | 1 + 6 files changed, 70 insertions(+), 1 deletion(-) create mode 100644 community/modules/compute/htcondor-execute-point/gpu_definition.tf diff --git a/community/modules/compute/htcondor-execute-point/README.md b/community/modules/compute/htcondor-execute-point/README.md index 410f85d52b..98888d7d6a 100644 --- a/community/modules/compute/htcondor-execute-point/README.md +++ b/community/modules/compute/htcondor-execute-point/README.md @@ -193,6 +193,7 @@ limitations under the License. | [enable\_oslogin](#input\_enable\_oslogin) | Enable or Disable OS Login with "ENABLE" or "DISABLE". Set to "INHERIT" to inherit project OS Login setting. 
| `string` | `"ENABLE"` | no | | [execute\_point\_runner](#input\_execute\_point\_runner) | A list of Toolkit runners for configuring an HTCondor execute point | `list(map(string))` | `[]` | no | | [execute\_point\_service\_account\_email](#input\_execute\_point\_service\_account\_email) | Service account for HTCondor execute point (e-mail format) | `string` | n/a | yes | +| [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({ type = string, count = number })) | `[]` | no |
 | [htcondor\_bucket\_name](#input\_htcondor\_bucket\_name) | Name of HTCondor configuration bucket | `string` | n/a | yes |
 | [instance\_image](#input\_instance\_image) | HTCondor execute point VM image | object({ family = string, project = string }) | { "family": "hpc-rocky-linux-8", "project": "cloud-hpc-image-public" }
| no | | [labels](#input\_labels) | Labels to add to HTConodr execute points | `map(string)` | n/a | yes | diff --git a/community/modules/compute/htcondor-execute-point/gpu_definition.tf b/community/modules/compute/htcondor-execute-point/gpu_definition.tf new file mode 100644 index 0000000000..197a97b6e6 --- /dev/null +++ b/community/modules/compute/htcondor-execute-point/gpu_definition.tf @@ -0,0 +1,47 @@ +/** + * Copyright 2023 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. +*/ + +## Required variables: +# guest_accelerator +# machine_type + +locals { + + # Ensure guest_accelerator is a list if not set + input_guest_accelerator = var.guest_accelerator == null ? [] : var.guest_accelerator + + # If the machine type indicates a GPU is used, gather the count and type information + accelerator_types = { + "highgpu" = "nvidia-tesla-a100" + "megagpu" = "nvidia-tesla-a100" + "ultragpu" = "nvidia-a100-80gb" + } + generated_guest_accelerator = try([{ + type = local.accelerator_types[regex("a2-([A-Za-z]+)-", var.machine_type)[0]], + count = one(regex("a2-[A-Za-z]+-(\\d+)", var.machine_type)), + }], []) + + # If the machine type is a valid a2 machine_type, generated_guest_accelerator + # will be populated. This also guarantees at least one populated list in coalescelist. + is_a2_vm = length(local.generated_guest_accelerator) > 0 + + # Set the guest_accelerator to the user defined value if supplied, otherwise + # use the locally generated accelerator list. + guest_accelerator = local.is_a2_vm ? coalescelist( + local.input_guest_accelerator, + local.generated_guest_accelerator, + ) : local.input_guest_accelerator +} diff --git a/community/modules/compute/htcondor-execute-point/main.tf b/community/modules/compute/htcondor-execute-point/main.tf index b8fc82f8f1..5b13ff63eb 100644 --- a/community/modules/compute/htcondor-execute-point/main.tf +++ b/community/modules/compute/htcondor-execute-point/main.tf @@ -54,7 +54,8 @@ locals { execute_config = templatefile("${path.module}/templates/condor_config.tftpl", { htcondor_role = "get_htcondor_execute", - central_manager_ips = var.central_manager_ips + central_manager_ips = var.central_manager_ips, + guest_accelerator = local.guest_accelerator, }) execute_object = "gs://${var.htcondor_bucket_name}/${google_storage_bucket_object.execute_config.output_name}" @@ -123,6 +124,7 @@ module "execute_point_instance_template" { machine_type = var.machine_type disk_size_gb = var.disk_size_gb + gpu = one(local.guest_accelerator) preemptible = var.spot startup_script = local.is_windows_image ? 
null : module.startup_script.startup_script metadata = local.metadata diff --git a/community/modules/compute/htcondor-execute-point/templates/condor_config.tftpl b/community/modules/compute/htcondor-execute-point/templates/condor_config.tftpl index cfd801b729..946edd741a 100644 --- a/community/modules/compute/htcondor-execute-point/templates/condor_config.tftpl +++ b/community/modules/compute/htcondor-execute-point/templates/condor_config.tftpl @@ -20,6 +20,9 @@ use role:${htcondor_role} CONDOR_HOST = ${join(",", central_manager_ips)} # StartD configuration settings +%{ if length(guest_accelerator) > 0 ~} +use feature:GPUs +%{ endif ~} use feature:PartitionableSlot use feature:CommonCloudAttributesGoogle("-c created-by") UPDATE_INTERVAL = 30 diff --git a/community/modules/compute/htcondor-execute-point/variables.tf b/community/modules/compute/htcondor-execute-point/variables.tf index cb97d87585..b1120c4cdb 100644 --- a/community/modules/compute/htcondor-execute-point/variables.tf +++ b/community/modules/compute/htcondor-execute-point/variables.tf @@ -168,3 +168,18 @@ variable "htcondor_bucket_name" { description = "Name of HTCondor configuration bucket" type = string } + +variable "guest_accelerator" { + description = "List of the type and count of accelerator cards attached to the instance." + type = list(object({ + type = string, + count = number + })) + default = [] + nullable = false + + validation { + condition = length(var.guest_accelerator) <= 1 + error_message = "The HTCondor module supports 0 or 1 models of accelerator card on each execute point" + } +} diff --git a/tools/duplicate-diff.py b/tools/duplicate-diff.py index 068d2e476e..7db1d062ca 100644 --- a/tools/duplicate-diff.py +++ b/tools/duplicate-diff.py @@ -37,6 +37,7 @@ ], [ "modules/compute/vm-instance/gpu_definition.tf", + "community/modules/compute/htcondor-execute-point/gpu_definition.tf", "community/modules/compute/schedmd-slurm-gcp-v5-node-group/gpu_definition.tf", "community/modules/scheduler/schedmd-slurm-gcp-v5-login/gpu_definition.tf", "community/modules/scheduler/schedmd-slurm-gcp-v5-controller/gpu_definition.tf", From 742f9c7bce9bc07a402eb9cb033d7909ba65a07d Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Tue, 18 Jul 2023 19:44:04 -0500 Subject: [PATCH 080/144] Add MIG zone selection to HTCondor execute point module --- .../compute/htcondor-execute-point/README.md | 2 +- .../compute/htcondor-execute-point/main.tf | 33 ++++++++++--------- .../htcondor-execute-point/variables.tf | 8 +++-- 3 files changed, 23 insertions(+), 20 deletions(-) diff --git a/community/modules/compute/htcondor-execute-point/README.md b/community/modules/compute/htcondor-execute-point/README.md index 98888d7d6a..7c15da6922 100644 --- a/community/modules/compute/htcondor-execute-point/README.md +++ b/community/modules/compute/htcondor-execute-point/README.md @@ -210,7 +210,7 @@ limitations under the License. | [subnetwork\_self\_link](#input\_subnetwork\_self\_link) | The self link of the subnetwork HTCondor execute points will join | `string` | `null` | no | | [target\_size](#input\_target\_size) | Initial size of the HTCondor execute point pool; set to null (default) to avoid Terraform management of size. 
| `number` | `null` | no | | [windows\_startup\_ps1](#input\_windows\_startup\_ps1) | Startup script to run at boot-time for Windows-based HTCondor execute points | `list(string)` | `[]` | no | -| [zone](#input\_zone) | The default zone in which resources will be created | `string` | n/a | yes | +| [zones](#input\_zones) | Zone(s) in which execute points may be created. If not supplied, will default to all zones in var.region. | `list(string)` | `[]` | no | ## Outputs diff --git a/community/modules/compute/htcondor-execute-point/main.tf b/community/modules/compute/htcondor-execute-point/main.tf index 5b13ff63eb..de6a547afd 100644 --- a/community/modules/compute/htcondor-execute-point/main.tf +++ b/community/modules/compute/htcondor-execute-point/main.tf @@ -19,8 +19,8 @@ locals { labels = merge(var.labels, { ghpc_module = "htcondor-execute-point" }) } - locals { + zones = coalescelist(var.zones, data.google_compute_zones.available.names) network_storage_metadata = var.network_storage == null ? {} : { network_storage = jsonencode(var.network_storage) } oslogin_api_values = { @@ -45,7 +45,7 @@ locals { "args" = join(" ", [ "-e project_id=${var.project_id}", "-e region=${var.region}", - "-e zone=${var.zone}", + "-e zone=${local.zones[0]}", # this value is required, but ignored by regional MIG autoscaler "-e mig_id=${module.mig.instance_group_manager.name}", "-e max_size=${var.max_size}", "-e min_idle=${var.min_idle}", @@ -91,6 +91,11 @@ data "google_compute_image" "htcondor" { } } +data "google_compute_zones" "available" { + project = var.project_id + region = var.region +} + resource "google_storage_bucket_object" "execute_config" { name = "${var.deployment_name}-execute-config-${substr(md5(local.execute_config), 0, 4)}" content = local.execute_config @@ -131,19 +136,15 @@ module "execute_point_instance_template" { source_image = data.google_compute_image.htcondor.self_link } -data "google_compute_zones" "available" { - project = var.project_id - region = var.region -} - module "mig" { - source = "terraform-google-modules/vm/google//modules/mig" - version = "~> 8.0" - project_id = var.project_id - region = var.region - target_size = var.target_size - hostname = local.hostnames - instance_template = module.execute_point_instance_template.self_link + source = "terraform-google-modules/vm/google//modules/mig" + version = "~> 8.0" + project_id = var.project_id + region = var.region + distribution_policy_zones = local.zones + target_size = var.target_size + hostname = local.hostnames + instance_template = module.execute_point_instance_template.self_link health_check_name = "health-htcondor-${local.hostnames}" health_check = { @@ -165,8 +166,8 @@ module "mig" { update_policy = [{ instance_redistribution_type = "NONE" replacement_method = "SUBSTITUTE" - max_surge_fixed = length(data.google_compute_zones.available.names) - max_unavailable_fixed = length(data.google_compute_zones.available.names) + max_surge_fixed = length(local.zones) + max_unavailable_fixed = length(local.zones) max_surge_percent = null max_unavailable_percent = null min_ready_sec = 300 diff --git a/community/modules/compute/htcondor-execute-point/variables.tf b/community/modules/compute/htcondor-execute-point/variables.tf index b1120c4cdb..7df7044d95 100644 --- a/community/modules/compute/htcondor-execute-point/variables.tf +++ b/community/modules/compute/htcondor-execute-point/variables.tf @@ -24,9 +24,11 @@ variable "region" { type = string } -variable "zone" { - description = "The default zone in which resources will be 
created" - type = string +variable "zones" { + description = "Zone(s) in which execute points may be created. If not supplied, will default to all zones in var.region." + type = list(string) + default = [] + nullable = false } variable "deployment_name" { From 9d6a16d2f04d7e99a1735d5f355a82fb45a5fa42 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Tue, 18 Jul 2023 19:44:04 -0500 Subject: [PATCH 081/144] Use EP instead of XP as execute point acronym Align with HTCondor standard usage: https://htcondor.readthedocs.io/en/latest/codes-other-values/glossary.html#term-EP-Execution-Point --- community/modules/compute/htcondor-execute-point/main.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/community/modules/compute/htcondor-execute-point/main.tf b/community/modules/compute/htcondor-execute-point/main.tf index de6a547afd..001f15bab0 100644 --- a/community/modules/compute/htcondor-execute-point/main.tf +++ b/community/modules/compute/htcondor-execute-point/main.tf @@ -76,7 +76,7 @@ locals { } ) - hostnames = var.spot ? "${var.deployment_name}-spot-xp" : "${var.deployment_name}-xp" + hostnames = var.spot ? "${var.deployment_name}-spot-ep" : "${var.deployment_name}-ep" } data "google_compute_image" "htcondor" { From 0aca9ea650bd1c92885266b1fa11f4b3c307d021 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Wed, 19 Jul 2023 09:59:36 -0500 Subject: [PATCH 082/144] Add MIG zone selection to HTCondor access point module --- .../scheduler/htcondor-access-point/README.md | 1 + .../scheduler/htcondor-access-point/main.tf | 17 ++++++++++------- .../htcondor-access-point/variables.tf | 7 +++++++ 3 files changed, 18 insertions(+), 7 deletions(-) diff --git a/community/modules/scheduler/htcondor-access-point/README.md b/community/modules/scheduler/htcondor-access-point/README.md index dd57de5df1..d97ac99c92 100644 --- a/community/modules/scheduler/htcondor-access-point/README.md +++ b/community/modules/scheduler/htcondor-access-point/README.md @@ -101,6 +101,7 @@ limitations under the License. | [service\_account\_scopes](#input\_service\_account\_scopes) | Scopes by which to limit service account attached to central manager. | `set(string)` |
["https://www.googleapis.com/auth/cloud-platform"]
| no | | [spool\_parent\_dir](#input\_spool\_parent\_dir) | HTCondor access point configuration SPOOL will be set to subdirectory named "spool" | `string` | `"/var/lib/condor"` | no | | [subnetwork\_self\_link](#input\_subnetwork\_self\_link) | The self link of the subnetwork in which the HTCondor central manager will be created. | `string` | `null` | no | +| [zones](#input\_zones) | Zone(s) in which access point may be created. If not supplied, will default to all zones in var.region. | `list(string)` | `[]` | no | ## Outputs diff --git a/community/modules/scheduler/htcondor-access-point/main.tf b/community/modules/scheduler/htcondor-access-point/main.tf index 603a755f36..171ef29154 100644 --- a/community/modules/scheduler/htcondor-access-point/main.tf +++ b/community/modules/scheduler/htcondor-access-point/main.tf @@ -77,6 +77,8 @@ locals { access_point_ips = [data.google_compute_instance.ap.network_interface[0].network_ip] access_point_name = data.google_compute_instance.ap.name + + zones = coalescelist(var.zones, data.google_compute_zones.available.names) } data "google_compute_image" "htcondor" { @@ -153,11 +155,12 @@ module "htcondor_ap" { # tflint-ignore: terraform_module_pinned_source source = "github.com/terraform-google-modules/terraform-google-vm//modules/mig?ref=84d7959" - project_id = var.project_id - region = var.region - target_size = local.host_count - hostname = local.name_prefix - instance_template = module.access_point_instance_template.self_link + project_id = var.project_id + region = var.region + distribution_policy_zones = local.zones + target_size = local.host_count + hostname = local.name_prefix + instance_template = module.access_point_instance_template.self_link health_check_name = "health-${local.name_prefix}" health_check = { @@ -179,8 +182,8 @@ module "htcondor_ap" { update_policy = [{ instance_redistribution_type = "NONE" replacement_method = "SUBSTITUTE" - max_surge_fixed = length(data.google_compute_zones.available.names) - max_unavailable_fixed = length(data.google_compute_zones.available.names) + max_surge_fixed = length(local.zones) + max_unavailable_fixed = length(local.zones) max_surge_percent = null max_unavailable_percent = null min_ready_sec = 300 diff --git a/community/modules/scheduler/htcondor-access-point/variables.tf b/community/modules/scheduler/htcondor-access-point/variables.tf index fed1f47d4c..99ed55037a 100644 --- a/community/modules/scheduler/htcondor-access-point/variables.tf +++ b/community/modules/scheduler/htcondor-access-point/variables.tf @@ -34,6 +34,13 @@ variable "region" { type = string } +variable "zones" { + description = "Zone(s) in which access point may be created. If not supplied, will default to all zones in var.region." + type = list(string) + default = [] + nullable = false +} + variable "network_self_link" { description = "The self link of the network in which the HTCondor central manager will be created." 
type = string From 2f57855eba08f236002a246952740319a31c93c1 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Wed, 19 Jul 2023 15:33:44 -0500 Subject: [PATCH 083/144] Fix potential object naming conflicts in HTCondor modules --- community/modules/compute/htcondor-execute-point/main.tf | 2 +- community/modules/scheduler/htcondor-access-point/main.tf | 2 +- community/modules/scheduler/htcondor-central-manager/main.tf | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/community/modules/compute/htcondor-execute-point/main.tf b/community/modules/compute/htcondor-execute-point/main.tf index 001f15bab0..806955df56 100644 --- a/community/modules/compute/htcondor-execute-point/main.tf +++ b/community/modules/compute/htcondor-execute-point/main.tf @@ -97,7 +97,7 @@ data "google_compute_zones" "available" { } resource "google_storage_bucket_object" "execute_config" { - name = "${var.deployment_name}-execute-config-${substr(md5(local.execute_config), 0, 4)}" + name = "${local.hostnames}-config-${substr(md5(local.execute_config), 0, 4)}" content = local.execute_config bucket = var.htcondor_bucket_name } diff --git a/community/modules/scheduler/htcondor-access-point/main.tf b/community/modules/scheduler/htcondor-access-point/main.tf index 171ef29154..5d19301658 100644 --- a/community/modules/scheduler/htcondor-access-point/main.tf +++ b/community/modules/scheduler/htcondor-access-point/main.tf @@ -113,7 +113,7 @@ data "google_compute_instance" "ap" { } resource "google_storage_bucket_object" "ap_config" { - name = "${var.deployment_name}-ap-config-${substr(md5(local.ap_config), 0, 4)}" + name = "${local.name_prefix}-config-${substr(md5(local.ap_config), 0, 4)}" content = local.ap_config bucket = var.htcondor_bucket_name } diff --git a/community/modules/scheduler/htcondor-central-manager/main.tf b/community/modules/scheduler/htcondor-central-manager/main.tf index 3f817fcc8b..fce3ff2709 100644 --- a/community/modules/scheduler/htcondor-central-manager/main.tf +++ b/community/modules/scheduler/htcondor-central-manager/main.tf @@ -83,7 +83,7 @@ data "google_compute_instance" "cm" { } resource "google_storage_bucket_object" "cm_config" { - name = "${var.deployment_name}-cm-config-${substr(md5(local.cm_config), 0, 4)}" + name = "${local.name_prefix}-config-${substr(md5(local.cm_config), 0, 4)}" content = local.cm_config bucket = var.htcondor_bucket_name } From ababc68fd706fe0db578c1fc5192a2b5acf5f156 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Thu, 20 Jul 2023 17:10:50 -0500 Subject: [PATCH 084/144] Rename htcondor-base to htcondor-setup - address feedback from #1585 --- community/examples/htc-htcondor.yaml | 14 +++++++------- .../compute/htcondor-execute-point/README.md | 4 ++-- .../scheduler/htcondor-central-manager/README.md | 2 +- .../htcondor-central-manager/variables.tf | 2 +- .../scheduler/htcondor-pool-secrets/README.md | 14 +++++++------- .../{htcondor-base => htcondor-setup}/README.md | 6 +++--- .../{htcondor-base => htcondor-setup}/main.tf | 2 +- .../{htcondor-base => htcondor-setup}/outputs.tf | 0 .../{htcondor-base => htcondor-setup}/variables.tf | 0 .../{htcondor-base => htcondor-setup}/versions.tf | 2 +- .../modules/scripts/htcondor-install/README.md | 6 +++--- docs/gpu-support.md | 4 ++-- modules/README.md | 6 +++--- pkg/config/config.go | 2 +- pkg/modulereader/resreader.go | 2 +- 15 files changed, 33 insertions(+), 33 deletions(-) rename community/modules/scheduler/{htcondor-base => htcondor-setup}/README.md (98%) rename community/modules/scheduler/{htcondor-base => 
htcondor-setup}/main.tf (98%) rename community/modules/scheduler/{htcondor-base => htcondor-setup}/outputs.tf (100%) rename community/modules/scheduler/{htcondor-base => htcondor-setup}/variables.tf (100%) rename community/modules/scheduler/{htcondor-base => htcondor-setup}/versions.tf (91%) diff --git a/community/examples/htc-htcondor.yaml b/community/examples/htc-htcondor.yaml index 28f2f84f6f..462cd8691d 100644 --- a/community/examples/htc-htcondor.yaml +++ b/community/examples/htc-htcondor.yaml @@ -57,22 +57,22 @@ deployment_groups: - group: pool modules: - - id: htcondor_base - source: community/modules/scheduler/htcondor-base + - id: htcondor_setup + source: community/modules/scheduler/htcondor-setup use: - network1 - id: htcondor_secrets source: community/modules/scheduler/htcondor-pool-secrets use: - - htcondor_base + - htcondor_setup - id: htcondor_cm source: community/modules/scheduler/htcondor-central-manager use: - network1 - htcondor_secrets - - htcondor_base + - htcondor_setup settings: instance_image: project: $(vars.project_id) @@ -88,7 +88,7 @@ deployment_groups: use: - network1 - htcondor_secrets - - htcondor_base + - htcondor_setup - htcondor_cm settings: instance_image: @@ -101,7 +101,7 @@ deployment_groups: use: - network1 - htcondor_secrets - - htcondor_base + - htcondor_setup - htcondor_cm settings: instance_image: @@ -114,7 +114,7 @@ deployment_groups: use: - network1 - htcondor_secrets - - htcondor_base + - htcondor_setup - htcondor_cm - htcondor_execute_point - htcondor_execute_point_spot diff --git a/community/modules/compute/htcondor-execute-point/README.md b/community/modules/compute/htcondor-execute-point/README.md index 7c15da6922..d47faf0e99 100644 --- a/community/modules/compute/htcondor-execute-point/README.md +++ b/community/modules/compute/htcondor-execute-point/README.md @@ -6,11 +6,11 @@ This module performs the following tasks: - create a managed instance group (MIG) for execute points - create a Toolkit runner to configure the autoscaler to scale the MIG -It is expected to be used with the [htcondor-install] and [htcondor-base] +It is expected to be used with the [htcondor-install] and [htcondor-setup] modules. [htcondor-install]: ../../scripts/htcondor-install/README.md -[htcondor-base]: ../../scheduler/htcondor-configure/README.md +[htcondor-setup]: ../../scheduler/htcondor-configure/README.md ### Known limitations diff --git a/community/modules/scheduler/htcondor-central-manager/README.md b/community/modules/scheduler/htcondor-central-manager/README.md index de88946480..4f96c10834 100644 --- a/community/modules/scheduler/htcondor-central-manager/README.md +++ b/community/modules/scheduler/htcondor-central-manager/README.md @@ -117,7 +117,7 @@ limitations under the License. | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| | [central\_manager\_runner](#input\_central\_manager\_runner) | A list of Toolkit runners for configuring an HTCondor central manager | `list(map(string))` | `[]` | no | -| [central\_manager\_service\_account\_email](#input\_central\_manager\_service\_account\_email) | Service account e-mail for central manager (can be supplied by htcondor-base module) | `string` | n/a | yes | +| [central\_manager\_service\_account\_email](#input\_central\_manager\_service\_account\_email) | Service account e-mail for central manager (can be supplied by htcondor-setup module) | `string` | n/a | yes | | [deployment\_name](#input\_deployment\_name) | HPC Toolkit deployment name. 
HTCondor cloud resource names will include this value. | `string` | n/a | yes | | [disk\_size\_gb](#input\_disk\_size\_gb) | Boot disk size in GB | `number` | `20` | no | | [enable\_oslogin](#input\_enable\_oslogin) | Enable or Disable OS Login with "ENABLE" or "DISABLE". Set to "INHERIT" to inherit project OS Login setting. | `string` | `"ENABLE"` | no | diff --git a/community/modules/scheduler/htcondor-central-manager/variables.tf b/community/modules/scheduler/htcondor-central-manager/variables.tf index b248cac9cf..1844c3d93c 100644 --- a/community/modules/scheduler/htcondor-central-manager/variables.tf +++ b/community/modules/scheduler/htcondor-central-manager/variables.tf @@ -48,7 +48,7 @@ variable "network_self_link" { } variable "central_manager_service_account_email" { - description = "Service account e-mail for central manager (can be supplied by htcondor-base module)" + description = "Service account e-mail for central manager (can be supplied by htcondor-setup module)" type = string } diff --git a/community/modules/scheduler/htcondor-pool-secrets/README.md b/community/modules/scheduler/htcondor-pool-secrets/README.md index 01ade8698b..7dff208f4a 100644 --- a/community/modules/scheduler/htcondor-pool-secrets/README.md +++ b/community/modules/scheduler/htcondor-pool-secrets/README.md @@ -20,7 +20,7 @@ It is expected to be used with the [htcondor-install] and [htcondor-execute-point] modules. [hpcvmimage]: https://cloud.google.com/compute/docs/instances/create-hpc-vm -[htcondor-install]: ../../scripts/htcondor-base/README.md +[htcondor-install]: ../../scripts/htcondor-setup/README.md [htcondor-execute-point]: ../../compute/htcondor-execute-point/README.md [htcrole]: https://htcondor.readthedocs.io/en/latest/getting-htcondor/admin-quick-start.html#what-get-htcondor-does-to-configure-a-role @@ -40,15 +40,15 @@ example can be found in the [examples README][htc-example]. - id: htcondor_install source: community/modules/scripts/htcondor-install -- id: htcondor_base - source: community/modules/scheduler/htcondor-base +- id: htcondor_setup + source: community/modules/scheduler/htcondor-setup use: - network1 - id: htcondor_secrets source: community/modules/scheduler/htcondor-pool-secrets use: - - htcondor_base + - htcondor_setup - id: htcondor_startup_central_manager source: modules/scripts/startup-script @@ -56,7 +56,7 @@ example can be found in the [examples README][htc-example]. runners: - $(htcondor_install.install_htcondor_runner) - $(htcondor_secrets.central_manager_runner) - - $(htcondor_base.central_manager_runner) + - $(htcondor_setup.central_manager_runner) - id: htcondor_cm source: modules/compute/vm-instance @@ -68,14 +68,14 @@ example can be found in the [examples README][htc-example]. 
machine_type: c2-standard-4 disable_public_ips: true service_account: - email: $(htcondor_base.central_manager_service_account) + email: $(htcondor_setup.central_manager_service_account) scopes: - cloud-platform network_interfaces: - network: null subnetwork: $(network1.subnetwork_self_link) subnetwork_project: $(vars.project_id) - network_ip: $(htcondor_base.central_manager_internal_ip) + network_ip: $(htcondor_setup.central_manager_internal_ip) stack_type: null access_config: [] ipv6_access_config: [] diff --git a/community/modules/scheduler/htcondor-base/README.md b/community/modules/scheduler/htcondor-setup/README.md similarity index 98% rename from community/modules/scheduler/htcondor-base/README.md rename to community/modules/scheduler/htcondor-setup/README.md index aaca5a1abd..f59e7f3f8b 100644 --- a/community/modules/scheduler/htcondor-base/README.md +++ b/community/modules/scheduler/htcondor-setup/README.md @@ -19,7 +19,7 @@ It is expected to be used with the [htcondor-install] and [htcondor-execute-point] modules. [hpcvmimage]: https://cloud.google.com/compute/docs/instances/create-hpc-vm -[htcondor-install]: ../../scripts/htcondor-base/README.md +[htcondor-install]: ../../scripts/htcondor-setup/README.md [htcondor-execute-point]: ../../compute/htcondor-execute-point/README.md [htcrole]: https://htcondor.readthedocs.io/en/latest/getting-htcondor/admin-quick-start.html#what-get-htcondor-does-to-configure-a-role @@ -40,7 +40,7 @@ example can be found in the [examples README][htc-example]. source: community/modules/scripts/htcondor-install - id: htcondor_configure - source: community/modules/scheduler/htcondor-base + source: community/modules/scheduler/htcondor-setup use: - network1 @@ -101,7 +101,7 @@ vars: zone_secondary: us-central1-f - id: htcondor_configure - source: community/modules/scheduler/htcondor-base + source: community/modules/scheduler/htcondor-setup use: - network1 settings: diff --git a/community/modules/scheduler/htcondor-base/main.tf b/community/modules/scheduler/htcondor-setup/main.tf similarity index 98% rename from community/modules/scheduler/htcondor-base/main.tf rename to community/modules/scheduler/htcondor-setup/main.tf index 3572f729eb..a397445ea8 100644 --- a/community/modules/scheduler/htcondor-base/main.tf +++ b/community/modules/scheduler/htcondor-setup/main.tf @@ -16,7 +16,7 @@ locals { # This label allows for billing report tracking based on module. 
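+  # (merge() favors its last argument, so the ghpc_module value below always overrides any matching key in var.labels)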
- labels = merge(var.labels, { ghpc_module = "htcondor-base" }) + labels = merge(var.labels, { ghpc_module = "htcondor-setup" }) } locals { diff --git a/community/modules/scheduler/htcondor-base/outputs.tf b/community/modules/scheduler/htcondor-setup/outputs.tf similarity index 100% rename from community/modules/scheduler/htcondor-base/outputs.tf rename to community/modules/scheduler/htcondor-setup/outputs.tf diff --git a/community/modules/scheduler/htcondor-base/variables.tf b/community/modules/scheduler/htcondor-setup/variables.tf similarity index 100% rename from community/modules/scheduler/htcondor-base/variables.tf rename to community/modules/scheduler/htcondor-setup/variables.tf diff --git a/community/modules/scheduler/htcondor-base/versions.tf b/community/modules/scheduler/htcondor-setup/versions.tf similarity index 91% rename from community/modules/scheduler/htcondor-base/versions.tf rename to community/modules/scheduler/htcondor-setup/versions.tf index c3d459c250..115c5f50d6 100644 --- a/community/modules/scheduler/htcondor-base/versions.tf +++ b/community/modules/scheduler/htcondor-setup/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:htcondor-base/v1.20.0" + module_name = "blueprints/terraform/hpc-toolkit:htcondor-setup/v1.20.0" } required_version = ">= 0.13.0" diff --git a/community/modules/scripts/htcondor-install/README.md b/community/modules/scripts/htcondor-install/README.md index 7b2d07b9f5..f94dc93ce5 100644 --- a/community/modules/scripts/htcondor-install/README.md +++ b/community/modules/scripts/htcondor-install/README.md @@ -13,18 +13,18 @@ Debian or Ubuntu distributions. It also exports a list of Google Cloud APIs which must be enabled prior to provisioning an HTCondor Pool. -It is expected to be used with the [htcondor-base] and +It is expected to be used with the [htcondor-setup] and [htcondor-execute-point] modules. [hpcvmimage]: https://cloud.google.com/compute/docs/instances/create-hpc-vm -[htcondor-base]: ../../scheduler/htcondor-configure/README.md +[htcondor-setup]: ../../scheduler/htcondor-configure/README.md [htcondor-execute-point]: ../../compute/htcondor-execute-point/README.md ### Example The following code snippet uses this module to create startup scripts that install the HTCondor software and adds custom configurations using -[htcondor-base] and [htcondor-execute-point]. +[htcondor-setup] and [htcondor-execute-point]. ```yaml - id: htcondor_install diff --git a/docs/gpu-support.md b/docs/gpu-support.md index d7722f6eb1..4d677524bb 100644 --- a/docs/gpu-support.md +++ b/docs/gpu-support.md @@ -3,7 +3,7 @@ ## Supported modules * [vm-instance] and therefore any module that relies on `vm-instance` including: - * HTCondor modules including [htcondor-install], [htcondor-base] and + * HTCondor modules including [htcondor-install], [htcondor-setup] and [htcondor-execute-point]. * [omnia-install] * Slurm on GCP modules where applicable, both version 4 and version 5 @@ -47,7 +47,7 @@ cannot be determined automatically like with `a2`. 
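+
+A hedged illustration of supplying these values explicitly (the machine type
+and accelerator model here are assumptions; choose hardware available in your
+zone):
+
+```yaml
+  - id: gpu_node
+    source: modules/compute/vm-instance
+    use:
+    - network1
+    settings:
+      machine_type: n1-standard-8
+      on_host_maintenance: TERMINATE
+      guest_accelerator:
+      - type: nvidia-tesla-t4
+        count: 1
+```
+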
[login]: https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/main/community/modules/scheduler/schedmd-slurm-gcp-v5-login [omnia-install]: https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/main/community/modules/scripts/omnia-install [htcondor-install]: https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/main/community/modules/scripts/htcondor-install -[htcondor-base]: https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/main/community/modules/scheduler/htcondor-configure +[htcondor-setup]: https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/main/community/modules/scheduler/htcondor-configure [htcondor-execute-point]: https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/main/community/modules/compute/htcondor-execute-point ## Troubleshooting and tips diff --git a/modules/README.md b/modules/README.md index f4ea466766..cb8721e36c 100644 --- a/modules/README.md +++ b/modules/README.md @@ -45,7 +45,7 @@ Modules that are still in development and less stable are labeled with the Kubernetes job file to be used with a [gke-node-pool]. * **[htcondor-execute-point]** ![community-badge] ![experimental-badge] : Manages a group of execute points for use in an [HTCondor - pool][htcondor-base]. + pool][htcondor-setup]. * **[pbspro-execution]** ![community-badge] ![experimental-badge] : Creates execution hosts for use in a PBS Professional cluster. * **[SchedMD-slurm-on-gcp-partition]** ![community-badge] ![deprecated-badge] : Creates a partition @@ -153,7 +153,7 @@ Modules that are still in development and less stable are labeled with the Creates a Slurm login node using [slurm-gcp-version-5]. * **[schedmd-slurm-gcp-v5-hybrid]** ![community-badge] ![experimental-badge] : Creates hybrid Slurm partition configuration files using [slurm-gcp-version-5]. -* **[htcondor-base]** ![community-badge] ![experimental-badge] : Creates the +* **[htcondor-setup]** ![community-badge] ![experimental-badge] : Creates the base infrastructure for an HTCondor pool (service accounts and Cloud Storage bucket). 
* **[htcondor-pool-secrets]** ![community-badge] ![experimental-badge] : Creates and manages access to the secrets necessary for secure operation of an @@ -173,7 +173,7 @@ Modules that are still in development and less stable are labeled with the [batch-job-template]: ../modules/scheduler/batch-job-template/README.md [batch-login-node]: ../modules/scheduler/batch-login-node/README.md [gke-cluster]: ../community/modules/scheduler/gke-cluster/README.md -[htcondor-base]: ../community/modules/scheduler/htcondor-base/README.md +[htcondor-setup]: ../community/modules/scheduler/htcondor-setup/README.md [htcondor-pool-secrets]: ../community/modules/scheduler/htcondor-pool-secrets/README.md [htcondor-access-point]: ../community/modules/scheduler/htcondor-access-point/README.md [schedmd-slurm-gcp-v5-controller]: ../community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md diff --git a/pkg/config/config.go b/pkg/config/config.go index 0e17119aa5..3a6ecbcf64 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -75,7 +75,7 @@ var errorMessages = map[string]string{ var movedModules = map[string]string{ "community/modules/scheduler/cloud-batch-job": "modules/scheduler/batch-job-template", "community/modules/scheduler/cloud-batch-login-node": "modules/scheduler/batch-login-node", - "community/modules/scheduler/htcondor-configure": "community/modules/scheduler/htcondor-base", + "community/modules/scheduler/htcondor-configure": "community/modules/scheduler/htcondor-setup", } // GroupName is the name of a deployment group diff --git a/pkg/modulereader/resreader.go b/pkg/modulereader/resreader.go index 4c4805ef93..94299ed58e 100644 --- a/pkg/modulereader/resreader.go +++ b/pkg/modulereader/resreader.go @@ -265,7 +265,7 @@ func defaultAPIList(source string) []string { "compute.googleapis.com", "storage.googleapis.com", }, - "community/modules/scheduler/htcondor-base": { + "community/modules/scheduler/htcondor-setup": { "iam.googleapis.com", "secretmanager.googleapis.com", "storage.googleapis.com", From 03318d7549edd78c63a0ee0b63d9dea195440726 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Thu, 20 Jul 2023 17:15:32 -0500 Subject: [PATCH 085/144] Add required APIs for new HTCondor modules --- pkg/modulereader/resreader.go | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/pkg/modulereader/resreader.go b/pkg/modulereader/resreader.go index 94299ed58e..9b2acc7e75 100644 --- a/pkg/modulereader/resreader.go +++ b/pkg/modulereader/resreader.go @@ -212,6 +212,7 @@ func defaultAPIList(source string) []string { }, "community/modules/compute/htcondor-execute-point": { "compute.googleapis.com", + "storage.googleapis.com", }, "community/modules/compute/pbspro-execution": { "compute.googleapis.com", @@ -265,9 +266,20 @@ func defaultAPIList(source string) []string { "compute.googleapis.com", "storage.googleapis.com", }, - "community/modules/scheduler/htcondor-setup": { + "community/modules/scheduler/htcondor-access-point": { + "compute.googleapis.com", + "storage.googleapis.com", + }, + "community/modules/scheduler/htcondor-central-manager": { + "compute.googleapis.com", + "storage.googleapis.com", + }, + "community/modules/scheduler/htcondor-pool-secrets": { "iam.googleapis.com", "secretmanager.googleapis.com", + }, + "community/modules/scheduler/htcondor-setup": { + "iam.googleapis.com", "storage.googleapis.com", }, "community/modules/scheduler/pbspro-client": { From e0ff78cef4cfd28d627e75ea85189681032d1585 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" 
<49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 21 Jul 2023 10:25:37 -0700 Subject: [PATCH 086/144] Bump google.golang.org/api from 0.130.0 to 0.132.0 (#1613) Bumps [google.golang.org/api](https://github.com/googleapis/google-api-go-client) from 0.130.0 to 0.132.0. - [Release notes](https://github.com/googleapis/google-api-go-client/releases) - [Changelog](https://github.com/googleapis/google-api-go-client/blob/main/CHANGES.md) - [Commits](https://github.com/googleapis/google-api-go-client/compare/v0.130.0...v0.132.0) --- updated-dependencies: - dependency-name: google.golang.org/api dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- go.mod | 22 +++++++++++----------- go.sum | 44 ++++++++++++++++++++++---------------------- 2 files changed, 33 insertions(+), 33 deletions(-) diff --git a/go.mod b/go.mod index 0d4becc755..e0ccbbd62d 100644 --- a/go.mod +++ b/go.mod @@ -3,8 +3,8 @@ module hpc-toolkit go 1.18 require ( - cloud.google.com/go/compute v1.19.3 // indirect - cloud.google.com/go/storage v1.28.1 // indirect + cloud.google.com/go/compute v1.20.1 // indirect + cloud.google.com/go/storage v1.30.1 // indirect github.com/go-git/go-git/v5 v5.7.0 github.com/hashicorp/go-getter v1.7.1 github.com/hashicorp/hcl v1.0.0 // indirect @@ -16,7 +16,7 @@ require ( github.com/spf13/cobra v1.7.0 github.com/zclconf/go-cty v1.13.2 golang.org/x/exp v0.0.0-20230108222341-4b8118a2686a - google.golang.org/genproto v0.0.0-20230530153820-e85fd2cbaebc // indirect + google.golang.org/genproto v0.0.0-20230706204954-ccb25ca9f130 // indirect gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c gopkg.in/yaml.v3 v3.0.1 ) @@ -26,20 +26,20 @@ require ( github.com/google/go-cmp v0.5.9 github.com/hashicorp/terraform-exec v0.18.1 github.com/zclconf/go-cty-debug v0.0.0-20191215020915-b22d67c1ba0b - google.golang.org/api v0.130.0 + google.golang.org/api v0.132.0 ) require ( - github.com/googleapis/gax-go/v2 v2.11.0 // indirect + github.com/googleapis/gax-go/v2 v2.12.0 // indirect github.com/hashicorp/terraform-json v0.15.0 // indirect - google.golang.org/genproto/googleapis/api v0.0.0-20230530153820-e85fd2cbaebc // indirect - google.golang.org/genproto/googleapis/rpc v0.0.0-20230629202037-9506855d4529 // indirect + google.golang.org/genproto/googleapis/api v0.0.0-20230706204954-ccb25ca9f130 // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20230711160842-782d3b101e98 // indirect ) require ( - cloud.google.com/go v0.110.0 // indirect + cloud.google.com/go v0.110.4 // indirect cloud.google.com/go/compute/metadata v0.2.3 // indirect - cloud.google.com/go/iam v0.13.0 // indirect + cloud.google.com/go/iam v1.1.0 // indirect github.com/Microsoft/go-winio v0.5.2 // indirect github.com/ProtonMail/go-crypto v0.0.0-20230518184743-7afd39499903 // indirect github.com/acomagu/bufpipe v1.0.4 // indirect @@ -79,12 +79,12 @@ require ( go.opencensus.io v0.24.0 // indirect golang.org/x/crypto v0.11.0 // indirect golang.org/x/net v0.12.0 // indirect - golang.org/x/oauth2 v0.9.0 // indirect + golang.org/x/oauth2 v0.10.0 // indirect golang.org/x/sys v0.10.0 golang.org/x/text v0.11.0 // indirect golang.org/x/xerrors v0.0.0-20220907171357-04be3eba64a2 // indirect google.golang.org/appengine v1.6.7 // indirect - google.golang.org/grpc v1.56.1 // indirect + google.golang.org/grpc v1.56.2 // indirect google.golang.org/protobuf v1.31.0 // indirect gopkg.in/warnings.v0 v0.1.2 // 
indirect ) diff --git a/go.sum b/go.sum index c243e5c0a3..98c32ef24c 100644 --- a/go.sum +++ b/go.sum @@ -32,8 +32,8 @@ cloud.google.com/go v0.100.2/go.mod h1:4Xra9TjzAeYHrl5+oeLlzbM2k3mjVhZh4UqTZ//w9 cloud.google.com/go v0.102.0/go.mod h1:oWcCzKlqJ5zgHQt9YsaeTY9KzIvjyy0ArmiBUgpQ+nc= cloud.google.com/go v0.102.1/go.mod h1:XZ77E9qnTEnrgEOvr4xzfdX5TRo7fB4T2F4O6+34hIU= cloud.google.com/go v0.104.0/go.mod h1:OO6xxXdJyvuJPcEPBLN9BJPD+jep5G1+2U5B5gkRYtA= -cloud.google.com/go v0.110.0 h1:Zc8gqp3+a9/Eyph2KDmcGaPtbKRIoqq4YTlL4NMD0Ys= -cloud.google.com/go v0.110.0/go.mod h1:SJnCLqQ0FCFGSZMUNUf84MV3Aia54kn7pi8st7tMzaY= +cloud.google.com/go v0.110.4 h1:1JYyxKMN9hd5dR2MYTPWkGUgcoxVVhg0LKNKEo0qvmk= +cloud.google.com/go v0.110.4/go.mod h1:+EYjdK8e5RME/VY/qLCAtuyALQ9q67dvuum8i+H5xsI= cloud.google.com/go/aiplatform v1.22.0/go.mod h1:ig5Nct50bZlzV6NvKaTwmplLLddFx0YReh9WfTO5jKw= cloud.google.com/go/aiplatform v1.24.0/go.mod h1:67UUvRBKG6GTayHKV8DBv2RtR1t93YRu5B1P3x99mYY= cloud.google.com/go/analytics v0.11.0/go.mod h1:DjEWCu41bVbYcKyvlws9Er60YE4a//bK6mnhWvQeFNI= @@ -70,8 +70,8 @@ cloud.google.com/go/compute v1.6.0/go.mod h1:T29tfhtVbq1wvAPo0E3+7vhgmkOYeXjhFvz cloud.google.com/go/compute v1.6.1/go.mod h1:g85FgpzFvNULZ+S8AYq87axRKuf2Kh7deLqV/jJ3thU= cloud.google.com/go/compute v1.7.0/go.mod h1:435lt8av5oL9P3fv1OEzSbSUe+ybHXGMPQHHZWZxy9U= cloud.google.com/go/compute v1.10.0/go.mod h1:ER5CLbMxl90o2jtNbGSbtfOpQKR0t15FOtRsugnLrlU= -cloud.google.com/go/compute v1.19.3 h1:DcTwsFgGev/wV5+q8o2fzgcHOaac+DKGC91ZlvpsQds= -cloud.google.com/go/compute v1.19.3/go.mod h1:qxvISKp/gYnXkSAD1ppcSOveRAmzxicEv/JlizULFrI= +cloud.google.com/go/compute v1.20.1 h1:6aKEtlUiwEpJzM001l0yFkpXmUVXaN8W+fbkb2AZNbg= +cloud.google.com/go/compute v1.20.1/go.mod h1:4tCnrn48xsqlwSAiLf1HXMQk8CONslYbdiEZc9FEIbM= cloud.google.com/go/compute/metadata v0.2.3 h1:mg4jlk7mCAj6xXp9UJ4fjI9VUI5rubuGBW5aJ7UnBMY= cloud.google.com/go/compute/metadata v0.2.3/go.mod h1:VAV5nSsACxMJvgaAuX6Pk2AawlZn8kiOGuCv6gTkwuA= cloud.google.com/go/containeranalysis v0.5.1/go.mod h1:1D92jd8gRR/c0fGMlymRgxWD3Qw9C1ff6/T7mLgVL8I= @@ -111,8 +111,8 @@ cloud.google.com/go/gkehub v0.10.0/go.mod h1:UIPwxI0DsrpsVoWpLB0stwKCP+WFVG9+y97 cloud.google.com/go/grafeas v0.2.0/go.mod h1:KhxgtF2hb0P191HlY5besjYm6MqTSTj3LSI+M+ByZHc= cloud.google.com/go/iam v0.3.0/go.mod h1:XzJPvDayI+9zsASAFO68Hk07u3z+f+JrT2xXNdp4bnY= cloud.google.com/go/iam v0.5.0/go.mod h1:wPU9Vt0P4UmCux7mqtRu6jcpPAb74cP1fh50J3QpkUc= -cloud.google.com/go/iam v0.13.0 h1:+CmB+K0J/33d0zSQ9SlFWUeCCEn5XJA0ZMZ3pHE9u8k= -cloud.google.com/go/iam v0.13.0/go.mod h1:ljOg+rcNfzZ5d6f1nAUJ8ZIxOaZUVoS14bKCtaLZ/D0= +cloud.google.com/go/iam v1.1.0 h1:67gSqaPukx7O8WLLHMa0PNs3EBGd2eE4d+psbO/CO94= +cloud.google.com/go/iam v1.1.0/go.mod h1:nxdHjaKfCr7fNYx/HJMM8LgiMugmveWlkatear5gVyk= cloud.google.com/go/language v1.4.0/go.mod h1:F9dRpNFQmJbkaop6g0JhSBXCNlO90e1KWx5iDdxbWic= cloud.google.com/go/language v1.6.0/go.mod h1:6dJ8t3B+lUYfStgls25GusK04NLh3eDLQnWM3mdEbhI= cloud.google.com/go/lifesciences v0.5.0/go.mod h1:3oIKy8ycWGPUyZDR/8RNnTOYevhaMLqh5vLUXs9zvT8= @@ -174,8 +174,8 @@ cloud.google.com/go/storage v1.14.0/go.mod h1:GrKmX003DSIwi9o29oFT7YDnHYwZoctc3f cloud.google.com/go/storage v1.22.1/go.mod h1:S8N1cAStu7BOeFfE8KAQzmyyLkK8p/vmRq6kuBTW58Y= cloud.google.com/go/storage v1.23.0/go.mod h1:vOEEDNFnciUMhBeT6hsJIn3ieU5cFRmzeLgDvXzfIXc= cloud.google.com/go/storage v1.27.0/go.mod h1:x9DOL8TK/ygDUMieqwfhdpQryTeEkhGKMi80i/iqR2s= -cloud.google.com/go/storage v1.28.1 h1:F5QDG5ChchaAVQhINh24U99OWHURqrW8OmQcGKXcbgI= -cloud.google.com/go/storage v1.28.1/go.mod 
h1:Qnisd4CqDdo6BGs2AD5LLnEsmSQ80wQ5ogcBBKhU86Y= +cloud.google.com/go/storage v1.30.1 h1:uOdMxAs8HExqBlnLtnQyP0YkvbiDpdGShGKtx6U/oNM= +cloud.google.com/go/storage v1.30.1/go.mod h1:NfxhC0UJE1aXSx7CIIbCf7y9HKT7BiccwkR7+P7gN8E= cloud.google.com/go/talent v1.1.0/go.mod h1:Vl4pt9jiHKvOgF9KoZo6Kob9oV4lwd/ZD5Cto54zDRw= cloud.google.com/go/talent v1.2.0/go.mod h1:MoNF9bhFQbiJ6eFD3uSsg0uBALw4n4gaCaEjBw9zo8g= cloud.google.com/go/videointelligence v1.6.0/go.mod h1:w0DIDlVRKtwPCn/C4iwZIJdvC69yInhW0cfi+p546uU= @@ -357,8 +357,8 @@ github.com/googleapis/gax-go/v2 v2.3.0/go.mod h1:b8LNqSzNabLiUpXKkY7HAR5jr6bIT99 github.com/googleapis/gax-go/v2 v2.4.0/go.mod h1:XOTVJ59hdnfJLIP/dh8n5CGryZR2LxK9wbMD5+iXC6c= github.com/googleapis/gax-go/v2 v2.5.1/go.mod h1:h6B0KMMFNtI2ddbGJn3T3ZbwkeT6yqEF02fYlzkUCyo= github.com/googleapis/gax-go/v2 v2.6.0/go.mod h1:1mjbznJAPHFpesgE5ucqfYEscaz5kMdcIDwU/6+DDoY= -github.com/googleapis/gax-go/v2 v2.11.0 h1:9V9PWXEsWnPpQhu/PeQIkS4eGzMlTLGgt80cUUI8Ki4= -github.com/googleapis/gax-go/v2 v2.11.0/go.mod h1:DxmR61SGKkGLa2xigwuZIQpkCI2S5iydzRfb3peWZJI= +github.com/googleapis/gax-go/v2 v2.12.0 h1:A+gCJKdRfqXkr+BIRGtZLibNXf0m1f9E4HG56etFpas= +github.com/googleapis/gax-go/v2 v2.12.0/go.mod h1:y+aIqrI5eb1YGMVJfuV3185Ts/D7qKpsEkdD5+I6QGU= github.com/googleapis/go-type-adapters v1.0.0/go.mod h1:zHW75FOG2aur7gAO2B+MLby+cLsWGBF62rFAi7WjWO4= github.com/googleapis/google-cloud-go-testing v0.0.0-20200911160855-bcd43fbb19e8/go.mod h1:dvDLG8qkwmyD9a/MJJN3XJcT3xFxOKAvTZGvuZmac9g= github.com/grpc-ecosystem/grpc-gateway v1.16.0/go.mod h1:BDjrQk3hbvj6Nolgz8mAMFbcEtjT1g+wF4CSlocrBnw= @@ -628,8 +628,8 @@ golang.org/x/oauth2 v0.0.0-20220822191816-0ebed06d0094/go.mod h1:h4gKUeWbJ4rQPri golang.org/x/oauth2 v0.0.0-20220909003341-f21342109be1/go.mod h1:h4gKUeWbJ4rQPri7E0u6Gs4e9Ri2zaLxzw5DI5XGrYg= golang.org/x/oauth2 v0.0.0-20221014153046-6fdb5e3db783/go.mod h1:h4gKUeWbJ4rQPri7E0u6Gs4e9Ri2zaLxzw5DI5XGrYg= golang.org/x/oauth2 v0.1.0/go.mod h1:G9FE4dLTsbXUu90h/Pf85g4w1D+SSAgR+q46nJZ8M4A= -golang.org/x/oauth2 v0.9.0 h1:BPpt2kU7oMRq3kCHAA1tbSEshXRw1LpG2ztgDwrzuAs= -golang.org/x/oauth2 v0.9.0/go.mod h1:qYgFZaFiu6Wg24azG8bdV52QJXJGbZzIIsRCdVKzbLw= +golang.org/x/oauth2 v0.10.0 h1:zHCpF2Khkwy4mMB4bv0U37YtJdTGW8jI0glAApi0Kh8= +golang.org/x/oauth2 v0.10.0/go.mod h1:kTpgurOux7LqtuxjuyZa4Gj2gdezIt/jQtGnNFfypQI= golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= @@ -855,8 +855,8 @@ google.golang.org/api v0.96.0/go.mod h1:w7wJQLTM+wvQpNf5JyEcBoxK0RH7EDrh/L4qfsuJ google.golang.org/api v0.97.0/go.mod h1:w7wJQLTM+wvQpNf5JyEcBoxK0RH7EDrh/L4qfsuJ13s= google.golang.org/api v0.98.0/go.mod h1:w7wJQLTM+wvQpNf5JyEcBoxK0RH7EDrh/L4qfsuJ13s= google.golang.org/api v0.100.0/go.mod h1:ZE3Z2+ZOr87Rx7dqFsdRQkRBk36kDtp/h+QpHbB7a70= -google.golang.org/api v0.130.0 h1:A50ujooa1h9iizvfzA4rrJr2B7uRmWexwbekQ2+5FPQ= -google.golang.org/api v0.130.0/go.mod h1:J/LCJMYSDFvAVREGCbrESb53n4++NMBDetSHGL5I5RY= +google.golang.org/api v0.132.0 h1:8t2/+qZ26kAOGSmOiHwVycqVaDg7q3JDILrNi/Z6rvc= +google.golang.org/api v0.132.0/go.mod h1:AeTBC6GpJnJSRJjktDcPX0QwtS8pGYZOV6MSuSCusw0= google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= google.golang.org/appengine v1.5.0/go.mod 
h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= @@ -968,12 +968,12 @@ google.golang.org/genproto v0.0.0-20221010155953-15ba04fc1c0e/go.mod h1:3526vdqw google.golang.org/genproto v0.0.0-20221014173430-6e2ab493f96b/go.mod h1:1vXfmgAz9N9Jx0QA82PqRVauvCz1SGSz739p0f183jM= google.golang.org/genproto v0.0.0-20221014213838-99cd37c6964a/go.mod h1:1vXfmgAz9N9Jx0QA82PqRVauvCz1SGSz739p0f183jM= google.golang.org/genproto v0.0.0-20221025140454-527a21cfbd71/go.mod h1:9qHF0xnpdSfF6knlcsnpzUu5y+rpwgbvsyGAZPBMg4s= -google.golang.org/genproto v0.0.0-20230530153820-e85fd2cbaebc h1:8DyZCyvI8mE1IdLy/60bS+52xfymkE72wv1asokgtao= -google.golang.org/genproto v0.0.0-20230530153820-e85fd2cbaebc/go.mod h1:xZnkP7mREFX5MORlOPEzLMr+90PPZQ2QWzrVTWfAq64= -google.golang.org/genproto/googleapis/api v0.0.0-20230530153820-e85fd2cbaebc h1:kVKPf/IiYSBWEWtkIn6wZXwWGCnLKcC8oWfZvXjsGnM= -google.golang.org/genproto/googleapis/api v0.0.0-20230530153820-e85fd2cbaebc/go.mod h1:vHYtlOoi6TsQ3Uk2yxR7NI5z8uoV+3pZtR4jmHIkRig= -google.golang.org/genproto/googleapis/rpc v0.0.0-20230629202037-9506855d4529 h1:DEH99RbiLZhMxrpEJCZ0A+wdTe0EOgou/poSLx9vWf4= -google.golang.org/genproto/googleapis/rpc v0.0.0-20230629202037-9506855d4529/go.mod h1:66JfowdXAEgad5O9NnYcsNPLCPZJD++2L9X0PCMODrA= +google.golang.org/genproto v0.0.0-20230706204954-ccb25ca9f130 h1:Au6te5hbKUV8pIYWHqOUZ1pva5qK/rwbIhoXEUB9Lu8= +google.golang.org/genproto v0.0.0-20230706204954-ccb25ca9f130/go.mod h1:O9kGHb51iE/nOGvQaDUuadVYqovW56s5emA88lQnj6Y= +google.golang.org/genproto/googleapis/api v0.0.0-20230706204954-ccb25ca9f130 h1:XVeBY8d/FaK4848myy41HBqnDwvxeV3zMZhwN1TvAMU= +google.golang.org/genproto/googleapis/api v0.0.0-20230706204954-ccb25ca9f130/go.mod h1:mPBs5jNgx2GuQGvFwUvVKqtn6HsUw9nP64BedgvqEsQ= +google.golang.org/genproto/googleapis/rpc v0.0.0-20230711160842-782d3b101e98 h1:bVf09lpb+OJbByTj913DRJioFFAjf/ZGxEz7MajTp2U= +google.golang.org/genproto/googleapis/rpc v0.0.0-20230711160842-782d3b101e98/go.mod h1:TUfxEVdsvPg18p6AslUXFoLdpED4oBnGwyqk3dV1XzM= google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c= google.golang.org/grpc v1.20.1/go.mod h1:10oTOabMzJvdu6/UiuZezV6QK5dSlG84ov/aaiqXj38= google.golang.org/grpc v1.21.1/go.mod h1:oYelfM1adQP15Ek0mdvEgi9Df8B9CZIaU1084ijfRaM= @@ -1009,8 +1009,8 @@ google.golang.org/grpc v1.48.0/go.mod h1:vN9eftEi1UMyUsIF80+uQXhHjbXYbm0uXoFCACu google.golang.org/grpc v1.49.0/go.mod h1:ZgQEeidpAuNRZ8iRrlBKXZQP1ghovWIVhdJRyCDK+GI= google.golang.org/grpc v1.50.0/go.mod h1:ZgQEeidpAuNRZ8iRrlBKXZQP1ghovWIVhdJRyCDK+GI= google.golang.org/grpc v1.50.1/go.mod h1:ZgQEeidpAuNRZ8iRrlBKXZQP1ghovWIVhdJRyCDK+GI= -google.golang.org/grpc v1.56.1 h1:z0dNfjIl0VpaZ9iSVjA6daGatAYwPGstTjt5vkRMFkQ= -google.golang.org/grpc v1.56.1/go.mod h1:I9bI3vqKfayGqPUAwGdOSu7kt6oIJLixfffKrpXqQ9s= +google.golang.org/grpc v1.56.2 h1:fVRFRnXvU+x6C4IlHZewvJOVHoOv1TUuQyoRsYnB4bI= +google.golang.org/grpc v1.56.2/go.mod h1:I9bI3vqKfayGqPUAwGdOSu7kt6oIJLixfffKrpXqQ9s= google.golang.org/grpc/cmd/protoc-gen-go-grpc v1.1.0/go.mod h1:6Kw0yEErY5E/yWrBtf03jp27GLLJujG4z/JK95pnjjw= google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8= google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0= From 43cd72c2aba032f41b3686cd7ba2f5ad918dadc7 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Fri, 21 Jul 2023 21:24:44 -0500 Subject: [PATCH 087/144] Remove unused variable --- .../compute/htcondor-execute-point/files/htcondor_configure.yml | 1 - 1 file changed, 
1 deletion(-) diff --git a/community/modules/compute/htcondor-execute-point/files/htcondor_configure.yml b/community/modules/compute/htcondor-execute-point/files/htcondor_configure.yml index 44fb1e4833..375ae036cd 100644 --- a/community/modules/compute/htcondor-execute-point/files/htcondor_configure.yml +++ b/community/modules/compute/htcondor-execute-point/files/htcondor_configure.yml @@ -17,7 +17,6 @@ hosts: localhost become: true vars: - job_queue_ha: false spool_dir: /var/lib/condor/spool condor_config_root: /etc/condor ghpc_config_file: 50-ghpc-managed From cfa6b7c72b8077a119b06ab7f49e9fe2ea40dd31 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Fri, 21 Jul 2023 21:26:12 -0500 Subject: [PATCH 088/144] Fix unnecessary escaping of HTCondor macros --- community/modules/compute/htcondor-execute-point/README.md | 6 +++--- community/modules/scheduler/htcondor-access-point/main.tf | 6 +++--- docs/tutorials/htcondor.md | 6 +++--- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/community/modules/compute/htcondor-execute-point/README.md b/community/modules/compute/htcondor-execute-point/README.md index d47faf0e99..25106c347c 100644 --- a/community/modules/compute/htcondor-execute-point/README.md +++ b/community/modules/compute/htcondor-execute-point/README.md @@ -43,9 +43,9 @@ attribute is false. A job submit file may override this value as shown below. universe = vanilla executable = /bin/echo arguments = "Hello, World!" -output = out.\$(ClusterId).\$(ProcId) -error = err.\$(ClusterId).\$(ProcId) -log = log.\$(ClusterId).\$(ProcId) +output = out.$(ClusterId).$(ProcId) +error = err.$(ClusterId).$(ProcId) +log = log.$(ClusterId).$(ProcId) request_cpus = 1 request_memory = 100MB +RequireSpot = true diff --git a/community/modules/scheduler/htcondor-access-point/main.tf b/community/modules/scheduler/htcondor-access-point/main.tf index 5d19301658..deb5e031f0 100644 --- a/community/modules/scheduler/htcondor-access-point/main.tf +++ b/community/modules/scheduler/htcondor-access-point/main.tf @@ -38,9 +38,9 @@ locals { universe = vanilla executable = /bin/sleep arguments = 1000 - output = out.\$(ClusterId).\$(ProcId) - error = err.\$(ClusterId).\$(ProcId) - log = log.\$(ClusterId).\$(ProcId) + output = out.$(ClusterId).$(ProcId) + error = err.$(ClusterId).$(ProcId) + log = log.$(ClusterId).$(ProcId) request_cpus = 1 request_memory = 100MB # if unset, defaults to false diff --git a/docs/tutorials/htcondor.md b/docs/tutorials/htcondor.md index 1f7c7b9932..51349fd4b4 100644 --- a/docs/tutorials/htcondor.md +++ b/docs/tutorials/htcondor.md @@ -160,9 +160,9 @@ The job "submit file" will resemble: universe = vanilla executable = /bin/echo arguments = "Hello, World!" -output = out.\$(ClusterId).\$(ProcId) -error = err.\$(ClusterId).\$(ProcId) -log = log.\$(ClusterId).\$(ProcId) +output = out.$(ClusterId).$(ProcId) +error = err.$(ClusterId).$(ProcId) +log = log.$(ClusterId).$(ProcId) request_cpus = 1 request_memory = 100MB queue From 97ef33e5c2cd06c4cb71dcfc35880d14a1502e91 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 24 Jul 2023 10:46:58 +0000 Subject: [PATCH 089/144] Bump github.com/go-git/go-git/v5 from 5.7.0 to 5.8.0 Bumps [github.com/go-git/go-git/v5](https://github.com/go-git/go-git) from 5.7.0 to 5.8.0. 
- [Release notes](https://github.com/go-git/go-git/releases) - [Commits](https://github.com/go-git/go-git/compare/v5.7.0...v5.8.0) --- updated-dependencies: - dependency-name: github.com/go-git/go-git/v5 dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- go.mod | 2 +- go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/go.mod b/go.mod index e0ccbbd62d..024acc1dd8 100644 --- a/go.mod +++ b/go.mod @@ -5,7 +5,7 @@ go 1.18 require ( cloud.google.com/go/compute v1.20.1 // indirect cloud.google.com/go/storage v1.30.1 // indirect - github.com/go-git/go-git/v5 v5.7.0 + github.com/go-git/go-git/v5 v5.8.0 github.com/hashicorp/go-getter v1.7.1 github.com/hashicorp/hcl v1.0.0 // indirect github.com/hashicorp/hcl/v2 v2.17.0 diff --git a/go.sum b/go.sum index 98c32ef24c..2abb7eeba9 100644 --- a/go.sum +++ b/go.sum @@ -256,8 +256,8 @@ github.com/go-git/gcfg v1.5.1-0.20230307220236-3a3c6141e376/go.mod h1:an3vInlBmS github.com/go-git/go-billy/v5 v5.4.1 h1:Uwp5tDRkPr+l/TnbHOQzp+tmJfLceOlbVucgpTz8ix4= github.com/go-git/go-billy/v5 v5.4.1/go.mod h1:vjbugF6Fz7JIflbVpl1hJsGjSHNltrSw45YK/ukIvQg= github.com/go-git/go-git-fixtures/v4 v4.3.2-0.20230305113008-0c11038e723f h1:Pz0DHeFij3XFhoBRGUDPzSJ+w2UcK5/0JvF8DRI58r8= -github.com/go-git/go-git/v5 v5.7.0 h1:t9AudWVLmqzlo+4bqdf7GY+46SUuRsx59SboFxkq2aE= -github.com/go-git/go-git/v5 v5.7.0/go.mod h1:coJHKEOk5kUClpsNlXrUvPrDxY3w3gjHvhcZd8Fodw8= +github.com/go-git/go-git/v5 v5.8.0 h1:Rc543s6Tyq+YcyPwZRvU4jzZGM8rB/wWu94TnTIYALQ= +github.com/go-git/go-git/v5 v5.8.0/go.mod h1:coJHKEOk5kUClpsNlXrUvPrDxY3w3gjHvhcZd8Fodw8= github.com/go-gl/glfw v0.0.0-20190409004039-e6da0acd62b1/go.mod h1:vR7hzQXu2zJy9AVAgeJqvqgH9Q5CA+iKCZ2gyEVpxRU= github.com/go-gl/glfw/v3.3/glfw v0.0.0-20191125211704-12ad95a8df72/go.mod h1:tQ2UAYgL5IevRw8kRxooKSPJfGvJ9fJQFa0TUsXzTg8= github.com/go-gl/glfw/v3.3/glfw v0.0.0-20200222043503-6f7a984d4dc4/go.mod h1:tQ2UAYgL5IevRw8kRxooKSPJfGvJ9fJQFa0TUsXzTg8= From db57668fd6a5b551eccdc0913d64259afbdd88cf Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Mon, 24 Jul 2023 10:25:56 -0500 Subject: [PATCH 090/144] Disambiguate deployment_name for Packer/Slurm test - existing name is long enough that, when shortened by Slurm v5 modules, it results in project metadata keys that are no longer unique --- tools/cloud-build/daily-tests/tests/packer.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/cloud-build/daily-tests/tests/packer.yml b/tools/cloud-build/daily-tests/tests/packer.yml index 50c1691446..462d335768 100644 --- a/tools/cloud-build/daily-tests/tests/packer.yml +++ b/tools/cloud-build/daily-tests/tests/packer.yml @@ -15,7 +15,7 @@ --- test_name: packer -deployment_name: packer-image-{{ build }} +deployment_name: pkr{{ build }} zone: us-central1-c workspace: /workspace blueprint_yaml: "{{ workspace }}/examples/image-builder.yaml" From bcdac1b92ec36421b45a5141d6031729420b00c6 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Mon, 24 Jul 2023 14:55:46 -0500 Subject: [PATCH 091/144] Restrict GKE node pool to use TPG <= 4.74.0 A new field for guest_accelerator was added in TPG 4.75.0 that breaks our usage of the google_container_node_pool resource. Will need to address this in a future commit.
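As a rough sketch (not part of the original patch), the pin amounts to a provider constraint of the following shape in each affected module's `versions.tf`; the module's `provider_meta` block and other contents are omitted here:

```hcl
# Sketch only: cap both Google providers at 4.74.0 so that the
# guest_accelerator schema change introduced in TPG 4.75.0 is never selected.
terraform {
  required_providers {
    google = {
      source  = "hashicorp/google"
      version = ">= 4.61.0, <= 4.74.0"
    }
    google-beta = {
      source  = "hashicorp/google-beta"
      version = ">= 4.61.0, <= 4.74.0"
    }
  }
}
```

With this constraint in place, `terraform init -upgrade` selects 4.74.0 as the newest permitted provider release. The breaking release is documented at: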
https://github.com/hashicorp/terraform-provider-google/releases/tag/v4.75.0 --- community/modules/compute/gke-node-pool/README.md | 8 ++++---- community/modules/compute/gke-node-pool/versions.tf | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/community/modules/compute/gke-node-pool/README.md b/community/modules/compute/gke-node-pool/README.md index bba6b00e17..3bb0e7398c 100644 --- a/community/modules/compute/gke-node-pool/README.md +++ b/community/modules/compute/gke-node-pool/README.md @@ -177,15 +177,15 @@ limitations under the License. | Name | Version | |------|---------| | [terraform](#requirement\_terraform) | >= 1.2 | -| [google](#requirement\_google) | >= 4.61.0, < 5.0 | -| [google-beta](#requirement\_google-beta) | >= 4.61.0, < 5.0 | +| [google](#requirement\_google) | >= 4.61.0, <= 4.74.0 | +| [google-beta](#requirement\_google-beta) | >= 4.61.0, <= 4.74.0 | ## Providers | Name | Version | |------|---------| -| [google](#provider\_google) | >= 4.61.0, < 5.0 | -| [google-beta](#provider\_google-beta) | >= 4.61.0, < 5.0 | +| [google](#provider\_google) | >= 4.61.0, <= 4.74.0 | +| [google-beta](#provider\_google-beta) | >= 4.61.0, <= 4.74.0 | ## Modules diff --git a/community/modules/compute/gke-node-pool/versions.tf b/community/modules/compute/gke-node-pool/versions.tf index c8dd3386b0..2c719fc39e 100644 --- a/community/modules/compute/gke-node-pool/versions.tf +++ b/community/modules/compute/gke-node-pool/versions.tf @@ -18,11 +18,11 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = ">= 4.61.0, < 5.0" + version = ">= 4.61.0, <= 4.74.0" } google-beta = { source = "hashicorp/google-beta" - version = ">= 4.61.0, < 5.0" + version = ">= 4.61.0, <= 4.74.0" } } provider_meta "google" { From 2d5ea7a8fa598eb655a8b261160cabc1d4c08ea5 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Mon, 24 Jul 2023 14:57:16 -0500 Subject: [PATCH 092/144] Adopt TPG 4.74.0 --- pkg/modulewriter/tfversions.go | 4 ++-- .../golden_copies/expectations/igc_pkr/zero/versions.tf | 4 ++-- .../golden_copies/expectations/igc_tf/one/versions.tf | 4 ++-- .../golden_copies/expectations/igc_tf/zero/versions.tf | 4 ++-- .../golden_copies/expectations/merge_flatten/zero/versions.tf | 4 ++-- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/pkg/modulewriter/tfversions.go b/pkg/modulewriter/tfversions.go index d99c210c3c..20a654b84f 100644 --- a/pkg/modulewriter/tfversions.go +++ b/pkg/modulewriter/tfversions.go @@ -21,11 +21,11 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = "~> 4.73.0" + version = "~> 4.74.0" } google-beta = { source = "hashicorp/google-beta" - version = "~> 4.73.0" + version = "~> 4.74.0" } } } diff --git a/tools/validate_configs/golden_copies/expectations/igc_pkr/zero/versions.tf b/tools/validate_configs/golden_copies/expectations/igc_pkr/zero/versions.tf index a71345f475..ae37177189 100644 --- a/tools/validate_configs/golden_copies/expectations/igc_pkr/zero/versions.tf +++ b/tools/validate_configs/golden_copies/expectations/igc_pkr/zero/versions.tf @@ -20,11 +20,11 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = "~> 4.73.0" + version = "~> 4.74.0" } google-beta = { source = "hashicorp/google-beta" - version = "~> 4.73.0" + version = "~> 4.74.0" } } } diff --git a/tools/validate_configs/golden_copies/expectations/igc_tf/one/versions.tf b/tools/validate_configs/golden_copies/expectations/igc_tf/one/versions.tf index a71345f475..ae37177189 100644 --- 
a/tools/validate_configs/golden_copies/expectations/igc_tf/one/versions.tf +++ b/tools/validate_configs/golden_copies/expectations/igc_tf/one/versions.tf @@ -20,11 +20,11 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = "~> 4.73.0" + version = "~> 4.74.0" } google-beta = { source = "hashicorp/google-beta" - version = "~> 4.73.0" + version = "~> 4.74.0" } } } diff --git a/tools/validate_configs/golden_copies/expectations/igc_tf/zero/versions.tf b/tools/validate_configs/golden_copies/expectations/igc_tf/zero/versions.tf index a71345f475..ae37177189 100644 --- a/tools/validate_configs/golden_copies/expectations/igc_tf/zero/versions.tf +++ b/tools/validate_configs/golden_copies/expectations/igc_tf/zero/versions.tf @@ -20,11 +20,11 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = "~> 4.73.0" + version = "~> 4.74.0" } google-beta = { source = "hashicorp/google-beta" - version = "~> 4.73.0" + version = "~> 4.74.0" } } } diff --git a/tools/validate_configs/golden_copies/expectations/merge_flatten/zero/versions.tf b/tools/validate_configs/golden_copies/expectations/merge_flatten/zero/versions.tf index a71345f475..ae37177189 100644 --- a/tools/validate_configs/golden_copies/expectations/merge_flatten/zero/versions.tf +++ b/tools/validate_configs/golden_copies/expectations/merge_flatten/zero/versions.tf @@ -20,11 +20,11 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = "~> 4.73.0" + version = "~> 4.74.0" } google-beta = { source = "hashicorp/google-beta" - version = "~> 4.73.0" + version = "~> 4.74.0" } } } From aea9fbb1c7f96ad110c84b70b603500f66089679 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Mon, 24 Jul 2023 15:36:57 -0500 Subject: [PATCH 093/144] Add MIG ID as output/input to enable job transform and submit requirements --- community/examples/htc-htcondor.yaml | 1 + .../compute/htcondor-execute-point/README.md | 87 ++++++++++--------- .../compute/htcondor-execute-point/outputs.tf | 5 ++ .../scheduler/htcondor-access-point/README.md | 4 +- .../scheduler/htcondor-access-point/main.tf | 9 ++ .../templates/condor_config.tftpl | 43 ++++++--- .../htcondor-access-point/variables.tf | 19 ++++ .../htcondor-access-point/versions.tf | 2 +- 8 files changed, 112 insertions(+), 58 deletions(-) diff --git a/community/examples/htc-htcondor.yaml b/community/examples/htc-htcondor.yaml index 462cd8691d..301788c10b 100644 --- a/community/examples/htc-htcondor.yaml +++ b/community/examples/htc-htcondor.yaml @@ -119,6 +119,7 @@ deployment_groups: - htcondor_execute_point - htcondor_execute_point_spot settings: + default_mig_id: $(htcondor_execute_point.mig_id) enable_public_ips: true instance_image: project: $(vars.project_id) diff --git a/community/modules/compute/htcondor-execute-point/README.md b/community/modules/compute/htcondor-execute-point/README.md index 25106c347c..f880d45cf5 100644 --- a/community/modules/compute/htcondor-execute-point/README.md +++ b/community/modules/compute/htcondor-execute-point/README.md @@ -10,7 +10,7 @@ It is expected to be used with the [htcondor-install] and [htcondor-setup] modules. [htcondor-install]: ../../scripts/htcondor-install/README.md -[htcondor-setup]: ../../scheduler/htcondor-configure/README.md +[htcondor-setup]: ../../scheduler/htcondor-setup/README.md ### Known limitations @@ -30,14 +30,16 @@ including all pricing options. │ This was checked by the validation rule at modules/startup-script/variables.tf:72,3-13. 
``` -### How to run HTCondor jobs on Spot VMs +### How to configure jobs to select execute points HTCondor access points provisioned by the Toolkit are specially configured to -add an attribute named `RequireSpot` to each [Job ClassAd][jobad]. When this -value is true, a job's `requirements` are automatically updated to require -that it run on a Spot VM. When this value is false, the `requirements` are -similarly updated to run only on On-Demand VMs. The default value of this -attribute is false. A job submit file may override this value as shown below. +honor an attribute named `RequireId` in each [Job ClassAd][jobad]. This value +must be set to the ID of a MIG created by an instance of this module. The +[htcondor-access-point] module includes a setting `var.default_mig_id` that will +set this value automatically to the MIG ID corresponding to the module's +execute points. If this setting is left unset each job must specify `+RequireId` +explicitly. In all cases, the default value can be overridden explicitly as shown +below: ```text universe = vanilla @@ -48,10 +50,11 @@ error = err.$(ClusterId).$(ProcId) log = log.$(ClusterId).$(ProcId) request_cpus = 1 request_memory = 100MB -+RequireSpot = true ++RequireId = "htcondor-pool-ep-mig" queue ``` +[htcondor-access-point]: ../../scheduler/htcondor-access-point/README.md [jobad]: https://htcondor.readthedocs.io/en/latest/users-manual/matchmaking-with-classads.html ### Example @@ -69,47 +72,46 @@ a startup script and network created in previous steps. source: community/modules/compute/htcondor-execute-point use: - network1 - - htcondor_configure_execute_point + - htcondor_secrets + - htcondor_setup + - htcondor_cm settings: - service_account: - email: $(htcondor_configure.execute_point_service_account) - scopes: - - cloud-platform + instance_image: + project: $(vars.project_id) + family: $(vars.new_image_family) + min_idle: 2 - id: htcondor_execute_point_spot source: community/modules/compute/htcondor-execute-point use: - network1 - - htcondor_configure_execute_point + - htcondor_secrets + - htcondor_setup + - htcondor_cm settings: - service_account: - email: $(htcondor_configure.execute_point_service_account) - scopes: - - cloud-platform - - - id: htcondor_startup_access_point - source: modules/scripts/startup-script - settings: - runners: - - $(htcondor_install.install_htcondor_runner) - - $(htcondor_install.install_autoscaler_deps_runner) - - $(htcondor_install.install_autoscaler_runner) - - $(htcondor_configure.access_point_runner) - - $(htcondor_execute_point.configure_autoscaler_runner) - - $(htcondor_execute_point_spot.configure_autoscaler_runner) - - - id: htcondor_access - source: modules/compute/vm-instance - use: - - network1 - - htcondor_startup_access_point - settings: - name_prefix: access-point - machine_type: c2-standard-4 - service_account: - email: $(htcondor_configure.access_point_service_account) - scopes: - - cloud-platform + instance_image: + project: $(vars.project_id) + family: $(vars.new_image_family) + spot: true + +- id: htcondor_access + source: community/modules/scheduler/htcondor-access-point + use: + - network1 + - htcondor_secrets + - htcondor_setup + - htcondor_cm + - htcondor_execute_point + - htcondor_execute_point_spot + settings: + default_mig_id: $(htcondor_execute_point.mig_id) + enable_public_ips: true + instance_image: + project: $(vars.project_id) + family: $(vars.new_image_family) + outputs: + - access_point_ips + - access_point_name ``` ## Support @@ -217,4 +219,5 @@ limitations under the License. 
| Name | Description | |------|-------------| | [autoscaler\_runner](#output\_autoscaler\_runner) | Toolkit runner to configure the HTCondor autoscaler | +| [mig\_id](#output\_mig\_id) | ID of the managed instance group containing the execute points | diff --git a/community/modules/compute/htcondor-execute-point/outputs.tf b/community/modules/compute/htcondor-execute-point/outputs.tf index 13192401c4..b31f40130f 100644 --- a/community/modules/compute/htcondor-execute-point/outputs.tf +++ b/community/modules/compute/htcondor-execute-point/outputs.tf @@ -18,3 +18,8 @@ output "autoscaler_runner" { value = local.autoscaler_runner description = "Toolkit runner to configure the HTCondor autoscaler" } + +output "mig_id" { + value = module.mig.instance_group_manager.name + description = "ID of the managed instance group containing the execute points" +} diff --git a/community/modules/scheduler/htcondor-access-point/README.md b/community/modules/scheduler/htcondor-access-point/README.md index d97ac99c92..cdcb09e9bb 100644 --- a/community/modules/scheduler/htcondor-access-point/README.md +++ b/community/modules/scheduler/htcondor-access-point/README.md @@ -46,7 +46,7 @@ limitations under the License. | Name | Version | |------|---------| -| [terraform](#requirement\_terraform) | >= 0.13.0 | +| [terraform](#requirement\_terraform) | >= 1.1 | | [google](#requirement\_google) | >= 3.83 | | [time](#requirement\_time) | ~> 0.9 | @@ -84,6 +84,7 @@ limitations under the License. | [access\_point\_service\_account\_email](#input\_access\_point\_service\_account\_email) | Service account for access point (e-mail format) | `string` | n/a | yes | | [autoscaler\_runner](#input\_autoscaler\_runner) | A list of Toolkit runners for configuring autoscaling daemons | `list(map(string))` | `[]` | no | | [central\_manager\_ips](#input\_central\_manager\_ips) | List of IP addresses of HTCondor Central Managers | `list(string)` | n/a | yes | +| [default\_mig\_id](#input\_default\_mig\_id) | Default MIG ID for HTCondor jobs; if unset, jobs must specify MIG id | `string` | `""` | no | | [deployment\_name](#input\_deployment\_name) | HPC Toolkit deployment name. HTCondor cloud resource names will include this value. | `string` | n/a | yes | | [disk\_size\_gb](#input\_disk\_size\_gb) | Boot disk size in GB | `number` | `null` | no | | [enable\_high\_availability](#input\_enable\_high\_availability) | Provision HTCondor access point in high availability mode | `bool` | `false` | no | @@ -94,6 +95,7 @@ limitations under the License. | [labels](#input\_labels) | Labels to add to resources. List key, value pairs. | `map(string)` | n/a | yes | | [machine\_type](#input\_machine\_type) | Machine type to use for HTCondor central managers | `string` | `"c2-standard-4"` | no | | [metadata](#input\_metadata) | Metadata to add to HTCondor central managers | `map(string)` | `{}` | no | +| [mig\_id](#input\_mig\_id) | List of Managed Instance Group IDs containing execute points in this pool (supplied by htcondor-execute-point module) | `list(string)` | `[]` | no | | [network\_self\_link](#input\_network\_self\_link) | The self link of the network in which the HTCondor central manager will be created. | `string` | `null` | no | | [network\_storage](#input\_network\_storage) | An array of network attached storage mounts to be configured |
list(object({
server_ip = string,
remote_mount = string,
local_mount = string,
fs_type = string,
mount_options = string,
client_install_runner = map(string)
mount_runner = map(string)
}))
| `[]` | no | | [project\_id](#input\_project\_id) | Project in which HTCondor pool will be created | `string` | n/a | yes | diff --git a/community/modules/scheduler/htcondor-access-point/main.tf b/community/modules/scheduler/htcondor-access-point/main.tf index deb5e031f0..fa27ae662b 100644 --- a/community/modules/scheduler/htcondor-access-point/main.tf +++ b/community/modules/scheduler/htcondor-access-point/main.tf @@ -60,6 +60,8 @@ locals { htcondor_role = "get_htcondor_submit", central_manager_ips = var.central_manager_ips spool_dir = "${var.spool_parent_dir}/spool", + mig_ids = var.mig_id, + default_mig_id = var.default_mig_id }) ap_object = "gs://${var.htcondor_bucket_name}/${google_storage_bucket_object.ap_config.output_name}" @@ -116,6 +118,13 @@ resource "google_storage_bucket_object" "ap_config" { name = "${local.name_prefix}-config-${substr(md5(local.ap_config), 0, 4)}" content = local.ap_config bucket = var.htcondor_bucket_name + + lifecycle { + precondition { + condition = var.default_mig_id == "" || contains(var.mig_id, var.default_mig_id) + error_message = "If set, var.default_mig_id must be an element in var.mig_id" + } + } } module "startup_script" { diff --git a/community/modules/scheduler/htcondor-access-point/templates/condor_config.tftpl b/community/modules/scheduler/htcondor-access-point/templates/condor_config.tftpl index db7144e0f3..5641f79add 100644 --- a/community/modules/scheduler/htcondor-access-point/templates/condor_config.tftpl +++ b/community/modules/scheduler/htcondor-access-point/templates/condor_config.tftpl @@ -36,20 +36,35 @@ SYSTEM_JOB_MACHINE_ATTRS_HISTORY_LENGTH = 10 use feature:ScheddCronOneShot(cloud, $(LIBEXEC)/common-cloud-attributes-google.py) SCHEDD_CRON_cloud_PREFIX = Cloud -# the sequence of job transforms and submit requirements below set -# a default job attribute RequireSpot to False but allow the user to -# specify *only* a boolean value with +RequireSpot = True in their job -# submit file; the requirements of the job are transformed to filter -# on +RequireSpot unless job has explicit CloudInterruptible requirements -JOB_TRANSFORM_NAMES = SPOT_DEFAULT, SPOT_REQS -JOB_TRANSFORM_SPOT_DEFAULT @=end - DEFAULT RequireSpot False +# aid the user by automatically using RequireSpot in their Requirements, unless +# the user has explicitly used CloudInterruptible +JOB_TRANSFORM_NAMES = $(JOB_TRANSFORM_NAMES) SPOT_REQS +JOB_TRANSFORM_SPOT_REQS @=end + REQUIREMENTS ! isUndefined(RequireSpot) && ! 
unresolved(Requirements, "^CloudInterruptible$") + SET Requirements ($(MY.Requirements)) && (CloudInterruptible is My.RequireSpot) @end -# Unless explicit, set CloudInterruptible requirements to job RequireSpot attribute + +# help the user by enforcing that RequireSpot is undefined or a boolean +SUBMIT_REQUIREMENT_NAMES = $(SUBMIT_REQUIREMENT_NAMES) SPOT +SUBMIT_REQUIREMENT_SPOT = isUndefined(RequireSpot) || isBoolean(RequireSpot) +SUBMIT_REQUIREMENT_SPOT_REASON = "If +RequireSpot is defined, it must be either True or False" + +%{ if length(mig_ids) > 0 ~} +MIG_IDS = "${join(" ", mig_ids)}" +MIG_ID_LIST = split($(MIG_IDS)) +%{ if default_mig_id != "" ~} +JOB_TRANSFORM_NAMES = $(JOB_TRANSFORM_NAMES) ID_DEFAULT +JOB_TRANSFORM_ID_DEFAULT @=end + DEFAULT RequireId "${default_mig_id}" +@end +%{ endif ~} +SUBMIT_REQUIREMENT_NAMES = $(SUBMIT_REQUIREMENT_NAMES) MIGID +SUBMIT_REQUIREMENT_MIGID = !isUndefined(RequireId) && member(RequireId, $(MIG_ID_LIST)) +SUBMIT_REQUIREMENT_MIGID_REASON = strcat("Jobs must set +RequireId to one of following values surrounded by quotation marks:\n", $(MIG_IDS)) + +JOB_TRANSFORM_NAMES = $(JOB_TRANSFORM_NAMES) SPOT_REQS JOB_TRANSFORM_SPOT_REQS @=end - REQUIREMENTS ! unresolved(Requirements, "^CloudInterruptible$") - SET Requirements $(MY.Requirements) && (CloudInterruptible is My.RequireSpot) + REQUIREMENTS ! isUndefined(RequireId) && ! unresolved(Requirements, "^CloudCreatedBy$") + SET Requirements ($(MY.Requirements)) && regexp(strcat("/", My.RequireId, "$"), CloudCreatedBy) @end -SUBMIT_REQUIREMENT_NAMES = REQSPOT -SUBMIT_REQUIREMENT_REQSPOT = isBoolean(RequireSpot) -SUBMIT_REQUIREMENT_REQSPOT_REASON = "Jobs must set +RequireSpot to either True or False" +%{ endif ~} diff --git a/community/modules/scheduler/htcondor-access-point/variables.tf b/community/modules/scheduler/htcondor-access-point/variables.tf index 99ed55037a..3fae9b5373 100644 --- a/community/modules/scheduler/htcondor-access-point/variables.tf +++ b/community/modules/scheduler/htcondor-access-point/variables.tf @@ -156,3 +156,22 @@ variable "enable_public_ips" { type = bool default = false } + +variable "mig_id" { + description = "List of Managed Instance Group IDs containing execute points in this pool (supplied by htcondor-execute-point module)" + type = list(string) + default = [] + nullable = false + + validation { + condition = length(var.mig_id) > 0 + error_message = "At least 1 MIG containing execute points must be provided to this module" + } +} + +variable "default_mig_id" { + description = "Default MIG ID for HTCondor jobs; if unset, jobs must specify MIG id" + type = string + default = "" + nullable = false +} diff --git a/community/modules/scheduler/htcondor-access-point/versions.tf b/community/modules/scheduler/htcondor-access-point/versions.tf index 6c3301b210..f2575f3746 100644 --- a/community/modules/scheduler/htcondor-access-point/versions.tf +++ b/community/modules/scheduler/htcondor-access-point/versions.tf @@ -29,5 +29,5 @@ terraform { module_name = "blueprints/terraform/hpc-toolkit:htcondor-access-point/v1.20.0" } - required_version = ">= 0.13.0" + required_version = ">= 1.1" } From fbdbe76528ad982d4abf7fcc5b06361aa1c541f0 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Mon, 24 Jul 2023 16:18:14 -0500 Subject: [PATCH 094/144] Add G2 family support to GPU-normalizing code --- .../htcondor-execute-point/gpu_definition.tf | 54 +++++++++++-------- .../schedmd-slurm-gcp-v5-node-group/README.md | 4 +- .../gpu_definition.tf | 54 +++++++++++-------- .../variables.tf | 3 +- 
.../versions.tf | 2 +- .../schedmd-slurm-gcp-v5-controller/README.md | 4 +- .../gpu_definition.tf | 54 +++++++++++-------- .../variables.tf | 3 +- .../versions.tf | 2 +- .../schedmd-slurm-gcp-v5-login/README.md | 4 +- .../gpu_definition.tf | 54 +++++++++++-------- .../schedmd-slurm-gcp-v5-login/variables.tf | 3 +- .../schedmd-slurm-gcp-v5-login/versions.tf | 2 +- modules/compute/vm-instance/README.md | 6 +-- modules/compute/vm-instance/gpu_definition.tf | 54 +++++++++++-------- modules/compute/vm-instance/variables.tf | 3 +- modules/compute/vm-instance/versions.tf | 3 +- 17 files changed, 177 insertions(+), 132 deletions(-) diff --git a/community/modules/compute/htcondor-execute-point/gpu_definition.tf b/community/modules/compute/htcondor-execute-point/gpu_definition.tf index 197a97b6e6..78ae861fca 100644 --- a/community/modules/compute/htcondor-execute-point/gpu_definition.tf +++ b/community/modules/compute/htcondor-execute-point/gpu_definition.tf @@ -19,29 +19,37 @@ # machine_type locals { - - # Ensure guest_accelerator is a list if not set - input_guest_accelerator = var.guest_accelerator == null ? [] : var.guest_accelerator - - # If the machine type indicates a GPU is used, gather the count and type information - accelerator_types = { - "highgpu" = "nvidia-tesla-a100" - "megagpu" = "nvidia-tesla-a100" - "ultragpu" = "nvidia-a100-80gb" + # example state; terraform will ignore diffs if last element of URL matches + # guest_accelerator = [ + # { + # count = 1 + # type = "https://www.googleapis.com/compute/beta/projects/PROJECT/zones/ZONE/acceleratorTypes/nvidia-tesla-a100" + # }, + # ] + accelerator_machines = { + "a2-highgpu-1g" = { type = "nvidia-tesla-a100", count = 1 }, + "a2-highgpu-2g" = { type = "nvidia-tesla-a100", count = 2 }, + "a2-highgpu-4g" = { type = "nvidia-tesla-a100", count = 4 }, + "a2-highgpu-8g" = { type = "nvidia-tesla-a100", count = 8 }, + "a2-megagpu-16g" = { type = "nvidia-tesla-a100", count = 16 }, + "a2-ultragpu-1g" = { type = "nvidia-a100-80gb", count = 1 }, + "a2-ultragpu-2g" = { type = "nvidia-a100-80gb", count = 2 }, + "a2-ultragpu-4g" = { type = "nvidia-a100-80gb", count = 4 }, + "a2-ultragpu-8g" = { type = "nvidia-a100-80gb", count = 8 }, + "g2-standard-4" = { type = "nvidia-l4", count = 1 }, + "g2-standard-8" = { type = "nvidia-l4", count = 1 }, + "g2-standard-12" = { type = "nvidia-l4", count = 1 }, + "g2-standard-16" = { type = "nvidia-l4", count = 1 }, + "g2-standard-24" = { type = "nvidia-l4", count = 2 }, + "g2-standard-32" = { type = "nvidia-l4", count = 1 }, + "g2-standard-48" = { type = "nvidia-l4", count = 4 }, + "g2-standard-96" = { type = "nvidia-l4", count = 8 }, } - generated_guest_accelerator = try([{ - type = local.accelerator_types[regex("a2-([A-Za-z]+)-", var.machine_type)[0]], - count = one(regex("a2-[A-Za-z]+-(\\d+)", var.machine_type)), - }], []) - - # If the machine type is a valid a2 machine_type, generated_guest_accelerator - # will be populated. This also guarantees at least one populated list in coalescelist. - is_a2_vm = length(local.generated_guest_accelerator) > 0 + generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], []) - # Set the guest_accelerator to the user defined value if supplied, otherwise - # use the locally generated accelerator list. - guest_accelerator = local.is_a2_vm ? 
coalescelist( - local.input_guest_accelerator, - local.generated_guest_accelerator, - ) : local.input_guest_accelerator + # Select in priority order: + # (1) var.guest_accelerator if not empty + # (2) local.generated_guest_accelerator if not empty + # (3) default to empty list if both are empty + guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), []) } diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/README.md b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/README.md index 62dfddee02..6509a15b28 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/README.md @@ -95,7 +95,7 @@ limitations under the License. | Name | Version | |------|---------| -| [terraform](#requirement\_terraform) | >= 0.13.0 | +| [terraform](#requirement\_terraform) | >= 1.1 | | [google](#requirement\_google) | >= 3.83 | ## Providers @@ -133,7 +133,7 @@ No modules. | [enable\_smt](#input\_enable\_smt) | Enables Simultaneous Multi-Threading (SMT) on instance. | `bool` | `false` | no | | [enable\_spot\_vm](#input\_enable\_spot\_vm) | Enable the partition to use spot VMs (https://cloud.google.com/spot-vms). | `bool` | `false` | no | | [gpu](#input\_gpu) | GPU information. Type and count of GPU to attach to the instance template. See
https://cloud.google.com/compute/docs/gpus for more details.
- type : the GPU type, e.g. nvidia-tesla-t4, nvidia-a100-80gb, nvidia-tesla-a100, etc.
- count : number of GPUs

If both 'var.gpu' and 'var.guest\_accelerator' are set, 'var.gpu' will be used. |
object({
count = number,
type = string
})
| `null` | no | -| [guest\_accelerator](#input\_guest\_accelerator) | Alternative method of providing 'var.gpu' with a consistent naming scheme to
other HPC Toolkit modules.

If both 'var.gpu' and 'var.guest\_accelerator' are set, 'var.gpu' will be used. |
list(object({
type = string,
count = number
}))
| `null` | no | +| [guest\_accelerator](#input\_guest\_accelerator) | Alternative method of providing 'var.gpu' with a consistent naming scheme to
other HPC Toolkit modules.

If both 'var.gpu' and 'var.guest\_accelerator' are set, 'var.gpu' will be used. |
list(object({
type = string,
count = number
}))
| `[]` | no | | [instance\_image](#input\_instance\_image) | Defines the image that will be used in the node group VM instances.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-5-7-hpc-centos-7",
"project": "schedmd-slurm-public"
}
| no | | [instance\_template](#input\_instance\_template) | Self link to a custom instance template. If set, other VM definition
variables such as machine\_type and instance\_image will be ignored in favor
of the provided instance template.

For more information on creating custom images for the instance template
that comply with Slurm on GCP, see the "Slurm on GCP Custom Images" section
in docs/vm-images.md. | `string` | `null` | no | | [labels](#input\_labels) | Labels to add to partition compute instances. Key-value pairs. | `map(string)` | `{}` | no | diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/gpu_definition.tf b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/gpu_definition.tf index 197a97b6e6..78ae861fca 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/gpu_definition.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/gpu_definition.tf @@ -19,29 +19,37 @@ # machine_type locals { - - # Ensure guest_accelerator is a list if not set - input_guest_accelerator = var.guest_accelerator == null ? [] : var.guest_accelerator - - # If the machine type indicates a GPU is used, gather the count and type information - accelerator_types = { - "highgpu" = "nvidia-tesla-a100" - "megagpu" = "nvidia-tesla-a100" - "ultragpu" = "nvidia-a100-80gb" + # example state; terraform will ignore diffs if last element of URL matches + # guest_accelerator = [ + # { + # count = 1 + # type = "https://www.googleapis.com/compute/beta/projects/PROJECT/zones/ZONE/acceleratorTypes/nvidia-tesla-a100" + # }, + # ] + accelerator_machines = { + "a2-highgpu-1g" = { type = "nvidia-tesla-a100", count = 1 }, + "a2-highgpu-2g" = { type = "nvidia-tesla-a100", count = 2 }, + "a2-highgpu-4g" = { type = "nvidia-tesla-a100", count = 4 }, + "a2-highgpu-8g" = { type = "nvidia-tesla-a100", count = 8 }, + "a2-megagpu-16g" = { type = "nvidia-tesla-a100", count = 16 }, + "a2-ultragpu-1g" = { type = "nvidia-a100-80gb", count = 1 }, + "a2-ultragpu-2g" = { type = "nvidia-a100-80gb", count = 2 }, + "a2-ultragpu-4g" = { type = "nvidia-a100-80gb", count = 4 }, + "a2-ultragpu-8g" = { type = "nvidia-a100-80gb", count = 8 }, + "g2-standard-4" = { type = "nvidia-l4", count = 1 }, + "g2-standard-8" = { type = "nvidia-l4", count = 1 }, + "g2-standard-12" = { type = "nvidia-l4", count = 1 }, + "g2-standard-16" = { type = "nvidia-l4", count = 1 }, + "g2-standard-24" = { type = "nvidia-l4", count = 2 }, + "g2-standard-32" = { type = "nvidia-l4", count = 1 }, + "g2-standard-48" = { type = "nvidia-l4", count = 4 }, + "g2-standard-96" = { type = "nvidia-l4", count = 8 }, } - generated_guest_accelerator = try([{ - type = local.accelerator_types[regex("a2-([A-Za-z]+)-", var.machine_type)[0]], - count = one(regex("a2-[A-Za-z]+-(\\d+)", var.machine_type)), - }], []) - - # If the machine type is a valid a2 machine_type, generated_guest_accelerator - # will be populated. This also guarantees at least one populated list in coalescelist. - is_a2_vm = length(local.generated_guest_accelerator) > 0 + generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], []) - # Set the guest_accelerator to the user defined value if supplied, otherwise - # use the locally generated accelerator list. - guest_accelerator = local.is_a2_vm ? 
coalescelist( - local.input_guest_accelerator, - local.generated_guest_accelerator, - ) : local.input_guest_accelerator + # Select in priority order: + # (1) var.guest_accelerator if not empty + # (2) local.generated_guest_accelerator if not empty + # (3) default to empty list if both are empty + guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), []) } diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/variables.tf index cf99cea7d2..e1fa801580 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/variables.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/variables.tf @@ -274,7 +274,8 @@ variable "guest_accelerator" { type = string, count = number })) - default = null + default = [] + nullable = false } variable "preemptible" { diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/versions.tf b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/versions.tf index ad2ac86f04..cdd7e6b4dd 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/versions.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/versions.tf @@ -24,5 +24,5 @@ terraform { provider_meta "google" { module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-node-group/v1.20.0" } - required_version = ">= 0.13.0" + required_version = ">= 1.1" } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md index a4a79f4d07..33fd30c63d 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md @@ -150,7 +150,7 @@ limitations under the License. | Name | Version | |------|---------| -| [terraform](#requirement\_terraform) | >= 0.14.0 | +| [terraform](#requirement\_terraform) | >= 1.1 | | [google](#requirement\_google) | >= 3.83 | ## Providers @@ -204,7 +204,7 @@ limitations under the License. | [enable\_shielded\_vm](#input\_enable\_shielded\_vm) | Enable the Shielded VM configuration. Note: the instance image must support option. | `bool` | `false` | no | | [epilog\_scripts](#input\_epilog\_scripts) | List of scripts to be used for Epilog. Programs for the slurmd to execute
on every node when a user's job completes.
See https://slurm.schedmd.com/slurm.conf.html#OPT_Epilog. |
list(object({
filename = string
content = string
}))
| `[]` | no | | [gpu](#input\_gpu) | GPU information. Type and count of GPU to attach to the instance template. See
https://cloud.google.com/compute/docs/gpus for more details.
- type : the GPU type, e.g. nvidia-tesla-t4, nvidia-a100-80gb, nvidia-tesla-a100, etc.
- count : number of GPUs

If both 'var.gpu' and 'var.guest\_accelerator' are set, 'var.gpu' will be used. |
object({
type = string
count = number
})
| `null` | no | -| [guest\_accelerator](#input\_guest\_accelerator) | Alternative method of providing 'var.gpu' with a consistent naming scheme to
other HPC Toolkit modules.

If both 'var.gpu' and 'var.guest\_accelerator' are set, 'var.gpu' will be used. |
list(object({
type = string,
count = number
}))
| `null` | no | +| [guest\_accelerator](#input\_guest\_accelerator) | Alternative method of providing 'var.gpu' with a consistent naming scheme to
other HPC Toolkit modules.

If both 'var.gpu' and 'var.guest\_accelerator' are set, 'var.gpu' will be used. |
list(object({
type = string,
count = number
}))
| `[]` | no | | [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm controller VM instance.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-5-7-hpc-centos-7",
"project": "schedmd-slurm-public"
}
| no | | [instance\_template](#input\_instance\_template) | Self link to a custom instance template. If set, other VM definition
variables such as machine\_type and instance\_image will be ignored in favor
of the provided instance template.

For more information on creating custom images for the instance template
that comply with Slurm on GCP, see the "Slurm on GCP Custom Images" section
in docs/vm-images.md. | `string` | `null` | no | | [labels](#input\_labels) | Labels, provided as a map. | `map(string)` | `{}` | no | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/gpu_definition.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/gpu_definition.tf index 197a97b6e6..78ae861fca 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/gpu_definition.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/gpu_definition.tf @@ -19,29 +19,37 @@ # machine_type locals { - - # Ensure guest_accelerator is a list if not set - input_guest_accelerator = var.guest_accelerator == null ? [] : var.guest_accelerator - - # If the machine type indicates a GPU is used, gather the count and type information - accelerator_types = { - "highgpu" = "nvidia-tesla-a100" - "megagpu" = "nvidia-tesla-a100" - "ultragpu" = "nvidia-a100-80gb" + # example state; terraform will ignore diffs if last element of URL matches + # guest_accelerator = [ + # { + # count = 1 + # type = "https://www.googleapis.com/compute/beta/projects/PROJECT/zones/ZONE/acceleratorTypes/nvidia-tesla-a100" + # }, + # ] + accelerator_machines = { + "a2-highgpu-1g" = { type = "nvidia-tesla-a100", count = 1 }, + "a2-highgpu-2g" = { type = "nvidia-tesla-a100", count = 2 }, + "a2-highgpu-4g" = { type = "nvidia-tesla-a100", count = 4 }, + "a2-highgpu-8g" = { type = "nvidia-tesla-a100", count = 8 }, + "a2-megagpu-16g" = { type = "nvidia-tesla-a100", count = 16 }, + "a2-ultragpu-1g" = { type = "nvidia-a100-80gb", count = 1 }, + "a2-ultragpu-2g" = { type = "nvidia-a100-80gb", count = 2 }, + "a2-ultragpu-4g" = { type = "nvidia-a100-80gb", count = 4 }, + "a2-ultragpu-8g" = { type = "nvidia-a100-80gb", count = 8 }, + "g2-standard-4" = { type = "nvidia-l4", count = 1 }, + "g2-standard-8" = { type = "nvidia-l4", count = 1 }, + "g2-standard-12" = { type = "nvidia-l4", count = 1 }, + "g2-standard-16" = { type = "nvidia-l4", count = 1 }, + "g2-standard-24" = { type = "nvidia-l4", count = 2 }, + "g2-standard-32" = { type = "nvidia-l4", count = 1 }, + "g2-standard-48" = { type = "nvidia-l4", count = 4 }, + "g2-standard-96" = { type = "nvidia-l4", count = 8 }, } - generated_guest_accelerator = try([{ - type = local.accelerator_types[regex("a2-([A-Za-z]+)-", var.machine_type)[0]], - count = one(regex("a2-[A-Za-z]+-(\\d+)", var.machine_type)), - }], []) - - # If the machine type is a valid a2 machine_type, generated_guest_accelerator - # will be populated. This also guarantees at least one populated list in coalescelist. - is_a2_vm = length(local.generated_guest_accelerator) > 0 + generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], []) - # Set the guest_accelerator to the user defined value if supplied, otherwise - # use the locally generated accelerator list. - guest_accelerator = local.is_a2_vm ? 
coalescelist( - local.input_guest_accelerator, - local.generated_guest_accelerator, - ) : local.input_guest_accelerator + # Select in priority order: + # (1) var.guest_accelerator if not empty + # (2) local.generated_guest_accelerator if not empty + # (3) default to empty list if both are empty + guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), []) } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/variables.tf index 5b9c3602fb..c024d9811f 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/variables.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/variables.tf @@ -312,7 +312,8 @@ variable "guest_accelerator" { type = string, count = number })) - default = null + default = [] + nullable = false } variable "labels" { diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/versions.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/versions.tf index 5c2986b93f..a4cb78a35f 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/versions.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/versions.tf @@ -24,5 +24,5 @@ terraform { provider_meta "google" { module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-controller/v1.20.0" } - required_version = ">= 0.14.0" + required_version = ">= 1.1" } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md index 8c025d7dd9..13c0af6760 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md @@ -72,7 +72,7 @@ limitations under the License. | Name | Version | |------|---------| -| [terraform](#requirement\_terraform) | >= 0.14.0 | +| [terraform](#requirement\_terraform) | >= 1.1 | | [google](#requirement\_google) | >= 3.83 | ## Providers @@ -113,7 +113,7 @@ limitations under the License. | [enable\_oslogin](#input\_enable\_oslogin) | Enables Google Cloud os-login for user login and authentication for VMs.
See https://cloud.google.com/compute/docs/oslogin | `bool` | `true` | no | | [enable\_shielded\_vm](#input\_enable\_shielded\_vm) | Enable the Shielded VM configuration. Note: the instance image must support option. | `bool` | `false` | no | | [gpu](#input\_gpu) | GPU information. Type and count of GPU to attach to the instance template. See
https://cloud.google.com/compute/docs/gpus for more details.
- type : the GPU type, e.g. nvidia-tesla-t4, nvidia-a100-80gb, nvidia-tesla-a100, etc.
- count : number of GPUs

If both 'var.gpu' and 'var.guest\_accelerator' are set, 'var.gpu' will be used. |
object({
type = string
count = number
})
| `null` | no | -| [guest\_accelerator](#input\_guest\_accelerator) | Alternative method of providing 'var.gpu' with a consistent naming scheme to
other HPC Toolkit modules.

If both 'var.gpu' and 'var.guest\_accelerator' are set, 'var.gpu' will be used. |
list(object({
type = string,
count = number
}))
| `null` | no | +| [guest\_accelerator](#input\_guest\_accelerator) | Alternative method of providing 'var.gpu' with a consistent naming scheme to
other HPC Toolkit modules.

If both 'var.gpu' and 'var.guest\_accelerator' are set, 'var.gpu' will be used. |
list(object({
type = string,
count = number
}))
| `[]` | no | | [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm login node VM instances.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-5-7-hpc-centos-7",
"project": "schedmd-slurm-public"
}
| no | | [instance\_template](#input\_instance\_template) | Self link to a custom instance template. If set, other VM definition
variables such as machine\_type and instance\_image will be ignored in favor
of the provided instance template.

For more information on creating custom images for the instance template
that comply with Slurm on GCP see the "Slurm on GCP Custom Images" section
in docs/vm-images.md. | `string` | `null` | no | | [labels](#input\_labels) | Labels, provided as a map. | `map(string)` | `{}` | no | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/gpu_definition.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/gpu_definition.tf index 197a97b6e6..78ae861fca 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/gpu_definition.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/gpu_definition.tf @@ -19,29 +19,37 @@ # machine_type locals { - - # Ensure guest_accelerator is a list if not set - input_guest_accelerator = var.guest_accelerator == null ? [] : var.guest_accelerator - - # If the machine type indicates a GPU is used, gather the count and type information - accelerator_types = { - "highgpu" = "nvidia-tesla-a100" - "megagpu" = "nvidia-tesla-a100" - "ultragpu" = "nvidia-a100-80gb" + # example state; terraform will ignore diffs if last element of URL matches + # guest_accelerator = [ + # { + # count = 1 + # type = "https://www.googleapis.com/compute/beta/projects/PROJECT/zones/ZONE/acceleratorTypes/nvidia-tesla-a100" + # }, + # ] + accelerator_machines = { + "a2-highgpu-1g" = { type = "nvidia-tesla-a100", count = 1 }, + "a2-highgpu-2g" = { type = "nvidia-tesla-a100", count = 2 }, + "a2-highgpu-4g" = { type = "nvidia-tesla-a100", count = 4 }, + "a2-highgpu-8g" = { type = "nvidia-tesla-a100", count = 8 }, + "a2-megagpu-16g" = { type = "nvidia-tesla-a100", count = 16 }, + "a2-ultragpu-1g" = { type = "nvidia-a100-80gb", count = 1 }, + "a2-ultragpu-2g" = { type = "nvidia-a100-80gb", count = 2 }, + "a2-ultragpu-4g" = { type = "nvidia-a100-80gb", count = 4 }, + "a2-ultragpu-8g" = { type = "nvidia-a100-80gb", count = 8 }, + "g2-standard-4" = { type = "nvidia-l4", count = 1 }, + "g2-standard-8" = { type = "nvidia-l4", count = 1 }, + "g2-standard-12" = { type = "nvidia-l4", count = 1 }, + "g2-standard-16" = { type = "nvidia-l4", count = 1 }, + "g2-standard-24" = { type = "nvidia-l4", count = 2 }, + "g2-standard-32" = { type = "nvidia-l4", count = 1 }, + "g2-standard-48" = { type = "nvidia-l4", count = 4 }, + "g2-standard-96" = { type = "nvidia-l4", count = 8 }, } - generated_guest_accelerator = try([{ - type = local.accelerator_types[regex("a2-([A-Za-z]+)-", var.machine_type)[0]], - count = one(regex("a2-[A-Za-z]+-(\\d+)", var.machine_type)), - }], []) - - # If the machine type is a valid a2 machine_type, generated_guest_accelerator - # will be populated. This also guarantees at least one populated list in coalescelist. - is_a2_vm = length(local.generated_guest_accelerator) > 0 + generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], []) - # Set the guest_accelerator to the user defined value if supplied, otherwise - # use the locally generated accelerator list. - guest_accelerator = local.is_a2_vm ? 
coalescelist( - local.input_guest_accelerator, - local.generated_guest_accelerator, - ) : local.input_guest_accelerator + # Select in priority order: + # (1) var.guest_accelerator if not empty + # (2) local.generated_guest_accelerator if not empty + # (3) default to empty list if both are empty + guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), []) } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/variables.tf index 007c9d666c..6db3e5f76d 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/variables.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/variables.tf @@ -181,7 +181,8 @@ variable "guest_accelerator" { type = string, count = number })) - default = null + default = [] + nullable = false } variable "service_account" { diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/versions.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/versions.tf index d195c77a28..8b7e9382bf 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/versions.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/versions.tf @@ -24,5 +24,5 @@ terraform { provider_meta "google" { module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-login/v1.20.0" } - required_version = ">= 0.14.0" + required_version = ">= 1.1" } diff --git a/modules/compute/vm-instance/README.md b/modules/compute/vm-instance/README.md index 409ffc46e7..cbaa73db46 100644 --- a/modules/compute/vm-instance/README.md +++ b/modules/compute/vm-instance/README.md @@ -160,7 +160,7 @@ limitations under the License. | [terraform](#requirement\_terraform) | >= 1.2.0 | | [google](#requirement\_google) | >= 4.73.0 | | [google-beta](#requirement\_google-beta) | >= 4.73.0 | -| [null](#requirement\_null) | >= 1.0 | +| [null](#requirement\_null) | >= 3.0 | ## Providers @@ -168,7 +168,7 @@ limitations under the License. |------|---------| | [google](#provider\_google) | >= 4.73.0 | | [google-beta](#provider\_google-beta) | >= 4.73.0 | -| [null](#provider\_null) | >= 1.0 | +| [null](#provider\_null) | >= 3.0 | ## Modules @@ -199,7 +199,7 @@ limitations under the License. | [disk\_size\_gb](#input\_disk\_size\_gb) | Size of disk for instances. | `number` | `200` | no | | [disk\_type](#input\_disk\_type) | Disk type for instances. | `string` | `"pd-standard"` | no | | [enable\_oslogin](#input\_enable\_oslogin) | Enable or Disable OS Login with "ENABLE" or "DISABLE". Set to "INHERIT" to inherit project OS Login setting. | `string` | `"ENABLE"` | no | -| [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = string,
count = number
}))
| `null` | no | +| [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = string,
count = number
}))
| `[]` | no | | [instance\_count](#input\_instance\_count) | Number of instances | `number` | `1` | no | | [instance\_image](#input\_instance\_image) | Instance Image |
object({
family = string,
project = string
})
|
{
"family": "hpc-centos-7",
"project": "cloud-hpc-image-public"
}
| no | | [labels](#input\_labels) | Labels to add to the instances. Key-value pairs. | `map(string)` | n/a | yes | diff --git a/modules/compute/vm-instance/gpu_definition.tf b/modules/compute/vm-instance/gpu_definition.tf index 197a97b6e6..78ae861fca 100644 --- a/modules/compute/vm-instance/gpu_definition.tf +++ b/modules/compute/vm-instance/gpu_definition.tf @@ -19,29 +19,37 @@ # machine_type locals { - - # Ensure guest_accelerator is a list if not set - input_guest_accelerator = var.guest_accelerator == null ? [] : var.guest_accelerator - - # If the machine type indicates a GPU is used, gather the count and type information - accelerator_types = { - "highgpu" = "nvidia-tesla-a100" - "megagpu" = "nvidia-tesla-a100" - "ultragpu" = "nvidia-a100-80gb" + # example state; terraform will ignore diffs if last element of URL matches + # guest_accelerator = [ + # { + # count = 1 + # type = "https://www.googleapis.com/compute/beta/projects/PROJECT/zones/ZONE/acceleratorTypes/nvidia-tesla-a100" + # }, + # ] + accelerator_machines = { + "a2-highgpu-1g" = { type = "nvidia-tesla-a100", count = 1 }, + "a2-highgpu-2g" = { type = "nvidia-tesla-a100", count = 2 }, + "a2-highgpu-4g" = { type = "nvidia-tesla-a100", count = 4 }, + "a2-highgpu-8g" = { type = "nvidia-tesla-a100", count = 8 }, + "a2-megagpu-16g" = { type = "nvidia-tesla-a100", count = 16 }, + "a2-ultragpu-1g" = { type = "nvidia-a100-80gb", count = 1 }, + "a2-ultragpu-2g" = { type = "nvidia-a100-80gb", count = 2 }, + "a2-ultragpu-4g" = { type = "nvidia-a100-80gb", count = 4 }, + "a2-ultragpu-8g" = { type = "nvidia-a100-80gb", count = 8 }, + "g2-standard-4" = { type = "nvidia-l4", count = 1 }, + "g2-standard-8" = { type = "nvidia-l4", count = 1 }, + "g2-standard-12" = { type = "nvidia-l4", count = 1 }, + "g2-standard-16" = { type = "nvidia-l4", count = 1 }, + "g2-standard-24" = { type = "nvidia-l4", count = 2 }, + "g2-standard-32" = { type = "nvidia-l4", count = 1 }, + "g2-standard-48" = { type = "nvidia-l4", count = 4 }, + "g2-standard-96" = { type = "nvidia-l4", count = 8 }, } - generated_guest_accelerator = try([{ - type = local.accelerator_types[regex("a2-([A-Za-z]+)-", var.machine_type)[0]], - count = one(regex("a2-[A-Za-z]+-(\\d+)", var.machine_type)), - }], []) - - # If the machine type is a valid a2 machine_type, generated_guest_accelerator - # will be populated. This also guarantees at least one populated list in coalescelist. - is_a2_vm = length(local.generated_guest_accelerator) > 0 + generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], []) - # Set the guest_accelerator to the user defined value if supplied, otherwise - # use the locally generated accelerator list. - guest_accelerator = local.is_a2_vm ? 
coalescelist( - local.input_guest_accelerator, - local.generated_guest_accelerator, - ) : local.input_guest_accelerator + # Select in priority order: + # (1) var.guest_accelerator if not empty + # (2) local.generated_guest_accelerator if not empty + # (3) default to empty list if both are empty + guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), []) } diff --git a/modules/compute/vm-instance/variables.tf b/modules/compute/vm-instance/variables.tf index df929c6337..595287ecb9 100644 --- a/modules/compute/vm-instance/variables.tf +++ b/modules/compute/vm-instance/variables.tf @@ -242,7 +242,8 @@ variable "guest_accelerator" { type = string, count = number })) - default = null + default = [] + nullable = false } variable "automatic_restart" { diff --git a/modules/compute/vm-instance/versions.tf b/modules/compute/vm-instance/versions.tf index 67b38e0ec7..47483d7c4c 100644 --- a/modules/compute/vm-instance/versions.tf +++ b/modules/compute/vm-instance/versions.tf @@ -26,7 +26,8 @@ terraform { version = ">= 4.73.0" } null = { - version = ">= 1.0" + source = "hashicorp/null" + version = ">= 3.0" } } provider_meta "google" { From 8db90e250f6a2f7ab9394788a984bdaab28d5aa5 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Mon, 24 Jul 2023 16:38:47 -0500 Subject: [PATCH 095/144] Deprecate var.gpu in Slurm v5 modules --- .../schedmd-slurm-gcp-v5-node-group/README.md | 4 +-- .../schedmd-slurm-gcp-v5-node-group/main.tf | 2 +- .../variables.tf | 29 +++++++++---------- .../schedmd-slurm-gcp-v5-controller/README.md | 4 +-- .../schedmd-slurm-gcp-v5-controller/main.tf | 2 +- .../variables.tf | 25 +++++++--------- .../schedmd-slurm-gcp-v5-login/README.md | 4 +-- .../schedmd-slurm-gcp-v5-login/main.tf | 2 +- .../schedmd-slurm-gcp-v5-login/variables.tf | 25 +++++++--------- 9 files changed, 44 insertions(+), 53 deletions(-) diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/README.md b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/README.md index 6509a15b28..b74145e450 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/README.md @@ -132,8 +132,8 @@ No modules. | [enable\_shielded\_vm](#input\_enable\_shielded\_vm) | Enable the Shielded VM configuration. Note: the instance image must support option. | `bool` | `false` | no | | [enable\_smt](#input\_enable\_smt) | Enables Simultaneous Multi-Threading (SMT) on instance. | `bool` | `false` | no | | [enable\_spot\_vm](#input\_enable\_spot\_vm) | Enable the partition to use spot VMs (https://cloud.google.com/spot-vms). | `bool` | `false` | no | -| [gpu](#input\_gpu) | GPU information. Type and count of GPU to attach to the instance template. See
https://cloud.google.com/compute/docs/gpus more details.
- type : the GPU type, e.g. nvidia-tesla-t4, nvidia-a100-80gb, nvidia-tesla-a100, etc
- count : number of GPUs

If both 'var.gpu' and 'var.guest\_accelerator' are set, 'var.gpu' will be used. |
object({
count = number,
type = string
})
| `null` | no | -| [guest\_accelerator](#input\_guest\_accelerator) | Alternative method of providing 'var.gpu' with a consistent naming scheme to
other HPC Toolkit modules.

If both 'var.gpu' and 'var.guest\_accelerator' are set, 'var.gpu' will be used. |
list(object({
type = string,
count = number
}))
| `[]` | no | +| [gpu](#input\_gpu) | DEPRECATED: use var.guest\_accelerator |
object({
type = string
count = number
})
| `null` | no | +| [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = string,
count = number
}))
| `[]` | no | | [instance\_image](#input\_instance\_image) | Defines the image that will be used in the node group VM instances.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-5-7-hpc-centos-7",
"project": "schedmd-slurm-public"
}
| no | | [instance\_template](#input\_instance\_template) | Self link to a custom instance template. If set, other VM definition
variables such as machine\_type and instance\_image will be ignored in favor
of the provided instance template.

For more information on creating custom images for the instance template
that comply with Slurm on GCP see the "Slurm on GCP Custom Images" section
in docs/vm-images.md. | `string` | `null` | no | | [labels](#input\_labels) | Labels to add to partition compute instances. Key-value pairs. | `map(string)` | `{}` | no | diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/main.tf b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/main.tf index 601f64df15..ae7c6b3dff 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/main.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/main.tf @@ -54,7 +54,7 @@ locals { enable_confidential_vm = var.enable_confidential_vm enable_oslogin = var.enable_oslogin enable_shielded_vm = var.enable_shielded_vm - gpu = var.gpu != null ? var.gpu : one(local.guest_accelerator) + gpu = one(local.guest_accelerator) labels = local.labels machine_type = var.machine_type metadata = var.metadata diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/variables.tf index e1fa801580..3070010996 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/variables.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/variables.tf @@ -248,34 +248,31 @@ variable "on_host_maintenance" { } variable "gpu" { - description = <<-EOD - GPU information. Type and count of GPU to attach to the instance template. See - https://cloud.google.com/compute/docs/gpus more details. - - type : the GPU type, e.g. nvidia-tesla-t4, nvidia-a100-80gb, nvidia-tesla-a100, etc - - count : number of GPUs - - If both 'var.gpu' and 'var.guest_accelerator' are set, 'var.gpu' will be used. - EOD type = object({ - count = number, type = string + count = number }) - default = null + description = "DEPRECATED: use var.guest_accelerator" + default = null + validation { + condition = var.gpu == null + error_message = "var.gpu is deprecated. Use var.guest_accelerator." + } } variable "guest_accelerator" { - description = <<-EOD - Alternative method of providing 'var.gpu' with a consistent naming scheme to - other HPC Toolkit modules. - - If both 'var.gpu' and 'var.guest_accelerator' are set, 'var.gpu' will be used. - EOD + description = "List of the type and count of accelerator cards attached to the instance." type = list(object({ type = string, count = number })) default = [] nullable = false + + validation { + condition = length(var.guest_accelerator) <= 1 + error_message = "The Slurm modules supports 0 or 1 models of accelerator card on each node." + } } variable "preemptible" { diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md index 33fd30c63d..d6387d7ef1 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md @@ -203,8 +203,8 @@ limitations under the License. | [enable\_reconfigure](#input\_enable\_reconfigure) | Enables automatic Slurm reconfiguration when Slurm configuration changes (e.g.
slurm.conf.tpl, partition details). Compute instances and resource policies
(e.g. placement groups) will be destroyed to align with new configuration.
NOTE: Requires Python and Google Pub/Sub API.
*WARNING*: Toggling this will impact the running workload. Deployed compute nodes
will be destroyed and their jobs will be requeued. | `bool` | `false` | no | | [enable\_shielded\_vm](#input\_enable\_shielded\_vm) | Enable the Shielded VM configuration. Note: the instance image must support option. | `bool` | `false` | no | | [epilog\_scripts](#input\_epilog\_scripts) | List of scripts to be used for Epilog. Programs for the slurmd to execute
on every node when a user's job completes.
See https://slurm.schedmd.com/slurm.conf.html#OPT_Epilog. |
list(object({
filename = string
content = string
}))
| `[]` | no | -| [gpu](#input\_gpu) | GPU information. Type and count of GPU to attach to the instance template. See
https://cloud.google.com/compute/docs/gpus more details.
- type : the GPU type, e.g. nvidia-tesla-t4, nvidia-a100-80gb, nvidia-tesla-a100, etc
- count : number of GPUs

If both 'var.gpu' and 'var.guest\_accelerator' are set, 'var.gpu' will be used. |
object({
type = string
count = number
})
| `null` | no | -| [guest\_accelerator](#input\_guest\_accelerator) | Alternative method of providing 'var.gpu' with a consistent naming scheme to
other HPC Toolkit modules.

If both 'var.gpu' and 'var.guest\_accelerator' are set, 'var.gpu' will be used. |
list(object({
type = string,
count = number
}))
| `[]` | no | +| [gpu](#input\_gpu) | DEPRECATED: use var.guest\_accelerator |
object({
type = string
count = number
})
| `null` | no | +| [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = string,
count = number
}))
| `[]` | no | | [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm controller VM instance.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-5-7-hpc-centos-7",
"project": "schedmd-slurm-public"
}
| no | | [instance\_template](#input\_instance\_template) | Self link to a custom instance template. If set, other VM definition
variables such as machine\_type and instance\_image will be ignored in favor
of the provided instance template.

For more information on creating custom images for the instance template
that comply with Slurm on GCP see the "Slurm on GCP Custom Images" section
in docs/vm-images.md. | `string` | `null` | no | | [labels](#input\_labels) | Labels, provided as a map. | `map(string)` | `{}` | no | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/main.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/main.tf index 73ef3e2453..5626dd6523 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/main.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/main.tf @@ -103,7 +103,7 @@ module "slurm_controller_template" { enable_confidential_vm = var.enable_confidential_vm enable_oslogin = var.enable_oslogin enable_shielded_vm = var.enable_shielded_vm - gpu = var.gpu != null ? var.gpu : one(local.guest_accelerator) + gpu = one(local.guest_accelerator) labels = local.labels machine_type = var.machine_type metadata = var.metadata diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/variables.tf index c024d9811f..8a7e93468c 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/variables.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/variables.tf @@ -290,30 +290,27 @@ variable "gpu" { type = string count = number }) - description = <<-EOD - GPU information. Type and count of GPU to attach to the instance template. See - https://cloud.google.com/compute/docs/gpus more details. - - type : the GPU type, e.g. nvidia-tesla-t4, nvidia-a100-80gb, nvidia-tesla-a100, etc - - count : number of GPUs - - If both 'var.gpu' and 'var.guest_accelerator' are set, 'var.gpu' will be used. - EOD + description = "DEPRECATED: use var.guest_accelerator" default = null + validation { + condition = var.gpu == null + error_message = "var.gpu is deprecated. Use var.guest_accelerator." + } } variable "guest_accelerator" { - description = <<-EOD - Alternative method of providing 'var.gpu' with a consistent naming scheme to - other HPC Toolkit modules. - - If both 'var.gpu' and 'var.guest_accelerator' are set, 'var.gpu' will be used. - EOD + description = "List of the type and count of accelerator cards attached to the instance." type = list(object({ type = string, count = number })) default = [] nullable = false + + validation { + condition = length(var.guest_accelerator) <= 1 + error_message = "The Slurm modules supports 0 or 1 models of accelerator card on each node." + } } variable "labels" { diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md index 13c0af6760..740d78a6d0 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md @@ -112,8 +112,8 @@ limitations under the License. | [enable\_confidential\_vm](#input\_enable\_confidential\_vm) | Enable the Confidential VM configuration. Note: the instance image must support option. | `bool` | `false` | no | | [enable\_oslogin](#input\_enable\_oslogin) | Enables Google Cloud os-login for user login and authentication for VMs.
See https://cloud.google.com/compute/docs/oslogin | `bool` | `true` | no | | [enable\_shielded\_vm](#input\_enable\_shielded\_vm) | Enable the Shielded VM configuration. Note: the instance image must support option. | `bool` | `false` | no | -| [gpu](#input\_gpu) | GPU information. Type and count of GPU to attach to the instance template. See
https://cloud.google.com/compute/docs/gpus more details.
- type : the GPU type, e.g. nvidia-tesla-t4, nvidia-a100-80gb, nvidia-tesla-a100, etc
- count : number of GPUs

If both 'var.gpu' and 'var.guest\_accelerator' are set, 'var.gpu' will be used. |
object({
type = string
count = number
})
| `null` | no | -| [guest\_accelerator](#input\_guest\_accelerator) | Alternative method of providing 'var.gpu' with a consistent naming scheme to
other HPC Toolkit modules.

If both 'var.gpu' and 'var.guest\_accelerator' are set, 'var.gpu' will be used. |
list(object({
type = string,
count = number
}))
| `[]` | no | +| [gpu](#input\_gpu) | DEPRECATED: use var.guest\_accelerator |
object({
type = string
count = number
})
| `null` | no | +| [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = string,
count = number
}))
| `[]` | no | | [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm login node VM instances.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-5-7-hpc-centos-7",
"project": "schedmd-slurm-public"
}
| no | | [instance\_template](#input\_instance\_template) | Self link to a custom instance template. If set, other VM definition
variables such as machine\_type and instance\_image will be ignored in favor
of the provided instance template.

For more information on creating custom images for the instance template
that comply with Slurm on GCP see the "Slurm on GCP Custom Images" section
in docs/vm-images.md. | `string` | `null` | no | | [labels](#input\_labels) | Labels, provided as a map. | `map(string)` | `{}` | no | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/main.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/main.tf index da2fb1b759..e33ce76350 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/main.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/main.tf @@ -63,7 +63,7 @@ module "slurm_login_template" { enable_confidential_vm = var.enable_confidential_vm enable_oslogin = var.enable_oslogin enable_shielded_vm = var.enable_shielded_vm - gpu = var.gpu != null ? var.gpu : one(local.guest_accelerator) + gpu = one(local.guest_accelerator) labels = local.labels machine_type = var.machine_type metadata = var.metadata diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/variables.tf index 6db3e5f76d..92b86b876c 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/variables.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/variables.tf @@ -159,30 +159,27 @@ variable "gpu" { type = string count = number }) - description = <<-EOD - GPU information. Type and count of GPU to attach to the instance template. See - https://cloud.google.com/compute/docs/gpus more details. - - type : the GPU type, e.g. nvidia-tesla-t4, nvidia-a100-80gb, nvidia-tesla-a100, etc - - count : number of GPUs - - If both 'var.gpu' and 'var.guest_accelerator' are set, 'var.gpu' will be used. - EOD + description = "DEPRECATED: use var.guest_accelerator" default = null + validation { + condition = var.gpu == null + error_message = "var.gpu is deprecated. Use var.guest_accelerator." + } } variable "guest_accelerator" { - description = <<-EOD - Alternative method of providing 'var.gpu' with a consistent naming scheme to - other HPC Toolkit modules. - - If both 'var.gpu' and 'var.guest_accelerator' are set, 'var.gpu' will be used. - EOD + description = "List of the type and count of accelerator cards attached to the instance." type = list(object({ type = string, count = number })) default = [] nullable = false + + validation { + condition = length(var.guest_accelerator) <= 1 + error_message = "The Slurm modules supports 0 or 1 models of accelerator card on each node." 
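# A minimal sketch of the deprecate-by-validation pattern used above
# (variable names here are illustrative, not taken from any module):
#
#   variable "legacy_opt" {
#     type        = string
#     description = "DEPRECATED: use var.new_opt"
#     default     = null
#     validation {
#       condition     = var.legacy_opt == null
#       error_message = "var.legacy_opt is deprecated. Use var.new_opt."
#     }
#   }
#
# Because the condition only passes while the variable keeps its null default,
# any attempt to set the deprecated variable fails at plan time with the error
# message, while existing configurations that never set it are unaffected.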
+ } } variable "service_account" { From cf3d1f31cc3c6de73bfc87122e9b00b9be030303 Mon Sep 17 00:00:00 2001 From: Rohit Ramu Date: Mon, 24 Jul 2023 17:18:30 -0700 Subject: [PATCH 096/144] Run the "label" check on more PR events (#1619) --- .github/workflows/pr-label-validation.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/pr-label-validation.yml b/.github/workflows/pr-label-validation.yml index 2c6d6a47fd..4c4e3fac67 100644 --- a/.github/workflows/pr-label-validation.yml +++ b/.github/workflows/pr-label-validation.yml @@ -23,6 +23,10 @@ on: - opened - labeled - unlabeled + - synchronize + - edited + - ready_for_review + - unlocked branches: - develop From 98053ae5509ca0c7e817e732254d02e46d6ca8d5 Mon Sep 17 00:00:00 2001 From: Rohit Ramu Date: Tue, 25 Jul 2023 09:32:14 -0700 Subject: [PATCH 097/144] Update DDN EXAscaler to 6.2 (#1606) * Update DDN EXAscaler to 6.2 * Update defaults in DDN-EXAscaler module --- community/modules/file-system/DDN-EXAScaler/README.md | 4 ++-- community/modules/file-system/DDN-EXAScaler/main.tf | 2 +- community/modules/file-system/DDN-EXAScaler/variables.tf | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/community/modules/file-system/DDN-EXAScaler/README.md b/community/modules/file-system/DDN-EXAScaler/README.md index 5a7b96f037..3a981c562f 100644 --- a/community/modules/file-system/DDN-EXAScaler/README.md +++ b/community/modules/file-system/DDN-EXAScaler/README.md @@ -102,7 +102,7 @@ No providers. | Name | Source | Version | |------|--------|---------| -| [ddn\_exascaler](#module\_ddn\_exascaler) | github.com/DDNStorage/exascaler-cloud-terraform//gcp | 78deadb | +| [ddn\_exascaler](#module\_ddn\_exascaler) | github.com/DDNStorage/exascaler-cloud-terraform//gcp | e33f439a | ## Resources @@ -113,7 +113,7 @@ No resources. | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| | [boot](#input\_boot) | Boot disk properties |
object({
disk_type = string
auto_delete = bool
script_url = string
})
|
{
"auto_delete": true,
"disk_type": "pd-standard",
"script_url": null
}
| no | -| [cls](#input\_cls) | Compute client properties |
object({
node_type = string
node_cpu = string
nic_type = string
node_count = number
public_ip = bool
})
|
{
"nic_type": "GVNIC",
"node_count": 0,
"node_cpu": "Intel Cascade Lake",
"node_type": "n2-standard-2",
"public_ip": true
}
| no | +| [cls](#input\_cls) | Compute client properties |
object({
node_type = string
node_cpu = string
nic_type = string
node_count = number
public_ip = bool
})
|
{
"nic_type": "GVNIC",
"node_count": 1,
"node_cpu": "Intel Cascade Lake",
"node_type": "n2-standard-2",
"public_ip": true
}
| no | | [clt](#input\_clt) | Compute client target properties |
object({
disk_bus = string
disk_type = string
disk_size = number
disk_count = number
})
|
{
"disk_bus": "SCSI",
"disk_count": 0,
"disk_size": 256,
"disk_type": "pd-standard"
}
| no | | [fsname](#input\_fsname) | EXAScaler filesystem name, only alphanumeric characters are allowed, and the value must be 1-8 characters long | `string` | `"exacloud"` | no | | [image](#input\_image) | Source image properties | `any` |
{
"family": "exascaler-cloud-6-1-centos",
"project": "ddn-public"
}
| no | diff --git a/community/modules/file-system/DDN-EXAScaler/main.tf b/community/modules/file-system/DDN-EXAScaler/main.tf index 5fd4bc0b9a..385c1b058c 100644 --- a/community/modules/file-system/DDN-EXAScaler/main.tf +++ b/community/modules/file-system/DDN-EXAScaler/main.tf @@ -42,7 +42,7 @@ locals { } module "ddn_exascaler" { - source = "github.com/DDNStorage/exascaler-cloud-terraform//gcp?ref=78deadb" + source = "github.com/DDNStorage/exascaler-cloud-terraform//gcp?ref=e33f439a" fsname = var.fsname zone = var.zone project = var.project_id diff --git a/community/modules/file-system/DDN-EXAScaler/variables.tf b/community/modules/file-system/DDN-EXAScaler/variables.tf index d5c937e008..d3933c5e14 100644 --- a/community/modules/file-system/DDN-EXAScaler/variables.tf +++ b/community/modules/file-system/DDN-EXAScaler/variables.tf @@ -425,7 +425,7 @@ variable "cls" { node_cpu = "Intel Cascade Lake" nic_type = "GVNIC" public_ip = true - node_count = 0 + node_count = 1 } } # Compute client target properties From 9a0b4b0803cf57d65470f2d940e261ca31f59916 Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Thu, 27 Jul 2023 14:55:59 -0500 Subject: [PATCH 098/144] Remove lingering usage of htcondor-configure in favor of htcondor-setup --- .../scheduler/htcondor-setup/README.md | 136 +++--------------- .../scripts/htcondor-install/README.md | 52 ++++--- docs/gpu-support.md | 2 +- docs/vm-images.md | 2 +- 4 files changed, 49 insertions(+), 143 deletions(-) diff --git a/community/modules/scheduler/htcondor-setup/README.md b/community/modules/scheduler/htcondor-setup/README.md index f59e7f3f8b..deb85ced37 100644 --- a/community/modules/scheduler/htcondor-setup/README.md +++ b/community/modules/scheduler/htcondor-setup/README.md @@ -21,7 +21,6 @@ It is expected to be used with the [htcondor-install] and [hpcvmimage]: https://cloud.google.com/compute/docs/instances/create-hpc-vm [htcondor-install]: ../../scripts/htcondor-setup/README.md [htcondor-execute-point]: ../../compute/htcondor-execute-point/README.md - [htcrole]: https://htcondor.readthedocs.io/en/latest/getting-htcondor/admin-quick-start.html#what-get-htcondor-does-to-configure-a-role ### Example @@ -39,140 +38,39 @@ example can be found in the [examples README][htc-example]. 
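For orientation, the blueprint `use:` keyword in examples like the one below
roughly corresponds to passing one module's outputs into another module's
inputs in plain Terraform. The sketch keeps only that idea: the module paths
are assumptions, and the single input name `subnetwork_self_link` is borrowed
from the wiring shown in this example.

```hcl
# Hypothetical Terraform-level equivalent of "use: [network1]" (paths assumed).
module "network1" {
  source = "./modules/network/vpc"
}

module "htcondor_setup" {
  source               = "./modules/scheduler/htcondor-setup"
  subnetwork_self_link = module.network1.subnetwork_self_link
}
```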
- id: htcondor_install source: community/modules/scripts/htcondor-install -- id: htcondor_configure +- id: htcondor_setup source: community/modules/scheduler/htcondor-setup use: - network1 -- id: htcondor_central_manager_startup - source: modules/scripts/startup-script - settings: - runners: - - $(htcondor_install.install_htcondor_runner) - - $(htcondor_configure.central_manager_runner) +- id: htcondor_secrets + source: community/modules/scheduler/htcondor-pool-secrets + use: + - htcondor_setup - id: htcondor_cm - source: modules/compute/vm-instance + source: community/modules/scheduler/htcondor-central-manager use: - network1 - - htcondor_central_manager_startup + - htcondor_secrets + - htcondor_setup settings: - name_prefix: cm0 - machine_type: c2-standard-4 - disable_public_ips: true - service_account: - email: $(htcondor_configure.central_manager_service_account) - scopes: - - cloud-platform - network_interfaces: - - network: null - subnetwork: $(network1.subnetwork_self_link) - subnetwork_project: $(vars.project_id) - network_ip: $(htcondor_configure.central_manager_internal_ip) - stack_type: null - access_config: [] - ipv6_access_config: [] - alias_ip_range: [] - nic_type: VIRTIO_NET - queue_count: null + instance_image: + project: $(vars.project_id) + family: $(vars.new_image_family) outputs: - - internal_ip + - central_manager_name ``` ## High Availability This module supports high availability modes of the HTCondor Central Manager and -of the Access Points. In these modes, the services can be resiliant against -zonal failures by distributing the services across two zones. Modify the above -example by setting `central_manager_high_availability` to `true` and adding a -new deployment variable `zone_secondary` set to another zone in the same region. -The 2 VMs can use the same startup script, but should differ by setting: - -- primary and secondary zones defined in deployment variables -- primary and secondary IP addresses created by this module -- differing name prefixes - -```yaml -vars: - # add typical settings (deployment_name, project_id, etc.) 
- # select a region and 2 different zones within the region - region: us-central1 - zone: us-central1-c - zone_secondary: us-central1-f - -- id: htcondor_configure - source: community/modules/scheduler/htcondor-setup - use: - - network1 - settings: - central_manager_high_availability: true - -- id: htcondor_cm_primary - source: modules/compute/vm-instance - use: - - network1 - - htcondor_central_manager_startup - settings: - name_prefix: cm0 - machine_type: c2-standard-4 - disable_public_ips: true - service_account: - email: $(htcondor_configure.central_manager_service_account) - scopes: - - cloud-platform - network_interfaces: - - network: null - subnetwork: $(network1.subnetwork_self_link) - subnetwork_project: $(vars.project_id) - network_ip: $(htcondor_configure.central_manager_internal_ip) - stack_type: null - access_config: [] - ipv6_access_config: [] - alias_ip_range: [] - nic_type: VIRTIO_NET - queue_count: null - outputs: - - internal_ip - -- id: htcondor_cm_secondary - source: modules/compute/vm-instance - use: - - network1 - - htcondor_central_manager_startup - settings: - name_prefix: cm1 - machine_type: c2-standard-4 - zone: $(vars.zone_secondary) - disable_public_ips: true - service_account: - email: $(htcondor_configure.central_manager_service_account) - scopes: - - cloud-platform - network_interfaces: - - network: null - subnetwork: $(network1.subnetwork_self_link) - subnetwork_project: $(vars.project_id) - network_ip: $(htcondor_configure.central_manager_secondary_internal_ip) - stack_type: null - access_config: [] - ipv6_access_config: [] - alias_ip_range: [] - nic_type: VIRTIO_NET - queue_count: null - outputs: - - internal_ip - -``` - -Access Point high availability is impacted by known issues [HTCONDOR-1590] and -[HTCONDOR-1594]. These are anticipated to be resolved in LTS release 10.0.3 and -above or feature release 10.4 and above. Please see [HTCondor version -numbering][htcver] and [release notes][htcnotes] for details. +of the Access Points via [Managed Instance Groups (MIG)][mig]. Please see +[htcondor-central-manager] and [htcondor-access-point] for details. -[htcver]: https://htcondor.readthedocs.io/en/latest/version-history/introduction-version-history.html#types-of-releases -[htcnotes]: https://htcondor.readthedocs.io/en/latest/version-history/index.html -[HTCONDOR-1590]: https://opensciencegrid.atlassian.net/jira/software/c/projects/HTCONDOR/issues/HTCONDOR-1590 -[HTCONDOR-1594]: https://opensciencegrid.atlassian.net/jira/software/c/projects/HTCONDOR/issues/HTCONDOR-1594 +[mig]: https://cloud.google.com/compute/docs/instance-groups +[htcondor-central-manager]: ../htcondor-central-manager/README.md +[htcondor-access-point]: ../htcondor-access-point/README.md ## Support diff --git a/community/modules/scripts/htcondor-install/README.md b/community/modules/scripts/htcondor-install/README.md index f94dc93ce5..d621ec082b 100644 --- a/community/modules/scripts/htcondor-install/README.md +++ b/community/modules/scripts/htcondor-install/README.md @@ -17,35 +17,43 @@ It is expected to be used with the [htcondor-setup] and [htcondor-execute-point] modules. 
[hpcvmimage]: https://cloud.google.com/compute/docs/instances/create-hpc-vm -[htcondor-setup]: ../../scheduler/htcondor-configure/README.md +[htcondor-setup]: ../../scheduler/htcondor-setup/README.md [htcondor-execute-point]: ../../compute/htcondor-execute-point/README.md ### Example The following code snippet uses this module to create startup scripts that -install the HTCondor software and adds custom configurations using -[htcondor-setup] and [htcondor-execute-point]. +install the HTCondor software into a custom VM image. ```yaml -- id: htcondor_install - source: community/modules/scripts/htcondor-install - -- id: htcondor_startup_central_manager - source: modules/scripts/startup-script - settings: - runners: - - $(htcondor_install.install_htcondor_runner) - - $(htcondor_configure.central_manager_runner) - -- id: htcondor_startup_access_point - source: modules/scripts/startup-script - settings: - runners: - - $(htcondor_install.install_htcondor_runner) - - $(htcondor_install.install_autoscaler_deps_runner) - - $(htcondor_install.install_autoscaler_runner) - - $(htcondor_configure.access_point_runner) - - $(htcondor_execute_point.configure_autoscaler_runner) +deployment_groups: +- group: primary + modules: + - id: network1 + source: modules/network/vpc + outputs: + - network_name + + - id: htcondor_install + source: community/modules/scripts/htcondor-install + + - id: htcondor_install_script + source: modules/scripts/startup-script + use: + - htcondor_install + +- group: packer + modules: + - id: custom-image + source: modules/packer/custom-image + kind: packer + use: + - network1 + - htcondor_install_script + settings: + disk_size: 50 + source_image_family: hpc-rocky-linux-8 + image_family: "htcondor-10x" ``` A full example can be found in the [examples README][htc-example]. diff --git a/docs/gpu-support.md b/docs/gpu-support.md index 4d677524bb..b5542e2359 100644 --- a/docs/gpu-support.md +++ b/docs/gpu-support.md @@ -47,7 +47,7 @@ cannot be determined automatically like with `a2`. 
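The machine-type lookup that replaced the earlier regex parsing in
`gpu_definition.tf` can be read as a small, self-contained pattern. The sketch
below trims the table to two entries but keeps the real selection expression;
the variable declarations are minimal stand-ins for the modules' actual inputs.

```hcl
variable "machine_type" { type = string }

variable "guest_accelerator" {
  type     = list(object({ type = string, count = number }))
  default  = []
  nullable = false
}

locals {
  # Trimmed sketch of the machine-type -> accelerator lookup table.
  accelerator_machines = {
    "a2-highgpu-1g" = { type = "nvidia-tesla-a100", count = 1 }
    "g2-standard-4" = { type = "nvidia-l4", count = 1 }
  }

  # try() turns a failed map lookup into an empty list instead of an error.
  generated_guest_accelerator = try([local.accelerator_machines[var.machine_type]], [])

  # coalescelist() returns the first non-empty list, but errors when every
  # argument is empty; the outer try() falls back to [] in that case.
  guest_accelerator = try(coalescelist(var.guest_accelerator, local.generated_guest_accelerator), [])
}
```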
[login]: https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/main/community/modules/scheduler/schedmd-slurm-gcp-v5-login [omnia-install]: https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/main/community/modules/scripts/omnia-install [htcondor-install]: https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/main/community/modules/scripts/htcondor-install -[htcondor-setup]: https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/main/community/modules/scheduler/htcondor-configure +[htcondor-setup]: https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/main/community/modules/scheduler/htcondor-setup [htcondor-execute-point]: https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/main/community/modules/compute/htcondor-execute-point ## Troubleshooting and tips diff --git a/docs/vm-images.md b/docs/vm-images.md index 813c4b744f..5f6824f107 100644 --- a/docs/vm-images.md +++ b/docs/vm-images.md @@ -213,7 +213,7 @@ These instructions apply to the following modules: [schedmd-slurm-gcp-v5-node-group]: ../community/modules/compute/schedmd-slurm-gcp-v5-node-group [batch-job]: ../modules/scheduler/batch-job-template [batch-login]: ../modules/scheduler/batch-login-node -[htcondor-configure]: ../community/modules/scheduler/htcondor-configure +[htcondor-setup]: ../community/modules/scheduler/htcondor-setup [omnia-install]: ../community/modules/scripts/omnia-install [hpc-slurm-ubuntu2004.yaml]: ../community/examples/hpc-slurm-ubuntu2004.yaml From 9d80ea6149a3948a935232d5e4e7f72d28079b4d Mon Sep 17 00:00:00 2001 From: Rohit Ramu Date: Thu, 27 Jul 2023 15:34:34 -0700 Subject: [PATCH 099/144] Update DDN-EXAscaler module reference (#1623) --- community/modules/file-system/DDN-EXAScaler/README.md | 4 ++-- community/modules/file-system/DDN-EXAScaler/main.tf | 2 +- community/modules/file-system/DDN-EXAScaler/variables.tf | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/community/modules/file-system/DDN-EXAScaler/README.md b/community/modules/file-system/DDN-EXAScaler/README.md index 3a981c562f..efb04846c7 100644 --- a/community/modules/file-system/DDN-EXAScaler/README.md +++ b/community/modules/file-system/DDN-EXAScaler/README.md @@ -102,7 +102,7 @@ No providers. | Name | Source | Version | |------|--------|---------| -| [ddn\_exascaler](#module\_ddn\_exascaler) | github.com/DDNStorage/exascaler-cloud-terraform//gcp | e33f439a | +| [ddn\_exascaler](#module\_ddn\_exascaler) | github.com/DDNStorage/exascaler-cloud-terraform//gcp | a3355d50deebe45c0556b45bd599059b7c06988d | ## Resources @@ -113,7 +113,7 @@ No resources. | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| | [boot](#input\_boot) | Boot disk properties |
object({
disk_type = string
auto_delete = bool
script_url = string
})
|
{
"auto_delete": true,
"disk_type": "pd-standard",
"script_url": null
}
| no | -| [cls](#input\_cls) | Compute client properties |
object({
node_type = string
node_cpu = string
nic_type = string
node_count = number
public_ip = bool
})
|
{
"nic_type": "GVNIC",
"node_count": 1,
"node_cpu": "Intel Cascade Lake",
"node_type": "n2-standard-2",
"public_ip": true
}
| no | +| [cls](#input\_cls) | Compute client properties |
object({
node_type = string
node_cpu = string
nic_type = string
node_count = number
public_ip = bool
})
|
{
"nic_type": "GVNIC",
"node_count": 0,
"node_cpu": "Intel Cascade Lake",
"node_type": "n2-standard-2",
"public_ip": true
}
| no | | [clt](#input\_clt) | Compute client target properties |
object({
disk_bus = string
disk_type = string
disk_size = number
disk_count = number
})
|
{
"disk_bus": "SCSI",
"disk_count": 0,
"disk_size": 256,
"disk_type": "pd-standard"
}
| no | | [fsname](#input\_fsname) | EXAScaler filesystem name, only alphanumeric characters are allowed, and the value must be 1-8 characters long | `string` | `"exacloud"` | no | | [image](#input\_image) | Source image properties | `any` |
{
"family": "exascaler-cloud-6-1-centos",
"project": "ddn-public"
}
| no | diff --git a/community/modules/file-system/DDN-EXAScaler/main.tf b/community/modules/file-system/DDN-EXAScaler/main.tf index 385c1b058c..9d6e261e63 100644 --- a/community/modules/file-system/DDN-EXAScaler/main.tf +++ b/community/modules/file-system/DDN-EXAScaler/main.tf @@ -42,7 +42,7 @@ locals { } module "ddn_exascaler" { - source = "github.com/DDNStorage/exascaler-cloud-terraform//gcp?ref=e33f439a" + source = "github.com/DDNStorage/exascaler-cloud-terraform//gcp?ref=a3355d50deebe45c0556b45bd599059b7c06988d" fsname = var.fsname zone = var.zone project = var.project_id diff --git a/community/modules/file-system/DDN-EXAScaler/variables.tf b/community/modules/file-system/DDN-EXAScaler/variables.tf index d3933c5e14..d5c937e008 100644 --- a/community/modules/file-system/DDN-EXAScaler/variables.tf +++ b/community/modules/file-system/DDN-EXAScaler/variables.tf @@ -425,7 +425,7 @@ variable "cls" { node_cpu = "Intel Cascade Lake" nic_type = "GVNIC" public_ip = true - node_count = 1 + node_count = 0 } } # Compute client target properties From 3d54302a3a1897dfbc3de9cc6e294ccc6afbbfcd Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Thu, 27 Jul 2023 17:22:44 -0500 Subject: [PATCH 100/144] Support N>2 groups of HTCondor execute points The solution currently only supports 2 groups of execute points because the naming convention supports a fixed name with "spot" optionally added. This commit modifies the approach to require user specification of a group name that is unique to each group of execute points. There cannot be a random component to the name otherwise a for_each will fail in the htcondor-access-point module; future work may avoid need for user to explicitly supply this value. --- community/examples/htc-htcondor.yaml | 2 ++ .../compute/htcondor-execute-point/README.md | 27 ++++++++++--------- .../compute/htcondor-execute-point/main.tf | 11 ++++---- .../htcondor-execute-point/variables.tf | 10 +++++++ .../scheduler/htcondor-access-point/main.tf | 2 -- .../templates/condor_config.tftpl | 8 +++--- .../htcondor-install/files/autoscaler.py | 2 +- 7 files changed, 38 insertions(+), 24 deletions(-) diff --git a/community/examples/htc-htcondor.yaml b/community/examples/htc-htcondor.yaml index 301788c10b..397ba3f600 100644 --- a/community/examples/htc-htcondor.yaml +++ b/community/examples/htc-htcondor.yaml @@ -91,6 +91,7 @@ deployment_groups: - htcondor_setup - htcondor_cm settings: + name_prefix: grp1 instance_image: project: $(vars.project_id) family: $(vars.new_image_family) @@ -104,6 +105,7 @@ deployment_groups: - htcondor_setup - htcondor_cm settings: + name_prefix: spot instance_image: project: $(vars.project_id) family: $(vars.new_image_family) diff --git a/community/modules/compute/htcondor-execute-point/README.md b/community/modules/compute/htcondor-execute-point/README.md index f880d45cf5..463904e376 100644 --- a/community/modules/compute/htcondor-execute-point/README.md +++ b/community/modules/compute/htcondor-execute-point/README.md @@ -14,20 +14,22 @@ modules. ### Known limitations -This module may be used exactly 1 or 2 times in a blueprint to create sets of -execute points in an HTCondor pool. If using 1 set, it may use either Spot or -On-demand pricing. If using 2 sets, one must use Spot and the other must -use On-demand pricing. If you do not follow this constraint, you will likely -receive an error while running `terraform apply` similar to that shown below. -Future development is planned to support more than 2 sets of VM configurations, -including all pricing options. 
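The uniqueness rule described below exists because downstream modules
enumerate execute-point groups with `for_each`, and `for_each` keys must be
distinct and known at plan time. A contrived sketch of that enumeration,
reusing the `grp1`/`spot` names from the example blueprint (the resource used
here is only a placeholder for the per-group resources the real modules
create):

```hcl
locals {
  # Each key stands in for one group's name_prefix; duplicated or
  # randomly generated keys would make this for_each fail.
  execute_point_groups = {
    grp1 = { spot = false }
    spot = { spot = true }
  }
}

resource "null_resource" "per_group" {
  for_each = local.execute_point_groups
  triggers = { is_spot = tostring(each.value.spot) }
}
```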
+This module may be used multiple times in a blueprint to create sets of +execute points in an HTCondor pool. If used more than 1 time, the setting +[name_prefix](#input_name_prefix) must be set to a value that is unique across +all uses of the htcondor-execute-point module. If you do not follow this +constraint, you will likely receive an error while running `terraform apply` +similar to that shown below. ```text -│ │ var.runners is list of map of string with 7 elements -│ -│ All startup-script runners must have a unique destination. -│ -│ This was checked by the validation rule at modules/startup-script/variables.tf:72,3-13. +Error: Invalid value for variable + + on modules/embedded/community/modules/scheduler/htcondor-access-point/main.tf line 136, in module "startup_script": + 136: runners = local.all_runners + ├──────────────── + │ var.runners is list of map of string with 5 elements + +All startup-script runners must have a unique destination. ``` ### How to configure jobs to select execute points @@ -203,6 +205,7 @@ limitations under the License. | [max\_size](#input\_max\_size) | Maximum size of the HTCondor execute point pool. | `number` | `100` | no | | [metadata](#input\_metadata) | Metadata to add to HTCondor execute points | `map(string)` | `{}` | no | | [min\_idle](#input\_min\_idle) | Minimum number of idle VMs in the HTCondor pool (if pool reaches var.max\_size, this minimum is not guaranteed); set to ensure jobs beginning run more quickly. | `number` | `0` | no | +| [name\_prefix](#input\_name\_prefix) | Name prefix given to hostnames in this group of execute points; must be unique across all instances of this module | `string` | n/a | yes | | [network\_self\_link](#input\_network\_self\_link) | The self link of the network HTCondor execute points will join | `string` | `"default"` | no | | [network\_storage](#input\_network\_storage) | An array of network attached storage mounts to be configured |
list(object({
server_ip = string,
remote_mount = string,
local_mount = string,
fs_type = string,
mount_options = string,
client_install_runner = map(string)
mount_runner = map(string)
}))
| `[]` | no | | [project\_id](#input\_project\_id) | Project in which the HTCondor execute points will be created | `string` | n/a | yes | diff --git a/community/modules/compute/htcondor-execute-point/main.tf b/community/modules/compute/htcondor-execute-point/main.tf index 806955df56..677104c974 100644 --- a/community/modules/compute/htcondor-execute-point/main.tf +++ b/community/modules/compute/htcondor-execute-point/main.tf @@ -76,7 +76,7 @@ locals { } ) - hostnames = var.spot ? "${var.deployment_name}-spot-ep" : "${var.deployment_name}-ep" + name_prefix = "${var.deployment_name}-${var.name_prefix}-ep" } data "google_compute_image" "htcondor" { @@ -97,7 +97,7 @@ data "google_compute_zones" "available" { } resource "google_storage_bucket_object" "execute_config" { - name = "${local.hostnames}-config-${substr(md5(local.execute_config), 0, 4)}" + name = "${local.name_prefix}-config-${substr(md5(local.execute_config), 0, 4)}" content = local.execute_config bucket = var.htcondor_bucket_name } @@ -117,7 +117,7 @@ module "execute_point_instance_template" { source = "terraform-google-modules/vm/google//modules/instance_template" version = "~> 8.0" - name_prefix = local.hostnames + name_prefix = local.name_prefix project_id = var.project_id network = var.network_self_link subnetwork = var.subnetwork_self_link @@ -143,10 +143,11 @@ module "mig" { region = var.region distribution_policy_zones = local.zones target_size = var.target_size - hostname = local.hostnames + hostname = local.name_prefix + mig_name = local.name_prefix instance_template = module.execute_point_instance_template.self_link - health_check_name = "health-htcondor-${local.hostnames}" + health_check_name = "health-htcondor-${local.name_prefix}" health_check = { type = "tcp" initial_delay_sec = 600 diff --git a/community/modules/compute/htcondor-execute-point/variables.tf b/community/modules/compute/htcondor-execute-point/variables.tf index 7df7044d95..fcb1629f94 100644 --- a/community/modules/compute/htcondor-execute-point/variables.tf +++ b/community/modules/compute/htcondor-execute-point/variables.tf @@ -185,3 +185,13 @@ variable "guest_accelerator" { error_message = "The HTCondor module supports 0 or 1 models of accelerator card on each execute point" } } + +variable "name_prefix" { + description = "Name prefix given to hostnames in this group of execute points; must be unique across all instances of this module" + type = string + nullable = false + validation { + condition = length(var.name_prefix) > 0 + error_message = "var.name_prefix must be a set to a non-empty string and must also be unique across all instances of htcondor-execute-point" + } +} diff --git a/community/modules/scheduler/htcondor-access-point/main.tf b/community/modules/scheduler/htcondor-access-point/main.tf index fa27ae662b..1df3bc4ad0 100644 --- a/community/modules/scheduler/htcondor-access-point/main.tf +++ b/community/modules/scheduler/htcondor-access-point/main.tf @@ -43,8 +43,6 @@ locals { log = log.$(ClusterId).$(ProcId) request_cpus = 1 request_memory = 100MB - # if unset, defaults to false - +RequireSpot = true queue EOT } diff --git a/community/modules/scheduler/htcondor-access-point/templates/condor_config.tftpl b/community/modules/scheduler/htcondor-access-point/templates/condor_config.tftpl index 5641f79add..7f40b5cb14 100644 --- a/community/modules/scheduler/htcondor-access-point/templates/condor_config.tftpl +++ b/community/modules/scheduler/htcondor-access-point/templates/condor_config.tftpl @@ -38,8 +38,8 @@ SCHEDD_CRON_cloud_PREFIX = Cloud # 
aid the user by automatically using RequireSpot in their Requirements, unless # the user has explicitly used CloudInterruptible -JOB_TRANSFORM_NAMES = $(JOB_TRANSFORM_NAMES) SPOT_REQS -JOB_TRANSFORM_SPOT_REQS @=end +JOB_TRANSFORM_NAMES = $(JOB_TRANSFORM_NAMES) SPOT +JOB_TRANSFORM_SPOT @=end REQUIREMENTS ! isUndefined(RequireSpot) && ! unresolved(Requirements, "^CloudInterruptible$") SET Requirements ($(MY.Requirements)) && (CloudInterruptible is My.RequireSpot) @end @@ -62,8 +62,8 @@ SUBMIT_REQUIREMENT_NAMES = $(SUBMIT_REQUIREMENT_NAMES) MIGID SUBMIT_REQUIREMENT_MIGID = !isUndefined(RequireId) && member(RequireId, $(MIG_ID_LIST)) SUBMIT_REQUIREMENT_MIGID_REASON = strcat("Jobs must set +RequireId to one of following values surrounded by quotation marks:\n", $(MIG_IDS)) -JOB_TRANSFORM_NAMES = $(JOB_TRANSFORM_NAMES) SPOT_REQS -JOB_TRANSFORM_SPOT_REQS @=end +JOB_TRANSFORM_NAMES = $(JOB_TRANSFORM_NAMES) MIGID +JOB_TRANSFORM_MIGID @=end REQUIREMENTS ! isUndefined(RequireId) && ! unresolved(Requirements, "^CloudCreatedBy$") SET Requirements ($(MY.Requirements)) && regexp(strcat("/", My.RequireId, "$"), CloudCreatedBy) @end diff --git a/community/modules/scripts/htcondor-install/files/autoscaler.py b/community/modules/scripts/htcondor-install/files/autoscaler.py index 462b45e52d..cc0e3dd1ae 100644 --- a/community/modules/scripts/htcondor-install/files/autoscaler.py +++ b/community/modules/scripts/htcondor-install/files/autoscaler.py @@ -208,7 +208,7 @@ def scale(self): # this query will constrain the search for jobs to those that either # require spot VMs or do not require Spot VMs based on whether the # VM instance template is configured for Spot pricing - spot_query = classad.ExprTree(f"RequireSpot == {self.is_spot}") + spot_query = classad.ExprTree(f"RequireId == \"{self.instance_group_manager}\"") # For purpose of scaling a Managed Instance Group, count only jobs that # are idle and likely participated in a negotiation cycle (there does From 6046163cca33918b86916361399d16c01ad43e9f Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Thu, 27 Jul 2023 20:26:34 -0500 Subject: [PATCH 101/144] Increase default Packer VM scopes to cloud-platform --- modules/packer/custom-image/README.md | 2 +- modules/packer/custom-image/variables.pkr.hcl | 5 +---- .../expectations/igc_pkr/one/image/variables.pkr.hcl | 5 +---- .../expectations/text_escape/zero/lime/variables.pkr.hcl | 5 +---- 4 files changed, 4 insertions(+), 13 deletions(-) diff --git a/modules/packer/custom-image/README.md b/modules/packer/custom-image/README.md index 2d9fa099bf..dbaf1323f7 100644 --- a/modules/packer/custom-image/README.md +++ b/modules/packer/custom-image/README.md @@ -256,7 +256,7 @@ No resources. | [omit\_external\_ip](#input\_omit\_external\_ip) | Provision the image building VM without a public IP address | `bool` | `true` | no | | [on\_host\_maintenance](#input\_on\_host\_maintenance) | Describes maintenance behavior for the instance. If left blank this will default to `MIGRATE` except the use of GPUs requires it to be `TERMINATE` | `string` | `null` | no | | [project\_id](#input\_project\_id) | Project in which to create VM and image | `string` | n/a | yes | -| [scopes](#input\_scopes) | Service account scopes to attach to the instance. See
https://cloud.google.com/compute/docs/access/service-accounts. | `list(string)` |
[
"https://www.googleapis.com/auth/userinfo.email",
"https://www.googleapis.com/auth/compute",
"https://www.googleapis.com/auth/devstorage.full_control",
"https://www.googleapis.com/auth/logging.write"
]
| no | +| [scopes](#input\_scopes) | Service account scopes to attach to the instance. See
https://cloud.google.com/compute/docs/access/service-accounts. | `list(string)` |
[
"https://www.googleapis.com/auth/cloud-platform"
]
| no | | [service\_account\_email](#input\_service\_account\_email) | The service account email to use. If null or 'default', then the default Compute Engine service account will be used. | `string` | `null` | no | | [shell\_scripts](#input\_shell\_scripts) | A list of paths to local shell scripts which will be uploaded to customize the VM image | `list(string)` | `[]` | no | | [source\_image](#input\_source\_image) | Source OS image to build from | `string` | `null` | no | diff --git a/modules/packer/custom-image/variables.pkr.hcl b/modules/packer/custom-image/variables.pkr.hcl index 9589669153..ed4350387c 100644 --- a/modules/packer/custom-image/variables.pkr.hcl +++ b/modules/packer/custom-image/variables.pkr.hcl @@ -115,10 +115,7 @@ https://cloud.google.com/compute/docs/access/service-accounts. EOD type = list(string) default = [ - "https://www.googleapis.com/auth/userinfo.email", - "https://www.googleapis.com/auth/compute", - "https://www.googleapis.com/auth/devstorage.full_control", - "https://www.googleapis.com/auth/logging.write", + "https://www.googleapis.com/auth/cloud-platform", ] } diff --git a/tools/validate_configs/golden_copies/expectations/igc_pkr/one/image/variables.pkr.hcl b/tools/validate_configs/golden_copies/expectations/igc_pkr/one/image/variables.pkr.hcl index 9589669153..ed4350387c 100644 --- a/tools/validate_configs/golden_copies/expectations/igc_pkr/one/image/variables.pkr.hcl +++ b/tools/validate_configs/golden_copies/expectations/igc_pkr/one/image/variables.pkr.hcl @@ -115,10 +115,7 @@ https://cloud.google.com/compute/docs/access/service-accounts. EOD type = list(string) default = [ - "https://www.googleapis.com/auth/userinfo.email", - "https://www.googleapis.com/auth/compute", - "https://www.googleapis.com/auth/devstorage.full_control", - "https://www.googleapis.com/auth/logging.write", + "https://www.googleapis.com/auth/cloud-platform", ] } diff --git a/tools/validate_configs/golden_copies/expectations/text_escape/zero/lime/variables.pkr.hcl b/tools/validate_configs/golden_copies/expectations/text_escape/zero/lime/variables.pkr.hcl index 9589669153..ed4350387c 100644 --- a/tools/validate_configs/golden_copies/expectations/text_escape/zero/lime/variables.pkr.hcl +++ b/tools/validate_configs/golden_copies/expectations/text_escape/zero/lime/variables.pkr.hcl @@ -115,10 +115,7 @@ https://cloud.google.com/compute/docs/access/service-accounts. 
EOD type = list(string) default = [ - "https://www.googleapis.com/auth/userinfo.email", - "https://www.googleapis.com/auth/compute", - "https://www.googleapis.com/auth/devstorage.full_control", - "https://www.googleapis.com/auth/logging.write", + "https://www.googleapis.com/auth/cloud-platform", ] } From 55d11b20e7603c618674d4c0c37e4c968de5f915 Mon Sep 17 00:00:00 2001 From: Nick Stroud Date: Fri, 28 Jul 2023 13:50:56 -0700 Subject: [PATCH 102/144] Rename spack-install to spack-setup --- .../modules/scripts/{spack-install => spack-setup}/README.md | 0 .../modules/scripts/{spack-install => spack-setup}/main.tf | 2 +- .../modules/scripts/{spack-install => spack-setup}/outputs.tf | 0 .../scripts/install_spack_deps.yml | 0 .../{spack-install => spack-setup}/templates/.shellcheckrc | 0 .../templates/spack_setup.yml.tftpl | 0 .../modules/scripts/{spack-install => spack-setup}/variables.tf | 0 .../modules/scripts/{spack-install => spack-setup}/versions.tf | 0 8 files changed, 1 insertion(+), 1 deletion(-) rename community/modules/scripts/{spack-install => spack-setup}/README.md (100%) rename community/modules/scripts/{spack-install => spack-setup}/main.tf (98%) rename community/modules/scripts/{spack-install => spack-setup}/outputs.tf (100%) rename community/modules/scripts/{spack-install => spack-setup}/scripts/install_spack_deps.yml (100%) rename community/modules/scripts/{spack-install => spack-setup}/templates/.shellcheckrc (100%) rename community/modules/scripts/{spack-install => spack-setup}/templates/spack_setup.yml.tftpl (100%) rename community/modules/scripts/{spack-install => spack-setup}/variables.tf (100%) rename community/modules/scripts/{spack-install => spack-setup}/versions.tf (100%) diff --git a/community/modules/scripts/spack-install/README.md b/community/modules/scripts/spack-setup/README.md similarity index 100% rename from community/modules/scripts/spack-install/README.md rename to community/modules/scripts/spack-setup/README.md diff --git a/community/modules/scripts/spack-install/main.tf b/community/modules/scripts/spack-setup/main.tf similarity index 98% rename from community/modules/scripts/spack-install/main.tf rename to community/modules/scripts/spack-setup/main.tf index 4b8ede607a..7691a9e90a 100644 --- a/community/modules/scripts/spack-install/main.tf +++ b/community/modules/scripts/spack-setup/main.tf @@ -16,7 +16,7 @@ locals { # This label allows for billing report tracking based on module. 
- labels = merge(var.labels, { ghpc_module = "spack-install" }) + labels = merge(var.labels, { ghpc_module = "spack-setup" }) } locals { diff --git a/community/modules/scripts/spack-install/outputs.tf b/community/modules/scripts/spack-setup/outputs.tf similarity index 100% rename from community/modules/scripts/spack-install/outputs.tf rename to community/modules/scripts/spack-setup/outputs.tf diff --git a/community/modules/scripts/spack-install/scripts/install_spack_deps.yml b/community/modules/scripts/spack-setup/scripts/install_spack_deps.yml similarity index 100% rename from community/modules/scripts/spack-install/scripts/install_spack_deps.yml rename to community/modules/scripts/spack-setup/scripts/install_spack_deps.yml diff --git a/community/modules/scripts/spack-install/templates/.shellcheckrc b/community/modules/scripts/spack-setup/templates/.shellcheckrc similarity index 100% rename from community/modules/scripts/spack-install/templates/.shellcheckrc rename to community/modules/scripts/spack-setup/templates/.shellcheckrc diff --git a/community/modules/scripts/spack-install/templates/spack_setup.yml.tftpl b/community/modules/scripts/spack-setup/templates/spack_setup.yml.tftpl similarity index 100% rename from community/modules/scripts/spack-install/templates/spack_setup.yml.tftpl rename to community/modules/scripts/spack-setup/templates/spack_setup.yml.tftpl diff --git a/community/modules/scripts/spack-install/variables.tf b/community/modules/scripts/spack-setup/variables.tf similarity index 100% rename from community/modules/scripts/spack-install/variables.tf rename to community/modules/scripts/spack-setup/variables.tf diff --git a/community/modules/scripts/spack-install/versions.tf b/community/modules/scripts/spack-setup/versions.tf similarity index 100% rename from community/modules/scripts/spack-install/versions.tf rename to community/modules/scripts/spack-setup/versions.tf From db8b6ae3f556e670b18b0eb1d930dd325b0f2c69 Mon Sep 17 00:00:00 2001 From: Nick Stroud Date: Sun, 30 Jul 2023 13:13:27 -0700 Subject: [PATCH 103/144] Update documentation for spack-setup module --- .../modules/scripts/spack-setup/README.md | 173 +++++++----------- 1 file changed, 70 insertions(+), 103 deletions(-) diff --git a/community/modules/scripts/spack-setup/README.md b/community/modules/scripts/spack-setup/README.md index 49fcf49a6c..a2a1a94a3f 100644 --- a/community/modules/scripts/spack-setup/README.md +++ b/community/modules/scripts/spack-setup/README.md @@ -1,120 +1,87 @@ ## Description -This module can be used to install spack on a VM. This includes: - -1. Cloning spack into a predefined directory -1. Checking out a specific version of spack -1. Configuring compilers within spack -1. Installing application licenses that spack packages might depend on -1. Installing various spack specs. - -The output of this module is a startup script that is intended to be attached -to either the login or controller node of a scheduler, or a -[vm-instance](../../../../modules/compute/vm-instance/README.md). The -resulting installation of spack can then be mounted across many other VMs to -share a software stack. - -> **_NOTE:_** This module currently is capable of re-running to install -> additional packages, but cannot be used to uninstall packages from the VM. -> -> **_NOTE:_** Currently, license installation is performed by copying a -> license file from a GCS bucket to a specific directory on the target VM. 
->
-> **_NOTE:_** When populating a buildcache with packages, the VM this
-> spack module is running on requires the following scope:
-> https://www.googleapis.com/auth/devstorage.read_write
-
-## Example
-
-As an example, the below is a possible definition of a spack installation. To
-see this module used in a full blueprint, see the [hpc-slurm-gromacs.yaml] example.
+This module can be used to set up and install Spack on a VM. To actually run
+Spack commands to install other software, use the
+[spack-execute](../spack-execute/) module.
+
+This module generates a script that performs the following:
+
+1. Install system dependencies needed for Spack
+1. Clone Spack into a predefined directory
+1. Check out a specific version of Spack
+
+There are several options for how to consume the outputs of this module:
+
+## Examples
+
+### `use` `spack-setup` with `spack-execute`
+
+This will prepend the `spack-setup` script to the `spack-execute` commands.

```yaml
  - id: spack-setup
    source: modules/scripts/spack-setup

  - id: spack-build
    source: modules/scripts/spack-execute
    use: [spack-setup]
    settings:
      commands: |
        spack install gcc@10.3.0 target=x86_64

  - id: builder
    source: modules/compute/vm-instance
    use: [network1, spack-build]
```

### `use` `spack-setup` with `vm-instance` or Slurm module

This will run `spack-setup` scripts on the downstream compute resource.

```yaml
  - id: spack-setup
    source: modules/scripts/spack-setup

  - id: spack-installer
    source: modules/compute/vm-instance
    use: [network1, spack-setup]
```

OR

```yaml
  - id: spack-setup
    source: modules/scripts/spack-setup

  - id: slurm_controller
    source: community/modules/scheduler/schedmd-slurm-gcp-v5-controller
    use: [network1, partition1, spack-setup]
```

### Build `startup-script` with `spack_runner` output

This will use the generated `spack-setup` script as one step in `startup-script`.
```yaml
-  - id: startup
+  - id: spack-setup
+    source: modules/scripts/spack-setup
+
+  - id: startup-script
     source: modules/scripts/startup-script
     settings:
       runners:
-      - $(spack.install_spack_deps_runner)
-      - $(spack.install_spack_runner)
+      - $(spack-setup.spack_runner)
+      - type: shell
+        destination: "my-script.sh"
+        content: echo 'hello world'
+
+  - id: workstation
+    source: modules/compute/vm-instance
+    use: [network1, startup-script]
 ```

+To see a full example of this module in use, see the [hpc-slurm-gromacs.yaml] example.
+
 [hpc-slurm-gromacs.yaml]: ../../../examples/hpc-slurm-gromacs.yaml

 ## Environment Setup

 ### Bash instructions

 [Spack installation] produces a setup script that adds `spack` to your `PATH`
 as well as some other command-line integration tools. This script can be found
 at `<install path>/share/spack/setup-env.sh`. This script will be automatically
-added to bash startup by the `install_spack_runner`. In the case that you are
-using Spack on a different machine than the one where Spack was installed, you
-can use the `setup_spack_runner` to make sure Spack is also available on that
-machine.
+added to bash startup by the `spack_runner`. In the case that you are using
+Spack on a different machine than the one where Spack was installed, you can use
+the `setup_spack_runner` to make sure Spack is also available on that machine.

 [Spack installation]: https://spack-tutorial.readthedocs.io/en/latest/tutorial_basics.html#installing-spack

@@ -168,6 +134,7 @@ sudo -i spack python -m pip install package-name

 [SPACK_PYTHON]: https://spack.readthedocs.io/en/latest/getting_started.html#shell-support
 [builds]: https://spack.readthedocs.io/en/latest/binary_caches.html
+
 ## License

From 1f51fa9e3e6f9ecdf705f5b95831b77e96cdc3c9 Mon Sep 17 00:00:00 2001
From: Nick Stroud <nstroud@google.com>
Date: Sun, 30 Jul 2023 13:55:30 -0700
Subject: [PATCH 104/144] Update all references to spack-install to spack-setup

---
 community/examples/AMD/hpc-amd-slurm.yaml           | 2 +-
 community/examples/hpc-slurm-gromacs.yaml           | 2 +-
 community/examples/hpc-slurm-ramble-gromacs.yaml    | 2 +-
 community/modules/scripts/ramble-execute/README.md  | 6 +++---
 community/modules/scripts/spack-execute/README.md   | 4 ++--
 community/modules/scripts/spack-execute/outputs.tf  | 2 +-
 .../modules/scripts/spack-execute/variables.tf      | 2 +-
 community/modules/scripts/spack-setup/README.md     | 2 +-
 docs/cloud-batch.md                                 | 2 +-
 docs/tutorials/gromacs/spack-gromacs.yaml           | 2 +-
 docs/tutorials/openfoam/spack-openfoam.yaml         | 2 +-
 docs/tutorials/wrfv3/spack-wrfv3.yaml               | 2 +-
 .../hcls-blueprint.yaml                             | 2 +-
 examples/README.md                                  | 14 ++++++++------
 examples/serverless-batch-mpi.yaml                  | 2 +-
 modules/README.md                                   | 8 ++++----
 pkg/modulereader/resreader.go                       | 4 +++-
 tools/duplicate-diff.py                             | 2 +-
 tools/validate_configs/ramble.yaml                  | 2 +-
 .../validate_configs/test_configs/centos8-ss.yaml   | 2 +-
 tools/validate_configs/test_configs/debian-ss.yaml  | 2 +-
 .../test_configs/hpc-centos-ss.yaml                 | 2 +-
 tools/validate_configs/test_configs/rocky-ss.yaml   | 2 +-
 .../test_configs/spack-buildcache.yaml              | 5 +----
 .../test_configs/spack-environments.yaml            | 5 +----
 .../test_configs/test_outputs.yaml                  | 2 +-
 tools/validate_configs/test_configs/ubuntu-ss.yaml  | 2 +-
 27 files changed, 42 insertions(+), 44 deletions(-)

diff --git a/community/examples/AMD/hpc-amd-slurm.yaml b/community/examples/AMD/hpc-amd-slurm.yaml
index ca17bc65b7..b63aed96fb 100644
--- a/community/examples/AMD/hpc-amd-slurm.yaml
+++ b/community/examples/AMD/hpc-amd-slurm.yaml
@@ -40,7 +40,7 @@
deployment_groups: local_mount: /sw - id: spack-setup - source: community/modules/scripts/spack-install + source: community/modules/scripts/spack-setup settings: install_dir: /sw/spack spack_ref: v0.18.1 diff --git a/community/examples/hpc-slurm-gromacs.yaml b/community/examples/hpc-slurm-gromacs.yaml index 3f87fb02a8..bc3a064d2a 100644 --- a/community/examples/hpc-slurm-gromacs.yaml +++ b/community/examples/hpc-slurm-gromacs.yaml @@ -46,7 +46,7 @@ deployment_groups: ## Install Scripts - id: spack-setup - source: community/modules/scripts/spack-install + source: community/modules/scripts/spack-setup settings: install_dir: /sw/spack diff --git a/community/examples/hpc-slurm-ramble-gromacs.yaml b/community/examples/hpc-slurm-ramble-gromacs.yaml index d66523a394..97eb685af2 100644 --- a/community/examples/hpc-slurm-ramble-gromacs.yaml +++ b/community/examples/hpc-slurm-ramble-gromacs.yaml @@ -36,7 +36,7 @@ deployment_groups: ## Install Scripts - id: spack-install - source: community/modules/scripts/spack-install + source: community/modules/scripts/spack-setup settings: install_dir: /opt/apps/spack diff --git a/community/modules/scripts/ramble-execute/README.md b/community/modules/scripts/ramble-execute/README.md index bd4594c1e8..ae56747d33 100644 --- a/community/modules/scripts/ramble-execute/README.md +++ b/community/modules/scripts/ramble-execute/README.md @@ -11,7 +11,7 @@ This module outputs a startup script runner, which can be combined with other startup script runners to execute a set of Ramble commands. Ramble makes extensive use of Spack. It must be installed with a Toolkit runner -generated by the [spack module](../spack-install/README.md) following the +generated by the [spack-setup module](../spack-setup/README.md) following the [basic example](#basic-example) below. > **_NOTE:_** This is an experimental module and the functionality and @@ -26,7 +26,7 @@ Below is a basic example of using this module. ```yaml - id: spack - source: community/modules/scripts/spack-install + source: community/modules/scripts/spack-setup - id: ramble-setup source: community/modules/scripts/ramble-setup @@ -40,7 +40,7 @@ Below is a basic example of using this module. ``` This example shows installing Spack and Ramble with their own modules -(spack-install and ramble-setup respectively). Then the ramble-execute module +(spack-setup and ramble-setup respectively). Then the ramble-execute module is added to simply list all applications Ramble knows about. ## License diff --git a/community/modules/scripts/spack-execute/README.md b/community/modules/scripts/spack-execute/README.md index e516796f9a..dbdddfe537 100644 --- a/community/modules/scripts/spack-execute/README.md +++ b/community/modules/scripts/spack-execute/README.md @@ -50,7 +50,7 @@ limitations under the License. | [log\_file](#input\_log\_file) | Defines the logfile that script output will be written to | `string` | `"/var/log/spack.log"` | no | | [project\_id](#input\_project\_id) | Project in which the HPC deployment will be created. | `string` | n/a | yes | | [region](#input\_region) | Region to place bucket containing spack scripts. | `string` | n/a | yes | -| [spack\_runner](#input\_spack\_runner) | Runner from previous spack-install or spack-execute to be chained with scripts generated by this module. |
object({
type = string
content = string
destination = string
})
| n/a | yes | +| [spack\_runner](#input\_spack\_runner) | Runner from previous spack-setup or spack-execute to be chained with scripts generated by this module. |
object({
type = string
content = string
destination = string
})
| n/a | yes | ## Outputs @@ -58,6 +58,6 @@ limitations under the License. |------|-------------| | [controller\_startup\_script](#output\_controller\_startup\_script) | Path to the Spack installation script, duplicate for SLURM controller. | | [gcs\_bucket\_path](#output\_gcs\_bucket\_path) | Bucket containing the startup scripts for spack, to be reused by spack-execute module. | -| [spack\_runner](#output\_spack\_runner) | Single runner that combines scripts from this module and any previously chained spack-execute or spack-install modules. | +| [spack\_runner](#output\_spack\_runner) | Single runner that combines scripts from this module and any previously chained spack-execute or spack-setup modules. | | [startup\_script](#output\_startup\_script) | Path to the Spack installation script. | diff --git a/community/modules/scripts/spack-execute/outputs.tf b/community/modules/scripts/spack-execute/outputs.tf index 65fae325b7..03904bc61f 100644 --- a/community/modules/scripts/spack-execute/outputs.tf +++ b/community/modules/scripts/spack-execute/outputs.tf @@ -25,7 +25,7 @@ output "controller_startup_script" { } output "spack_runner" { - description = "Single runner that combines scripts from this module and any previously chained spack-execute or spack-install modules." + description = "Single runner that combines scripts from this module and any previously chained spack-execute or spack-setup modules." value = local.combined_runner } diff --git a/community/modules/scripts/spack-execute/variables.tf b/community/modules/scripts/spack-execute/variables.tf index 03a5d94d4b..f8a67c9526 100644 --- a/community/modules/scripts/spack-execute/variables.tf +++ b/community/modules/scripts/spack-execute/variables.tf @@ -75,7 +75,7 @@ variable "commands" { } variable "spack_runner" { - description = "Runner from previous spack-install or spack-execute to be chained with scripts generated by this module." + description = "Runner from previous spack-setup or spack-execute to be chained with scripts generated by this module." type = object({ type = string content = string diff --git a/community/modules/scripts/spack-setup/README.md b/community/modules/scripts/spack-setup/README.md index a2a1a94a3f..79fe76a6bd 100644 --- a/community/modules/scripts/spack-setup/README.md +++ b/community/modules/scripts/spack-setup/README.md @@ -105,7 +105,7 @@ instal through a shared file system. ```yaml - id: spack - source: community/modules/scripts/spack-install + source: community/modules/scripts/spack-setup ... 
- id: spack-setup diff --git a/docs/cloud-batch.md b/docs/cloud-batch.md index 2a5b82149c..0a16008310 100644 --- a/docs/cloud-batch.md +++ b/docs/cloud-batch.md @@ -15,7 +15,7 @@ The HPC Toolkit supports Google Cloud Batch through two Toolkit modules: Google Cloud Batch API - Creates an instance template for the Google Cloud Batch job to use - Works with existing Toolkit modules such as `vpc`, `filestore`, - `startup-script` & `spack-install` + `startup-script` & `spack-setup` - [batch-login-node](../modules/scheduler/batch-login-node/README.md) - Creates a login node VM for Google Cloud Batch job submission diff --git a/docs/tutorials/gromacs/spack-gromacs.yaml b/docs/tutorials/gromacs/spack-gromacs.yaml index b1036d5bb6..dd3ea62751 100644 --- a/docs/tutorials/gromacs/spack-gromacs.yaml +++ b/docs/tutorials/gromacs/spack-gromacs.yaml @@ -33,7 +33,7 @@ deployment_groups: ## Install Scripts - id: spack-setup - source: community/modules/scripts/spack-install + source: community/modules/scripts/spack-setup settings: install_dir: /apps/spack diff --git a/docs/tutorials/openfoam/spack-openfoam.yaml b/docs/tutorials/openfoam/spack-openfoam.yaml index 973692e926..54c870eec3 100644 --- a/docs/tutorials/openfoam/spack-openfoam.yaml +++ b/docs/tutorials/openfoam/spack-openfoam.yaml @@ -33,7 +33,7 @@ deployment_groups: ## Install Scripts - id: spack-setup - source: community/modules/scripts/spack-install + source: community/modules/scripts/spack-setup settings: install_dir: /apps/spack diff --git a/docs/tutorials/wrfv3/spack-wrfv3.yaml b/docs/tutorials/wrfv3/spack-wrfv3.yaml index 9bb7571054..d78dbf0d2c 100644 --- a/docs/tutorials/wrfv3/spack-wrfv3.yaml +++ b/docs/tutorials/wrfv3/spack-wrfv3.yaml @@ -33,7 +33,7 @@ deployment_groups: ## Install Scripts - id: spack-setup - source: community/modules/scripts/spack-install + source: community/modules/scripts/spack-setup settings: install_dir: /apps/spack diff --git a/docs/videos/healthcare-and-life-sciences/hcls-blueprint.yaml b/docs/videos/healthcare-and-life-sciences/hcls-blueprint.yaml index 64ff214639..65543e6727 100644 --- a/docs/videos/healthcare-and-life-sciences/hcls-blueprint.yaml +++ b/docs/videos/healthcare-and-life-sciences/hcls-blueprint.yaml @@ -114,7 +114,7 @@ deployment_groups: ### Software ### - id: spack-setup - source: community/modules/scripts/spack-install + source: community/modules/scripts/spack-setup settings: install_dir: /apps/spack diff --git a/examples/README.md b/examples/README.md index 36a5ebd46c..1430194143 100644 --- a/examples/README.md +++ b/examples/README.md @@ -441,8 +441,9 @@ This blueprint demonstrates how to use Spack to run a real MPI job on Batch. The blueprint contains the following: * A shared `filestore` filesystem. -* A `spack-install` module that builds a script to install Spack and the WRF - application onto the shared `filestore`. +* A `spack-setup` module that generates a script to install Spack +* A `spack-execute` module that builds the WRF application onto the shared + `filestore`. * A `startup-script` module which uses the above script and stages job data. * A builder `vm-instance` which performs the Spack install and then shuts down. * A `batch-job-template` that builds a Batch job to execute the WRF job. @@ -662,13 +663,14 @@ bucket: Spack is an HPC software package manager. 
This example creates a small Slurm cluster with software installed using the -[spack-install module](../community/modules/scripts/spack-install/README.md) The -controller will install and configure spack, and install +[spack-setup](../community/modules/scripts/spack-setup/README.md) and +[spack-execute](../community/modules/scripts/spack-execute/README.md) modules. +The controller will install and configure spack, and install [gromacs](https://www.gromacs.org/) using spack. Spack is installed in a shared location (/sw) via filestore. This build leverages the [startup-script module](../modules/scripts/startup-script/README.md) and can be -applied in any cluster by using the output of spack-install or -startup-script modules. +applied in any cluster by using the output of spack-setup or startup-script +modules. The installation will occur as part of the Slurm startup-script, a warning message will be displayed upon SSHing to the login node indicating diff --git a/examples/serverless-batch-mpi.yaml b/examples/serverless-batch-mpi.yaml index aa0aed339f..d11cf5ee02 100644 --- a/examples/serverless-batch-mpi.yaml +++ b/examples/serverless-batch-mpi.yaml @@ -35,7 +35,7 @@ deployment_groups: local_mount: /share - id: spack-setup - source: community/modules/scripts/spack-install + source: community/modules/scripts/spack-setup settings: spack_ref: v0.19.0 install_dir: /share/spack diff --git a/modules/README.md b/modules/README.md index cb8721e36c..34c946d8ca 100644 --- a/modules/README.md +++ b/modules/README.md @@ -214,9 +214,9 @@ Modules that are still in development and less stable are labeled with the startup script to install [Ramble](https://github.com/GoogleCloudPlatform/ramble) on an instance or a slurm login or controller. -* **[spack-install]** ![community-badge] ![experimental-badge] : Creates a - startup script to install [Spack](https://github.com/spack/spack) on an - instance or a slurm login or controller. +* **[spack-setup]** ![community-badge] ![experimental-badge] : Creates a startup + script to install [Spack](https://github.com/spack/spack) on an instance or a + slurm login or controller. * **[wait-for-startup]** ![community-badge] ![experimental-badge] : Waits for successful completion of a startup script on a compute VM. 
@@ -231,7 +231,7 @@ Modules that are still in development and less stable are labeled with the [pbspro]: https://www.altair.com/pbs-professional [ramble-execute]: ../community/modules/scripts/ramble-execute/README.md [ramble-setup]: ../community/modules/scripts/ramble-setup/README.md -[spack-install]: ../community/modules/scripts/spack-install/README.md +[spack-setup]: ../community/modules/scripts/spack-setup/README.md [wait-for-startup]: ../community/modules/scripts/wait-for-startup/README.md ## Module Fields diff --git a/pkg/modulereader/resreader.go b/pkg/modulereader/resreader.go index de95a9cd93..cddf11174f 100644 --- a/pkg/modulereader/resreader.go +++ b/pkg/modulereader/resreader.go @@ -317,7 +317,9 @@ func defaultAPIList(source string) []string { }, "community/modules/scripts/pbspro-install": {}, "community/modules/scripts/pbspro-qmgr": {}, - "community/modules/scripts/spack-install": {}, + "community/modules/scripts/spack-setup": { + "storage.googleapis.com", + }, "community/modules/scripts/wait-for-startup": { "compute.googleapis.com", }, diff --git a/tools/duplicate-diff.py b/tools/duplicate-diff.py index b419225aba..bfd92cb0d7 100644 --- a/tools/duplicate-diff.py +++ b/tools/duplicate-diff.py @@ -56,7 +56,7 @@ "community/modules/scripts/spack-execute/templates/execute_commands.yml.tpl", ], [ - "community/modules/scripts/spack-install/templates/spack_setup.yml.tftpl", + "community/modules/scripts/spack-setup/templates/spack_setup.yml.tftpl", "community/modules/scripts/ramble-setup/templates/ramble_setup.yml.tftpl", ], ] diff --git a/tools/validate_configs/ramble.yaml b/tools/validate_configs/ramble.yaml index 3e4b26897f..105ed6a0bb 100644 --- a/tools/validate_configs/ramble.yaml +++ b/tools/validate_configs/ramble.yaml @@ -28,7 +28,7 @@ deployment_groups: source: modules/network/pre-existing-vpc - id: spack - source: community/modules/scripts/spack-install + source: community/modules/scripts/spack-setup settings: install_dir: /spack diff --git a/tools/validate_configs/test_configs/centos8-ss.yaml b/tools/validate_configs/test_configs/centos8-ss.yaml index b6f5722bf4..30a25c1728 100644 --- a/tools/validate_configs/test_configs/centos8-ss.yaml +++ b/tools/validate_configs/test_configs/centos8-ss.yaml @@ -42,7 +42,7 @@ deployment_groups: auto_delete_disk: true - id: spack-setup - source: ./community//modules/scripts/spack-install + source: ./community/modules/scripts/spack-setup settings: install_dir: /apps/spack diff --git a/tools/validate_configs/test_configs/debian-ss.yaml b/tools/validate_configs/test_configs/debian-ss.yaml index 15fe0d40e7..b2a4a3e515 100644 --- a/tools/validate_configs/test_configs/debian-ss.yaml +++ b/tools/validate_configs/test_configs/debian-ss.yaml @@ -42,7 +42,7 @@ deployment_groups: auto_delete_disk: true - id: spack-setup - source: ./community//modules/scripts/spack-install + source: ./community/modules/scripts/spack-setup settings: install_dir: /apps/spack diff --git a/tools/validate_configs/test_configs/hpc-centos-ss.yaml b/tools/validate_configs/test_configs/hpc-centos-ss.yaml index 385b399c89..076afa9052 100644 --- a/tools/validate_configs/test_configs/hpc-centos-ss.yaml +++ b/tools/validate_configs/test_configs/hpc-centos-ss.yaml @@ -42,7 +42,7 @@ deployment_groups: auto_delete_disk: true - id: spack-setup - source: ./community//modules/scripts/spack-install + source: ./community/modules/scripts/spack-setup settings: install_dir: /apps/spack diff --git a/tools/validate_configs/test_configs/rocky-ss.yaml 
b/tools/validate_configs/test_configs/rocky-ss.yaml index 9852a9ee95..462b89e677 100644 --- a/tools/validate_configs/test_configs/rocky-ss.yaml +++ b/tools/validate_configs/test_configs/rocky-ss.yaml @@ -43,7 +43,7 @@ deployment_groups: auto_delete_disk: true - id: spack-setup - source: ./community//modules/scripts/spack-install + source: ./community/modules/scripts/spack-setup settings: install_dir: /apps/spack diff --git a/tools/validate_configs/test_configs/spack-buildcache.yaml b/tools/validate_configs/test_configs/spack-buildcache.yaml index d11d108e7b..2d1053f121 100644 --- a/tools/validate_configs/test_configs/spack-buildcache.yaml +++ b/tools/validate_configs/test_configs/spack-buildcache.yaml @@ -29,7 +29,7 @@ deployment_groups: source: modules/network/pre-existing-vpc - id: spack-setup - source: ./community/modules/scripts/spack-install + source: ./community/modules/scripts/spack-setup settings: install_dir: /apps/spack @@ -72,9 +72,6 @@ deployment_groups: mkdir /apps chmod a+rwx /apps destination: apps_create.sh - - type: ansible-local - source: modules/spack-install/scripts/install_spack_deps.yml - destination: install_spack_deps.yml - $(spack-execute.spack_runner) - type: shell destination: shutdown.sh diff --git a/tools/validate_configs/test_configs/spack-environments.yaml b/tools/validate_configs/test_configs/spack-environments.yaml index bcdd731d3e..21e56821e2 100644 --- a/tools/validate_configs/test_configs/spack-environments.yaml +++ b/tools/validate_configs/test_configs/spack-environments.yaml @@ -29,7 +29,7 @@ deployment_groups: source: modules/network/pre-existing-vpc - id: spack-setup - source: ./community/modules/scripts/spack-install + source: ./community/modules/scripts/spack-setup settings: install_dir: /apps/spack spack_url: https://github.com/spack/spack @@ -116,9 +116,6 @@ deployment_groups: mkdir /apps chmod a+rwx /apps destination: apps_create.sh - - type: ansible-local - source: modules/spack-install/scripts/install_spack_deps.yml - destination: install_spack_deps.yml - $(spack-execute.spack_runner) - type: shell destination: shutdown.sh diff --git a/tools/validate_configs/test_configs/test_outputs.yaml b/tools/validate_configs/test_configs/test_outputs.yaml index 467af02e81..c9c1aaa3b3 100644 --- a/tools/validate_configs/test_configs/test_outputs.yaml +++ b/tools/validate_configs/test_configs/test_outputs.yaml @@ -121,7 +121,7 @@ deployment_groups: - compute.instanceAdmin.v1 - id: spack - source: community/modules/scripts/spack-install + source: community/modules/scripts/spack-setup outputs: - startup_script - controller_startup_script diff --git a/tools/validate_configs/test_configs/ubuntu-ss.yaml b/tools/validate_configs/test_configs/ubuntu-ss.yaml index 761c0d71c5..35657939d8 100644 --- a/tools/validate_configs/test_configs/ubuntu-ss.yaml +++ b/tools/validate_configs/test_configs/ubuntu-ss.yaml @@ -42,7 +42,7 @@ deployment_groups: auto_delete_disk: true - id: spack-setup - source: ./community//modules/scripts/spack-install + source: ./community/modules/scripts/spack-setup settings: install_dir: /apps/spack From e25e7c31ffe77fe1701fe24d3f866269559cfb20 Mon Sep 17 00:00:00 2001 From: Nick Stroud Date: Sun, 30 Jul 2023 16:41:57 -0700 Subject: [PATCH 105/144] Add documentation for usage of spack-execute module --- .../modules/scripts/spack-execute/README.md | 74 +++++++++++++++++++ .../modules/scripts/spack-setup/README.md | 10 +-- 2 files changed, 79 insertions(+), 5 deletions(-) diff --git a/community/modules/scripts/spack-execute/README.md 
b/community/modules/scripts/spack-execute/README.md
index dbdddfe537..434240e090 100644
--- a/community/modules/scripts/spack-execute/README.md
+++ b/community/modules/scripts/spack-execute/README.md
@@ -1,3 +1,77 @@
+## Description
+
+This module creates a script that defines a software build using Spack and
+performs any additional customization to a Spack installation.
+
+There are two main variable inputs that can be used to define a Spack build:
+`data_files` and `commands`.
+
+- `data_files`: Any files specified will be transferred to the machine running
+  the outputted script. Data file `content` can be defined inline in the blueprint
+  or can point to a `source`, an absolute local path of a file. This can be used
+  to transfer environment definition files, config definition files, GPG keys,
+  or software licenses. `data_files` are transferred before `commands` are run.
+- `commands`: A script that is run. This can be used to perform actions such as
+  installation of compilers & packages, environment creation, adding a build
+  cache, and modifying the spack configuration.
+
+## Example
+
+The `spack-execute` module should `use` a `spack-setup` module. This will
+prepend the installation of Spack and its dependencies to the build. Then
+`spack-execute` can be used by a module that takes `startup-script` as an input.
+
+```yaml
+  - id: spack-setup
+    source: community/modules/scripts/spack-setup
+
+  - id: spack-build
+    source: community/modules/scripts/spack-execute
+    use: [spack-setup]
+    settings:
+      commands: |
+        spack install gcc@10.3.0 target=x86_64
+
+  - id: builder-vm
+    source: modules/compute/vm-instance
+    use: [network1, spack-build]
+```
+
+To see a full example of this module in use, see the [hpc-slurm-gromacs.yaml] example.
+
+[hpc-slurm-gromacs.yaml]: ../../../examples/hpc-slurm-gromacs.yaml
+
+### Using with `startup-script` module
+
+The `spack_runner` output can be used by the `startup-script` module.
+
+```yaml
+  - id: spack-setup
+    source: community/modules/scripts/spack-setup
+
+  - id: spack-build
+    source: community/modules/scripts/spack-execute
+    use: [spack-setup]
+    settings:
+      commands: |
+        spack install gcc@10.3.0 target=x86_64
+
+  - id: startup-script
+    source: modules/scripts/startup-script
+    settings:
+      runners:
+      - $(spack-build.spack_runner)
+      - type: shell
+        destination: "my-script.sh"
+        content: echo 'hello world'
+
+  - id: workstation
+    source: modules/compute/vm-instance
+    use: [network1, startup-script]
+```
+
+## License
+
 Copyright 2023 Google LLC
diff --git a/community/modules/scripts/spack-setup/README.md b/community/modules/scripts/spack-setup/README.md
index a2a1a94a3f..58192bd4cd 100644
--- a/community/modules/scripts/spack-setup/README.md
+++ b/community/modules/scripts/spack-setup/README.md
@@ -20,10 +20,10 @@ This will prepend the `spack-setup` script to the `spack-execute` commands.

 ```yaml
   - id: spack-setup
-    source: modules/scripts/spack-setup
+    source: community/modules/scripts/spack-setup

   - id: spack-build
-    source: modules/scripts/spack-execute
+    source: community/modules/scripts/spack-execute
     use: [spack-setup]
     settings:
       commands: |
@@ -40,7 +40,7 @@ This will run `spack-setup` scripts on the downstream compute resource.
```yaml - id: spack-setup - source: modules/scripts/spack-setup + source: community/modules/scripts/spack-setup - id: spack-installer source: modules/compute/vm-instance @@ -51,7 +51,7 @@ OR ```yaml - id: spack-setup - source: modules/scripts/spack-setup + source: community/modules/scripts/spack-setup - id: slurm_controller source: community/modules/scheduler/schedmd-slurm-gcp-v5-controller @@ -64,7 +64,7 @@ This will use the generated `spack-setup` script as one step in `startup-script` ```yaml - id: spack-setup - source: modules/scripts/spack-setup + source: community/modules/scripts/spack-setup - id: startup-script source: modules/scripts/startup-script From 42b85d69d79e6e6566b698092eab211fb931b30f Mon Sep 17 00:00:00 2001 From: Nick Stroud Date: Mon, 31 Jul 2023 10:33:32 -0700 Subject: [PATCH 106/144] Notify users that spack-install has moved to spack-setup --- pkg/config/config.go | 1 + 1 file changed, 1 insertion(+) diff --git a/pkg/config/config.go b/pkg/config/config.go index 3a6ecbcf64..874094fce5 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -76,6 +76,7 @@ var movedModules = map[string]string{ "community/modules/scheduler/cloud-batch-job": "modules/scheduler/batch-job-template", "community/modules/scheduler/cloud-batch-login-node": "modules/scheduler/batch-login-node", "community/modules/scheduler/htcondor-configure": "community/modules/scheduler/htcondor-setup", + "community/modules/scripts/spack-install": "community/modules/scripts/spack-setup", } // GroupName is the name of a deployment group From 28ad6bc2a64ec3d0675e9b4e47401ceeb3691482 Mon Sep 17 00:00:00 2001 From: Rohit Ramu Date: Mon, 31 Jul 2023 14:43:44 -0700 Subject: [PATCH 107/144] Make the "Apply Changes" prompt clearer (#1625) --- pkg/shell/common.go | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pkg/shell/common.go b/pkg/shell/common.go index c0e0962a46..e0cb2ce5ee 100644 --- a/pkg/shell/common.go +++ b/pkg/shell/common.go @@ -99,7 +99,11 @@ func ApplyChangesChoice(c ProposedChanges) bool { var userResponse string for { - fmt.Print("Display full proposed changes, Apply proposed changes, Stop and exit, Continue without applying? 
[d,a,s,c]: ") + fmt.Print(`Please select an option below [d,a,s,c]: +(D)isplay full proposed changes, +(A)pply proposed changes, +(S)top and exit, +(C)ontinue without applying`) _, err := fmt.Scanln(&userResponse) if err != nil { From 4c3592800633ac88675b1f5e9d0408481094e149 Mon Sep 17 00:00:00 2001 From: Nick Stroud Date: Mon, 31 Jul 2023 14:40:17 -0700 Subject: [PATCH 108/144] Update to latest tf provider and resolve GKE conflict --- community/modules/compute/gke-node-pool/README.md | 13 ++++++++----- .../modules/compute/gke-node-pool/variables.tf | 7 +++++-- community/modules/compute/gke-node-pool/versions.tf | 4 ++-- pkg/modulewriter/tfversions.go | 4 ++-- .../expectations/igc_pkr/zero/versions.tf | 4 ++-- .../expectations/igc_tf/one/versions.tf | 4 ++-- .../expectations/igc_tf/zero/versions.tf | 4 ++-- .../expectations/merge_flatten/zero/versions.tf | 4 ++-- 8 files changed, 25 insertions(+), 19 deletions(-) diff --git a/community/modules/compute/gke-node-pool/README.md b/community/modules/compute/gke-node-pool/README.md index 3bb0e7398c..e4d27e6fe2 100644 --- a/community/modules/compute/gke-node-pool/README.md +++ b/community/modules/compute/gke-node-pool/README.md @@ -114,6 +114,7 @@ an A100 GPU: count: 1 gpu_partition_size: 1g.5gb gpu_sharing_config: null + gpu_driver_installation_config: null ``` > **Note**: Once we define the [`guest_accelerator`] block, all fields must be @@ -138,6 +139,7 @@ The following is an example of gpu_sharing_config: - gpu_sharing_strategy: TIME_SHARING max_shared_clients_per_gpu: 3 + gpu_driver_installation_config: null ``` Finally, the following is an example of using a GPU attached to an `n1` machine: @@ -153,6 +155,7 @@ Finally, the following is an example of using a GPU attached to an `n1` machine: count: 2 gpu_partition_size: null gpu_sharing_config: null + gpu_driver_installation_config: null ``` ## License @@ -177,15 +180,15 @@ limitations under the License. | Name | Version | |------|---------| | [terraform](#requirement\_terraform) | >= 1.2 | -| [google](#requirement\_google) | >= 4.61.0, <= 4.74.0 | -| [google-beta](#requirement\_google-beta) | >= 4.61.0, <= 4.74.0 | +| [google](#requirement\_google) | >= 4.75.1, < 5.0 | +| [google-beta](#requirement\_google-beta) | >= 4.75.1, < 5.0 | ## Providers | Name | Version | |------|---------| -| [google](#provider\_google) | >= 4.61.0, <= 4.74.0 | -| [google-beta](#provider\_google-beta) | >= 4.61.0, <= 4.74.0 | +| [google](#provider\_google) | >= 4.75.1, < 5.0 | +| [google-beta](#provider\_google-beta) | >= 4.75.1, < 5.0 | ## Modules @@ -217,7 +220,7 @@ No modules. | [disk\_type](#input\_disk\_type) | Disk type for each node. | `string` | `"pd-standard"` | no | | [enable\_gcfs](#input\_enable\_gcfs) | Enable the Google Container Filesystem (GCFS). See [restrictions](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/container_cluster#gcfs_config). | `bool` | `false` | no | | [enable\_secure\_boot](#input\_enable\_secure\_boot) | Enable secure boot for the nodes. Keep enabled unless custom kernel modules need to be loaded. See [here](https://cloud.google.com/compute/shielded-vm/docs/shielded-vm#secure-boot) for more info. | `bool` | `true` | no | -| [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = string
count = number
gpu_partition_size = string
gpu_sharing_config = list(object({
gpu_sharing_strategy = string
max_shared_clients_per_gpu = number
}))
}))
| `null` | no | +| [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = string
count = number
gpu_driver_installation_config = list(object({
gpu_driver_version = string
}))
gpu_partition_size = string
gpu_sharing_config = list(object({
gpu_sharing_strategy = string
max_shared_clients_per_gpu = number
}))
}))
| `null` | no | | [image\_type](#input\_image\_type) | The default image type used by NAP once a new node pool is being created. Use either COS\_CONTAINERD or UBUNTU\_CONTAINERD. | `string` | `"COS_CONTAINERD"` | no | | [kubernetes\_labels](#input\_kubernetes\_labels) | Kubernetes labels to be applied to each node in the node group. Key-value pairs.
(The `kubernetes.io/` and `k8s.io/` prefixes are reserved by Kubernetes Core components and cannot be specified) | `map(string)` | `null` | no | | [labels](#input\_labels) | GCE resource labels to be applied to resources. Key-value pairs. | `map(string)` | n/a | yes | diff --git a/community/modules/compute/gke-node-pool/variables.tf b/community/modules/compute/gke-node-pool/variables.tf index 6cb86a0e78..98288b38f4 100644 --- a/community/modules/compute/gke-node-pool/variables.tf +++ b/community/modules/compute/gke-node-pool/variables.tf @@ -69,8 +69,11 @@ variable "enable_secure_boot" { variable "guest_accelerator" { description = "List of the type and count of accelerator cards attached to the instance." type = list(object({ - type = string - count = number + type = string + count = number + gpu_driver_installation_config = list(object({ + gpu_driver_version = string + })) gpu_partition_size = string gpu_sharing_config = list(object({ gpu_sharing_strategy = string diff --git a/community/modules/compute/gke-node-pool/versions.tf b/community/modules/compute/gke-node-pool/versions.tf index d6ec2ced5e..d1b661c298 100644 --- a/community/modules/compute/gke-node-pool/versions.tf +++ b/community/modules/compute/gke-node-pool/versions.tf @@ -18,11 +18,11 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = ">= 4.61.0, <= 4.74.0" + version = ">= 4.75.1, < 5.0" } google-beta = { source = "hashicorp/google-beta" - version = ">= 4.61.0, <= 4.74.0" + version = ">= 4.75.1, < 5.0" } } provider_meta "google" { diff --git a/pkg/modulewriter/tfversions.go b/pkg/modulewriter/tfversions.go index 20a654b84f..4efa6f8275 100644 --- a/pkg/modulewriter/tfversions.go +++ b/pkg/modulewriter/tfversions.go @@ -21,11 +21,11 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = "~> 4.74.0" + version = "~> 4.76.0" } google-beta = { source = "hashicorp/google-beta" - version = "~> 4.74.0" + version = "~> 4.76.0" } } } diff --git a/tools/validate_configs/golden_copies/expectations/igc_pkr/zero/versions.tf b/tools/validate_configs/golden_copies/expectations/igc_pkr/zero/versions.tf index ae37177189..90fe1151e7 100644 --- a/tools/validate_configs/golden_copies/expectations/igc_pkr/zero/versions.tf +++ b/tools/validate_configs/golden_copies/expectations/igc_pkr/zero/versions.tf @@ -20,11 +20,11 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = "~> 4.74.0" + version = "~> 4.76.0" } google-beta = { source = "hashicorp/google-beta" - version = "~> 4.74.0" + version = "~> 4.76.0" } } } diff --git a/tools/validate_configs/golden_copies/expectations/igc_tf/one/versions.tf b/tools/validate_configs/golden_copies/expectations/igc_tf/one/versions.tf index ae37177189..90fe1151e7 100644 --- a/tools/validate_configs/golden_copies/expectations/igc_tf/one/versions.tf +++ b/tools/validate_configs/golden_copies/expectations/igc_tf/one/versions.tf @@ -20,11 +20,11 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = "~> 4.74.0" + version = "~> 4.76.0" } google-beta = { source = "hashicorp/google-beta" - version = "~> 4.74.0" + version = "~> 4.76.0" } } } diff --git a/tools/validate_configs/golden_copies/expectations/igc_tf/zero/versions.tf b/tools/validate_configs/golden_copies/expectations/igc_tf/zero/versions.tf index ae37177189..90fe1151e7 100644 --- a/tools/validate_configs/golden_copies/expectations/igc_tf/zero/versions.tf +++ b/tools/validate_configs/golden_copies/expectations/igc_tf/zero/versions.tf 
@@ -20,11 +20,11 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = "~> 4.74.0" + version = "~> 4.76.0" } google-beta = { source = "hashicorp/google-beta" - version = "~> 4.74.0" + version = "~> 4.76.0" } } } diff --git a/tools/validate_configs/golden_copies/expectations/merge_flatten/zero/versions.tf b/tools/validate_configs/golden_copies/expectations/merge_flatten/zero/versions.tf index ae37177189..90fe1151e7 100644 --- a/tools/validate_configs/golden_copies/expectations/merge_flatten/zero/versions.tf +++ b/tools/validate_configs/golden_copies/expectations/merge_flatten/zero/versions.tf @@ -20,11 +20,11 @@ terraform { required_providers { google = { source = "hashicorp/google" - version = "~> 4.74.0" + version = "~> 4.76.0" } google-beta = { source = "hashicorp/google-beta" - version = "~> 4.74.0" + version = "~> 4.76.0" } } } From 0f26a0da1664237bbe4723d119b5c4bd9c505a40 Mon Sep 17 00:00:00 2001 From: Nick Stroud Date: Mon, 31 Jul 2023 11:57:44 -0700 Subject: [PATCH 109/144] Make machines to wait for Spack install lock --- .../templates/ramble_setup.yml.tftpl | 24 ++++++++++++++++--- .../templates/spack_setup.yml.tftpl | 24 ++++++++++++++++--- 2 files changed, 42 insertions(+), 6 deletions(-) diff --git a/community/modules/scripts/ramble-setup/templates/ramble_setup.yml.tftpl b/community/modules/scripts/ramble-setup/templates/ramble_setup.yml.tftpl index 9b80783fcf..f61c122c3a 100644 --- a/community/modules/scripts/ramble-setup/templates/ramble_setup.yml.tftpl +++ b/community/modules/scripts/ramble-setup/templates/ramble_setup.yml.tftpl @@ -25,7 +25,7 @@ chgrp_group: ${chgrp_group} finalize_setup_script: ${finalize_setup_script} tasks: - - name: Print Software Name + - name: Print software name ansible.builtin.debug: msg: "Running installation for software: {{ sw_name }}" @@ -41,9 +41,13 @@ path: "{{ install_dir | dirname }}" state: directory + - name: Set lock dir + ansible.builtin.set_fact: + lock_dir: "{{ install_dir | dirname }}/.install_{{ sw_name }}_lock" + - name: Acquire lock ansible.builtin.command: - mkdir "{{ install_dir | dirname }}/.install_{{ sw_name }}_lock" + mkdir "{{ lock_dir }}" register: lock_out changed_when: lock_out.rc == 0 failed_when: false @@ -61,6 +65,20 @@ recurse: true when: lock_out.rc == 0 - - name: Finalize Setup + - name: Finalize setup ansible.builtin.shell: "{{ finalize_setup_script }}" when: lock_out.rc == 0 and finalize_setup_script + + - name: Release lock + ansible.builtin.file: + path: "{{ lock_dir }}/done" + state: touch + when: lock_out.rc == 0 + + - name: Wait for lock + ansible.builtin.wait_for: + path: "{{ lock_dir }}/done" + state: present + timeout: 600 + sleep: 10 + when: lock_out.rc != 0 diff --git a/community/modules/scripts/spack-setup/templates/spack_setup.yml.tftpl b/community/modules/scripts/spack-setup/templates/spack_setup.yml.tftpl index 9b80783fcf..f61c122c3a 100644 --- a/community/modules/scripts/spack-setup/templates/spack_setup.yml.tftpl +++ b/community/modules/scripts/spack-setup/templates/spack_setup.yml.tftpl @@ -25,7 +25,7 @@ chgrp_group: ${chgrp_group} finalize_setup_script: ${finalize_setup_script} tasks: - - name: Print Software Name + - name: Print software name ansible.builtin.debug: msg: "Running installation for software: {{ sw_name }}" @@ -41,9 +41,13 @@ path: "{{ install_dir | dirname }}" state: directory + - name: Set lock dir + ansible.builtin.set_fact: + lock_dir: "{{ install_dir | dirname }}/.install_{{ sw_name }}_lock" + - name: Acquire lock ansible.builtin.command: - mkdir 
"{{ install_dir | dirname }}/.install_{{ sw_name }}_lock" + mkdir "{{ lock_dir }}" register: lock_out changed_when: lock_out.rc == 0 failed_when: false @@ -61,6 +65,20 @@ recurse: true when: lock_out.rc == 0 - - name: Finalize Setup + - name: Finalize setup ansible.builtin.shell: "{{ finalize_setup_script }}" when: lock_out.rc == 0 and finalize_setup_script + + - name: Release lock + ansible.builtin.file: + path: "{{ lock_dir }}/done" + state: touch + when: lock_out.rc == 0 + + - name: Wait for lock + ansible.builtin.wait_for: + path: "{{ lock_dir }}/done" + state: present + timeout: 600 + sleep: 10 + when: lock_out.rc != 0 From 6bf4a8ce56444959e3fc562ba0f0ab5649860d56 Mon Sep 17 00:00:00 2001 From: Eimantas Kazakevicius Date: Tue, 1 Aug 2023 07:33:25 +0100 Subject: [PATCH 110/144] OFE Backend: adding additional rule to OFE service account, so it can store custom images in the buckets --- community/front-end/ofe/script/service_account.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/community/front-end/ofe/script/service_account.sh b/community/front-end/ofe/script/service_account.sh index 5707b5c403..e92f72f4d2 100755 --- a/community/front-end/ofe/script/service_account.sh +++ b/community/front-end/ofe/script/service_account.sh @@ -53,6 +53,7 @@ sa_expand() { declare -a SA_ROLES SA_ROLES=('aiplatform.admin' 'compute.admin' + 'storage.admin' 'file.editor' 'iam.serviceAccountAdmin' 'iam.serviceAccountUser' From 88d5a5e4fa5bf43ec39377f9e0ec9c17eed06324 Mon Sep 17 00:00:00 2001 From: Eimantas Kazakevicius Date: Tue, 1 Aug 2023 10:29:10 +0100 Subject: [PATCH 111/144] OFE Backend: making compute dev env setup script compatible with rocky8 --- .../roles/dev_env/tasks/main.yaml | 28 ++++++++++++++++--- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/community/front-end/ofe/infrastructure_files/gcs_bucket/clusters/ansible_setup/roles/dev_env/tasks/main.yaml b/community/front-end/ofe/infrastructure_files/gcs_bucket/clusters/ansible_setup/roles/dev_env/tasks/main.yaml index ddc70959bc..08cdb9bdf1 100644 --- a/community/front-end/ofe/infrastructure_files/gcs_bucket/clusters/ansible_setup/roles/dev_env/tasks/main.yaml +++ b/community/front-end/ofe/infrastructure_files/gcs_bucket/clusters/ansible_setup/roles/dev_env/tasks/main.yaml @@ -18,7 +18,17 @@ name: centos-release-scl when: ansible_distribution == 'CentOS' -- name: Install Dev Tools +- name: Install Dev Tools Rocky + ansible.builtin.yum: + name: + - gcc-toolset-11 + - gcc-toolset-9 + - cmake + - python2-devel + - python36-devel + when: ansible_distribution in ['Rocky'] + +- name: Install Dev Tools CentOS ansible.builtin.yum: name: - devtoolset-9 @@ -26,10 +36,10 @@ - cmake - python2-devel - python36-devel - when: ansible_distribution in ['CentOS', 'Rocky'] + when: ansible_distribution in ['CentOS'] -- name: Add DevTools to default shells +- name: Add DevTools to default shells on CentOS ansible.builtin.copy: dest: /etc/profile.d/98-devtools.sh content: | @@ -37,7 +47,17 @@ owner: root mode: 0755 force: False - when: ansible_distribution in ['CentOS', 'Rocky'] + when: ansible_distribution in ['CentOS'] + +- name: Add DevTools to default shells on Rocky + ansible.builtin.copy: + dest: /etc/profile.d/98-devtools.sh + content: | + . 
/opt/rh/gcc-toolset-11/enable + owner: root + mode: 0755 + force: False + when: ansible_distribution in ['Rocky'] - name: Install Debian Dev tools ansible.builtin.apt: From e79eb35314c51649aceccd9792f3b5cba2ca0452 Mon Sep 17 00:00:00 2001 From: Eimantas Kazakevicius Date: Tue, 1 Aug 2023 10:30:00 +0100 Subject: [PATCH 112/144] OFE Backend: improving custom image delete function --- .../ofe/website/ghpcfe/cluster_manager/image.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/community/front-end/ofe/website/ghpcfe/cluster_manager/image.py b/community/front-end/ofe/website/ghpcfe/cluster_manager/image.py index c183219ab1..a2d27854f2 100644 --- a/community/front-end/ofe/website/ghpcfe/cluster_manager/image.py +++ b/community/front-end/ofe/website/ghpcfe/cluster_manager/image.py @@ -255,11 +255,17 @@ def delete_image(self): project_id = json.loads(self.image.cloud_credential.detail)["project_id"] image_name = f"image-{self.image.name}" zone = self.image.cloud_zone + + # Set the GOOGLE_APPLICATION_CREDENTIALS environment variable + os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = self.credentials_file.as_posix() # Create a client client = compute_v1.ImagesClient() try: + # Make sure that the builder env is destroyed + self._destroy_builder_env() + # Delete the image operation = client.delete(project=project_id, image=image_name) operation.result() @@ -269,4 +275,8 @@ def delete_image(self): logger.error(f"Image '{image_name}' not found in project '{project_id}' or zone '{zone}'") except Exception as e: - logger.error(f"An error occurred while deleting the image: {e}") + logger.error(f"An error occurred while deleting the image {image_name}: {e}") + + finally: + # Clear the GOOGLE_APPLICATION_CREDENTIALS environment variable + os.environ.pop("GOOGLE_APPLICATION_CREDENTIALS", None) From 9fdd28ed1853768c7de6fa82272f26ee665a1182 Mon Sep 17 00:00:00 2001 From: Eimantas Kazakevicius Date: Tue, 1 Aug 2023 13:46:27 +0100 Subject: [PATCH 113/144] OFE: addressing #1594 @maxm17 comments --- .../ghpcfe/cluster_manager/clusterinfo.py | 127 ++++++++++-------- .../website/ghpcfe/cluster_manager/image.py | 39 +++++- .../website/ghpcfe/cluster_manager/utils.py | 90 +++++++++++-- .../front-end/ofe/website/ghpcfe/forms.py | 18 ++- .../website/ghpcfe/templates/image/list.html | 2 +- .../ofe/website/ghpcfe/views/images.py | 13 +- 6 files changed, 205 insertions(+), 84 deletions(-) diff --git a/community/front-end/ofe/website/ghpcfe/cluster_manager/clusterinfo.py b/community/front-end/ofe/website/ghpcfe/cluster_manager/clusterinfo.py index ef40c31c92..133574f038 100644 --- a/community/front-end/ofe/website/ghpcfe/cluster_manager/clusterinfo.py +++ b/community/front-end/ofe/website/ghpcfe/cluster_manager/clusterinfo.py @@ -62,6 +62,25 @@ def __init__(self, cluster): ) def prepare(self, credentials): + """Prepares the cluster for deployment. + + This method performs the necessary steps to prepare the cluster for deployment. + It creates the required directory, sets up authentication credentials, and updates + the cluster configuration. This method must be called before starting the cluster. + + Args: + credentials (str): The authentication credentials required to access the cloud + provider's resources. This should be a JSON-formatted string containing the + necessary authentication details. + + Raises: + subprocess.CalledProcessError: If there is an error during the preparation + process, this exception will be raised, indicating that the process failed. 
+ + Note: + The required credentials can be obtained from the cloud provider's dashboard or + by following the documentation for obtaining authentication credentials. + """ self._create_cluster_dir() self._set_credentials(credentials) self.update() @@ -165,7 +184,7 @@ def _prepare_ghpc_filesystems(self): refs.append(storage_id) return ("\n\n".join(yaml), refs) - + def _prepare_ghpc_partitions(self, part_uses): yaml = [] refs = [] @@ -226,59 +245,55 @@ def _yaml_refs_to_uses(self, use_list): def _prepare_ghpc_yaml(self): - try: - yaml_file = self.cluster_dir / "cluster.yaml" - project_id = json.loads(self.cluster.cloud_credential.detail)[ - "project_id" - ] - - ( - filesystems_yaml, - filesystems_references, - ) = self._prepare_ghpc_filesystems() - ( - partitions_yaml, - partitions_references, - ) = self._prepare_ghpc_partitions( - ["hpc_network"] + filesystems_references - ) - - controller_uses = self._yaml_refs_to_uses( - ["hpc_network"] + partitions_references + filesystems_references - ) - login_uses = self._yaml_refs_to_uses( - ["hpc_network"] + filesystems_references - ) - - controller_sa = "sa" - # TODO: Determine if these all should be different, and if so, add to - # resource to be created. NOTE though, that at the moment, GHPC won't - # let us unpack output variables, so we can't index properly. - # for now, just use the singular access, and only create a single acct - # compute_sa = controller_sa - # login_sa = controller_sa - - # pylint: disable=line-too-long - startup_bucket = self.config["server"]["gcs_bucket"] - - if self.cluster.login_node_image is not None: - login_image_yaml = f"""instance_image: - family: image-{self.cluster.login_node_image.family} - project: {self.cluster.project_id}""" - else: - login_image_yaml = "" - - if self.cluster.controller_node_image is not None: - controller_image_yaml = f"""instance_image: - family: image-{self.cluster.controller_node_image.family} - project: {self.cluster.project_id} - """ - else: - controller_image_yaml = "" - - with yaml_file.open("w") as f: - f.write( - f""" + try: + yaml_file = self.cluster_dir / "cluster.yaml" + project_id = json.loads(self.cluster.cloud_credential.detail)[ + "project_id" + ] + ( + filesystems_yaml, + filesystems_references, + ) = self._prepare_ghpc_filesystems() + ( + partitions_yaml, + partitions_references, + ) = self._prepare_ghpc_partitions( + ["hpc_network"] + filesystems_references + ) + controller_uses = self._yaml_refs_to_uses( + ["hpc_network"] + partitions_references + filesystems_references + ) + login_uses = self._yaml_refs_to_uses( + ["hpc_network"] + filesystems_references + ) + controller_sa = "sa" + # TODO: Determine if these all should be different, and if so, add to + # resource to be created. NOTE though, that at the moment, GHPC won't + # let us unpack output variables, so we can't index properly. 
+ # for now, just use the singular access, and only create a single acct + # compute_sa = controller_sa + # login_sa = controller_sa + + # pylint: disable=line-too-long + startup_bucket = self.config["server"]["gcs_bucket"] + if self.cluster.login_node_image is not None: + login_image_yaml = f"""instance_image: + family: image-{self.cluster.login_node_image.family} + project: {self.cluster.project_id}""" + else: + login_image_yaml = "" + + if self.cluster.controller_node_image is not None: + controller_image_yaml = f"""instance_image: + family: image-{self.cluster.controller_node_image.family} + project: {self.cluster.project_id} + """ + else: + controller_image_yaml = "" + + with yaml_file.open("w") as f: + f.write( + f""" blueprint_name: {self.cluster.cloud_id} vars: @@ -379,8 +394,8 @@ def _prepare_ghpc_yaml(self): ) # pylint: enable=line-too-long - except Exception as E: - logger.exception(f"Exception happened creating blueprint for cluster {self.cluster.name} - {E}") + except Exception as e: + logger.exception(f"Exception happened creating blueprint for cluster {self.cluster.name} - {e}") def _prepare_bootstrap_gcs(self): diff --git a/community/front-end/ofe/website/ghpcfe/cluster_manager/image.py b/community/front-end/ofe/website/ghpcfe/cluster_manager/image.py index a2d27854f2..49fde4dc1d 100644 --- a/community/front-end/ofe/website/ghpcfe/cluster_manager/image.py +++ b/community/front-end/ofe/website/ghpcfe/cluster_manager/image.py @@ -33,7 +33,7 @@ class ImageBackend: def __init__(self, image): self.config = utils.load_config() - self.ghpc_path = self.config["baseDir"].parent.parent / "ghpc" + self.ghpc_path = self.config["baseDir"].parents[1] / "ghpc" self.image = image self.image_dir = ( @@ -46,6 +46,34 @@ def __init__(self, image): def prepare(self): + """ + Prepare the image creation process by following these steps: + + 1. Create the necessary directory structure for the image. + 2. Generate a HPC Toolkit blueprint to build the image. + 3. Run the HPC Toolkit (`ghpc`) to create the image based on the blueprint. + 4. Set up the builder environment on Google Cloud Platform (GCP) using Terraform. + 5. Create the image on GCP using Packer. + 6. Destroy the builder environment after the image creation is complete. + + This method handles the entire image creation process, from setting up the necessary + directories and configuration files to executing HPC Toolkit and Packer to build + and finalize the image. If any step encounters an error, it logs the issue and marks + the image's status as "error" (status code 'e'). + + Note: + - This method assumes that the necessary tools (HPC Toolkit, Terraform, and Packer) + are properly installed and configured on the system. + - The credentials file required for GCP authentication is created during the image + directory setup. + + Raises: + OSError: If there is an error while creating the image directory or writing to + the credentials file. + IOError: If there is an error while writing to the credentials file. + subprocess.CalledProcessError: If any of the subprocess calls (ghpc, Terraform, or Packer) + encounter an error during execution. 
+ """ self._create_image_dir() self._create_blueprint() self._run_ghpc() @@ -82,7 +110,6 @@ def _create_blueprint(self): runners = "" for script in scripts: script_path = os.path.join(settings.MEDIA_ROOT, script.content.name) - print(script_path) runners+=f""" - type: {script.type} destination: {script.name} @@ -139,7 +166,7 @@ def _create_blueprint(self): ) except Exception as e: self.update_image_status("e") - print(f"Error occurred while creating blueprint: {str(e)}") + logger.error(f"Error occurred while creating blueprint: {e}") def _run_ghpc(self): target_dir = self.image_dir @@ -174,7 +201,7 @@ def _create_builder_env(self): packer_dir = os.path.join(self.image_dir, f"{self.blueprint_name}/packer-image") except OSError as e: self.update_image_status("e") - print(f"Error occurred while constructing terraform_dir: {e}") + logger.error(f"Error occurred while constructing terraform_dir: {e}") utils.run_terraform(terraform_dir, "init") utils.run_terraform(terraform_dir, "validate", extra_env=extra_env) logger.info("Invoking Terraform Plan for builder env.") @@ -226,7 +253,7 @@ def _create_image(self): logger.info(" STDERR:\n%s\n", cpe.stderr.decode("utf-8")) raise except Exception as e: - logger.exception(f"Unhandled error happened durring image {self.image.id} creation.") + logger.exception(f"Unhandled error happened during image {self.image.id} creation.") def _destroy_builder_env(self): """Destroy builder environment on GCP.""" @@ -239,7 +266,7 @@ def _destroy_builder_env(self): terraform_dir = os.path.join(self.image_dir, f"{self.blueprint_name}/builder-env") except OSError as e: self.update_image_status("e") - print(f"Error occurred while constructing terraform_dir: {e}") + logger.error(f"Error occurred while constructing terraform_dir: {e}") logger.info("Invoking Terraform Destroy for builder env.") utils.run_terraform(terraform_dir, "destroy", extra_env=extra_env) except subprocess.CalledProcessError as cpe: diff --git a/community/front-end/ofe/website/ghpcfe/cluster_manager/utils.py b/community/front-end/ofe/website/ghpcfe/cluster_manager/utils.py index d4a0f6f909..990b7f76d2 100644 --- a/community/front-end/ofe/website/ghpcfe/cluster_manager/utils.py +++ b/community/front-end/ofe/website/ghpcfe/cluster_manager/utils.py @@ -225,6 +225,49 @@ def run_terraform(target_dir, command, arguments=None, extra_env=None): return (log_out_fn, log_err_fn) def run_packer(target_dir, command, arguments=None, extra_env=None): + """ + Run the Packer command with the specified arguments in the given target directory. + + This function facilitates the execution of Packer commands from within a Python script. + It uses the `subprocess.run` method to invoke Packer and logs the output and any errors + generated during the execution. + + Args: + target_dir (str): The target directory where Packer should be executed. + command (str): The Packer command to be executed (e.g., "build", "validate", etc.). + arguments (list, optional): Additional command-line arguments to pass to Packer. + Defaults to an empty list if not provided. + extra_env (dict, optional): Extra environment variables to set before running Packer. + Defaults to an empty dictionary if not provided. + + Returns: + tuple: A tuple containing two `Path` objects representing the paths to the log files + where the standard output and standard error of the Packer command are logged. + + Raises: + RuntimeError: If there is an error while executing the Packer command or if an + exception is raised during the execution. 
+ + Note: + - The `target_dir` should be a valid directory path where Packer-related files and + configuration are located. + - The `command` should be a valid Packer command (e.g., "build", "validate"). + - The `arguments` parameter allows users to provide additional command-line arguments + for Packer as a list of strings. + - The `extra_env` parameter allows users to set additional environment variables for + Packer execution as a dictionary. + - The standard output and standard error of the Packer command are logged to files in + the target directory with filenames in the format `packer_{command}_log.stdout` + and `packer_{command}_log.stderr`, respectively. + + Example Usage: + >>> target_dir = "/path/to/packer/directory" + >>> command = "build" + >>> arguments = ["-var", "variable=value", "template.json"] + >>> extra_env = {"PACKER_LOG": "1"} + >>> run_packer(target_dir, command, arguments, extra_env) + + """ arguments = arguments if arguments else [] extra_env = extra_env if extra_env else {} @@ -237,21 +280,19 @@ def run_packer(target_dir, command, arguments=None, extra_env=None): log_err_fn = Path(target_dir) / f"packer_{command}_log.stderr" new_env = os.environ.copy() - if "SSH_AUTH_SOCK" in new_env: - del new_env["SSH_AUTH_SOCK"] + new_env.pop("SSH_AUTH_SOCK", None) new_env.update(extra_env) try: - with log_out_fn.open("wb") as log_out: - with log_err_fn.open("wb") as log_err: - subprocess.run( - cmdline, - cwd=target_dir, - env=new_env, - stdout=log_out, - stderr=log_err, - check=True, - ) + with log_out_fn.open("wb") as log_out, log_err_fn.open("wb") as log_err: + subprocess.run( + cmdline, + cwd=target_dir, + env=new_env, + stdout=log_out, + stderr=log_err, + check=True, + ) except subprocess.CalledProcessError as e: # Handle the error from Packer command execution raise RuntimeError(f"Packer command failed: {e}") @@ -263,6 +304,31 @@ def run_packer(target_dir, command, arguments=None, extra_env=None): def copy_file(source_file, destination_file): + """ + Copy a file from the source path to the destination path. + + This function uses the `shutil.copy` method from the standard library to copy the file. + It logs the success message if the file is copied successfully, and it logs any errors + that may occur during the copy process. + + Args: + source_file (str): The path to the source file that needs to be copied. + destination_file (str): The path to the destination where the file should be copied. + + Raises: + shutil.Error: If any error occurs during the file copy process. + IOError: If there is an error while reading the source file or writing to the destination file. + + Note: + - If the destination file already exists, it will be replaced by the source file. + - If the source file does not exist, a `FileNotFoundError` will be raised by `shutil.copy`. 
+ + Example: + >>> source_file = "/path/to/source_file.txt" + >>> destination_file = "/path/to/destination_file.txt" + >>> copy_file(source_file, destination_file) + + """ try: shutil.copy(source_file, destination_file) logger.info("File copied successfully.") diff --git a/community/front-end/ofe/website/ghpcfe/forms.py b/community/front-end/ofe/website/ghpcfe/forms.py index 4a5a2a5f24..07f6e00fbe 100644 --- a/community/front-end/ofe/website/ghpcfe/forms.py +++ b/community/front-end/ofe/website/ghpcfe/forms.py @@ -869,7 +869,23 @@ def __init__(self, *args, **kwargs): self.fields[field].widget.attrs.update({"class": "form-control"}) class StartupScriptForm(forms.ModelForm): - """Custom form for StartupScript model""" + """ + Custom form for handling data input and validation for the StartupScript model. + + This form class extends the `forms.ModelForm` class and is designed to work with the + `StartupScript` model, which represents a script executed during + the startup phase of a node. + + Form Fields: + - "name": A text input field for providing a name for the startup script. + - "description": A textarea input field for adding a description of the script. + - "type": A select input field for choosing the type or category of the script. + - "content": A file input field for uploading the content of the startup script. + + Form Validation: + The form automatically validates the input data based on the model field definitions + and any additional constraints defined in the model. + """ class Meta: model = StartupScript diff --git a/community/front-end/ofe/website/ghpcfe/templates/image/list.html b/community/front-end/ofe/website/ghpcfe/templates/image/list.html index 1f96087402..1b1b25e1fb 100644 --- a/community/front-end/ofe/website/ghpcfe/templates/image/list.html +++ b/community/front-end/ofe/website/ghpcfe/templates/image/list.html @@ -106,7 +106,7 @@

Images

          View
-          {% if admin_view == 1 and image.status == "r" %}
+          {% if admin_view == 1 and image.status == "r" or image.status == "e" %}
           {% endif %}
diff --git a/community/front-end/ofe/website/ghpcfe/views/images.py b/community/front-end/ofe/website/ghpcfe/views/images.py
index 03e54440e2..e596b922ff 100644
--- a/community/front-end/ofe/website/ghpcfe/views/images.py
+++ b/community/front-end/ofe/website/ghpcfe/views/images.py
@@ -35,6 +35,7 @@
 from ..cluster_manager.image import ImageBackend
 from ..cluster_manager.cloud_info import get_region_zone_info
 from ..views.asyncview import BackendAsyncView
+from pathlib import Path

 import logging

@@ -63,8 +64,8 @@ def get_queryset(self):
         authorized_images = Image.objects.filter(authorised_users=self.request.user)

         # Combine the owned and authorized objects
-        startup_scripts = startup_scripts | authorized_startup_scripts
-        images = images | authorized_images
+        startup_scripts |= authorized_startup_scripts
+        images |= authorized_images

         return startup_scripts, images

@@ -104,7 +105,7 @@ def get_context_data(self, **kwargs):

         # Check if the user is an admin, the owner, or authorized for the startup script
         if self.is_admin_or_authorized_user(startup_script):
-            file_path = os.path.join(settings.MEDIA_ROOT, startup_script.content.name)
+            file_path = Path(settings.MEDIA_ROOT) / startup_script.content.name
             try:
                 with open(file_path, 'r') as file:
                     try:
@@ -145,7 +146,7 @@ def test_func(self):

     def post(self, request, *args, **kwargs):
         startup_script = StartupScript.objects.get(pk=self.kwargs['pk'])
-        file_path = os.path.join(settings.MEDIA_ROOT, startup_script.content.name)
+        file_path = Path(settings.MEDIA_ROOT) / startup_script.content.name
         try:
             os.remove(file_path)
             logger.info("File deleted successfully.")
@@ -284,7 +285,3 @@ def get(self, request, pk, *args, **kwargs):
     credentials = get_object_or_404(Credential, pk=pk)
     regions = get_region_zone_info("GCP", credentials.detail)
     return JsonResponse(regions)
-
-
-
-

From a767c339a635bffcf23cf890e2cb82901ab9145a Mon Sep 17 00:00:00 2001
From: Nick Stroud
Date: Tue, 1 Aug 2023 22:05:39 -0700
Subject: [PATCH 114/144] Spack module tweaks to make compatible across
 supported images

---
 .../scripts/ramble-execute/templates/ramble_execute.yml.tpl    | 2 ++
 .../scripts/spack-execute/templates/execute_commands.yml.tpl   | 2 ++
 community/modules/scripts/spack-setup/main.tf                  | 2 +-
 .../modules/scripts/spack-setup/scripts/install_spack_deps.yml | 2 +-
 4 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/community/modules/scripts/ramble-execute/templates/ramble_execute.yml.tpl b/community/modules/scripts/ramble-execute/templates/ramble_execute.yml.tpl
index b0fac689f9..10dd388040 100644
--- a/community/modules/scripts/ramble-execute/templates/ramble_execute.yml.tpl
+++ b/community/modules/scripts/ramble-execute/templates/ramble_execute.yml.tpl
@@ -34,6 +34,8 @@
         {{ commands }}
         echo " === Finished commands ==="
         } 2>&1 | tee -a {{ log_file }}
+      args:
+        executable: /bin/bash
       register: output

   always:
diff --git a/community/modules/scripts/spack-execute/templates/execute_commands.yml.tpl b/community/modules/scripts/spack-execute/templates/execute_commands.yml.tpl
index b0fac689f9..10dd388040 100644
--- a/community/modules/scripts/spack-execute/templates/execute_commands.yml.tpl
+++ b/community/modules/scripts/spack-execute/templates/execute_commands.yml.tpl
@@ -34,6 +34,8 @@
         {{ commands }}
         echo " === Finished commands ==="
         } 2>&1 | tee -a {{ log_file }}
+      args:
+        executable: /bin/bash
       register: output

   always:
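The `executable: /bin/bash` argument added above matters because `set -o pipefail` is a bash feature that `ansible.builtin.shell`'s default interpreter, `/bin/sh` (dash on Debian and Ubuntu images), rejects. A minimal standalone sketch of the pattern, where the task name and log path are illustrative assumptions rather than values from these modules:

```yaml
- name: Run a pipeline that must fail fast
  ansible.builtin.shell: |
    # pipefail is not available in POSIX sh, so the interpreter is pinned below
    set -eo pipefail
    echo "hello" | tee -a /tmp/example.log
  args:
    executable: /bin/bash
  register: output
```

Without the `args` block, the same task can succeed on one base image and fail on another depending on what `/bin/sh` points to, which is exactly the cross-image incompatibility this patch addresses.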
diff --git a/community/modules/scripts/spack-setup/main.tf b/community/modules/scripts/spack-setup/main.tf
index 7691a9e90a..a3ec81c81d 100644
--- a/community/modules/scripts/spack-setup/main.tf
+++ b/community/modules/scripts/spack-setup/main.tf
@@ -29,7 +29,7 @@ locals {
   finalize_setup_script = <<-EOF
     set -e
-    source /etc/profile.d/spack.sh
+    . /etc/profile.d/spack.sh
     spack gpg init
     spack compiler find --scope site
   EOF
diff --git a/community/modules/scripts/spack-setup/scripts/install_spack_deps.yml b/community/modules/scripts/spack-setup/scripts/install_spack_deps.yml
index d962113527..dd7d418e05 100644
--- a/community/modules/scripts/spack-setup/scripts/install_spack_deps.yml
+++ b/community/modules/scripts/spack-setup/scripts/install_spack_deps.yml
@@ -23,7 +23,7 @@
 - name: Install pip3 and git
   ansible.builtin.package:
     name:
-      - python
+      - python3
       - python3-pip
      - git
   register: package

From 7406ed0ef68c2008dbcf6040d61a72d15c656b24 Mon Sep 17 00:00:00 2001
From: harshthakkar-google
Date: Wed, 2 Aug 2023 04:52:40 +0000
Subject: [PATCH 115/144] Change description of variable instance image to use
 ubuntu 2204 default

---
 .../modules/remote-desktop/chrome-remote-desktop/README.md    | 2 +-
 .../modules/remote-desktop/chrome-remote-desktop/variables.tf | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/community/modules/remote-desktop/chrome-remote-desktop/README.md b/community/modules/remote-desktop/chrome-remote-desktop/README.md
index 941e7a813c..eaa1c863cd 100644
--- a/community/modules/remote-desktop/chrome-remote-desktop/README.md
+++ b/community/modules/remote-desktop/chrome-remote-desktop/README.md
@@ -84,7 +84,7 @@ No resources.
 | [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. Requires virtual workstation accelerator if Nvidia Grid Drivers are required | <pre>list(object({<br>  type = string,<br>  count = number<br>}))</pre> | <pre>[<br>  {<br>    "count": 1,<br>    "type": "nvidia-tesla-t4-vws"<br>  }<br>]</pre> | no |
 | [install\_nvidia\_driver](#input\_install\_nvidia\_driver) | Installs the nvidia driver (true/false). For details, see https://cloud.google.com/compute/docs/gpus/install-drivers-gpu | `bool` | n/a | yes |
 | [instance\_count](#input\_instance\_count) | Number of instances | `number` | `1` | no |
-| [instance\_image](#input\_instance\_image) | Instance Image. An alternative could be family = "ubuntu-2004-lts" and project = "ubuntu-os-cloud" or family = "debian-11" and project = "debian-cloud" | <pre>object({<br>  family = string,<br>  project = string<br>})</pre> | <pre>{<br>  "family": "ubuntu-2204-lts",<br>  "project": "ubuntu-os-cloud"<br>}</pre> | no |
+| [instance\_image](#input\_instance\_image) | Instance Image. An alternative could be family = "ubuntu-2204-lts" and project = "ubuntu-os-cloud" or family = "debian-11" and project = "debian-cloud" | <pre>object({<br>  family = string,<br>  project = string<br>})</pre> | <pre>{<br>  "family": "ubuntu-2204-lts",<br>  "project": "ubuntu-os-cloud"<br>}</pre> | no |
 | [labels](#input\_labels) | Labels to add to the instances. Key-value pairs. | `map(string)` | `{}` | no |
 | [machine\_type](#input\_machine\_type) | Machine type to use for the instance creation. Must be N1 family if GPU is used. | `string` | `"n1-standard-8"` | no |
 | [metadata](#input\_metadata) | Metadata, provided as a map | `map(string)` | `{}` | no |
diff --git a/community/modules/remote-desktop/chrome-remote-desktop/variables.tf b/community/modules/remote-desktop/chrome-remote-desktop/variables.tf
index e51e4efd1a..0f7d800edb 100644
--- a/community/modules/remote-desktop/chrome-remote-desktop/variables.tf
+++ b/community/modules/remote-desktop/chrome-remote-desktop/variables.tf
@@ -56,7 +56,7 @@ variable "network_storage" {
 }

 variable "instance_image" {
-  description = "Instance Image. An alternative could be family = \"ubuntu-2004-lts\" and project = \"ubuntu-os-cloud\" or family = \"debian-11\" and project = \"debian-cloud\""
+  description = "Instance Image. An alternative could be family = \"ubuntu-2204-lts\" and project = \"ubuntu-os-cloud\" or family = \"debian-11\" and project = \"debian-cloud\""
   type = object({
     family  = string,
     project = string

From 7cf695bd4f128510e82aa03f77593214fdb3d5a5 Mon Sep 17 00:00:00 2001
From: Eimantas Kazakevicius
Date: Wed, 2 Aug 2023 08:58:44 +0100
Subject: [PATCH 116/144] OFE: updating certifi>=2023.07.22

---
 community/front-end/ofe/requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/community/front-end/ofe/requirements.txt b/community/front-end/ofe/requirements.txt
index 9e879290eb..777494c01d 100644
--- a/community/front-end/ofe/requirements.txt
+++ b/community/front-end/ofe/requirements.txt
@@ -4,7 +4,7 @@ asgiref==3.7.2
 astroid==2.15.5
 backports.zoneinfo==0.2.1
 cachetools==5.3.1
-certifi==2023.5.7
+certifi==2023.07.22
 cffi==1.15.1
 cfgv==3.3.1
 charset-normalizer==3.1.0

From 6f2929405afdbf986a61935717eb0ac1f139fdea Mon Sep 17 00:00:00 2001
From: Eimantas Kazakevicius
Date: Wed, 2 Aug 2023 13:17:51 +0100
Subject: [PATCH 117/144] OFE Backend: updating GPU partition definition

---
 .../ofe/website/ghpcfe/cluster_manager/clusterinfo.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/community/front-end/ofe/website/ghpcfe/cluster_manager/clusterinfo.py b/community/front-end/ofe/website/ghpcfe/cluster_manager/clusterinfo.py
index 133574f038..dec018ba14 100644
--- a/community/front-end/ofe/website/ghpcfe/cluster_manager/clusterinfo.py
+++ b/community/front-end/ofe/website/ghpcfe/cluster_manager/clusterinfo.py
@@ -76,7 +76,7 @@ def prepare(self, credentials):
         Raises:
             subprocess.CalledProcessError: If there is an error during the preparation
                 process, this exception will be raised, indicating that the process failed.
-
+
         Note:
             The required credentials can be obtained from the cloud provider's dashboard or
             by following the documentation for obtaining authentication credentials.
@@ -232,8 +232,9 @@ def _prepare_ghpc_partitions(self, part_uses):
         if part.GPU_per_node > 0:
             yaml[-1] += (
                 f"""\
-      gpu.count: {part.GPU_per_node}
-      gpu.type: {part.GPU_type}
+      guest_accelerator:
+      - type: {part.GPU_type}
+        count: {part.GPU_per_node}
 """
             )
         refs.append(part_id)

From 0e656d509b0428da5944a7833f3b714ed10d2143 Mon Sep 17 00:00:00 2001
From: Nick Stroud
Date: Tue, 1 Aug 2023 22:29:46 -0700
Subject: [PATCH 118/144] Provide info about logging to the user as ansible
 output will hang

---
 .../scripts/ramble-execute/templates/ramble_execute.yml.tpl | 6 ++++++
 .../spack-execute/templates/execute_commands.yml.tpl        | 6 ++++++
 2 files changed, 12 insertions(+)

diff --git a/community/modules/scripts/ramble-execute/templates/ramble_execute.yml.tpl b/community/modules/scripts/ramble-execute/templates/ramble_execute.yml.tpl
index 10dd388040..f5e38fd212 100644
--- a/community/modules/scripts/ramble-execute/templates/ramble_execute.yml.tpl
+++ b/community/modules/scripts/ramble-execute/templates/ramble_execute.yml.tpl
@@ -25,6 +25,12 @@
       ansible.builtin.debug:
         msg: "{{ commands.split('\n') | ansible.builtin.to_nice_yaml }}"

+    - name: Streaming log info
+      ansible.builtin.debug:
+        msg: |
+          Logs from commands will not be printed here until success (or failure)
+          Streaming logs can be found at {{ log_file }}
+
     - name: Execute commands
       ansible.builtin.shell: |
         set -eo pipefail
diff --git a/community/modules/scripts/spack-execute/templates/execute_commands.yml.tpl b/community/modules/scripts/spack-execute/templates/execute_commands.yml.tpl
index 10dd388040..f5e38fd212 100644
--- a/community/modules/scripts/spack-execute/templates/execute_commands.yml.tpl
+++ b/community/modules/scripts/spack-execute/templates/execute_commands.yml.tpl
@@ -25,6 +25,12 @@
       ansible.builtin.debug:
         msg: "{{ commands.split('\n') | ansible.builtin.to_nice_yaml }}"

+    - name: Streaming log info
+      ansible.builtin.debug:
+        msg: |
+          Logs from commands will not be printed here until success (or failure)
+          Streaming logs can be found at {{ log_file }}
+
     - name: Execute commands
       ansible.builtin.shell: |
         set -eo pipefail

From e487a7fcfb421251c25c20207189b55294878c35 Mon Sep 17 00:00:00 2001
From: Nick Stroud
Date: Tue, 1 Aug 2023 23:24:10 -0700
Subject: [PATCH 119/144] Add information about deprecation and breaking
 changes

---
 .../modules/scripts/spack-setup/README.md | 70 +++++++++++++++++++
 modules/README.md                         |  3 +
 2 files changed, 73 insertions(+)

diff --git a/community/modules/scripts/spack-setup/README.md b/community/modules/scripts/spack-setup/README.md
index 58192bd4cd..b517e4106e 100644
--- a/community/modules/scripts/spack-setup/README.md
+++ b/community/modules/scripts/spack-setup/README.md
@@ -12,6 +12,11 @@ This module generates a script that performs the following:

 There are several options on how to consume the outputs of this module:

+> [!IMPORTANT]
+> Breaking changes were made after v1.21.0: the `spack-install` module was
+> replaced by the `spack-setup` and `spack-execute` modules.
+> [Details Below](#deprecations-and-breaking-changes)
+
 ## Examples

 ### `use` `spack-setup` with `spack-execute`
@@ -135,6 +140,71 @@ sudo -i spack python -m pip install package-name

 [SPACK_PYTHON]: https://spack.readthedocs.io/en/latest/getting_started.html#shell-support
 [builds]: https://spack.readthedocs.io/en/latest/binary_caches.html

+## Deprecations and Breaking Changes
+
+The old `spack-install` module has been replaced by the `spack-setup` and
+`spack-execute` modules.
Generally this change strives to allow for a more +flexible definition of a Spack build by using native Spack commands. + +For every deprecated variable from `spack-install` there is documentation on how +to perform the equivalent action using `commands` and `data_files`. The +documentation can be found on the [inputs table](#inputs) below. + +Below is a simple example of the same functionality shown before and after the +breaking changes. + +```yaml + # Before + - id: spack-install + source: community/modules/scripts/spack-install + settings: + install_dir: /sw/spack + compilers: + - gcc@10.3.0 target=x86_64 + packages: + - intel-mpi@2018.4.274%gcc@10.3.0 + +- id: spack-startup + source: modules/scripts/startup-script + settings: + runners: + - $(spack.install_spack_deps_runner) + - $(spack.install_spack_runner) +``` + +```yaml + # After + - id: spack-setup + source: community/modules/scripts/spack-setup + settings: + install_dir: /sw/spack + + - id: spack-execute + source: community/modules/scripts/spack-execute + use: [spack-setup] + settings: + commands: | + spack install gcc@10.3.0 target=x86_64 + spack load gcc@10.3.0 target=x86_64 + spack compiler find --scope site + spack install intel-mpi@2018.4.274%gcc@10.3.0 + +- id: spack-startup + source: modules/scripts/startup-script + settings: + runners: + - $(spack-execute.spack-runner) +``` + +Although the old `spack-install` module will no longer be maintained, it is +still possible to use the old module in a blueprint by referencing an old +version from GitHub. Note the source line in the following example. + +```yaml + - id: spack-install + source: github.com/GoogleCloudPlatform/hpc-toolkit//community/modules/scripts/spack-install?ref=v1.21.0&depth=1 +``` + ## License diff --git a/modules/README.md b/modules/README.md index 34c946d8ca..a71a3dd430 100644 --- a/modules/README.md +++ b/modules/README.md @@ -217,6 +217,8 @@ Modules that are still in development and less stable are labeled with the * **[spack-setup]** ![community-badge] ![experimental-badge] : Creates a startup script to install [Spack](https://github.com/spack/spack) on an instance or a slurm login or controller. +* **[spack-execute]** ![community-badge] ![experimental-badge] : Defines a + software build using [Spack](https://github.com/spack/spack). * **[wait-for-startup]** ![community-badge] ![experimental-badge] : Waits for successful completion of a startup script on a compute VM. 
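Extending the migration examples above with one more sketch: a build that previously used the deprecated `environments` variable maps onto native Spack commands in `spack-execute`. The module IDs, environment name, and package choice below are illustrative assumptions that mirror the migration notes documented for the deprecated inputs, not values taken from any shipped blueprint:

```yaml
  - id: spack-execute
    source: community/modules/scripts/spack-execute
    use: [spack-setup]
    settings:
      commands: |
        # create the environment only once, then populate and build it
        if ! spack env list | grep -q my-env; then
          spack env create my-env
        fi
        spack env activate my-env
        spack add intel-mpi@2018.4.274 %gcc@10.3.0
        spack concretize
        spack install
```

Because the build is now expressed as ordinary Spack commands, any environment feature Spack supports can be used without waiting for a matching module variable.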
@@ -232,6 +234,7 @@ Modules that are still in development and less stable are labeled with the
 [ramble-execute]: ../community/modules/scripts/ramble-execute/README.md
 [ramble-setup]: ../community/modules/scripts/ramble-setup/README.md
 [spack-setup]: ../community/modules/scripts/spack-setup/README.md
+[spack-execute]: ../community/modules/scripts/spack-execute/README.md
 [wait-for-startup]: ../community/modules/scripts/wait-for-startup/README.md

 ## Module Fields

From 82dd41158ef85c88644967f12126abfd90bfeec9 Mon Sep 17 00:00:00 2001
From: Nick Stroud
Date: Wed, 2 Aug 2023 01:06:48 -0700
Subject: [PATCH 120/144] Setup Spack to pull from Google's Spack binary cache
 by default

---
 community/modules/scripts/spack-setup/README.md    |  1 +
 community/modules/scripts/spack-setup/main.tf      | 10 ++++++++++
 community/modules/scripts/spack-setup/variables.tf |  6 ++++++
 3 files changed, 17 insertions(+)

diff --git a/community/modules/scripts/spack-setup/README.md b/community/modules/scripts/spack-setup/README.md
index b517e4106e..eb3e38c044 100644
--- a/community/modules/scripts/spack-setup/README.md
+++ b/community/modules/scripts/spack-setup/README.md
@@ -261,6 +261,7 @@ limitations under the License.
 | [compilers](#input\_compilers) | DEPRECATED<br><br>Use the [spack-execute](../spack-execute/) module; the following `commands` can be used to install compilers:<br>spack install gcc@10.3.0 target=x86_64<br>spack load gcc@10.3.0 target=x86_64<br>spack compiler find --scope site<br>spack clean -s<br>spack unload gcc@10.3.0<br>Defines compilers for spack to install before installing packages. | `list(string)` | `null` | no |
 | [concretize\_flags](#input\_concretize\_flags) | DEPRECATED - spack concretize is now performed using the [spack-execute](../spack-execute/) module `commands` variable. | `string` | `null` | no |
 | [configs](#input\_configs) | DEPRECATED<br><br>Use the [spack-execute](../spack-execute/) module; the following `commands` can be used to add a single config:<br>spack config --scope defaults add config:default:true<br>Alternatively, use `data_files` to transfer a config file and use the `spack config add -f <config path>` command to add the config.<br><br>List of configuration options to set within spack. | `list(map(any))` | `null` | no |
+| [configure\_for\_google](#input\_configure\_for\_google) | When true, the spack installation will be configured to pull from Google's Spack binary cache. | `bool` | `true` | no |
 | [deployment\_name](#input\_deployment\_name) | Name of deployment, used to name bucket containing startup script. | `string` | n/a | yes |
 | [environments](#input\_environments) | DEPRECATED<br><br>Use the [spack-execute](../spack-execute/) module; the following `commands` can be used to configure an environment:<br>if ! spack env list \| grep -q my-env; then<br>spack env create my-env<br>fi<br>spack env activate my-env<br>spack add intel-mpi@2018.4.274 %gcc@10.3.0<br>spack concretize<br>spack install<br>Defines spack environments to configure.<br>For more information, see: https://spack.readthedocs.io/en/latest/environments.html. | `any` | `null` | no |
 | [gpg\_keys](#input\_gpg\_keys) | DEPRECATED<br><br>Use the [spack-execute](../spack-execute/) module; the following `commands` can be used to create a new GPG key:<br>spack gpg init<br>spack gpg create <name> <email><br>Alternatively, `data_files` can be used to transfer an existing GPG key. Then use `spack gpg trust <file>` to add the key to the keyring.<br><br>GPG Keys to trust within spack. | `list(map(any))` | `null` | no |
diff --git a/community/modules/scripts/spack-setup/main.tf b/community/modules/scripts/spack-setup/main.tf
index a3ec81c81d..5b67d0b8da 100644
--- a/community/modules/scripts/spack-setup/main.tf
+++ b/community/modules/scripts/spack-setup/main.tf
@@ -27,11 +27,21 @@ locals {
     fi
   EOF

+  supported_cache_versions = ["v0.19.0", "v0.20.0"]
+  cache_version            = contains(local.supported_cache_versions, var.spack_ref) ? var.spack_ref : "latest"
+  add_google_mirror_script = !var.configure_for_google ? "" : <<-EOF
+    if ! spack mirror list | grep -q google_binary_cache; then
+      spack mirror add --scope site google_binary_cache gs://spack/${local.cache_version}
+      spack buildcache keys --install --trust
+    fi
+  EOF
+
   finalize_setup_script = <<-EOF
     set -e
     . /etc/profile.d/spack.sh
     spack gpg init
     spack compiler find --scope site
+    ${local.add_google_mirror_script}
   EOF

   script_content = templatefile(
diff --git a/community/modules/scripts/spack-setup/variables.tf b/community/modules/scripts/spack-setup/variables.tf
index e2c0ca5920..f0f385d3e7 100644
--- a/community/modules/scripts/spack-setup/variables.tf
+++ b/community/modules/scripts/spack-setup/variables.tf
@@ -39,6 +39,12 @@ variable "spack_ref" {
   default = "v0.20.0"
 }

+variable "configure_for_google" {
+  description = "When true, the spack installation will be configured to pull from Google's Spack binary cache."
+  type        = bool
+  default     = true
+}
+
 variable "chown_owner" {
   description = "Owner to chown the Spack clone to. Default will not modify the clone."
   default = null

From 56c337457648599eded557950f01305f6d2663d3 Mon Sep 17 00:00:00 2001
From: Tom Downes
Date: Wed, 2 Aug 2023 13:53:59 -0500
Subject: [PATCH 121/144] Support specification of MIG target shape in
 HTCondor modules

---
 .../compute/htcondor-execute-point/README.md   |  3 ++-
 .../compute/htcondor-execute-point/main.tf     | 18 +++++++++---------
 .../htcondor-execute-point/variables.tf        |  6 ++++++
 .../scheduler/htcondor-access-point/README.md  |  3 ++-
 .../scheduler/htcondor-access-point/main.tf    | 17 +++++++++--------
 .../htcondor-access-point/variables.tf         |  6 ++++++
 .../htcondor-central-manager/README.md         |  3 ++-
 .../scheduler/htcondor-central-manager/main.tf | 17 +++++++++--------
 .../htcondor-central-manager/variables.tf      |  6 ++++++
 9 files changed, 51 insertions(+), 28 deletions(-)

diff --git a/community/modules/compute/htcondor-execute-point/README.md b/community/modules/compute/htcondor-execute-point/README.md
index 463904e376..5bfd4dfcf9 100644
--- a/community/modules/compute/htcondor-execute-point/README.md
+++ b/community/modules/compute/htcondor-execute-point/README.md
@@ -176,7 +176,7 @@ limitations under the License.
 | Name | Source | Version |
 |------|--------|---------|
 | [execute\_point\_instance\_template](#module\_execute\_point\_instance\_template) | terraform-google-modules/vm/google//modules/instance_template | ~> 8.0 |
-| [mig](#module\_mig) | terraform-google-modules/vm/google//modules/mig | ~> 8.0 |
+| [mig](#module\_mig) | github.com/terraform-google-modules/terraform-google-vm//modules/mig | aea74d1 |
 | [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.20.0&depth=1 |

 ## Resources

@@ -194,6 +194,7 @@ limitations under the License.
| [central\_manager\_ips](#input\_central\_manager\_ips) | List of IP addresses of HTCondor Central Managers | `list(string)` | n/a | yes | | [deployment\_name](#input\_deployment\_name) | HPC Toolkit deployment name. HTCondor cloud resource names will include this value. | `string` | n/a | yes | | [disk\_size\_gb](#input\_disk\_size\_gb) | Boot disk size in GB | `number` | `100` | no | +| [distribution\_policy\_target\_shape](#input\_distribution\_policy\_target\_shape) | Target shape across zones for instance group managing execute points | `string` | `"ANY"` | no | | [enable\_oslogin](#input\_enable\_oslogin) | Enable or Disable OS Login with "ENABLE" or "DISABLE". Set to "INHERIT" to inherit project OS Login setting. | `string` | `"ENABLE"` | no | | [execute\_point\_runner](#input\_execute\_point\_runner) | A list of Toolkit runners for configuring an HTCondor execute point | `list(map(string))` | `[]` | no | | [execute\_point\_service\_account\_email](#input\_execute\_point\_service\_account\_email) | Service account for HTCondor execute point (e-mail format) | `string` | n/a | yes | diff --git a/community/modules/compute/htcondor-execute-point/main.tf b/community/modules/compute/htcondor-execute-point/main.tf index 677104c974..45f3b4891b 100644 --- a/community/modules/compute/htcondor-execute-point/main.tf +++ b/community/modules/compute/htcondor-execute-point/main.tf @@ -137,15 +137,15 @@ module "execute_point_instance_template" { } module "mig" { - source = "terraform-google-modules/vm/google//modules/mig" - version = "~> 8.0" - project_id = var.project_id - region = var.region - distribution_policy_zones = local.zones - target_size = var.target_size - hostname = local.name_prefix - mig_name = local.name_prefix - instance_template = module.execute_point_instance_template.self_link + source = "github.com/terraform-google-modules/terraform-google-vm//modules/mig?ref=aea74d1" + project_id = var.project_id + region = var.region + distribution_policy_target_shape = var.distribution_policy_target_shape + distribution_policy_zones = local.zones + target_size = var.target_size + hostname = local.name_prefix + mig_name = local.name_prefix + instance_template = module.execute_point_instance_template.self_link health_check_name = "health-htcondor-${local.name_prefix}" health_check = { diff --git a/community/modules/compute/htcondor-execute-point/variables.tf b/community/modules/compute/htcondor-execute-point/variables.tf index fcb1629f94..b21dbf5424 100644 --- a/community/modules/compute/htcondor-execute-point/variables.tf +++ b/community/modules/compute/htcondor-execute-point/variables.tf @@ -31,6 +31,12 @@ variable "zones" { nullable = false } +variable "distribution_policy_target_shape" { + description = "Target shape across zones for instance group managing execute points" + type = string + default = "ANY" +} + variable "deployment_name" { description = "HPC Toolkit deployment name. HTCondor cloud resource names will include this value." type = string diff --git a/community/modules/scheduler/htcondor-access-point/README.md b/community/modules/scheduler/htcondor-access-point/README.md index cdcb09e9bb..f4458ace78 100644 --- a/community/modules/scheduler/htcondor-access-point/README.md +++ b/community/modules/scheduler/htcondor-access-point/README.md @@ -62,7 +62,7 @@ limitations under the License. 
 | Name | Source | Version |
 |------|--------|---------|
 | [access\_point\_instance\_template](#module\_access\_point\_instance\_template) | github.com/terraform-google-modules/terraform-google-vm//modules/instance_template | 84d7959 |
-| [htcondor\_ap](#module\_htcondor\_ap) | github.com/terraform-google-modules/terraform-google-vm//modules/mig | 84d7959 |
+| [htcondor\_ap](#module\_htcondor\_ap) | github.com/terraform-google-modules/terraform-google-vm//modules/mig | aea74d1 |
 | [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.20.0&depth=1 |

 ## Resources

@@ -87,6 +87,7 @@ limitations under the License.
 | [default\_mig\_id](#input\_default\_mig\_id) | Default MIG ID for HTCondor jobs; if unset, jobs must specify MIG id | `string` | `""` | no |
 | [deployment\_name](#input\_deployment\_name) | HPC Toolkit deployment name. HTCondor cloud resource names will include this value. | `string` | n/a | yes |
 | [disk\_size\_gb](#input\_disk\_size\_gb) | Boot disk size in GB | `number` | `null` | no |
+| [distribution\_policy\_target\_shape](#input\_distribution\_policy\_target\_shape) | Target shape across zones for instance group managing high availability of access point | `string` | `"BALANCED"` | no |
 | [enable\_high\_availability](#input\_enable\_high\_availability) | Provision HTCondor access point in high availability mode | `bool` | `false` | no |
 | [enable\_oslogin](#input\_enable\_oslogin) | Enable or Disable OS Login with "ENABLE" or "DISABLE". Set to "INHERIT" to inherit project OS Login setting. | `string` | `"ENABLE"` | no |
 | [enable\_public\_ips](#input\_enable\_public\_ips) | Enable Public IPs on the access points | `bool` | `false` | no |
diff --git a/community/modules/scheduler/htcondor-access-point/main.tf b/community/modules/scheduler/htcondor-access-point/main.tf
index 1df3bc4ad0..562113a136 100644
--- a/community/modules/scheduler/htcondor-access-point/main.tf
+++ b/community/modules/scheduler/htcondor-access-point/main.tf
@@ -160,14 +160,15 @@ module "access_point_instance_template" {

 module "htcondor_ap" {
   # tflint-ignore: terraform_module_pinned_source
-  source = "github.com/terraform-google-modules/terraform-google-vm//modules/mig?ref=84d7959"
-
-  project_id                = var.project_id
-  region                    = var.region
-  distribution_policy_zones = local.zones
-  target_size               = local.host_count
-  hostname                  = local.name_prefix
-  instance_template         = module.access_point_instance_template.self_link
+  source = "github.com/terraform-google-modules/terraform-google-vm//modules/mig?ref=aea74d1"
+
+  project_id                       = var.project_id
+  region                           = var.region
+  distribution_policy_target_shape = var.distribution_policy_target_shape
+  distribution_policy_zones        = local.zones
+  target_size                      = local.host_count
+  hostname                         = local.name_prefix
+  instance_template                = module.access_point_instance_template.self_link

   health_check_name = "health-${local.name_prefix}"
   health_check = {
diff --git a/community/modules/scheduler/htcondor-access-point/variables.tf b/community/modules/scheduler/htcondor-access-point/variables.tf
index 3fae9b5373..2ba01eefbe 100644
--- a/community/modules/scheduler/htcondor-access-point/variables.tf
+++ b/community/modules/scheduler/htcondor-access-point/variables.tf
@@ -41,6 +41,12 @@ variable "zones" {
   nullable = false
 }

+variable "distribution_policy_target_shape" {
+  description = "Target shape across zones for instance group managing high availability of access point"
+  type        = string
+  default     = "BALANCED"
+}
+
 variable "network_self_link" {
   description = "The self link of the network in which the HTCondor central manager will be created."
   type = string
diff --git a/community/modules/scheduler/htcondor-central-manager/README.md b/community/modules/scheduler/htcondor-central-manager/README.md
index 4f96c10834..d138ed2b98 100644
--- a/community/modules/scheduler/htcondor-central-manager/README.md
+++ b/community/modules/scheduler/htcondor-central-manager/README.md
@@ -98,7 +98,7 @@ limitations under the License.
 | Name | Source | Version |
 |------|--------|---------|
 | [central\_manager\_instance\_template](#module\_central\_manager\_instance\_template) | github.com/terraform-google-modules/terraform-google-vm//modules/instance_template | 84d7959 |
-| [htcondor\_cm](#module\_htcondor\_cm) | github.com/terraform-google-modules/terraform-google-vm//modules/mig | 84d7959 |
+| [htcondor\_cm](#module\_htcondor\_cm) | github.com/terraform-google-modules/terraform-google-vm//modules/mig | aea74d1 |
 | [startup\_script](#module\_startup\_script) | github.com/GoogleCloudPlatform/hpc-toolkit//modules/scripts/startup-script | v1.20.0&depth=1 |

 ## Resources

@@ -120,6 +120,7 @@ limitations under the License.
 | [central\_manager\_service\_account\_email](#input\_central\_manager\_service\_account\_email) | Service account e-mail for central manager (can be supplied by htcondor-setup module) | `string` | n/a | yes |
 | [deployment\_name](#input\_deployment\_name) | HPC Toolkit deployment name. HTCondor cloud resource names will include this value. | `string` | n/a | yes |
 | [disk\_size\_gb](#input\_disk\_size\_gb) | Boot disk size in GB | `number` | `20` | no |
+| [distribution\_policy\_target\_shape](#input\_distribution\_policy\_target\_shape) | Target shape for instance group managing high availability of central manager | `string` | `"BALANCED"` | no |
 | [enable\_oslogin](#input\_enable\_oslogin) | Enable or Disable OS Login with "ENABLE" or "DISABLE". Set to "INHERIT" to inherit project OS Login setting. | `string` | `"ENABLE"` | no |
 | [htcondor\_bucket\_name](#input\_htcondor\_bucket\_name) | Name of HTCondor configuration bucket | `string` | n/a | yes |
 | [instance\_image](#input\_instance\_image) | Custom VM image with HTCondor installed using the htcondor-install module. | <pre>object({<br>  family = string,<br>  project = string<br>})</pre> | n/a | yes |
diff --git a/community/modules/scheduler/htcondor-central-manager/main.tf b/community/modules/scheduler/htcondor-central-manager/main.tf
index fce3ff2709..f9e07535be 100644
--- a/community/modules/scheduler/htcondor-central-manager/main.tf
+++ b/community/modules/scheduler/htcondor-central-manager/main.tf
@@ -123,14 +123,15 @@ module "central_manager_instance_template" {

 module "htcondor_cm" {
   # tflint-ignore: terraform_module_pinned_source
-  source = "github.com/terraform-google-modules/terraform-google-vm//modules/mig?ref=84d7959"
-
-  project_id                = var.project_id
-  region                    = var.region
-  distribution_policy_zones = local.zones
-  target_size               = 1
-  hostname                  = local.name_prefix
-  instance_template         = module.central_manager_instance_template.self_link
+  source = "github.com/terraform-google-modules/terraform-google-vm//modules/mig?ref=aea74d1"
+
+  project_id                       = var.project_id
+  region                           = var.region
+  distribution_policy_target_shape = var.distribution_policy_target_shape
+  distribution_policy_zones        = local.zones
+  target_size                      = 1
+  hostname                         = local.name_prefix
+  instance_template                = module.central_manager_instance_template.self_link

   health_check_name = "health-${local.name_prefix}"
   health_check = {
diff --git a/community/modules/scheduler/htcondor-central-manager/variables.tf b/community/modules/scheduler/htcondor-central-manager/variables.tf
index 1844c3d93c..8ea3f605fb 100644
--- a/community/modules/scheduler/htcondor-central-manager/variables.tf
+++ b/community/modules/scheduler/htcondor-central-manager/variables.tf
@@ -41,6 +41,12 @@ variable "zones" {
   nullable = false
 }

+variable "distribution_policy_target_shape" {
+  description = "Target shape for instance group managing high availability of central manager"
+  type        = string
+  default     = "BALANCED"
+}
+
 variable "network_self_link" {
   description = "The self link of the network in which the HTCondor central manager will be created."
   type = string

From a4243e9af4027ccdfc90a4177611acab581892b2 Mon Sep 17 00:00:00 2001
From: Rohit Ramu
Date: Wed, 2 Aug 2023 12:30:16 -0700
Subject: [PATCH 122/144] Fix "Apply" prompt so user input is separated (#1653)

---
 pkg/shell/common.go | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/pkg/shell/common.go b/pkg/shell/common.go
index e0cb2ce5ee..9023e0117a 100644
--- a/pkg/shell/common.go
+++ b/pkg/shell/common.go
@@ -99,11 +99,11 @@ func ApplyChangesChoice(c ProposedChanges) bool {
 	var userResponse string

 	for {
-		fmt.Print(`Please select an option below [d,a,s,c]:
-(D)isplay full proposed changes,
+		fmt.Print(`(D)isplay full proposed changes,
 (A)pply proposed changes,
 (S)top and exit,
-(C)ontinue without applying`)
+(C)ontinue without applying
+Please select an option [d,a,s,c]: `)

 		_, err := fmt.Scanln(&userResponse)
 		if err != nil {

From 0ccfd4ea47e7a7d2b8d01f5a4b8225e15bf41485 Mon Sep 17 00:00:00 2001
From: Rohit Ramu
Date: Wed, 2 Aug 2023 12:34:06 -0700
Subject: [PATCH 123/144] Add labels automatically to dependabot PRs (#1652)

---
 .github/dependabot.yml | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/.github/dependabot.yml b/.github/dependabot.yml
index 1f0fe23acf..45a0d213b3 100644
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@@ -18,6 +18,10 @@ version: 2
 updates:
 - package-ecosystem: gomod
   directory: /
+  labels:
+  - dependencies
+  - go
+  - chore
   schedule:
     interval: weekly
     day: monday
@@ -26,6 +30,10 @@ updates:
   target-branch: develop
 - package-ecosystem: pip
   directory: /community/front-end/ofe/
+  labels:
+  - dependencies
+  - python
+  - chore
   schedule:
     interval: monthly
     time: "03:00"

From d0f3010cd6c2f5137236df2b4a914d5ce50f7547 Mon Sep 17 00:00:00 2001
From: Nick Stroud
Date: Tue, 1 Aug 2023 23:48:58 -0700
Subject: [PATCH 124/144] Remove duplicated and outdated outputs from
 spack-setup

---
 community/modules/scripts/spack-setup/README.md  |  5 +--
 community/modules/scripts/spack-setup/outputs.tf | 41 ++++---------
 2 files changed, 8 insertions(+), 38 deletions(-)

diff --git a/community/modules/scripts/spack-setup/README.md b/community/modules/scripts/spack-setup/README.md
index eb3e38c044..3e7be19d53 100644
--- a/community/modules/scripts/spack-setup/README.md
+++ b/community/modules/scripts/spack-setup/README.md
@@ -284,10 +284,7 @@ limitations under the License.
 |------|-------------|
 | [controller\_startup\_script](#output\_controller\_startup\_script) | Path to the Spack installation script, duplicate for SLURM controller. |
 | [gcs\_bucket\_path](#output\_gcs\_bucket\_path) | Bucket containing the startup scripts for spack, to be reused by spack-execute module. |
-| [install\_spack\_deps\_runner](#output\_install\_spack\_deps\_runner) | Runner to install dependencies for spack using an ansible playbook. The<br>startup-script module will automatically handle installation of ansible.<br>- id: example-startup-script<br>  source: modules/scripts/startup-script<br>  settings:<br>    runners:<br>    - $(your-spack-id.install\_spack\_deps\_runner)<br>... |
-| [install\_spack\_runner](#output\_install\_spack\_runner) | Runner to install Spack using the startup-script module |
-| [setup\_spack\_runner](#output\_setup\_spack\_runner) | Adds Spack setup-env.sh script to /etc/profile.d so that it is called at shell startup. Among other things this adds Spack binary to user PATH. |
 | [spack\_path](#output\_spack\_path) | Path to the root of the spack installation |
-| [spack\_runner](#output\_spack\_runner) | Runner to install Spack using the startup-script module |
+| [spack\_runner](#output\_spack\_runner) | Runner to be used with startup-script module or passed to spack-execute module.<br>- installs Spack dependencies<br>- installs Spack<br>- generates profile.d script to enable access to Spack<br>This is safe to run in parallel by multiple machines. Use in place of deprecated `setup_spack_runner`. |
 | [startup\_script](#output\_startup\_script) | Path to the Spack installation script. |
diff --git a/community/modules/scripts/spack-setup/outputs.tf b/community/modules/scripts/spack-setup/outputs.tf
index ea68b9d37c..7c5b548c9c 100644
--- a/community/modules/scripts/spack-setup/outputs.tf
+++ b/community/modules/scripts/spack-setup/outputs.tf
@@ -24,46 +24,19 @@ output "controller_startup_script" {
   value = module.startup_script.startup_script
 }

-output "install_spack_deps_runner" {
-  description = <<-EOT
-  Runner to install dependencies for spack using an ansible playbook. The
-  startup-script module will automatically handle installation of ansible.
-  - id: example-startup-script
-    source: modules/scripts/startup-script
-    settings:
-      runners:
-      - $(your-spack-id.install_spack_deps_runner)
-  ...
-  EOT
-  value = local.install_spack_deps_runner
-}
-
-output "setup_spack_runner" {
-  description = "Adds Spack setup-env.sh script to /etc/profile.d so that it is called at shell startup. Among other things this adds Spack binary to user PATH."
-  value = {
-    "type" = "data"
-    "destination" = "/etc/profile.d/spack.sh"
-    "content" = <<-EOT
-      #!/bin/sh
-      if [ -f ${var.install_dir}/share/spack/setup-env.sh ]; then
-        . ${var.install_dir}/share/spack/setup-env.sh
-      fi
-    EOT
-  }
-}
-
-output "install_spack_runner" {
-  description = "Runner to install Spack using the startup-script module"
-  value = local.combined_runner
-}
-
 output "spack_path" {
   description = "Path to the root of the spack installation"
   value = var.install_dir
 }

 output "spack_runner" {
-  description = "Runner to install Spack using the startup-script module"
+  description = <<-EOT
+  Runner to be used with startup-script module or passed to spack-execute module.
+  - installs Spack dependencies
+  - installs Spack
+  - generates profile.d script to enable access to Spack
+  This is safe to run in parallel by multiple machines. Use in place of deprecated `setup_spack_runner`.
+  EOT
   value = local.combined_runner
 }

From 5f5d4b414517950a4c18ecc85ee911e68ae2095c Mon Sep 17 00:00:00 2001
From: Nick Stroud
Date: Wed, 2 Aug 2023 15:03:59 -0700
Subject: [PATCH 125/144] Update blueprints and documentation to reflect output
 changes

---
 community/examples/AMD/hpc-amd-slurm.yaml       |  2 +-
 .../modules/scripts/spack-setup/README.md       | 29 +++-------------
 examples/serverless-batch-mpi.yaml              |  8 +----
 3 files changed, 6 insertions(+), 33 deletions(-)

diff --git a/community/examples/AMD/hpc-amd-slurm.yaml b/community/examples/AMD/hpc-amd-slurm.yaml
index b63aed96fb..6abb946cec 100644
--- a/community/examples/AMD/hpc-amd-slurm.yaml
+++ b/community/examples/AMD/hpc-amd-slurm.yaml
@@ -132,7 +132,7 @@ deployment_groups:
     source: modules/scripts/startup-script
     settings:
       runners:
-      - $(spack-setup.setup_spack_runner)
+      - $(spack-setup.spack_runner)
       # the following installation of AOCC may be automated in the future
       # with a clear direction to the user to read the EULA at
       # https://developer.amd.com/aocc-compiler-eula/
diff --git a/community/modules/scripts/spack-setup/README.md b/community/modules/scripts/spack-setup/README.md
index 3e7be19d53..e1833a6baf 100644
--- a/community/modules/scripts/spack-setup/README.md
+++ b/community/modules/scripts/spack-setup/README.md
@@ -96,33 +96,12 @@ To see a full example of this module in use, see the [hpc-slurm-gromacs.yaml] ex

 [Spack installation] produces a setup script that adds `spack` to your `PATH` as
 well as some other command-line integration tools. This script can be found at
 `<install path>/share/spack/setup-env.sh`. This script will be automatically
-added to bash startup by the `spack_runner`. In the case that you are using
-Spack on a different machine than the one where Spack was installed, you can use
-the `setup_spack_runner` to make sure Spack is also available on that machine.
+added to bash startup by any machine that runs the `spack_runner`.

-[Spack installation]: https://spack-tutorial.readthedocs.io/en/latest/tutorial_basics.html#installing-spack
-
-### Example using `setup_spack_runner`
+If multiple machines need to use the same shared Spack installation, you can
+simply have each of them run the `spack_runner`.

-The following example assumes that a different machine is running
-`$(spack.install_spack_runner)` and the Slurm login node has access to the Spack
-install through a shared file system.
+[Spack installation]: https://spack-tutorial.readthedocs.io/en/latest/tutorial_basics.html#installing-spack

-```yaml
-  - id: spack
-    source: community/modules/scripts/spack-setup
-    ...
-
-  - id: spack-setup
-    source: modules/scripts/startup-script
-    settings:
-      runners:
-      - $(spack.setup_spack_runner)
-
-  - id: slurm_login
-    source: community/modules/scheduler/SchedMD-slurm-on-gcp-login-node
-    use: [spack-setup, ...]
-``` +[Spack installation]: https://spack-tutorial.readthedocs.io/en/latest/tutorial_basics.html#installing-spack ### Managing Spack Python dependencies diff --git a/examples/serverless-batch-mpi.yaml b/examples/serverless-batch-mpi.yaml index d11cf5ee02..9703b3daa2 100644 --- a/examples/serverless-batch-mpi.yaml +++ b/examples/serverless-batch-mpi.yaml @@ -153,13 +153,7 @@ deployment_groups: task_count: 2 mpi_mode: true - - id: login-spack-setup - source: modules/scripts/startup-script - settings: - runners: - - $(spack-setup.setup_spack_runner) - - id: batch-login source: modules/scheduler/batch-login-node - use: [login-spack-setup, batch-job] + use: [spack-setup, batch-job] outputs: [instructions] From 9ff32743933a299279e5a87e9bc07f243aec5844 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 2 Aug 2023 15:17:44 -0700 Subject: [PATCH 126/144] Bump google.golang.org/api from 0.132.0 to 0.134.0 (#1633) Bumps [google.golang.org/api](https://github.com/googleapis/google-api-go-client) from 0.132.0 to 0.134.0. - [Release notes](https://github.com/googleapis/google-api-go-client/releases) - [Changelog](https://github.com/googleapis/google-api-go-client/blob/main/CHANGES.md) - [Commits](https://github.com/googleapis/google-api-go-client/compare/v0.132.0...v0.134.0) --- updated-dependencies: - dependency-name: google.golang.org/api dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- go.mod | 4 ++-- go.sum | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/go.mod b/go.mod index 024acc1dd8..ce9c9684f0 100644 --- a/go.mod +++ b/go.mod @@ -26,14 +26,14 @@ require ( github.com/google/go-cmp v0.5.9 github.com/hashicorp/terraform-exec v0.18.1 github.com/zclconf/go-cty-debug v0.0.0-20191215020915-b22d67c1ba0b - google.golang.org/api v0.132.0 + google.golang.org/api v0.134.0 ) require ( github.com/googleapis/gax-go/v2 v2.12.0 // indirect github.com/hashicorp/terraform-json v0.15.0 // indirect google.golang.org/genproto/googleapis/api v0.0.0-20230706204954-ccb25ca9f130 // indirect - google.golang.org/genproto/googleapis/rpc v0.0.0-20230711160842-782d3b101e98 // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20230720185612-659f7aaaa771 // indirect ) require ( diff --git a/go.sum b/go.sum index 2abb7eeba9..af29886e2b 100644 --- a/go.sum +++ b/go.sum @@ -855,8 +855,8 @@ google.golang.org/api v0.96.0/go.mod h1:w7wJQLTM+wvQpNf5JyEcBoxK0RH7EDrh/L4qfsuJ google.golang.org/api v0.97.0/go.mod h1:w7wJQLTM+wvQpNf5JyEcBoxK0RH7EDrh/L4qfsuJ13s= google.golang.org/api v0.98.0/go.mod h1:w7wJQLTM+wvQpNf5JyEcBoxK0RH7EDrh/L4qfsuJ13s= google.golang.org/api v0.100.0/go.mod h1:ZE3Z2+ZOr87Rx7dqFsdRQkRBk36kDtp/h+QpHbB7a70= -google.golang.org/api v0.132.0 h1:8t2/+qZ26kAOGSmOiHwVycqVaDg7q3JDILrNi/Z6rvc= -google.golang.org/api v0.132.0/go.mod h1:AeTBC6GpJnJSRJjktDcPX0QwtS8pGYZOV6MSuSCusw0= +google.golang.org/api v0.134.0 h1:ktL4Goua+UBgoP1eL1/60LwZJqa1sIzkLmvoR3hR6Gw= +google.golang.org/api v0.134.0/go.mod h1:sjRL3UnjTx5UqNQS9EWr9N8p7xbHpy1k0XGRLCf3Spk= google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= google.golang.org/appengine v1.5.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= @@ -972,8 +972,8 @@ google.golang.org/genproto 
v0.0.0-20230706204954-ccb25ca9f130 h1:Au6te5hbKUV8pIY google.golang.org/genproto v0.0.0-20230706204954-ccb25ca9f130/go.mod h1:O9kGHb51iE/nOGvQaDUuadVYqovW56s5emA88lQnj6Y= google.golang.org/genproto/googleapis/api v0.0.0-20230706204954-ccb25ca9f130 h1:XVeBY8d/FaK4848myy41HBqnDwvxeV3zMZhwN1TvAMU= google.golang.org/genproto/googleapis/api v0.0.0-20230706204954-ccb25ca9f130/go.mod h1:mPBs5jNgx2GuQGvFwUvVKqtn6HsUw9nP64BedgvqEsQ= -google.golang.org/genproto/googleapis/rpc v0.0.0-20230711160842-782d3b101e98 h1:bVf09lpb+OJbByTj913DRJioFFAjf/ZGxEz7MajTp2U= -google.golang.org/genproto/googleapis/rpc v0.0.0-20230711160842-782d3b101e98/go.mod h1:TUfxEVdsvPg18p6AslUXFoLdpED4oBnGwyqk3dV1XzM= +google.golang.org/genproto/googleapis/rpc v0.0.0-20230720185612-659f7aaaa771 h1:Z8qdAF9GFsmcUuWQ5KVYIpP3PCKydn/YKORnghIalu4= +google.golang.org/genproto/googleapis/rpc v0.0.0-20230720185612-659f7aaaa771/go.mod h1:TUfxEVdsvPg18p6AslUXFoLdpED4oBnGwyqk3dV1XzM= google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c= google.golang.org/grpc v1.20.1/go.mod h1:10oTOabMzJvdu6/UiuZezV6QK5dSlG84ov/aaiqXj38= google.golang.org/grpc v1.21.1/go.mod h1:oYelfM1adQP15Ek0mdvEgi9Df8B9CZIaU1084ijfRaM= From 8db313892757ffe061fc71d106e1aecee81c755a Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 2 Aug 2023 19:44:26 -0700 Subject: [PATCH 127/144] Bump github.com/hashicorp/go-getter from 1.7.1 to 1.7.2 (#1634) Bumps [github.com/hashicorp/go-getter](https://github.com/hashicorp/go-getter) from 1.7.1 to 1.7.2. - [Release notes](https://github.com/hashicorp/go-getter/releases) - [Changelog](https://github.com/hashicorp/go-getter/blob/main/.goreleaser.yml) - [Commits](https://github.com/hashicorp/go-getter/compare/v1.7.1...v1.7.2) --- updated-dependencies: - dependency-name: github.com/hashicorp/go-getter dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- go.mod | 2 +- go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/go.mod b/go.mod index ce9c9684f0..656b2164ec 100644 --- a/go.mod +++ b/go.mod @@ -6,7 +6,7 @@ require ( cloud.google.com/go/compute v1.20.1 // indirect cloud.google.com/go/storage v1.30.1 // indirect github.com/go-git/go-git/v5 v5.8.0 - github.com/hashicorp/go-getter v1.7.1 + github.com/hashicorp/go-getter v1.7.2 github.com/hashicorp/hcl v1.0.0 // indirect github.com/hashicorp/hcl/v2 v2.17.0 github.com/hashicorp/terraform-config-inspect v0.0.0-20221020162138-81db043ad408 diff --git a/go.sum b/go.sum index af29886e2b..fe84aa8d95 100644 --- a/go.sum +++ b/go.sum @@ -364,8 +364,8 @@ github.com/googleapis/google-cloud-go-testing v0.0.0-20200911160855-bcd43fbb19e8 github.com/grpc-ecosystem/grpc-gateway v1.16.0/go.mod h1:BDjrQk3hbvj6Nolgz8mAMFbcEtjT1g+wF4CSlocrBnw= github.com/hashicorp/go-cleanhttp v0.5.2 h1:035FKYIWjmULyFRBKPs8TBQoi0x6d9G4xc9neXJWAZQ= github.com/hashicorp/go-cleanhttp v0.5.2/go.mod h1:kO/YDlP8L1346E6Sodw+PrpBSV4/SoxCXGY6BqNFT48= -github.com/hashicorp/go-getter v1.7.1 h1:SWiSWN/42qdpR0MdhaOc/bLR48PLuP1ZQtYLRlM69uY= -github.com/hashicorp/go-getter v1.7.1/go.mod h1:W7TalhMmbPmsSMdNjD0ZskARur/9GJ17cfHTRtXV744= +github.com/hashicorp/go-getter v1.7.2 h1:uJDtyXwEfalmp1PqdxuhZqrNkUyClZAhVeZYTArbqkg= +github.com/hashicorp/go-getter v1.7.2/go.mod h1:W7TalhMmbPmsSMdNjD0ZskARur/9GJ17cfHTRtXV744= github.com/hashicorp/go-safetemp v1.0.0 h1:2HR189eFNrjHQyENnQMMpCiBAsRxzbTMIgBhEyExpmo= github.com/hashicorp/go-safetemp v1.0.0/go.mod h1:oaerMy3BhqiTbVye6QuFhFtIceqFoDHxNAB65b+Rj1I= github.com/hashicorp/go-version v1.6.0 h1:feTTfFNnjP967rlCxM/I9g701jU+RN74YKx2mOkIeek= From 25afd62b1185f50a55839625fb5a03bc13c8e095 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Wed, 2 Aug 2023 21:03:38 -0700 Subject: [PATCH 128/144] Add H3 to disk_type validation in vm-instance and slurm modules (#1651) --- .../compute/schedmd-slurm-gcp-v5-node-group/outputs.tf | 8 ++++++-- modules/compute/vm-instance/main.tf | 8 ++++++-- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/outputs.tf b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/outputs.tf index ab17ac17d4..d289ee3554 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/outputs.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/outputs.tf @@ -19,7 +19,11 @@ output "node_groups" { value = local.node_group precondition { - condition = (substr(var.machine_type, 0, 3) != "c3-") || (var.disk_type != "pd-standard") - error_message = "A disk_type of pd-standard cannot be used with c3 machines." + condition = !contains([ + "c3-:pd-standard", + "h3-:pd-standard", + "h3-:pd-ssd", + ], "${substr(var.machine_type, 0, 3)}:${var.disk_type}") + error_message = "A disk_type=${var.disk_type} cannot be used with machine_type=${var.machine_type}." } } diff --git a/modules/compute/vm-instance/main.tf b/modules/compute/vm-instance/main.tf index eec033b2a8..f2765f67af 100644 --- a/modules/compute/vm-instance/main.tf +++ b/modules/compute/vm-instance/main.tf @@ -238,8 +238,12 @@ resource "google_compute_instance" "compute_vm" { error_message = "Exactly one of network_interfaces or network_self_link/subnetwork_self_link must be specified." 
}
    precondition {
-      condition     = (substr(var.machine_type, 0, 3) != "c3-") || (var.disk_type != "pd-standard")
-      error_message = "A disk_type of pd-standard cannot be used with c3 machines."
+      condition = !contains([
+        "c3-:pd-standard",
+        "h3-:pd-standard",
+        "h3-:pd-ssd",
+      ], "${substr(var.machine_type, 0, 3)}:${var.disk_type}")
+      error_message = "A disk_type=${var.disk_type} cannot be used with machine_type=${var.machine_type}."
    }
  }
}

From 93539eec9547090d50fca19f64810e6519c6f43a Mon Sep 17 00:00:00 2001
From: Eimantas Kazakevicius 
Date: Thu, 3 Aug 2023 13:02:36 +0100
Subject: [PATCH 129/144] OFE Backend: adding name validator for cluster
 partition

---
 community/front-end/ofe/website/ghpcfe/models.py | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/community/front-end/ofe/website/ghpcfe/models.py b/community/front-end/ofe/website/ghpcfe/models.py
index f31d7427eb..977136a1b8 100644
--- a/community/front-end/ofe/website/ghpcfe/models.py
+++ b/community/front-end/ofe/website/ghpcfe/models.py
@@ -26,7 +26,7 @@ from django.conf import settings
 from django.contrib.auth.models import AbstractUser
 from django.core.exceptions import ValidationError
-from django.core.validators import MinValueValidator, RegexValidator
+from django.core.validators import MinValueValidator, RegexValidator, MaxLengthValidator
 from django.db import models
 from django.db.models.signals import post_save
 from django.dispatch import receiver
@@ -855,7 +855,18 @@ class ComputeInstance(CloudResource):
 class ClusterPartition(models.Model):
     """Compute partition on a clustero"""

-    name = models.CharField(max_length=80, help_text="Partition name")
+    # Define the regex pattern validator
+    name_validator = RegexValidator(
+        regex=r"^[a-z](?:[a-z0-9]{0,6})$",
+        message="Name must start with a lowercase letter and can have up to 7 characters (lowercase letters or digits).",
+    )
+    # Define the max length validator
+    max_length_validator = MaxLengthValidator(7, "Name cannot exceed 7 characters.")
+    name = models.CharField(
+        max_length=7,
+        validators=[name_validator, max_length_validator],
+        help_text="Partition name must start with a lowercase letter and can have up to 7 characters (lowercase letters or digits).",
+    )
     cluster = models.ForeignKey(
         Cluster,
         related_name="partitions",

From 929db03b880b2721457c5b66d397ba81528b2cdc Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Thu, 3 Aug 2023 09:19:36 -0700
Subject: [PATCH 130/144] Bump github.com/go-git/go-git/v5 from 5.8.0 to 5.8.1
 (#1635)

Bumps [github.com/go-git/go-git/v5](https://github.com/go-git/go-git) from 5.8.0 to 5.8.1.
- [Release notes](https://github.com/go-git/go-git/releases)
- [Commits](https://github.com/go-git/go-git/compare/v5.8.0...v5.8.1)

---
updated-dependencies:
- dependency-name: github.com/go-git/go-git/v5
  dependency-type: direct:production
  update-type: version-update:semver-patch
...
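For reference, the change dependabot generated here is equivalent to the
following manual steps (a sketch only, assuming Go 1.18+ and a checkout of
the repository root):

```shell
# Bump the module and refresh go.sum
go get github.com/go-git/go-git/v5@v5.8.1
go mod tidy
```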
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- go.mod | 15 +++++++++------ go.sum | 35 ++++++++++++++++++++++------------- 2 files changed, 31 insertions(+), 19 deletions(-) diff --git a/go.mod b/go.mod index 656b2164ec..c99eee36a0 100644 --- a/go.mod +++ b/go.mod @@ -5,7 +5,7 @@ go 1.18 require ( cloud.google.com/go/compute v1.20.1 // indirect cloud.google.com/go/storage v1.30.1 // indirect - github.com/go-git/go-git/v5 v5.8.0 + github.com/go-git/go-git/v5 v5.8.1 github.com/hashicorp/go-getter v1.7.2 github.com/hashicorp/hcl v1.0.0 // indirect github.com/hashicorp/hcl/v2 v2.17.0 @@ -30,8 +30,12 @@ require ( ) require ( + dario.cat/mergo v1.0.0 // indirect github.com/googleapis/gax-go/v2 v2.12.0 // indirect github.com/hashicorp/terraform-json v0.15.0 // indirect + github.com/rogpeppe/go-internal v1.9.0 // indirect + golang.org/x/mod v0.9.0 // indirect + golang.org/x/tools v0.6.0 // indirect google.golang.org/genproto/googleapis/api v0.0.0-20230706204954-ccb25ca9f130 // indirect google.golang.org/genproto/googleapis/rpc v0.0.0-20230720185612-659f7aaaa771 // indirect ) @@ -40,8 +44,8 @@ require ( cloud.google.com/go v0.110.4 // indirect cloud.google.com/go/compute/metadata v0.2.3 // indirect cloud.google.com/go/iam v1.1.0 // indirect - github.com/Microsoft/go-winio v0.5.2 // indirect - github.com/ProtonMail/go-crypto v0.0.0-20230518184743-7afd39499903 // indirect + github.com/Microsoft/go-winio v0.6.1 // indirect + github.com/ProtonMail/go-crypto v0.0.0-20230717121422-5aa5874ade95 // indirect github.com/acomagu/bufpipe v1.0.4 // indirect github.com/agext/levenshtein v1.2.2 // indirect github.com/apparentlymart/go-textseg/v13 v13.0.0 // indirect @@ -59,20 +63,19 @@ require ( github.com/hashicorp/go-safetemp v1.0.0 // indirect github.com/hashicorp/go-version v1.6.0 // indirect github.com/hashicorp/hc-install v0.5.1 // indirect - github.com/imdario/mergo v0.3.15 // indirect github.com/inconshreveable/mousetrap v1.1.0 // indirect github.com/jbenet/go-context v0.0.0-20150711004518-d14ea06fba99 // indirect github.com/jmespath/go-jmespath v0.4.0 // indirect github.com/kevinburke/ssh_config v1.2.0 // indirect github.com/klauspost/compress v1.15.11 // indirect - github.com/kr/pretty v0.2.1 // indirect + github.com/kr/pretty v0.3.1 // indirect github.com/kr/text v0.2.0 // indirect github.com/mitchellh/go-homedir v1.1.0 // indirect github.com/mitchellh/go-testing-interface v1.14.1 // indirect github.com/mitchellh/go-wordwrap v1.0.0 // indirect github.com/pjbgf/sha1cd v0.3.0 // indirect github.com/sergi/go-diff v1.2.0 // indirect - github.com/skeema/knownhosts v1.1.1 // indirect + github.com/skeema/knownhosts v1.2.0 // indirect github.com/spf13/pflag v1.0.5 // indirect github.com/ulikunitz/xz v0.5.10 // indirect github.com/xanzy/ssh-agent v0.3.3 // indirect diff --git a/go.sum b/go.sum index fe84aa8d95..13854b262a 100644 --- a/go.sum +++ b/go.sum @@ -187,14 +187,17 @@ cloud.google.com/go/webrisk v1.4.0/go.mod h1:Hn8X6Zr+ziE2aNd8SliSDWpEnSS1u4R9+xX cloud.google.com/go/webrisk v1.5.0/go.mod h1:iPG6fr52Tv7sGk0H6qUFzmL3HHZev1htXuWDEEsqMTg= cloud.google.com/go/workflows v1.6.0/go.mod h1:6t9F5h/unJz41YqfBmqSASJSXccBLtD1Vwf+KmJENM0= cloud.google.com/go/workflows v1.7.0/go.mod h1:JhSrZuVZWuiDfKEFxU0/F1PQjmpnpcoISEXH2bcHC3M= +dario.cat/mergo v1.0.0 h1:AGCNq9Evsj31mOgNPcLyXc+4PNABt905YmuqPYYpBWk= +dario.cat/mergo v1.0.0/go.mod h1:uNxQE+84aUszobStD9th8a29P2fMDhsBdgRYvZOxGmk= dmitri.shuralyov.com/gpu/mtl 
v0.0.0-20190408044501-666a987793e9/go.mod h1:H6x//7gZCb22OMCxBHrMx7a5I7Hp++hsVxbQ4BYO7hU= github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym/WlBOVXweHU+Q+/VP0lqqI8lqeDx9IjBqo= -github.com/Microsoft/go-winio v0.5.2 h1:a9IhgEQBCUEk6QCdml9CiJGhAws+YwffDHEMp1VMrpA= github.com/Microsoft/go-winio v0.5.2/go.mod h1:WpS1mjBmmwHBEWmogvA2mj8546UReBk4v8QkMxJ6pZY= +github.com/Microsoft/go-winio v0.6.1 h1:9/kr64B9VUZrLm5YYwbGtUJnMgqWVOdUAXu6Migciow= +github.com/Microsoft/go-winio v0.6.1/go.mod h1:LRdKpFKfdobln8UmuiYcKPot9D2v6svN5+sAH+4kjUM= github.com/OneOfOne/xxhash v1.2.2/go.mod h1:HSdplMjZKSmBqAxg5vPj2TmRDmfkzw+cTzAElWljhcU= -github.com/ProtonMail/go-crypto v0.0.0-20230518184743-7afd39499903 h1:ZK3C5DtzV2nVAQTx5S5jQvMeDqWtD1By5mOoyY/xJek= -github.com/ProtonMail/go-crypto v0.0.0-20230518184743-7afd39499903/go.mod h1:8TI4H3IbrackdNgv+92dI+rhpCaLqM0IfpgCgenFvRE= +github.com/ProtonMail/go-crypto v0.0.0-20230717121422-5aa5874ade95 h1:KLq8BE0KwCL+mmXnjLWEAOYO+2l2AE4YMmqG1ZpZHBs= +github.com/ProtonMail/go-crypto v0.0.0-20230717121422-5aa5874ade95/go.mod h1:EjAoLdwvbIOoOQr3ihjnSoLZRtE8azugULFRteWMNc0= github.com/acomagu/bufpipe v1.0.4 h1:e3H4WUzM3npvo5uv95QuJM3cQspFNtFBzvJ2oNjKIDQ= github.com/acomagu/bufpipe v1.0.4/go.mod h1:mxdxdup/WdsKVreO5GpW4+M/1CE2sMG4jeGJ2sYmHc4= github.com/agext/levenshtein v1.2.2 h1:0S/Yg6LYmFJ5stwQeRp6EeOcCbj7xiqQSdNelsXvaqE= @@ -209,7 +212,7 @@ github.com/aws/aws-sdk-go v1.44.122 h1:p6mw01WBaNpbdP2xrisz5tIkcNwzj/HysobNoaAHj github.com/aws/aws-sdk-go v1.44.122/go.mod h1:y4AeaBuwd2Lk+GepC1E9v0qOiTws0MIWAX4oIKwKHZo= github.com/bgentry/go-netrc v0.0.0-20140422174119-9fd32a8b3d3d h1:xDfNPAt8lFiC1UJrqV3uuy861HCTo708pDMbjHHdCas= github.com/bgentry/go-netrc v0.0.0-20140422174119-9fd32a8b3d3d/go.mod h1:6QX/PXZ00z/TKoufEY6K/a0k6AhaJrQKdFe6OfVXsa4= -github.com/bwesterb/go-ristretto v1.2.0/go.mod h1:fUIoIZaG73pV5biE2Blr2xEzDoMj7NFEuV9ekS419A0= +github.com/bwesterb/go-ristretto v1.2.3/go.mod h1:fUIoIZaG73pV5biE2Blr2xEzDoMj7NFEuV9ekS419A0= github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU= github.com/cespare/xxhash v1.1.0/go.mod h1:XrSqR1VqqWfGrhpAt58auRo0WTKS1nRRg3ghfAqPWnc= github.com/cespare/xxhash/v2 v2.1.1/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= @@ -218,7 +221,6 @@ github.com/chzyer/logex v1.1.10/go.mod h1:+Ywpsq7O8HXn0nuIou7OrIPyXbp3wmkHB+jjWR github.com/chzyer/readline v0.0.0-20180603132655-2972be24d48e/go.mod h1:nSuG5e5PlCu98SY8svDHJxuZscDgtXS6KTTbou5AhLI= github.com/chzyer/test v0.0.0-20180213035817-a1ea475d72b1/go.mod h1:Q3SI9o4m/ZMnBNeIyt5eFwwo7qiLfzFZmjNmxjkiQlU= github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw= -github.com/cloudflare/circl v1.1.0/go.mod h1:prBCrKB9DV4poKZY1l9zBXg2QJY7mvgRvtMxxK7fi4I= github.com/cloudflare/circl v1.3.3 h1:fE/Qz0QdIGqeWfnwq0RE0R7MI51s0M2E4Ga9kq5AEMs= github.com/cloudflare/circl v1.3.3/go.mod h1:5XYMA4rFBvNIrhs50XuiBJ15vF2pZn4nnUKZrLbUZFA= github.com/cncf/udpa/go v0.0.0-20191209042840-269d4d468f6f/go.mod h1:M8M6+tZqaGXZJjfX53e64911xZQV5JYwmTeXPW+k8Sc= @@ -256,8 +258,8 @@ github.com/go-git/gcfg v1.5.1-0.20230307220236-3a3c6141e376/go.mod h1:an3vInlBmS github.com/go-git/go-billy/v5 v5.4.1 h1:Uwp5tDRkPr+l/TnbHOQzp+tmJfLceOlbVucgpTz8ix4= github.com/go-git/go-billy/v5 v5.4.1/go.mod h1:vjbugF6Fz7JIflbVpl1hJsGjSHNltrSw45YK/ukIvQg= github.com/go-git/go-git-fixtures/v4 v4.3.2-0.20230305113008-0c11038e723f 
h1:Pz0DHeFij3XFhoBRGUDPzSJ+w2UcK5/0JvF8DRI58r8= -github.com/go-git/go-git/v5 v5.8.0 h1:Rc543s6Tyq+YcyPwZRvU4jzZGM8rB/wWu94TnTIYALQ= -github.com/go-git/go-git/v5 v5.8.0/go.mod h1:coJHKEOk5kUClpsNlXrUvPrDxY3w3gjHvhcZd8Fodw8= +github.com/go-git/go-git/v5 v5.8.1 h1:Zo79E4p7TRk0xoRgMq0RShiTHGKcKI4+DI6BfJc/Q+A= +github.com/go-git/go-git/v5 v5.8.1/go.mod h1:FHFuoD6yGz5OSKEBK+aWN9Oah0q54Jxl0abmj6GnqAo= github.com/go-gl/glfw v0.0.0-20190409004039-e6da0acd62b1/go.mod h1:vR7hzQXu2zJy9AVAgeJqvqgH9Q5CA+iKCZ2gyEVpxRU= github.com/go-gl/glfw/v3.3/glfw v0.0.0-20191125211704-12ad95a8df72/go.mod h1:tQ2UAYgL5IevRw8kRxooKSPJfGvJ9fJQFa0TUsXzTg8= github.com/go-gl/glfw/v3.3/glfw v0.0.0-20200222043503-6f7a984d4dc4/go.mod h1:tQ2UAYgL5IevRw8kRxooKSPJfGvJ9fJQFa0TUsXzTg8= @@ -386,8 +388,6 @@ github.com/hashicorp/terraform-json v0.15.0 h1:/gIyNtR6SFw6h5yzlbDbACyGvIhKtQi8m github.com/hashicorp/terraform-json v0.15.0/go.mod h1:+L1RNzjDU5leLFZkHTFTbJXaoqUC6TqXlFgDoOXrtvk= github.com/ianlancetaylor/demangle v0.0.0-20181102032728-5e5cf60278f6/go.mod h1:aSSvb/t6k1mPoxDqO4vJh6VOCGPwU4O0C2/Eqndh1Sc= github.com/ianlancetaylor/demangle v0.0.0-20200824232613-28f6c0f3b639/go.mod h1:aSSvb/t6k1mPoxDqO4vJh6VOCGPwU4O0C2/Eqndh1Sc= -github.com/imdario/mergo v0.3.15 h1:M8XP7IuFNsqUx6VPK2P9OSmsYsI/YFaGil0uD21V3dM= -github.com/imdario/mergo v0.3.15/go.mod h1:WBLT9ZmE3lPoWsEzCh9LPo3TiwVN+ZKEjmz+hD27ysY= github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= github.com/jbenet/go-context v0.0.0-20150711004518-d14ea06fba99 h1:BQSFePA1RWJOlocH6Fxy8MmwDt+yVQYULKfN0RoTN8A= @@ -405,8 +405,9 @@ github.com/klauspost/compress v1.15.11 h1:Lcadnb3RKGin4FYM/orgq0qde+nc15E5Cbqg4B github.com/klauspost/compress v1.15.11/go.mod h1:QPwzmACJjUTFsnSHH934V6woptycfrDDJnH7hvFVbGM= github.com/kr/fs v0.1.0/go.mod h1:FFnZGqtBN9Gxj7eW1uZ42v5BccTP0vu6NEaFoC2HwRg= github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= -github.com/kr/pretty v0.2.1 h1:Fmg33tUaq4/8ym9TJN1x7sLJnHVwhP33CNkpYV/7rwI= github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= +github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= +github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= @@ -432,6 +433,7 @@ github.com/otiai10/copy v1.12.0/go.mod h1:rSaLseMUsZFFbsFGc7wCJnnkTAvdc5L6VWxPE4 github.com/otiai10/mint v1.5.1 h1:XaPLeE+9vGbuyEHem1JNk3bYc7KKqyI/na0/mLd/Kks= github.com/pjbgf/sha1cd v0.3.0 h1:4D5XXmUUBUl/xQ6IjCkEAbqXskkq/4O7LmGn0AqMDs4= github.com/pjbgf/sha1cd v0.3.0/go.mod h1:nZ1rrWOcGJ5uZgEEVL1VUM9iRQiZvWdbZjkKyFzPPsI= +github.com/pkg/diff v0.0.0-20210226163009-20ebb0f2a09e/go.mod h1:pJLUxLENpZxwdsKMEsNbx1VGcRFpLqf3715MtcvvzbA= github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pkg/sftp v1.13.1/go.mod h1:3HaPG6Dq1ILlpPZRO0HVMrsydcdLt6HRDccSgb87qRg= @@ -440,13 +442,15 @@ github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZN github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= github.com/rogpeppe/fastuuid 
v1.2.0/go.mod h1:jVj6XXZzXRy/MSR5jhDC/2q6DgLz+nrA6LYCDYWNEvQ= github.com/rogpeppe/go-internal v1.3.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4= +github.com/rogpeppe/go-internal v1.9.0 h1:73kH8U+JUqXU8lRuOHeVHaa/SZPifC7BkcraZVejAe8= +github.com/rogpeppe/go-internal v1.9.0/go.mod h1:WtVeX8xhTBvf0smdhujwtBcq4Qrzq/fJaraNFVN+nFs= github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= github.com/sebdah/goldie v1.0.0/go.mod h1:jXP4hmWywNEwZzhMuv2ccnqTSFpuq8iyQhtQdkkZBH4= github.com/sergi/go-diff v1.2.0 h1:XU+rvMAioB0UC3q1MFrIQy4Vo5/4VsRDQQXHsEya6xQ= github.com/sergi/go-diff v1.2.0/go.mod h1:STckp+ISIX8hZLjrqAeVduY0gWCT9IjLuqbuNXdaHfM= github.com/sirupsen/logrus v1.7.0/go.mod h1:yWOB1SBYBC5VeMP7gHvWumXLIWorT60ONWic61uBYv0= -github.com/skeema/knownhosts v1.1.1 h1:MTk78x9FPgDFVFkDLTrsnnfCJl7g1C/nnKvePgrIngE= -github.com/skeema/knownhosts v1.1.1/go.mod h1:g4fPeYpque7P0xefxtGzV81ihjC8sX2IqpAoNkjxbMo= +github.com/skeema/knownhosts v1.2.0 h1:h9r9cf0+u7wSE+M183ZtMGgOJKiL96brpaz5ekfJCpM= +github.com/skeema/knownhosts v1.2.0/go.mod h1:g4fPeYpque7P0xefxtGzV81ihjC8sX2IqpAoNkjxbMo= github.com/spaolacci/murmur3 v0.0.0-20180118202830-f09979ecbc72/go.mod h1:JwIasOWyU6f++ZhiEuf87xNszmSA2myDM2Kzu9HwQUA= github.com/spf13/afero v1.9.5 h1:stMpOSZFs//0Lv29HduCmli3GUfpFoF3Y1Q/aXj/wVM= github.com/spf13/afero v1.9.5/go.mod h1:UBogFpq8E9Hx+xc5CNTTEpTnuHVmXDwZcZcE1eb/UhQ= @@ -506,6 +510,7 @@ golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5y golang.org/x/crypto v0.0.0-20220314234659-1baeb1ce4c0b/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= golang.org/x/crypto v0.0.0-20220622213112-05595931fe9d/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= golang.org/x/crypto v0.0.0-20220722155217-630584e8d5aa/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= +golang.org/x/crypto v0.3.1-0.20221117191849-2c476679df9a/go.mod h1:hebNnKkNXi2UzZN1eVRvBB7co0a+JxK6XbPiWVs/3J4= golang.org/x/crypto v0.7.0/go.mod h1:pYwdfH91IfpZVANVyUOhSIPZaFoJGxTFbZhFTx+dXZU= golang.org/x/crypto v0.11.0 h1:6Ewdq3tDic1mg5xRO4milcWCfMVQhI4NkqWWvqejpuA= golang.org/x/crypto v0.11.0/go.mod h1:xgJhtzW8F9jGdVFWZESrid1U1bjeNy4zgy5cRr/CIio= @@ -549,6 +554,7 @@ golang.org/x/mod v0.4.2/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= golang.org/x/mod v0.9.0 h1:KENHtAZL2y3NLMYZeHY9DW8HW8V+kQyJsY/V9JlKvCs= +golang.org/x/mod v0.9.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180811021610-c39426892332/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= @@ -599,6 +605,7 @@ golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug golang.org/x/net v0.0.0-20220909164309-bea034e7d591/go.mod h1:YDH+HFinaLZZlnHAfSS6ZXJJ9M9t4Dl22yv3iI2vPwk= golang.org/x/net v0.0.0-20221014081412-f15817d10f9b/go.mod h1:YDH+HFinaLZZlnHAfSS6ZXJJ9M9t4Dl22yv3iI2vPwk= golang.org/x/net v0.1.0/go.mod h1:Cx3nUiGt4eDBEyega/BKRp+/AlGL8hYe7U9odMt2Cco= +golang.org/x/net v0.2.0/go.mod h1:KqCZLdyyvdV855qA2rE3GC2aiw5xGR5TEjj8smXukLY= golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= golang.org/x/net v0.8.0/go.mod 
h1:QVkue5JL9kW//ek3r6jTKnTFis1tRmNAW2P1shuFdJc= golang.org/x/net v0.12.0 h1:cfawfvKITfUsFCeJIHJrbSxpeu/E81khclypR0GVT50= @@ -696,7 +703,6 @@ golang.org/x/sys v0.0.0-20210630005230-0f9fa26af87c/go.mod h1:oPkhp1MJrh7nUepCBc golang.org/x/sys v0.0.0-20210806184541-e5e7981a1069/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20210823070655-63515b42dcdf/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20210908233432-aa78b53d3365/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.0.0-20211007075335-d3039528d8ac/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20211124211545-fe61309f8881/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20211210111614-af8b64212486/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20211216021012-1d35b9e2eb4e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= @@ -715,6 +721,7 @@ golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBc golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220728004956-3c1f35247d10/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.2.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.3.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= @@ -723,6 +730,7 @@ golang.org/x/sys v0.10.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= golang.org/x/term v0.1.0/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= +golang.org/x/term v0.2.0/go.mod h1:TVmDHMZPmdnySmBfhjOoOdhjzdE1h4u1VwSiw2l1Nuc= golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= golang.org/x/term v0.6.0/go.mod h1:m6U89DPEgQRMq3DNkDClhWw02AUbt2daBVO4cn4Hv9U= golang.org/x/term v0.10.0 h1:3R7pNqamzBraeqj/Tj8qt1aQ2HpmlC+Cx/qL/7hn4/c= @@ -797,6 +805,7 @@ golang.org/x/tools v0.1.3/go.mod h1:o0xws9oXOQQZyjljx8fwUC0k7L1pTE6eaCbjGeHmOkk= golang.org/x/tools v0.1.4/go.mod h1:o0xws9oXOQQZyjljx8fwUC0k7L1pTE6eaCbjGeHmOkk= golang.org/x/tools v0.1.5/go.mod h1:o0xws9oXOQQZyjljx8fwUC0k7L1pTE6eaCbjGeHmOkk= golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= +golang.org/x/tools v0.6.0 h1:BOw41kyTf3PuCW1pVQf8+Cyg8pMlkYB1oo9iJ6D/lKM= golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= From 41db9549a4c94b0ebeddf9a0e65c9e5035d6024a Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Thu, 3 Aug 2023 11:37:52 -0700 Subject: [PATCH 131/144] Add H3 to `examples/hpc-enterprise-slurm` (#1657) --- examples/README.md | 4 ++++ examples/hpc-enterprise-slurm.yaml | 38 ++++++++++++++++++++++++++++-- 2 files changed, 40 insertions(+), 2 deletions(-) diff --git a/examples/README.md b/examples/README.md index 1430194143..d8db0f18c1 
100644 --- a/examples/README.md +++ b/examples/README.md @@ -179,6 +179,8 @@ generation AMD EPYC Milan. * `c3` with compute-optimized [`c3-highcpu-176` nodes][c3] based on Intel Sapphire Rapids processors. When configured with Tier_1 networking, C3 nodes feature 200 Gbps low-latency networking. +* `h3` with compute-optimized [`h3-standard-88` nodes][h3] based on Intel Sapphire +Rapids processors. H3 VMs can use the entire host network bandwidth and come with a default network bandwidth rate of up to 200 Gbps. * `a208` with [`a2-ultragpu-8g` nodes][a2] with 8 of the NVIDIA A100 GPU accelerators with 80GB of GPU memory each. * `a216` with [`a2-megagpu-16g` nodes][a2] with 16 of the NVIDIA A100 GPU accelerators @@ -196,6 +198,7 @@ are configured with: [c2]: https://cloud.google.com/compute/docs/compute-optimized-machines#c2_machine_types [c2d]: https://cloud.google.com/compute/docs/compute-optimized-machines#c2d_machine_types [c3]: https://cloud.google.com/blog/products/compute/introducing-c3-machines-with-googles-custom-intel-ipu +[h3]: https://cloud.google.com/compute/docs/compute-optimized-machines#h3_series [a2]: https://cloud.google.com/compute/docs/gpus#a100-gpus [g2]: https://cloud.google.com/compute/docs/gpus#l4-gpus [compact placement]: https://cloud.google.com/compute/docs/instances/define-instance-placement @@ -234,6 +237,7 @@ For this example the following is needed in the selected region: in `c2` partition up to 1,204 * Compute Engine API: C2D CPUs: **112/node** active in `c2d` partition up to 2,240 * Compute Engine API: C3 CPUs: **176/node** active in `c3` partition up to 3,520 +* Compute Engine API: H3 CPUs: **88/node** active in `h3` partition up to 1,408 * Compute Engine API: A2 CPUs: **96/node** active in `a208` and `a216` partitions up to 3,072 * Compute Engine API: NVIDIA A100 80GB GPUs: **8/node** active in `a208` partition diff --git a/examples/hpc-enterprise-slurm.yaml b/examples/hpc-enterprise-slurm.yaml index 35ed922618..a4c0f80da4 100644 --- a/examples/hpc-enterprise-slurm.yaml +++ b/examples/hpc-enterprise-slurm.yaml @@ -244,7 +244,7 @@ deployment_groups: scopes: - https://www.googleapis.com/auth/cloud-platform - # use `-p a208` to submit jobs to this partition: + # use `-p a216` to submit jobs to this partition: # ex: `srun -p a216 --gpus-per-node=16 -N 1 nvidia-smi` - id: a2_16_partition source: community/modules/compute/schedmd-slurm-gcp-v5-partition @@ -255,10 +255,44 @@ deployment_groups: # https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/develop/community/modules/compute/schedmd-slurm-gcp-v5-partition#compute-vm-zone-policies zones: $(vars.gpu_zones) + - id: h3_node_group + source: community/modules/compute/schedmd-slurm-gcp-v5-node-group + settings: + node_count_dynamic_max: 16 + machine_type: h3-standard-88 + bandwidth_tier: gvnic_enabled # https://cloud.google.com/compute/docs/compute-optimized-machines#h3_network + instance_image: + family: $(vars.family) + project: $(vars.project) + service_account: + email: $(compute_sa.service_account_email) + scopes: + - https://www.googleapis.com/auth/cloud-platform + # H3 does not support pd-ssd and pd-standard + # https://cloud.google.com/compute/docs/compute-optimized-machines#h3_disks + disk_type: pd-balanced + disk_size_gb: 100 + node_conf: # one has to specify number of CPUs and sockets for H3 machine type + CPUs: 88 + CoresPerSocket: 44 + Sockets: 2 + ThreadsPerCore: 1 + + # use `-p h3` to submit jobs to this partition: + # ex: `srun -p h3 -N 1 hostname` + - id: h3_partition + source: 
community/modules/compute/schedmd-slurm-gcp-v5-partition + use: [h3_node_group, network1, homefs, projectsfs, scratchfs] + settings: + partition_name: h3 + partition_conf: # one has to specify mem per CPU for H3 machine type + DefMemPerCPU: 3971 + - id: slurm_controller source: community/modules/scheduler/schedmd-slurm-gcp-v5-controller use: [network1, homefs, projectsfs, scratchfs, n2_partition, - c2_partition, c2d_partition, c3_partition, a2_8_partition, a2_16_partition] + c2_partition, c2d_partition, c3_partition, a2_8_partition, a2_16_partition, + h3_partition] settings: instance_image: family: $(vars.family) From 446c57e502d00a007943cd2d54ad0d28b320bd55 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 3 Aug 2023 12:49:34 -0700 Subject: [PATCH 132/144] Bump cryptography from 41.0.1 to 41.0.3 in /community/front-end/ofe (#1655) Bumps [cryptography](https://github.com/pyca/cryptography) from 41.0.1 to 41.0.3. - [Changelog](https://github.com/pyca/cryptography/blob/main/CHANGELOG.rst) - [Commits](https://github.com/pyca/cryptography/compare/41.0.1...41.0.3) --- updated-dependencies: - dependency-name: cryptography dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Rohit Ramu --- community/front-end/ofe/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/community/front-end/ofe/requirements.txt b/community/front-end/ofe/requirements.txt index 777494c01d..42f400bf52 100644 --- a/community/front-end/ofe/requirements.txt +++ b/community/front-end/ofe/requirements.txt @@ -9,7 +9,7 @@ cffi==1.15.1 cfgv==3.3.1 charset-normalizer==3.1.0 click==8.1.3 -cryptography==41.0.2 +cryptography==41.0.3 decorator==5.1.1 defusedxml==0.7.1 dill==0.3.6 From 64cf615d63032c67d290346aa6b78042a5f99bfa Mon Sep 17 00:00:00 2001 From: harshthakkar-google Date: Thu, 3 Aug 2023 21:18:43 +0000 Subject: [PATCH 133/144] Remove basic tutorial for HPC toolkit cluster deployment --- docs/tutorials/README.md | 14 --- docs/tutorials/basic.md | 219 --------------------------------------- 2 files changed, 233 deletions(-) delete mode 100644 docs/tutorials/basic.md diff --git a/docs/tutorials/README.md b/docs/tutorials/README.md index c481026501..8b7cf5d34d 100644 --- a/docs/tutorials/README.md +++ b/docs/tutorials/README.md @@ -5,20 +5,6 @@ Find the quickstart tutorial on [Google Cloud docs](https://cloud.google.com/hpc-toolkit/docs/quickstarts/slurm-cluster). -## Simple Cluster Tutorial - -Deploy a simple HPC cluster with the HPC Toolkit in -[cloud shell](https://cloud.google.com/shell) using the -[hpc-slurm.yaml](../../examples/hpc-slurm.yaml) example. - -It is recommended to use the [Quickstart Tutorial](#quickstart-tutorial), which -covers similar material as the Simple Cluster Tutorial and will be replacing -this tutorial in the future. - -Click the button below to launch the Simple Cluster Tutorial. 
- -[![Open in Cloud Shell](https://gstatic.com/cloudssh/images/open-btn.svg)](https://shell.cloud.google.com/cloudshell/editor?cloudshell_git_repo=https%3A%2F%2Fgithub.com%2FGoogleCloudPlatform%2Fhpc-toolkit&cloudshell_open_in_editor=examples%2Fhpc-slurm.yaml&cloudshell_tutorial=docs%2Ftutorials%2Fbasic.md) - ## Intel Select Tutorial Walks through deploying an HPC cluster that is based on the diff --git a/docs/tutorials/basic.md b/docs/tutorials/basic.md deleted file mode 100644 index 440f04148f..0000000000 --- a/docs/tutorials/basic.md +++ /dev/null @@ -1,219 +0,0 @@ -# HPC Toolkit Basic Cluster Deployment - -HPC Toolkit is an open-source software offered by Google Cloud which makes it -easy for customers to deploy HPC environments on Google Cloud. - -This tutorial will walk you through deploying a simple HPC cluster on Google -Cloud using the HPC Toolkit. - -## Select a Project - -Select a project in which to deploy an HPC cluster on Google. - - - -Once you have selected a project, click START. - -## Add Credits to the Project - -Talk with your tutorial leader to see if Google Cloud credits are available. - -## Enable APIs & Permissions - -In a new Google Cloud project there are several apis that must be enabled to -deploy your HPC cluster. These will be caught when you perform `terraform apply` -but you can save time by enabling them now by running: - - - -We also need to grant the default compute service account project edit access so -the slurm controller can perform actions such as auto-scaling. - - - -```bash -PROJECT_NUMBER=$(gcloud projects describe --format='value(projectNumber)') - -echo "granting roles/editor to $PROJECT_NUMBER-compute@developer.gserviceaccount.com" - -gcloud projects add-iam-policy-binding --member=serviceAccount:"$PROJECT_NUMBER"-compute@developer.gserviceaccount.com --role=roles/editor -``` - -## Build the Toolkit Binary - -To build HPC Toolkit binary from source run: - -```bash -make -``` - -You should now have a binary named ghpc in the current directory. To verify the -build run: - -```bash -./ghpc --version -``` - -This should show you the version of the HPC Toolkit you are using. - -(Optional) To install the `ghpc` binary in your home directory under bin, -run the following command: - -```bash -make install -exec $SHELL -l -``` - -## Generate a Deployment - -To create a deployment, an input blueprint file needs to be written or adapted -from one of the examples found in the `examples/` or `community/examples` -directories. - -This tutorial will use `examples/hpc-slurm.yaml`, which is a good starting -point and creates a deployment containing: - -* a new network -* a filestore instance -* a Slurm login node -* a Slurm controller -* several auto-scaling Slurm partitions - -The blueprint `examples/hpc-slurm.yaml` should be open in the Cloud Shell -Editor (on the left). - -This file describes the cluster you will deploy. After you have inspected the -file, use the ghpc binary to create a deployment directory by running: - -```bash -./ghpc create examples/hpc-slurm.yaml --vars "project_id=" -``` - -> **_NOTE:_** The `--vars` argument is used to override `project_id` in the -> blueprint variables. The `--vars` argument supports comma-separated list of -> name=value variables to override blueprint variables. This feature only -> supports variables of string type. - -This will create a deployment directory named `hpc-small/`, which -contains the terraform needed to deploy your cluster. 
- -## Deploy the Cluster - -Use the following commands to run terraform and deploy your cluster. - -```bash -terraform -chdir=hpc-small/primary init -terraform -chdir=hpc-small/primary apply -``` - -The `terraform apply` command will generate a _plan_ that describes the Google -Cloud resources that will be deployed. - -You can review the plan and then start the deployment by typing -**`yes [enter]`**. - -The deployment will take about 5 minutes. There should be regular status updates -in the terminal. - -If the `apply` is successful, a message similar to the following will be -displayed: - - - - -```shell -Apply complete! Resources: xx added, 0 changed, 0 destroyed. -``` - -> **_NOTE:_** This example does not contain any Packer-based modules but for -> completeness, you can use the following command to deploy a Packer-based -> deployment group: -> -> ```shell -> cd // -> packer init . -> packer validate . -> packer build . -> ``` - -## Run a Job on the Cluster - -Once the cluster has successfully been deployed, take the following steps to -run a job: - -1. Open the following URL in a new tab. This will take you to `Compute Engine` > - `VM instances` in the Google Cloud Console: - - - - ```text - https://console.cloud.google.com/compute?project= - ``` - - - - -1. Click on the `SSH` button associated with the `slurm-hpc-small-login0` - instance. - - This will open a separate pop up window with a terminal into our newly created - Slurm login VM. - - > **_NOTE:_** If you see a message saying: - > _`Slurm is currently being configured in the background`_, then re-launch - > the pop up after a minute. This gives time for Slurm to become ready. - -1. Next you will run the `hostname` command across 3 nodes. Do this by running - the following command in the shell popup: - - ```shell - srun -N 3 hostname - ``` - -This may take a minute while Slurm auto-scales to create the nodes. If you are -curious you can refresh the `Compute Engine` > `VM instances` page and see that -additional VMs have been created. - -When the job finishes you should see an output similar to: - -```shell -$ srun -N 3 hostname - slurm-hpc-small-compute-0-0 - slurm-hpc-small-compute-0-1 - slurm-hpc-small-compute-0-2 -``` - -By default, this runs the job on the `debug` partition. See details in -[examples/](examples/README.md#compute-partition) for how to run on the more -performant `compute` partition. - -Running the same job again will run much faster as Slurm will reuse the nodes. - -The auto-scaled nodes will be automatically destroyed by the Slurm controller if -left idle for several minutes. - -> **_NOTE:_** If the Slurm controller is shut down before the auto-scale nodes -> are destroyed then they will be left running. - -## Destroy the Cluster - -To avoid incurring ongoing charges we will want to destroy our cluster. Run the -following command in the cloud shell terminal (not in the pop-up): - -```bash -terraform -chdir=hpc-small/primary destroy -auto-approve -``` - -When complete you should see something like: - -```shell -Destroy complete! Resources: xx destroyed. -``` - -> **_NOTE:_** If destroy is run before Slurm shut down the auto-scale nodes then -> they will be left behind and destroy may fail. In this case you can delete the -> VMs manually and rerun the destroy command above. 
- -## Tutorial Complete - - From c8d17d491974cde2f5d52e52c4959a008e373614 Mon Sep 17 00:00:00 2001 From: Rohit Ramu Date: Fri, 4 Aug 2023 09:45:25 -0700 Subject: [PATCH 134/144] Update DDN EXAScaler image to 6.2 (#1656) --- community/modules/file-system/DDN-EXAScaler/README.md | 2 +- community/modules/file-system/DDN-EXAScaler/variables.tf | 2 +- docs/vm-images.md | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/community/modules/file-system/DDN-EXAScaler/README.md b/community/modules/file-system/DDN-EXAScaler/README.md index efb04846c7..b4ca5623a8 100644 --- a/community/modules/file-system/DDN-EXAScaler/README.md +++ b/community/modules/file-system/DDN-EXAScaler/README.md @@ -116,7 +116,7 @@ No resources. | [cls](#input\_cls) | Compute client properties |
<pre>object({<br>  node_type = string<br>  node_cpu = string<br>  nic_type = string<br>  node_count = number<br>  public_ip = bool<br>})</pre> | <pre>{<br>  "nic_type": "GVNIC",<br>  "node_count": 0,<br>  "node_cpu": "Intel Cascade Lake",<br>  "node_type": "n2-standard-2",<br>  "public_ip": true<br>}</pre> | no |
| [clt](#input\_clt) | Compute client target properties | <pre>object({<br>  disk_bus = string<br>  disk_type = string<br>  disk_size = number<br>  disk_count = number<br>})</pre> | <pre>{<br>  "disk_bus": "SCSI",<br>  "disk_count": 0,<br>  "disk_size": 256,<br>  "disk_type": "pd-standard"<br>}</pre> | no |
| [fsname](#input\_fsname) | EXAScaler filesystem name, only alphanumeric characters are allowed, and the value must be 1-8 characters long | `string` | `"exacloud"` | no |
-| [image](#input\_image) | Source image properties | `any` | <pre>{<br>  "family": "exascaler-cloud-6-1-centos",<br>  "project": "ddn-public"<br>}</pre> | no |
+| [image](#input\_image) | Source image properties | `any` | <pre>{<br>  "family": "exascaler-cloud-6-2-rocky-linux-8-optimized-gcp",<br>  "project": "ddn-public"<br>}</pre> | no |
| [labels](#input\_labels) | Labels to add to EXAScaler Cloud deployment. Key-value pairs. | `map(string)` | `{}` | no |
| [local\_mount](#input\_local\_mount) | Mountpoint (at the client instances) for this EXAScaler system | `string` | `"/shared"` | no |
| [mds](#input\_mds) | Metadata server properties | <pre>object({<br>  node_type = string<br>  node_cpu = string<br>  nic_type = string<br>  node_count = number<br>  public_ip = bool<br>})</pre> | <pre>{<br>  "nic_type": "GVNIC",<br>  "node_count": 1,<br>  "node_cpu": "Intel Cascade Lake",<br>  "node_type": "n2-standard-32",<br>  "public_ip": true<br>}</pre>
| no | diff --git a/community/modules/file-system/DDN-EXAScaler/variables.tf b/community/modules/file-system/DDN-EXAScaler/variables.tf index d5c937e008..9a0bc7cd3f 100644 --- a/community/modules/file-system/DDN-EXAScaler/variables.tf +++ b/community/modules/file-system/DDN-EXAScaler/variables.tf @@ -212,7 +212,7 @@ variable "image" { # }) default = { project = "ddn-public" - family = "exascaler-cloud-6-1-centos" + family = "exascaler-cloud-6-2-rocky-linux-8-optimized-gcp" } validation { condition = lookup(var.image, "name", null) == null && lookup(var.image, "project", null) != null && lookup(var.image, "family", null) != null diff --git a/docs/vm-images.md b/docs/vm-images.md index 5f6824f107..cb8cd5dae0 100644 --- a/docs/vm-images.md +++ b/docs/vm-images.md @@ -129,7 +129,7 @@ a description of our support for Windows images. ✓ - + ✓ ✓ From ff4e7b82ad70a0835dfd096b27ff5fd9dabaa698 Mon Sep 17 00:00:00 2001 From: Skyler Malinowski Date: Thu, 3 Aug 2023 12:28:19 -0400 Subject: [PATCH 135/144] Bump slurm-gcp to 5.7.5 (from 5.7.4) --- .../variables.tf | 2 +- .../README.md | 2 +- .../main.tf | 2 +- .../variables.tf | 2 +- .../schedmd-slurm-gcp-v5-partition/README.md | 2 +- .../schedmd-slurm-gcp-v5-partition/main.tf | 2 +- .../variables.tf | 2 +- .../schedmd-slurm-gcp-v5-controller/README.md | 18 +++++++++--------- .../schedmd-slurm-gcp-v5-controller/main.tf | 4 ++-- .../variables.tf | 2 +- .../schedmd-slurm-gcp-v5-hybrid/README.md | 14 +++++++------- .../schedmd-slurm-gcp-v5-hybrid/main.tf | 2 +- .../schedmd-slurm-gcp-v5-login/README.md | 14 +++++++------- .../schedmd-slurm-gcp-v5-login/main.tf | 4 ++-- .../schedmd-slurm-gcp-v5-login/variables.tf | 2 +- .../demo-with-cloud-controller-instructions.md | 2 +- .../deploy-instructions.md | 4 ++-- .../on-prem-instructions.md | 16 ++++++++-------- docs/image-building.md | 2 +- examples/README.md | 4 ++-- tools/cloud-build/Dockerfile | 2 +- 21 files changed, 52 insertions(+), 52 deletions(-) diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/variables.tf index 3070010996..0e93805a3b 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/variables.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/variables.tf @@ -15,7 +15,7 @@ */ # Most variables have been sourced and modified from the SchedMD/slurm-gcp -# github repository: https://github.com/SchedMD/slurm-gcp/tree/5.7.4 +# github repository: https://github.com/SchedMD/slurm-gcp/tree/5.7.5 variable "project_id" { description = "Project in which the HPC deployment will be created." diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/README.md b/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/README.md index cad8fc0697..92ceb8e686 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/README.md @@ -69,7 +69,7 @@ No providers. 
| Name | Source | Version | |------|--------|---------| -| [slurm\_partition](#module\_slurm\_partition) | github.com/SchedMD/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition | 5.7.4 | +| [slurm\_partition](#module\_slurm\_partition) | github.com/SchedMD/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition | 5.7.5 | ## Resources diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/main.tf b/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/main.tf index 554a4da771..889cfb4899 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/main.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/main.tf @@ -29,7 +29,7 @@ locals { } module "slurm_partition" { - source = "github.com/SchedMD/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition?ref=5.7.4" + source = "github.com/SchedMD/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition?ref=5.7.5" slurm_cluster_name = local.slurm_cluster_name enable_job_exclusive = var.exclusive diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/variables.tf index cfe375d34f..0733a68d96 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/variables.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-partition-dynamic/variables.tf @@ -15,7 +15,7 @@ */ # Most variables have been sourced and modified from the SchedMD/slurm-gcp -# github repository: https://github.com/SchedMD/slurm-gcp/tree/5.7.4 +# github repository: https://github.com/SchedMD/slurm-gcp/tree/5.7.5 variable "deployment_name" { description = "Name of the deployment." diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-partition/README.md b/community/modules/compute/schedmd-slurm-gcp-v5-partition/README.md index 7c91f2d941..c952ee209e 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-partition/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v5-partition/README.md @@ -146,7 +146,7 @@ limitations under the License. 
| Name | Source | Version | |------|--------|---------| -| [slurm\_partition](#module\_slurm\_partition) | github.com/SchedMD/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition | 5.7.4 | +| [slurm\_partition](#module\_slurm\_partition) | github.com/SchedMD/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition | 5.7.5 | ## Resources diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-partition/main.tf b/community/modules/compute/schedmd-slurm-gcp-v5-partition/main.tf index 4e60ba24d2..e3a2179ba6 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-partition/main.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-partition/main.tf @@ -38,7 +38,7 @@ data "google_compute_zones" "available" { } module "slurm_partition" { - source = "github.com/SchedMD/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition?ref=5.7.4" + source = "github.com/SchedMD/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_partition?ref=5.7.5" slurm_cluster_name = local.slurm_cluster_name partition_nodes = var.node_groups diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-partition/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v5-partition/variables.tf index 8c8fadbe92..628d2e7f79 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-partition/variables.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-partition/variables.tf @@ -15,7 +15,7 @@ */ # Most variables have been sourced and modified from the SchedMD/slurm-gcp -# github repository: https://github.com/SchedMD/slurm-gcp/tree/5.7.4 +# github repository: https://github.com/SchedMD/slurm-gcp/tree/5.7.5 variable "deployment_name" { description = "Name of the deployment." diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md index d6387d7ef1..0b06026a76 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md @@ -17,14 +17,14 @@ controller for optimal performance at different scales. > > ```shell > # Install Python3 and run -> pip3 install -r https://raw.githubusercontent.com/SchedMD/slurm-gcp/5.7.4/scripts/requirements.txt +> pip3 install -r https://raw.githubusercontent.com/SchedMD/slurm-gcp/5.7.5/scripts/requirements.txt > ``` -[SchedMD/slurm-gcp]: https://github.com/SchedMD/slurm-gcp/tree/5.7.4 -[slurm\_controller\_instance]: https://github.com/SchedMD/slurm-gcp/tree/5.7.4/terraform/slurm_cluster/modules/slurm_controller_instance -[slurm\_instance\_template]: https://github.com/SchedMD/slurm-gcp/tree/5.7.4/terraform/slurm_cluster/modules/slurm_instance_template +[SchedMD/slurm-gcp]: https://github.com/SchedMD/slurm-gcp/tree/5.7.5 +[slurm\_controller\_instance]: https://github.com/SchedMD/slurm-gcp/tree/5.7.5/terraform/slurm_cluster/modules/slurm_controller_instance +[slurm\_instance\_template]: https://github.com/SchedMD/slurm-gcp/tree/5.7.5/terraform/slurm_cluster/modules/slurm_instance_template [slurm-ug]: https://goo.gle/slurm-gcp-user-guide. 
-[requirements.txt]: https://github.com/SchedMD/slurm-gcp/blob/5.7.4/scripts/requirements.txt +[requirements.txt]: https://github.com/SchedMD/slurm-gcp/blob/5.7.5/scripts/requirements.txt [enable\_cleanup\_compute]: #input\_enable\_cleanup\_compute [enable\_cleanup\_subscriptions]: #input\_enable\_cleanup\_subscriptions [enable\_reconfigure]: #input\_enable\_reconfigure @@ -94,12 +94,12 @@ This option has some additional requirements: development environment deploying the cluster. One can use following commands: ```bash - pip3 install -r https://raw.githubusercontent.com/SchedMD/slurm-gcp/5.7.4/scripts/requirements.txt + pip3 install -r https://raw.githubusercontent.com/SchedMD/slurm-gcp/5.7.5/scripts/requirements.txt ``` For more information, see the [description][optdeps] of this module. -[optdeps]: https://github.com/SchedMD/slurm-gcp/tree/5.7.4/terraform/slurm_cluster#optional +[optdeps]: https://github.com/SchedMD/slurm-gcp/tree/5.7.5/terraform/slurm_cluster#optional ## Custom Images @@ -163,8 +163,8 @@ limitations under the License. | Name | Source | Version | |------|--------|---------| -| [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/SchedMD/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_controller_instance | 5.7.4 | -| [slurm\_controller\_template](#module\_slurm\_controller\_template) | github.com/SchedMD/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 5.7.4 | +| [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/SchedMD/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_controller_instance | 5.7.5 | +| [slurm\_controller\_template](#module\_slurm\_controller\_template) | github.com/SchedMD/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 5.7.5 | ## Resources diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/main.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/main.tf index 5626dd6523..ce5b4e6af8 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/main.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/main.tf @@ -54,7 +54,7 @@ data "google_compute_default_service_account" "default" { } module "slurm_controller_instance" { - source = "github.com/SchedMD/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_controller_instance?ref=5.7.4" + source = "github.com/SchedMD/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_controller_instance?ref=5.7.5" access_config = local.access_config slurm_cluster_name = local.slurm_cluster_name @@ -90,7 +90,7 @@ module "slurm_controller_instance" { } module "slurm_controller_template" { - source = "github.com/SchedMD/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=5.7.4" + source = "github.com/SchedMD/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=5.7.5" additional_disks = local.additional_disks can_ip_forward = var.can_ip_forward diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/variables.tf index 8a7e93468c..9ee056ea84 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/variables.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/variables.tf @@ -15,7 +15,7 @@ */ # Most variables have been sourced and modified from the SchedMD/slurm-gcp -# github repository: https://github.com/SchedMD/slurm-gcp/tree/5.7.4 +# github repository: 
https://github.com/SchedMD/slurm-gcp/tree/5.7.5 variable "access_config" { description = "Access configurations, i.e. IPs via which the VM instance can be accessed via the Internet." diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md index d79fa9cc32..22b8f0ab82 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md @@ -38,7 +38,7 @@ manually. This will require addition configuration and verification of permissions. For more information see the [hybrid.md] documentation on [slurm-gcp]. -[slurm-controller-hybrid]: https://github.com/SchedMD/slurm-gcp/tree/5.7.4/terraform/slurm_cluster/modules/slurm_controller_hybrid +[slurm-controller-hybrid]: https://github.com/SchedMD/slurm-gcp/tree/5.7.5/terraform/slurm_cluster/modules/slurm_controller_hybrid > **_NOTE:_** The hybrid module requires the following dependencies to be > installed on the system deploying the module: @@ -58,15 +58,15 @@ permissions. For more information see the [hybrid.md] documentation on [pyyaml]: https://pypi.org/project/PyYAML/ [google-api-python-client]: https://pypi.org/project/google-api-python-client/ [google-cloud-pubsub]: https://pypi.org/project/google-cloud-pubsub/ -[requirements.txt]: https://github.com/SchedMD/slurm-gcp/blob/5.7.4/scripts/requirements.txt +[requirements.txt]: https://github.com/SchedMD/slurm-gcp/blob/5.7.5/scripts/requirements.txt ### Manual Configuration This module *does not* complete the installation of hybrid partitions on your slurm cluster. After deploying, you must follow the steps listed out in the [hybrid.md] documentation under [manual steps]. -[hybrid.md]: https://github.com/SchedMD/slurm-gcp/blob/5.7.4/docs/hybrid.md -[manual steps]: https://github.com/SchedMD/slurm-gcp/blob/5.7.4/docs/hybrid.md#manual-configurations +[hybrid.md]: https://github.com/SchedMD/slurm-gcp/blob/5.7.5/docs/hybrid.md +[manual steps]: https://github.com/SchedMD/slurm-gcp/blob/5.7.5/docs/hybrid.md#manual-configurations ### Example Usage The hybrid module can be added to a blueprint as follows: @@ -146,10 +146,10 @@ strongly advise only using versions 21 or 22 when using this module. Attempting to use this module with any version older than 21 may lead to unexpected results. -[slurm-gcp]: https://github.com/SchedMD/slurm-gcp/tree/5.7.4 +[slurm-gcp]: https://github.com/SchedMD/slurm-gcp/tree/5.7.5 [pre-existing-network-storage]: ../../../../modules/file-system/pre-existing-network-storage/ [schedmd-slurm-gcp-v5-partition]: ../../compute/schedmd-slurm-gcp-v5-partition/ -[packer templates]: https://github.com/SchedMD/slurm-gcp/tree/5.7.4/packer +[packer templates]: https://github.com/SchedMD/slurm-gcp/tree/5.7.5/packer ## License @@ -181,7 +181,7 @@ No providers. 
| Name | Source | Version | |------|--------|---------| -| [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/SchedMD/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_controller_hybrid | 5.7.4 | +| [slurm\_controller\_instance](#module\_slurm\_controller\_instance) | github.com/SchedMD/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_controller_hybrid | 5.7.5 | ## Resources diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/main.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/main.tf index 787e558726..6d73c71fe9 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/main.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/main.tf @@ -28,7 +28,7 @@ locals { } module "slurm_controller_instance" { - source = "github.com/SchedMD/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_controller_hybrid?ref=5.7.4" + source = "github.com/SchedMD/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_controller_hybrid?ref=5.7.5" project_id = var.project_id slurm_cluster_name = local.slurm_cluster_name diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md index 740d78a6d0..9b72d416ad 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md @@ -5,9 +5,9 @@ This module creates a login node for a Slurm cluster based on the terraform modules. The login node is used in conjunction with the [Slurm controller](../schedmd-slurm-gcp-v5-controller/README.md). -[SchedMD/slurm-gcp]: https://github.com/SchedMD/slurm-gcp/tree/5.7.4 -[slurm\_login\_instance]: https://github.com/SchedMD/slurm-gcp/tree/5.7.4/terraform/slurm_cluster/modules/slurm_login_instance -[slurm\_instance\_template]: https://github.com/SchedMD/slurm-gcp/tree/5.7.4/terraform/slurm_cluster/modules/slurm_instance_template +[SchedMD/slurm-gcp]: https://github.com/SchedMD/slurm-gcp/tree/5.7.5 +[slurm\_login\_instance]: https://github.com/SchedMD/slurm-gcp/tree/5.7.5/terraform/slurm_cluster/modules/slurm_login_instance +[slurm\_instance\_template]: https://github.com/SchedMD/slurm-gcp/tree/5.7.5/terraform/slurm_cluster/modules/slurm_instance_template ### Example @@ -49,8 +49,8 @@ The HPC Toolkit team maintains the wrapper around the [slurm-on-gcp] terraform modules. For support with the underlying modules, see the instructions in the [slurm-gcp README][slurm-gcp-readme]. -[slurm-on-gcp]: https://github.com/SchedMD/slurm-gcp/tree/5.7.4 -[slurm-gcp-readme]: https://github.com/SchedMD/slurm-gcp/tree/5.7.4#slurm-on-google-cloud-platform +[slurm-on-gcp]: https://github.com/SchedMD/slurm-gcp/tree/5.7.5 +[slurm-gcp-readme]: https://github.com/SchedMD/slurm-gcp/tree/5.7.5#slurm-on-google-cloud-platform ## License @@ -85,8 +85,8 @@ limitations under the License. 
| Name | Source | Version | |------|--------|---------| -| [slurm\_login\_instance](#module\_slurm\_login\_instance) | github.com/SchedMD/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_login_instance | 5.7.4 | -| [slurm\_login\_template](#module\_slurm\_login\_template) | github.com/SchedMD/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 5.7.4 | +| [slurm\_login\_instance](#module\_slurm\_login\_instance) | github.com/SchedMD/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_login_instance | 5.7.5 | +| [slurm\_login\_template](#module\_slurm\_login\_template) | github.com/SchedMD/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template | 5.7.5 | ## Resources diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/main.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/main.tf index e33ce76350..d4d9e849c1 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/main.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/main.tf @@ -50,7 +50,7 @@ data "google_compute_default_service_account" "default" { } module "slurm_login_template" { - source = "github.com/SchedMD/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=5.7.4" + source = "github.com/SchedMD/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_instance_template?ref=5.7.5" additional_disks = local.additional_disks can_ip_forward = var.can_ip_forward @@ -88,7 +88,7 @@ module "slurm_login_template" { } module "slurm_login_instance" { - source = "github.com/SchedMD/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_login_instance?ref=5.7.4" + source = "github.com/SchedMD/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_login_instance?ref=5.7.5" access_config = local.access_config slurm_cluster_name = local.slurm_cluster_name diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/variables.tf index 92b86b876c..fa7cfb8135 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/variables.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/variables.tf @@ -15,7 +15,7 @@ */ # Most variables have been sourced and modified from the SchedMD/slurm-gcp -# github repository: https://github.com/SchedMD/slurm-gcp/tree/5.7.4 +# github repository: https://github.com/SchedMD/slurm-gcp/tree/5.7.5 variable "project_id" { type = string diff --git a/docs/hybrid-slurm-cluster/demo-with-cloud-controller-instructions.md b/docs/hybrid-slurm-cluster/demo-with-cloud-controller-instructions.md index 67178fa5db..8d4dae6ca9 100644 --- a/docs/hybrid-slurm-cluster/demo-with-cloud-controller-instructions.md +++ b/docs/hybrid-slurm-cluster/demo-with-cloud-controller-instructions.md @@ -22,7 +22,7 @@ for use with an on-premise slurm-cluster. > further testing is done, documentation on applying the hybrid module to > on-premise slurm clusters will be added and expanded. -[slurm-gcp]: https://github.com/SchedMD/slurm-gcp/tree/5.7.4 +[slurm-gcp]: https://github.com/SchedMD/slurm-gcp/tree/5.7.5 ## Definitions diff --git a/docs/hybrid-slurm-cluster/deploy-instructions.md b/docs/hybrid-slurm-cluster/deploy-instructions.md index 781d4588ed..4426b24433 100644 --- a/docs/hybrid-slurm-cluster/deploy-instructions.md +++ b/docs/hybrid-slurm-cluster/deploy-instructions.md @@ -260,8 +260,8 @@ sudo systemctl restart slurmctld If the restart did not succeed, the logs at `/var/log/slurm/slurmctld.log` should point you in the right direction. 
-[slurm-gcp]: https://github.com/SchedMD/slurm-gcp/tree/5.7.4 -[slurm-gcp-hybrid]: https://github.com/SchedMD/slurm-gcp/blob/5.7.4/docs/hybrid.md +[slurm-gcp]: https://github.com/SchedMD/slurm-gcp/tree/5.7.5 +[slurm-gcp-hybrid]: https://github.com/SchedMD/slurm-gcp/blob/5.7.5/docs/hybrid.md [demo-with-cloud-controller-instructions.md]: ./demo-with-cloud-controller-instructions.md ## Validate the Hybrid Cluster diff --git a/docs/hybrid-slurm-cluster/on-prem-instructions.md b/docs/hybrid-slurm-cluster/on-prem-instructions.md index 1c35bc8246..a07ef8fc49 100644 --- a/docs/hybrid-slurm-cluster/on-prem-instructions.md +++ b/docs/hybrid-slurm-cluster/on-prem-instructions.md @@ -39,9 +39,9 @@ detail, as well as how to customize many of these assumptions to fit your needs. deployments in their [hybrid.md] documentation. [hybridmodule]: ../../community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md -[slurm-gcp]: https://github.com/SchedMD/slurm-gcp/tree/5.7.4 +[slurm-gcp]: https://github.com/SchedMD/slurm-gcp/tree/5.7.5 [slurm\_controller\_hybrid]: https://github.com/SchedMD/slurm-gcp/tree/master/terraform/slurm_cluster/modules/slurm_controller_hybrid -[hybrid.md]: https://github.com/SchedMD/slurm-gcp/blob/5.7.4/docs/hybrid.md +[hybrid.md]: https://github.com/SchedMD/slurm-gcp/blob/5.7.5/docs/hybrid.md ### NFS Mounts @@ -235,12 +235,12 @@ image created with slurm 21.08.8: partition_name: compute ``` -[slurmgcppacker]: https://github.com/SchedMD/slurm-gcp/tree/5.7.4/packer -[example.pkrvars.hcl]: https://github.com/SchedMD/slurm-gcp/tree/5.7.4/packer/example.pkrvars.hcl -[slurmversion]: https://github.com/SchedMD/slurm-gcp/blob/5.7.4/packer/variables.pkr.hcl#L97 -[`service_account_scopes`]: https://github.com/SchedMD/slurm-gcp/blob/5.7.4/packer/variables.pkr.hcl#L166 -[`munge_user`]: https://github.com/SchedMD/slurm-gcp/blob/5.7.4/ansible/roles/munge/defaults/main.yml#L17 -[`slurm_user`]: https://github.com/SchedMD/slurm-gcp/blob/5.7.4/ansible/roles/slurm/defaults/main.yml#L31 +[slurmgcppacker]: https://github.com/SchedMD/slurm-gcp/tree/5.7.5/packer +[example.pkrvars.hcl]: https://github.com/SchedMD/slurm-gcp/tree/5.7.5/packer/example.pkrvars.hcl +[slurmversion]: https://github.com/SchedMD/slurm-gcp/blob/5.7.5/packer/variables.pkr.hcl#L97 +[`service_account_scopes`]: https://github.com/SchedMD/slurm-gcp/blob/5.7.5/packer/variables.pkr.hcl#L166 +[`munge_user`]: https://github.com/SchedMD/slurm-gcp/blob/5.7.5/ansible/roles/munge/defaults/main.yml#L17 +[`slurm_user`]: https://github.com/SchedMD/slurm-gcp/blob/5.7.5/ansible/roles/slurm/defaults/main.yml#L31 ## On Premise Setup diff --git a/docs/image-building.md b/docs/image-building.md index 7dd29e8297..1d80b7c9a3 100644 --- a/docs/image-building.md +++ b/docs/image-building.md @@ -168,7 +168,7 @@ deployment_groups: - group: packer modules: - id: custom-image - source: github.com/SchedMD/slurm-gcp//packer?ref=5.7.4&depth=1 + source: github.com/SchedMD/slurm-gcp//packer?ref=5.7.5&depth=1 kind: packer settings: use_iap: true diff --git a/examples/README.md b/examples/README.md index d8db0f18c1..1e1387b6f2 100644 --- a/examples/README.md +++ b/examples/README.md @@ -120,7 +120,7 @@ the experimental badge (![experimental-badge]). > > ```shell > # Install Python3 and run -> pip3 install -r https://raw.githubusercontent.com/SchedMD/slurm-gcp/5.7.4/scripts/requirements.txt +> pip3 install -r https://raw.githubusercontent.com/SchedMD/slurm-gcp/5.7.5/scripts/requirements.txt > ``` Creates a basic auto-scaling Slurm cluster with mostly default settings. 
The @@ -530,7 +530,7 @@ For this example the following is needed in the selected region: > > ```shell > # Install Python3 and run -> pip3 install -r https://raw.githubusercontent.com/SchedMD/slurm-gcp/5.7.4/scripts/requirements.txt +> pip3 install -r https://raw.githubusercontent.com/SchedMD/slurm-gcp/5.7.5/scripts/requirements.txt > ``` Similar to the [hpc-slurm.yaml] example, but using Ubuntu 20.04 instead of CentOS 7. diff --git a/tools/cloud-build/Dockerfile b/tools/cloud-build/Dockerfile index 730ca2d214..cafb8d027e 100644 --- a/tools/cloud-build/Dockerfile +++ b/tools/cloud-build/Dockerfile @@ -50,7 +50,7 @@ WORKDIR /ghpc-tmp COPY ./ ./ RUN pip install --no-cache-dir --upgrade pip && \ - pip install --no-cache-dir -r https://raw.githubusercontent.com/SchedMD/slurm-gcp/5.7.4/scripts/requirements.txt && \ + pip install --no-cache-dir -r https://raw.githubusercontent.com/SchedMD/slurm-gcp/5.7.5/scripts/requirements.txt && \ pip install --no-cache-dir -r tools/cloud-build/requirements.txt && \ rm -rf ~/.cache/pip/* From 8c461a7e093b7a848671aeb90df619912662d99a Mon Sep 17 00:00:00 2001 From: Rohit Ramu Date: Fri, 4 Aug 2023 15:16:13 -0700 Subject: [PATCH 136/144] Replace "module" with "deployment group" in prompt (#1662) --- pkg/shell/terraform.go | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/pkg/shell/terraform.go b/pkg/shell/terraform.go index c3c91253f1..bf645ae81e 100644 --- a/pkg/shell/terraform.go +++ b/pkg/shell/terraform.go @@ -91,13 +91,13 @@ func needsInit(tf *tfexec.Terraform) bool { func initModule(tf *tfexec.Terraform) error { var err error if needsInit(tf) { - log.Printf("initializing terraform module %s", tf.WorkingDir()) + log.Printf("Initializing deployment group %s", tf.WorkingDir()) err = tf.Init(context.Background()) } if err != nil { return &TfError{ - help: fmt.Sprintf("initialization of %s failed; manually resolve errors below", tf.WorkingDir()), + help: fmt.Sprintf("initialization of deployment group %s failed; manually resolve errors below", tf.WorkingDir()), err: err, } } @@ -106,11 +106,11 @@ func initModule(tf *tfexec.Terraform) error { } func outputModule(tf *tfexec.Terraform) (map[string]cty.Value, error) { - log.Printf("collecting terraform outputs from %s", tf.WorkingDir()) + log.Printf("Collecting terraform outputs from %s", tf.WorkingDir()) output, err := tf.Output(context.Background()) if err != nil { return map[string]cty.Value{}, &TfError{ - help: fmt.Sprintf("collecting terraform outputs from %s failed; manually resolve errors below", tf.WorkingDir()), + help: fmt.Sprintf("collecting terraform outputs from deployment group %s failed; manually resolve errors below", tf.WorkingDir()), err: err, } } @@ -143,7 +143,7 @@ func planModule(tf *tfexec.Terraform, path string, destroy bool) (bool, error) { wantsChange, err := tf.Plan(context.Background(), outOpt, tfexec.Destroy(destroy)) if err != nil { return false, &TfError{ - help: fmt.Sprintf("terraform plan for %s failed; suggest running \"ghpc export-outputs\" on previous deployment groups to define inputs", tf.WorkingDir()), + help: fmt.Sprintf("terraform plan for deployment group %s failed; suggest running \"ghpc export-outputs\" on previous deployment groups to define inputs", tf.WorkingDir()), err: err, } } @@ -165,7 +165,7 @@ func promptForApply(tf *tfexec.Terraform, path string, b ApplyBehavior) bool { summary := re.FindString(plan) if summary == "" { - summary = fmt.Sprintf("Please review full proposed changes for %s", tf.WorkingDir()) + summary = 
fmt.Sprintf("Please review full proposed changes for deployment group %s", tf.WorkingDir()) } changes := ProposedChanges{ @@ -181,7 +181,7 @@ func promptForApply(tf *tfexec.Terraform, path string, b ApplyBehavior) bool { func applyPlanConsoleOutput(tf *tfexec.Terraform, path string) error { planFileOpt := tfexec.DirOrPlan(path) - log.Printf("running terraform apply on group %s", tf.WorkingDir()) + log.Printf("Running terraform apply on deployment group %s", tf.WorkingDir()) tf.SetStdout(os.Stdout) tf.SetStderr(os.Stderr) if err := tf.Apply(context.Background(), planFileOpt); err != nil { @@ -207,7 +207,7 @@ func applyOrDestroy(tf *tfexec.Terraform, b ApplyBehavior, destroy bool) error { return err } - log.Printf("testing if module in %s requires %s cloud infrastructure", tf.WorkingDir(), action) + log.Printf("Testing if deployment group %s requires %s cloud infrastructure", tf.WorkingDir(), action) // capture Terraform plan in a file f, err := os.CreateTemp("", "plan-)") if err != nil { @@ -221,10 +221,10 @@ func applyOrDestroy(tf *tfexec.Terraform, b ApplyBehavior, destroy bool) error { var apply bool if wantsChange { - log.Printf("module in %s requires %s cloud infrastructure", tf.WorkingDir(), action) + log.Printf("Deployment group %s requires %s cloud infrastructure", tf.WorkingDir(), action) apply = b == AutomaticApply || promptForApply(tf, f.Name(), b) } else { - log.Printf("cloud infrastructure in %s is already %s", tf.WorkingDir(), pastTense) + log.Printf("Cloud infrastructure in deployment group %s is already %s", tf.WorkingDir(), pastTense) } if !apply { @@ -270,11 +270,11 @@ func ExportOutputs(tf *tfexec.Terraform, artifactsDir string, applyBehavior Appl // blueprint; edge case is that "terraform output" can be missing keys // whose values are null if len(outputValues) == 0 { - log.Printf("group %s contains no artifacts to export", thisGroup) + log.Printf("Deployment group %s contains no artifacts to export", thisGroup) return nil } - log.Printf("writing outputs artifact from group %s to file %s", thisGroup, filepath) + log.Printf("Writing outputs artifact from deployment group %s to file %s", thisGroup, filepath) if err := modulewriter.WriteHclAttributes(outputValues, filepath); err != nil { return err } @@ -362,9 +362,9 @@ func ImportInputs(deploymentGroupDir string, artifactsDir string, expandedBluepr } allInputValues = evaluatedSettings.Items() default: - return fmt.Errorf("unexpected error: unknown module kind for group %s", g.Name) + return fmt.Errorf("unexpected error: unknown module kind for deployment group %s", g.Name) } - log.Printf("writing outputs for group %s to file %s\n", g.Name, outfile) + log.Printf("Writing outputs for deployment group %s to file %s\n", g.Name, outfile) if err := modulewriter.WriteHclAttributes(allInputValues, outfile); err != nil { return err } From 2108065f0a5f93610c0261b86fad05c320ab2f2f Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 7 Aug 2023 13:38:04 -0700 Subject: [PATCH 137/144] Bump golang.org/x/sys from 0.10.0 to 0.11.0 (#1663) Bumps [golang.org/x/sys](https://github.com/golang/sys) from 0.10.0 to 0.11.0. - [Commits](https://github.com/golang/sys/compare/v0.10.0...v0.11.0) --- updated-dependencies: - dependency-name: golang.org/x/sys dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- go.mod | 2 +- go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/go.mod b/go.mod index c99eee36a0..490658af95 100644 --- a/go.mod +++ b/go.mod @@ -83,7 +83,7 @@ require ( golang.org/x/crypto v0.11.0 // indirect golang.org/x/net v0.12.0 // indirect golang.org/x/oauth2 v0.10.0 // indirect - golang.org/x/sys v0.10.0 + golang.org/x/sys v0.11.0 golang.org/x/text v0.11.0 // indirect golang.org/x/xerrors v0.0.0-20220907171357-04be3eba64a2 // indirect google.golang.org/appengine v1.6.7 // indirect diff --git a/go.sum b/go.sum index 13854b262a..808e6a9910 100644 --- a/go.sum +++ b/go.sum @@ -725,8 +725,8 @@ golang.org/x/sys v0.2.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.3.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.10.0 h1:SqMFp9UcQJZa+pmYuAKjd9xq1f0j5rLcDIk0mj4qAsA= -golang.org/x/sys v0.10.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.11.0 h1:eG7RXZHdqOJ1i+0lgLgCpSXAp6M3LYlAo6osgSi0xOM= +golang.org/x/sys v0.11.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= golang.org/x/term v0.1.0/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= From f45731f39fe0c796465325ade26de8e635c6bd6b Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Mon, 7 Aug 2023 15:39:42 -0700 Subject: [PATCH 138/144] Implement quota validation logic (#1592) * Implement quota validation logic * Implement quota validation logic; * Add code coverage exemption, bumped coverage (0% -> 30%). **Tested:** ```go qe, _ := ValidateQuotas([]Quota{ { Metric: "compute.googleapis.com/disks_total_storage", Service: "compute.googleapis.com", Consumer: "projects/X", Limit: 900000, Dimensions: map[string]string{ "region": "us-east1" }, Aggregation: "SUM"}, }) for _, e := range qe { fmt.Println(e.Error()) } // Got: QuotaError{Consumer:"projects/X", Service:"compute.googleapis.com", Metric:"compute.googleapis.com/disks_total_storage", Dimensions:map[string]string(nil), EffectiveLimit:A, Requested:900000} QuotaError{Consumer:"projects/X", Service:"compute.googleapis.com", Metric:"compute.googleapis.com/disks_total_storage", Dimensions:map[string]string{"region":"us-east1"}, EffectiveLimit:B, Requested:900000} ``` * Disambiguate naming --- pkg/validators/quota.go | 218 +++++++++++++++++++++++++++++++++++ pkg/validators/quota_test.go | 171 +++++++++++++++++++++++++++ tools/enforce_coverage.pl | 3 + 3 files changed, 392 insertions(+) create mode 100644 pkg/validators/quota.go create mode 100644 pkg/validators/quota_test.go diff --git a/pkg/validators/quota.go b/pkg/validators/quota.go new file mode 100644 index 0000000000..fff74ad18f --- /dev/null +++ b/pkg/validators/quota.go @@ -0,0 +1,218 @@ +// Copyright 2023 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package validators + +import ( + "context" + "fmt" + + sub "google.golang.org/api/serviceusage/v1beta1" +) + +// ResourceRequirement represents an amount of desired resource. +type ResourceRequirement struct { + Consumer string // e.g. "projects/myprojectid"" + Service string // e.g. "compute.googleapis.com" + Metric string // e.g. "compute.googleapis.com/disks_total_storage" + Required int64 + Dimensions map[string]string // e.g. {"region": "us-central1"} + // How this requirement should be aggregated with other requirements in the same bucket. + Aggregation string +} + +// InBucket returns true if the quota is in the QuotaBucket. +func (q ResourceRequirement) InBucket(b *sub.QuotaBucket) bool { + for d, v := range b.Dimensions { + if q.Dimensions[d] != v { + return false + } + } + return true +} + +// QuotaError represents an event of not having enough quota. +type QuotaError struct { + Consumer string + Service string + Metric string + Dimensions map[string]string + EffectiveLimit int64 + Requested int64 +} + +func (e QuotaError) Error() string { + return fmt.Sprintf("QuotaError: %#v", e) +} + +// ValidateQuotas validates the resource requirements. +func ValidateQuotas(rs []ResourceRequirement) ([]QuotaError, error) { + qe := []QuotaError{} + // Group by Consumer and Service + type gk struct { + Consumer string + Service string + } + + groups := map[gk][]ResourceRequirement{} + for _, r := range rs { + k := gk{r.Consumer, r.Service} + groups[k] = append(groups[k], r) + } + + for k, g := range groups { + ls, err := serviceLimits(k.Consumer, k.Service) + if err != nil { + return qe, err + } + qse, err := validateServiceLimits(g, ls) + if err != nil { + return qe, err + } + qe = append(qe, qse...) + } + + return qe, nil +} + +func validateServiceLimits(rs []ResourceRequirement, ls []*sub.ConsumerQuotaMetric) ([]QuotaError, error) { + // Group by Metric and Aggregation + type gk struct { + Metric string + Aggregation string + } + groups := map[gk][]ResourceRequirement{} + for _, r := range rs { + k := gk{r.Metric, r.Aggregation} + groups[k] = append(groups[k], r) + } + + qe := []QuotaError{} + for k, g := range groups { + agg, err := aggregation(k.Aggregation) + if err != nil { + return qe, err + } + + // select limits for the metric + ml := []*sub.ConsumerQuotaLimit{} + for _, l := range ls { + if l.Metric == k.Metric { + ml = append(ml, l.ConsumerQuotaLimits...) + } + } + if len(ml) == 0 { + return qe, fmt.Errorf("limits for metric %q were not found", k.Metric) + } + + for _, limit := range ml { + qle := validateLimit(g, limit, agg) + qe = append(qe, qle...) 
+ } + } + return qe, nil +} + +func validateLimit(rs []ResourceRequirement, limit *sub.ConsumerQuotaLimit, agg aggFn) []QuotaError { + qe := []QuotaError{} + for _, bucket := range limit.QuotaBuckets { + vs := []int64{} + for _, r := range rs { + if r.InBucket(bucket) { + vs = append(vs, r.Required) + } + } + if len(vs) == 0 { + continue + } + required := agg(vs) + for _, r := range required { + if !satisfied(r, bucket.EffectiveLimit) { + r0 := rs[0] // all should have the same consumer, service and metric + qe = append(qe, QuotaError{ + Consumer: r0.Consumer, + Service: r0.Service, + Metric: r0.Metric, + Dimensions: bucket.Dimensions, + EffectiveLimit: bucket.EffectiveLimit, + Requested: r, + }) + } + } + } + return qe +} + +func satisfied(requested int64, limit int64) bool { + if limit == -1 { + return true + } + if requested == -1 { + return false + } + return requested <= limit +} + +type aggFn func([]int64) []int64 + +func aggregation(agg string) (aggFn, error) { + switch agg { + case "MAX": + return func(l []int64) []int64 { + max := int64(0) + for _, v := range l { + if v == -1 { + return []int64{-1} + } + if v > max { + max = v + } + } + return []int64{max} + }, nil + case "SUM": + return func(l []int64) []int64 { + sum := int64(0) + for _, v := range l { + if v == -1 { + return []int64{-1} + } + sum += v + } + return []int64{sum} + }, nil + case "DO_NOT_AGGREGATE": + return func(l []int64) []int64 { return l }, nil + default: + return nil, fmt.Errorf("aggregation %q is not supported", agg) + } +} + +func serviceLimits(consumer string, service string) ([]*sub.ConsumerQuotaMetric, error) { + ctx := context.Background() + s, err := sub.NewService(ctx) + if err != nil { + return nil, err + } + res := []*sub.ConsumerQuotaMetric{} + parent := fmt.Sprintf("%s/services/%s", consumer, service) + err = s.Services.ConsumerQuotaMetrics. + List(parent). + View("BASIC"). // BASIC reduces the response size & latency + Pages(ctx, func(page *sub.ListConsumerQuotaMetricsResponse) error { + res = append(res, page.Metrics...) + return nil + }) + return res, err +} diff --git a/pkg/validators/quota_test.go b/pkg/validators/quota_test.go new file mode 100644 index 0000000000..9b070cf300 --- /dev/null +++ b/pkg/validators/quota_test.go @@ -0,0 +1,171 @@ +// Copyright 2023 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package validators + +import ( + "fmt" + "sort" + "testing" + + "github.com/google/go-cmp/cmp" + sub "google.golang.org/api/serviceusage/v1beta1" +) + +func TestAggregation(t *testing.T) { + type test struct { + requested []int64 + aggregation string + want []int64 + err bool + } + tests := []test{ + {[]int64{1, 3, 2}, "SUM", []int64{6}, false}, + {[]int64{1, 3, 2}, "MAX", []int64{3}, false}, + {[]int64{}, "SUM", []int64{0}, false}, + {[]int64{}, "MAX", []int64{0}, false}, + {[]int64{1, -1, 2}, "SUM", []int64{-1}, false}, + {[]int64{1, -1, 2}, "MAX", []int64{-1}, false}, + {[]int64{1, -1, 2}, "DO_NOT_AGGREGATE", []int64{1, -1, 2}, false}, + {[]int64{1, -1, 2}, "KARL_MAX", nil, true}, + } + + for _, tc := range tests { + t.Run(fmt.Sprintf("%s%#v", tc.aggregation, tc.requested), func(t *testing.T) { + fn, err := aggregation(tc.aggregation) + if tc.err != (err != nil) { + t.Errorf("got unexpected error: %s", err) + } + if err != nil { + return + } + got := fn(tc.requested) + if diff := cmp.Diff(tc.want, got); diff != "" { + t.Errorf("diff (-want +got):\n%s", diff) + } + }) + } +} + +func TestSatisfied(t *testing.T) { + type test struct { + requested int64 + limit int64 + want bool + } + tests := []test{ + {1, 1, true}, + {1, 2, true}, + {2, 1, false}, + {1, -1, true}, + {-1, 1, false}, + {-1, -1, true}, + } + + for _, tc := range tests { + t.Run(fmt.Sprintf("%d::%d", tc.requested, tc.limit), func(t *testing.T) { + got := satisfied(tc.requested, tc.limit) + if diff := cmp.Diff(tc.want, got); diff != "" { + t.Errorf("diff (-want +got):\n%s", diff) + } + }) + } +} + +func TestInBucket(t *testing.T) { + type test struct { + qDimensions map[string]string + bDimensions map[string]string + want bool + } + tests := []test{ + {map[string]string{"a": "1", "b": "2"}, map[string]string{"a": "1", "b": "2"}, true}, + {map[string]string{"a": "1", "b": "2"}, map[string]string{"a": "1", "b": "3"}, false}, + {map[string]string{"a": "1", "b": "2"}, map[string]string{"a": "1"}, true}, + {map[string]string{"a": "1", "b": "2"}, map[string]string{"a": "1", "b": "2", "c": "3"}, false}, + {map[string]string{}, map[string]string{}, true}, + } + for _, tc := range tests { + t.Run(fmt.Sprintf("%#v::%#v", tc.qDimensions, tc.bDimensions), func(t *testing.T) { + q := ResourceRequirement{Dimensions: tc.qDimensions} + b := sub.QuotaBucket{Dimensions: tc.bDimensions} + + got := q.InBucket(&b) + if diff := cmp.Diff(tc.want, got); diff != "" { + t.Errorf("diff (-want +got):\n%s", diff) + } + }) + } +} + +func TestValidateServiceLimits(t *testing.T) { + // Configured quotas: + // global: 5 + // green_eggs: 3 + // green_sleeve: -1 + // + // Requested: + // green_eggs: 4 + // green_sleeve: 7 + // + // Expected errors: + // green_eggs: 4 > 3 + // global: 11 > 5 + buckets := []*sub.QuotaBucket{ + { + EffectiveLimit: int64(5), + }, { + EffectiveLimit: int64(3), + Dimensions: map[string]string{"green": "eggs"}, + }, { + EffectiveLimit: int64(-1), + Dimensions: map[string]string{"green": "sleeve"}, + }, + } + quotas := []ResourceRequirement{ + { + Metric: "pony", + Required: int64(4), + Dimensions: map[string]string{"green": "eggs"}, + Aggregation: "SUM", + }, { + Metric: "pony", + Required: int64(7), + Dimensions: map[string]string{"green": "sleeve"}, + Aggregation: "SUM", + }, + } + + want := []QuotaError{ + {Metric: "pony", Dimensions: nil, EffectiveLimit: 5, Requested: 11}, + {Metric: "pony", Dimensions: map[string]string{"green": "eggs"}, EffectiveLimit: 3, Requested: 4}, + } + got, err := validateServiceLimits(quotas, 
[]*sub.ConsumerQuotaMetric{ + { + Metric: "pony", + ConsumerQuotaLimits: []*sub.ConsumerQuotaLimit{ + {Metric: "pony", QuotaBuckets: buckets}}, + }, + }) + + if err != nil { + t.Errorf("got unexpected error: %s", err) + return + } + // Sort by error message to make test deterministic + sort.Slice(got, func(i, j int) bool { return got[i].Error() < got[j].Error() }) + if diff := cmp.Diff(want, got); diff != "" { + t.Errorf("diff (-want +got):\n%s", diff) + } +} diff --git a/tools/enforce_coverage.pl b/tools/enforce_coverage.pl index 5f78aae300..b4c5c99c21 100755 --- a/tools/enforce_coverage.pl +++ b/tools/enforce_coverage.pl @@ -20,6 +20,7 @@ my $min = 80; my $cmdmin = 40; my $shellmin = 0; +my $validatorsmin = 25; my $failed_coverage = 0; while (<>){ @@ -28,6 +29,8 @@ $failed_coverage++ if ($1 < $cmdmin); } elsif ( $_ =~ /hpc-toolkit\/pkg\/shell.*coverage: (\d+\.\d)%/) { $failed_coverage++ if ($1 < $shellmin); + } elsif ( $_ =~ /hpc-toolkit\/pkg\/validators.*coverage: (\d+\.\d)%/) { + $failed_coverage++ if ($1 < $validatorsmin); } elsif ( $_ =~ /coverage: (\d+\.\d)%/ ) { $failed_coverage++ if ($1 < $min); } From cfb43fc8dddadd188c59a43e4faaca7978bd35ad Mon Sep 17 00:00:00 2001 From: Rohit Ramu Date: Mon, 7 Aug 2023 17:09:27 -0700 Subject: [PATCH 139/144] Add a prefix to release labels (#1668) --- .github/dependabot.yml | 2 +- .github/release.yml | 14 +++++++------- .github/workflows/pr-label-validation.yml | 6 +++--- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 45a0d213b3..3b5ad735c3 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -33,7 +33,7 @@ updates: labels: - dependencies - python - - chore + - release-chore schedule: interval: monthly time: "03:00" diff --git a/.github/release.yml b/.github/release.yml index c791664188..96ba64c330 100644 --- a/.github/release.yml +++ b/.github/release.yml @@ -18,27 +18,27 @@ changelog: exclude: labels: - - chore + - release-chore authors: [] categories: - title: Key New Features 🎉 labels: - - key-new-features + - release-key-new-features - title: New Modules 🧱 labels: - - new-modules + - release-new-modules - title: Module Improvements 🛠 labels: - - module-improvements + - release-module-improvements - title: Improvements labels: - - improvements + - release-improvements - title: Deprecations labels: - - deprecations + - release-deprecations - title: Version Updates labels: - - version-updates + - release-version-updates - title: Other changes labels: - "*" diff --git a/.github/workflows/pr-label-validation.yml b/.github/workflows/pr-label-validation.yml index 4c4e3fac67..986a383d5e 100644 --- a/.github/workflows/pr-label-validation.yml +++ b/.github/workflows/pr-label-validation.yml @@ -31,7 +31,7 @@ on: - develop jobs: - label: + pr-label-validation: runs-on: ubuntu-latest permissions: pull-requests: read @@ -41,8 +41,8 @@ jobs: with: mode: minimum count: 1 - labels: "chore, key-new-features, new-modules, module-improvements, improvements, deprecations, version-updates" - message: "This PR is being prevented from merging because it is not labeled. Please add a label to this PR. Accepted labels: chore, key-new-features, new-modules, module-improvements, improvements, deprecations, version-updates" + labels: "release-chore, release-key-new-features, release-new-modules, release-module-improvements, release-improvements, release-deprecations, release-version-updates" + message: "This PR is being prevented from merging because it is not labeled. 
Please add a label to this PR. Accepted labels: release-chore, release-key-new-features, release-new-modules, release-module-improvements, release-improvements, release-deprecations, release-version-updates" - id: print-labels run: | echo "Current PR labels:" From 3a752e8bc5b074a540d3c87c2e3bb14d85de1cc0 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Mon, 7 Aug 2023 17:52:29 -0700 Subject: [PATCH 140/144] Add logic to retrieve current resource usage (#1669) --- pkg/validators/quota.go | 55 ++++++++++++++++++++++++++++++++++++ pkg/validators/quota_test.go | 32 +++++++++++++++++++++ 2 files changed, 87 insertions(+) diff --git a/pkg/validators/quota.go b/pkg/validators/quota.go index fff74ad18f..5e490428ef 100644 --- a/pkg/validators/quota.go +++ b/pkg/validators/quota.go @@ -17,7 +17,9 @@ package validators import ( "context" "fmt" + "time" + cm "google.golang.org/api/monitoring/v3" sub "google.golang.org/api/serviceusage/v1beta1" ) @@ -216,3 +218,56 @@ func serviceLimits(consumer string, service string) ([]*sub.ConsumerQuotaMetric, }) return res, err } + +type usageKey struct { + Metric string + Location string // either "global", region, or zone +} + +type usageProvider struct { + u map[usageKey]int64 +} + +func (up *usageProvider) Usage(metric string, region string, zone string) int64 { + if up.u == nil { + return 0 + } + k := usageKey{metric, "global"} + if region != "" { + k.Location = region + } + if zone != "" { + k.Location = zone + } + return up.u[k] // 0 if not found +} + +func newUsageProvider(projectID string) (usageProvider, error) { + s, err := cm.NewService(context.Background()) + if err != nil { + return usageProvider{}, err + } + + u := map[usageKey]int64{} + err = s.Projects.TimeSeries.List("projects/"+projectID). + Filter(`metric.type="serviceruntime.googleapis.com/quota/allocation/usage" resource.type="consumer_quota"`). + IntervalEndTime(time.Now().Format(time.RFC3339)). + // Quota usage metrics get duplicated once a day + IntervalStartTime(time.Now().Add(-24*time.Hour).Format(time.RFC3339)). 
+ Pages(context.Background(), func(page *cm.ListTimeSeriesResponse) error { + for _, ts := range page.TimeSeries { + usage := ts.Points[0].Value.Int64Value // Points[0] is latest + if *usage == 0 { + continue + } + metric := ts.Metric.Labels["quota_metric"] + location := ts.Resource.Labels["location"] + u[usageKey{metric, location}] = *usage + } + return nil + }) + if err != nil { + return usageProvider{}, err + } + return usageProvider{u}, nil +} diff --git a/pkg/validators/quota_test.go b/pkg/validators/quota_test.go index 9b070cf300..9704f495d4 100644 --- a/pkg/validators/quota_test.go +++ b/pkg/validators/quota_test.go @@ -169,3 +169,35 @@ func TestValidateServiceLimits(t *testing.T) { t.Errorf("diff (-want +got):\n%s", diff) } } + +func TestUsageProviderGet(t *testing.T) { + up := usageProvider{u: map[usageKey]int64{ + {Metric: "pony", Location: "global"}: 17, + {Metric: "pony", Location: "us-west1"}: 13, + {Metric: "pony", Location: "us-west1-c"}: 11, + {Metric: "zebra", Location: "us-east1"}: 7, + }} + + type test struct { + metric string + region string + zone string + want int64 + } + tests := []test{ + {"pony", "", "", 17}, + {"zebra", "", "", 0}, + {"pony", "us-west1", "", 13}, + {"zebra", "us-east2", "", 0}, + {"pony", "us-west1", "us-west1-c", 11}, + {"zebra", "us-east1", "us-east1-b", 0}, + } + for _, tc := range tests { + t.Run(fmt.Sprintf("%#v", tc), func(t *testing.T) { + got := up.Usage(tc.metric, tc.region, tc.zone) + if diff := cmp.Diff(tc.want, got); diff != "" { + t.Errorf("diff (-want +got):\n%s", diff) + } + }) + } +} From 9ca6a5a218d35dd34f34a917d8127b8b8a7b4445 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Mon, 7 Aug 2023 21:31:06 -0700 Subject: [PATCH 141/144] Add H3 partition to the hpc-slurm example (#1667) --- examples/README.md | 5 +++++ examples/hpc-slurm.yaml | 23 +++++++++++++++++++++-- 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/examples/README.md b/examples/README.md index 1e1387b6f2..9fdb0004a4 100644 --- a/examples/README.md +++ b/examples/README.md @@ -143,6 +143,11 @@ uses `c2-standard-60` VMs with placement groups enabled. You may need to request additional quota for `C2 CPUs` in the region you are deploying in. You can select the compute partition using the `-p compute` argument when running `srun`. +#### H3 Partition + +There is an `h3` partition that uses compute-optimized `h3-standard-88` machine type. +You can read more about the H3 machine series [here](https://cloud.google.com/compute/docs/compute-optimized-machines#h3_series). 
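The `usageProvider` added in patch 140 resolves current usage at the most specific location it is given: a zone overrides a region, which overrides the `global` bucket, and there is no fallback when the specific key is absent. The sketch below mirrors that lookup as a standalone program; the type and function names here are local stand-ins, since the real `usageProvider` is unexported in `pkg/validators`.

```go
// Standalone mirror of the lookup precedence in patch 140's
// usageProvider.Usage: the most specific location wins (zone over
// region over "global"), with no fallback and 0 for missing entries.
package main

import "fmt"

type usageKey struct {
	Metric   string
	Location string // "global", a region, or a zone
}

func usage(u map[usageKey]int64, metric, region, zone string) int64 {
	k := usageKey{metric, "global"}
	if region != "" {
		k.Location = region
	}
	if zone != "" {
		k.Location = zone // a zone, when given, overrides the region
	}
	return u[k] // map zero value (0) if the key is absent
}

func main() {
	u := map[usageKey]int64{
		{"pony", "global"}:     17,
		{"pony", "us-west1"}:   13,
		{"pony", "us-west1-c"}: 11,
	}
	fmt.Println(usage(u, "pony", "", ""))                   // 17 (global)
	fmt.Println(usage(u, "pony", "us-west1", ""))           // 13 (region)
	fmt.Println(usage(u, "pony", "us-west1", "us-west1-c")) // 11 (zone wins)
	fmt.Println(usage(u, "pony", "us-east1", ""))           // 0 (no fallback)
}
```

Returning the map's zero value for missing keys means an unreported metric counts as zero current usage, which keeps the validator permissive when monitoring data is incomplete.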
+ #### Quota Requirements for hpc-slurm.yaml For this example the following is needed in the selected region: diff --git a/examples/hpc-slurm.yaml b/examples/hpc-slurm.yaml index c47033ca44..68a460e379 100644 --- a/examples/hpc-slurm.yaml +++ b/examples/hpc-slurm.yaml @@ -19,8 +19,8 @@ blueprint_name: hpc-slurm vars: project_id: ## Set GCP Project ID Here ## deployment_name: hpc-small - region: us-west4 - zone: us-west4-c + region: us-central1 + zone: us-central1-a # Documentation for each of the modules used below can be found at # https://github.com/GoogleCloudPlatform/hpc-toolkit/blob/main/modules/README.md @@ -72,12 +72,31 @@ deployment_groups: settings: partition_name: compute + - id: h3_node_group + source: community/modules/compute/schedmd-slurm-gcp-v5-node-group + settings: + node_count_dynamic_max: 20 + machine_type: h3-standard-88 + # H3 does not support pd-ssd and pd-standard + # https://cloud.google.com/compute/docs/compute-optimized-machines#h3_disks + disk_type: pd-balanced + + - id: h3_partition + source: community/modules/compute/schedmd-slurm-gcp-v5-partition + use: + - network1 + - homefs + - h3_node_group + settings: + partition_name: h3 + - id: slurm_controller source: community/modules/scheduler/schedmd-slurm-gcp-v5-controller use: - network1 - debug_partition - compute_partition + - h3_partition - homefs settings: disable_controller_public_ips: false From 0ebe8a7cc2e80eec45e1e90601d04c8909a67c08 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Tue, 8 Aug 2023 17:25:25 -0700 Subject: [PATCH 142/144] Simplify H3 enterprise example (#1665) * Simplify H3 enterprise example * Update image --- examples/hpc-enterprise-slurm.yaml | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/examples/hpc-enterprise-slurm.yaml b/examples/hpc-enterprise-slurm.yaml index a4c0f80da4..bf0a90f3e7 100644 --- a/examples/hpc-enterprise-slurm.yaml +++ b/examples/hpc-enterprise-slurm.yaml @@ -24,7 +24,7 @@ vars: gpu_zones: [us-central1-a, us-central1-b, us-central1-c, us-central1-f] # Visit https://github.com/SchedMD/slurm-gcp/blob/master/docs/images.md#published-image-family # for a list of valid family options with Slurm - family: schedmd-v5-slurm-22-05-9-hpc-centos-7 + family: slurm-gcp-5-7-hpc-centos-7 project: schedmd-slurm-public # Set to true for active cluster reconfiguration. # Note that setting this option requires additional dependencies to be installed locally. 
@@ -272,11 +272,6 @@ deployment_groups: # https://cloud.google.com/compute/docs/compute-optimized-machines#h3_disks disk_type: pd-balanced disk_size_gb: 100 - node_conf: # one has to specify number of CPUs and sockets for H3 machine type - CPUs: 88 - CoresPerSocket: 44 - Sockets: 2 - ThreadsPerCore: 1 # use `-p h3` to submit jobs to this partition: # ex: `srun -p h3 -N 1 hostname` @@ -285,8 +280,6 @@ deployment_groups: use: [h3_node_group, network1, homefs, projectsfs, scratchfs] settings: partition_name: h3 - partition_conf: # one has to specify mem per CPU for H3 machine type - DefMemPerCPU: 3971 - id: slurm_controller source: community/modules/scheduler/schedmd-slurm-gcp-v5-controller From 0e3cb799743427af64feb2c3a52acce5e35886c8 Mon Sep 17 00:00:00 2001 From: Ivan Orlov Date: Wed, 9 Aug 2023 16:02:02 -0700 Subject: [PATCH 143/144] Fix zone in test-slurm-gcp-v5-hpc-centos7 (#1672) --- tools/cloud-build/daily-tests/tests/slurm-v5-hpc-centos7.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/cloud-build/daily-tests/tests/slurm-v5-hpc-centos7.yml b/tools/cloud-build/daily-tests/tests/slurm-v5-hpc-centos7.yml index f5f8b91c9e..a30133e376 100644 --- a/tools/cloud-build/daily-tests/tests/slurm-v5-hpc-centos7.yml +++ b/tools/cloud-build/daily-tests/tests/slurm-v5-hpc-centos7.yml @@ -19,7 +19,7 @@ deployment_name: "cent-v5-{{ build }}" # Manually adding the slurm_cluster_name for use in node names, which filters # non-alphanumeric chars and is capped at 10 chars. slurm_cluster_name: "centv5{{ build[0:4] }}" -zone: us-west4-c +zone: us-central1-a workspace: /workspace blueprint_yaml: "{{ workspace }}/examples/hpc-slurm.yaml" network: "{{ deployment_name }}-net" From e649b7a75a9295057c32cb78aa85ef92f3ee85fe Mon Sep 17 00:00:00 2001 From: Tom Downes Date: Tue, 15 Aug 2023 13:27:58 -0500 Subject: [PATCH 144/144] Bump version to v1.22.0 --- cmd/root.go | 2 +- community/modules/compute/gke-node-pool/versions.tf | 2 +- community/modules/compute/htcondor-execute-point/versions.tf | 2 +- .../compute/schedmd-slurm-gcp-v5-node-group/versions.tf | 2 +- .../compute/schedmd-slurm-gcp-v5-partition/versions.tf | 2 +- .../modules/database/slurm-cloudsql-federation/versions.tf | 4 ++-- .../modules/file-system/cloud-storage-bucket/versions.tf | 2 +- .../modules/file-system/gke-persistent-volume/versions.tf | 2 +- community/modules/file-system/nfs-server/versions.tf | 2 +- community/modules/project/service-enablement/versions.tf | 2 +- .../scheduler/SchedMD-slurm-on-gcp-controller/versions.tf | 2 +- .../scheduler/SchedMD-slurm-on-gcp-login-node/versions.tf | 2 +- community/modules/scheduler/gke-cluster/versions.tf | 2 +- community/modules/scheduler/htcondor-access-point/versions.tf | 2 +- .../modules/scheduler/htcondor-central-manager/versions.tf | 2 +- community/modules/scheduler/htcondor-pool-secrets/versions.tf | 2 +- community/modules/scheduler/htcondor-setup/versions.tf | 2 +- .../scheduler/schedmd-slurm-gcp-v5-controller/versions.tf | 2 +- .../modules/scheduler/schedmd-slurm-gcp-v5-login/versions.tf | 2 +- community/modules/scripts/wait-for-startup/versions.tf | 2 +- community/modules/scripts/windows-startup-script/versions.tf | 2 +- modules/compute/vm-instance/versions.tf | 4 ++-- modules/file-system/filestore/versions.tf | 4 ++-- modules/monitoring/dashboard/versions.tf | 2 +- modules/network/pre-existing-vpc/versions.tf | 2 +- modules/scheduler/batch-login-node/versions.tf | 2 +- modules/scripts/startup-script/versions.tf | 2 +- 27 files changed, 30 insertions(+), 30 deletions(-) 
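A standalone recap of the `-1` ("unlimited") sentinel rules from patch 138's `satisfied` check and `SUM` aggregation: any `-1` in the requested amounts makes the aggregate unlimited, an unlimited limit accepts anything, and an unlimited request only fits an unlimited limit. This is a minimal sketch with local stand-in names, since those helpers are unexported in `pkg/validators`.

```go
// Local mirror of the -1 sentinel semantics in pkg/validators/quota.go.
package main

import "fmt"

// sumRequired folds requirements like the validator's "SUM" aggregation:
// any -1 (unlimited request) makes the whole sum -1.
func sumRequired(vs []int64) int64 {
	var sum int64
	for _, v := range vs {
		if v == -1 {
			return -1
		}
		sum += v
	}
	return sum
}

// satisfied matches the validator's rule: a -1 limit accepts anything,
// and a -1 request only fits a -1 limit.
func satisfied(requested, limit int64) bool {
	if limit == -1 {
		return true
	}
	if requested == -1 {
		return false
	}
	return requested <= limit
}

func main() {
	req := sumRequired([]int64{300, 250})
	fmt.Println(req, satisfied(req, 500)) // 550 false -> would produce a QuotaError
	fmt.Println(satisfied(req, -1))       // true: an unlimited bucket satisfies it
}
```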
diff --git a/cmd/root.go b/cmd/root.go index 774235475f..25b0b8636d 100644 --- a/cmd/root.go +++ b/cmd/root.go @@ -50,7 +50,7 @@ HPC deployments on the Google Cloud Platform.`, log.Fatalf("cmd.Help function failed: %s", err) } }, - Version: "v1.21.0", + Version: "v1.22.0", Annotations: annotation, } ) diff --git a/community/modules/compute/gke-node-pool/versions.tf b/community/modules/compute/gke-node-pool/versions.tf index d1b661c298..bf1075fac0 100644 --- a/community/modules/compute/gke-node-pool/versions.tf +++ b/community/modules/compute/gke-node-pool/versions.tf @@ -26,6 +26,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:gke-node-pool/v1.21.0" + module_name = "blueprints/terraform/hpc-toolkit:gke-node-pool/v1.22.0" } } diff --git a/community/modules/compute/htcondor-execute-point/versions.tf b/community/modules/compute/htcondor-execute-point/versions.tf index f06d695674..29cbb2bedb 100644 --- a/community/modules/compute/htcondor-execute-point/versions.tf +++ b/community/modules/compute/htcondor-execute-point/versions.tf @@ -25,6 +25,6 @@ terraform { } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:htcondor-execute-point/v1.20.0" + module_name = "blueprints/terraform/hpc-toolkit:htcondor-execute-point/v1.22.0" } } diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/versions.tf b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/versions.tf index 893e8d7b8a..f7d9080d0c 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-node-group/versions.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-node-group/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-node-group/v1.21.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-node-group/v1.22.0" } required_version = ">= 1.1" } diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-partition/versions.tf b/community/modules/compute/schedmd-slurm-gcp-v5-partition/versions.tf index 426c47e559..1877215899 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-partition/versions.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-partition/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-partition/v1.21.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-partition/v1.22.0" } required_version = ">= 0.13.0" } diff --git a/community/modules/database/slurm-cloudsql-federation/versions.tf b/community/modules/database/slurm-cloudsql-federation/versions.tf index 7dddce45b6..87d2f53509 100644 --- a/community/modules/database/slurm-cloudsql-federation/versions.tf +++ b/community/modules/database/slurm-cloudsql-federation/versions.tf @@ -30,10 +30,10 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:slurm-cloudsql-federation/v1.21.0" + module_name = "blueprints/terraform/hpc-toolkit:slurm-cloudsql-federation/v1.22.0" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:slurm-cloudsql-federation/v1.21.0" + module_name = "blueprints/terraform/hpc-toolkit:slurm-cloudsql-federation/v1.22.0" } required_version = ">= 0.13.0" diff --git a/community/modules/file-system/cloud-storage-bucket/versions.tf b/community/modules/file-system/cloud-storage-bucket/versions.tf index e026df62da..0ebe0f769f 100644 --- 
a/community/modules/file-system/cloud-storage-bucket/versions.tf +++ b/community/modules/file-system/cloud-storage-bucket/versions.tf @@ -26,7 +26,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:cloud-storage-bucket/v1.21.0" + module_name = "blueprints/terraform/hpc-toolkit:cloud-storage-bucket/v1.22.0" } required_version = ">= 0.14.0" } diff --git a/community/modules/file-system/gke-persistent-volume/versions.tf b/community/modules/file-system/gke-persistent-volume/versions.tf index c707ec31ff..d6224a215d 100644 --- a/community/modules/file-system/gke-persistent-volume/versions.tf +++ b/community/modules/file-system/gke-persistent-volume/versions.tf @@ -29,6 +29,6 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:gke-persistent-volume/v1.21.0" + module_name = "blueprints/terraform/hpc-toolkit:gke-persistent-volume/v1.22.0" } } diff --git a/community/modules/file-system/nfs-server/versions.tf b/community/modules/file-system/nfs-server/versions.tf index ea1ffc1421..f0b8391d26 100644 --- a/community/modules/file-system/nfs-server/versions.tf +++ b/community/modules/file-system/nfs-server/versions.tf @@ -26,7 +26,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:nfs-server/v1.21.0" + module_name = "blueprints/terraform/hpc-toolkit:nfs-server/v1.22.0" } required_version = ">= 0.14.0" diff --git a/community/modules/project/service-enablement/versions.tf b/community/modules/project/service-enablement/versions.tf index 6efc1c66bc..f062b31f4f 100644 --- a/community/modules/project/service-enablement/versions.tf +++ b/community/modules/project/service-enablement/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:service-enablement/v1.21.0" + module_name = "blueprints/terraform/hpc-toolkit:service-enablement/v1.22.0" } required_version = ">= 0.14.0" diff --git a/community/modules/scheduler/SchedMD-slurm-on-gcp-controller/versions.tf b/community/modules/scheduler/SchedMD-slurm-on-gcp-controller/versions.tf index 8b10839380..63a4e854d7 100644 --- a/community/modules/scheduler/SchedMD-slurm-on-gcp-controller/versions.tf +++ b/community/modules/scheduler/SchedMD-slurm-on-gcp-controller/versions.tf @@ -16,7 +16,7 @@ terraform { provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:SchedMD-slurm-on-gcp-controller/v1.21.0" + module_name = "blueprints/terraform/hpc-toolkit:SchedMD-slurm-on-gcp-controller/v1.22.0" } required_version = ">= 0.14.0" diff --git a/community/modules/scheduler/SchedMD-slurm-on-gcp-login-node/versions.tf b/community/modules/scheduler/SchedMD-slurm-on-gcp-login-node/versions.tf index 28d742fddd..7e3810c031 100644 --- a/community/modules/scheduler/SchedMD-slurm-on-gcp-login-node/versions.tf +++ b/community/modules/scheduler/SchedMD-slurm-on-gcp-login-node/versions.tf @@ -16,7 +16,7 @@ terraform { provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:SchedMD-slurm-on-gcp-login-node/v1.21.0" + module_name = "blueprints/terraform/hpc-toolkit:SchedMD-slurm-on-gcp-login-node/v1.22.0" } required_version = ">= 0.14.0" diff --git a/community/modules/scheduler/gke-cluster/versions.tf b/community/modules/scheduler/gke-cluster/versions.tf index 8bf11e17e0..a37e410392 100644 --- a/community/modules/scheduler/gke-cluster/versions.tf +++ b/community/modules/scheduler/gke-cluster/versions.tf @@ -26,6 +26,6 @@ terraform { } } provider_meta "google" { - 
module_name = "blueprints/terraform/hpc-toolkit:gke-cluster/v1.21.0" + module_name = "blueprints/terraform/hpc-toolkit:gke-cluster/v1.22.0" } } diff --git a/community/modules/scheduler/htcondor-access-point/versions.tf b/community/modules/scheduler/htcondor-access-point/versions.tf index f2575f3746..27022c1d8a 100644 --- a/community/modules/scheduler/htcondor-access-point/versions.tf +++ b/community/modules/scheduler/htcondor-access-point/versions.tf @@ -26,7 +26,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:htcondor-access-point/v1.20.0" + module_name = "blueprints/terraform/hpc-toolkit:htcondor-access-point/v1.22.0" } required_version = ">= 1.1" diff --git a/community/modules/scheduler/htcondor-central-manager/versions.tf b/community/modules/scheduler/htcondor-central-manager/versions.tf index 4f8fd8952a..aa41c5c232 100644 --- a/community/modules/scheduler/htcondor-central-manager/versions.tf +++ b/community/modules/scheduler/htcondor-central-manager/versions.tf @@ -26,7 +26,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:htcondor-central-manager/v1.20.0" + module_name = "blueprints/terraform/hpc-toolkit:htcondor-central-manager/v1.22.0" } required_version = ">= 1.1.0" diff --git a/community/modules/scheduler/htcondor-pool-secrets/versions.tf b/community/modules/scheduler/htcondor-pool-secrets/versions.tf index 37bb1a846b..7b395e71e7 100644 --- a/community/modules/scheduler/htcondor-pool-secrets/versions.tf +++ b/community/modules/scheduler/htcondor-pool-secrets/versions.tf @@ -26,7 +26,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:htcondor-pool-secrets/v1.21.0" + module_name = "blueprints/terraform/hpc-toolkit:htcondor-pool-secrets/v1.22.0" } required_version = ">= 0.13.0" diff --git a/community/modules/scheduler/htcondor-setup/versions.tf b/community/modules/scheduler/htcondor-setup/versions.tf index 115c5f50d6..102f526dda 100644 --- a/community/modules/scheduler/htcondor-setup/versions.tf +++ b/community/modules/scheduler/htcondor-setup/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:htcondor-setup/v1.20.0" + module_name = "blueprints/terraform/hpc-toolkit:htcondor-setup/v1.22.0" } required_version = ">= 0.13.0" diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/versions.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/versions.tf index a6967fc2c1..620fb57432 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/versions.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-controller/v1.21.0" + module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-controller/v1.22.0" } required_version = ">= 1.1" } diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/versions.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/versions.tf index 94d8a67b78..7ff11852c9 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/versions.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-login/v1.21.0" + module_name = 
"blueprints/terraform/hpc-toolkit:schedmd-slurm-gcp-v5-login/v1.22.0" } required_version = ">= 1.1" } diff --git a/community/modules/scripts/wait-for-startup/versions.tf b/community/modules/scripts/wait-for-startup/versions.tf index 962d1c04cf..c17f5d40f1 100644 --- a/community/modules/scripts/wait-for-startup/versions.tf +++ b/community/modules/scripts/wait-for-startup/versions.tf @@ -26,7 +26,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:wait-for-startup/v1.21.0" + module_name = "blueprints/terraform/hpc-toolkit:wait-for-startup/v1.22.0" } required_version = ">= 0.14.0" diff --git a/community/modules/scripts/windows-startup-script/versions.tf b/community/modules/scripts/windows-startup-script/versions.tf index 1622d8adb3..ff9dfc80cd 100644 --- a/community/modules/scripts/windows-startup-script/versions.tf +++ b/community/modules/scripts/windows-startup-script/versions.tf @@ -16,7 +16,7 @@ terraform { provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:windows-startup-script/v1.21.0" + module_name = "blueprints/terraform/hpc-toolkit:windows-startup-script/v1.22.0" } required_version = ">= 0.14.0" diff --git a/modules/compute/vm-instance/versions.tf b/modules/compute/vm-instance/versions.tf index 3867f90916..238efc62db 100644 --- a/modules/compute/vm-instance/versions.tf +++ b/modules/compute/vm-instance/versions.tf @@ -31,10 +31,10 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:vm-instance/v1.21.0" + module_name = "blueprints/terraform/hpc-toolkit:vm-instance/v1.22.0" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:vm-instance/v1.21.0" + module_name = "blueprints/terraform/hpc-toolkit:vm-instance/v1.22.0" } required_version = ">= 1.2.0" diff --git a/modules/file-system/filestore/versions.tf b/modules/file-system/filestore/versions.tf index c61a8b25d0..aed8db9082 100644 --- a/modules/file-system/filestore/versions.tf +++ b/modules/file-system/filestore/versions.tf @@ -26,10 +26,10 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:filestore/v1.21.0" + module_name = "blueprints/terraform/hpc-toolkit:filestore/v1.22.0" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:filestore/v1.21.0" + module_name = "blueprints/terraform/hpc-toolkit:filestore/v1.22.0" } required_version = ">= 0.14.0" diff --git a/modules/monitoring/dashboard/versions.tf b/modules/monitoring/dashboard/versions.tf index 76a95a516d..abdebe9543 100644 --- a/modules/monitoring/dashboard/versions.tf +++ b/modules/monitoring/dashboard/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:dashboard/v1.21.0" + module_name = "blueprints/terraform/hpc-toolkit:dashboard/v1.22.0" } required_version = ">= 0.14.0" diff --git a/modules/network/pre-existing-vpc/versions.tf b/modules/network/pre-existing-vpc/versions.tf index 206bcc49f5..48a0fab6d6 100644 --- a/modules/network/pre-existing-vpc/versions.tf +++ b/modules/network/pre-existing-vpc/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:pre-existing-vpc/v1.21.0" + module_name = "blueprints/terraform/hpc-toolkit:pre-existing-vpc/v1.22.0" } required_version = ">= 0.14.0" diff --git a/modules/scheduler/batch-login-node/versions.tf b/modules/scheduler/batch-login-node/versions.tf index e61a105f01..f9a133c625 100644 --- 
a/modules/scheduler/batch-login-node/versions.tf +++ b/modules/scheduler/batch-login-node/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:batch-login-node/v1.21.0" + module_name = "blueprints/terraform/hpc-toolkit:batch-login-node/v1.22.0" } required_version = ">= 0.14.0" diff --git a/modules/scripts/startup-script/versions.tf b/modules/scripts/startup-script/versions.tf index 6c57077672..4c51e8d513 100644 --- a/modules/scripts/startup-script/versions.tf +++ b/modules/scripts/startup-script/versions.tf @@ -30,7 +30,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:startup-script/v1.21.0" + module_name = "blueprints/terraform/hpc-toolkit:startup-script/v1.22.0" } required_version = ">= 0.14.0"
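Taken together, patches 138 and 140 let a caller express required capacity per metric and check it against effective limits. Building on the tested snippet in patch 138's commit message, the sketch below exercises the exported surface end to end. The import path is assumed from the coverage script's `hpc-toolkit/pkg/validators`, the project ID is hypothetical, and Application Default Credentials are needed because `ValidateQuotas` reads the Service Usage API.

```go
// A sketch of driving the quota validator end to end; import path and
// project ID are assumptions, and real GCP credentials are required.
package main

import (
	"fmt"
	"log"

	"hpc-toolkit/pkg/validators"
)

func main() {
	reqs := []validators.ResourceRequirement{
		{
			Consumer:    "projects/my-project", // hypothetical project
			Service:     "compute.googleapis.com",
			Metric:      "compute.googleapis.com/disks_total_storage",
			Required:    900000,
			Dimensions:  map[string]string{"region": "us-central1"},
			Aggregation: "SUM",
		},
	}
	quotaErrs, err := validators.ValidateQuotas(reqs)
	if err != nil {
		log.Fatal(err) // API or limit-lookup failure, not a quota shortfall
	}
	for _, qe := range quotaErrs {
		fmt.Println(qe.Error()) // each names a bucket whose EffectiveLimit < Requested
	}
}
```

Note that `ValidateQuotas` separates lookup failures (the `error` return) from genuine shortfalls (the `[]QuotaError` slice), so a caller can surface quota problems as warnings without aborting on transient API errors.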