diff --git a/Makefile b/Makefile
index 59c4b9b3b6..7842de65ee 100644
--- a/Makefile
+++ b/Makefile
@@ -14,11 +14,21 @@ ENG = ./cmd/... ./pkg/...
 TERRAFORM_FOLDERS=$(shell find ./modules ./community/modules ./tools -type f -name "*.tf" -not -path '*/\.*' -exec dirname "{}" \; | sort -u)
 PACKER_FOLDERS=$(shell find ./modules ./community/modules ./tools -type f -name "*.pkr.hcl" -not -path '*/\.*' -exec dirname "{}" \; | sort -u)
 
+ifneq (, $(shell which git))
+## GIT IS PRESENT
+ifneq (,$(wildcard .git))
+## GIT DIRECTORY EXISTS
+GIT_TAG_VERSION=$(shell git tag --points-at HEAD)
+GIT_BRANCH=$(shell git branch --show-current)
+GIT_COMMIT_INFO=$(shell git describe --tags --dirty --long)
+endif
+endif
+
 # RULES MEANT TO BE USED DIRECTLY
 
 ghpc: warn-go-version warn-terraform-version warn-packer-version $(shell find ./cmd ./pkg ghpc.go -type f)
 	$(info **************** building ghpc ************************)
-	go build ghpc.go
+	@go build -ldflags="-X 'main.gitTagVersion=$(GIT_TAG_VERSION)' -X 'main.gitBranch=$(GIT_BRANCH)' -X 'main.gitCommitInfo=$(GIT_COMMIT_INFO)'" ghpc.go
 
 install-user:
 	$(info ******** installing ghpc in ~/bin *********************)
diff --git a/README.md b/README.md
index 166764ad33..c48aef644c 100644
--- a/README.md
+++ b/README.md
@@ -204,6 +204,59 @@ In the right side, expand the Filters view and then filter by label, specifying
 
 ## Troubleshooting
 
+### Network is unreachable (Slurm V5)
+
+Slurm requires access to Google APIs to function. This access can be provided by one of the following methods:
+
+1. Create a [Cloud NAT](https://cloud.google.com/nat) (preferred).
+2. Set `disable_controller_public_ips: false` and
+   `disable_login_public_ips: false` on the controller and login nodes,
+   respectively.
+3. Enable
+   [private access to Google APIs](https://cloud.google.com/vpc/docs/private-access-options).
+
+By default the Toolkit VPC module creates an associated Cloud NAT, so this issue
+is typically seen when working with the pre-existing-vpc module. If no access
+exists, you will see the errors described below.
+
+When you SSH into the login node or controller you will see the following
+message:
+
+```text
+*** Slurm setup failed! Please view log: /slurm/scripts/setup.log ***
+```
+
+> **_NOTE:_** Many different issues could be indicated by the above message,
+> so be sure to verify the root cause in the logs.
+
+To confirm the issue, SSH into the controller and run `sudo cat /slurm/scripts/setup.log`. Look for
+the following log entries:
+
+```text
+google_metadata_script_runner: startup-script: ERROR: [Errno 101] Network is unreachable
+google_metadata_script_runner: startup-script: OSError: [Errno 101] Network is unreachable
+google_metadata_script_runner: startup-script: ERROR: Aborting setup...
+google_metadata_script_runner: startup-script exit status 0
+google_metadata_script_runner: Finished running startup scripts.
+```
+
+You may also notice mount failure logs on the login node:
+
+```text
+INFO: Waiting for '/usr/local/etc/slurm' to be mounted...
+INFO: Waiting for '/home' to be mounted...
+INFO: Waiting for '/opt/apps' to be mounted...
+INFO: Waiting for '/etc/munge' to be mounted...
+ERROR: mount of path '/usr/local/etc/slurm' failed: : Command '['mount', '/usr/local/etc/slurm']' returned non-zero exit status 32.
+ERROR: mount of path '/opt/apps' failed: : Command '['mount', '/opt/apps']' returned non-zero exit status 32.
+ERROR: mount of path '/home' failed: : Command '['mount', '/home']' returned non-zero exit status 32.
+ERROR: mount of path '/etc/munge' failed: : Command '['mount', '/etc/munge']' returned non-zero exit status 32.
+```
+
+> **_NOTE:_** The above logs only indicate that something went wrong during
+> startup of the controller. Check the logs on the controller to confirm that
+> it is a network issue.
+
 ### Failure to Create Auto Scale Nodes (Slurm)
 
 If your deployment succeeds but your jobs fail with the following error:
diff --git a/cmd/root.go b/cmd/root.go
index 0963e7afe2..1a99a4f59d 100644
--- a/cmd/root.go
+++ b/cmd/root.go
@@ -23,8 +23,16 @@ import (
 	"github.com/spf13/cobra"
 )
 
+// Git references, set at build time by the Makefile
 var (
-	rootCmd = &cobra.Command{
+	GitTagVersion string
+	GitBranch     string
+	GitCommitInfo string
+)
+
+var (
+	annotation = make(map[string]string)
+	rootCmd    = &cobra.Command{
 		Use:   "ghpc",
 		Short: "A blueprint and deployment engine for HPC clusters in GCP.",
 		Long: `gHPC provides a flexible and simple to use interface to accelerate
@@ -34,12 +42,28 @@ HPC deployments on the Google Cloud Platform.`,
 				log.Fatalf("cmd.Help function failed: %s", err)
 			}
 		},
-		Version: "v1.5.0",
+		Version:     "v1.6.0",
+		Annotations: annotation,
 	}
 )
 
 // Execute the root command
 func Execute() error {
+	if len(GitCommitInfo) > 0 {
+		if len(GitTagVersion) == 0 {
+			GitTagVersion = "- not built from official release"
+		}
+		if len(GitBranch) == 0 {
+			GitBranch = "detached HEAD"
+		}
+		annotation["version"] = GitTagVersion
+		annotation["branch"] = GitBranch
+		annotation["commitInfo"] = GitCommitInfo
+		rootCmd.SetVersionTemplate(`ghpc version {{index .Annotations "version"}}
+Built from '{{index .Annotations "branch"}}' branch.
+Commit info: {{index .Annotations "commitInfo"}}
+`)
+	}
 	return rootCmd.Execute()
 }
diff --git a/community/examples/AMD/README.md b/community/examples/AMD/README.md
index 1e600c6d3c..05bd740a3f 100644
--- a/community/examples/AMD/README.md
+++ b/community/examples/AMD/README.md
@@ -75,7 +75,7 @@ remounted and that you should logout and login. Follow its instructions.
 Once configuration is complete, install AOCC by running:
 
 ```shell
-sudo -i bash /var/tmp/install_aocc.sh
+sudo bash /var/tmp/install_aocc.sh
 ```
 
 Spack will prompt you to accept the AOCC End User License Agreement by opening a
@@ -83,12 +83,7 @@ text file containing information about the license. Leave the file unmodified
 and write it to disk by typing `:q` as two characters in sequence
 ([VI help][vihelp]).
 
-Installation of AOCC and OpenMPI will take approximately 15 minutes. Once they
-are installed, you can install additional packages such as `amdblis`:
-
-```shell
-sudo -i spack -d install -v amdblis %aocc@3.2.0
-```
+Installation of AOCC and OpenMPI will take approximately 15 minutes.
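+
+To confirm the toolchain before proceeding, a quick check along the following
+lines should list the AOCC compiler and the installed packages (this assumes
+Spack is available from the root login shell, as in the steps above):
+
+```shell
+# compilers registered with Spack; aocc@3.2.0 should appear
+sudo -i spack compiler list
+# installed packages; openmpi should appear once its install completes
+sudo -i spack find
+```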
Configure SSH user keys for access between cluster nodes: diff --git a/community/examples/AMD/hpc-cluster-amd-slurmv5.yaml b/community/examples/AMD/hpc-cluster-amd-slurmv5.yaml index e6e06ed895..0c8164151c 100644 --- a/community/examples/AMD/hpc-cluster-amd-slurmv5.yaml +++ b/community/examples/AMD/hpc-cluster-amd-slurmv5.yaml @@ -65,8 +65,25 @@ deployment_groups: - type: shell source: modules/startup-script/examples/install_ansible.sh destination: install_ansible.sh + - $(swfs.install_nfs_client_runner) + - $(swfs.mount_runner) - $(spack.install_spack_deps_runner) - $(spack.install_spack_runner) + - type: shell + content: "shutdown -h +1" + destination: shutdown.sh + + - id: slurm_startup + source: modules/scripts/startup-script + settings: + runners: + - type: data + destination: /etc/profile.d/spack.sh + content: | + #!/bin/sh + if [ -f /sw/spack/share/spack/setup-env.sh ]; then + . /sw/spack/share/spack/setup-env.sh + fi # the following installation of AOCC may be automated in the future # with a clear direction to the user to read the EULA at # https://developer.amd.com/aocc-compiler-eula/ @@ -74,11 +91,20 @@ deployment_groups: destination: /var/tmp/install_aocc.sh content: | #!/bin/bash + source /sw/spack/share/spack/setup-env.sh spack install aocc@3.2.0 +license-agreed spack load aocc@3.2.0 spack compiler find --scope site spack -d install -v openmpi@4.1.3 %aocc@3.2.0 +legacylaunchers +pmi schedulers=slurm + # must restart vm to re-initiate subsequent installs + - id: spack_builder + source: modules/compute/vm-instance + use: [network1, swfs, spack-startup] + settings: + name_prefix: spack-builder + machine_type: c2d-standard-16 + - id: low_cost_partition source: community/modules/compute/schedmd-slurm-gcp-v5-partition use: @@ -118,6 +144,6 @@ deployment_groups: use: - network1 - slurm_controller - - spack-startup + - slurm_startup settings: machine_type: c2d-standard-4 diff --git a/community/examples/cloud-batch.yaml b/community/examples/cloud-batch.yaml index 9e5c10e5a6..a244781950 100644 --- a/community/examples/cloud-batch.yaml +++ b/community/examples/cloud-batch.yaml @@ -29,17 +29,14 @@ deployment_groups: modules: - id: network1 source: modules/network/pre-existing-vpc - kind: terraform - id: appfs source: modules/file-system/filestore - kind: terraform use: [network1] settings: {local_mount: /sw} - id: hello-startup-script source: modules/scripts/startup-script - kind: terraform settings: runners: - type: shell @@ -55,7 +52,6 @@ deployment_groups: - id: batch-job source: community/modules/scheduler/cloud-batch-job - kind: terraform use: [network1, appfs, hello-startup-script] settings: runnable: "cat /sw/hello.txt" @@ -66,6 +62,5 @@ deployment_groups: - id: batch-login source: community/modules/scheduler/cloud-batch-login-node - kind: terraform use: [batch-job] outputs: [instructions] diff --git a/community/examples/hpc-cluster-small-sharedvpc.yaml b/community/examples/hpc-cluster-small-sharedvpc.yaml index e70e22ba80..c1b920f1d4 100644 --- a/community/examples/hpc-cluster-small-sharedvpc.yaml +++ b/community/examples/hpc-cluster-small-sharedvpc.yaml @@ -43,7 +43,6 @@ deployment_groups: modules: - id: network1 source: modules/network/pre-existing-vpc - kind: terraform settings: project_id: $(vars.host_project_id) network_name: your-shared-network @@ -51,7 +50,6 @@ deployment_groups: - id: homefs source: modules/file-system/filestore - kind: terraform use: [network1] settings: local_mount: /home @@ -61,7 +59,6 @@ deployment_groups: # This debug_partition will work out of the 
box without requesting additional GCP quota. - id: debug_partition source: community/modules/compute/SchedMD-slurm-on-gcp-partition - kind: terraform use: - network1 - homefs @@ -75,7 +72,6 @@ deployment_groups: # This compute_partition is far more performant than debug_partition but may require requesting GCP quotas first. - id: compute_partition source: community/modules/compute/SchedMD-slurm-on-gcp-partition - kind: terraform use: - network1 - homefs @@ -85,7 +81,6 @@ deployment_groups: - id: slurm_controller source: community/modules/scheduler/SchedMD-slurm-on-gcp-controller - kind: terraform use: - network1 - homefs @@ -97,7 +92,6 @@ deployment_groups: - id: slurm_login source: community/modules/scheduler/SchedMD-slurm-on-gcp-login-node - kind: terraform use: - network1 - homefs diff --git a/community/examples/htcondor-pool.yaml b/community/examples/htcondor-pool.yaml index f4d82168cd..b12b00e8d1 100644 --- a/community/examples/htcondor-pool.yaml +++ b/community/examples/htcondor-pool.yaml @@ -29,7 +29,6 @@ deployment_groups: modules: - id: network1 source: modules/network/vpc - kind: terraform settings: network_name: htcondor-pool subnetwork_name: htcondor-pool-usc1 @@ -38,21 +37,17 @@ deployment_groups: - id: htcondor_install source: community/modules/scripts/htcondor-install - kind: terraform - id: htcondor_services source: community/modules/project/service-enablement - kind: terraform use: - htcondor_install - id: htcondor_configure source: community/modules/scheduler/htcondor-configure - kind: terraform - id: htcondor_configure_central_manager source: modules/scripts/startup-script - kind: terraform settings: runners: - type: shell @@ -63,7 +58,6 @@ deployment_groups: - id: htcondor_cm source: modules/compute/vm-instance - kind: terraform use: - network1 - htcondor_configure_central_manager @@ -80,7 +74,6 @@ deployment_groups: - id: htcondor_configure_execute_point source: modules/scripts/startup-script - kind: terraform settings: runners: - type: shell @@ -91,7 +84,6 @@ deployment_groups: - id: htcondor_execute_point source: community/modules/compute/htcondor-execute-point - kind: terraform use: - network1 - htcondor_configure_execute_point @@ -106,7 +98,6 @@ deployment_groups: - id: htcondor_configure_access_point source: modules/scripts/startup-script - kind: terraform settings: runners: - type: shell @@ -130,7 +121,6 @@ deployment_groups: queue - id: htcondor_access source: modules/compute/vm-instance - kind: terraform use: - network1 - htcondor_configure_access_point diff --git a/community/examples/intel/daos-cluster.yaml b/community/examples/intel/daos-cluster.yaml index 0fab4f4431..f930d980ff 100644 --- a/community/examples/intel/daos-cluster.yaml +++ b/community/examples/intel/daos-cluster.yaml @@ -30,14 +30,12 @@ deployment_groups: modules: - id: network1 source: modules/network/pre-existing-vpc - kind: terraform # This module creates a DAOS server. Server images MUST be created before running this. 
# https://github.com/daos-stack/google-cloud-daos/tree/main/images # more info: https://github.com/daos-stack/google-cloud-daos/tree/main/terraform/modules/daos_server - id: daos-server source: github.com/daos-stack/google-cloud-daos.git//terraform/modules/daos_server?ref=v0.2.1 - kind: terraform use: [network1] settings: number_of_instances: 2 @@ -48,7 +46,6 @@ deployment_groups: # more info: https://github.com/daos-stack/google-cloud-daos/tree/main/terraform/modules/daos_client - id: daos-client source: github.com/daos-stack/google-cloud-daos.git//terraform/modules/daos_client?ref=v0.2.1 - kind: terraform use: [network1, daos-server] settings: number_of_instances: 2 diff --git a/community/examples/intel/daos-slurm.yaml b/community/examples/intel/daos-slurm.yaml index beb5598b3b..b392a23ebb 100644 --- a/community/examples/intel/daos-slurm.yaml +++ b/community/examples/intel/daos-slurm.yaml @@ -30,11 +30,9 @@ deployment_groups: modules: - id: network1 source: modules/network/pre-existing-vpc - kind: terraform - id: homefs source: modules/file-system/filestore - kind: terraform use: [network1] settings: local_mount: "/home" @@ -44,7 +42,6 @@ deployment_groups: # more info: https://github.com/daos-stack/google-cloud-daos/tree/main/terraform/modules/daos_server - id: daos source: github.com/daos-stack/google-cloud-daos.git//terraform/modules/daos_server?ref=v0.2.1 - kind: terraform use: [network1] settings: labels: {ghpc_role: file-system} @@ -70,7 +67,6 @@ deployment_groups: - id: daos-client-script source: modules/scripts/startup-script - kind: terraform settings: runners: - type: shell @@ -89,7 +85,6 @@ deployment_groups: ## This debug_partition will work out of the box without requesting additional GCP quota. - id: debug_partition source: community/modules/compute/SchedMD-slurm-on-gcp-partition - kind: terraform use: - network1 - homefs @@ -102,7 +97,6 @@ deployment_groups: # This compute_partition is far more performant than debug_partition but may require requesting GCP quotas first. - id: compute_partition source: community/modules/compute/SchedMD-slurm-on-gcp-partition - kind: terraform use: - network1 - homefs @@ -112,7 +106,6 @@ deployment_groups: - id: slurm_controller source: community/modules/scheduler/SchedMD-slurm-on-gcp-controller - kind: terraform use: - network1 - homefs @@ -129,7 +122,6 @@ deployment_groups: - id: slurm_login source: community/modules/scheduler/SchedMD-slurm-on-gcp-login-node - kind: terraform use: - network1 - homefs diff --git a/community/examples/intel/hpc-cluster-intel-select.yaml b/community/examples/intel/hpc-cluster-intel-select.yaml index 6e6372a855..962fcc5c5d 100644 --- a/community/examples/intel/hpc-cluster-intel-select.yaml +++ b/community/examples/intel/hpc-cluster-intel-select.yaml @@ -33,10 +33,8 @@ deployment_groups: modules: - id: network1 source: modules/network/vpc - kind: terraform - id: startup_controller source: modules/scripts/startup-script - kind: terraform settings: runners: - type: shell @@ -49,7 +47,6 @@ deployment_groups: - startup_script - id: startup_compute source: modules/scripts/startup-script - kind: terraform settings: runners: - type: shell @@ -98,10 +95,8 @@ deployment_groups: modules: - id: cluster-network source: modules/network/pre-existing-vpc - kind: terraform - id: homefs source: modules/file-system/filestore - kind: terraform use: - cluster-network settings: @@ -109,7 +104,6 @@ deployment_groups: # This debug_partition will work out of the box without requesting additional GCP quota. 
- id: debug_partition source: community/modules/compute/SchedMD-slurm-on-gcp-partition - kind: terraform use: - cluster-network - homefs @@ -124,7 +118,6 @@ deployment_groups: project: $(vars.project_id) - id: compute_partition source: community/modules/compute/SchedMD-slurm-on-gcp-partition - kind: terraform use: - cluster-network - homefs @@ -137,7 +130,6 @@ deployment_groups: machine_type: c2-standard-60 - id: slurm_controller source: community/modules/scheduler/SchedMD-slurm-on-gcp-controller - kind: terraform use: - cluster-network - compute_partition @@ -150,7 +142,6 @@ deployment_groups: controller_machine_type: c2-standard-4 - id: slurm_login source: community/modules/scheduler/SchedMD-slurm-on-gcp-login-node - kind: terraform use: - cluster-network - slurm_controller diff --git a/community/examples/omnia-cluster.yaml b/community/examples/omnia-cluster.yaml index 9af3a6712b..655eb1a1f8 100644 --- a/community/examples/omnia-cluster.yaml +++ b/community/examples/omnia-cluster.yaml @@ -35,12 +35,10 @@ deployment_groups: ## Network - id: network source: modules/network/pre-existing-vpc - kind: terraform ## File Systems - id: homefs source: modules/file-system/filestore - kind: terraform use: [network] settings: local_mount: "/home" @@ -48,7 +46,6 @@ deployment_groups: ## Installation Scripts - id: omnia source: community/modules/scripts/omnia-install - kind: terraform outputs: [inventory_file, omnia_user_warning] settings: manager_ips: [localhost] @@ -56,7 +53,6 @@ deployment_groups: - id: startup-manager source: modules/scripts/startup-script - kind: terraform settings: runners: - type: shell @@ -70,7 +66,6 @@ deployment_groups: - id: startup-compute source: modules/scripts/startup-script - kind: terraform settings: runners: - type: shell @@ -83,7 +78,6 @@ deployment_groups: ## Compute - id: manager source: modules/compute/vm-instance - kind: terraform use: - network - homefs @@ -94,7 +88,6 @@ deployment_groups: - id: compute source: modules/compute/vm-instance - kind: terraform use: - network - homefs @@ -106,6 +99,5 @@ deployment_groups: # This module simply makes terraform wait until the startup script is complete - id: wait source: community/modules/scripts/wait-for-startup - kind: terraform settings: instance_name: ((module.manager.name[0])) diff --git a/community/examples/quantum-circuit-simulator.yaml b/community/examples/quantum-circuit-simulator.yaml index a23b897b66..736021a350 100644 --- a/community/examples/quantum-circuit-simulator.yaml +++ b/community/examples/quantum-circuit-simulator.yaml @@ -113,7 +113,8 @@ deployment_groups: - type: shell destination: run-qsim.sh content: | - #!/bin/bash + #!/bin/bash -i + # The -i above (for interactive) is required so that conda command will be accessible. 
# this script demonstrates how to run the qsim example application and # also "warms up" the GPU to give reliable performance metrics conda activate qsim diff --git a/community/examples/slurm-gcp-v5-hpc-centos7.yaml b/community/examples/slurm-gcp-v5-hpc-centos7.yaml index e913af43f7..14965bece7 100644 --- a/community/examples/slurm-gcp-v5-hpc-centos7.yaml +++ b/community/examples/slurm-gcp-v5-hpc-centos7.yaml @@ -33,18 +33,15 @@ deployment_groups: # Example - ./resources/network/vpc - id: network1 source: modules/network/vpc - kind: terraform - id: homefs source: modules/file-system/filestore - kind: terraform use: [network1] settings: local_mount: /home - id: debug_partition source: community/modules/compute/schedmd-slurm-gcp-v5-partition - kind: terraform use: - network1 - homefs @@ -57,7 +54,6 @@ deployment_groups: - id: compute_partition source: community/modules/compute/schedmd-slurm-gcp-v5-partition - kind: terraform use: - network1 - homefs @@ -67,7 +63,6 @@ deployment_groups: - id: slurm_controller source: community/modules/scheduler/schedmd-slurm-gcp-v5-controller - kind: terraform use: - network1 - debug_partition @@ -76,7 +71,6 @@ deployment_groups: - id: slurm_login source: community/modules/scheduler/schedmd-slurm-gcp-v5-login - kind: terraform use: - network1 - slurm_controller diff --git a/community/examples/slurm-gcp-v5-ubuntu2004.yaml b/community/examples/slurm-gcp-v5-ubuntu2004.yaml index f42e707147..956ab40270 100644 --- a/community/examples/slurm-gcp-v5-ubuntu2004.yaml +++ b/community/examples/slurm-gcp-v5-ubuntu2004.yaml @@ -33,18 +33,15 @@ deployment_groups: # Example - ./resources/network/vpc - id: network1 source: modules/network/vpc - kind: terraform - id: homefs source: modules/file-system/filestore - kind: terraform use: [network1] settings: local_mount: /home - id: debug_partition source: community/modules/compute/schedmd-slurm-gcp-v5-partition - kind: terraform use: - network1 - homefs @@ -57,7 +54,6 @@ deployment_groups: - id: compute_partition source: community/modules/compute/schedmd-slurm-gcp-v5-partition - kind: terraform use: - network1 - homefs @@ -67,7 +63,6 @@ deployment_groups: - id: slurm_controller source: community/modules/scheduler/schedmd-slurm-gcp-v5-controller - kind: terraform use: - network1 - debug_partition @@ -76,7 +71,6 @@ deployment_groups: - id: slurm_login source: community/modules/scheduler/schedmd-slurm-gcp-v5-login - kind: terraform use: - network1 - slurm_controller diff --git a/community/examples/spack-gromacs.yaml b/community/examples/spack-gromacs.yaml index ef499f81e9..d5cd2c062a 100644 --- a/community/examples/spack-gromacs.yaml +++ b/community/examples/spack-gromacs.yaml @@ -30,19 +30,16 @@ deployment_groups: modules: - id: network1 source: modules/network/pre-existing-vpc - kind: terraform ## Filesystems - id: appsfs source: modules/file-system/filestore - kind: terraform use: [network1] settings: local_mount: /sw - id: homefs source: modules/file-system/filestore - kind: terraform use: [network1] settings: local_mount: /home @@ -50,7 +47,6 @@ deployment_groups: ## Install Scripts - id: spack source: community/modules/scripts/spack-install - kind: terraform settings: install_dir: /sw/spack spack_url: https://github.com/spack/spack @@ -85,7 +81,6 @@ deployment_groups: - id: spack-startup source: modules/scripts/startup-script - kind: terraform settings: runners: - type: shell @@ -96,7 +91,6 @@ deployment_groups: - id: compute_partition source: community/modules/compute/SchedMD-slurm-on-gcp-partition - kind: terraform use: - 
network1 - homefs @@ -107,7 +101,6 @@ deployment_groups: - id: slurm_controller source: community/modules/scheduler/SchedMD-slurm-on-gcp-controller - kind: terraform use: - network1 - homefs @@ -118,7 +111,6 @@ deployment_groups: - id: slurm_login source: community/modules/scheduler/SchedMD-slurm-on-gcp-login-node - kind: terraform use: - network1 - homefs diff --git a/community/modules/compute/SchedMD-slurm-on-gcp-partition/README.md b/community/modules/compute/SchedMD-slurm-on-gcp-partition/README.md index 6b9d45500e..3f8d3f5bd4 100644 --- a/community/modules/compute/SchedMD-slurm-on-gcp-partition/README.md +++ b/community/modules/compute/SchedMD-slurm-on-gcp-partition/README.md @@ -20,7 +20,6 @@ The following code snippet creates a partition module with: ```yaml - id: compute_partition source: community/modules/compute/SchedMD-slurm-on-gcp-partition - kind: terraform use: [network1, homefs] settings: max_node_count: 200 diff --git a/community/modules/compute/SchedMD-slurm-on-gcp-partition/versions.tf b/community/modules/compute/SchedMD-slurm-on-gcp-partition/versions.tf index 056ac4edc4..7688479f49 100644 --- a/community/modules/compute/SchedMD-slurm-on-gcp-partition/versions.tf +++ b/community/modules/compute/SchedMD-slurm-on-gcp-partition/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:SchedMD-slurm-on-gcp-partition/v1.5.0" + module_name = "blueprints/terraform/hpc-toolkit:SchedMD-slurm-on-gcp-partition/v1.6.0" } required_version = ">= 0.14.0" diff --git a/community/modules/compute/htcondor-execute-point/README.md b/community/modules/compute/htcondor-execute-point/README.md index a54a4e3234..7476b5c1a6 100644 --- a/community/modules/compute/htcondor-execute-point/README.md +++ b/community/modules/compute/htcondor-execute-point/README.md @@ -24,7 +24,6 @@ a startup script and network created in previous steps. ```yaml - id: htcondor_execute_point source: community/modules/compute/htcondor-execute-point - kind: terraform use: - network1 - htcondor_configure_execute_point diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-partition/README.md b/community/modules/compute/schedmd-slurm-gcp-v5-partition/README.md index d977d584fd..cc62509377 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-partition/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v5-partition/README.md @@ -20,7 +20,6 @@ The following code snippet creates a partition module with: ```yaml - id: compute_partition source: community/modules/compute/schedmd-slurm-gcp-v5-partition - kind: terraform use: - network1 - homefs @@ -85,7 +84,7 @@ No resources. | [bandwidth\_tier](#input\_bandwidth\_tier) | Configures the network interface card and the maximum egress bandwidth for VMs.
- Setting `platform_default` respects the Google Cloud Platform API default values for networking.
- Setting `virtio_enabled` explicitly selects the VirtioNet network adapter.
- Setting `gvnic_enabled` selects the gVNIC network adapter (without Tier 1 high bandwidth).
- Setting `tier_1_enabled` selects both the gVNIC adapter and Tier 1 high bandwidth networking.
- Note: both gVNIC and Tier 1 networking require a VM image with gVNIC support as well as specific VM families and shapes.
- See [official docs](https://cloud.google.com/compute/docs/networking/configure-vm-with-high-bandwidth-configuration) for more details. | `string` | `"platform_default"` | no | | [can\_ip\_forward](#input\_can\_ip\_forward) | Enable IP forwarding, for NAT instances for example. | `bool` | `false` | no | | [deployment\_name](#input\_deployment\_name) | Name of the deployment. | `string` | n/a | yes | -| [disable\_smt](#input\_disable\_smt) | Disables Simultaneous Multi-Threading (SMT) on instance. | `bool` | `false` | no | +| [disable\_smt](#input\_disable\_smt) | Disables Simultaneous Multi-Threading (SMT) on instance. | `bool` | `true` | no | | [disk\_auto\_delete](#input\_disk\_auto\_delete) | Whether or not the boot disk should be auto-deleted. | `bool` | `true` | no | | [disk\_size\_gb](#input\_disk\_size\_gb) | Size of boot disk to create for the partition compute nodes. | `number` | `50` | no | | [disk\_type](#input\_disk\_type) | Boot disk type, can be either pd-ssd, local-ssd, or pd-standard. | `string` | `"pd-standard"` | no | diff --git a/community/modules/compute/schedmd-slurm-gcp-v5-partition/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v5-partition/variables.tf index 03230e5c1f..c4dc33c8ec 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v5-partition/variables.tf +++ b/community/modules/compute/schedmd-slurm-gcp-v5-partition/variables.tf @@ -231,7 +231,7 @@ variable "can_ip_forward" { variable "disable_smt" { type = bool description = "Disables Simultaneous Multi-Threading (SMT) on instance." - default = false + default = true } variable "labels" { diff --git a/community/modules/database/slurm-cloudsql-federation/README.md b/community/modules/database/slurm-cloudsql-federation/README.md index f19cf1353a..7ca93be525 100644 --- a/community/modules/database/slurm-cloudsql-federation/README.md +++ b/community/modules/database/slurm-cloudsql-federation/README.md @@ -12,7 +12,6 @@ accounting data storage. 
 ```yaml
 - id: project
   source: community/modules/database/cloudsql-federation
-  kind: terraform
   use: [network1]
   settings:
     sql_instance_name: slurm-sql6-demo
diff --git a/community/modules/database/slurm-cloudsql-federation/versions.tf b/community/modules/database/slurm-cloudsql-federation/versions.tf
index cf1f5fe1cb..83fc3bcb62 100644
--- a/community/modules/database/slurm-cloudsql-federation/versions.tf
+++ b/community/modules/database/slurm-cloudsql-federation/versions.tf
@@ -30,10 +30,10 @@ terraform {
     }
   }
   provider_meta "google" {
-    module_name = "blueprints/terraform/hpc-toolkit:slurm-cloudsql-federation/v1.5.0"
+    module_name = "blueprints/terraform/hpc-toolkit:slurm-cloudsql-federation/v1.6.0"
   }
   provider_meta "google-beta" {
-    module_name = "blueprints/terraform/hpc-toolkit:slurm-cloudsql-federation/v1.5.0"
+    module_name = "blueprints/terraform/hpc-toolkit:slurm-cloudsql-federation/v1.6.0"
   }
 
   required_version = ">= 0.13.0"
diff --git a/community/modules/file-system/DDN-EXAScaler/README.md b/community/modules/file-system/DDN-EXAScaler/README.md
index 995538ece6..abddb1c45c 100644
--- a/community/modules/file-system/DDN-EXAScaler/README.md
+++ b/community/modules/file-system/DDN-EXAScaler/README.md
@@ -21,7 +21,45 @@ More information about the architecture can be found at
 [marketplace]: https://console.developers.google.com/marketplace/product/ddnstorage/exascaler-cloud
 [architecture]: https://cloud.google.com/architecture/lustre-architecture
 
+## Mounting
+
+To mount the DDN EXAScaler Lustre file system you must first install the DDN
+Lustre client and then run the appropriate `mount` command.
+
+When mounting to a Slurm resource, both of these steps are handled
+automatically via the `use` field. See the
+[hpc-cluster-high-io](../../../../examples/hpc-cluster-high-io.yaml) example
+for how to use this module with Slurm.
+
+When mounting to other compute resources, such as `vm-instance` or
+`cloud-batch-job`, the DDN-EXAScaler module outputs runners that can be used
+with the startup-script module to install the client and mount the file
+system. See the following example:
+
+```yaml
+  - id: lustrefs
+    source: community/modules/file-system/DDN-EXAScaler
+    use: [network1]
+    settings: {local_mount: /scratch}
+
+  - id: mount-at-startup
+    source: modules/scripts/startup-script
+    settings:
+      runners:
+      - $(lustrefs.install_ddn_lustre_client_runner)
+      - $(lustrefs.mount_runner)
+
+  - id: workstation
+    source: modules/compute/vm-instance
+    use: [network1, lustrefs, mount-at-startup]
+```
+
+See [additional documentation][ddn-install-docs] from DDN EXAScaler.
+
+[ddn-install-docs]: https://github.com/DDNStorage/exascaler-cloud-terraform/tree/master/gcp#install-new-exascaler-cloud-clients
+
 ## Support
+
 EXAScaler Cloud includes self-help support with access to publicly available
 documents and videos. Premium support includes 24x7x365 access to DDN's
 experts, along with support community access, automated notifications of updates and
@@ -61,7 +99,7 @@ No providers.
 
 | Name | Source | Version |
 |------|--------|---------|
-| [ddn\_exascaler](#module\_ddn\_exascaler) | github.com/DDNStorage/exascaler-cloud-terraform//gcp | 3eec46e |
+| [ddn\_exascaler](#module\_ddn\_exascaler) | github.com/DDNStorage/exascaler-cloud-terraform//gcp | 78deadb |
 
 ## Resources
 
@@ -71,11 +109,11 @@ No resources.
 
 | Name | Description | Type | Default | Required |
 |------|-------------|------|---------|:--------:|
-| [boot](#input\_boot) | Boot disk properties |
object({
disk_type = string
auto_delete = bool
})
|
{
"auto_delete": true,
"disk_type": "pd-standard"
}
| no | +| [boot](#input\_boot) | Boot disk properties |
object({
disk_type = string
auto_delete = bool
script_url = string
})
|
{
"auto_delete": true,
"disk_type": "pd-standard",
"script_url": null
}
| no | | [cls](#input\_cls) | Compute client properties |
object({
node_type = string
node_cpu = string
nic_type = string
node_count = number
public_ip = bool
})
|
{
"nic_type": "GVNIC",
"node_count": 0,
"node_cpu": "Intel Cascade Lake",
"node_type": "n2-standard-2",
"public_ip": true
}
| no | | [clt](#input\_clt) | Compute client target properties |
object({
disk_bus = string
disk_type = string
disk_size = number
disk_count = number
})
|
{
"disk_bus": "SCSI",
"disk_count": 0,
"disk_size": 256,
"disk_type": "pd-standard"
}
| no | | [fsname](#input\_fsname) | EXAScaler filesystem name, only alphanumeric characters are allowed, and the value must be 1-8 characters long | `string` | `"exacloud"` | no | -| [image](#input\_image) | Source image properties |
object({
project = string
name = string
})
|
{
"name": "exascaler-cloud-v523-centos7",
"project": "ddn-public"
}
| no | +| [image](#input\_image) | Source image properties | `any` |
{
"family": "exascaler-cloud-6-1-centos",
"project": "ddn-public"
}
| no | | [labels](#input\_labels) | Labels to add to EXAScaler Cloud deployment. List of key key, value pairs. | `any` | `{}` | no | | [local\_mount](#input\_local\_mount) | Mountpoint (at the client instances) for this EXAScaler system | `string` | `"/shared"` | no | | [mds](#input\_mds) | Metadata server properties |
object({
node_type = string
node_cpu = string
nic_type = string
node_count = number
public_ip = bool
})
|
{
"nic_type": "GVNIC",
"node_count": 1,
"node_cpu": "Intel Cascade Lake",
"node_type": "n2-standard-32",
"public_ip": true
}
| no | @@ -101,8 +139,11 @@ No resources. | Name | Description | |------|-------------| +| [client\_config\_script](#output\_client\_config\_script) | Script that will install DDN EXAScaler lustre client. The machine running this script must be on the same network & subnet as the EXAScaler. | | [http\_console](#output\_http\_console) | HTTP address to access the system web console. | -| [mount\_command](#output\_mount\_command) | Command to mount the file system. | +| [install\_ddn\_lustre\_client\_runner](#output\_install\_ddn\_lustre\_client\_runner) | Runner that encapsulates the `client_config_script` output on this module. | +| [mount\_command](#output\_mount\_command) | Command to mount the file system. `client_config_script` must be run first. | +| [mount\_runner](#output\_mount\_runner) | Runner to mount the DDN EXAScaler Lustre file system | | [network\_storage](#output\_network\_storage) | Describes a EXAScaler system to be mounted by other systems. | | [private\_addresses](#output\_private\_addresses) | Private IP addresses for all instances. | | [ssh\_console](#output\_ssh\_console) | Instructions to ssh into the instances. | diff --git a/community/modules/file-system/DDN-EXAScaler/main.tf b/community/modules/file-system/DDN-EXAScaler/main.tf index eeb9ecbd5e..e28b11e6e5 100644 --- a/community/modules/file-system/DDN-EXAScaler/main.tf +++ b/community/modules/file-system/DDN-EXAScaler/main.tf @@ -36,7 +36,7 @@ locals { } module "ddn_exascaler" { - source = "github.com/DDNStorage/exascaler-cloud-terraform//gcp?ref=3eec46e" + source = "github.com/DDNStorage/exascaler-cloud-terraform//gcp?ref=78deadb" fsname = var.fsname zone = var.zone project = var.project_id diff --git a/community/modules/file-system/DDN-EXAScaler/outputs.tf b/community/modules/file-system/DDN-EXAScaler/outputs.tf index 7a2da4c7bb..4713a921a5 100644 --- a/community/modules/file-system/DDN-EXAScaler/outputs.tf +++ b/community/modules/file-system/DDN-EXAScaler/outputs.tf @@ -24,9 +24,39 @@ output "ssh_console" { value = module.ddn_exascaler.ssh_console } +output "client_config_script" { + description = "Script that will install DDN EXAScaler lustre client. The machine running this script must be on the same network & subnet as the EXAScaler." + value = module.ddn_exascaler.client_config +} + +output "install_ddn_lustre_client_runner" { + description = "Runner that encapsulates the `client_config_script` output on this module." + value = { + "type" = "shell" + "content" = module.ddn_exascaler.client_config + "destination" = "install_ddn_lustre_client.sh" + } +} + +locals { + split_mount_cmd = split(" ", module.ddn_exascaler.mount_command) + split_mount_cmd_wo_mountpoint = slice(local.split_mount_cmd, 0, length(local.split_mount_cmd) - 1) + mount_cmd = "${join(" ", local.split_mount_cmd_wo_mountpoint)} ${var.local_mount}" + mount_cmd_w_mkdir = "mkdir -p ${var.local_mount} && ${local.mount_cmd}" +} + output "mount_command" { - description = "Command to mount the file system." - value = module.ddn_exascaler.mount_command + description = "Command to mount the file system. `client_config_script` must be run first." 
+  value       = local.mount_cmd_w_mkdir
+}
+
+output "mount_runner" {
+  description = "Runner to mount the DDN EXAScaler Lustre file system"
+  value = {
+    "type"        = "shell"
+    "content"     = local.mount_cmd_w_mkdir
+    "destination" = "mount-ddn-lustre.sh"
+  }
 }
 
 output "http_console" {
diff --git a/community/modules/file-system/DDN-EXAScaler/variables.tf b/community/modules/file-system/DDN-EXAScaler/variables.tf
index ed1b0caf1e..c9a12771e1 100644
--- a/community/modules/file-system/DDN-EXAScaler/variables.tf
+++ b/community/modules/file-system/DDN-EXAScaler/variables.tf
@@ -189,25 +189,34 @@ variable "boot" {
   type = object({
     disk_type   = string
     auto_delete = bool
+    script_url  = string
   })
   default = {
     disk_type   = "pd-standard"
     auto_delete = true
+    script_url  = null
   }
 }
 
 # Source image properties
 # project: project name
-# name: image name
+# family: image family name
+# name: !!DEPRECATED!! - image name
 variable "image" {
   description = "Source image properties"
-  type = object({
-    project = string
-    name    = string
-  })
+  type        = any
+  # Omitting type checking so validation can provide a more useful error message
+  # type = object({
+  #   project = string
+  #   family  = string
+  # })
   default = {
     project = "ddn-public"
-    name    = "exascaler-cloud-v523-centos7"
+    family  = "exascaler-cloud-6-1-centos"
+  }
+  validation {
+    condition     = lookup(var.image, "name", null) == null && lookup(var.image, "project", null) != null && lookup(var.image, "family", null) != null
+    error_message = "Use image.family & image.project to specify the image. Field image.name is deprecated. See EXAScaler documentation for input options: (https://github.com/DDNStorage/exascaler-cloud-terraform/tree/master/gcp#boot-image-options)."
   }
 }
diff --git a/community/modules/file-system/Intel-DAOS/README.md b/community/modules/file-system/Intel-DAOS/README.md
index e71614a1f7..a2ebbca1f0 100644
--- a/community/modules/file-system/Intel-DAOS/README.md
+++ b/community/modules/file-system/Intel-DAOS/README.md
@@ -23,7 +23,6 @@ For example, in the following snippet taken from the [community/example/intel/da
 ```yaml
   - id: daos-server
     source: github.com/daos-stack/google-cloud-daos.git//terraform/modules/daos_server?ref=v0.2.1
-    kind: terraform
     use: [network1]
     settings:
       number_of_instances: 2
@@ -44,7 +43,6 @@ The following settings will configure this [system for TCO](https://github.com/d
 ```yaml
   - id: daos-server
     source: github.com/daos-stack/google-cloud-daos.git//terraform/modules/daos_server?ref=v0.2.1
-    kind: terraform
     use: [network1]
     settings:
       labels: {ghpc_role: file-system}
@@ -60,7 +58,6 @@ The following settings will configure this system for [best performance](https:/
 ```yaml
   - id: daos-server
     source: github.com/daos-stack/google-cloud-daos.git//terraform/modules/daos_server?ref=v0.2.1
-    kind: terraform
     use: [network1]
     settings:
       labels: {ghpc_role: file-system}
diff --git a/community/modules/file-system/nfs-server/README.md b/community/modules/file-system/nfs-server/README.md
index cffc4d840c..671e398f8e 100644
--- a/community/modules/file-system/nfs-server/README.md
+++ b/community/modules/file-system/nfs-server/README.md
@@ -17,7 +17,6 @@ community modules that create compute VMs.
```yaml - id: homefs source: community/modules/file-system/nfs-server - kind: terraform use: [network1] settings: auto_delete_disk: true diff --git a/community/modules/file-system/nfs-server/scripts/mount.yaml b/community/modules/file-system/nfs-server/scripts/mount.yaml index b39a2f4adb..f7fbe58d5e 100644 --- a/community/modules/file-system/nfs-server/scripts/mount.yaml +++ b/community/modules/file-system/nfs-server/scripts/mount.yaml @@ -22,14 +22,14 @@ url: "http://metadata.google.internal/computeMetadata/v1/instance/attributes" tasks: - name: Read metadata network_storage information - uri: + ansible.builtin.uri: url: "{{ url }}/{{ meta_key }}" method: GET headers: Metadata-Flavor: "Google" register: storage - name: Mount file systems - mount: + ansible.posix.mount: src: "{{ item.server_ip }}:/{{ item.remote_mount }}" path: "{{ item.local_mount }}" opts: "{{ item.mount_options }}" diff --git a/community/modules/file-system/nfs-server/versions.tf b/community/modules/file-system/nfs-server/versions.tf index 43c741ea6b..dd04c20cf8 100644 --- a/community/modules/file-system/nfs-server/versions.tf +++ b/community/modules/file-system/nfs-server/versions.tf @@ -26,7 +26,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:nfs-server/v1.5.0" + module_name = "blueprints/terraform/hpc-toolkit:nfs-server/v1.6.0" } required_version = ">= 0.14.0" diff --git a/community/modules/project/new-project/README.md b/community/modules/project/new-project/README.md index 6301b04d3f..c7fad43517 100644 --- a/community/modules/project/new-project/README.md +++ b/community/modules/project/new-project/README.md @@ -11,7 +11,6 @@ This module is meant for use with Terraform 0.13. ```yaml - id: project source: community/modules/project/new-project - kind: terraform settings: project_id: test_project folder_id: 334688113020 # random number diff --git a/community/modules/project/service-account/README.md b/community/modules/project/service-account/README.md index cf9331f2d2..79b661784c 100644 --- a/community/modules/project/service-account/README.md +++ b/community/modules/project/service-account/README.md @@ -7,7 +7,6 @@ Allows creation of service accounts for a Google Cloud Platform project. ```yaml - id: service_acct source: community/modules/project/service-account - kind: terraform settings: - project_id: $(vars.project_id) - names: [ "instance_acct" ] diff --git a/community/modules/project/service-enablement/README.md b/community/modules/project/service-enablement/README.md index f03091a28b..266eac26ec 100644 --- a/community/modules/project/service-enablement/README.md +++ b/community/modules/project/service-enablement/README.md @@ -7,7 +7,6 @@ Allows management of multiple API services for a Google Cloud Platform project. 
```yaml - id: services-api source: community/modules/project/service-enablement - kind: terraform settings: gcp_service_list: [ "file.googleapis.com", diff --git a/community/modules/project/service-enablement/versions.tf b/community/modules/project/service-enablement/versions.tf index 04b4813a6b..c0c446cfa0 100644 --- a/community/modules/project/service-enablement/versions.tf +++ b/community/modules/project/service-enablement/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:service-enablement/v1.5.0" + module_name = "blueprints/terraform/hpc-toolkit:service-enablement/v1.6.0" } required_version = ">= 0.14.0" diff --git a/community/modules/scheduler/SchedMD-slurm-on-gcp-controller/README.md b/community/modules/scheduler/SchedMD-slurm-on-gcp-controller/README.md index ff3787ad52..f606e4dac5 100644 --- a/community/modules/scheduler/SchedMD-slurm-on-gcp-controller/README.md +++ b/community/modules/scheduler/SchedMD-slurm-on-gcp-controller/README.md @@ -19,7 +19,6 @@ controller for optimal performance at different scales. ```yaml - id: slurm_controller source: community/modules/scheduler/SchedMD-slurm-on-gcp-controller - kind: terraform use: - network1 - homefs @@ -76,6 +75,7 @@ limitations under the License. | Name | Source | Version | |------|--------|---------| +| [slurm\_cluster\_compute\_node](#module\_slurm\_cluster\_compute\_node) | github.com/SchedMD/slurm-gcp//tf/modules/compute/ | v4.2.0 | | [slurm\_cluster\_controller](#module\_slurm\_cluster\_controller) | github.com/SchedMD/slurm-gcp//tf/modules/controller/ | v4.2.0 | ## Resources diff --git a/community/modules/scheduler/SchedMD-slurm-on-gcp-controller/main.tf b/community/modules/scheduler/SchedMD-slurm-on-gcp-controller/main.tf index 8c2ae382a8..ff9dda67dd 100644 --- a/community/modules/scheduler/SchedMD-slurm-on-gcp-controller/main.tf +++ b/community/modules/scheduler/SchedMD-slurm-on-gcp-controller/main.tf @@ -16,6 +16,7 @@ locals { controller_startup_script = var.controller_startup_script != null ? var.controller_startup_script : var.startup_script compute_startup_script = var.compute_startup_script != null ? var.compute_startup_script : var.startup_script + cluster_name = var.cluster_name != null ? var.cluster_name : "slurm-${var.deployment_name}" } data "google_compute_image" "compute_image" { @@ -29,7 +30,7 @@ module "slurm_cluster_controller" { boot_disk_type = var.boot_disk_type image = data.google_compute_image.compute_image.self_link instance_template = var.controller_instance_template - cluster_name = var.cluster_name != null ? 
var.cluster_name : "slurm-${var.deployment_name}" + cluster_name = local.cluster_name compute_node_scopes = var.compute_node_scopes compute_node_service_account = var.compute_node_service_account disable_compute_public_ips = var.disable_compute_public_ips @@ -58,3 +59,23 @@ module "slurm_cluster_controller" { intel_select_solution = var.intel_select_solution cloudsql = var.cloudsql } + +module "slurm_cluster_compute_node" { + source = "github.com/SchedMD/slurm-gcp//tf/modules/compute/?ref=v4.2.0" + project = var.project_id + cluster_name = local.cluster_name + region = var.region + zone = var.zone + controller_name = module.slurm_cluster_controller.controller_node_name + controller_secondary_disk = var.controller_secondary_disk + disable_compute_public_ips = var.disable_compute_public_ips + network_storage = var.network_storage + partitions = var.partition + compute_startup_script = local.compute_startup_script + scopes = var.compute_node_scopes + service_account = var.compute_node_service_account + shared_vpc_host_project = var.shared_vpc_host_project + subnetwork_name = var.subnetwork_name + intel_select_solution = var.intel_select_solution + munge_key = var.munge_key +} diff --git a/community/modules/scheduler/SchedMD-slurm-on-gcp-controller/versions.tf b/community/modules/scheduler/SchedMD-slurm-on-gcp-controller/versions.tf index 3e4689a994..a06a6a442f 100644 --- a/community/modules/scheduler/SchedMD-slurm-on-gcp-controller/versions.tf +++ b/community/modules/scheduler/SchedMD-slurm-on-gcp-controller/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:SchedMD-slurm-on-gcp-controller/v1.5.0" + module_name = "blueprints/terraform/hpc-toolkit:SchedMD-slurm-on-gcp-controller/v1.6.0" } required_version = ">= 0.14.0" diff --git a/community/modules/scheduler/SchedMD-slurm-on-gcp-login-node/README.md b/community/modules/scheduler/SchedMD-slurm-on-gcp-login-node/README.md index ba18a7adf4..2f6cb2f13a 100644 --- a/community/modules/scheduler/SchedMD-slurm-on-gcp-login-node/README.md +++ b/community/modules/scheduler/SchedMD-slurm-on-gcp-login-node/README.md @@ -16,7 +16,6 @@ node is used in conjunction with the ```yaml - id: slurm_login source: community/modules/scheduler/SchedMD-slurm-on-gcp-login-node - kind: terraform use: - network1 - homefs diff --git a/community/modules/scheduler/SchedMD-slurm-on-gcp-login-node/versions.tf b/community/modules/scheduler/SchedMD-slurm-on-gcp-login-node/versions.tf index 5608f15aac..ed4fa6ad84 100644 --- a/community/modules/scheduler/SchedMD-slurm-on-gcp-login-node/versions.tf +++ b/community/modules/scheduler/SchedMD-slurm-on-gcp-login-node/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:SchedMD-slurm-on-gcp-login-node/v1.5.0" + module_name = "blueprints/terraform/hpc-toolkit:SchedMD-slurm-on-gcp-login-node/v1.6.0" } required_version = ">= 0.14.0" diff --git a/community/modules/scheduler/cloud-batch-job/README.md b/community/modules/scheduler/cloud-batch-job/README.md index 5b5328b16a..21dab0e068 100644 --- a/community/modules/scheduler/cloud-batch-job/README.md +++ b/community/modules/scheduler/cloud-batch-job/README.md @@ -17,7 +17,6 @@ job unless one is provided. 
See the ```yaml - id: batch-job source: community/modules/scheduler/cloud-batch-job - kind: terraform use: [network1] settings: runnable: "echo 'hello world'" @@ -53,22 +52,18 @@ deployment_groups: modules: - id: network1 source: modules/network/pre-existing-vpc - kind: terraform - id: appfs source: modules/file-system/filestore - kind: terraform use: [network1] - id: batch-startup-script source: modules/scripts/startup-script - kind: terraform settings: runners: ... - id: batch-compute-template source: github.com/terraform-google-modules/terraform-google-vm//modules/instance_template?ref=v7.8.0 - kind: terraform use: [batch-startup-script] settings: # Boiler plate to work with Cloud Foundation Toolkit @@ -84,7 +79,6 @@ deployment_groups: - id: batch-job source: ./community/modules/scheduler/cloud-batch-job - kind: terraform settings: instance_template: $(batch-compute-template.self_link) outputs: [instructions] diff --git a/community/modules/scheduler/cloud-batch-login-node/README.md b/community/modules/scheduler/cloud-batch-login-node/README.md index 4f98e24adb..80c214e954 100644 --- a/community/modules/scheduler/cloud-batch-login-node/README.md +++ b/community/modules/scheduler/cloud-batch-login-node/README.md @@ -20,12 +20,10 @@ systems and test installed software before submitting a Google Cloud Batch job. ```yaml - id: batch-job source: community/modules/scheduler/cloud-batch-job - kind: terraform ... - id: batch-login source: community/modules/scheduler/cloud-batch-login-node - kind: terraform use: [batch-job] outputs: [instructions] ``` diff --git a/community/modules/scheduler/cloud-batch-login-node/versions.tf b/community/modules/scheduler/cloud-batch-login-node/versions.tf index bf49cb9e0d..9bf593bd90 100644 --- a/community/modules/scheduler/cloud-batch-login-node/versions.tf +++ b/community/modules/scheduler/cloud-batch-login-node/versions.tf @@ -22,7 +22,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:cloud-batch-login-node/v1.5.0" + module_name = "blueprints/terraform/hpc-toolkit:cloud-batch-login-node/v1.6.0" } required_version = ">= 0.14.0" diff --git a/community/modules/scheduler/htcondor-configure/README.md b/community/modules/scheduler/htcondor-configure/README.md index 5e36ea81e7..4ae216b3bc 100644 --- a/community/modules/scheduler/htcondor-configure/README.md +++ b/community/modules/scheduler/htcondor-configure/README.md @@ -26,11 +26,9 @@ install the HTCondor software and adds custom configurations using ```yaml - id: htcondor_install source: community/modules/scripts/htcondor-install - kind: terraform - id: htcondor_configure_central_manager source: modules/scripts/startup-script - kind: terraform settings: runners: - type: shell @@ -41,7 +39,6 @@ install the HTCondor software and adds custom configurations using - id: htcondor_configure_access_point source: modules/scripts/startup-script - kind: terraform settings: runners: - type: shell diff --git a/community/modules/scheduler/htcondor-configure/versions.tf b/community/modules/scheduler/htcondor-configure/versions.tf index 0a4c1c40fa..d62fbd8d09 100644 --- a/community/modules/scheduler/htcondor-configure/versions.tf +++ b/community/modules/scheduler/htcondor-configure/versions.tf @@ -26,7 +26,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:htcondor-configure/v1.5.0" + module_name = "blueprints/terraform/hpc-toolkit:htcondor-configure/v1.6.0" } required_version = ">= 0.13.0" diff --git 
a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md index 56a4830e3d..419fa3186c 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/README.md @@ -30,7 +30,6 @@ controller for optimal performance at different scales. ```yaml - id: slurm_controller source: community/modules/scheduler/schedmd-slurm-gcp-v5-controller - kind: terraform use: - network1 - homefs @@ -112,7 +111,7 @@ No resources. | [deployment\_name](#input\_deployment\_name) | Name of the deployment. | `string` | n/a | yes | | [disable\_controller\_public\_ips](#input\_disable\_controller\_public\_ips) | If set to false. The controller will have a random public IP assigned to it. Ignored if access\_config is set. | `bool` | `true` | no | | [disable\_default\_mounts](#input\_disable\_default\_mounts) | Disable default global network storage from the controller
* /usr/local/etc/slurm
* /etc/munge
* /home
* /apps
Warning: If these are disabled, the slurm etc and munge dirs must be added
manually, or some other mechanism must be used to synchronize the slurm conf
files and the munge key across the cluster. | `bool` | `false` | no | -| [disable\_smt](#input\_disable\_smt) | Disables Simultaneous Multi-Threading (SMT) on instance. | `bool` | `false` | no | +| [disable\_smt](#input\_disable\_smt) | Disables Simultaneous Multi-Threading (SMT) on instance. | `bool` | `true` | no | | [disk\_auto\_delete](#input\_disk\_auto\_delete) | Whether or not the boot disk should be auto-deleted. | `bool` | `true` | no | | [disk\_size\_gb](#input\_disk\_size\_gb) | Boot disk size in GB. | `number` | `50` | no | | [disk\_type](#input\_disk\_type) | Boot disk type, can be either pd-ssd, local-ssd, or pd-standard. | `string` | `"pd-ssd"` | no | @@ -122,6 +121,7 @@ No resources. | [enable\_confidential\_vm](#input\_enable\_confidential\_vm) | Enable the Confidential VM configuration. Note: the instance image must support option. | `bool` | `false` | no | | [enable\_devel](#input\_enable\_devel) | Enables development mode. Not for production use. | `bool` | `false` | no | | [enable\_oslogin](#input\_enable\_oslogin) | Enables Google Cloud os-login for user login and authentication for VMs.
See https://cloud.google.com/compute/docs/oslogin | `bool` | `true` | no | +| [enable\_reconfigure](#input\_enable\_reconfigure) | Enables automatic Slurm reconfiguration when Slurm configuration changes (e.g.
slurm.conf.tpl, partition details). Compute instances and resource policies
(e.g. placement groups) will be destroyed to align with new configuration.
NOTE: Requires Python and Google Pub/Sub API.
*WARNING*: Toggling this will impact the running workload. Deployed compute nodes
will be destroyed and their jobs will be requeued. | `bool` | `false` | no | | [enable\_shielded\_vm](#input\_enable\_shielded\_vm) | Enable the Shielded VM configuration. Note: the instance image must support option. | `bool` | `false` | no | | [epilog\_scripts](#input\_epilog\_scripts) | List of scripts to be used for Epilog. Programs for the slurmd to execute
on every node when a user's job completes.
See https://slurm.schedmd.com/slurm.conf.html#OPT_Epilog. |
list(object({
filename = string
content = string
}))
| `[]` | no | | [gpu](#input\_gpu) | GPU information. Type and count of GPU to attach to the instance template. See
https://cloud.google.com/compute/docs/gpus more details.
type : the GPU type
count : number of GPUs |
object({
type = string
count = number
})
| `null` | no | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/main.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/main.tf index 52b14457ed..5d5a08f9a2 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/main.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/main.tf @@ -54,6 +54,7 @@ module "slurm_controller_instance" { enable_devel = var.enable_devel enable_cleanup_compute = var.enable_cleanup_compute enable_cleanup_subscriptions = var.enable_cleanup_subscriptions + enable_reconfigure = var.enable_reconfigure enable_bigquery_load = var.enable_bigquery_load epilog_scripts = var.epilog_scripts disable_default_mounts = var.disable_default_mounts diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/variables.tf index 1c1fe0525a..793a1c3573 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/variables.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-controller/variables.tf @@ -127,7 +127,7 @@ variable "disable_default_mounts" { variable "disable_smt" { type = bool description = "Disables Simultaneous Multi-Threading (SMT) on instance." - default = false + default = true } variable "disk_type" { @@ -188,6 +188,19 @@ variable "enable_cleanup_subscriptions" { default = false } +variable "enable_reconfigure" { + description = < [cloud\_parameters](#input\_cloud\_parameters) | cloud.conf options. |
object({
no_comma_params = bool
resume_rate = number
resume_timeout = number
suspend_rate = number
suspend_timeout = number
})
|
{
"no_comma_params": false,
"resume_rate": 0,
"resume_timeout": 300,
"suspend_rate": 0,
"suspend_timeout": 300
}
| no | | [compute\_startup\_script](#input\_compute\_startup\_script) | Startup script used by the compute VMs. | `string` | `""` | no | +| [deployment\_name](#input\_deployment\_name) | Name of the deployment. | `string` | n/a | yes | | [disable\_default\_mounts](#input\_disable\_default\_mounts) | Disable default global network storage from the controller
- /usr/local/etc/slurm
- /etc/munge
- /home
- /apps
If these are disabled, the slurm etc and munge dirs must be added manually,
or some other mechanism must be used to synchronize the slurm conf files
and the munge key across the cluster. | `bool` | `false` | no | | [enable\_bigquery\_load](#input\_enable\_bigquery\_load) | Enables loading of cluster job usage into big query.
NOTE: Requires Google Bigquery API. | `bool` | `false` | no | | [enable\_cleanup\_compute](#input\_enable\_cleanup\_compute) | Enables automatic cleanup of compute nodes and resource policies (e.g.
placement groups) managed by this module, when cluster is destroyed.
NOTE: Requires Python and script dependencies.
*WARNING*: Toggling this may impact the running workload. Deployed compute nodes
may be destroyed and their jobs will be requeued. | `bool` | `false` | no | @@ -204,7 +204,7 @@ limitations under the License. | [project\_id](#input\_project\_id) | Project ID to create resources in. | `string` | n/a | yes | | [prolog\_scripts](#input\_prolog\_scripts) | List of scripts to be used for Prolog. Programs for the slurmd to execute
whenever it is asked to run a job step from a new job allocation.
See https://slurm.schedmd.com/slurm.conf.html#OPT_Prolog. |
list(object({
filename = string
content = string
}))
| `[]` | no | | [slurm\_bin\_dir](#input\_slurm\_bin\_dir) | Path to directory of Slurm binary commands (e.g. scontrol, sinfo). If 'null',
then it will be assumed that binaries are in $PATH. | `string` | `null` | no | -| [slurm\_cluster\_name](#input\_slurm\_cluster\_name) | Cluster name, used for resource naming and slurm accounting. | `string` | n/a | yes | +| [slurm\_cluster\_name](#input\_slurm\_cluster\_name) | Cluster name, used for resource naming and slurm accounting. If not provided
it will default to the first 8 characters of the deployment name (removing
any invalid characters). | `string` | `null` | no | | [slurm\_control\_host](#input\_slurm\_control\_host) | The short, or long, hostname of the machine where Slurm control daemon is
executed (i.e. the name returned by the command "hostname -s").
See https://slurm.schedmd.com/slurm.conf.html#OPT_SlurmctldHost | `string` | `null` | no | | [slurm\_depends\_on](#input\_slurm\_depends\_on) | Custom terraform dependencies without replacement on delta. This is useful to
ensure order of resource creation.
NOTE: Also see terraform meta-argument 'depends\_on'. | `list(string)` | `[]` | no | | [slurm\_log\_dir](#input\_slurm\_log\_dir) | Directory where Slurm logs to. | `string` | `"/var/log/slurm"` | no | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/main.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/main.tf index 9bc3fa4e00..d27eec58b3 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/main.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/main.tf @@ -19,16 +19,32 @@ locals { filename = "ghpc_startup.sh" content = var.compute_startup_script }] + + # Install Directory Variables + # In order to allow the hybrid module to run in a different environment than + # the controller, certain paths need to be updated to match the anticpated + # install directory on the controller. This is done with a sed command that + # find all matching variables with names ending in Program (SuspendProgram, + # etc) or logSlurmctld (EpilogSlurmctld, etc) and replaces the path before + # suspend.py or resume.py with the user provided install_dir. install_dir = var.install_dir != null ? var.install_dir : abspath(var.output_dir) install_dir_pattern = replace(local.install_dir, ".", "\\.") - install_path_cmd = "sed -i -E 's|Program=/.*/(resume\\|suspend).py|Program=${local.install_dir_pattern}/\\1\\.py|g' cloud.conf" + match_pattern = "(Program\\|logSlurmctld)=/.*/(resume\\|suspend).py" + replace_pattern = "\\1=${local.install_dir_pattern}/\\2\\.py" + install_path_cmd = "sed -i -E 's|${local.match_pattern}|${local.replace_pattern}|g' cloud.conf" + + # Since deployment name may be used to create a cluster name, we remove any invalid character from the beginning + # Also, slurm imposed a lot of restrictions to this name, so we format it to an acceptable string + tmp_cluster_name = substr(replace(lower(var.deployment_name), "/^[^a-z]*|[^a-z0-9]/", ""), 0, 10) + slurm_cluster_name = var.slurm_cluster_name != null ? var.slurm_cluster_name : local.tmp_cluster_name + } module "slurm_controller_instance" { source = "github.com/SchedMD/slurm-gcp.git//terraform/slurm_cluster/modules/slurm_controller_hybrid?ref=v5.1.0" project_id = var.project_id - slurm_cluster_name = var.slurm_cluster_name + slurm_cluster_name = local.slurm_cluster_name enable_devel = var.enable_devel enable_cleanup_compute = var.enable_cleanup_compute enable_cleanup_subscriptions = var.enable_cleanup_subscriptions @@ -50,6 +66,8 @@ module "slurm_controller_instance" { disable_default_mounts = var.disable_default_mounts } +# Null resource that injects the installation path before the resume/suspend +# scripts in the hybrid configuration files. resource "null_resource" "set_prefix_cloud_conf" { depends_on = [ module.slurm_controller_instance diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/variables.tf index de51d540a5..a382028690 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/variables.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/variables.tf @@ -20,14 +20,19 @@ variable "project_id" { description = "Project ID to create resources in." } -variable "slurm_cluster_name" { +variable "deployment_name" { + description = "Name of the deployment." type = string - description = "Cluster name, used for resource naming and slurm accounting." 
+} - validation { - condition = can(regex("(^[a-z][a-z0-9]*$)", var.slurm_cluster_name)) - error_message = "Variable 'slurm_cluster_name' must be composed of only alphanumeric values and begin with a leter. regex: '(^[a-z][a-z0-9]*$)'." - } +variable "slurm_cluster_name" { + type = string + description = <<-EOD + Cluster name, used for resource naming and slurm accounting. If not provided + it will default to the first 8 characters of the deployment name (removing + any invalid characters). + EOD + default = null } variable "enable_devel" { diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md index e5d75fb538..d596f7e64d 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/README.md @@ -14,7 +14,6 @@ terraform modules. The login node is used in conjunction with the ```yaml - id: slurm_login source: community/modules/scheduler/schedmd-slurm-gcp-v5-login - kind: terraform use: - network1 - slurm_controller @@ -89,7 +88,7 @@ No resources. | [controller\_instance\_id](#input\_controller\_instance\_id) | The server-assigned unique identifier of the controller instance, typically
supplied as an output of the controler module. | `string` | n/a | yes | | [deployment\_name](#input\_deployment\_name) | Name of the deployment. | `string` | n/a | yes | | [disable\_login\_public\_ips](#input\_disable\_login\_public\_ips) | If set to false. The login will have a random public IP assigned to it. Ignored if access\_config is set. | `bool` | `true` | no | -| [disable\_smt](#input\_disable\_smt) | Disables Simultaneous Multi-Threading (SMT) on instance. | `bool` | `false` | no | +| [disable\_smt](#input\_disable\_smt) | Disables Simultaneous Multi-Threading (SMT) on instance. | `bool` | `true` | no | | [disk\_auto\_delete](#input\_disk\_auto\_delete) | Whether or not the boot disk should be auto-deleted. | `bool` | `true` | no | | [disk\_size\_gb](#input\_disk\_size\_gb) | Boot disk size in GB. | `number` | `50` | no | | [disk\_type](#input\_disk\_type) | Boot disk type, can be either pd-ssd, local-ssd, or pd-standard. | `string` | `"pd-standard"` | no | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/variables.tf b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/variables.tf index 0f565de63b..249d717f48 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v5-login/variables.tf +++ b/community/modules/scheduler/schedmd-slurm-gcp-v5-login/variables.tf @@ -31,7 +31,7 @@ variable "labels" { variable "disable_smt" { type = bool description = "Disables Simultaneous Multi-Threading (SMT) on instance." - default = false + default = true } variable "deployment_name" { diff --git a/community/modules/scripts/htcondor-install/README.md b/community/modules/scripts/htcondor-install/README.md index 80c085d6ff..8f37e411d1 100644 --- a/community/modules/scripts/htcondor-install/README.md +++ b/community/modules/scripts/htcondor-install/README.md @@ -23,11 +23,9 @@ install the HTCondor software and adds custom configurations using ```yaml - id: htcondor_install source: community/modules/scripts/htcondor-install - kind: terraform - id: htcondor_configure_central_manager source: modules/scripts/startup-script - kind: terraform settings: runners: - type: shell @@ -38,7 +36,6 @@ install the HTCondor software and adds custom configurations using - id: htcondor_configure_access_point source: modules/scripts/startup-script - kind: terraform settings: runners: - type: shell diff --git a/community/modules/scripts/htcondor-install/files/install-htcondor-autoscaler-deps.yml b/community/modules/scripts/htcondor-install/files/install-htcondor-autoscaler-deps.yml index 0351dfd75f..db989f9d40 100644 --- a/community/modules/scripts/htcondor-install/files/install-htcondor-autoscaler-deps.yml +++ b/community/modules/scripts/htcondor-install/files/install-htcondor-autoscaler-deps.yml @@ -24,13 +24,13 @@ - name: Create virtual environment for HTCondor autoscaler ansible.builtin.pip: name: pip - version: 21.3.1 # last Python 2.7-compatible release + version: 21.3.1 # last Python 3.6-compatible release virtualenv: /usr/local/htcondor virtualenv_command: /usr/bin/python3 -m venv - name: Install latest setuptools ansible.builtin.pip: name: setuptools - state: 44.1.1 # last Python 2.7-compatible release + version: 59.6.0 # last Python 3.6-compatible release virtualenv: /usr/local/htcondor virtualenv_command: /usr/bin/python3 -m venv - name: Install HTCondor autoscaler dependencies diff --git a/community/modules/scripts/omnia-install/templates/install_omnia.tpl b/community/modules/scripts/omnia-install/templates/install_omnia.tpl index 5989e8f9b1..6164c95ebf 100644 --- 
a/community/modules/scripts/omnia-install/templates/install_omnia.tpl +++ b/community/modules/scripts/omnia-install/templates/install_omnia.tpl @@ -29,7 +29,7 @@ mode: 0700 owner: "{{ username }}" - name: Create keys - ansible.builtin.openssh_keypair: + community.crypto.openssh_keypair: path: "{{ pub_key_file }}" owner: "{{ username }}" - name: Copy public key to authorized keys diff --git a/community/modules/scripts/spack-install/README.md b/community/modules/scripts/spack-install/README.md index 8c6c1c539b..65ec73178d 100644 --- a/community/modules/scripts/spack-install/README.md +++ b/community/modules/scripts/spack-install/README.md @@ -32,7 +32,6 @@ see this module used in a full blueprint, see the [spack-gromacs.yaml] example. ```yaml - id: spack source: community/modules/scripts/spack-install - kind: terraform settings: install_dir: /sw/spack spack_url: https://github.com/spack/spack @@ -97,7 +96,6 @@ deployment via the following: ```yaml - id: slurm_controller source: community/modules/scheduler/SchedMD-slurm-on-gcp-controller - kind: terraform use: [spack] settings: subnetwork_name: ((module.network1.primary_subnetwork.name)) @@ -111,7 +109,6 @@ Alternatively, it can be added as a startup script via: ```yaml - id: startup source: modules/scripts/startup-script - kind: terraform settings: runners: - $(spack.install_spack_deps_runner) diff --git a/community/modules/scripts/spack-install/templates/install_spack.tpl b/community/modules/scripts/spack-install/templates/install_spack.tpl index 620ca3e7ff..085a23e76e 100755 --- a/community/modules/scripts/spack-install/templates/install_spack.tpl +++ b/community/modules/scripts/spack-install/templates/install_spack.tpl @@ -24,7 +24,7 @@ if [ ! -d ${INSTALL_DIR} ]; then chmod a+rwx ${INSTALL_DIR}; chmod a+s ${INSTALL_DIR}; cd ${INSTALL_DIR}; - git clone ${SPACK_URL} . + git clone --no-checkout ${SPACK_URL} . } &>> ${LOG_FILE} echo "$PREFIX Checking out ${SPACK_REF}..." git checkout ${SPACK_REF} >> ${LOG_FILE} 2>&1 @@ -99,6 +99,7 @@ echo "$PREFIX Installing root spack specs..." echo "$PREFIX Configuring spack environments" %{if ENVIRONMENTS != null ~} %{for e in ENVIRONMENTS ~} +if [ ! -d ${INSTALL_DIR}/var/spack/environments/${e.name} ]; then %{if e.content != null} { cat << 'EOF' > ${INSTALL_DIR}/spack_env.yaml @@ -129,6 +130,7 @@ EOF spack env deactivate >> ${LOG_FILE} 2>&1 spack clean -s >> ${LOG_FILE} 2>&1 +fi %{endfor ~} %{endif ~} @@ -152,7 +154,9 @@ echo "$PREFIX Populating defined buildcaches" %{endif ~} %{endfor ~} -echo "source ${INSTALL_DIR}/share/spack/setup-env.sh" >> /etc/profile.d/spack.sh -chmod a+rx /etc/profile.d/spack.sh +if [ ! -f /etc/profile.d/spack.sh ]; then + echo "source ${INSTALL_DIR}/share/spack/setup-env.sh" > /etc/profile.d/spack.sh + chmod a+rx /etc/profile.d/spack.sh +fi echo "$PREFIX Setup complete..." diff --git a/community/modules/scripts/wait-for-startup/README.md b/community/modules/scripts/wait-for-startup/README.md index 705699f8b7..aa1a3b408b 100644 --- a/community/modules/scripts/wait-for-startup/README.md +++ b/community/modules/scripts/wait-for-startup/README.md @@ -17,7 +17,6 @@ up a node. 
```yaml - id: wait source: community/modules/scripts/wait-for-startup - kind: terraform settings: instance_name: ((module.workstation.name[0])) ``` diff --git a/community/modules/scripts/wait-for-startup/versions.tf b/community/modules/scripts/wait-for-startup/versions.tf index 0a14e4cca1..bc175d5e04 100644 --- a/community/modules/scripts/wait-for-startup/versions.tf +++ b/community/modules/scripts/wait-for-startup/versions.tf @@ -26,7 +26,7 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:wait-for-startup/v1.5.0" + module_name = "blueprints/terraform/hpc-toolkit:wait-for-startup/v1.6.0" } required_version = ">= 0.14.0" diff --git a/docs/hybrid-slurm-cluster/README.md b/docs/hybrid-slurm-cluster/README.md new file mode 100644 index 0000000000..a1c8067147 --- /dev/null +++ b/docs/hybrid-slurm-cluster/README.md @@ -0,0 +1,14 @@ +# Hybrid Slurm Clusters + +## [inter-gcp-project-hybrid-slurm.md](./inter-gcp-project-hybrid-slurm.md) +This document describes how to deploy a simulated hybrid slurm cluster entirely +in GCP. These instructions can be used as a way of trying the +[schedmd-slurm-gcp-v5-hybrid](../../community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md) +in GCP before bringing the configuration changes to a physical on-premise slurm +cluster. + +## Blueprints +The [blueprints directory](./blueprints/) contains a set of support blueprints +for the documentation in this directory. These blueprints are intended to be +used as is with minimal tweaking of deployment variables either in place or on +the command line. diff --git a/docs/hybrid-slurm-cluster/blueprints/create-networks.yaml b/docs/hybrid-slurm-cluster/blueprints/create-networks.yaml new file mode 100644 index 0000000000..b8f77a213f --- /dev/null +++ b/docs/hybrid-slurm-cluster/blueprints/create-networks.yaml @@ -0,0 +1,53 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--- + +blueprint_name: peering-networks + +vars: + project_id: # The project ID for the static cluster + project_id_compute: # The project ID for the burst compute VMs + deployment_name: peering-networks-demo + region: us-central1 + zone: us-central1-c + +deployment_groups: +- group: primary + modules: + # Source is an embedded module, denoted by "modules/*" without ./, ../, / + # as a prefix. 
To refer to a local or community module, prefix with ./, ../ or / + # Example - ./modules/network/vpc + - source: modules/network/vpc + kind: terraform + id: network0 + settings: + network_name: static-cluster-network + network_address_range: 10.0.0.0/16 + subnetworks: + - subnet_name: primary-subnet + subnet_region: $(vars.region) + new_bits: 8 + + - source: modules/network/vpc + kind: terraform + id: network1 + settings: + network_name: compute-vpc-network + project_id: $(vars.project_id_compute) + network_address_range: 10.1.0.0/16 + subnetworks: + - subnet_name: primary-subnet + subnet_region: $(vars.region) + new_bits: 8 diff --git a/docs/hybrid-slurm-cluster/blueprints/hybrid-configuration.yaml b/docs/hybrid-slurm-cluster/blueprints/hybrid-configuration.yaml new file mode 100644 index 0000000000..b7b477db30 --- /dev/null +++ b/docs/hybrid-slurm-cluster/blueprints/hybrid-configuration.yaml @@ -0,0 +1,69 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +--- + +blueprint_name: hpc-cluster-hybrid-v5 + +vars: + project_id: ## <> + deployment_name: hybrid-config + region: us-central1 + zone: us-central1-c + static_controller_hostname: ## <>.c.<>.internal + +deployment_groups: +- group: primary + modules: + - id: network1 + source: modules/network/pre-existing-vpc + settings: + network_name: compute-vpc-network + subnetwork_name: primary-subnet + + - id: scratchfs + source: modules/file-system/filestore + use: [network1] + settings: + local_mount: /scratch + + - id: debug-partition + source: community/modules/compute/schedmd-slurm-gcp-v5-partition + use: [network1] + settings: + partition_name: debug + node_count_dynamic_max: 10 + exclusive: false + machine_type: n2-standard-2 + is_default: false + + - id: compute-partition + source: community/modules/compute/schedmd-slurm-gcp-v5-partition + use: + - network1 + settings: + partition_name: compute + node_count_dynamic_max: 20 + + - id: slurm-controller + source: community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid + use: + - debug-partition + - compute-partition + - scratchfs + settings: + output_dir: ./hybrid + slurm_bin_dir: /usr/local/bin + slurm_control_host: $(vars.static_controller_hostname) + install_dir: /etc/slurm/hybrid diff --git a/docs/hybrid-slurm-cluster/blueprints/static-cluster.yaml b/docs/hybrid-slurm-cluster/blueprints/static-cluster.yaml new file mode 100644 index 0000000000..162850527f --- /dev/null +++ b/docs/hybrid-slurm-cluster/blueprints/static-cluster.yaml @@ -0,0 +1,65 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +--- + +blueprint_name: static-slurm-cluster + +vars: + project_id: ## <> + deployment_name: cluster + region: us-central1 + zone: us-central1-c + +deployment_groups: +- group: primary + modules: + - id: network1 + source: modules/network/pre-existing-vpc + settings: + network_name: static-cluster-network + subnetwork_name: primary-subnet + + - id: scratchfs + source: modules/file-system/filestore + use: [network1] + settings: + local_mount: /scratch + + - id: static_partition + source: community/modules/compute/schedmd-slurm-gcp-v5-partition + use: + - network1 + settings: + partition_name: static + node_count_static: 4 + node_count_dynamic_max: 0 + enable_placement: false + machine_type: n2-standard-2 + is_default: true + + - id: slurm_controller + source: community/modules/scheduler/schedmd-slurm-gcp-v5-controller + use: + - network1 + - static_partition + - scratchfs + + - id: slurm_login + source: community/modules/scheduler/schedmd-slurm-gcp-v5-login + use: + - network1 + - slurm_controller + settings: + machine_type: n2-standard-4 diff --git a/docs/hybrid-slurm-cluster/inter-gcp-project-hybrid-slurm.md b/docs/hybrid-slurm-cluster/inter-gcp-project-hybrid-slurm.md new file mode 100644 index 0000000000..1460c42167 --- /dev/null +++ b/docs/hybrid-slurm-cluster/inter-gcp-project-hybrid-slurm.md @@ -0,0 +1,545 @@ +# Hybrid Slurm Cluster Demonstration With GCP Static Cluster + +## Description +These instructions step through the setup and execution of a demo of the HPC +Toolkit hybrid module. In this process you will: + +* Setup networking and internal DNS peering between 2 GCP projects +* Deploy a [static cluster](#deploy-a-static-cluster) that will simulate an + on-premise cluster using the HPC Toolkit and + [SchedMD's Slurm on GCP][slurm-gcp] terraform modules. +* Create and deploy a hybrid deployment directory using the HPC Toolkit +* Run through a few manual steps of integrating the hybrid configurations + created with the hybrid deployment directory. +* Test the new hybrid controller. + +These instructions are provided for demonstration purposes only. This process +may serve as a first step in evaluating the HPC Toolkit's hybrid slurm module +for use with an on-premise slurm-cluster. + +> **Warning:** The [hybrid module][hybridmodule] is in active development and +> the interface is not guaranteed to be static. As the module matures and +> further testing is done, documentation on applying the hybrid module to +> on-premise slurm clusters will be added and expanded. + +[slurm-gcp]: https://github.com/SchedMD/slurm-gcp/tree/v5.1.0 + +## Definitions + +**_static cluster:_** The static cluster will simulate an on-premise slurm cluster +for the purposes of this all-GCP demo. The static cluster will be deployed with +slurm-gcp and optionally have a set of statically created VMs populating it's +local partition. + +**hybrid deployment:** A deployment using the [schedmd-slurm-gcp-v5-hybrid][hybridmodule] +module. The deployment itself includes the hybrid configuration directory as +well as metadata in the cloud bursting project. + +**hybrid configuration directory:** The directory created locally by the +[hybrid module][hybridmodule]. This directory contains the required +configuration files and scripts needed to convert a static cluster to a cloud +hybrid cluster. 
+ +[hybridmodule]: ../../community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md + +**cloud bursting:** Cloud bursting refers to creating new compute VM instances +in the cloud elastically that can be used to complete slurm jobs. + +**compute node:** In this document, a compute node specifically will refer to +the compute VM instances created by the hybrid configuration. + +## More Information +To learn more about the underlying terraform modules that support this demo, you +can visit the [slurm-gcp] repo. Specifically, the hybrid documentation can be +found at [docs/hybrid.md][slurm-gcp-hybrid]. + +## Blueprints + +* [create-networks.yaml] creates VPC networks in 2 projects with IP ranges that + are suitable for setting up bidirectional network peering. These networks will + be used by subequent blueprints. +* [static-cluster.yaml] defines a slurm cluster with 4 static nodes that will be + used to simulate an on-premise slurm cluster. +* [hybrid-configuration.yaml] sets up the hybrid project and creates a hybrid + configuration directory with all required configurations and scripts. + +[create-networks.yaml]: ./blueprints/create-networks.yaml +[static-cluster.yaml]: ./blueprints/static-cluster.yaml +[hybrid-configuration.yaml]: ./blueprints/hybrid-configuration.yaml + +## Debugging Suggestions + +### Logging +The logs from VMs created by the hybrid configuration will be populated under +`/var/log/slurm/*.log`, a selection of pertinent logs are described below: + +* `slurmctld.log`: The logging information for the slurm controller daemon. Any + issues with the config or permissions will be logged here. +* `slurmd.log`: The logging information for the slurm daemon on the compute + nodes. Any issues with the config or permissions on the compute node can be + found here. Note: These logs require SSH'ing to the compute nodes and viewing + them directly. +* `resume.log`: Output from the resume.py script that is used by hybrid + partitions to create the burst VM instances. Any issues creating new compute + VM nodes will be logged here. + +In addition, any startup failures can be tracked through the logs at +`/var/log/messages` for centos/rhel based images and `/var/log/syslog` for +debian/ubuntu based images. Instructions for viewing these logs can be found in +[Google Cloud docs][view-ss-output]. + +[view-ss-output]: https://cloud.google.com/compute/docs/instances/startup-scripts/linux#viewing-output + +### Connectivity Issues +To verify the network and DNS peering setup was successful, you can create a VM +in each project attached to the networks created in these instructions. You can +run ping to verify the settings are correct: + +```shell +.c..internal +``` + +This should succeed in both directions. + +If the ping test doesn’t work, the DNS may not be configured correctly, or the +networks may not be able to peer correctly. If it’s the former, you should be +able to ping the internal IP of the other VM. If you cannot, the firewall rule +or network peering setting are likely not correct. + +## Instructions + +### Before you begin + +#### Select or Create 2 GCP Projects + +This process will require 2 projects: + +* Project A: Where the simulated “On-premise” static slurm cluster will be + deployed. +* Project B: Where the cloud partitions will create new compute VM instances to + complete slurm jobs. + +Identify the 2 projects you intend to use. "Project A" and "Project B" will be +referred to in the rest of this document based on these definitions. 
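For later convenience, the two project IDs are substituted into several commands in this guide. The snippet below is a purely illustrative sketch (these shell variable names are not used by the Toolkit itself); it simply records the IDs once so they can be reused:

```shell
# Illustrative only: record the two project IDs for reuse in later commands.
export PROJECT_A_ID="<project-a-id>"   # Project A: simulated on-premise static cluster
export PROJECT_B_ID="<project-b-id>"   # Project B: cloud burst compute VMs

# Example reuse in the "Create VPC Networks" step later in this document:
# ghpc create docs/hybrid-slurm-cluster/blueprints/create-networks.yaml \
#   --vars project_id="${PROJECT_A_ID}",project_id_compute="${PROJECT_B_ID}"
```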
+ +#### Enable Required APIs + +The following APIs are required to complete this demo: + +* [Compute Engine API][computeapi] +* [Cloud DNS API][clouddnsapi] +* [Filestore API][fileapi] + +[computeapi]: https://cloud.google.com/compute/docs/reference/rest/v1 +[clouddnsapi]: https://cloud.google.com/dns/docs/reference/v1 +[fileapi]: https://cloud.google.com/filestore/docs/reference/rest + +#### Set IAM Roles +The service account attaches to the slurm controller in Project A +([see above](#select-or-create-2-gcp-projects)) +must have the Editor role in +Project A and Project B. If not specified, this will be the +[default compute engine service account][computesa]. + +[computesa]:https://cloud.google.com/compute/docs/access/service-accounts#default_service_account + +#### Dependencies +This demo has the same baseline dependencies as the HPC Toolkit that are +outlined in the main [README.md](../../README.md#dependencies). + +In addition, some pip packages need to be installed locally. Run the following +command to install the pip packages outlined in +[requirements.txt](./requirements.txt): + +```shell +pip install -r docs/hybrid-slurm-cluster/requirements.txt +``` + +#### Build ghpc + +Before you begin, ensure that you have built the `ghpc` tool in the HPC Toolkit. +For more information see the [README.md](../../README.md#quickstart) Quickstart. + +The commands in these instructions assume the ghpc binary is installed in a +directory represented in the PATH environment variable. To ensure this is the +case, run `make install` after building `ghpc`: + +```shell +make +make install +``` + +### Create VPC Networks +A blueprint for creating VPC networks in each project that can support network +and DNS peering can be found at [create-networks.yaml]. This +blueprint will do the following: + +* Create a network named `static-cluster-network` in project A. +* Create a subnetwork of `static-cluster-network` named `primary-subnet` with + an internal IP range of 10.0.0.0/16. +* Create a network named `compute-vpc-network` in project B. +* Create a subnetwork of `compute-vpc-network` named `primary-subnet` with an + internal IP range of 10.1.0.0/16 + +Create a deployment directory for the networks using `ghpc`: + +```shell +ghpc create docs/hybrid-slurm-cluster/blueprints/create-networks.yaml --vars project_id="<>",project_id_compute="<>" +``` + +If successful, this command will provide 3 terraform operations that can be +performed to deploy the deployment directory. They should look similar to the +following: + +```shell +Terraform group was successfully created in directory peering-networks-demo/primary +To deploy, run the following commands: + terraform -chdir=peering-networks-demo/primary init + terraform -chdir=peering-networks-demo/primary validate + terraform -chdir=peering-networks-demo/primary apply +``` + +Execute the terraform commands to deploy the two networks. + +### Allow Peering Between VPC Networks +Bidirectional VPC and DNS peering is needed between both networks created +in the last step. [VPC peering][netpeering] allows internal IP address +connectivity between the projects. [DNS peering][dnspeering] allows resolution +of the fully qualified hostname of instances in the other project in the current +project. + +These instructions will step you through how to set up both of these peering +connections via the [cloud console][console]. 
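The console steps below can typically also be performed with `gcloud`. The commands that follow are only a sketch, shown from Project A's side with placeholder project IDs; mirror them in Project B (swapping project IDs and network names), and note that the DNS peering flags may depend on your `gcloud` version:

```shell
# Sketch only: CLI equivalent of the console steps described below (Project A side).

# VPC peering from static-cluster-network (Project A) to compute-vpc-network (Project B).
gcloud compute networks peerings create hybrid-demo-network-peering \
  --project="<project-a-id>" \
  --network=static-cluster-network \
  --peer-project="<project-b-id>" \
  --peer-network=compute-vpc-network

# Firewall rule allowing ingress from the peered range (same values as the console steps).
gcloud compute firewall-rules create allow-peering-connection \
  --project="<project-a-id>" \
  --network=static-cluster-network \
  --direction=INGRESS \
  --action=ALLOW \
  --source-ranges=10.0.0.0/8 \
  --rules=tcp:0-65532,udp:0-65532,icmp

# Private DNS peering zone so Project A can resolve *.c.<project-b-id>.internal.
gcloud dns managed-zones create hybrid-demo-dns-zone \
  --project="<project-a-id>" \
  --description="DNS peering for the hybrid demo" \
  --dns-name="c.<project-b-id>.internal." \
  --visibility=private \
  --networks=static-cluster-network \
  --target-project="<project-b-id>" \
  --target-network=compute-vpc-network
```

Either way, verify the result afterwards: both peering connections should show as "Active" and the ping test from the [Connectivity Issues](#connectivity-issues) section should succeed in both directions.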
+ +[netpeering]: https://cloud.google.com/vpc/docs/vpc-peering +[dnspeering]: https://cloud.google.com/dns/docs/overview +[console]: https://cloud.google.com/cloud-console + +#### Setup VPC Peering +First, set up VPC peering from Project A to Project B: + +* Navigate to the [VPC Network Peering][netpeeringconsole] page in the GCP + console. +* Click on [Create Peering Connection][createpeering]. +* Click "CONTINUE" if prompted to gather additional information (project ID, IP + ranges, etc) +* Provide the following information: + * **_Name:_** The name of the peering connection, for example + "hybrid-demo-network-peering". + * **_Your VPC Network:_** The name of the VPC network in this project created + in the last step, by default "static-cluster-network" for project A and + "compute-vpc-network" for project B. + * **_Peered VPC Network_** Select "In another project" + * **_Project ID:_** The name of the other project. + * **_VPC network name:_** The name of the VPC network in the other project, + "compute-vpc-network" if creating from project A or + "static-cluster-network" if creating from project B. + * All other fields can be left alone. +* Click "CREATE". + +Repeat these same steps in Project B. + +When complete, both [network peering connections][netpeeringconsole] should show +a green check icon and be listed as "Active". + +Next, set up firewall rules in each project that allow data to pass between the +peered networks. Starting in project A, do the following: + +* Navigate to the [VPC Networks][vpcnetworks] page in the GCP console. +* Click on the network created in the prior step, "static-cluster-network" for + project A and "compute-vpc-network" for project B. +* Click on the tab titled "FIREWALLS". +* Click on "ADD FIREWALL RULE". +* Provide the following information: + * **_Name:_** The name of the firewall rule, for example + "allow-peering-connection". + * **_Network:_** The name of the network, this should already be filled in. + * **_Direction of traffic:_** Ingress + * **_Action on match:_** Allow + * **_Targets:_** All instances in the network + * **_Source filter:_** IPv4 ranges + * **_Source IPv4 ranges:_** 10.0.0.0/8 + * **_Protocols and Ports:_** Specified protocols and ports + * TCP: 0-65532 + * UDP: 0-65532 + * Other: icmp +* Click "CREATE" + +Repeat these same steps in Project B. + +[netpeeringconsole]: https://console.cloud.google.com/networking/peering/list +[createpeering]: https://console.cloud.google.com/networking/peering/add +[vpcnetworks]: https://console.cloud.google.com/networking/networks/list + +#### Setup DNS Peering +First, set up private DNS peering from Project A to Project B: + +* Navigate to the [Cloud DNS][dnszones] page in the GCP console. +* Click on "CREATE ZONE". +* Provide the following information: + * **_Zone Type:_** Private + * **_Zone name:_** The name of the DNS zone, for example + "hybrid-demo-dns-zone". + * **_DNS name:_** `c.<>.internal` replacing `<>` + with the project ID of project B. When adding the zone in project B, the + DNS name will be `c.<>.internal`. + * **_Options:_** DNS Peering + * **_Networks:_** The network created in the prior step in this project, + "static-cluster-network" for project A and "compute-vpc-network" for + project B. + * **_Peer Project:_** The project ID of the other project. + * **_Peer Network:_** The network name created in the last step in the peer + project, "compute-vpc-network" if creating from project A or + "static-cluster-network" if creating from project B. 
+* Click "CREATE" + +Repeat these same steps in Project B. + +[dnszones]: https://console.cloud.google.com/net-services/dns/zones + +### Deploy a Static Cluster + +The blueprint defined by [static-cluster.yaml] in the blueprints directory will +create a new slurm cluster with the following: + +* A pointer to the network created in [Create VPC Networks](#create-vpc-networks) + in project A, "static-cluster-network". +* A new filestore instance that will serve as the local scratch network + filesystem. +* One partition with 4 static nodes (compute VMs that are always up) of machine + type n2-standard-2. This will be the default partition. +* A Slurm controller and login node. + +First, use the HPC Toolkit to create the deployment directory, replacing +"<>" with the ID of your project A: + +```shell +ghpc create docs/hybrid-slurm-cluster/blueprints/static-cluster.yaml --vars project_id="<>" +``` + +If successful, this command will provide 3 terraform operations that can be +performed to deploy the deployment directory. They should look similar to the +following: + +```shell +Terraform group was successfully created in directory peering-networks-demo/primary +To deploy, run the following commands: + terraform -chdir=cluster/primary init + terraform -chdir=cluster/primary validate + terraform -chdir=cluster/primary apply +``` + +Execute the terraform commands to deploy the static Slurm cluster in project A. + +### Use the Cloud HPC Toolkit to Create the Hybrid Deployment Directory +The blueprint for creating a deploying the hybrid configuration can be found in +the blueprints directory as [hybrid-configuration.yaml]. This blueprint defines +a deployment that does the following: + +* Create a pointer to the network in project B created in + [Create VPC Networks](#create-vpc-networks). +* Create a filestore for a cloud scratch network filesystem. +* Create a single partition named "cloud" with a dynamic maximum size of 10 + nodes of machine type n2-standard-2. +* Creates a hybrid configuration using the + [`schedmd-slurm-gcp-v5-hybrid`][hybridmodule] module. This module will do the + following: + * Create a directory at `output_dir` locally containing the hybrid + configuration files and execution scripts. + * Set metadata in project B that inform the burst compute nodes how to + configure themselves. + * Create pubsub actions triggered by changes to the hybrid configuration. + +The following deployment variables in the [hybrid-configuration.yaml] blueprint +will be set based on your configuration via the command line: + +* **_project\_id:_** The ID of project B. +* **_static\_controller\_hostname:_** The fully qualified internal hostname of + the static cluster's controller in project A. The format is + `cluster-controller.c.<>.internal`. + +To create the deployment directory with deployment variables passed through the +command line, run the following command with the updated values for +`<>` and `<>`: + +```shell +ghpc create docs/hybrid-slurm-cluster/blueprints/hybrid-configuration.yaml --vars project_id="<>",static_controller_hostname="cluster-controller.c.<>.internal" +``` + +If successful, this command will provide 3 terraform operations that can be +performed to deploy the deployment directory. 
They should look similar to the +following: + +```shell +Terraform group was successfully created in directory peering-networks-demo/primary +To deploy, run the following commands: + terraform -chdir=hybrid-config/primary init + terraform -chdir=hybrid-config/primary validate + terraform -chdir=hybrid-config/primary apply +``` + +Execute the terraform commands to create the hybrid configuration. A directory +in `hybrid-configuration/primary` named `hyrid/` should be created which +contains a `cloud.conf` file, `cloud_gres.conf` file and a set of support +scripts. + +[hybridmodule]: ../../community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid/README.md + +### Install and Configure Hybrid on the Controller Instance + +> **_NOTE:_** Many of the manual steps in this section have been adapted from the +> hybrid documentation in [Slurm on GCP][slurm-gcp]. The source document can be +> found at [docs/hybrid.md][slurm-gcp-hybrid] + +Now that the hybrid configuration directory has been created, it needs to be +installed on the controller VM instance. First, tar the directory: + +```shell +cd hybrid-config/primary +tar czvf hybrid.tar.gz hybrid +``` + +Copy the `hybrid.tar.gz` file to the controller VM instance. This can be done +in whichever way is easiest for you, `gcloud compute scp` is used here. + +```shell +gcloud compute scp --project="<>" --zone=us-central1-c ./hybrid.tar.gz "cluster-controller:~" +``` + +Now SSH to the controller VM either using the console or the following gcloud +command: + +```shell +gcloud compute ssh --project="<>" --zone=us-central1-c "cluster-controller" +``` + +Decompress the `hybrid.tar.gz` file: + +```shell +sudo tar xzvf hybrid.tar.gz --directory /etc/slurm +rm hybrid.tar.gz +``` + +Set the correct permissions for the hybrid directory and the files contained in +it: + +```shell +sudo chown -R slurm: /etc/slurm/hybrid +sudo chmod 644 /etc/slurm/hybrid/cloud.conf +sudo chmod 755 /etc/slurm/hybrid +``` + +Because the static cluster was also created by [Slurm on GCP][slurm-gcp] +terraform modules, the partition information must be copied from the file +`/etc/slurm/cloud.conf` to the slurm config file at `/etc/slurm/slurm.conf`. The +lines that need to be copied will look similar to the following block: + +```text +NodeName=DEFAULT State=UNKNOWN RealMemory=7552 Boards=1 Sockets=1 CoresPerSocket=1 ThreadsPerCore=1 CPUs=1 +NodeName=cluster-static-ghpc-[0-3] State=CLOUD +NodeSet=cluster-static-ghpc Nodes=cluster-static-ghpc-[0-3] +PartitionName=static Nodes=cluster-static-ghpc State=UP DefMemPerCPU=7552 SuspendTime=300 Oversubscribe=Exclusive Default=YES + +SuspendExcNodes=cluster-static-ghpc-[0-3] +``` + +Depending on the configuration of the static partitions, the `SuspendExcNodes` +may not be included. + +These lines can be copied to the bottom of the `slurm.conf` file. + +Make the following changes to the `/etc/slurm/slurm.conf` file: + +* replace `include cloud.conf` with `include hybrid/cloud.conf` +* Add the fully qualified hostname in parentheses after the controller hostname + in the parameter `SlurmctldHost`. + +```text +# slurm.conf +... +SlurmctldHost=cluster-controller(cluster-controller.c.<>.internal) +... +include hybrid/cloud.conf +... +``` + +Make the following changes to the `/etc/slurm/hybrid/cloud.conf` file: + +* `SlurmctldParameters` + * Remove `cloud_dns` + * Add `cloud_reg_addrs` +* Add `TreeWidth=65533` + +```text +# cloud.conf +... +SlurmctldParameters=idle_on_node_suspend,cloud_reg_addrs +... +TreeWidth=65533 +... 
+``` + +These changes will inform the controller to use the IP of compute nodes to +communicate rather than the hostnames. + +Next, create a new cronjob as the slurm user that will periodically call the +`/etc/slurm/hybrid/slurmsync.py` file. + +```shell +sudo su slurm +crontab -e +``` + +Since the controller was deployed using [Slurm on GCP][slurm-gcp], there will +already be a cronjob pointing to the `slurmsync.py` script in `/etc/slurm/`, +simply update it to the following: + +```text +*/1 * * * * /etc/slurm/hybrid/slurmsync.py +``` + +Exit the editor and the slurm user when complete. + +Finally, restart the slurmctld service to enable the changes made: + +```shell +sudo systemctl restart slurmctld +``` + +If the restart did not succeed, the logs at `/var/log/slurm/slurmctld.log` +should point you in the right direction. + +[slurm-gcp]: https://github.com/SchedMD/slurm-gcp/tree/v5.1.0 +[slurm-gcp-hybrid]: https://github.com/SchedMD/slurm-gcp/blob/v5.1.0/docs/hybrid.md + +### Validate the Hybrid Cluster + +Now that the hybrid configuration has been installed, you can test your new +cloud partition. First off, run `sinfo` to see your partitions listed side by +side: + +```shell +$ sinfo +PARTITION AVAIL TIMELIMIT NODES STATE NODELIST +static* up infinite 4 idle cluster-static-ghpc-[0-3] +compute up infinite 20 idle~ hybridconf-compute-ghpc-[0-19] +debug up infinite 10 idle~ hybridconf-debug-ghpc-[0-9] +``` + +To verify that your local partitions are still active, run a simple test with +`srun`: + +```shell +$ srun -N 1 hostname +cluster-static-ghpc-0 +``` + +Now verify the cloud partition is running with a similar test. Note that since a +node is being created, the same command will take much longer the first time. +Subsequent uses of the cloud nodes before being suspended will be near +instantaneous after the initial startup cost. 
+ +```shell +$ srun -N 1 -p debug hostname +hybridconf-debug-ghpc-0 +``` diff --git a/docs/hybrid-slurm-cluster/requirements.txt b/docs/hybrid-slurm-cluster/requirements.txt new file mode 100644 index 0000000000..a99bcfa21e --- /dev/null +++ b/docs/hybrid-slurm-cluster/requirements.txt @@ -0,0 +1,5 @@ +addict~=2.0 +google-cloud-pubsub~=2.0 +google-api-python-client==2.61.0 +httplib2==0.20.4 +PyYAML==6.0 \ No newline at end of file diff --git a/docs/tutorials/gromacs/spack-gromacs.yaml b/docs/tutorials/gromacs/spack-gromacs.yaml index d69cbc1f06..cbbba9f9d0 100644 --- a/docs/tutorials/gromacs/spack-gromacs.yaml +++ b/docs/tutorials/gromacs/spack-gromacs.yaml @@ -28,19 +28,15 @@ deployment_groups: modules: - id: network1 source: modules/network/pre-existing-vpc - kind: terraform - id: hpc_dash source: modules/monitoring/dashboard - kind: terraform ## Install Scripts - id: spack source: community/modules/scripts/spack-install - kind: terraform settings: install_dir: /apps/spack - spack_url: https://github.com/spack/spack spack_ref: v0.18.0 log_file: /var/log/spack.log configs: @@ -99,7 +95,6 @@ deployment_groups: - id: controller-setup source: modules/scripts/startup-script - kind: terraform settings: runners: - type: shell @@ -143,7 +138,6 @@ deployment_groups: - id: compute_partition source: community/modules/compute/SchedMD-slurm-on-gcp-partition - kind: terraform use: - network1 settings: @@ -152,7 +146,6 @@ deployment_groups: - id: slurm_controller source: community/modules/scheduler/SchedMD-slurm-on-gcp-controller - kind: terraform use: - network1 - compute_partition @@ -162,7 +155,6 @@ deployment_groups: - id: slurm_login source: community/modules/scheduler/SchedMD-slurm-on-gcp-login-node - kind: terraform use: - network1 - slurm_controller diff --git a/docs/tutorials/intel-select/hpc-cluster-intel-select.yaml b/docs/tutorials/intel-select/hpc-cluster-intel-select.yaml index c1ea22ae51..dfe2a9f276 100644 --- a/docs/tutorials/intel-select/hpc-cluster-intel-select.yaml +++ b/docs/tutorials/intel-select/hpc-cluster-intel-select.yaml @@ -27,18 +27,15 @@ deployment_groups: modules: - id: network1 source: modules/network/vpc - kind: terraform - id: homefs source: modules/file-system/filestore - kind: terraform use: [network1] settings: local_mount: /home - id: startup-controller source: modules/scripts/startup-script - kind: terraform settings: runners: - type: shell @@ -50,7 +47,6 @@ deployment_groups: - id: startup-compute source: modules/scripts/startup-script - kind: terraform settings: runners: - type: shell @@ -63,7 +59,6 @@ deployment_groups: # This debug_partition will work out of the box without requesting additional GCP quota. - id: debug_partition source: community/modules/compute/SchedMD-slurm-on-gcp-partition - kind: terraform use: - network1 - homefs @@ -77,7 +72,6 @@ deployment_groups: # This compute_partition is far more performant than debug_partition but may require requesting GCP quotas first. 
- id: compute_partition source: community/modules/compute/SchedMD-slurm-on-gcp-partition - kind: terraform use: - network1 - homefs @@ -87,7 +81,6 @@ deployment_groups: - id: slurm_controller source: community/modules/scheduler/SchedMD-slurm-on-gcp-controller - kind: terraform use: - network1 - homefs @@ -101,7 +94,6 @@ deployment_groups: - id: slurm_login source: community/modules/scheduler/SchedMD-slurm-on-gcp-login-node - kind: terraform use: - network1 - homefs diff --git a/docs/tutorials/openfoam/spack-openfoam.yaml b/docs/tutorials/openfoam/spack-openfoam.yaml index 117c790b77..aa908cbd89 100644 --- a/docs/tutorials/openfoam/spack-openfoam.yaml +++ b/docs/tutorials/openfoam/spack-openfoam.yaml @@ -28,19 +28,15 @@ deployment_groups: modules: - id: network1 source: modules/network/pre-existing-vpc - kind: terraform - id: hpc_dash source: modules/monitoring/dashboard - kind: terraform ## Install Scripts - id: spack source: community/modules/scripts/spack-install - kind: terraform settings: install_dir: /apps/spack - spack_url: https://github.com/spack/spack spack_ref: v0.18.0 log_file: /var/log/spack.log configs: @@ -106,7 +102,6 @@ deployment_groups: - id: controller-setup source: modules/scripts/startup-script - kind: terraform settings: runners: - type: shell @@ -154,7 +149,6 @@ deployment_groups: mpirun -n 60 -npernode 30 -hostfile hostfile simpleFoam -parallel - id: compute_partition source: community/modules/compute/SchedMD-slurm-on-gcp-partition - kind: terraform use: - network1 settings: @@ -163,7 +157,6 @@ deployment_groups: - id: slurm_controller source: community/modules/scheduler/SchedMD-slurm-on-gcp-controller - kind: terraform use: - network1 - compute_partition @@ -173,7 +166,6 @@ deployment_groups: - id: slurm_login source: community/modules/scheduler/SchedMD-slurm-on-gcp-login-node - kind: terraform use: - network1 - slurm_controller diff --git a/docs/tutorials/wrfv3/spack-wrfv3.yaml b/docs/tutorials/wrfv3/spack-wrfv3.yaml index bfe67504b3..96b5ee7eb6 100644 --- a/docs/tutorials/wrfv3/spack-wrfv3.yaml +++ b/docs/tutorials/wrfv3/spack-wrfv3.yaml @@ -28,19 +28,15 @@ deployment_groups: modules: - id: network1 source: modules/network/pre-existing-vpc - kind: terraform - id: hpc_dash source: modules/monitoring/dashboard - kind: terraform ## Install Scripts - id: spack source: community/modules/scripts/spack-install - kind: terraform settings: install_dir: /apps/spack - spack_url: https://github.com/spack/spack spack_ref: v0.18.0 log_file: /var/log/spack.log configs: @@ -99,7 +95,6 @@ deployment_groups: - id: controller-setup source: modules/scripts/startup-script - kind: terraform settings: runners: - type: shell @@ -141,7 +136,6 @@ deployment_groups: - id: compute_partition source: community/modules/compute/SchedMD-slurm-on-gcp-partition - kind: terraform use: - network1 settings: @@ -150,7 +144,6 @@ deployment_groups: - id: slurm_controller source: community/modules/scheduler/SchedMD-slurm-on-gcp-controller - kind: terraform use: - network1 - compute_partition @@ -160,7 +153,6 @@ deployment_groups: - id: slurm_login source: community/modules/scheduler/SchedMD-slurm-on-gcp-login-node - kind: terraform use: - network1 - slurm_controller diff --git a/examples/README.md b/examples/README.md index e9a74af013..d1ef417c33 100644 --- a/examples/README.md +++ b/examples/README.md @@ -588,7 +588,7 @@ deployment_groups: # Local source, prefixed with ./ (/ and ../ also accepted) - id: # Required: Name of this module used to uniquely identify it. 
source: ./modules/role/module-name # Required: Points to the module directory. - kind: < terraform | packer > # Required: Type of module, currently choose from terraform or packer. + kind: < terraform | packer > # Optional: Type of module, currently choose from terraform or packer. If not specified, `kind` will default to `terraform` # Optional: All configured settings for the module. For terraform, each # variable listed in variables.tf can be set here, and are mandatory if no # default was provided and are not defined elsewhere (like the top-level vars) diff --git a/examples/hpc-cluster-high-io.yaml b/examples/hpc-cluster-high-io.yaml index fd7b82a552..05f2f3a630 100644 --- a/examples/hpc-cluster-high-io.yaml +++ b/examples/hpc-cluster-high-io.yaml @@ -33,18 +33,15 @@ deployment_groups: # Example - ./modules/network/pre-existing-vpc - id: network1 source: modules/network/pre-existing-vpc - kind: terraform - id: homefs source: modules/file-system/filestore - kind: terraform use: [network1] settings: local_mount: /home - id: projectsfs source: modules/file-system/filestore - kind: terraform use: [network1] settings: filestore_tier: HIGH_SCALE_SSD @@ -53,14 +50,12 @@ deployment_groups: - id: scratchfs source: community/modules/file-system/DDN-EXAScaler - kind: terraform use: [network1] settings: local_mount: /scratch - id: low_cost_partition source: community/modules/compute/SchedMD-slurm-on-gcp-partition - kind: terraform use: - network1 - homefs @@ -76,7 +71,6 @@ deployment_groups: # This compute_partition is far more performant than low_cost_partition. - id: compute_partition source: community/modules/compute/SchedMD-slurm-on-gcp-partition - kind: terraform use: - network1 - homefs @@ -88,7 +82,6 @@ deployment_groups: - id: slurm_controller source: community/modules/scheduler/SchedMD-slurm-on-gcp-controller - kind: terraform use: - network1 - homefs @@ -102,7 +95,6 @@ deployment_groups: - id: slurm_login source: community/modules/scheduler/SchedMD-slurm-on-gcp-login-node - kind: terraform use: - network1 - homefs @@ -114,5 +106,4 @@ deployment_groups: - id: hpc_dashboard source: modules/monitoring/dashboard - kind: terraform outputs: [instructions] diff --git a/examples/image-builder.yaml b/examples/image-builder.yaml index 6893c612cf..38412d2125 100644 --- a/examples/image-builder.yaml +++ b/examples/image-builder.yaml @@ -32,10 +32,8 @@ deployment_groups: modules: - id: network1 source: modules/network/vpc - kind: terraform - id: scripts_for_image source: modules/scripts/startup-script - kind: terraform settings: runners: - type: shell @@ -60,10 +58,8 @@ deployment_groups: modules: - id: cluster-network source: modules/network/pre-existing-vpc - kind: terraform - id: compute_partition source: community/modules/compute/SchedMD-slurm-on-gcp-partition - kind: terraform use: [cluster-network] settings: partition_name: compute @@ -73,7 +69,6 @@ deployment_groups: project: $(vars.project_id) - id: slurm_controller source: community/modules/scheduler/SchedMD-slurm-on-gcp-controller - kind: terraform use: [cluster-network, compute_partition] settings: login_node_count: 1 @@ -82,7 +77,6 @@ deployment_groups: project: $(vars.project_id) - id: slurm_login source: community/modules/scheduler/SchedMD-slurm-on-gcp-login-node - kind: terraform use: [cluster-network, slurm_controller] settings: instance_image: diff --git a/ghpc.go b/ghpc.go index b4443c2642..2f676fe10d 100644 --- a/ghpc.go +++ b/ghpc.go @@ -25,8 +25,16 @@ import ( //go:embed modules community/modules var moduleFS embed.FS +// Git 
references when use Makefile +var gitTagVersion string +var gitBranch string +var gitCommitInfo string + func main() { sourcereader.ModuleFS = moduleFS + cmd.GitTagVersion = gitTagVersion + cmd.GitBranch = gitBranch + cmd.GitCommitInfo = gitCommitInfo if err := cmd.Execute(); err != nil { os.Exit(1) } diff --git a/go.mod b/go.mod index ea86b39003..9dcee3402c 100644 --- a/go.mod +++ b/go.mod @@ -3,7 +3,8 @@ module hpc-toolkit go 1.16 require ( - cloud.google.com/go/compute v1.9.0 + cloud.google.com/go/compute v1.10.0 + cloud.google.com/go/storage v1.26.0 // indirect github.com/hashicorp/go-getter v1.6.2 github.com/hashicorp/hcl v1.0.0 // indirect github.com/hashicorp/hcl/v2 v2.14.0 @@ -16,7 +17,7 @@ require ( github.com/spf13/afero v1.9.2 github.com/spf13/cobra v1.5.0 github.com/zclconf/go-cty v1.10.0 - google.golang.org/genproto v0.0.0-20220804142021-4e6b2dfa6612 + google.golang.org/genproto v0.0.0-20220915135415-7fd63a7952de gopkg.in/check.v1 v1.0.0-20200227125254-8fa46927fb4f gopkg.in/yaml.v3 v3.0.1 ) diff --git a/go.sum b/go.sum index 126b63bd0e..f485fbc57a 100644 --- a/go.sum +++ b/go.sum @@ -32,6 +32,7 @@ cloud.google.com/go v0.100.2/go.mod h1:4Xra9TjzAeYHrl5+oeLlzbM2k3mjVhZh4UqTZ//w9 cloud.google.com/go v0.102.0/go.mod h1:oWcCzKlqJ5zgHQt9YsaeTY9KzIvjyy0ArmiBUgpQ+nc= cloud.google.com/go v0.102.1 h1:vpK6iQWv/2uUeFJth4/cBHsQAGjn1iIE6AAlxipRaA0= cloud.google.com/go v0.102.1/go.mod h1:XZ77E9qnTEnrgEOvr4xzfdX5TRo7fB4T2F4O6+34hIU= +cloud.google.com/go/asset v1.5.0/go.mod h1:5mfs8UvcM5wHhqtSv8J1CtxxaQq3AdBxxQi2jGW/K4o= cloud.google.com/go/bigquery v1.0.1/go.mod h1:i/xbL2UlR5RvWAURpBYZTtm/cXjCha9lbfbpx4poX+o= cloud.google.com/go/bigquery v1.3.0/go.mod h1:PjpwJnslEMmckchkHFfq+HTD2DmtT67aNFKH1/VBDHE= cloud.google.com/go/bigquery v1.4.0/go.mod h1:S8dzgnTigyfTmLBfrtrhyYhwRxG72rYxvftPBK2Dvzc= @@ -44,8 +45,8 @@ cloud.google.com/go/compute v1.5.0/go.mod h1:9SMHyhJlzhlkJqrPAc839t2BZFTSk6Jdj6m cloud.google.com/go/compute v1.6.0/go.mod h1:T29tfhtVbq1wvAPo0E3+7vhgmkOYeXjhFvz/FMzPu0s= cloud.google.com/go/compute v1.6.1/go.mod h1:g85FgpzFvNULZ+S8AYq87axRKuf2Kh7deLqV/jJ3thU= cloud.google.com/go/compute v1.7.0/go.mod h1:435lt8av5oL9P3fv1OEzSbSUe+ybHXGMPQHHZWZxy9U= -cloud.google.com/go/compute v1.9.0 h1:ED/FP4xv8GJw63v556/ASNc1CeeLUO2Bs8nzaHchkHg= -cloud.google.com/go/compute v1.9.0/go.mod h1:lWv1h/zUWTm/LozzfTJhBSkd6ShQq8la8VeeuOEGxfY= +cloud.google.com/go/compute v1.10.0 h1:aoLIYaA1fX3ywihqpBk2APQKOo20nXsp1GEZQbx5Jk4= +cloud.google.com/go/compute v1.10.0/go.mod h1:ER5CLbMxl90o2jtNbGSbtfOpQKR0t15FOtRsugnLrlU= cloud.google.com/go/datastore v1.0.0/go.mod h1:LXYbyblFSglQ5pkeyhO+Qmw7ukd3C+pD7TKLgZqpHYE= cloud.google.com/go/datastore v1.1.0/go.mod h1:umbIZjpQpHh4hmRpGhH4tLFup+FVzqBi1b3c64qFpCk= cloud.google.com/go/iam v0.3.0 h1:exkAomrVUuzx9kWFI1wm3KI0uoDeUFPB4kKGzx6x+Gc= @@ -54,14 +55,16 @@ cloud.google.com/go/pubsub v1.0.1/go.mod h1:R0Gpsv3s54REJCy4fxDixWD93lHJMoZTyQ2k cloud.google.com/go/pubsub v1.1.0/go.mod h1:EwwdRX2sKPjnvnqCa270oGRyludottCI76h+R3AArQw= cloud.google.com/go/pubsub v1.2.0/go.mod h1:jhfEVHT8odbXTkndysNHCcx0awwzvfOlguIAii9o8iA= cloud.google.com/go/pubsub v1.3.1/go.mod h1:i+ucay31+CNRpDW4Lu78I4xXG+O1r/MAHgjpRVR+TSU= +cloud.google.com/go/security v1.5.0/go.mod h1:lgxGdyOKKjHL4YG3/YwIL2zLqMFCKs0UbQwgyZmfJl4= cloud.google.com/go/storage v1.0.0/go.mod h1:IhtSnM/ZTZV8YYJWCY8RULGVqBDmpoyjwiyrjsg+URw= cloud.google.com/go/storage v1.5.0/go.mod h1:tpKbwo567HUNpVclU5sGELwQWBDZ8gh0ZeosJ0Rtdos= cloud.google.com/go/storage v1.6.0/go.mod h1:N7U0C8pVQ/+NIKOBQyamJIeKQKkZ+mxpohlUTyfDhBk= 
cloud.google.com/go/storage v1.8.0/go.mod h1:Wv1Oy7z6Yz3DshWRJFhqM/UCfaWIRTdp0RXyy7KQOVs= cloud.google.com/go/storage v1.10.0/go.mod h1:FLPqc6j+Ki4BU591ie1oL6qBQGu2Bl/tZ9ullr3+Kg0= cloud.google.com/go/storage v1.14.0/go.mod h1:GrKmX003DSIwi9o29oFT7YDnHYwZoctc3fOKtUw0Xmo= -cloud.google.com/go/storage v1.22.1 h1:F6IlQJZrZM++apn9V5/VfS3gbTUYg98PS3EMQAzqtfg= cloud.google.com/go/storage v1.22.1/go.mod h1:S8N1cAStu7BOeFfE8KAQzmyyLkK8p/vmRq6kuBTW58Y= +cloud.google.com/go/storage v1.26.0 h1:lYAGjknyDJirSzfwUlkv4Nsnj7od7foxQNH/fqZqles= +cloud.google.com/go/storage v1.26.0/go.mod h1:mk/N7YwIKEWyTvXAWQCIeiCTdLoRH6Pd5xmSnolQLTI= dmitri.shuralyov.com/gpu/mtl v0.0.0-20190408044501-666a987793e9/go.mod h1:H6x//7gZCb22OMCxBHrMx7a5I7Hp++hsVxbQ4BYO7hU= github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym/WlBOVXweHU+Q+/VP0lqqI8lqeDx9IjBqo= @@ -205,7 +208,6 @@ github.com/googleapis/gax-go/v2 v2.2.0/go.mod h1:as02EH8zWkzwUoLbBaFeQ+arQaj/Oth github.com/googleapis/gax-go/v2 v2.3.0/go.mod h1:b8LNqSzNabLiUpXKkY7HAR5jr6bIT99EXz9pXxye9YM= github.com/googleapis/gax-go/v2 v2.4.0 h1:dS9eYAjhrE2RjmzYw2XAPvcXfmcQLtFEQWn0CR82awk= github.com/googleapis/gax-go/v2 v2.4.0/go.mod h1:XOTVJ59hdnfJLIP/dh8n5CGryZR2LxK9wbMD5+iXC6c= -github.com/googleapis/go-type-adapters v1.0.0 h1:9XdMn+d/G57qq1s8dNc5IesGCXHf6V2HZ2JwRxfA2tA= github.com/googleapis/go-type-adapters v1.0.0/go.mod h1:zHW75FOG2aur7gAO2B+MLby+cLsWGBF62rFAi7WjWO4= github.com/googleapis/google-cloud-go-testing v0.0.0-20200911160855-bcd43fbb19e8/go.mod h1:dvDLG8qkwmyD9a/MJJN3XJcT3xFxOKAvTZGvuZmac9g= github.com/grpc-ecosystem/grpc-gateway v1.16.0/go.mod h1:BDjrQk3hbvj6Nolgz8mAMFbcEtjT1g+wF4CSlocrBnw= @@ -409,8 +411,9 @@ golang.org/x/net v0.0.0-20220325170049-de3da57026de/go.mod h1:CfG3xpIq0wQ8r1q4Su golang.org/x/net v0.0.0-20220412020605-290c469a71a5/go.mod h1:CfG3xpIq0wQ8r1q4Su4UZFWDARRcnwPjda9FqA0JpMk= golang.org/x/net v0.0.0-20220425223048-2871e0cb64e4/go.mod h1:CfG3xpIq0wQ8r1q4Su4UZFWDARRcnwPjda9FqA0JpMk= golang.org/x/net v0.0.0-20220607020251-c690dde0001d/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= -golang.org/x/net v0.0.0-20220624214902-1bab6f366d9e h1:TsQ7F31D3bUCLeqPT0u+yjp1guoArKaNKmCr22PYgTQ= golang.org/x/net v0.0.0-20220624214902-1bab6f366d9e/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= +golang.org/x/net v0.0.0-20220909164309-bea034e7d591 h1:D0B/7al0LLrVC8aWF4+oxpv/m8bc7ViFfVS8/gXGdqI= +golang.org/x/net v0.0.0-20220909164309-bea034e7d591/go.mod h1:YDH+HFinaLZZlnHAfSS6ZXJJ9M9t4Dl22yv3iI2vPwk= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= @@ -431,8 +434,10 @@ golang.org/x/oauth2 v0.0.0-20220223155221-ee480838109b/go.mod h1:DAh4E804XQdzx2j golang.org/x/oauth2 v0.0.0-20220309155454-6242fa91716a/go.mod h1:DAh4E804XQdzx2j+YRIaUnCqCV2RuMz24cGBJ5QYIrc= golang.org/x/oauth2 v0.0.0-20220411215720-9780585627b5/go.mod h1:DAh4E804XQdzx2j+YRIaUnCqCV2RuMz24cGBJ5QYIrc= golang.org/x/oauth2 v0.0.0-20220608161450-d0670ef3b1eb/go.mod h1:jaDAt6Dkxork7LmZnYtzbRWj0W47D86a3TGe0YHBvmE= -golang.org/x/oauth2 v0.0.0-20220622183110-fd043fe589d2 h1:+jnHzr9VPj32ykQVai5DNahi9+NSp7yYuCsl5eAQtL0= golang.org/x/oauth2 v0.0.0-20220622183110-fd043fe589d2/go.mod 
h1:jaDAt6Dkxork7LmZnYtzbRWj0W47D86a3TGe0YHBvmE= +golang.org/x/oauth2 v0.0.0-20220822191816-0ebed06d0094/go.mod h1:h4gKUeWbJ4rQPri7E0u6Gs4e9Ri2zaLxzw5DI5XGrYg= +golang.org/x/oauth2 v0.0.0-20220909003341-f21342109be1 h1:lxqLZaMad/dJHMFZH0NiNpiEZI/nhgWhe4wgzpE+MuA= +golang.org/x/oauth2 v0.0.0-20220909003341-f21342109be1/go.mod h1:h4gKUeWbJ4rQPri7E0u6Gs4e9Ri2zaLxzw5DI5XGrYg= golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= @@ -507,8 +512,9 @@ golang.org/x/sys v0.0.0-20220503163025-988cb79eb6c6/go.mod h1:oPkhp1MJrh7nUepCBc golang.org/x/sys v0.0.0-20220517195934-5e4e11fc645e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220610221304-9f5ed59c137d/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.0.0-20220624220833-87e55d714810 h1:rHZQSjJdAI4Xf5Qzeh2bBc5YJIkPFVM6oDtMFYmgws0= golang.org/x/sys v0.0.0-20220624220833-87e55d714810/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220728004956-3c1f35247d10 h1:WIoqL4EROvwiPdUtaip4VcDdpZ4kha7wBWZrbVKCIZg= +golang.org/x/sys v0.0.0-20220728004956-3c1f35247d10/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= golang.org/x/text v0.0.0-20170915032832-14c0d48ead0c/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= @@ -623,8 +629,10 @@ google.golang.org/api v0.75.0/go.mod h1:pU9QmyHLnzlpar1Mjt4IbapUCy8J+6HD6GeELN69 google.golang.org/api v0.78.0/go.mod h1:1Sg78yoMLOhlQTeF+ARBoytAcH1NNyyl390YMy6rKmw= google.golang.org/api v0.80.0/go.mod h1:xY3nI94gbvBrE0J6NHXhxOmW97HG7Khjkku6AFB3Hyg= google.golang.org/api v0.84.0/go.mod h1:NTsGnUFJMYROtiquksZHBWtHfeMC7iYthki7Eq3pa8o= -google.golang.org/api v0.91.0 h1:731+JzuwaJoZXRQGmPoBiV+SrsAfUaIkdMCWTcQNPyA= -google.golang.org/api v0.91.0/go.mod h1:+Sem1dnrKlrXMR/X0bPnMWyluQe4RsNoYfmNLhOIkzw= +google.golang.org/api v0.93.0/go.mod h1:+Sem1dnrKlrXMR/X0bPnMWyluQe4RsNoYfmNLhOIkzw= +google.golang.org/api v0.94.0/go.mod h1:eADj+UBuxkh5zlrSntJghuNeg8HwQ1w5lTKkuqaETEI= +google.golang.org/api v0.96.0 h1:F60cuQPJq7K7FzsxMYHAUJSiXh2oKctHxBMbDygxhfM= +google.golang.org/api v0.96.0/go.mod h1:w7wJQLTM+wvQpNf5JyEcBoxK0RH7EDrh/L4qfsuJ13s= google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= google.golang.org/appengine v1.5.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= @@ -715,8 +723,10 @@ google.golang.org/genproto v0.0.0-20220608133413-ed9918b62aac/go.mod h1:KEWEmljW google.golang.org/genproto v0.0.0-20220616135557-88e70c0c3a90/go.mod h1:KEWEmljWE5zPzLBa/oHl6DaEt9LmfH6WtH1OHIvleBA= google.golang.org/genproto v0.0.0-20220617124728-180714bec0ad/go.mod h1:KEWEmljWE5zPzLBa/oHl6DaEt9LmfH6WtH1OHIvleBA= google.golang.org/genproto v0.0.0-20220624142145-8cd45d7dbd1f/go.mod h1:KEWEmljWE5zPzLBa/oHl6DaEt9LmfH6WtH1OHIvleBA= -google.golang.org/genproto v0.0.0-20220804142021-4e6b2dfa6612 h1:NX3L5YesD5qgxxrPHdKqHH38Ao0AG6poRXG+JljPsGU= 
-google.golang.org/genproto v0.0.0-20220804142021-4e6b2dfa6612/go.mod h1:iHe1svFLAZg9VWz891+QbRMwUv9O/1Ww+/mngYeThbc= +google.golang.org/genproto v0.0.0-20220810155839-1856144b1d9c/go.mod h1:dbqgFATTzChvnt+ujMdZwITVAJHFtfyN1qUhDqEiIlk= +google.golang.org/genproto v0.0.0-20220815135757-37a418bb8959/go.mod h1:dbqgFATTzChvnt+ujMdZwITVAJHFtfyN1qUhDqEiIlk= +google.golang.org/genproto v0.0.0-20220915135415-7fd63a7952de h1:5ANeKFmGdtiputJJYeUVg8nTGA/1bEirx4CgzcnPSx8= +google.golang.org/genproto v0.0.0-20220915135415-7fd63a7952de/go.mod h1:0Nb8Qy+Sk5eDzHnzlStwW3itdNaWoZA5XeSG+R3JHSo= google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c= google.golang.org/grpc v1.20.1/go.mod h1:10oTOabMzJvdu6/UiuZezV6QK5dSlG84ov/aaiqXj38= google.golang.org/grpc v1.21.1/go.mod h1:oYelfM1adQP15Ek0mdvEgi9Df8B9CZIaU1084ijfRaM= diff --git a/modules/README.md b/modules/README.md index ffeec77612..ace4baa580 100644 --- a/modules/README.md +++ b/modules/README.md @@ -200,7 +200,6 @@ example, the following code is using the embedded pre-existing-vpc module: ```yaml - id: network1 source: modules/network/pre-existing-vpc - kind: terraform ``` #### Local Modules @@ -213,7 +212,6 @@ following module definition refers the local pre-existing-vpc modules. ```yaml - id: network1 source: ./modules/network/pre-existing-vpc - kind: terraform ``` > **_NOTE:_** This example would have to be run from the HPC Toolkit repository @@ -232,7 +230,6 @@ Get module from GitHub over SSH: ```yaml - id: network1 source: git@github.com:GoogleCloudPlatform/hpc-toolkit.git//modules/network/vpc - kind: terraform ``` Get module from GitHub over HTTPS: @@ -240,7 +237,6 @@ Get module from GitHub over HTTPS: ```yaml - id: network1 source: github.com/GoogleCloudPlatform/hpc-toolkit//modules/network/vpc - kind: terraform ``` Both examples above use the [double-slash notation][tfsubdir] (`//`) to indicate @@ -256,7 +252,6 @@ Toolkit vpc module, use: ```yaml - id: network1 source: github.com/GoogleCloudPlatform/hpc-toolkit//modules/network/vpc?ref=develop - kind: terraform ``` [tfrev]: https://www.terraform.io/language/modules/sources#selecting-a-revision @@ -264,6 +259,14 @@ Toolkit vpc module, use: [tfsubdir]: https://www.terraform.io/language/modules/sources#modules-in-package-sub-directories [daos-cluster.yaml]: ../community/examples/intel/daos-cluster.yaml +#### Generic Git Modules +To use a Terraform module available in a non-GitHub git repository such as +gitlab, set the source to a path starting `git::`. Two Standard git protocols +are supported, `git::https://` for HTTPS or `git::git@github.com` for SSH. + +Additional formatting and features after `git::` are identical to that of the +[GitHub Modules](#github-modules) described above. + ### Kind (May be Required) `kind` refers to the way in which a module is deployed. Currently, `kind` can be @@ -294,11 +297,9 @@ the used module's output. For example, see the following blueprint snippet: modules: - id: network1 source: modules/network/vpc - kind: terraform - id: workstation source: modules/compute/vm-instance - kind: terraform use: [network1] settings: ... diff --git a/modules/compute/vm-instance/README.md b/modules/compute/vm-instance/README.md index a3dc343698..eba02d5da0 100644 --- a/modules/compute/vm-instance/README.md +++ b/modules/compute/vm-instance/README.md @@ -8,7 +8,6 @@ This module creates one or more ```yaml - id: compute source: modules/compute/vm-instance - kind: terraform use: [network1] settings: instance_count: 8 @@ -134,13 +133,14 @@ No modules. 
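The Generic Git Modules section introduced above describes `git::`-prefixed sources in prose only; a hedged illustration of what such a blueprint entry could look like (the GitLab URL is a hypothetical placeholder, not a repository shipped with the Toolkit):

```yaml
# Hypothetical example: fetching the Toolkit vpc module from a GitLab mirror
# over HTTPS, pinned to a revision with the usual go-getter ?ref= notation.
- id: network1
  source: git::https://gitlab.com/example-org/hpc-toolkit.git//modules/network/vpc?ref=v1.6.0
```

The `//` sub-directory and `?ref=` revision selectors behave the same way as in the GitHub examples earlier in this README.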
| Name | Description | Type | Default | Required |
|------|-------------|------|---------|:--------:|
+| [auto\_delete\_boot\_disk](#input\_auto\_delete\_boot\_disk) | Controls if boot disk should be auto-deleted when instance is deleted. | `bool` | `true` | no |
| [bandwidth\_tier](#input\_bandwidth\_tier) | Tier 1 bandwidth increases the maximum egress bandwidth for VMs.<br>Using the `tier_1_enabled` setting will enable both gVNIC and TIER\_1 higher bandwidth networking.<br>Using the `gvnic_enabled` setting will only enable gVNIC and will not enable TIER\_1.<br>Note that TIER\_1 only works with specific machine families & shapes and must be using an image that supports gVNIC. See [official docs](https://cloud.google.com/compute/docs/networking/configure-vm-with-high-bandwidth-configuration) for more details. | `string` | `"not_enabled"` | no |
| [deployment\_name](#input\_deployment\_name) | Name of the deployment, used to name the cluster | `string` | n/a | yes |
| [disable\_public\_ips](#input\_disable\_public\_ips) | If set to true, instances will not have public IPs | `bool` | `false` | no |
| [disk\_size\_gb](#input\_disk\_size\_gb) | Size of disk for instances. | `number` | `200` | no |
| [disk\_type](#input\_disk\_type) | Disk type for instances. | `string` | `"pd-standard"` | no |
| [enable\_oslogin](#input\_enable\_oslogin) | Enable or Disable OS Login with "ENABLE" or "DISABLE". Set to "INHERIT" to inherit project OS Login setting. | `string` | `"ENABLE"` | no |
-| [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. | <pre>list(object({<br>  type = string,<br>  count = number<br>}))</pre> | `[]` | no |
+| [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. | <pre>list(object({<br>  type = string,<br>  count = number<br>}))</pre> | `null` | no |
| [instance\_count](#input\_instance\_count) | Number of instances | `number` | `1` | no |
| [instance\_image](#input\_instance\_image) | Instance Image | <pre>object({<br>  family = string,<br>  project = string<br>})</pre> | <pre>{<br>  "family": "hpc-centos-7",<br>  "project": "cloud-hpc-image-public"<br>}</pre> | no |
| [labels](#input\_labels) | Labels to add to the instances. List key, value pairs. | `any` | n/a | yes |
@@ -154,7 +154,7 @@
| [on\_host\_maintenance](#input\_on\_host\_maintenance) | Describes maintenance behavior for the instance. If left blank this will default to `MIGRATE` except for when `placement_policy`, spot provisioning, or GPUs require it to be `TERMINATE` | `string` | `null` | no |
| [placement\_policy](#input\_placement\_policy) | Control where your VM instances are physically located relative to each other within a zone. | <pre>object({<br>  vm_count = number,<br>  availability_domain_count = number,<br>  collocation = string,<br>})</pre> | `null` | no |
| [project\_id](#input\_project\_id) | Project in which the HPC deployment will be created | `string` | n/a | yes |
-| [service\_account](#input\_service\_account) | Service account to attach to the instance. See https://www.terraform.io/docs/providers/google/r/compute_instance_template.html#service_account. | <pre>object({<br>  email = string,<br>  scopes = set(string)<br>})</pre> | <pre>{<br>  "email": null,<br>  "scopes": [<br>    "https://www.googleapis.com/auth/devstorage.read_only",<br>    "https://www.googleapis.com/auth/logging.write",<br>    "https://www.googleapis.com/auth/monitoring.write",<br>    "https://www.googleapis.com/auth/servicecontrol",<br>    "https://www.googleapis.com/auth/service.management.readonly",<br>    "https://www.googleapis.com/auth/trace.append"<br>  ]<br>}</pre> | no |
+| [service\_account](#input\_service\_account) | Service account to attach to the instance. See https://www.terraform.io/docs/providers/google/r/compute_instance_template.html#service_account. | <pre>object({<br>  email = string,<br>  scopes = set(string)<br>})</pre> | <pre>{<br>  "email": null,<br>  "scopes": [<br>    "https://www.googleapis.com/auth/devstorage.read_write",<br>    "https://www.googleapis.com/auth/logging.write",<br>    "https://www.googleapis.com/auth/monitoring.write",<br>    "https://www.googleapis.com/auth/servicecontrol",<br>    "https://www.googleapis.com/auth/service.management.readonly",<br>    "https://www.googleapis.com/auth/trace.append"<br>  ]<br>}</pre> | no |
| [spot](#input\_spot) | Provision VMs using discounted Spot pricing, allowing for preemption | `bool` | `false` | no |
| [startup\_script](#input\_startup\_script) | Startup script used on the instance | `string` | `null` | no |
| [subnetwork\_self\_link](#input\_subnetwork\_self\_link) | The self link of the subnetwork to attach the VM. | `string` | `null` | no |
diff --git a/modules/compute/vm-instance/main.tf b/modules/compute/vm-instance/main.tf index 8dc362db09..f00d8f8f89 100644 --- a/modules/compute/vm-instance/main.tf +++ b/modules/compute/vm-instance/main.tf @@ -31,7 +31,7 @@ locals { # compact_placement : true when placement policy is provided and collocation set; false if unset compact_placement = try(var.placement_policy.collocation, null) != null - gpu_attached = contains(["a2"], local.machine_family) || length(var.guest_accelerator) > 0 + gpu_attached = contains(["a2"], local.machine_family) || var.guest_accelerator != null # both of these must be false if either compact placement or preemptible/spot instances are used # automatic restart is tolerant of GPUs while on host maintenance is not @@ -112,7 +112,7 @@ resource "google_compute_instance" "compute_vm" { boot_disk { source = google_compute_disk.boot_disk[count.index].self_link device_name = google_compute_disk.boot_disk[count.index].name - auto_delete = true + auto_delete = var.auto_delete_boot_disk } dynamic "scratch_disk" { diff --git a/modules/compute/vm-instance/variables.tf b/modules/compute/vm-instance/variables.tf index 2cbdceaf10..307baf708f 100644 --- a/modules/compute/vm-instance/variables.tf +++ b/modules/compute/vm-instance/variables.tf @@ -49,6 +49,12 @@ variable "disk_type" { default = "pd-standard" } +variable "auto_delete_boot_disk" { + description = "Controls if boot disk should be auto-deleted when instance is deleted." + type = bool + default = true +} + variable "local_ssd_count" { description = "The number of local SSDs to attach to each VM. See https://cloud.google.com/compute/docs/disks/local-ssd."
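The `auto_delete_boot_disk` input added to vm-instance above, together with the `guest_accelerator` default changing from `[]` to `null`, can be exercised from a blueprint. A minimal sketch, assuming a pre-existing `network1` module; the module id, machine type, and accelerator choice are illustrative only:

```yaml
# Illustrative only: keep the boot disk around after the VM is destroyed and
# attach one NVIDIA T4 explicitly rather than relying on the machine family.
- id: gpu-workstation
  source: modules/compute/vm-instance
  use: [network1]
  settings:
    machine_type: n1-standard-8
    auto_delete_boot_disk: false
    guest_accelerator:
    - type: nvidia-tesla-t4
      count: 1
```

Leaving `guest_accelerator` unset now yields `null`, which the updated `gpu_attached` local treats the same way the old empty-list default was treated: no GPUs beyond those implied by the machine family.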
type = number @@ -109,7 +115,7 @@ variable "service_account" { }) default = { email = null - scopes = ["https://www.googleapis.com/auth/devstorage.read_only", + scopes = ["https://www.googleapis.com/auth/devstorage.read_write", "https://www.googleapis.com/auth/logging.write", "https://www.googleapis.com/auth/monitoring.write", "https://www.googleapis.com/auth/servicecontrol", @@ -153,7 +159,7 @@ variable "guest_accelerator" { type = string, count = number })) - default = [] + default = null } variable "on_host_maintenance" { diff --git a/modules/compute/vm-instance/versions.tf b/modules/compute/vm-instance/versions.tf index 6a3d002173..1cf5dad238 100644 --- a/modules/compute/vm-instance/versions.tf +++ b/modules/compute/vm-instance/versions.tf @@ -27,10 +27,10 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:vm-instance/v1.5.0" + module_name = "blueprints/terraform/hpc-toolkit:vm-instance/v1.6.0" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:vm-instance/v1.5.0" + module_name = "blueprints/terraform/hpc-toolkit:vm-instance/v1.6.0" } required_version = ">= 0.14.0" diff --git a/modules/file-system/filestore/README.md b/modules/file-system/filestore/README.md index 5eddd6d6d2..e3359e2d0f 100644 --- a/modules/file-system/filestore/README.md +++ b/modules/file-system/filestore/README.md @@ -48,7 +48,6 @@ The Filestore instance defined below will have the following attributes: ```yaml - id: homefs source: modules/file-system/filestore - kind: terraform use: [network1] settings: local_mount: /home @@ -67,7 +66,6 @@ The Filestore instance defined below will have the following attributes: ```yaml - id: highscale source: modules/file-system/filestore - kind: terraform use: [network1] settings: filestore_tier: HIGH_SCALE_SSD diff --git a/modules/file-system/filestore/scripts/mount.yaml b/modules/file-system/filestore/scripts/mount.yaml index b39a2f4adb..f7fbe58d5e 100644 --- a/modules/file-system/filestore/scripts/mount.yaml +++ b/modules/file-system/filestore/scripts/mount.yaml @@ -22,14 +22,14 @@ url: "http://metadata.google.internal/computeMetadata/v1/instance/attributes" tasks: - name: Read metadata network_storage information - uri: + ansible.builtin.uri: url: "{{ url }}/{{ meta_key }}" method: GET headers: Metadata-Flavor: "Google" register: storage - name: Mount file systems - mount: + ansible.posix.mount: src: "{{ item.server_ip }}:/{{ item.remote_mount }}" path: "{{ item.local_mount }}" opts: "{{ item.mount_options }}" diff --git a/modules/file-system/filestore/versions.tf b/modules/file-system/filestore/versions.tf index 00469f5359..a4fe24b1fc 100644 --- a/modules/file-system/filestore/versions.tf +++ b/modules/file-system/filestore/versions.tf @@ -26,10 +26,10 @@ terraform { } } provider_meta "google" { - module_name = "blueprints/terraform/hpc-toolkit:filestore/v1.5.0" + module_name = "blueprints/terraform/hpc-toolkit:filestore/v1.6.0" } provider_meta "google-beta" { - module_name = "blueprints/terraform/hpc-toolkit:filestore/v1.5.0" + module_name = "blueprints/terraform/hpc-toolkit:filestore/v1.6.0" } required_version = ">= 0.14.0" diff --git a/modules/file-system/pre-existing-network-storage/README.md b/modules/file-system/pre-existing-network-storage/README.md index 338af4caad..c68ba5590d 100644 --- a/modules/file-system/pre-existing-network-storage/README.md +++ b/modules/file-system/pre-existing-network-storage/README.md @@ -13,7 +13,6 @@ Toolkit supported file-system such as 
[filestore](../filestore/README.md). ```yaml - id: homefs source: modules/file-system/pre-existing-network-storage - kind: terraform settings: server_ip: ## Set server IP here ## remote_mount: nfsshare @@ -60,5 +59,7 @@ No resources. | Name | Description | |------|-------------| +| [client\_install\_runner](#output\_client\_install\_runner) | Runner that performs client installation needed to use file system. | +| [mount\_runner](#output\_mount\_runner) | Runner that mounts the file system. | | [network\_storage](#output\_network\_storage) | Describes a remote network storage to be mounted by fs-tab. | diff --git a/modules/file-system/pre-existing-network-storage/outputs.tf b/modules/file-system/pre-existing-network-storage/outputs.tf index 32b00ba317..c38ce6b892 100644 --- a/modules/file-system/pre-existing-network-storage/outputs.tf +++ b/modules/file-system/pre-existing-network-storage/outputs.tf @@ -24,3 +24,55 @@ output "network_storage" { mount_options = var.mount_options } } + +locals { + # Client Install + ddn_lustre_client_install_script = templatefile( + "${path.module}/templates/ddn_exascaler_luster_client_install.tftpl", + { + server_ip = split("@", var.server_ip)[0] + remote_mount = var.remote_mount + local_mount = var.local_mount + } + ) + + install_scripts = { + "lustre" = local.ddn_lustre_client_install_script + } + + # Mounting + ddn_lustre_mount_cmd = "mount -t ${var.fs_type} ${var.server_ip}:/${var.remote_mount} ${var.local_mount}" + mount_commands = { + "lustre" = local.ddn_lustre_mount_cmd + } + + mount_script = <<-EOT + #!/bin/bash + findmnt --source ${var.server_ip}:/${var.remote_mount} --target ${var.local_mount} &> /dev/null + if [[ $? != 0 ]]; then + echo "Mounting --source ${var.server_ip}:/${var.remote_mount} --target ${var.local_mount}" + mkdir -p ${var.local_mount} + ${lookup(local.mount_commands, var.fs_type, "exit 1")} + else + echo "Skipping mounting source: ${var.server_ip}:/${var.remote_mount}, already mounted to target:${var.local_mount}" + fi + EOT +} + +output "client_install_runner" { + description = "Runner that performs client installation needed to use file system." + value = { + "type" = "shell" + "content" = lookup(local.install_scripts, var.fs_type, "echo 'skipping: client_install_runner not yet supported for ${var.fs_type}'") + "destination" = "install_filesystem_client${replace(var.local_mount, "/", "_")}.sh" + } +} + +output "mount_runner" { + description = "Runner that mounts the file system." + value = { + "type" = "shell" + "content" = (lookup(local.mount_commands, var.fs_type, null) == null ? "echo 'skipping: mount_runner not yet supported for ${var.fs_type}'" : local.mount_script) + "destination" = "mount_filesystem${replace(var.local_mount, "/", "_")}.sh" + } +} diff --git a/modules/file-system/pre-existing-network-storage/templates/ddn_exascaler_luster_client_install.tftpl b/modules/file-system/pre-existing-network-storage/templates/ddn_exascaler_luster_client_install.tftpl new file mode 100644 index 0000000000..649abc4c4a --- /dev/null +++ b/modules/file-system/pre-existing-network-storage/templates/ddn_exascaler_luster_client_install.tftpl @@ -0,0 +1,48 @@ +#!/bin/sh + +# Copyright 2022 DataDirect Networks +# Modifications Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
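The `client_install_runner` and `mount_runner` outputs defined just above are shaped like startup-script runners, so they can be passed straight into a startup-script module. A hedged sketch of that wiring; the module ids and the EXAScaler server address are placeholders:

```yaml
# Hypothetical wiring: install the Lustre client and mount an existing
# DDN EXAScaler file system on any VM that uses this startup script.
- id: scratchfs
  source: modules/file-system/pre-existing-network-storage
  settings:
    server_ip: 10.0.0.2@tcp   # placeholder MGS address; the installer uses the part before '@'
    remote_mount: exacloud
    local_mount: /scratch
    fs_type: lustre

- id: mount-scratch
  source: modules/scripts/startup-script
  settings:
    runners:
    - $(scratchfs.client_install_runner)
    - $(scratchfs.mount_runner)
```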
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Prior Art: https://github.com/DDNStorage/exascaler-cloud-terraform/blob/78deadbb2c1fa7e4603cf9605b0f7d1782117954/gcp/templates/client-script.tftpl + +# install new EXAScaler Cloud clients: +# all instances must be in the same zone +# and connected to the same network and subnet +# to set up EXAScaler Cloud filesystem on a new client instance, +# run the folowing commands on the client with root privileges: + +if [[ ! -z $(cat /proc/filesystems | grep lustre) ]]; then + echo "Skipping lustre client install as it is already supported" + exit 0 +fi + +cat >/etc/esc-client.conf<.c..internal - on_prem_network_storage_ip: storage-ip-placeholder ## internal ip address for nfs to be mounted - -deployment_groups: -- group: primary - modules: - - source: modules/network/pre-existing-vpc - kind: terraform - id: network1 - settings: - network_name: cloud-vpc-network - subnetwork_name: primary-subnet - - - source: modules/file-system/pre-existing-network-storage - kind: terraform - id: pre-existing-storage - outputs: - - network_storage - settings: - server_ip: $(vars.on_prem_network_storage_ip) - remote_mount: /exports/home - local_mount: /home - fs_type: nfs - - - source: ./community/modules/compute/schedmd-slurm-gcp-v5-partition - kind: terraform - id: compute-partition - use: [network1] - settings: - partition_name: cloud - node_count_dynamic_max: 10 - exclusive: false - machine_type: n2-standard-2 - partition_conf: - Default: NO - - - source: ./community/modules/scheduler/schedmd-slurm-gcp-v5-hybrid - kind: terraform - id: slurm-controller - use: [compute-partition, pre-existing-storage] - settings: - output_dir: ./hybrid - slurm_bin_dir: /usr/local/bin - slurm_control_host: $(vars.on_prem_controller_host_name) - install_dir: /etc/slurm/hybrid diff --git a/tools/validate_configs/test_configs/hpc-cluster-project.yaml b/tools/validate_configs/test_configs/hpc-cluster-project.yaml index bf420d690e..2feb7ddff5 100644 --- a/tools/validate_configs/test_configs/hpc-cluster-project.yaml +++ b/tools/validate_configs/test_configs/hpc-cluster-project.yaml @@ -34,7 +34,6 @@ deployment_groups: modules: - id: project source: ./community/modules/project/new-project - kind: terraform settings: project_id: $(vars.project_id) folder_id: 334688113020 # random number @@ -43,7 +42,6 @@ deployment_groups: - id: enable-apis source: ./community/modules/project/service-enablement - kind: terraform use: [project] settings: gcp_service_list: @@ -57,18 +55,15 @@ deployment_groups: # Example - ./modules/network/vpc - id: network1 source: modules/network/vpc - kind: terraform - id: homefs source: modules/file-system/filestore - kind: terraform use: [network1] settings: local_mount: /home - id: compute_partition source: ./community/modules/compute/SchedMD-slurm-on-gcp-partition - kind: terraform use: - network1 - homefs @@ -80,7 +75,6 @@ deployment_groups: - id: slurm_controller source: ./community/modules/scheduler/SchedMD-slurm-on-gcp-controller - kind: terraform use: - network1 - homefs @@ -90,7 +84,6 @@ deployment_groups: - id: slurm_login source: ./community/modules/scheduler/SchedMD-slurm-on-gcp-login-node - 
kind: terraform use: - network1 - homefs diff --git a/tools/validate_configs/test_configs/hpc-cluster-service-acct.yaml b/tools/validate_configs/test_configs/hpc-cluster-service-acct.yaml index 4e1068c02b..da8ab09d1b 100644 --- a/tools/validate_configs/test_configs/hpc-cluster-service-acct.yaml +++ b/tools/validate_configs/test_configs/hpc-cluster-service-acct.yaml @@ -27,11 +27,9 @@ deployment_groups: modules: - id: network1 source: modules/network/vpc - kind: terraform - id: homefs source: modules/file-system/pre-existing-network-storage - kind: terraform settings: server_ip: '$controller' remote_mount: /home @@ -40,7 +38,6 @@ deployment_groups: - id: service_acct source: ./community/modules/project/service-account - kind: terraform settings: project_id: $(vars.project_id) names: @@ -51,7 +48,6 @@ deployment_groups: - id: compute-partition source: ./community/modules/compute/SchedMD-slurm-on-gcp-partition - kind: terraform use: [network1] settings: partition_name: compute @@ -60,7 +56,6 @@ deployment_groups: - id: slurm source: ./community/modules/scheduler/SchedMD-slurm-on-gcp-controller - kind: terraform use: [network1] settings: network_storage: diff --git a/tools/validate_configs/test_configs/hpc-cluster-simple-nfs-sql.yaml b/tools/validate_configs/test_configs/hpc-cluster-simple-nfs-sql.yaml index 1e9470e3ec..9c3015c9a8 100644 --- a/tools/validate_configs/test_configs/hpc-cluster-simple-nfs-sql.yaml +++ b/tools/validate_configs/test_configs/hpc-cluster-simple-nfs-sql.yaml @@ -27,11 +27,9 @@ deployment_groups: modules: - id: network1 source: modules/network/vpc - kind: terraform - id: homefs source: ./community/modules/file-system/nfs-server - kind: terraform use: [network1] settings: labels: @@ -39,7 +37,6 @@ deployment_groups: - id: slurm-sql source: ./community/modules/database/slurm-cloudsql-federation - kind: terraform use: [network1] settings: sql_instance_name: slurm-sql8 @@ -47,7 +44,6 @@ deployment_groups: - id: compute-partition source: ./community/modules/compute/SchedMD-slurm-on-gcp-partition - kind: terraform use: - homefs - network1 @@ -58,7 +54,6 @@ deployment_groups: - id: slurm-controller source: ./community/modules/scheduler/SchedMD-slurm-on-gcp-controller - kind: terraform use: - homefs - compute-partition @@ -71,7 +66,6 @@ deployment_groups: - id: slurm-login source: ./community/modules/scheduler/SchedMD-slurm-on-gcp-login-node - kind: terraform use: - slurm-controller - network1 diff --git a/tools/validate_configs/test_configs/hpc-cluster-simple.yaml b/tools/validate_configs/test_configs/hpc-cluster-simple.yaml index bcf2b053b7..e55ac954de 100644 --- a/tools/validate_configs/test_configs/hpc-cluster-simple.yaml +++ b/tools/validate_configs/test_configs/hpc-cluster-simple.yaml @@ -27,11 +27,9 @@ deployment_groups: modules: - id: network1 source: modules/network/pre-existing-vpc - kind: terraform - id: homefs source: modules/file-system/filestore - kind: terraform use: [network1] settings: local_mount: /home @@ -40,7 +38,6 @@ deployment_groups: - id: startup source: modules/scripts/startup-script - kind: terraform settings: runners: - type: shell @@ -55,7 +52,6 @@ deployment_groups: - id: workstation source: modules/compute/vm-instance - kind: terraform use: - network1 - homefs diff --git a/tools/validate_configs/test_configs/hpc-cluster-slurm-with-startup.yaml b/tools/validate_configs/test_configs/hpc-cluster-slurm-with-startup.yaml index ae022dbb7a..c77bdd1672 100644 --- a/tools/validate_configs/test_configs/hpc-cluster-slurm-with-startup.yaml +++ 
b/tools/validate_configs/test_configs/hpc-cluster-slurm-with-startup.yaml @@ -30,18 +30,15 @@ deployment_groups: # Example - ./modules/network/vpc - id: network1 source: modules/network/vpc - kind: terraform - id: homefs source: modules/file-system/filestore - kind: terraform use: [network1] settings: local_mount: /home - id: startup source: modules/scripts/startup-script - kind: terraform settings: runners: - type: shell @@ -50,7 +47,6 @@ deployment_groups: - id: compute_partition source: ./community/modules/compute/SchedMD-slurm-on-gcp-partition - kind: terraform use: - network1 - homefs @@ -63,7 +59,6 @@ deployment_groups: - id: slurm_controller source: ./community/modules/scheduler/SchedMD-slurm-on-gcp-controller - kind: terraform use: - network1 - homefs @@ -75,7 +70,6 @@ deployment_groups: - id: slurm_login source: ./community/modules/scheduler/SchedMD-slurm-on-gcp-login-node - kind: terraform use: - network1 - homefs diff --git a/tools/validate_configs/test_configs/hpc-cluster-small-slurm-v5.yaml b/tools/validate_configs/test_configs/hpc-cluster-small-slurm-v5.yaml index aba6e5b910..e1170101eb 100644 --- a/tools/validate_configs/test_configs/hpc-cluster-small-slurm-v5.yaml +++ b/tools/validate_configs/test_configs/hpc-cluster-small-slurm-v5.yaml @@ -31,18 +31,15 @@ deployment_groups: # Example - ./resources/network/vpc - id: network1 source: modules/network/vpc - kind: terraform - id: homefs source: modules/file-system/filestore - kind: terraform use: [network1] settings: local_mount: /home - id: debug_partition source: community/modules/compute/schedmd-slurm-gcp-v5-partition - kind: terraform use: - network1 - homefs @@ -55,7 +52,6 @@ deployment_groups: - id: compute_partition source: community/modules/compute/schedmd-slurm-gcp-v5-partition - kind: terraform use: - network1 - homefs @@ -65,7 +61,6 @@ deployment_groups: - id: slurm_controller source: community/modules/scheduler/schedmd-slurm-gcp-v5-controller - kind: terraform use: - network1 - debug_partition @@ -74,7 +69,6 @@ deployment_groups: - id: slurm_login source: community/modules/scheduler/schedmd-slurm-gcp-v5-login - kind: terraform use: - network1 - slurm_controller diff --git a/tools/validate_configs/test_configs/htcondor-pool.yaml b/tools/validate_configs/test_configs/htcondor-pool.yaml index f61e2c56de..24a7e5715b 100644 --- a/tools/validate_configs/test_configs/htcondor-pool.yaml +++ b/tools/validate_configs/test_configs/htcondor-pool.yaml @@ -29,23 +29,19 @@ deployment_groups: modules: - id: network1 source: modules/network/vpc - kind: terraform outputs: - network_name - id: htcondor_install source: community/modules/scripts/htcondor-install - kind: terraform - id: htcondor_services source: community/modules/project/service-enablement - kind: terraform use: - htcondor_install - id: htcondor_install_scripts source: modules/scripts/startup-script - kind: terraform settings: runners: - type: shell @@ -67,29 +63,24 @@ deployment_groups: modules: - id: cluster_network source: modules/network/pre-existing-vpc - kind: terraform - id: htcondor_configure source: community/modules/scheduler/htcondor-configure - kind: terraform - id: htcondor_configure_central_manager source: modules/scripts/startup-script - kind: terraform settings: runners: - $(htcondor_configure.central_manager_runner) - id: htcondor_configure_access_point source: modules/scripts/startup-script - kind: terraform settings: runners: - $(htcondor_configure.access_point_runner) - id: htcondor_cm source: modules/compute/vm-instance - kind: terraform use: - 
cluster_network - htcondor_configure_central_manager @@ -109,7 +100,6 @@ deployment_groups: - id: htcondor_access source: modules/compute/vm-instance - kind: terraform use: - cluster_network - htcondor_configure_access_point diff --git a/tools/validate_configs/test_configs/instance-with-startup.yaml b/tools/validate_configs/test_configs/instance-with-startup.yaml index 3b13ca6e4a..b2c8d7732a 100644 --- a/tools/validate_configs/test_configs/instance-with-startup.yaml +++ b/tools/validate_configs/test_configs/instance-with-startup.yaml @@ -27,18 +27,15 @@ deployment_groups: modules: - id: network1 source: modules/network/pre-existing-vpc - kind: terraform - id: homefs source: modules/file-system/filestore - kind: terraform use: [network1] settings: local_mount: /home - id: startup source: modules/scripts/startup-script - kind: terraform settings: runners: - type: shell @@ -53,7 +50,6 @@ deployment_groups: - id: workstation source: modules/compute/vm-instance - kind: terraform use: - network1 - homefs @@ -64,6 +60,5 @@ deployment_groups: - id: wait source: ./community/modules/scripts/wait-for-startup - kind: terraform settings: instance_name: ((module.workstation.name[0])) diff --git a/tools/validate_configs/test_configs/label_test.yaml b/tools/validate_configs/test_configs/label_test.yaml index b9777b2bbc..f64f9739c6 100644 --- a/tools/validate_configs/test_configs/label_test.yaml +++ b/tools/validate_configs/test_configs/label_test.yaml @@ -30,11 +30,9 @@ deployment_groups: modules: - id: network source: modules/network/vpc - kind: terraform - id: homefs source: modules/file-system/filestore - kind: terraform use: [network] settings: name: homefs @@ -46,7 +44,6 @@ deployment_groups: - id: homefs1 source: modules/file-system/filestore - kind: terraform use: [network] settings: name: homefs diff --git a/tools/validate_configs/test_configs/new_project.yaml b/tools/validate_configs/test_configs/new_project.yaml index 06563e5b8a..c62b9d3984 100644 --- a/tools/validate_configs/test_configs/new_project.yaml +++ b/tools/validate_configs/test_configs/new_project.yaml @@ -24,7 +24,6 @@ deployment_groups: modules: - id: project source: ./community/modules/project/new-project - kind: terraform settings: project_id: test_project folder_id: 334688113020 # random number diff --git a/tools/validate_configs/test_configs/overwrite_labels.yaml b/tools/validate_configs/test_configs/overwrite_labels.yaml index 3d4b724bc4..f885a60bf4 100644 --- a/tools/validate_configs/test_configs/overwrite_labels.yaml +++ b/tools/validate_configs/test_configs/overwrite_labels.yaml @@ -31,11 +31,9 @@ deployment_groups: modules: - id: network source: modules/network/vpc - kind: terraform - id: homefs source: modules/file-system/filestore - kind: terraform use: [network] settings: name: homefs @@ -46,7 +44,6 @@ deployment_groups: - id: homefs1 source: modules/file-system/filestore - kind: terraform use: [network] settings: name: homefs @@ -57,7 +54,6 @@ deployment_groups: - id: homefs2 source: modules/file-system/filestore - kind: terraform use: [network] settings: name: homefs @@ -68,7 +64,6 @@ deployment_groups: - id: homefs3 source: modules/file-system/filestore - kind: terraform use: [network] settings: name: homefs diff --git a/tools/validate_configs/test_configs/pre-existing-fs.yaml b/tools/validate_configs/test_configs/pre-existing-fs.yaml index 6a89e74840..bc3290fcc1 100644 --- a/tools/validate_configs/test_configs/pre-existing-fs.yaml +++ b/tools/validate_configs/test_configs/pre-existing-fs.yaml @@ -31,17 +31,14 @@ 
deployment_groups: # network-name from deployment vars - id: homefs-filestore source: modules/file-system/filestore - kind: terraform - group: compute modules: - id: network1 source: modules/network/pre-existing-vpc - kind: terraform - id: homefs source: modules/file-system/pre-existing-network-storage - kind: terraform settings: server_ip: "" # for now, must be completed manually in compute/main.tf remote_mount: nfsshare @@ -50,7 +47,6 @@ deployment_groups: - id: compute-partition source: ./community/modules/compute/SchedMD-slurm-on-gcp-partition - kind: terraform use: - homefs - network1 @@ -59,7 +55,6 @@ deployment_groups: - id: slurm source: ./community/modules/scheduler/SchedMD-slurm-on-gcp-controller - kind: terraform use: - homefs - compute-partition diff --git a/tools/validate_configs/test_configs/rocky-ss.yaml b/tools/validate_configs/test_configs/rocky-ss.yaml index b06679749a..5b644dc3e6 100644 --- a/tools/validate_configs/test_configs/rocky-ss.yaml +++ b/tools/validate_configs/test_configs/rocky-ss.yaml @@ -27,11 +27,9 @@ deployment_groups: modules: - id: network1 source: ./modules/network/pre-existing-vpc - kind: terraform - id: appsfs source: modules/file-system/filestore - kind: terraform use: [network1] settings: name: appsfs @@ -39,7 +37,6 @@ deployment_groups: - id: nfs source: community/modules/file-system/nfs-server - kind: terraform use: [network1] settings: image: rocky-linux-cloud/rocky-linux-8 @@ -47,7 +44,6 @@ deployment_groups: - id: spack source: ./community//modules/scripts/spack-install - kind: terraform settings: install_dir: /apps/spack spack_url: https://github.com/spack/spack @@ -60,7 +56,6 @@ deployment_groups: - id: startup source: ./modules/scripts/startup-script - kind: terraform settings: runners: - type: shell @@ -94,7 +89,6 @@ deployment_groups: - id: instance source: ./modules/compute/vm-instance - kind: terraform use: [network1, startup, nfs, appsfs] settings: machine_type: e2-standard-4 diff --git a/tools/validate_configs/test_configs/simple-startup.yaml b/tools/validate_configs/test_configs/simple-startup.yaml index 3940714717..97b48176bb 100644 --- a/tools/validate_configs/test_configs/simple-startup.yaml +++ b/tools/validate_configs/test_configs/simple-startup.yaml @@ -27,11 +27,9 @@ deployment_groups: modules: - id: network1 source: ./modules/network/pre-existing-vpc - kind: terraform - id: startup source: ./modules/scripts/startup-script - kind: terraform settings: runners: - type: shell @@ -50,13 +48,11 @@ deployment_groups: - id: instance source: ./modules/compute/vm-instance - kind: terraform use: [network1, startup] settings: machine_type: e2-standard-4 - id: waiter source: ./community/modules/scripts/wait-for-startup - kind: terraform settings: instance_name: ((module.instance.name[0])) diff --git a/tools/validate_configs/test_configs/slurm-two-partitions-workstation.yaml b/tools/validate_configs/test_configs/slurm-two-partitions-workstation.yaml index 4f36bb0dde..2aa484bef8 100644 --- a/tools/validate_configs/test_configs/slurm-two-partitions-workstation.yaml +++ b/tools/validate_configs/test_configs/slurm-two-partitions-workstation.yaml @@ -27,18 +27,15 @@ deployment_groups: modules: - id: network1 source: modules/network/vpc - kind: terraform - id: homefs source: modules/file-system/filestore - kind: terraform use: [network1] settings: local_mount: /home - id: startup source: modules/scripts/startup-script - kind: terraform settings: runners: - type: shell @@ -53,7 +50,6 @@ deployment_groups: - id: workstation source: 
modules/compute/vm-instance - kind: terraform use: - network1 - homefs @@ -65,7 +61,6 @@ deployment_groups: - id: compute-partition source: community/modules/compute/SchedMD-slurm-on-gcp-partition - kind: terraform use: - homefs - network1 @@ -74,7 +69,6 @@ deployment_groups: - id: debug-partition source: ./community/modules/compute/SchedMD-slurm-on-gcp-partition - kind: terraform use: - homefs - network1 @@ -83,7 +77,6 @@ deployment_groups: - id: slurm source: ./community/modules/scheduler/SchedMD-slurm-on-gcp-controller - kind: terraform use: - homefs - compute-partition diff --git a/tools/validate_configs/test_configs/spack-buildcache.yaml b/tools/validate_configs/test_configs/spack-buildcache.yaml index b8322d2c94..572194a0d9 100644 --- a/tools/validate_configs/test_configs/spack-buildcache.yaml +++ b/tools/validate_configs/test_configs/spack-buildcache.yaml @@ -27,11 +27,9 @@ deployment_groups: modules: - id: network1 source: modules/network/pre-existing-vpc - kind: terraform - id: spack source: ./community/modules/scripts/spack-install - kind: terraform settings: install_dir: /apps/spack spack_url: https://github.com/spack/spack @@ -55,7 +53,6 @@ deployment_groups: - id: spack-startup source: modules/scripts/startup-script - kind: terraform settings: runners: - type: data @@ -82,7 +79,6 @@ deployment_groups: - id: spack-build source: modules/compute/vm-instance - kind: terraform use: - network1 - spack-startup diff --git a/tools/validate_configs/test_configs/spack-environments.yaml b/tools/validate_configs/test_configs/spack-environments.yaml index ffe5ece48b..2fd2f4ec41 100644 --- a/tools/validate_configs/test_configs/spack-environments.yaml +++ b/tools/validate_configs/test_configs/spack-environments.yaml @@ -27,11 +27,9 @@ deployment_groups: modules: - id: network1 source: modules/network/pre-existing-vpc - kind: terraform - id: spack source: ./community/modules/scripts/spack-install - kind: terraform settings: install_dir: /apps/spack spack_url: https://github.com/spack/spack @@ -83,7 +81,6 @@ deployment_groups: - id: spack-startup source: modules/scripts/startup-script - kind: terraform settings: runners: - type: data @@ -110,7 +107,6 @@ deployment_groups: - id: spack-build source: modules/compute/vm-instance - kind: terraform use: - network1 - spack-startup diff --git a/tools/validate_configs/test_configs/startup-options.yaml b/tools/validate_configs/test_configs/startup-options.yaml index cbfe2764e3..94a22d5ae1 100644 --- a/tools/validate_configs/test_configs/startup-options.yaml +++ b/tools/validate_configs/test_configs/startup-options.yaml @@ -27,11 +27,9 @@ deployment_groups: modules: - id: network1 source: ./modules/network/pre-existing-vpc - kind: terraform - id: startup source: ./modules/scripts/startup-script - kind: terraform settings: runners: - type: shell @@ -50,7 +48,6 @@ deployment_groups: - id: instance-explicit-startup source: ./modules/compute/vm-instance - kind: terraform use: [network1] settings: name_prefix: explicit @@ -59,7 +56,6 @@ deployment_groups: - id: instance-no-startup source: ./modules/compute/vm-instance - kind: terraform use: [network1] settings: name_prefix: no-startup @@ -67,7 +63,6 @@ deployment_groups: - id: instance-use-startup source: ./modules/compute/vm-instance - kind: terraform use: [network1, startup] settings: name_prefix: use-startup @@ -76,7 +71,6 @@ deployment_groups: - id: instance-metadata-startup source: ./modules/compute/vm-instance - kind: terraform use: [network1] settings: name_prefix: metadata-startup diff --git 
a/tools/validate_configs/test_configs/test_outputs.yaml b/tools/validate_configs/test_configs/test_outputs.yaml index 103f038974..cf77ddc4b0 100644 --- a/tools/validate_configs/test_configs/test_outputs.yaml +++ b/tools/validate_configs/test_configs/test_outputs.yaml @@ -27,13 +27,11 @@ deployment_groups: modules: - id: instance source: modules/compute/vm-instance - kind: terraform outputs: - name - id: sql source: community/modules/database/slurm-cloudsql-federation - kind: terraform outputs: - cloudsql settings: @@ -44,7 +42,6 @@ deployment_groups: - id: filestore source: modules/file-system/filestore - kind: terraform use: [vpc] outputs: - network_storage @@ -52,14 +49,12 @@ deployment_groups: - id: nfs source: ./community/modules/file-system/nfs-server - kind: terraform outputs: - network_storage - install_nfs_client - id: pre-existing-storage source: modules/file-system/pre-existing-network-storage - kind: terraform outputs: - network_storage settings: @@ -70,7 +65,6 @@ deployment_groups: - id: pre-existing-vpc source: modules/network/pre-existing-vpc - kind: terraform outputs: - network_name - network_self_link @@ -81,7 +75,6 @@ deployment_groups: - id: vpc source: modules/network/vpc - kind: terraform outputs: - network_name - network_self_link @@ -93,7 +86,6 @@ deployment_groups: - id: new-project source: community/modules/project/new-project - kind: terraform outputs: - project_name - project_id @@ -119,7 +111,6 @@ deployment_groups: - id: sa source: community/modules/project/service-account - kind: terraform outputs: - email - emails @@ -140,20 +131,17 @@ deployment_groups: - id: spack source: community/modules/scripts/spack-install - kind: terraform outputs: - startup_script - controller_startup_script - id: startup source: modules/scripts/startup-script - kind: terraform outputs: - startup_script - id: partition source: ./community/modules/compute/SchedMD-slurm-on-gcp-partition - kind: terraform use: [vpc] outputs: - partition @@ -162,7 +150,6 @@ deployment_groups: - id: lustre source: ./community/modules/file-system/DDN-EXAScaler - kind: terraform outputs: - private_addresses - ssh_console @@ -172,7 +159,6 @@ deployment_groups: - id: controller source: ./community/modules/scheduler/SchedMD-slurm-on-gcp-controller - kind: terraform use: - partition - vpc diff --git a/tools/validate_configs/test_configs/ubuntu-ss.yaml b/tools/validate_configs/test_configs/ubuntu-ss.yaml index b2cf676059..335a9b9b45 100644 --- a/tools/validate_configs/test_configs/ubuntu-ss.yaml +++ b/tools/validate_configs/test_configs/ubuntu-ss.yaml @@ -27,11 +27,9 @@ deployment_groups: modules: - id: network1 source: ./modules/network/pre-existing-vpc - kind: terraform - id: appsfs source: modules/file-system/filestore - kind: terraform use: [network1] settings: name: appsfs @@ -39,14 +37,12 @@ deployment_groups: - id: nfs source: community/modules/file-system/nfs-server - kind: terraform use: [network1] settings: auto_delete_disk: true - id: spack source: ./community//modules/scripts/spack-install - kind: terraform settings: install_dir: /apps/spack spack_url: https://github.com/spack/spack @@ -64,7 +60,6 @@ deployment_groups: - id: startup source: ./modules/scripts/startup-script - kind: terraform settings: runners: - type: shell @@ -98,7 +93,6 @@ deployment_groups: - id: instance source: ./modules/compute/vm-instance - kind: terraform use: [network1, startup, nfs, appsfs] settings: machine_type: e2-standard-4 diff --git a/tools/validate_configs/test_configs/use-resources.yaml 
b/tools/validate_configs/test_configs/use-resources.yaml index 5ef30961e9..bf67645d9e 100644 --- a/tools/validate_configs/test_configs/use-resources.yaml +++ b/tools/validate_configs/test_configs/use-resources.yaml @@ -30,11 +30,9 @@ deployment_groups: # Example - ./modules/network/pre-existing-vpc - id: network1 source: modules/network/pre-existing-vpc - kind: terraform - id: homefs source: modules/file-system/filestore - kind: terraform use: [network1] settings: local_mount: /home @@ -43,12 +41,10 @@ deployment_groups: - id: projectsfs source: community/modules/file-system/nfs-server - kind: terraform use: [network1] - id: scratchfs source: community/modules/file-system/DDN-EXAScaler - kind: terraform settings: local_mount: /scratch network_self_link: $(network1.network_self_link) @@ -57,7 +53,6 @@ deployment_groups: - id: compute_partition source: community/modules/compute/SchedMD-slurm-on-gcp-partition - kind: terraform use: - homefs - scratchfs @@ -68,7 +63,6 @@ deployment_groups: - id: slurm_controller source: ./community/modules/scheduler/SchedMD-slurm-on-gcp-controller - kind: terraform use: - projectsfs - compute_partition @@ -76,7 +70,6 @@ deployment_groups: - id: slurm_login source: ./community/modules/scheduler/SchedMD-slurm-on-gcp-login-node - kind: terraform use: - homefs - scratchfs diff --git a/tools/validate_configs/test_configs/vm-instance-local-ssd.yaml b/tools/validate_configs/test_configs/vm-instance-local-ssd.yaml index 3985da6323..7adcc33496 100644 --- a/tools/validate_configs/test_configs/vm-instance-local-ssd.yaml +++ b/tools/validate_configs/test_configs/vm-instance-local-ssd.yaml @@ -27,11 +27,9 @@ deployment_groups: modules: - id: network1 source: ./modules/network/pre-existing-vpc - kind: terraform - id: multi-instance-multi-ssd source: ./modules/compute/vm-instance - kind: terraform use: [network1] settings: machine_type: n2-standard-16 @@ -40,7 +38,6 @@ deployment_groups: - id: instance-ssd-interface-defined source: ./modules/compute/vm-instance - kind: terraform use: [network1] settings: machine_type: n2-standard-16
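Across the test configs above, the only change is dropping `kind: terraform` from module entries, which reads as `kind` now defaulting to Terraform when omitted (consistent with the "Kind (May be Required)" section of modules/README.md earlier in this diff). A hedged sketch of the resulting convention; the module paths are examples only:

```yaml
# A Terraform module entry no longer needs a kind field at all:
- id: network1
  source: modules/network/vpc        # kind: terraform is implied

# Non-default kinds would still be spelled out explicitly, e.g. a Packer module:
- id: custom-image
  source: modules/packer/custom-image
  kind: packer
```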