diff --git a/.github/workflows/terraform.yml b/.github/workflows/terraform.yml index 24e8a75a..3638fc20 100644 --- a/.github/workflows/terraform.yml +++ b/.github/workflows/terraform.yml @@ -16,6 +16,7 @@ concurrency: env: TF_VAR_parent_id: project-e00pjzzrtk1fs3yavy + TF_VAR_tenant_id: tenant-e00f3wdfzwfjgbcyfv jobs: terraform: @@ -29,8 +30,9 @@ jobs: solution: - name: k8s-inference - name: k8s-training - - name: slurm - name: wireguard + - name: dsvm + - name: bastion defaults: run: @@ -72,7 +74,7 @@ jobs: - name: Install Nebius CLI run: | - curl -sSL https://storage.ai.nebius.cloud/nebius/install.sh | bash + curl -sSL https://storage.eu-north1.nebius.cloud/cli/install.sh | bash echo "${HOME}/.nebius/bin" >> $GITHUB_PATH - name: Nebius CLI init @@ -170,7 +172,7 @@ jobs: - name: Install Nebius CLI run: | - curl -sSL https://storage.ai.nebius.cloud/nebius/install.sh | bash + curl -sSL https://storage.eu-north1.nebius.cloud/cli/install.sh | bash echo "${HOME}/.nebius/bin" >> $GITHUB_PATH - name: Nebius CLI init diff --git a/CODEOWNERS b/CODEOWNERS index 2aea26bf..5eacdbe7 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -2,4 +2,4 @@ * @malibora @jadnov @elijah-k-nebius @rdjjke @asteny .github/workflows @malibora @d3vil-st @elijah-k-nebius -soperator @dstaroff @asteny @rdjjke @Uburro +soperator/ @dstaroff @asteny @rdjjke @Uburro diff --git a/README.md b/README.md index ce7d8481..dea28f77 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@ This repository is a curated collection of Terraform and Helm solutions designed For those who prefer containerized environments, our Kubernetes solution includes GPU-Operator and Network-Operator. This setup ensures that your training workloads use dedicated GPU resources and optimized network configurations, both of which are critical components for AI models that require a lot of computational power. . GPU-Operator simplifies the management of NVIDIA GPUs, automating the deployment of necessary drivers and plugins. Similarly, the Network-Operator improves network performance, ensuring seamless communication throughout your cluster. The cluster uses InfiniBand technology, which provides the fastest host connections for data-intensive tasks. -[SLURM](./slurm/README.md) +[SLURM](./soperator/README.md) Our SLURM solutions offer a streamlined approach for users who prefer traditional HPC environments. These solutions include ready-to-use images pre-configured with NVIDIA drivers and are ideal for those looking to take advantage of SLURM’s robust job scheduling capabilities. Similar to our Kubernetes offerings, the SLURM solutions are optimized for InfiniBand connectivity, ensuring peak performance and efficiency in data transfer and communication between nodes. diff --git a/bastion/README.md b/bastion/README.md new file mode 100644 index 00000000..bc1617d0 --- /dev/null +++ b/bastion/README.md @@ -0,0 +1,140 @@ +# Bastion instance + +This Terraform solution deploys a Bastion instance that serves as a secure jump host for your infrastructure. +It improves security by minimizing the use of Public IPs and limiting access to the rest of the environment. + +It also creates a Service Account with a generated Authorization key pair to authenticate the Nebius CLI on the host.
+ +The following are also installed on the host: +- Wireguard VPN solution with a UI +- Nebius CLI, configured with a profile authenticated by the Service Account +- kubectl, configured to connect to the first mk8s cluster available in the project via the --internal flag + (discovered by: `nebius mk8s v1 cluster list`) + +## How to connect through the bastion + +### Edit your local SSH config + +`~/.ssh/config` + +``` +Host bastion + HostName <bastion_public_ip> + User bastion + IdentityFile ~/.ssh/private.key + +Host target + HostName <target_internal_ip> + User ubuntu + IdentityFile ~/.ssh/private.key + ProxyJump bastion +``` + +### Log in to a remote VM behind the bastion +``` +ssh target +``` + +## Prerequisites + +1. Install [Nebius CLI](https://docs.nebius.dev/en/cli/#installation): + ```bash + curl -sSL https://storage.eu-north1.nebius.cloud/cli/install.sh | bash + ``` + +2. Reload your shell session: + + ```bash + exec -l $SHELL + ``` + + or + + ```bash + source ~/.bashrc + ``` + +3. [Configure](https://docs.nebius.ai/cli/configure/) Nebius CLI (we recommend using [service account](https://docs.nebius.ai/iam/service-accounts/manage/)): + ```bash + nebius init + ``` + +4. Install jq (for Debian-based distributions): + ```bash + sudo apt install jq -y + ``` + +## Installation + +To deploy the solution, follow these steps: + +1. Load environment variables: + ```bash + source ./environment.sh + ``` +2. Initialize Terraform: + ```bash + terraform init + ``` +3. Replace the placeholder content in `terraform.tfvars` with the configuration values that you need. See the details [below](#configuration-variables). +4. Preview the deployment plan: + ```bash + terraform plan + ``` +5. Apply the configuration: + ```bash + terraform apply + ``` + Wait for the operation to complete. + +## Configuration variables + +Update the following variables in the `terraform.tfvars` file with your own values: + +- `tenant_id` +- `parent_id` +- `subnet_id` +- `ssh_user_name` +- `ssh_public_key` + +## Creating and using a public IP allocation + +This step allows you to retain the IP address even if the VM is deleted. If you don’t need to keep the IP address, skip this section. + +1. Create a public IP allocation: + ```bash + nebius vpc v1 allocation create --ipv-4-public \ + --parent-id <parent_id> --name wireguard_allocation_pub \ + --format json | jq -r '.metadata.id' + ``` +2. Assign the value from the previous step to the `public_ip_allocation_id` variable in [variables.tf](./variables.tf): + +```hcl +public_ip_allocation_id = "<allocation_id>" +``` + +### Logging into the Wireguard UI + +1. SSH into the Wireguard instance: + ```bash + ssh -i <path_to_private_key> <ssh_user_name>@<bastion_public_ip> + ``` + +2. Retrieve the Wireguard UI password: + ```bash + sudo cat /var/lib/wireguard-ui/initial_password + ``` + +3. Open the Wireguard UI in your browser: + ``` + http://<bastion_public_ip>:5000 + ``` + +4. Log in with the following credentials: + - **Username:** `admin` + - **Password:** [password retrieved in step 2] + +### Notes + +- **Apply Config:** After creating, deleting, or changing Wireguard users, select "Apply Config". +- **Allowed IPs:** When adding new users, specify the CIDRs of your existing infrastructure in the "Allowed IPs" field.
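
### Verifying the provisioned tooling

Once connected, you can check that cloud-init finished provisioning the tooling described above. A minimal smoke test, assuming the default `bastion` user and the `bastion` host alias from the SSH config example above (essentially the same checks that `test-resource.tf` runs in test mode):

```bash
# From your workstation, jump onto the bastion host:
ssh bastion

# On the bastion: the WireGuard interface should be up.
ip link show wg0

# The Nebius CLI profile created from the service account key should authenticate.
~/.nebius/bin/nebius iam whoami

# If an mk8s cluster was found in the project during provisioning, kubectl is
# already pointed at its internal endpoint; otherwise this reports a missing
# kubeconfig.
kubectl get nodes
```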
diff --git a/bastion/disks.tf b/bastion/disks.tf new file mode 100644 index 00000000..6b7457f2 --- /dev/null +++ b/bastion/disks.tf @@ -0,0 +1,8 @@ +resource "nebius_compute_v1_disk" "bastion-boot-disk" { + parent_id = var.parent_id + name = "bastion-boot-disk" + block_size_bytes = 4096 + size_bytes = 64424509440 + type = "NETWORK_SSD" + source_image_family = { image_family = "ubuntu22.04-driverless" } +} diff --git a/slurm/environment.sh b/bastion/environment.sh similarity index 67% rename from slurm/environment.sh rename to bastion/environment.sh index fbdc6a48..a5178dbd 100644 --- a/slurm/environment.sh +++ b/bastion/environment.sh @@ -1,3 +1,4 @@ #/bin/sh unset NEBIUS_IAM_TOKEN export NEBIUS_IAM_TOKEN=$(nebius iam get-access-token) +export TF_VAR_iam_token=$NEBIUS_IAM_TOKEN diff --git a/bastion/locals.tf b/bastion/locals.tf new file mode 100644 index 00000000..ed79b470 --- /dev/null +++ b/bastion/locals.tf @@ -0,0 +1,4 @@ +locals { + ssh_public_key = var.ssh_public_key.key != null ? var.ssh_public_key.key : ( + fileexists(var.ssh_public_key.path) ? file(var.ssh_public_key.path) : null) +} diff --git a/bastion/main.tf b/bastion/main.tf new file mode 100644 index 00000000..a9a4d3be --- /dev/null +++ b/bastion/main.tf @@ -0,0 +1,32 @@ +resource "nebius_compute_v1_instance" "bastion_instance" { + parent_id = var.parent_id + name = "bastion-instance" + + boot_disk = { + attach_mode = "READ_WRITE" + existing_disk = nebius_compute_v1_disk.bastion-boot-disk + } + + network_interfaces = [ + { + name = "eth0" + subnet_id = var.subnet_id + ip_address = {} + public_ip_address = {} + } + ] + + resources = { + platform = "cpu-e2" + preset = "4vcpu-16gb" + } + + cloud_init_user_data = templatefile("../modules/cloud-init/bastion-cloud-init.tftpl", { + ssh_user_name = var.ssh_user_name + ssh_public_key = local.ssh_public_key + sa_private_key = local.sa_private_key + parent_id = var.parent_id + sa_public_key_id = local.sa_public_key_id + service_account_id = local.service_account_id + }) +} \ No newline at end of file diff --git a/bastion/output.tf b/bastion/output.tf new file mode 100644 index 00000000..f031b4e9 --- /dev/null +++ b/bastion/output.tf @@ -0,0 +1,6 @@ +output "bastion_host_public_ip" { + value = trimsuffix(nebius_compute_v1_instance.bastion_instance.status.network_interfaces[0].public_ip_address.address, "/32") +} +output "bastion_service_account" { + value = nebius_iam_v1_service_account.bastion-sa.id +} \ No newline at end of file diff --git a/bastion/provider.tf b/bastion/provider.tf new file mode 100644 index 00000000..724fd8b7 --- /dev/null +++ b/bastion/provider.tf @@ -0,0 +1,11 @@ +terraform { + required_providers { + nebius = { + source = "terraform-provider.storage.eu-north1.nebius.cloud/nebius/nebius" + } + } +} + +provider "nebius" { + domain = "api.eu.nebius.cloud:443" +} diff --git a/bastion/sa.tf b/bastion/sa.tf new file mode 100644 index 00000000..4599ac5c --- /dev/null +++ b/bastion/sa.tf @@ -0,0 +1,37 @@ +resource "tls_private_key" "bastion_sa_key" { + algorithm = "RSA" + rsa_bits = 4096 +} + +resource "nebius_iam_v1_service_account" "bastion-sa" { + parent_id = var.parent_id + name = "bastion-sa" +} + +data "nebius_iam_v1_group" "admins-group" { + name = "editors" + parent_id = var.tenant_id +} + +resource "nebius_iam_v1_group_membership" "bastion-sa-admin" { + parent_id = data.nebius_iam_v1_group.admins-group.id + member_id = nebius_iam_v1_service_account.bastion-sa.id +} + +resource "nebius_iam_v1_auth_public_key" "bastion-sa-public-key" { + parent_id = var.parent_id + 
expires_at = timeadd(timestamp(), "8760h") # 1 Year expiration time + account = { + service_account = { + id = nebius_iam_v1_service_account.bastion-sa.id + } + } + data = tls_private_key.bastion_sa_key.public_key_pem +} + +locals { + sa_public_key = tls_private_key.bastion_sa_key.public_key_pem + sa_private_key = tls_private_key.bastion_sa_key.private_key_pem + sa_public_key_id = nebius_iam_v1_auth_public_key.bastion-sa-public-key.id + service_account_id = nebius_iam_v1_service_account.bastion-sa.id +} \ No newline at end of file diff --git a/bastion/terraform.tfvars b/bastion/terraform.tfvars new file mode 100644 index 00000000..3582e7a0 --- /dev/null +++ b/bastion/terraform.tfvars @@ -0,0 +1,8 @@ +# tenant_id = "" +# parent_id = "" +# subnet_id = "" +# ssh_user_name = "bastion" +# ssh_public_key = { +# key = "put your public ssh key here" +# path = "put path to ssh key here" +# } \ No newline at end of file diff --git a/bastion/test-resource.tf b/bastion/test-resource.tf new file mode 100644 index 00000000..85a7d9c2 --- /dev/null +++ b/bastion/test-resource.tf @@ -0,0 +1,22 @@ +locals { + test_bst_host = trimsuffix(nebius_compute_v1_instance.bastion_instance.status.network_interfaces[0].public_ip_address.address, "/32") +} + +resource "null_resource" "check_bastion_instance" { + count = var.test_mode ? 1 : 0 + + connection { + user = var.ssh_user_name + host = local.test_bst_host + } + + provisioner "remote-exec" { + inline = [ + "set -eu", + "cloud-init status --wait", + "ip link show wg0", + "systemctl -q status wg-quick@wg0.service > /dev/null", + ".nebius/bin/nebius iam whoami > /dev/null" + ] + } +} \ No newline at end of file diff --git a/bastion/tests/main.tftest.hcl b/bastion/tests/main.tftest.hcl new file mode 100644 index 00000000..f95bae42 --- /dev/null +++ b/bastion/tests/main.tftest.hcl @@ -0,0 +1,7 @@ +run "test_mode_bastion_apply" { + command = apply + + variables { + test_mode = true + } +} diff --git a/bastion/variables.tf b/bastion/variables.tf new file mode 100644 index 00000000..73752b81 --- /dev/null +++ b/bastion/variables.tf @@ -0,0 +1,47 @@ +variable "tenant_id" { + description = "Tenant ID." + type = string +} + +variable "parent_id" { + description = "Project ID." + type = string +} + +variable "subnet_id" { + description = "Subnet ID." + type = string +} + +# SSH access +variable "ssh_user_name" { + description = "SSH username." + type = string + default = "bastion" +} + +variable "ssh_public_key" { + description = "SSH Public Key to access the cluster nodes." + type = object({ + key = optional(string), + path = optional(string, "~/.ssh/id_rsa.pub") + }) + default = {} + validation { + condition = var.ssh_public_key.key != null || fileexists(var.ssh_public_key.path) + error_message = "SSH Public Key must be set by `key` or file `path` ${var.ssh_public_key.path}" + } +} + +# Access By IP +variable "public_ip_allocation_id" { + description = "Id of a manually created public_ip_allocation." + type = string + default = null +} + +variable "test_mode" { + description = "Switch between real usage and testing."
+ type = bool + default = false +} diff --git a/bastion/wireguard-ui b/bastion/wireguard-ui new file mode 100755 index 00000000..0e092568 Binary files /dev/null and b/bastion/wireguard-ui differ diff --git a/compute-testing/provider.tf b/compute-testing/provider.tf index 63ebbe74..724fd8b7 100644 --- a/compute-testing/provider.tf +++ b/compute-testing/provider.tf @@ -1,7 +1,7 @@ terraform { required_providers { nebius = { - source = "terraform-provider-nebius.storage.ai.nebius.cloud/nebius/nebius" + source = "terraform-provider.storage.eu-north1.nebius.cloud/nebius/nebius" } } } diff --git a/dsvm/README.md b/dsvm/README.md new file mode 100644 index 00000000..2b017606 --- /dev/null +++ b/dsvm/README.md @@ -0,0 +1,122 @@ +# DSVM + +This Terraform solution deploys a Data Science Virtual Machine (DSVM), a virtual machine with popular pre-installed libraries for data analytics and machine learning. A DSVM can be used as an environment for training models and experimenting with data. + +## The image is based on Ubuntu and includes the following pre-installed software +* Conda, a package manager with Python 2.7 (environment py27) and Python 3.10 (py310). +* Jupyter Notebook and JupyterLab, tools for interactive and reproducible computations. +* Data analysis, scientific computing and data visualisation libraries: Pandas, NumPy, SciPy, Matplotlib. +* Machine Learning libraries: PyTorch, CatBoost, TensorFlow, scikit-learn, Keras. +* PySpark, a library for interacting with Apache Spark™ and building distributed data processing pipelines. +* NLTK, a suite of natural language processing libraries and data. +* Docker®, a container management system. +* Git, a version control system. +* NVIDIA® Data Center Driver, CUDA® Toolkit 12, and Container Toolkit for accelerating machine learning and other compute-intensive applications on NVIDIA GPUs available in Nebius.ai. +* Optimised libraries and instruments for working with images: scikit-image, opencv-python, Pillow. + +## Use cases +* Analysis and prediction of user behavior. +* Analysis of system operation and prediction of failures. +* Customer segmentation. +* Classification of images, documents, and any types of data. +* Recommendation systems. +* Speech synthesis and recognition services. +* Dialog engines. + +## Deployment instructions + +### Prerequisites + +1. Install [Nebius CLI](https://docs.nebius.dev/en/cli/#installation): + ```bash + curl -sSL https://storage.eu-north1.nebius.cloud/cli/install.sh | bash + ``` + +2. Reload your shell session: + + ```bash + exec -l $SHELL + ``` + + or + + ```bash + source ~/.bashrc + ``` + +3. [Configure](https://docs.nebius.ai/cli/configure/) Nebius CLI (we recommend using [service account](https://docs.nebius.ai/iam/service-accounts/manage/)): + ```bash + nebius init + ``` + +### Installation + +To deploy the solution, follow these steps: + +1. Load environment variables: + ```bash + source ./environment.sh + ``` +2. Initialize Terraform: + ```bash + terraform init + ``` +3. Replace the placeholder content in `terraform.tfvars` with the configuration values that you need. See the details [below](#configuration-variables). +4. Preview the deployment plan: + ```bash + terraform plan + ``` +5. Apply the configuration: + ```bash + terraform apply + ``` + Wait for the operation to complete.
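
After the apply completes, the connection details are available as Terraform outputs (`DSVM_Login_URL` and `DSVM_Password`, defined in this solution's `output.tf`):

```bash
# URL of the Jupyter Lab UI (http://<public_ip>:8888):
terraform output DSVM_Login_URL

# Initial Jupyter Lab password (the ID of the VM):
terraform output DSVM_Password
```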
+ +## Configuration variables + +Update the following variables in the `terraform.tfvars` file with your own values: + +- `parent_id` +- `subnet_id` +- `ssh_user_name` +- `ssh_public_key` + +## Usage +* In your web browser, go to ```http://<public_ip>:8888``` to access the UI. The password is your VM’s ID. +* In the UI, open the terminal. +* Activate conda environment: ```conda activate py310``` +* Set a new password and restart the Jupyter Lab daemon: +``` +jupyter-lab password +sudo systemctl restart jupyter-lab +``` + +## Product composition +| Software | Version | +| ------------------------- | ---------- | +| Ubuntu | 22.04 LTS | +| CatBoost | 1.2 | +| Conda | 23.5.0 | +| Docker | 24.0.2 | +| Git | 2.25.1 | +| JupyterLab | 3.6.3 | +| Keras | 2.11.0 | +| Matplotlib | 3.7.1 | +| NLTK | 3.7 | +| NVIDIA CUDA Toolkit | 12.0.1 | +| NVIDIA Container Toolkit | 1.13.2 | +| NVIDIA Data Center Driver | 535.54.03 | +| NumPy | 1.22.3 | +| Pandas | 1.4.2 | +| Pillow | 9.4.0 | +| PySpark | 3.2.1 | +| PyTorch | 2.0.1 | +| SciPy | 1.8.1 | +| TensorFlow | 2.11.0 | +| opencv-python | 4.6.0 | +| scikit-image | 0.19.2 | +| scikit-learn | 1.1.1 | +| accelerate | 0.17.1 | +| datasets | 2.9.0 | +| transformers | 4.27.1 | +| torchvision | 0.15.2 | diff --git a/dsvm/disks.tf b/dsvm/disks.tf new file mode 100644 index 00000000..9bb8847a --- /dev/null +++ b/dsvm/disks.tf @@ -0,0 +1,8 @@ +resource "nebius_compute_v1_disk" "dsvm-boot-disk" { + parent_id = var.parent_id + name = "dsvm-boot-disk" + block_size_bytes = 4096 + size_bytes = 100 * 1024 * 1024 * 1024 # 100GiB + type = "NETWORK_SSD" + source_image_family = { image_family = "ubuntu22.04-cuda12" } +} diff --git a/dsvm/environment.sh b/dsvm/environment.sh new file mode 100644 index 00000000..a5178dbd --- /dev/null +++ b/dsvm/environment.sh @@ -0,0 +1,4 @@ +#!/bin/sh +unset NEBIUS_IAM_TOKEN +export NEBIUS_IAM_TOKEN=$(nebius iam get-access-token) +export TF_VAR_iam_token=$NEBIUS_IAM_TOKEN diff --git a/dsvm/files/dsvm-cloud-init.tftpl b/dsvm/files/dsvm-cloud-init.tftpl new file mode 100644 index 00000000..9cfa5523 --- /dev/null +++ b/dsvm/files/dsvm-cloud-init.tftpl @@ -0,0 +1,152 @@ +users: + - name: ${ssh_user_name} + sudo: ALL=(ALL) NOPASSWD:ALL + shell: /bin/bash + ssh_authorized_keys: + - ${ssh_public_key} + +package_update: true + +write_files: + - content: | + #!/bin/bash + set -e + INSTANCE_ID=$(cloud-init query -f '{{instance_id}}') + PASSWORDHASH=$(/usr/local/bin/miniconda/envs/py310/bin/python -c "from jupyter_server.auth import passwd; print(passwd('$INSTANCE_ID', algorithm='sha1'))") + + mkdir -p /home/${ssh_user_name}/.jupyter/ + cat > /home/${ssh_user_name}/.jupyter/jupyter_server_config.json <requirements.txt + pytorch==2.0.1 + pytorch-cuda=12.1 + torchvision==0.15.2 + pandas==1.4.2 + numpy==1.22.3 + scipy==1.8.1 + matplotlib==3.7.1 + nltk==3.7 + pillow==9.4.0 + scikit-image==0.19.2 + scikit-learn==1.1.1 + opencv==4.6.0 + pyspark==3.2.1 + tensorflow==2.11.0 + traitlets==5.9.0 + catboost==1.2 + keras==2.11.0 + jupyterlab==3.6.3 + accelerate==0.33.0 + datasets==2.9.0 + transformers==4.27.1 + EOF + + sudo sh -c "$MINICONDA_BIN update -y -n base conda" + sudo sh -c "$MINICONDA_BIN install -y -n base conda-libmamba-solver" + sudo sh -c "$MINICONDA_BIN config --set solver libmamba" + sudo sh -c "$MINICONDA_BIN config --add channels defaults" + sudo sh -c "$MINICONDA_BIN install -y -n py310 --file requirements.txt -c fastai -c nvidia -c pytorch -c huggingface -c conda-forge" + + sudo sh -c "$MINICONDA_BIN run -n py310 pip uninstall -y torch torchvision torchaudio" +
sudo sh -c "$MINICONDA_BIN run -n py310 pip install torch==2.1.0+cu121 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121" + echo "Python 3.10 libraries installed" + + sudo chmod -R 777 /usr/local/bin/miniconda + sudo ln -s /usr/local/bin/miniconda/bin/conda /usr/local/bin/conda + echo "Miniconda directory access enabled" + path: /etc/jupyter-lab/scripts/install-conda-packages.sh + permissions: "0755" + owner: root:root + - content: | + [Unit] + Description=Jupyter-lab initialization service + Wants=cloud-init.target + + [Service] + Type=oneshot + ExecStart=/etc/jupyter-lab/setup-jupyter-lab.sh + ExecStartPost=systemctl disable jupyter-lab-init.service + + [Install] + WantedBy=multi-user.target + path: /etc/systemd/system/jupyter-lab-init.service + permissions: "0644" + owner: root:root + - content: | + [Unit] + Description=Jupyter-lab server + After=network.target + StartLimitIntervalSec=0 + + [Service] + Type=simple + Restart=always + RestartSec=1 + User=${ssh_user_name} + ExecStart=/usr/local/bin/miniconda/envs/py310/bin/jupyter-lab --ServerApp.port=8888 --ServerApp.ip=0.0.0.0 + WorkingDirectory=/home/${ssh_user_name} + + [Install] + WantedBy=multi-user.target + path: /etc/systemd/system/jupyter-lab.service + permissions: "0644" + owner: root:root + +runcmd: + - sudo bash /etc/jupyter-lab/scripts/install-conda.sh + - sudo bash /etc/jupyter-lab/scripts/install-conda-packages.sh + - sudo -u ${ssh_user_name} /usr/local/bin/miniconda/bin/conda init bash + - sudo bash /etc/jupyter-lab/setup-jupyter-lab.sh + - sudo systemctl daemon-reload + - sudo systemctl enable jupyter-lab-init.service + - sudo systemctl start jupyter-lab-init.service + - sudo systemctl enable jupyter-lab.service + - sudo systemctl start jupyter-lab.service + - sudo systemctl restart jupyter-lab.service diff --git a/dsvm/locals.tf b/dsvm/locals.tf new file mode 100644 index 00000000..ed79b470 --- /dev/null +++ b/dsvm/locals.tf @@ -0,0 +1,4 @@ +locals { + ssh_public_key = var.ssh_public_key.key != null ? var.ssh_public_key.key : ( + fileexists(var.ssh_public_key.path) ? 
file(var.ssh_public_key.path) : null) +} diff --git a/dsvm/main.tf b/dsvm/main.tf new file mode 100644 index 00000000..fc0c9afe --- /dev/null +++ b/dsvm/main.tf @@ -0,0 +1,28 @@ +resource "nebius_compute_v1_instance" "dsvm_instance" { + parent_id = var.parent_id + name = "dsvm-instance" + + boot_disk = { + attach_mode = "READ_WRITE" + existing_disk = nebius_compute_v1_disk.dsvm-boot-disk + } + + network_interfaces = [ + { + name = "eth0" + subnet_id = var.subnet_id + ip_address = {} + public_ip_address = {} + } + ] + + resources = { + platform = var.platform + preset = var.preset + } + + cloud_init_user_data = templatefile("./files/dsvm-cloud-init.tftpl", { + ssh_user_name = var.ssh_user_name, + ssh_public_key = local.ssh_public_key, + }) +} diff --git a/dsvm/output.tf b/dsvm/output.tf new file mode 100644 index 00000000..d67b6ecd --- /dev/null +++ b/dsvm/output.tf @@ -0,0 +1,6 @@ +output "DSVM_Login_URL" { + value = "http://${trimsuffix(nebius_compute_v1_instance.dsvm_instance.status.network_interfaces[0].public_ip_address.address, "/32")}:8888" +} +output "DSVM_Password" { + value = nebius_compute_v1_instance.dsvm_instance.id +} \ No newline at end of file diff --git a/slurm/versions.tf b/dsvm/provider.tf similarity index 100% rename from slurm/versions.tf rename to dsvm/provider.tf diff --git a/dsvm/terraform.tfvars b/dsvm/terraform.tfvars new file mode 100644 index 00000000..3d67de01 --- /dev/null +++ b/dsvm/terraform.tfvars @@ -0,0 +1,8 @@ +# parent_id = "" # The project-id in this context +# subnet_id = "" # Use the command "nebius vpc v1alpha1 network list" to see the subnet id +# region = "eu-north1" # Project region +# ssh_user_name = "ubuntu" # Username you want to use to connect to the nodes +# ssh_public_key = { +# key = "put your public ssh key here" +# path = "put path to ssh key here" +# } diff --git a/dsvm/test-resource.tf b/dsvm/test-resource.tf new file mode 100644 index 00000000..7086b6b7 --- /dev/null +++ b/dsvm/test-resource.tf @@ -0,0 +1,19 @@ +locals { + test_dsvm_host = trimsuffix(nebius_compute_v1_instance.dsvm_instance.status.network_interfaces[0].public_ip_address.address, "/32") +} + +resource "null_resource" "check_dsvm_instance" { + count = var.test_mode ? 1 : 0 + + connection { + user = var.ssh_user_name + host = local.test_dsvm_host + } + + provisioner "remote-exec" { + inline = [ + "set -eu", + "cloud-init status --wait" + ] + } +} diff --git a/dsvm/tests/main.tftest.hcl b/dsvm/tests/main.tftest.hcl new file mode 100644 index 00000000..d926654d --- /dev/null +++ b/dsvm/tests/main.tftest.hcl @@ -0,0 +1,3 @@ +run "dsvm_apply" { + command = apply +} \ No newline at end of file diff --git a/dsvm/variables.tf b/dsvm/variables.tf new file mode 100644 index 00000000..371dbdff --- /dev/null +++ b/dsvm/variables.tf @@ -0,0 +1,55 @@ +# Global parameters +variable "parent_id" { + description = "Project ID." + type = string +} + +variable "subnet_id" { + description = "Subnet ID." + type = string +} + +variable "region" { + description = "Project region." + type = string + default = "eu-north1" # https://docs.nebius.com/overview/regions +} + +# Platform +variable "platform" { + description = "Platform for DSVM host." + type = string + default = "gpu-h100-sxm" # https://docs.nebius.com/compute/virtual-machines/types#gpu-configurations +} + +variable "preset" { + description = "Preset for DSVM host." 
+ type = string + default = "1gpu-16vcpu-200gb" # https://docs.nebius.com/compute/virtual-machines/types#gpu-configurations +} + +# SSH access +variable "ssh_user_name" { + description = "SSH username." + type = string + default = "ubuntu" +} + +variable "ssh_public_key" { + description = "SSH Public Key to access the cluster nodes." + type = object({ + key = optional(string), + path = optional(string, "~/.ssh/id_rsa.pub") + }) + default = {} + validation { + condition = var.ssh_public_key.key != null || fileexists(var.ssh_public_key.path) + error_message = "SSH Public Key must be set by `key` or file `path` ${var.ssh_public_key.path}" + } +} + +variable "test_mode" { + description = "Switch between real usage and testing." + type = bool + default = false +} diff --git a/k8s-inference/README.md b/k8s-inference/README.md index 79317061..a024c0ba 100644 --- a/k8s-inference/README.md +++ b/k8s-inference/README.md @@ -13,7 +13,7 @@ 1. Install [Nebius CLI](https://docs.nebius.ai/cli/install/): ```bash - curl -sSL https://storage.ai.nebius.cloud/nebius/install.sh | bash + curl -sSL https://storage.eu-north1.nebius.cloud/cli/install.sh | bash ``` 2. Reload your shell session: @@ -107,7 +107,7 @@ loki_secret_key = "" # See the instruction in README.md on how to create this Check the details below for more information on [Grafana](#grafana), [Prometheus](#prometheus), [Loki](#temporary-block-to-make-loki-work-now) and [NVIDIA DCGM](#nvidia-dcgm-exporter-dashboard-and-alerting). -> Deploying Loki will require you to create a service account! Please check the instructions [here](#temporary-block-to-make-loki-work-now)! +> Deploying Loki will require you to create a service account! Please check the instructions [here](https://docs.nebius.com/iam/service-accounts/manage) to create a service account with access to the storage and [here](https://docs.nebius.com/iam/service-accounts/access-keys) to create the access key. You can refer to the access key creation command [here](https://docs.nebius.com/cli/reference/iam/access-key/create). ### Storage configuration ```hcl diff --git a/k8s-inference/provider.tf b/k8s-inference/provider.tf index 8c0aac3b..facdaf13 100644 --- a/k8s-inference/provider.tf +++ b/k8s-inference/provider.tf @@ -1,7 +1,7 @@ terraform { required_providers { nebius = { - source = "terraform-provider-nebius.storage.ai.nebius.cloud/nebius/nebius" + source = "terraform-provider.storage.eu-north1.nebius.cloud/nebius/nebius" } } } diff --git a/k8s-training/README.md b/k8s-training/README.md index 63f74a7d..56213b72 100644 --- a/k8s-training/README.md +++ b/k8s-training/README.md @@ -16,7 +16,7 @@ 1. Install [Nebius CLI](https://docs.nebius.ai/cli/install/): ```bash - curl -sSL https://storage.ai.nebius.cloud/nebius/install.sh | bash + curl -sSL https://storage.eu-north1.nebius.cloud/cli/install.sh | bash ``` 2. Reload your shell session: @@ -120,7 +120,7 @@ loki_secret_key = "" # See the instruction in README.md on how to create this See the details below for more information on [Grafana](#grafana), [Prometheus](#prometheus), [Loki](#temporary-block-to-make-loki-work-now) and [NVIDIA DCGM](#nvidia-dcgm-exporter-dashboard-and-alerting). -> To deploy Loki, you will need to create a service account. See the instructions [here](#temporary-block-to-make-loki-work-now). +> Deploying Loki will require you to create a service account!
Please check the instructions [here](https://docs.nebius.com/iam/service-accounts/manage) to create a service account with access to the storage and [here](https://docs.nebius.com/iam/service-accounts/access-keys) to create the access key. You can refer to the access key creation command [here](https://docs.nebius.com/cli/reference/iam/access-key/create). ### Storage configuration diff --git a/k8s-training/provider.tf b/k8s-training/provider.tf index 8c0aac3b..facdaf13 100644 --- a/k8s-training/provider.tf +++ b/k8s-training/provider.tf @@ -1,7 +1,7 @@ terraform { required_providers { nebius = { - source = "terraform-provider-nebius.storage.ai.nebius.cloud/nebius/nebius" + source = "terraform-provider.storage.eu-north1.nebius.cloud/nebius/nebius" } } } diff --git a/modules/cloud-init/bastion-cloud-init.tftpl b/modules/cloud-init/bastion-cloud-init.tftpl new file mode 100644 index 00000000..9aae9c7c --- /dev/null +++ b/modules/cloud-init/bastion-cloud-init.tftpl @@ -0,0 +1,172 @@ +users: + - name: ${ssh_user_name} + sudo: ALL=(ALL) NOPASSWD:ALL + shell: /bin/bash + ssh_authorized_keys: + - ${ssh_public_key} + +package_update: true +packages: + - wireguard + - curl + - jq + +write_files: + - content: | + [Unit] + Description=Restart WireGuard + After=network.target + [Service] + Type=oneshot + ExecStart=/usr/bin/systemctl restart wg-quick@wg0.service + [Install] + RequiredBy=wgui.path + path: /etc/systemd/system/wgui.service + permissions: "0400" + owner: root:root + - content: | + [Unit] + Description=Watch /etc/wireguard/wg0.conf for changes + [Path] + PathModified=/etc/wireguard/wg0.conf + [Install] + WantedBy=multi-user.target + path: /etc/systemd/system/wgui.path + permissions: "0400" + owner: root:root + - content: | + [Unit] + Description=wgui server.
+ After=network.target + Wants=network-online.target systemd-networkd-wait-online.service + + [Service] + ExecStart=/opt/wireguard-ui + Restart=on-abnormal + User=root + Group=root + WorkingDirectory=/var/lib/wireguard-ui + Environment="WGUI_PASSWORD_FILE=/var/lib/wireguard-ui/initial_password" + Environment="WGUI_LOG_LEVEL=DEBUG" + + [Install] + WantedBy=multi-user.target + path: /etc/systemd/system/wgui_server.service + permissions: "0400" + owner: root:root + - content: ${jsonencode(sa_private_key)} + path: /root/.ssh/sa_private_key.pem + permissions: "0400" + owner: root:root + - content: | + AllowUsers ${ssh_user_name} + AllowUsers bastion + AllowTcpForwarding yes + PermitTunnel yes + GatewayPorts yes + PermitRootLogin no + Banner none + path: /etc/ssh/sshd_config.d/65-bastion-settings.conf + permissions: "644" + owner: root:root + - content: | + #!/bin/bash + cat <<'EOF' + + ██╗ ██╗███████╗██████╗ ██╗██╗ ██╗███████╗ + ████╗ ██║██╔════╝██╔══██╗██║██║ ██║██╔════╝ + ██╔██╗ ██║█████╗ ██████╔╝██║██║ ██║███████╗ + ██║╚██╗██║██╔══╝ ██╔══██╗██║██║ ██║╚════██║ + ██║ ╚████║███████╗██████╔╝██║╚╗████╔╝ ███████║ + ╚═╝ ╚═══╝╚══════╝╚═════╝ ╚═╝ ╚════╝ ╚══════╝ + EOF + path: /etc/update-motd.d/01-nebius + permissions: "0755" + owner: root:root + +runcmd: + - echo "### Install Nebius CLI" >> /tmp/csa-install.log 2>&1 + - su - ${ssh_user_name} -c 'export HOME=/home/${ssh_user_name}' >> /tmp/csa-install.log 2>&1 + - su - ${ssh_user_name} -c 'curl -sSL https://storage.eu-north1.nebius.cloud/cli/install.sh | bash' >> /tmp/csa-install.log 2>&1 + - su - ${ssh_user_name} -c 'source "/home/${ssh_user_name}/.bashrc"' >> /tmp/csa-install.log 2>&1 + + - echo "### Copy SA key to bastion user" >> /tmp/csa-install.log 2>&1 + - sudo cp /root/.ssh/sa_private_key.pem /home/${ssh_user_name}/.ssh/sa_private_key.pem >> /tmp/csa-install.log 2>&1 + - sudo chown ${ssh_user_name}:${ssh_user_name} /home/${ssh_user_name}/.ssh/sa_private_key.pem >> /tmp/csa-install.log 2>&1 + + - echo "### Nebius CLI profile create" >> /tmp/csa-install.log 2>&1 + - su - ${ssh_user_name} -c '/home/${ssh_user_name}/.nebius/bin/nebius profile create --endpoint api.eu.nebius.cloud --service-account-id ${service_account_id} --public-key-id ${sa_public_key_id} --private-key-file /home/${ssh_user_name}/.ssh/sa_private_key.pem --profile default --parent-id ${parent_id}' >> /tmp/csa-install.log 2>&1 + + - echo "### Install kubectl" >> /tmp/csa-install.log 2>&1 + - curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl" >> /tmp/csa-install.log 2>&1 + - install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl >> /tmp/csa-install.log 2>&1 + + - echo "### kubectl mk8s config" >> /tmp/csa-install.log 2>&1 + - su - ${ssh_user_name} -c '/home/${ssh_user_name}/.nebius/bin/nebius mk8s v1 cluster list --parent-id ${parent_id} --format json | jq -r .items[0].metadata.id > /tmp/cluster.id' + - | + if ! grep -q "null" /tmp/cluster.id; then + su - ${ssh_user_name} -c '/home/${ssh_user_name}/.nebius/bin/nebius mk8s cluster get-credentials --id $(cat /tmp/cluster.id) --internal' >> /tmp/csa-install.log 2>&1 + else + echo "Cluster ID is null. Skipping credential retrieval." 
>> /tmp/csa-install.log + fi + + # Creating keys + - wg genkey | sudo tee /etc/wireguard/private.key + - sudo chmod go= /etc/wireguard/private.key + - sudo cat /etc/wireguard/private.key | wg pubkey | sudo tee /etc/wireguard/public.key + + # Creating wg0.conf file + - export PRIVATE_KEY=$(sudo cat /etc/wireguard/private.key) + - export INTERFACE=$(ip route list default | awk '{for(i=1;i<=NF;i++) if($i=="dev") print $(i+1)}') + - | + sudo tee /etc/wireguard/wg0.conf <> /home/tux/log.txt +- parted -s /dev/disk/by-id/virtio-${extra_disk_id} mklabel gpt >> /home/tux/log.txt +- parted -s /dev/disk/by-id/virtio-${extra_disk_id} mkpart primary ext4 0% 100% >> /home/tux/log.txt +- sync +- echo "sync" >> /home/tux/log.txt +- mkfs.ext4 /dev/disk/by-id/virtio-${extra_disk_id}-part1 >> /home/tux/log.txt +- sync +- mount -o rw /dev/disk/by-id/virtio-${extra_disk_id}-part1 ${extra_path} >> /home/tux/log.txt +- echo "/dev/disk/by-id/virtio-${extra_disk_id}-part1 ${extra_path} ext4 defaults 0 2" >> /etc/fstab +- chown nobody:nogroup ${extra_path} +- chmod 777 ${extra_path} +%{ endif } + +# mount shared filesystem if provided +%{if shared_filesystem_id != "" } +- mkdir -p ${shared_filesystem_mount} +- mount -t virtiofs filesystem-0 ${shared_filesystem_mount} +- chmod a+w ${shared_filesystem_mount} +- echo "filesystem-0 ${shared_filesystem_mount} virtiofs rw 0 0" >> /etc/fstab +%{endif} + +# install s3 access +%{if aws_access_key_id != ""} +- snap install aws-cli --classic + +%{ for user in users} +- mkdir -p /home/${user.user_name}/.aws +- echo "[default]" > /home/${user.user_name}/.aws/credentials +- echo "aws_access_key_id=${aws_access_key_id}" >> /home/${user.user_name}/.aws/credentials +- echo "aws_secret_access_key=${aws_secret_access_key}" >> /home/${user.user_name}/.aws/credentials +- echo "[default]" > /home/${user.user_name}/.aws/config +- echo "endpoint_url = https://storage.eu-north1.nebius.cloud:443" >> /home/${user.user_name}/.aws/config +- echo "region = eu-north1" >> /home/${user.user_name}/.aws/config +- chown -R ${user.user_name}:${user.user_name} /home/${user.user_name}/.aws +- chmod 600 /home/${user.user_name}/.aws/credentials +%{ endfor } + +- mkdir -p /root/.aws +- echo "[default]" > /root/.aws/credentials +- echo "aws_access_key_id=${aws_access_key_id}" >> /root/.aws/credentials +- echo "aws_secret_access_key=${aws_secret_access_key}" >> /root/.aws/credentials +- echo "[default]" > /root/.aws/config +- echo "endpoint_url = https://storage.eu-north1.nebius.cloud:443" >> /root/.aws/config +- echo "region = eu-north1" >> /root/.aws/config + + +# install s3 mount + +%{if mount_bucket != "" } +- wget https://s3.amazonaws.com/mountpoint-s3-release/1.14.0/x86_64/mount-s3-1.14.0-x86_64.deb +- dpkg -i mount-s3-1.14.0-x86_64.deb +- mkdir -p ${s3_mount_path} +- mount-s3 --upload-checksums=off --maximum-throughput-gbps=200 --allow-delete --allow-overwrite --allow-other --endpoint-url=https://storage.eu-north1.nebius.cloud:443 ${mount_bucket} ${s3_mount_path} +%{endif} +%{endif} diff --git a/modules/cloud-init/wireguard-cloud-init.tftpl b/modules/cloud-init/wireguard-cloud-init.tftpl index cb19dc35..f4f66c86 100644 --- a/modules/cloud-init/wireguard-cloud-init.tftpl +++ b/modules/cloud-init/wireguard-cloud-init.tftpl @@ -101,7 +101,7 @@ runcmd: - sudo mkdir -p /var/lib/wireguard-ui - tr -dc A-Za-z0-9 > /etc/exports -- netplan set ethernets.eth0.mtu=${mtu_size} -- netplan apply -- systemctl restart nfs-kernel-server +disk_setup: + /dev/disk/by-id/virtio-nfs-disk: # hardcoded device-label: nfs-disk 
+ table_type: gpt + layout: true + +fs_setup: + - device: /dev/disk/by-id/virtio-nfs-disk-part1 # hardcoded device-label: nfs-disk + filesystem: ext4 + +mounts: + - [ /dev/disk/by-id/virtio-nfs-disk, ${nfs_path}, ext4, "defaults,relatime,rw", "0", "0" ] # hardcoded device-label: nfs-disk + +write_files: + - path: /etc/exports + content: | + ${nfs_path} ${nfs_ip_range}(rw,async,no_subtree_check,no_root_squash) + append: true + +runcmd: + # Prepare NFS mountpoint + - mkdir -p ${nfs_path} + + # Configure permissions + - chown nobody:nogroup ${nfs_path} + - chmod 777 ${nfs_path} + + # Configure NFS export + - echo "${nfs_path} ${nfs_ip_range}(rw,async,no_subtree_check,no_root_squash)" >> /etc/exports + + # Netplan config for MTU + - echo "Writing netplan configuration..." + - | + cat <<EOF > /etc/netplan/50-cloud-init.yaml + network: + version: 2 + ethernets: + enp: + match: + name: "enp*" + dhcp4: true + mtu: ${mtu_size} + EOF + - chmod 600 /etc/netplan/50-cloud-init.yaml + - echo "Netplan configuration written. Applying it..." + - netplan generate + - netplan apply + - echo "Netplan applied successfully." + + # Restart NFS service + - systemctl restart nfs-kernel-server \ No newline at end of file diff --git a/modules/nfs-server/main.tf b/modules/nfs-server/main.tf index baa910a1..791dd735 100644 --- a/modules/nfs-server/main.tf +++ b/modules/nfs-server/main.tf @@ -23,6 +23,7 @@ resource "nebius_compute_v1_instance" "nfs_server" { secondary_disks = [ { + device_id = "nfs-disk" # hardcoded device-label attach_mode = "READ_WRITE" existing_disk = nebius_compute_v1_disk.nfs-storage-disk } diff --git a/modules/nfs-server/provider.tf b/modules/nfs-server/provider.tf index 92a04b1f..4505d171 100644 --- a/modules/nfs-server/provider.tf +++ b/modules/nfs-server/provider.tf @@ -1,7 +1,7 @@ terraform { required_providers { nebius = { - source = "terraform-provider-nebius.storage.ai.nebius.cloud/nebius/nebius" + source = "terraform-provider.storage.eu-north1.nebius.cloud/nebius/nebius" } } } diff --git a/modules/nfs-server/variables.tf b/modules/nfs-server/variables.tf index a429d6df..513576b4 100644 --- a/modules/nfs-server/variables.tf +++ b/modules/nfs-server/variables.tf @@ -66,10 +66,22 @@ variable "nfs_ip_range" { variable "mtu_size" { type = string description = "MTU size to make network fater" - default = "8910" + default = "8800" } variable "nfs_size" { type = string description = "Size of the NFS in GB, should be divisbile by 93" -} \ No newline at end of file +} + +variable "nfs_device_label" { + type = string + description = "device label to use later as device ID" + default = "nfs-disk" +} + +variable "nfs_disk_name_suffix" { + type = string + description = "Name suffix to be able to create several NFS disks in the same parent" + default = "" +} diff --git a/modules/o11y/versions.tf b/modules/o11y/versions.tf index 92a04b1f..4505d171 100644 --- a/modules/o11y/versions.tf +++ b/modules/o11y/versions.tf @@ -1,7 +1,7 @@ terraform { required_providers { nebius = { - source = "terraform-provider-nebius.storage.ai.nebius.cloud/nebius/nebius" + source = "terraform-provider.storage.eu-north1.nebius.cloud/nebius/nebius" } } } diff --git a/nfs-server/provider.tf b/nfs-server/provider.tf index 92a04b1f..4505d171 100644 --- a/nfs-server/provider.tf +++ b/nfs-server/provider.tf @@ -1,7 +1,7 @@ terraform { required_providers { nebius = { - source = "terraform-provider-nebius.storage.ai.nebius.cloud/nebius/nebius" + source = "terraform-provider.storage.eu-north1.nebius.cloud/nebius/nebius" } } } diff --git
a/nfs-server/variables.tf b/nfs-server/variables.tf index 6f5ac76c..5c2cb9fc 100644 --- a/nfs-server/variables.tf +++ b/nfs-server/variables.tf @@ -46,4 +46,5 @@ variable "ssh_user_name" { variable "nfs_ip_range" { type = string description = "Ip range from where NFS will be available" + default = "192.168.0.0/16" } \ No newline at end of file diff --git a/slurm/.gitignore b/slurm/.gitignore deleted file mode 100644 index af8bf86a..00000000 --- a/slurm/.gitignore +++ /dev/null @@ -1,4 +0,0 @@ -.terraform* -*.tfstate* -.DS_Store -**/*.zip diff --git a/slurm/README.md b/slurm/README.md deleted file mode 100644 index 357f6249..00000000 --- a/slurm/README.md +++ /dev/null @@ -1,80 +0,0 @@ -# Nebius SLURM cluster installation module - -This Terraform module provisions a slurm cluster on Nebius Cloud. It creates virtual machines for worker nodes, which are then configured and prepared for operation by the master. To run container workloads, it also installs the enroot and pyxis plugins. - -## Module Structure - -The module includes the following files and directories: - -- `main.tf` - The main Terraform configuration file for the module. -- `variables.tf` - Definitions of variables used in the module. -- `outputs.tf` - Outputs generated after the module has been applied, which are then used to create the inventory.yaml file. -- `versions.tf` - The provider configuration file (to be filled in with your provider's details). -- `terraform.tfvars` - Variable values. - -## Configuring Terraform for Nebius Cloud - -- Install [Nebius CLI](https://docs.nebius.ai/cli/quickstart) -- Add environment variables for Terraform authentication in Nebuis Cloud - -## Preparing the environment - -```bash -source ./environment.sh -``` - -## Usage - -To use this module in your Terraform environment, define a Terraform configuration in file `terraform.tfvars`. 
- -Run: - -```bash -terraform init -terraform apply -``` - -After the apply process has been completed, connect to the master node via ssh: - -```bash -ssh slurm@ -``` - -Now you can monitor the progress of the cloud-init scripts: - -```bash -sudo tail -f /var/log/cloud-init-output.log -``` - -## Shared storage installation - -There are three types of shared storage that you can install: - -- Nebius AI shared filesystem -- NFS VM with exported nfs storage mounted on all slurm worker nodes to /mnt/slurm -- GlusterFS cluster with shared Glusterfs volume mounted on all worker nodes in /mnt/slurm - -To create shared storage, edit the `terraform.tfvars` file before running the Terraform script: - -To enable the creation of shared storage, set the following variables: - -- variable "shared_fs_type" set to: - - "filestore" to use a shared filesystem mounted on /mnt/slurm on all worker nodes - - `null` or remove it to use without shared storage -- variable "fs_size" - The size of the shared filesystem or NFS (value should be a multiple of 930) - -## Post-installation steps - -Check the slurm cluster status: - -``` -sinfo -Nl -``` - -correct status should be like (STATE: idle): - -``` -NODELIST NODES PARTITION STATE CPUS S:C:T MEMORY TMP_DISK WEIGHT AVAIL_FE REASON -slurm-worker-1 1 debug* idle 160 160:1:1 129008 0 1 (null) none -slurm-worker-2 1 debug* idle 160 160:1:1 129008 0 1 (null) none -``` diff --git a/slurm/ansible_role.tf b/slurm/ansible_role.tf deleted file mode 100644 index f6ef9615..00000000 --- a/slurm/ansible_role.tf +++ /dev/null @@ -1,5 +0,0 @@ -data "archive_file" "ansible_role" { - type = "zip" - output_path = "files/ansible_role.zip" - source_dir = "files/ansible" -} diff --git a/slurm/files/ansible/roles/slurm-common/files/nccl-topo-h100-v1.xml b/slurm/files/ansible/roles/slurm-common/files/nccl-topo-h100-v1.xml deleted file mode 100644 index b61233fd..00000000 --- a/slurm/files/ansible/roles/slurm-common/files/nccl-topo-h100-v1.xml +++ /dev/null @@ -1,70 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/slurm/files/ansible/roles/slurm-common/tasks/enroot-packages.yml b/slurm/files/ansible/roles/slurm-common/tasks/enroot-packages.yml deleted file mode 100644 index b0d1a21f..00000000 --- a/slurm/files/ansible/roles/slurm-common/tasks/enroot-packages.yml +++ /dev/null @@ -1,6 +0,0 @@ -- name: Install enroot packages - apt: - deb: "{{ item }}" - with_items: - - https://github.com/NVIDIA/enroot/releases/download/v{{ENROOT_VERSION}}/enroot_{{ENROOT_VERSION}}-1_amd64.deb - - https://github.com/NVIDIA/enroot/releases/download/v{{ENROOT_VERSION}}/enroot+caps_{{ENROOT_VERSION}}-1_amd64.deb diff --git a/slurm/files/ansible/roles/slurm-common/tasks/files.yml b/slurm/files/ansible/roles/slurm-common/tasks/files.yml deleted file mode 100644 index dd49c7e6..00000000 --- a/slurm/files/ansible/roles/slurm-common/tasks/files.yml +++ /dev/null @@ -1,86 +0,0 @@ -- name: /etc/nccl-topo-h100-v1.xml - copy: - dest: /etc/nccl-topo-h100-v1.xml - src: nccl-topo-h100-v1.xml - -- name: Set NCCL_TOPO_FILE - ansible.builtin.lineinfile: - path: /home/slurm/.bashrc - line: export NCCL_TOPO_FILE=/etc/nccl-topo-h100-v1.xml - -- name: /etc/enroot/enroot.conf.d/enroot.conf - copy: - dest: /etc/enroot/enroot.conf.d/enroot.conf - content: | - ENROOT_RUNTIME_PATH /var/spool/enroot/user-$(id -u) - ENROOT_CONFIG_PATH ${HOME}/enroot - ENROOT_CACHE_PATH /var/spool/enroot - ENROOT_DATA_PATH 
/var/spool/enroot/data/user-$(id -u) - ENROOT_SQUASH_OPTIONS -noI -noD -noF -noX -no-duplicates - ENROOT_ROOTFS_WRITABLE yes - ENROOT_MOUNT_HOME no - ENROOT_RESTRICT_DEV no - -- name: /var/log/slurm - file: - path: /var/log/slurm - owner: slurm - group: slurm - state: directory - recurse: yes - -- name: /var/lib/slurm/enroot - file: - path: /var/lib/slurm/enroot - state: directory - -- name: /var/spool/slurmd - file: - path: /var/spool/slurmd - state: directory - owner: slurm - group: slurm - recurse: yes - -- name: /var/spool/enroot - file: - path: /var/spool/enroot - state: directory - owner: slurm - group: slurm - recurse: yes - -- name: /var/spool/enroot/data - file: - path: /var/spool/enroot/data - state: directory - owner: slurm - group: slurm - recurse: yes - -- name: /var/lib/slurm - file: - path: /var/lib/slurm - owner: slurm - group: slurm - recurse: yes - -- name: /etc/slurm/slurmdbd.conf - file: - path: /etc/slurm/slurmdbd.conf - owner: slurm - group: slurm - -- name: /etc/slurm - file: - path: /etc/slurm - owner: slurm - group: slurm - state: directory - recurse: yes - -- name: /etc/slurm/plugstack.conf - copy: - dest: /etc/slurm/plugstack.conf - content: | - required /usr/local/lib/slurm/spank_pyxis.so runtime_path=/var/lib/slurm/pyxis diff --git a/slurm/files/ansible/roles/slurm-common/tasks/main.yml b/slurm/files/ansible/roles/slurm-common/tasks/main.yml deleted file mode 100644 index 719f0e74..00000000 --- a/slurm/files/ansible/roles/slurm-common/tasks/main.yml +++ /dev/null @@ -1,7 +0,0 @@ -- include_tasks: packages.yml -- include_tasks: slurm-packages.yml -- include_tasks: enroot-packages.yml -- include_tasks: pyxis.yml -- include_tasks: pmix.yml -- include_tasks: files.yml -- include_tasks: services.yml diff --git a/slurm/files/ansible/roles/slurm-common/tasks/packages.yml b/slurm/files/ansible/roles/slurm-common/tasks/packages.yml deleted file mode 100644 index 4d23d1b8..00000000 --- a/slurm/files/ansible/roles/slurm-common/tasks/packages.yml +++ /dev/null @@ -1,18 +0,0 @@ -- name: Install packages - apt: - install_recommends: true - update_cache: true - lock_timeout: 600 - name: - - ca-certificates - - curl - - make - - gcc - - fuse-overlayfs - - glusterfs-client - - munge - - nfs-common - - numactl - - parallel - - libevent-dev - - libhwloc-dev diff --git a/slurm/files/ansible/roles/slurm-common/tasks/pmix.yml b/slurm/files/ansible/roles/slurm-common/tasks/pmix.yml deleted file mode 100644 index 9c8c5890..00000000 --- a/slurm/files/ansible/roles/slurm-common/tasks/pmix.yml +++ /dev/null @@ -1,25 +0,0 @@ -- name: Clone the repository PMIX - get_url: - url: https://github.com/openpmix/openpmix/releases/download/v{{ PMIX_VERSION }}/pmix-{{ PMIX_VERSION }}.tar.gz - dest: /usr/src/pmix-{{ PMIX_VERSION }}.tar.gz - register: pmix_repo - -- name: Uncompress PMIX - unarchive: - src: /usr/src/pmix-{{ PMIX_VERSION }}.tar.gz - dest: /usr/src/ - when: pmix_repo.changed - register: pmix_file - -- name: Configure PMIX - shell: ./configure - args: - chdir: "/usr/src/pmix-{{ PMIX_VERSION }}" - when: pmix_file.changed - -- name: Make and install PMIX - make: - target: install - jobs: "{{ ansible_processor_cores }}" - chdir: "/usr/src/pmix-{{ PMIX_VERSION }}" - when: pmix_file.changed diff --git a/slurm/files/ansible/roles/slurm-common/tasks/pyxis.yml b/slurm/files/ansible/roles/slurm-common/tasks/pyxis.yml deleted file mode 100644 index 194ca89a..00000000 --- a/slurm/files/ansible/roles/slurm-common/tasks/pyxis.yml +++ /dev/null @@ -1,30 +0,0 @@ -- name: Create directory for pyxis 
sources - file: - path: /opt/pyxis - state: directory - owner: root - group: root - mode: '0755' - -- name: Clone the pyxis repository - git: - repo: https://github.com/NVIDIA/pyxis.git - dest: /opt/pyxis - register: pyxis_repo - -- name: Make and install pyxis - make: - chdir: /opt/pyxis - target: install - jobs: "{{ ansible_processor_cores }}" - environment: - CPPFLAGS: '-I /usr/include/slurm' - when: pyxis_repo.changed - -- name: create dir - file: - path: "/var/lib/slurm/pyxis" - state: directory - owner: slurm - group: slurm - mode: '0755' diff --git a/slurm/files/ansible/roles/slurm-common/tasks/services.yml b/slurm/files/ansible/roles/slurm-common/tasks/services.yml deleted file mode 100644 index c16f8aac..00000000 --- a/slurm/files/ansible/roles/slurm-common/tasks/services.yml +++ /dev/null @@ -1,9 +0,0 @@ -- name: Disable and stop services - ansible.builtin.service: - name: "{{ item }}" - enabled: no - state: stopped - with_items: - - apt-daily.timer - - apt-daily-upgrade.timer - - unattended-upgrades diff --git a/slurm/files/ansible/roles/slurm-common/tasks/slurm-packages.yml b/slurm/files/ansible/roles/slurm-common/tasks/slurm-packages.yml deleted file mode 100644 index 9fed62ad..00000000 --- a/slurm/files/ansible/roles/slurm-common/tasks/slurm-packages.yml +++ /dev/null @@ -1,9 +0,0 @@ -- name: Install slurm common packages - apt: - deb: "{{ item }}" - with_items: - - https://github.com/nebius/slurm-deb-packages/releases/download/{{ SLURM_BINARIES }}{{ SLURM_VERSION }}/slurm-smd_{{ SLURM_VERSION }}-1_amd64.deb - - https://github.com/nebius/slurm-deb-packages/releases/download/{{ SLURM_BINARIES }}{{ SLURM_VERSION }}/slurm-smd-slurmdbd_{{ SLURM_VERSION }}-1_amd64.deb - - https://github.com/nebius/slurm-deb-packages/releases/download/{{ SLURM_BINARIES }}{{ SLURM_VERSION }}/slurm-smd-client_{{ SLURM_VERSION }}-1_amd64.deb - - https://github.com/nebius/slurm-deb-packages/releases/download/{{ SLURM_BINARIES }}{{ SLURM_VERSION }}/slurm-smd-dev_{{ SLURM_VERSION }}-1_amd64.deb - register: slurm_common_packages diff --git a/slurm/files/ansible/roles/slurm-master/tasks/files.yml b/slurm/files/ansible/roles/slurm-master/tasks/files.yml deleted file mode 100644 index 7af3abde..00000000 --- a/slurm/files/ansible/roles/slurm-master/tasks/files.yml +++ /dev/null @@ -1,96 +0,0 @@ -- name: /etc/systemd/system/slurmd.service - copy: - dest: /etc/systemd/system/slurmd.service - content: | - [Service] - User=slurm - Group=slurm - -- name: /etc/clustershell/groups.conf - copy: - dest: /etc/clustershell/groups.conf - content: | - [Main] - default: cluster - confdir: /etc/clustershell/groups.conf.d $CFGDIR/groups.conf.d - autodir: /etc/clustershell/groups.d $CFGDIR/groups.d - -- name: /var/lib/slurm/slurmctld - file: - path: /var/lib/slurm/slurmctld - owner: slurm - group: slurm - -- name: /etc/slurm/gres.conf - copy: - dest: /etc/slurm/gres.conf - content: | - Name=gpu Type=gpu File=/dev/nvidia[0-3] Cores=0-79 - Name=gpu Type=gpu File=/dev/nvidia[4-7] Cores=80-159 - -- name: /home/slurm/nccl.sh - copy: - dest: /home/slurm/nccl.sh - owner: slurm - group: slurm - content: | - #!/bin/bash - ### - - # change for more nodes as required, by default run on 2 nodes - SLURM_NODES=2 - - #export NCCL_DEBUG=INFO - export NCCL_IB_HCA=mlx5 - export UCX_NET_DEVICES=mlx5_0:1,mlx5_1:1,mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1 - export SHARP_COLL_ENABLE_PCI_RELAXED_ORDERING=1 - export NCCL_COLLNET_ENABLE=0 - export NCCL_TOPO_FILE=/etc/nccl-topo-h100-v1.xml - srun -N $SLURM_NODES --ntasks-per-node=8 
--gpus-per-node=8 \ - --container-image="cr.eu-north1.nebius.cloud#nebius-benchmarks/nccl-tests:2.19.4-ubu22.04-cu12.2" \ - --container-remap-root --no-container-mount-home --container-mounts=$NCCL_TOPO_FILE:$NCCL_TOPO_FILE \ - /opt/nccl_tests/build/all_reduce_perf -b 512M -e 8G -f 2 -g 1 $@ - -- name: /home/slurm/nccl.sbatch - copy: - dest: /home/slurm/nccl.sbatch - owner: slurm - group: slurm - content: | - #!/bin/bash - ### - # to run sbatch: sbatch -N2 nccl.sbatch - # check job status: scontrol show job - # check log file: /mnt/slurm/nccl-.log - ### - #SBATCH --job-name=nccl_test - #SBATCH --ntasks-per-node=8 - #SBATCH --gpus-per-node=8 - #SBATCH --time=10:00 - #SBATCH --deadline=now+20minutes - #SBATCH --output="/mnt/slurm/nccl-%j.log" - #SBATCH --exclusive - - # NCCL environment variables are documented at: - # https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html - - #export NCCL_DEBUG=INFO - #export NCCL_SOCKET_IFNAME=eth0 - export NCCL_IB_HCA=mlx5 - export UCX_NET_DEVICES=mlx5_0:1,mlx5_1:1,mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1 - export SHARP_COLL_ENABLE_PCI_RELAXED_ORDERING=1 - export NCCL_COLLNET_ENABLE=0 - export NCCL_TOPO_FILE=/etc/nccl-topo-h100-v1.xml - - # Relaxed ordering is fixed in NCCL 2.18.3+, but - # in NCCL 2.18.1 and earlier it should be disabled - # for H100s due to a bug. See: - # https://docs.nvidia.com/deeplearning/nccl/archives/nccl_2181/release-notes/rel_2-18-1.html - # export NCCL_IB_PCI_RELAXED_ORDERING=0 - - # Log the assigned nodes - echo "Using nodes: $SLURM_JOB_NODELIST" - - srun --container-image="cr.eu-north1.nebius.cloud#nebius-benchmarks/nccl-tests:2.19.4-ubu22.04-cu12.2" \ - --container-remap-root --no-container-mount-home --container-mounts=$NCCL_TOPO_FILE:$NCCL_TOPO_FILE \ - /opt/nccl_tests/build/all_reduce_perf -b 512M -e 8G -f 2 -g 1 $@ diff --git a/slurm/files/ansible/roles/slurm-master/tasks/hosts.yml b/slurm/files/ansible/roles/slurm-master/tasks/hosts.yml deleted file mode 100644 index f81b56d4..00000000 --- a/slurm/files/ansible/roles/slurm-master/tasks/hosts.yml +++ /dev/null @@ -1,6 +0,0 @@ -- name: Make sure slurm-worker is in /etc/hosts - ansible.builtin.lineinfile: - path: /etc/hosts - regexp: '\s{{ item }}$' - line: "{{ hostvars[item]['ansible_host'] }} {{ item }}" - loop: "{{ groups['slurm_worker'] }}" diff --git a/slurm/files/ansible/roles/slurm-master/tasks/main.yml b/slurm/files/ansible/roles/slurm-master/tasks/main.yml deleted file mode 100644 index 34993103..00000000 --- a/slurm/files/ansible/roles/slurm-master/tasks/main.yml +++ /dev/null @@ -1,5 +0,0 @@ -- include_tasks: hosts.yml -- include_tasks: packages.yml -- include_tasks: slurm-packages.yml -- include_tasks: files.yml -- include_tasks: services.yml diff --git a/slurm/files/ansible/roles/slurm-master/tasks/packages.yml b/slurm/files/ansible/roles/slurm-master/tasks/packages.yml deleted file mode 100644 index e26a9b9c..00000000 --- a/slurm/files/ansible/roles/slurm-master/tasks/packages.yml +++ /dev/null @@ -1,5 +0,0 @@ -- name: Install packages - apt: - lock_timeout: 600 - name: - - clustershell diff --git a/slurm/files/ansible/roles/slurm-master/tasks/services.yml b/slurm/files/ansible/roles/slurm-master/tasks/services.yml deleted file mode 100644 index a6640201..00000000 --- a/slurm/files/ansible/roles/slurm-master/tasks/services.yml +++ /dev/null @@ -1,10 +0,0 @@ -- name: Enable Slurm services - ansible.builtin.service: - name: "{{ item }}" - enabled: yes - state: restarted - with_items: - - slurmdbd - - slurmctld - when: 
(slurm_packages.changed) or - (slurm_common_packages.changed) diff --git a/slurm/files/ansible/roles/slurm-master/tasks/slurm-packages.yml b/slurm/files/ansible/roles/slurm-master/tasks/slurm-packages.yml deleted file mode 100644 index 08b2b564..00000000 --- a/slurm/files/ansible/roles/slurm-master/tasks/slurm-packages.yml +++ /dev/null @@ -1,5 +0,0 @@ -- name: Install slurm master packages - apt: - lock_timeout: 600 - deb: https://github.com/nebius/slurm-deb-packages/releases/download/{{ SLURM_BINARIES }}{{ SLURM_VERSION }}/slurm-smd-slurmctld_{{ SLURM_VERSION }}-1_amd64.deb - register: slurm_packages diff --git a/slurm/files/ansible/roles/slurm-worker/tasks/files.yml b/slurm/files/ansible/roles/slurm-worker/tasks/files.yml deleted file mode 100644 index 68a3c857..00000000 --- a/slurm/files/ansible/roles/slurm-worker/tasks/files.yml +++ /dev/null @@ -1,18 +0,0 @@ -- name: /etc/default/slurmd - copy: - dest: /etc/default/slurmd - content: | - SLURMD_OPTIONS="--conf-server slurm-master" - -- name: /run/slurm - file: - path: /run/slurm - state: directory - owner: slurm - group: slurm - -- name: /var/spool/enroot - file: - path: /var/spool/enroot - state: directory - mode: a+rwx diff --git a/slurm/files/ansible/roles/slurm-worker/tasks/hosts.yml b/slurm/files/ansible/roles/slurm-worker/tasks/hosts.yml deleted file mode 100644 index 049073d6..00000000 --- a/slurm/files/ansible/roles/slurm-worker/tasks/hosts.yml +++ /dev/null @@ -1,13 +0,0 @@ -- name: Make sure slurm-master is in /etc/hosts - ansible.builtin.lineinfile: - path: /etc/hosts - regexp: '\sslurm-master$' - line: "{{ hostvars['slurm-master']['ansible_default_ipv4']['address'] }} slurm-master" - -- name: Make sure slurm-worker is in /etc/hosts - ansible.builtin.lineinfile: - path: /etc/hosts - regexp: '\s{{ item }}$' - line: "{{ hostvars[item]['ansible_host'] }} {{ item }}" - loop: "{{ groups['slurm_worker'] }}" - when: item != inventory_hostname diff --git a/slurm/files/ansible/roles/slurm-worker/tasks/main.yml b/slurm/files/ansible/roles/slurm-worker/tasks/main.yml deleted file mode 100644 index 1a804715..00000000 --- a/slurm/files/ansible/roles/slurm-worker/tasks/main.yml +++ /dev/null @@ -1,5 +0,0 @@ -- include_tasks: hosts.yml -- include_tasks: slurm-packages.yml -- include_tasks: files.yml -- include_tasks: munge-auth.yml -- include_tasks: services.yml diff --git a/slurm/files/ansible/roles/slurm-worker/tasks/munge-auth.yml b/slurm/files/ansible/roles/slurm-worker/tasks/munge-auth.yml deleted file mode 100644 index 95194739..00000000 --- a/slurm/files/ansible/roles/slurm-worker/tasks/munge-auth.yml +++ /dev/null @@ -1,5 +0,0 @@ -- name: Copy munge key - copy: - dest: /etc/munge/munge.key - src: /etc/munge/munge.key - register: munge_key diff --git a/slurm/files/ansible/roles/slurm-worker/tasks/services.yml b/slurm/files/ansible/roles/slurm-worker/tasks/services.yml deleted file mode 100644 index b60b30fa..00000000 --- a/slurm/files/ansible/roles/slurm-worker/tasks/services.yml +++ /dev/null @@ -1,11 +0,0 @@ -- name: Slurm services - service: - name: "{{ item }}" - enabled: yes - state: restarted - with_items: - - munge - - slurmd - when: (munge_key.changed) or - (slurm_packages.changed) or - (slurm_common_packages.changed) diff --git a/slurm/files/ansible/roles/slurm-worker/tasks/slurm-packages.yml b/slurm/files/ansible/roles/slurm-worker/tasks/slurm-packages.yml deleted file mode 100644 index 00e1e4d0..00000000 --- a/slurm/files/ansible/roles/slurm-worker/tasks/slurm-packages.yml +++ /dev/null @@ -1,5 +0,0 @@ -- name: Install 
slurm worker packages - apt: - lock_timeout: 600 - deb: https://github.com/nebius/slurm-deb-packages/releases/download/{{ SLURM_BINARIES }}{{ SLURM_VERSION }}/slurm-smd-slurmd_{{ SLURM_VERSION }}-1_amd64.deb - register: slurm_packages diff --git a/slurm/files/ansible/slurm.yml b/slurm/files/ansible/slurm.yml deleted file mode 100644 index 1602755c..00000000 --- a/slurm/files/ansible/slurm.yml +++ /dev/null @@ -1,23 +0,0 @@ -- hosts: slurm_master - connection: local - roles: - - slurm-common - - slurm-master - -- hosts: slurm_worker - strategy: free - remote_user: slurm - become: true - gather_facts: no - vars: - ansible_ssh_private_key_file: "/etc/ssh/id_rsa" - pre_tasks: - - name: Wait for SSH connection to become available - wait_for_connection: - timeout: 1800 - delay: 10 - - name: Gather facts - setup: - roles: - - slurm-common - - slurm-worker diff --git a/slurm/files/ansible_role.zip b/slurm/files/ansible_role.zip deleted file mode 100644 index 34b15c61..00000000 Binary files a/slurm/files/ansible_role.zip and /dev/null differ diff --git a/slurm/files/cloud-config-master.yaml.tftpl b/slurm/files/cloud-config-master.yaml.tftpl deleted file mode 100644 index 4d548aa7..00000000 --- a/slurm/files/cloud-config-master.yaml.tftpl +++ /dev/null @@ -1,140 +0,0 @@ -#cloud-config -hostname: ${hostname} -prefer_fqdn_over_hostname: false -users: - - name: slurm - groups: sudo - shell: /bin/bash - sudo: 'ALL=(ALL) NOPASSWD:ALL' - ssh-authorized-keys: - - ${chomp(ssh_public_key)} - - ${chomp(master_public_key)} - -packages: - - unzip - - python3-pip - %{~ if shared_fs_type == "nfs" ~} - - nfs-common - %{~ endif ~} - -runcmd: - - mkdir -p /home/slurm - - chmod 700 /home/slurm - - cp /home/ubuntu/.bashrc /home/slurm/ - - cp /etc/ssh/id_rsa /home/slurm/.ssh/ - - chown -R slurm:slurm /home/slurm - %{~ if shared_fs_type == "filesystem" ~} - - mkdir /mnt/slurm - - echo "slurm-fs /mnt/slurm virtiofs rw 0 0" >> /etc/fstab - - mount -a - %{~ endif ~} - %{~ if shared_fs_type == "nfs" ~} - - mkdir /mnt/nfs - - echo "${nfs_ip}:${nfs_export_path} /mnt/nfs nfs defaults 0 0" >> /etc/fstab - - mount -a - %{~ endif ~} - - base64 -d /tmp/ansible/ansible_role.zip.b64 > /tmp/ansible/ansible_role.zip - - unzip /tmp/ansible/ansible_role.zip -d /tmp/ansible/ - - pip3 install ansible - - | - ANSIBLE_HOST_KEY_CHECKING=False \ - ANSIBLE_LOG_PATH=/tmp/ansible/ansible.log \ - ansible-playbook -i /tmp/ansible/inventory.yml /tmp/ansible/slurm.yml & echo $! 
> /tmp/ansible/ansible.pid -# export ANSIBLE_HOST_KEY_CHECKING=False -# ansible-playbook -i /tmp/ansible/inventory.yml -e SLURM_BINARIES="12.2.2-jammy-slurm" -e SLURM_VERSION="24.05.3" -e ENROOT_VERSION="3.4.1" /tmp/ansible/slurm.yml - -write_files: - - content: ${ansible_role} - encoding: gzip+base64 - path: /tmp/ansible/ansible_role.zip.b64 - - content: | - all: - vars: - SLURM_VERSION: ${SLURM_VERSION} - SLURM_BINARIES: ${SLURM_BINARIES} - ENROOT_VERSION: ${ENROOT_VERSION} - PMIX_VERSION: ${PMIX_VERSION} - slurm_master: - hosts: - slurm-master: - slurm_worker: - hosts: - %{~ for name, ip in slurm_workers_ip ~} - ${name}: - ansible_host: ${ip} - %{~ endfor ~} - path: /tmp/ansible/inventory.yml - - content: | - # slurm.conf file - ClusterName=slurm-cluster - SlurmctldHost=slurm-master - SlurmUser=slurm - SlurmdUser=root - SlurmctldPort=6817 - SlurmdPort=6818 - AuthType=auth/munge - StateSaveLocation=/var/lib/slurm/slurmctld - SwitchType=switch/none - MpiDefault=pmi2 - SlurmctldPidFile=/run/slurmctld.pid - SlurmdPidFile=/run/slurmd.pid - ProctrackType=proctrack/pgid - ReturnToService=0 - PlugStackConfig=/etc/slurm/plugstack.conf - SlurmctldParameters=enable_configless - # TIMERS - SlurmctldTimeout=300 - SlurmdTimeout=300 - InactiveLimit=0 - MinJobAge=300 - KillWait=30 - Waittime=0 - # DEBUG - DebugFlags=NO_CONF_HASH - # LOGGING/ACCOUNTNG - SlurmctldDebug=info - SlurmctldLogFile=/var/log/slurm/slurmctld.log - SlurmdDebug=info - SlurmdLogFile=/var/log/slurm/slurmd.log - JobAcctGatherType=jobacct_gather/none - #DB - %{~ if is_mysql ~} - AccountingStorageType=accounting_storage/slurmdbd - AccountingStorageHost=slurm-master - JobCompType=jobcomp/mysql - JobCompUser=slurm - JobCompPass=${password} - JobCompHost=${hostname} - JobCompLoc=slurm-db - %{~ endif ~} - GresTypes=gpu - SelectType=select/cons_tres - # COMPUTE NODES - # NodeName=${worker_prefix}-[1-${cluster_workers_count}] CPUs=16 RealMemory=32090 State=idle State=UNKNOWN - NodeName=${worker_prefix}-[1-${cluster_workers_count}] Gres=gpu:8 CPUs=128 RealMemory=1290080 State=idle State=UNKNOWN - PartitionName=debug Nodes=ALL Default=YES MaxTime=INFINITE State=UP - path: /etc/slurm/slurm.conf - - content: | - AuthType=auth/munge - DbdHost=slurm-master - DebugLevel=info - LogFile=/var/log/slurm/slurmdbd.log - PidFile=/run/slurmdbd.pid - SlurmUser=slurm - StoragePass=${password} - StorageUser=slurm - StorageHost=${hostname} - StorageLoc=slurm-db - StorageType=accounting_storage/mysql - path: /etc/slurm/slurmdbd.conf - permissions: '0600' - - content: | - cluster: - all: '@slurm' - clients: 'client[01-${cluster_workers_count}]' - path: /etc/clustershell/groups.d/cluster.yaml - - content: | - ${chomp(indent(6,master_private_key))} - path: /etc/ssh/id_rsa - permissions: "0400" - owner: "root:root" diff --git a/slurm/files/cloud-config-worker.yaml.tftpl b/slurm/files/cloud-config-worker.yaml.tftpl deleted file mode 100644 index c63f2ead..00000000 --- a/slurm/files/cloud-config-worker.yaml.tftpl +++ /dev/null @@ -1,50 +0,0 @@ -#cloud-config -hostname: ${hostname} -prefer_fqdn_over_hostname: false -users: - - name: slurm - groups: sudo - shell: /bin/bash - sudo: 'ALL=(ALL) NOPASSWD:ALL' - ssh-authorized-keys: - - ${chomp(ssh_public_key)} - - ${chomp(master_public_key)} - -packages: - %{~ if shared_fs_type == "nfs" ~} - - nfs-common - %{~ endif ~} - -runcmd: - - mkdir -p /home/slurm - - chown -R slurm:slurm /home/slurm - - chmod 700 /home/slurm - %{~ if shared_fs_type == "filesystem" ~} - - mkdir /mnt/slurm - - echo "slurm-fs /mnt/slurm virtiofs rw 0 
0" >> /etc/fstab - - mount -a - %{~ endif ~} - %{~ if shared_fs_type == "nfs" ~} - - mkdir /mnt/nfs - - echo "${nfs_ip}:${nfs_export_path} /mnt/nfs nfs defaults 0 0" >> /etc/fstab - - mount -a - %{~ endif ~} - -write_files: - - path: /etc/slurm/slurmdbd.conf - permissions: '0600' - content: | - AuthType=auth/munge - DbdHost=slurm-master - DebugLevel=info - LogFile=/var/log/slurm/slurmdbd.log - PidFile=/run/slurmdbd.pid - SlurmUser=slurm - StoragePass=${password} - StorageUser=slurm - StorageHost=${hostname} - StorageLoc=slurm-db - StorageType=accounting_storage/mysql - - path: /etc/tmpfiles.d/slurm.conf - content: | - d /run/slurm 0770 root slurm - diff --git a/slurm/filesystem.tf b/slurm/filesystem.tf deleted file mode 100644 index cf7a9b7f..00000000 --- a/slurm/filesystem.tf +++ /dev/null @@ -1,8 +0,0 @@ -resource "nebius_compute_v1_filesystem" "slurm-fs" { - count = var.shared_fs_type == "filesystem" ? 1 : 0 - parent_id = var.parent_id - name = "slurm-fs" - type = "NETWORK_SSD" - block_size_bytes = 4096 - size_bytes = var.fs_size -} diff --git a/slurm/gpu_cluster.tf b/slurm/gpu_cluster.tf deleted file mode 100644 index 71639a63..00000000 --- a/slurm/gpu_cluster.tf +++ /dev/null @@ -1,6 +0,0 @@ - -resource "nebius_compute_v1_gpu_cluster" "gpu-cluster-slurm" { - parent_id = var.parent_id - name = "gpu-cluster-slurm" - infiniband_fabric = "fabric-3" -} diff --git a/slurm/locals.tf b/slurm/locals.tf deleted file mode 100644 index 90fe2b56..00000000 --- a/slurm/locals.tf +++ /dev/null @@ -1,27 +0,0 @@ -locals { - ssh_public_key = var.ssh_public_key.key != null ? var.ssh_public_key.key : ( - fileexists(var.ssh_public_key.path) ? file(var.ssh_public_key.path) : null) - - - regions_default = { - eu-west1 = { - master_platform = "cpu-d3" - master_preset = "16vcpu-64gb" - worker_platform = "gpu-h200-sxm" - worker_preset = "8gpu-128vcpu-1600gb" - } - eu-north1 = { - master_platform = "cpu-e2" - master_preset = "16vcpu-64gb" - worker_platform = "gpu-h100-sxm" - worker_preset = "8gpu-128vcpu-1600gb" - } - } - - current_region_defaults = local.regions_default[var.region] - - master_platform = coalesce(var.master_platform, local.current_region_defaults.master_platform) - master_preset = coalesce(var.master_preset, local.current_region_defaults.master_preset) - worker_platform = coalesce(var.worker_platform, local.current_region_defaults.worker_platform) - worker_preset = coalesce(var.worker_preset, local.current_region_defaults.worker_preset) -} diff --git a/slurm/nfs.tf b/slurm/nfs.tf deleted file mode 100644 index d0fe16c8..00000000 --- a/slurm/nfs.tf +++ /dev/null @@ -1,15 +0,0 @@ -module "nfs-module" { - providers = { - nebius = nebius - } - count = var.shared_fs_type == "nfs" ? 
1 : 0 - source = "../modules/nfs-server" - parent_id = var.parent_id - subnet_id = var.subnet_id - ssh_user_name = "storage" - ssh_public_key = local.ssh_public_key - nfs_ip_range = "192.168.0.0/16" - nfs_size = var.fs_size - platform = local.master_platform - preset = local.master_preset -} diff --git a/slurm/output.tf b/slurm/output.tf deleted file mode 100644 index 892f15f2..00000000 --- a/slurm/output.tf +++ /dev/null @@ -1,3 +0,0 @@ -output "slurm_master_pib" { - value = trimsuffix(nebius_compute_v1_instance.master.status.network_interfaces[0].public_ip_address.address, "/32") -} diff --git a/slurm/slurm-master.tf b/slurm/slurm-master.tf deleted file mode 100644 index dda58492..00000000 --- a/slurm/slurm-master.tf +++ /dev/null @@ -1,71 +0,0 @@ -resource "nebius_vpc_v1alpha1_allocation" "master" { - parent_id = var.parent_id - name = "slurm-master" - ipv4_private = { - subnet_id = var.subnet_id - } -} - -resource "nebius_compute_v1_disk" "master" { - parent_id = var.parent_id - name = "slurm-boot-disk-master" - block_size_bytes = 4096 - size_bytes = 107374182400 - type = "NETWORK_SSD" - source_image_family = { image_family = "ubuntu22.04-driverless" } -} - -resource "nebius_compute_v1_instance" "master" { - name = "slurm-master" - parent_id = var.parent_id - resources = { - platform = local.master_platform - preset = local.master_preset - } - boot_disk = { - attach_mode = "READ_WRITE" - existing_disk = nebius_compute_v1_disk.master - } - - filesystems = var.shared_fs_type == "filesystem" ? [{ - attach_mode = "READ_WRITE" - device_name = "slurm-fs" - mount_tag = "slurm-fs" - existing_filesystem = { - id = nebius_compute_v1_filesystem.slurm-fs[0].id - } }] : null - - cloud_init_user_data = templatefile( - "${path.module}/files/cloud-config-master.yaml.tftpl", { - ENROOT_VERSION = var.enroot_version - PMIX_VERSION = var.pmix_version - SLURM_VERSION = var.slurm_version - SLURM_BINARIES = var.slurm_binaries - shared_fs_type = var.shared_fs_type - nfs_export_path = var.shared_fs_type == "nfs" ? module.nfs-module[0].nfs_export_path : 0 - nfs_ip = var.shared_fs_type == "nfs" ? 
module.nfs-module[0].nfs_server_internal_ip : 0 - is_mysql = var.mysql_jobs_backend - ssh_public_key = local.ssh_public_key - cluster_workers_count = var.cluster_workers_count - hostname = "slurm-master" - password = "" #random_password.mysql.result - master_public_key = tls_private_key.master_key.public_key_openssh - master_private_key = tls_private_key.master_key.private_key_openssh - slurm_workers_ip = { - for worker_name, worker in nebius_vpc_v1alpha1_allocation.worker : - worker_name => trimsuffix(worker.status.details.allocated_cidr, "/32") - } - worker_prefix = var.worker_name_prefix - ansible_role = base64gzip(filebase64(data.archive_file.ansible_role.output_path)) - }) - network_interfaces = [ - { - name = "eth0" - subnet_id = var.subnet_id - ip_address : { - allocation_id = nebius_vpc_v1alpha1_allocation.master.id - } - public_ip_address : {} - } - ] -} diff --git a/slurm/slurm-worker.tf b/slurm/slurm-worker.tf deleted file mode 100644 index 2bbfc0a6..00000000 --- a/slurm/slurm-worker.tf +++ /dev/null @@ -1,78 +0,0 @@ -locals { - cluster_workers = toset([ - for worker_num in range(1, var.cluster_workers_count + 1) : "${var.worker_name_prefix}-${worker_num}" - ]) -} - -resource "nebius_vpc_v1alpha1_allocation" "worker" { - for_each = local.cluster_workers - parent_id = var.parent_id - name = each.key - ipv4_private = { - subnet_id = var.subnet_id - } -} - -resource "nebius_compute_v1_disk" "worker" { - for_each = local.cluster_workers - parent_id = var.parent_id - name = "slurm-boot-disk-worker-${each.key}" - - block_size_bytes = 4096 - size_bytes = 549755813888 - type = "NETWORK_SSD" - source_image_family = { image_family = "ubuntu22.04-cuda12" } -} - -resource "nebius_compute_v1_instance" "worker" { - for_each = local.cluster_workers - name = each.key - parent_id = var.parent_id - resources = { - platform = local.worker_platform - preset = local.worker_preset - } - gpu_cluster = nebius_compute_v1_gpu_cluster.gpu-cluster-slurm - - boot_disk = { - attach_mode = "READ_WRITE" - existing_disk = nebius_compute_v1_disk.worker[each.key] - } - - filesystems = var.shared_fs_type == "filesystem" ? [ - { - attach_mode = "READ_WRITE" - device_name = "slurm-fs" - mount_tag = "slurm-fs" - existing_filesystem = { - id = nebius_compute_v1_filesystem.slurm-fs[0].id - } - } - ] : null - - cloud_init_user_data = templatefile( - "${path.module}/files/cloud-config-worker.yaml.tftpl", { - ENROOT_VERSION = "3.4.1" - SLURM_VERSION = var.slurm_version - is_mysql = var.mysql_jobs_backend - ssh_public_key = local.ssh_public_key - shared_fs_type = var.shared_fs_type - nfs_export_path = var.shared_fs_type == "nfs" ? module.nfs-module[0].nfs_export_path : 0 - nfs_ip = var.shared_fs_type == "nfs" ? 
module.nfs-module[0].nfs_server_internal_ip : 0 - worker_prefix = var.worker_name_prefix - cluster_workers_count = var.cluster_workers_count - hostname = each.key - password = "" #random_password.mysql.result - master_public_key = tls_private_key.master_key.public_key_openssh - }) - - network_interfaces = [ - { - name = "eth0" - subnet_id = var.subnet_id - ip_address : { - allocation_id = nebius_vpc_v1alpha1_allocation.worker[each.key].id - } - } - ] -} diff --git a/slurm/terraform.tfvars b/slurm/terraform.tfvars deleted file mode 100644 index 3c1ff8a4..00000000 --- a/slurm/terraform.tfvars +++ /dev/null @@ -1,16 +0,0 @@ -# parent_id = "" # The project-id in this context -# subnet_id = "" # Use the command "nebius vpc v1alpha1 network list" to see the subnet id -# region = "" # Project region -# ssh_user_name = "" # Username you want to use to connect to the nodes -# ssh_public_key = { -# key = "put your public ssh key here" OR -# path = "put path to ssh key here" -# } -cluster_workers_count = 2 # amount of workers -mysql_jobs_backend = false # Do you want to use mysql -shared_fs_type = "filesystem" # "nfs" or "filesystem" - -# master_platform = -# master_preset = -# worker_platform = -# worker_preset = \ No newline at end of file diff --git a/slurm/test-resources.tf b/slurm/test-resources.tf deleted file mode 100644 index d5c75f39..00000000 --- a/slurm/test-resources.tf +++ /dev/null @@ -1,56 +0,0 @@ -resource "null_resource" "master-check-ansible" { - count = var.test_mode ? 1 : 0 - connection { - user = "slurm" - host = trimsuffix(nebius_compute_v1_instance.master.status.network_interfaces[0].public_ip_address.address, "/32") - } - - provisioner "remote-exec" { - inline = [ - "set -eu", - "until [ -s '/tmp/ansible/ansible.pid' ]; do echo 'Waiting for ansbile process start'; sleep 10; done", - "until ! ps -p $(cat /tmp/ansible/ansible.pid) > /dev/null; do echo 'Waiting for ansbile process finish'; sleep 10; done", - "grep -q 'failed=0' /tmp/ansible/ansible.log", - "grep -q 'unreachable=0' /tmp/ansible/ansible.log", - "grep -q 'rescued=0' /tmp/ansible/ansible.log", - "grep -q 'ignored=0' /tmp/ansible/ansible.log", - ] - } -} - -resource "null_resource" "master-check-slurm" { - depends_on = [null_resource.master-check-ansible] - count = var.test_mode ? 1 : 0 - connection { - user = "slurm" - host = trimsuffix(nebius_compute_v1_instance.master.status.network_interfaces[0].public_ip_address.address, "/32") - } - - provisioner "remote-exec" { - inline = [ - "set -eu", - "sinfo -N", - "scontrol show nodes --json | jq '.nodes[].state'", - "scontrol show nodes --json | jq -e '.nodes | length == ${var.cluster_workers_count}'", - "scontrol show nodes --json | jq -e '[(select(.nodes[].state == [\"IDLE\"]))] | length == ${var.cluster_workers_count}'", - ] - } -} - -resource "null_resource" "master-run-nccl-tests" { - depends_on = [null_resource.master-check-slurm] - count = var.test_mode ? 
1 : 0 - connection { - user = "slurm" - host = trimsuffix(nebius_compute_v1_instance.master.status.network_interfaces[0].public_ip_address.address, "/32") - } - - provisioner "remote-exec" { - inline = [ - "set -eu", - "sbatch -W -N ${var.cluster_workers_count} /home/slurm/nccl.sbatch", - "scontrol show job --all", - "scontrol show job --all --json | jq -e '[select(.jobs[].job_state == [\"COMPLETED\"])] | length == 1'", - ] - } -} diff --git a/slurm/tests/main.tftest.hcl b/slurm/tests/main.tftest.hcl deleted file mode 100644 index 6847e79c..00000000 --- a/slurm/tests/main.tftest.hcl +++ /dev/null @@ -1,30 +0,0 @@ -run "slurm_master_apply" { - command = apply - - variables { - cluster_workers_count = 2 - } - - plan_options { - target = [ - nebius_compute_v1_instance.master - ] - } -} - -run "slurm_full_apply" { - command = apply - - variables { - cluster_workers_count = 2 - } -} - -run "test_mode_slurm_apply" { - command = apply - - variables { - cluster_workers_count = 2 - test_mode = true - } -} diff --git a/slurm/tls_master_key.tf b/slurm/tls_master_key.tf deleted file mode 100644 index 4de3e29e..00000000 --- a/slurm/tls_master_key.tf +++ /dev/null @@ -1,3 +0,0 @@ -resource "tls_private_key" "master_key" { - algorithm = "RSA" -} diff --git a/slurm/variables.tf b/slurm/variables.tf deleted file mode 100644 index f128c0d8..00000000 --- a/slurm/variables.tf +++ /dev/null @@ -1,117 +0,0 @@ -variable "parent_id" { - type = string -} -variable "subnet_id" { - type = string -} - -variable "region" { - description = "Project region." - type = string -} - -variable "ib_image_id" { - type = string - description = "ID of Infiniband image" - default = "arljjqhufbo9rrjsonm2" -} - -variable "cluster_workers_count" { - type = number - description = "Amount of slurm workers" -} - -variable "ssh_public_key" { - description = "SSH Public Key to access the cluster nodes" - type = object({ - key = optional(string), - path = optional(string, "~/.ssh/id_rsa.pub") - }) - default = {} - validation { - condition = var.ssh_public_key.key != null || fileexists(var.ssh_public_key.path) - error_message = "SSH Public Key must be set by `key` or file `path` ${var.ssh_public_key.path}" - } -} - -variable "master_platform" { - description = "Platform for Slurm Master." - type = string - default = null -} - -variable "master_preset" { - description = "Preset for Slurm Master." - type = string - default = null -} - -variable "worker_platform" { - description = "Platform for Slurm Worker." - type = string - default = null -} - -variable "worker_preset" { - description = "Preset for Slurm Worker." - type = string - default = null -} - -variable "mysql_jobs_backend" { - type = bool - description = "Use MySQL for jobs logging in slurm: 1 or 0" - default = false -} - -variable "slurm_version" { - type = string - description = "Slurm version" - default = "24.05.3" -} - -variable "slurm_binaries" { - type = string - description = "Slurm binaries URL" - default = "12.2.2-jammy-slurm" -} - -variable "pmix_version" { - type = string - description = "PMIX version" - default = "5.0.3" -} - -variable "enroot_version" { - type = string - description = "ENROOT version" - default = "3.4.1" -} - -variable "shared_fs_type" { - type = string - default = null - description = "Use shared managed FileStorage mounted on /mnt/slurm on every worker node" - validation { - condition = var.shared_fs_type == null ? 
true : contains(["filesystem", "nfs"], var.shared_fs_type) - error_message = "shared_fs_type must be one of: filesystem / nfs" - } -} - -variable "fs_size" { - type = number - description = "Shared FileStorage or NFS size x93" - default = 93 * 1024 * 1024 * 1024 -} - -variable "worker_name_prefix" { - type = string - description = "Slurm worker name prefix" - default = "slurm-worker" -} - -variable "test_mode" { - description = "Switch between real usage and testing" - type = bool - default = false -} diff --git a/vm-instance/.envrc.sh b/vm-instance/.envrc.sh new file mode 100644 index 00000000..9026e35f --- /dev/null +++ b/vm-instance/.envrc.sh @@ -0,0 +1,322 @@ +#!/bin/bash +unset NEBIUS_IAM_TOKEN +export NEBIUS_IAM_TOKEN=$(nebius iam get-access-token) +export TF_VAR_iam_token=$NEBIUS_IAM_TOKEN + +# File to store the last selected project +LAST_SELECTED_TENANT_FILE=".last_selected_tenant" +LAST_SELECTED_PROJECT_FILE=".last_selected_project" + +# Check if necessary tools are installed +REQUIRED_TOOLS=("fzf" "jq") +INSTALL_COMMAND="" + +# Determine the package manager +if command -v apt &>/dev/null; then + INSTALL_COMMAND="sudo apt install -y" +elif command -v yum &>/dev/null; then + INSTALL_COMMAND="sudo yum install -y" +elif command -v dnf &>/dev/null; then + INSTALL_COMMAND="sudo dnf install -y" +elif command -v brew &>/dev/null; then + INSTALL_COMMAND="brew install" +else + echo "Unsupported package manager. Please install required tools manually: ${REQUIRED_TOOLS[*]}" + exit 1 +fi + +# Check and install missing tools +for tool in "${REQUIRED_TOOLS[@]}"; do + if ! command -v "$tool" &>/dev/null; then + echo "$tool is not installed. Installing..." + $INSTALL_COMMAND "$tool" + if [[ $? -ne 0 ]]; then + echo "Failed to install $tool. Please install it manually." + exit 1 + fi + fi +done + +# Fetch the data from the command +OUTPUT=$(nebius iam tenant list --page-size 100 --format json) + +# Parse the names and IDs from the output +declare -A TENANTS +while IFS= read -r line; do + # Extract tenant names and IDs + name=$(echo "$line" | jq -r '.metadata.name') + id=$(echo "$line" | jq -r '.metadata.id') + [[ -n "$name" && -n "$id" ]] && TENANTS["$name"]=$id +done < <(echo "$OUTPUT" | jq -c '.items[]') + +# Check if tenant list is empty +if [[ ${#TENANTS[@]} -eq 0 ]]; then + echo "No tenants found. Exiting." + exit 0 +fi + +# Create a list with both names and IDs +tenant_list=$(for name in "${!TENANTS[@]}"; do + echo "$name (${TENANTS[$name]})" +done) + +# Prepend the last selected tenant to the list, if it exists +if [[ -f "$LAST_SELECTED_TENANT_FILE" ]]; then + last_selected=$(<"$LAST_SELECTED_TENANT_FILE") + tenant_list=$(echo "$last_selected"; echo "$tenant_list" | grep -v -F "$last_selected") +fi + +# Use fzf for selection +selected=$(echo "$tenant_list" | fzf --prompt="Select a tenant: " --height=20 --reverse --exact --header="Arrow keys to navigate, Enter to select") + +# Check if the selection is empty +if [[ -z "$selected" ]]; then + echo "No tenant selected." 
+ exit 0 +fi + +# Extract the selected name and ID safely +tenant_name=$(echo "$selected" | sed -E 's/^(.*)[[:space:]]\(.*/\1/') +tenant_id=$(echo "$selected" | sed -E 's/^.*\((.*)\)$/\1/') + +# Save the selection for the next run +echo "$selected" > "$LAST_SELECTED_TENANT_FILE" +# Fetch the data from the command + +# Now, execute the command +OUTPUT=$(nebius iam project list --page-size 100 --parent-id "$tenant_id" --format json) + +declare -A PROJECTS +while IFS= read -r line; do + # Extract tenant names and IDs + name=$(echo "$line" | jq -r '.metadata.name') + id=$(echo "$line" | jq -r '.metadata.id') + [[ -n "$name" && -n "$id" ]] && PROJECTS["$name"]=$id +done < <(echo "$OUTPUT" | jq -c '.items[]') + +# Check if project list is empty +if [[ ${#PROJECTS[@]} -eq 0 ]]; then + echo "No projects found. Exiting." + exit 0 +fi + + +# Create a list with both names and IDs +project_list=$(for name in "${!PROJECTS[@]}"; do + echo "$name (${PROJECTS[$name]})" +done) + +# Prepend the last selected project to the list, if it exists +if [[ -f "$LAST_SELECTED_PROJECT_FILE" ]]; then + last_selected=$(<"$LAST_SELECTED_PROJECT_FILE") + echo "LAST SELECTION: $last_selected" + # Check if the last selected item exists in the current tenant list + if echo "$project_list" | grep -q -F "$last_selected"; then + project_list=$(echo "$last_selected"; echo "$project_list" | grep -v -F "$last_selected") + fi +fi + +# Use fzf for selection +selected=$(echo "$project_list" | fzf --prompt="Select a project: " --height=20 --reverse --exact --header="Arrow keys to navigate, Enter to select") + +# Check if the selection is empty +if [[ -z "$selected" ]]; then + echo "No project selected." + exit 0 +fi + +# Extract the selected name and ID safely +project_name=$(echo "$selected" | sed -E 's/^(.*)[[:space:]]\(.*/\1/') +project_id=$(echo "$selected" | sed -E 's/^.*\((.*)\)$/\1/') +unset TENANTS +unset PROJECTS + +# Save the selection for the next run +echo "$selected" > "$LAST_SELECTED_PROJECT_FILE" + +export NEBIUS_TENANT_ID=$tenant_id +export NEBIUS_PROJECT_ID=$project_id +# Output the result +echo "Selected tenant: $tenant_name ($tenant_id)" +echo "Selected project: $project_name ($project_id)" + +# region VPC subnet + +NEBIUS_VPC_SUBNET_ID=$(nebius vpc subnet list \ + --parent-id "${NEBIUS_PROJECT_ID}" \ + --format json \ + | jq -r '.items[0].metadata.id') +export NEBIUS_VPC_SUBNET_ID + +# endregion VPC subnet + +# region TF variables + +export TF_VAR_iam_token="${NEBIUS_IAM_TOKEN}" +export TF_VAR_iam_tenant_id="${NEBIUS_TENANT_ID}" +export TF_VAR_iam_project_id="${NEBIUS_PROJECT_ID}" +export TF_VAR_vpc_subnet_id="${NEBIUS_VPC_SUBNET_ID}" +export TF_VAR_iam_project_id="${NEBIUS_PROJECT_ID}" +export TF_VAR_parent_id="${NEBIUS_PROJECT_ID}" +export TF_VAR_subnet_id="${NEBIUS_VPC_SUBNET_ID}" + +export TFE_PARALLELISM=20 + +echo "Exported variables:" +echo "NEBIUS_TENANT_ID: ${NEBIUS_TENANT_ID}" +echo "NEBIUS_PROJECT_ID: ${NEBIUS_PROJECT_ID}" +echo "NEBIUS_VPC_SUBNET_ID: ${NEBIUS_VPC_SUBNET_ID}" +echo "TFE_PARALLELISM: ${TFE_PARALLELISM}" + +# endregion TF variables + +# region Remote state + +# region Service account + +NEBIUS_SA_TERRAFORM_ID=$(nebius iam service-account list \ + --parent-id "${NEBIUS_PROJECT_ID}" \ + --format json \ + | jq -r '.items[] | select(.metadata.name == "slurm-terraform-sa").metadata.id') + +if [ -z "$NEBIUS_SA_TERRAFORM_ID" ]; then + NEBIUS_SA_TERRAFORM_ID=$(nebius iam service-account create \ + --parent-id "${NEBIUS_PROJECT_ID}" \ + --name 'slurm-terraform-sa' \ + --format json \ + | jq -r 
'.metadata.id') + echo "Created new service account with ID: $NEBIUS_SA_TERRAFORM_ID" +else + echo "Found existing service account with ID: $NEBIUS_SA_TERRAFORM_ID" +fi + +# endregion Service account + +# region `editors` group + +NEBIUS_GROUP_EDITORS_ID=$(nebius iam group get-by-name \ + --parent-id "${NEBIUS_TENANT_ID}" \ + --name 'editors' \ + --format json \ + | jq -r '.metadata.id') + +IS_MEMBER=$(nebius iam group-membership list-members \ + --parent-id "${NEBIUS_GROUP_EDITORS_ID}" \ + --page-size 1000 \ + --format json \ + | jq -r --arg SAID "${NEBIUS_SA_TERRAFORM_ID}" '.memberships[] | select(.spec.member_id == $SAID) | .spec.member_id') + + +# Add service account to group editors only if not already a member +if [ -z "${IS_MEMBER}" ]; then + nebius iam group-membership create \ + --parent-id "${NEBIUS_GROUP_EDITORS_ID}" \ + --member-id "${NEBIUS_SA_TERRAFORM_ID}" + echo "Added service account to editors group" +else + echo "Service account is already a member of editors group" +fi + +# endregion `editors` group + +# region Access key + +DATE_FORMAT='+%Y-%m-%dT%H:%M:%SZ' + +if [[ "$(uname)" == "Darwin" ]]; then + # macOS + EXPIRATION_DATE=$(date -v +14d "${DATE_FORMAT}") +else + # Linux (assumes GNU date) + EXPIRATION_DATE=$(date -d '+14 day' "${DATE_FORMAT}") +fi + +echo 'Creating new access key for Object Storage' +NEBIUS_SA_ACCESS_KEY_ID=$(nebius iam access-key create \ + --parent-id "${NEBIUS_PROJECT_ID}" \ + --name "slurm-tf-ak-$(date +%s)" \ + --account-service-account-id "${NEBIUS_SA_TERRAFORM_ID}" \ + --description 'Temporary S3 Access' \ + --expires-at "${EXPIRATION_DATE}" \ + --format json \ + | jq -r '.resource_id') +echo "Created new access key: ${NEBIUS_SA_ACCESS_KEY_ID}" + +# endregion Access key + +# region AWS access key + +AWS_ACCESS_KEY_ID=$(nebius iam access-key get-by-id \ + --id "${NEBIUS_SA_ACCESS_KEY_ID}" \ + --format json | jq -r '.status.aws_access_key_id') +export AWS_ACCESS_KEY_ID + +echo "Generating new AWS_SECRET_ACCESS_KEY" +AWS_SECRET_ACCESS_KEY="$(nebius iam access-key get-secret-once \ + --id "${NEBIUS_SA_ACCESS_KEY_ID}" \ + --format json \ + | jq -r '.secret')" +export AWS_SECRET_ACCESS_KEY + +# endregion AWS access key + +# region Bucket + +NEBIUS_BUCKET_NAME="tfstate-slurm-k8s-$(echo -n "${NEBIUS_TENANT_ID}-${NEBIUS_PROJECT_ID}" | md5sum | awk '$0=$1')" +export NEBIUS_BUCKET_NAME +# Check if bucket exists +EXISTING_BUCKET=$(nebius storage bucket list \ + --parent-id "${NEBIUS_PROJECT_ID}" \ + --format json \ + | jq -r --arg BUCKET "${NEBIUS_BUCKET_NAME}" '.items[] | select(.metadata.name == $BUCKET) | .metadata.name') + +if [ -z "${EXISTING_BUCKET}" ]; then + nebius storage bucket create \ + --name "${NEBIUS_BUCKET_NAME}" \ + --parent-id "${NEBIUS_PROJECT_ID}" \ + --versioning-policy 'enabled' + echo "Created bucket: ${NEBIUS_BUCKET_NAME}" +else + echo "Using existing bucket: ${NEBIUS_BUCKET_NAME}" +fi + +aws configure set aws_access_key_id $AWS_ACCESS_KEY_ID +aws configure set aws_secret_access_key $AWS_SECRET_ACCESS_KEY +aws configure set region eu-north1 +aws configure set endpoint_url https://storage.eu-north1.nebius.cloud:443 +mkdir -p ./.aws +echo "[default]" > ./.aws/credentials +echo "aws_access_key_id = $AWS_ACCESS_KEY_ID" >> ./.aws/credentials +echo "aws_secret_access_key = $AWS_SECRET_ACCESS_KEY" >> ./.aws/credentials +echo "[default]" > ./.aws/config +echo "region = eu-north1" >> ./.aws/config +echo "endpoint_url = https://storage.eu-north1.nebius.cloud:443" >> ./.aws/config +export TF_VAR_aws_access_key_id=$AWS_ACCESS_KEY_ID +export 
TF_VAR_aws_secret_access_key=$AWS_SECRET_ACCESS_KEY + +# endregion Bucket + +# region Backend override + +cat > terraform_backend_override.tf << EOF +terraform { + backend "s3" { + bucket = "${NEBIUS_BUCKET_NAME}" + key = "slurm-k8s.tfstate" + + endpoints = { + s3 = "https://storage.eu-north1.nebius.cloud:443" + } + region = "eu-north1" + + skip_region_validation = true + skip_credentials_validation = true + skip_requesting_account_id = true + skip_s3_checksum = true + } +} +EOF + +# endregion Backend override + +# endregion Remote state diff --git a/vm-instance/README.md b/vm-instance/README.md new file mode 100644 index 00000000..d68c7ac5 --- /dev/null +++ b/vm-instance/README.md @@ -0,0 +1,24 @@ +# Nebius Simple solutions + +This Terraform module facilitates creating simpler yet useful solutions, such as single VMs or services. +## Configuring Terraform for Nebius Cloud + +- Install [Nebius CLI](https://docs.nebius.com/cli/install/). +- Add environment variables for Terraform authentication in Nebius Cloud. + +``` +source ./.envrc.sh +``` + +## Usage + +To use this module in your Terraform environment, first create a Terraform configuration and change the placeholder values in `terraform.tfvars`, then run Terraform: + + +``` +terraform init +terraform plan +terraform apply +``` + +If you want to mount an existing shared filesystem, simply put its ID in `shared_filesystem_id`. diff --git a/vm-instance/locals.tf b/vm-instance/locals.tf new file mode 100644 index 00000000..e69de29b diff --git a/vm-instance/main.tf b/vm-instance/main.tf new file mode 100644 index 00000000..a132383e --- /dev/null +++ b/vm-instance/main.tf @@ -0,0 +1,22 @@ +module "instance-module" { + source = "../modules/instance" + parent_id = var.parent_id + subnet_id = var.subnet_id + count = var.instance_count + instance_name = "instance-${count.index}" + users = var.users + preset = var.preset + platform = var.platform + boot_disk_size_gb = 500 + shared_filesystem_id = var.shared_filesystem_id + shared_filesystem_mount = var.shared_filesystem_mount + extra_path = var.extra_path + add_extra_storage = var.add_extra_storage + extra_storage_size_gb = var.extra_storage_size_gb + extra_storage_class = var.extra_storage_class + public_ip = var.public_ip + mount_bucket = var.mount_bucket + s3_mount_path = var.s3_mount_path + aws_access_key_id = var.aws_access_key_id + aws_secret_access_key = var.aws_secret_access_key +} diff --git a/vm-instance/outputs.tf b/vm-instance/outputs.tf new file mode 100644 index 00000000..c6e5d169 --- /dev/null +++ b/vm-instance/outputs.tf @@ -0,0 +1,9 @@ +output "internal_ips" { + description = "The internal IP addresses of all instances" + value = [module.instance-module[*].internal_ip] +} + +output "public_ips" { + description = "The public IP addresses of all instances" + value = [module.instance-module[*].public_ip] +} diff --git a/vm-instance/provider.tf b/vm-instance/provider.tf new file mode 100644 index 00000000..72e57f31 --- /dev/null +++ b/vm-instance/provider.tf @@ -0,0 +1,8 @@ +terraform { + required_providers { + nebius = { + source = "terraform-provider.storage.eu-north1.nebius.cloud/nebius/nebius" + version = ">= 0.4.24" + } + } +} diff --git a/vm-instance/s3_keys.sh b/vm-instance/s3_keys.sh new file mode 100644 index 00000000..c4d92a4f --- /dev/null +++ b/vm-instance/s3_keys.sh @@ -0,0 +1,129 @@ +NEBIUS_TENANT_ID='' +NEBIUS_PROJECT_ID='' + +if [ -z "${NEBIUS_TENANT_ID}" ]; then + echo "Error: NEBIUS_TENANT_ID is not set" + return 1 +fi + +if [ -z "${NEBIUS_PROJECT_ID}" ]; then
echo "Error: NEBIUS_PROJECT_ID is not set" + return 1 +fi + +# region IAM token + +unset NEBIUS_IAM_TOKEN +nebius iam whoami > /dev/null +nebius iam get-access-token > /dev/null +NEBIUS_IAM_TOKEN=$(nebius iam get-access-token) +export NEBIUS_IAM_TOKEN + +# endregion IAM token + +# region VPC subnet + +NEBIUS_VPC_SUBNET_ID=$(nebius vpc subnet list \ + --parent-id "${NEBIUS_PROJECT_ID}" \ + --format json \ + | jq -r '.items[0].metadata.id') +export NEBIUS_VPC_SUBNET_ID + +# endregion VPC subnet + +# region Service account + +NEBIUS_SA_TERRAFORM_ID=$(nebius iam service-account list \ + --parent-id "${NEBIUS_PROJECT_ID}" \ + --format json \ + | jq -r '.items[] | select(.metadata.name == "slurm-terraform-sa").metadata.id') + +if [ -z "$NEBIUS_SA_TERRAFORM_ID" ]; then + NEBIUS_SA_TERRAFORM_ID=$(nebius iam service-account create \ + --parent-id "${NEBIUS_PROJECT_ID}" \ + --name 'slurm-terraform-sa' \ + --format json \ + | jq -r '.metadata.id') + echo "Created new service account with ID: $NEBIUS_SA_TERRAFORM_ID" +else + echo "Found existing service account with ID: $NEBIUS_SA_TERRAFORM_ID" +fi + +# endregion Service account + +# region `editors` group + +NEBIUS_GROUP_EDITORS_ID=$(nebius iam group get-by-name \ + --parent-id "${NEBIUS_TENANT_ID}" \ + --name 'editors' \ + --format json \ + | jq -r '.metadata.id') + +IS_MEMBER=$(nebius iam group-membership list-members \ + --parent-id "${NEBIUS_GROUP_EDITORS_ID}" \ + --page-size 1000 \ + --format json \ + | jq -r --arg SAID "${NEBIUS_SA_TERRAFORM_ID}" '.memberships[] | select(.spec.member_id == $SAID) | .spec.member_id') + + +# Add service account to group editors only if not already a member +if [ -z "${IS_MEMBER}" ]; then + nebius iam group-membership create \ + --parent-id "${NEBIUS_GROUP_EDITORS_ID}" \ + --member-id "${NEBIUS_SA_TERRAFORM_ID}" + echo "Added service account to editors group" +else + echo "Service account is already a member of editors group" +fi + +# endregion `editors` group + +# region Access key + +DATE_FORMAT='+%Y-%m-%dT%H:%M:%SZ' + +if [[ "$(uname)" == "Darwin" ]]; then + # macOS + EXPIRATION_DATE=$(date -v +30d "${DATE_FORMAT}") +else + # Linux (assumes GNU date) + EXPIRATION_DATE=$(date -d '+30 day' "${DATE_FORMAT}") +fi + +echo "Creating new access key for Object Storage expiring at ${EXPIRATION_DATE}" +NEBIUS_SA_ACCESS_KEY_ID=$(nebius iam access-key create \ + --parent-id "${NEBIUS_PROJECT_ID}" \ + --name "s3-access-$(date +%s)" \ + --account-service-account-id "${NEBIUS_SA_TERRAFORM_ID}" \ + --description 'Temporary S3 Access' \ + --expires-at "${EXPIRATION_DATE}" \ + --format json \ + | jq -r '.resource_id') +echo "Created new access key: ${NEBIUS_SA_ACCESS_KEY_ID}" + +# endregion Access key + +# region AWS access key + +AWS_ACCESS_KEY_ID=$(nebius iam access-key get-by-id \ + --id "${NEBIUS_SA_ACCESS_KEY_ID}" \ + --format json | jq -r '.status.aws_access_key_id') +export AWS_ACCESS_KEY_ID + +echo "Generating new AWS_SECRET_ACCESS_KEY" +AWS_SECRET_ACCESS_KEY="$(nebius iam access-key get-secret-once \ + --id "${NEBIUS_SA_ACCESS_KEY_ID}" \ + --format json \ + | jq -r '.secret')" +export AWS_SECRET_ACCESS_KEY + +# endregion AWS access key +export AWS_ENDPOINT_URL="https://storage.eu-north1.nebius.cloud:443" +aws configure set aws_access_key_id ${AWS_ACCESS_KEY_ID} +aws configure set aws_secret_access_key ${AWS_SECRET_ACCESS_KEY} +aws configure set region eu-north1 +aws configure set endpoint_url https://storage.eu-north1.nebius.cloud:443 + + +echo "AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID}" +echo 
"AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY}" diff --git a/vm-instance/terraform.tfvars b/vm-instance/terraform.tfvars new file mode 100644 index 00000000..490a5ee8 --- /dev/null +++ b/vm-instance/terraform.tfvars @@ -0,0 +1,29 @@ +#parent_id = "" # The project-id in this context +#subnet_id = "" # Use the command "nebius vpc v1alpha1 network list" to see the subnet id + + +#preset = "16vcpu-64gb" +#platform = "cpu-e2" +#preset = "8gpu-128vcpu-1600gb" +preset = "1gpu-16vcpu-200gb" +platform = "gpu-h100-sxm" + +users = [ + { + user_name = "tux", + ssh_key_path = "~/.ssh/id_rsa.pub" + }, + { + user_name = "tux2", + ssh_public_key = "" + } +] + +add_extra_storage = false + + +public_ip = true +instance_count = 1 + +shared_filesystem_id = "" +mount_bucket = "" diff --git a/vm-instance/variables.tf b/vm-instance/variables.tf new file mode 100644 index 00000000..45e64170 --- /dev/null +++ b/vm-instance/variables.tf @@ -0,0 +1,121 @@ +variable "parent_id" { + type = string + description = "Id of the folder where the resources going to be created." +} + +variable "subnet_id" { + type = string + description = "ID of the subnet." +} + +variable "instance_count" { + type = number + description = "Number of instances" + default = 1 +} + + +variable "instance_name" { + type = string + description = "name of the instance" + default = "instance" +} + +variable "platform" { + description = "VM platform." + type = string + default = "cpu-e2" +} + +variable "preset" { + description = "VM resources preset." + type = string + default = "16vcpu-64gb" +} + +variable "users" { + description = "List of users with their SSH keys" + type = list(object({ + user_name = string + ssh_public_key = optional(string) # Inline SSH key + ssh_key_path = optional(string, "~/.ssh/id_rsa.pub") # Path to SSH key file + })) + default = [] + validation { + condition = alltrue([ + for user in var.users : user.ssh_public_key != null || fileexists(user.ssh_key_path) + ]) + error_message = "Each user must have at least one SSH key defined as 'ssh_public_key' or 'ssh_key_path'." 
+ } +} + +variable "shared_filesystem_id" { + description = "ID of an existing shared file system" + type = string + default = "" +} + +variable "shared_filesystem_mount" { + description = "Mount point of the shared file system" + type = string + default = "/mnt/share" +} + +variable "region" { + type = string + description = "Region" + default = "eu-north1" +} + +variable "add_extra_storage" { + type = bool + default = false + description = "If true, a new disk will be created and mounted" + } + +variable "extra_path" { + type = string + default = "/mnt/storage" + description = "Folder where the network storage will be mounted" +} +variable "extra_storage_class" { + type = string + default = "NETWORK_SSD" + description = "Network type of the additional disk being added" +} + +variable "extra_storage_size_gb" { + type = number + default = 50 + description = "Size of the newly created extra storage" +} + +variable "public_ip" { + type = bool + default = true + description = "Attach a public IP to the VM if true" +} + +variable "mount_bucket" { + type = string + description = "Name of a bucket that should be mounted into the filesystem" + default = "" +} + +variable "s3_mount_path" { + type = string + description = "Mount point for the S3 mount" + default = "/mnt/s3" +} + +variable "aws_access_key_id" { + type = string + description = "S3 access key ID" + default = "" +} + +variable "aws_secret_access_key" { + type = string + description = "S3 secret access key" + default = "" +} diff --git a/wireguard/README.md b/wireguard/README.md index c6cd77d3..2467f0f8 100644 --- a/wireguard/README.md +++ b/wireguard/README.md @@ -1,108 +1,10 @@ -# Wireguard VPN instance +# Creating a jump server with WireGuard installed on it -This Terraform solution deploys a Wireguard VPN instance that serves as a secure jump host for your infrastructure. It improves the security by minimizing the use of Public IPs and limiting access to the rest of the environment. +In Nebius AI Cloud, you can create a virtual machine (VM) with a [WireGuard](https://www.wireguard.com) image. This solution allows you to create a jump server between two zones: -## Prerequisites +* Secure zone that contains your VMs in Nebius AI Cloud +* Demilitarized zone (DMZ) that contains machines outside Nebius AI Cloud -1. Install [Nebius CLI](https://docs.nebius.dev/en/cli/#installation): - ```bash - curl -sSL https://storage.ai.nebius.cloud/nebius/install.sh | bash - ``` +By using a jump server as a virtual machine with WireGuard, DMZ machines can connect to VMs in the secure zone and share data in an encrypted form. Only one public IP address is required for the connection, which enables you to keep the number of available public IP addresses within a [quota](https://docs.nebius.com/compute/resources/quotas-limits#network). -2. Reload your shell session: - - ```bash - exec -l $SHELL - ``` - - or - - ```bash - source ~/.bashrc - ``` - -3. [Configure](https://docs.nebius.ai/cli/configure/) Nebius CLI (we recommend using [service account](https://docs.nebius.ai/iam/service-accounts/manage/)): - ```bash - nebius init - ``` - -4. Install JQuery (for Debian-based distributions): - ```bash - sudo apt install jq -y - ``` - -## Installation - -To deploy the solution, follow these steps: - -1. Load environment variables: - ```bash - source ./environment.sh - ``` -2. Initialize Terraform: - ```bash - terraform init - ``` -3. Replace the placeholder content in `terraform.tfvars` with the configuration values that you need. See the details [below](#configuration-variables). -4.
Preview the deployment plan: - ```bash - terraform plan - ``` -5. Apply the configuration: - ```bash - terraform apply - ``` - Wait for the operation to complete. - -## Configuration variables - -Update the following variables in the `terraform.tfvars` file with your own values: - -- `parent_id` -- `subnet_id` -- `ssh_user_name` -- `ssh_public_key` - -## Creating and using a public IP allocation - -This step allows you to retain the IP address even if the VM is deleted. If you don’t need to keep the IP adress, skip section. - -1. Create a public IP allocation: - ```bash - nebius vpc v1 allocation create --ipv-4-public \ - --parent-id --name wireguard_allocation_pub \ - --format json | jq -r '.metadata.id' - ``` -2. Assign the value from the previous step to the `public_ip_allocation_id` variable in [variables.tf](./variables.tf): - -```bash -public_ip_allocation_id = -``` - -## Usage - -### Logging into Wireguard UI - -1. SSH into the Wireguard instance: - ```bash - ssh -i @ - ``` - -2. Retrieve the Wireguard UI password: - ```bash - sudo cat /var/lib/wireguard-ui/initial_password - ``` - -3. Open the Wireguard UI in your browser: - ``` - http://:5000 - ``` - -4. Log in with the following credentials: - - **Username:** `admin` - - **Password:** [password retrieved in step 2] - -### Notes - -- **Apply Config:** After creating, deleting or changing Wireguard users, select "Apply Config". -- **Allowed IPs:** When adding new users, specify the CIDRs of your existing infrastructure in the "Allowed IPs" field. +To create the jump server, use Terraform manifests located in the current directory. For the instructions on how to deploy this solution, see the [Compute documentation](https://docs.nebius.com/compute/virtual-machines/wireguard). diff --git a/wireguard/provider.tf b/wireguard/provider.tf index 63ebbe74..724fd8b7 100644 --- a/wireguard/provider.tf +++ b/wireguard/provider.tf @@ -1,7 +1,7 @@ terraform { required_providers { nebius = { - source = "terraform-provider-nebius.storage.ai.nebius.cloud/nebius/nebius" + source = "terraform-provider.storage.eu-north1.nebius.cloud/nebius/nebius" } } } diff --git a/wireguard/test-resource.tf b/wireguard/test-resource.tf index 7743604a..44bdc140 100644 --- a/wireguard/test-resource.tf +++ b/wireguard/test-resource.tf @@ -20,7 +20,6 @@ resource "null_resource" "check_wireguard_instance" { } } - resource "null_resource" "check_wireguard_web_ui" { depends_on = [null_resource.check_wireguard_instance] count = var.test_mode ? 1 : 0 diff --git a/wireguard/tests/main.tftest.hcl b/wireguard/tests/main.tftest.hcl index f8ebc7af..2345689f 100644 --- a/wireguard/tests/main.tftest.hcl +++ b/wireguard/tests/main.tftest.hcl @@ -1,11 +1,7 @@ -run "wireguard_apply" { - command = apply -} - run "test_mode_wireguard_apply" { command = apply variables { test_mode = true } -} +} \ No newline at end of file
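For the removed slurm solution's NCCL benchmark, the deleted comments in nccl.sbatch already describe the workflow; a minimal submission sketch, assuming a deployed cluster with the batch script at /home/slurm/nccl.sbatch:

```bash
# Submit the all_reduce test on two nodes; --parsable makes sbatch print only the job ID.
JOBID=$(sbatch --parsable -N2 /home/slurm/nccl.sbatch)
scontrol show job "$JOBID"                # state, assigned nodes, deadline
tail -f "/mnt/slurm/nccl-${JOBID}.log"    # matches #SBATCH --output="/mnt/slurm/nccl-%j.log"
```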
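The deleted worker role restarted munge and slurmd only when the munge key or the Slurm packages changed. Whether a worker still shares the controller's key can be checked end to end; the worker name below follows the default worker_name_prefix and is an assumption:

```bash
# Encode a credential on the master and decode it on a worker:
# "STATUS: Success (0)" means both hosts hold the same /etc/munge/munge.key.
munge -n | ssh slurm-worker-1 unmunge
ssh slurm-worker-1 systemctl is-active munge slurmd
```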
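The deleted master cloud-config launched ansible-playbook in the background and stored its PID in /tmp/ansible/ansible.pid, which is exactly what test-resources.tf polled over SSH. The same bootstrap can be followed by hand on the master:

```bash
# Wait for the cloud-init Ansible run to start, then to finish, then check the recap.
until [ -s /tmp/ansible/ansible.pid ]; do echo 'waiting for bootstrap to start'; sleep 10; done
while ps -p "$(cat /tmp/ansible/ansible.pid)" > /dev/null; do echo 'still running'; sleep 10; done
tail -n 5 /tmp/ansible/ansible.log   # the PLAY RECAP should show failed=0 and unreachable=0
```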
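The deleted slurm.conf ran the cluster configless (SlurmctldParameters=enable_configless), with workers pointed at the controller through SLURMD_OPTIONS="--conf-server slurm-master". A sketch for confirming that nodes picked up their configuration this way:

```bash
scontrol show config | grep -Ei 'clustername|slurmctldparameters'   # expect enable_configless
sinfo -N -l   # every slurm-worker-N should have registered and report "idle"
```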
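Both deleted cloud-config templates mounted shared storage through /etc/fstab, either a virtiofs filesystem tagged slurm-fs or an NFS export. A quick post-boot check on any node:

```bash
# One of the two mounts exists, depending on shared_fs_type.
findmnt /mnt/slurm || findmnt /mnt/nfs
grep -E 'slurm-fs|nfs' /etc/fstab   # the entry the template appended
```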
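The deleted locals.tf resolved master and worker platforms per region with coalesce(), so an explicit variable always won over the regions_default map. The behaviour can be demonstrated interactively; the "cpu-e2" value is the eu-north1 default from that file:

```bash
# terraform console evaluates expressions against the loaded configuration;
# coalesce() returns its first non-null argument, so null falls through to the default.
echo 'coalesce(null, "cpu-e2")' | terraform console   # -> "cpu-e2"
```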
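Both the removed slurm suite and the surviving wireguard suite drive their test_mode checks through native terraform test run blocks. Running the remaining wireguard suite looks like this (Terraform 1.6 or later is assumed for the test command):

```bash
cd wireguard
terraform init
terraform test -filter=tests/main.tftest.hcl   # runs test_mode_wireguard_apply with test_mode = true
```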
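vm-instance/.envrc.sh walks through tenant and project selection with fzf and then exports everything Terraform needs. After sourcing it, the result can be sanity-checked before the first plan:

```bash
source ./.envrc.sh
env | grep -E '^NEBIUS_(TENANT|PROJECT|VPC_SUBNET)_ID' | sort   # the selected IDs
aws s3 ls "s3://${NEBIUS_BUCKET_NAME}"   # proves the generated access key and state bucket work
```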
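The remote-state bucket name in that script is derived deterministically from the tenant and project IDs, so re-running the script converges on the same bucket instead of creating new ones. The derivation in isolation (the IDs are placeholders):

```bash
NEBIUS_TENANT_ID='tenant-...'    # placeholder
NEBIUS_PROJECT_ID='project-...'  # placeholder
echo "tfstate-slurm-k8s-$(echo -n "${NEBIUS_TENANT_ID}-${NEBIUS_PROJECT_ID}" | md5sum | awk '$0=$1')"
```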
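Because .envrc.sh writes terraform_backend_override.tf pointing at that bucket, the working directory has to be (re)initialized before the remote state is used; a sketch:

```bash
terraform init -reconfigure   # adopt the s3 backend from terraform_backend_override.tf
terraform state list          # empty output on a fresh state; an error points at backend or credential problems
```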
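For the new vm-instance solution, each entry in users needs either an inline ssh_public_key or an existing file at ssh_key_path, enforced by the fileexists() validation. A minimal end-to-end run with the sample tfvars, where the key path is the variable's default:

```bash
# Make sure the key referenced by ssh_key_path exists before the validation runs.
test -f ~/.ssh/id_rsa.pub || ssh-keygen -t rsa -b 4096 -f ~/.ssh/id_rsa -N ''
terraform init && terraform apply
terraform output -json public_ips | jq -r '.[][]'   # flatten the nested list of per-instance IPs
```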
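Once the WireGuard jump server described in the rewritten wireguard/README.md is up and a client profile has been imported, the tunnel can be verified from a DMZ machine; the wg0 interface name is the usual wg-quick default and an assumption here:

```bash
sudo wg show wg0               # a recent "latest handshake" indicates a live tunnel
ping -c 3 <secure-zone-vm-ip>  # reach a VM in the secure zone through the tunnel
```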