From 50698b846f735c17eacc0f595c8ac9ae741d9b8c Mon Sep 17 00:00:00 2001 From: Egor Sklyarov Date: Fri, 10 Nov 2023 14:57:49 +0400 Subject: [PATCH] Add Nebius backend (#770) * Nebius configs & API client implementation * Implement Nebius compute * Nebius tweaks * Fix list backends test * Pre-install docker on CPU Nebius instance * Run GPU instances in Nebius * Move packer directory * Build Nebius images using Packer with HCL2 * Use pre-built VM images for Nebius * Build Nebius images in CI * Remove outer JSON quotes * Create JSON with service account * Fix create-json version * Remove extra dash * Fix Nebius images build * Build Nebius images in a single job --- .github/workflows/docker.yml | 55 +++- {runner/ami/packer => packer}/README.MD | 23 ++ .../ami/packer => packer}/aws-image-cuda.json | 0 {runner/ami/packer => packer}/aws-image.json | 0 .../ami/packer => packer}/aws-vars-prod.json | 0 .../packer => packer}/azure-image-cuda.json | 0 .../ami/packer => packer}/azure-image.json | 0 packer/build-cuda-image.pkr.hcl | 41 +++ packer/build-image.pkr.hcl | 35 ++ packer/config.pkr.hcl | 8 + .../ami/packer => packer}/gcp-image-cuda.json | 0 {runner/ami/packer => packer}/gcp-image.json | 0 packer/locals.pkr.hcl | 6 + packer/nebius.pkr.hcl | 12 + .../packer => packer}/provisioners/cuda.sh | 0 .../provisioners/docker-image-with-cuda.sh | 0 .../provisioners/docker-image-without-cuda.sh | 0 .../provisioners/install-docker.sh | 0 .../provisioners/kernel/apt-daily.sh | 0 .../provisioners/kernel/apt-packages.sh | 0 .../provisioners/kernel/apt-upgrade.sh | 0 .../provisioners/kernel/kernel-tuning.sh | 0 .../packer => packer}/provisioners/run-docker | 0 packer/variables.pkr.hcl | 27 ++ {runner/ami/packer => packer}/versions.json | 0 runner/cmd/shim/main.go | 8 - .../_internal/core/backends/base/compute.py | 7 +- .../core/backends/nebius/__init__.py | 15 + .../core/backends/nebius/api_client.py | 307 ++++++++++++++++++ .../_internal/core/backends/nebius/compute.py | 195 +++++++++++ .../_internal/core/backends/nebius/config.py | 6 + .../_internal/core/backends/nebius/types.py | 37 +++ .../core/models/backends/__init__.py | 10 + .../_internal/core/models/backends/base.py | 1 + .../_internal/core/models/backends/nebius.py | 55 ++++ .../server/services/backends/__init__.py | 7 + .../services/backends/configurators/nebius.py | 88 +++++ .../_internal/server/services/config.py | 47 ++- .../_internal/server/routers/test_backends.py | 10 +- 39 files changed, 974 insertions(+), 26 deletions(-) rename {runner/ami/packer => packer}/README.MD (74%) rename {runner/ami/packer => packer}/aws-image-cuda.json (100%) rename {runner/ami/packer => packer}/aws-image.json (100%) rename {runner/ami/packer => packer}/aws-vars-prod.json (100%) rename {runner/ami/packer => packer}/azure-image-cuda.json (100%) rename {runner/ami/packer => packer}/azure-image.json (100%) create mode 100644 packer/build-cuda-image.pkr.hcl create mode 100644 packer/build-image.pkr.hcl create mode 100644 packer/config.pkr.hcl rename {runner/ami/packer => packer}/gcp-image-cuda.json (100%) rename {runner/ami/packer => packer}/gcp-image.json (100%) create mode 100644 packer/locals.pkr.hcl create mode 100644 packer/nebius.pkr.hcl rename {runner/ami/packer => packer}/provisioners/cuda.sh (100%) rename {runner/ami/packer => packer}/provisioners/docker-image-with-cuda.sh (100%) rename {runner/ami/packer => packer}/provisioners/docker-image-without-cuda.sh (100%) rename {runner/ami/packer => packer}/provisioners/install-docker.sh (100%) rename {runner/ami/packer 
=> packer}/provisioners/kernel/apt-daily.sh (100%) rename {runner/ami/packer => packer}/provisioners/kernel/apt-packages.sh (100%) rename {runner/ami/packer => packer}/provisioners/kernel/apt-upgrade.sh (100%) rename {runner/ami/packer => packer}/provisioners/kernel/kernel-tuning.sh (100%) rename {runner/ami/packer => packer}/provisioners/run-docker (100%) create mode 100644 packer/variables.pkr.hcl rename {runner/ami/packer => packer}/versions.json (100%) create mode 100644 src/dstack/_internal/core/backends/nebius/__init__.py create mode 100644 src/dstack/_internal/core/backends/nebius/api_client.py create mode 100644 src/dstack/_internal/core/backends/nebius/compute.py create mode 100644 src/dstack/_internal/core/backends/nebius/config.py create mode 100644 src/dstack/_internal/core/backends/nebius/types.py create mode 100644 src/dstack/_internal/core/models/backends/nebius.py create mode 100644 src/dstack/_internal/server/services/backends/configurators/nebius.py diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 9f994b838..490d226cf 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -26,6 +26,10 @@ on: description: "Build GCP images" type: boolean default: true + build_nebius: + description: "Build Nebius images" + type: boolean + default: true env: PACKER_VERSION: "1.9.2" @@ -66,7 +70,7 @@ jobs: if: always() && inputs.build_aws && (needs.build-docker.result == 'success' || needs.build-docker.result == 'skipped') defaults: run: - working-directory: runner + working-directory: packer runs-on: ubuntu-latest env: AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} @@ -81,7 +85,6 @@ jobs: wget https://releases.hashicorp.com/packer/${{ env.PACKER_VERSION }}/packer_${{ env.PACKER_VERSION }}_linux_amd64.zip unzip packer_${{ env.PACKER_VERSION }}_linux_amd64.zip chmod +x packer - cp -R ami/packer/* . - name: Run packer run: | ./packer build -var-file=versions.json $PROD_VARS -var image_version=${{ inputs.image_version }} -var build_prefix=$BUILD_PREFIX aws-image${{ matrix.variant }}.json @@ -93,7 +96,7 @@ jobs: if: always() && inputs.build_azure && (needs.build-docker.result == 'success' || needs.build-docker.result == 'skipped') defaults: run: - working-directory: runner + working-directory: packer runs-on: ubuntu-latest env: AZURE_CLIENT_ID: ${{ secrets.AZURE_CLIENT_ID }} @@ -115,7 +118,6 @@ jobs: wget https://releases.hashicorp.com/packer/${{ env.PACKER_VERSION }}/packer_${{ env.PACKER_VERSION }}_linux_amd64.zip unzip packer_${{ env.PACKER_VERSION }}_linux_amd64.zip chmod +x packer - cp -R ami/packer/* . - name: Run packer run: | ./packer build -var-file=versions.json -var image_version=${{ inputs.image_version }} -var build_prefix=$BUILD_PREFIX azure-image${{ matrix.variant }}.json @@ -131,7 +133,7 @@ jobs: if: always() && inputs.build_gcp && (needs.build-docker.result == 'success' || needs.build-docker.result == 'skipped') defaults: run: - working-directory: runner + working-directory: packer runs-on: ubuntu-latest strategy: matrix: @@ -154,7 +156,6 @@ jobs: wget https://releases.hashicorp.com/packer/${{ env.PACKER_VERSION }}/packer_${{ env.PACKER_VERSION }}_linux_amd64.zip unzip packer_${{ env.PACKER_VERSION }}_linux_amd64.zip chmod +x packer - cp -R ami/packer/* . 
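+          # no copy needed: the Packer templates now live in ./packer, this job's working-directory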
- name: Run packer
        run: |
          ./packer build -var-file=versions.json -var image_version=${{ inputs.image_version }} -var build_prefix=$BUILD_PREFIX gcp-image${{ matrix.variant }}.json
@@ -164,3 +165,45 @@ jobs:
          gcloud compute images add-iam-policy-binding ${BUILD_PREFIX}dstack${{ matrix.variant }}-$IMAGE_VERSION --member='allAuthenticatedUsers' --role='roles/compute.imageUser'
        env:
          IMAGE_VERSION: ${{ inputs.image_version }}
+
+  build-nebius-images:
+    needs: build-docker
+    if: always() && inputs.build_nebius && (needs.build-docker.result == 'success' || needs.build-docker.result == 'skipped')
+    defaults:
+      run:
+        working-directory: packer
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - name: Get Nebius CLI
+        run: |
+          echo "CLI_VERSION=$CLI_VERSION"
+          curl -sSL https://storage.ai.nebius.cloud/ncp/install.sh | bash
+          echo "$HOME/nebius-cloud/bin" >> $GITHUB_PATH
+        env:
+          CLI_VERSION: 0.113.0+Nebius-AI
+      - name: Write Nebius credentials
+        uses: jsdaniell/create-json@v1.2.2
+        with:
+          name: "service_account.json"
+          json: ${{ secrets.NEBIUS_SERVICE_ACCOUNT }}
+          dir: "packer/"
+      - name: Setup Nebius profile
+        run: |
+          ncp config profile create packer
+          ncp config set endpoint api.ai.nebius.cloud:443
+          ncp config set service-account-key service_account.json
+          rm service_account.json
+      - name: Download packer
+        run: |
+          wget https://releases.hashicorp.com/packer/${{ env.PACKER_VERSION }}/packer_${{ env.PACKER_VERSION }}_linux_amd64.zip
+          unzip packer_${{ env.PACKER_VERSION }}_linux_amd64.zip
+          chmod +x packer
+          ./packer init .
+      - name: Run packer (HCL2)
+        run: |
+          export PKR_VAR_nebius_token=$(ncp iam create-token)
+          ./packer build -only yandex.nebius,yandex.nebius-cuda -var image_version=${{ inputs.image_version }} -var build_prefix=$BUILD_PREFIX .
+        env:
+          PKR_VAR_nebius_folder_id: ${{ secrets.NEBIUS_FOLDER_ID }}
+          PKR_VAR_nebius_subnet_id: ${{ secrets.NEBIUS_SUBNET_ID }}
diff --git a/runner/ami/packer/README.MD b/packer/README.MD
similarity index 74%
rename from runner/ami/packer/README.MD
rename to packer/README.MD
index 095a470b6..1cd91feaa 100644
--- a/runner/ami/packer/README.MD
+++ b/packer/README.MD
@@ -54,3 +54,28 @@ Set environment variables.
 | AZURE_TENANT_ID       | tenant_id       |
 | AZURE_SUBSCRIPTION_ID | subscription_id |
 
+# Nebius
+
+## Set up Nebius credentials
+
+> The `compute.admin` role is not sufficient for Packer; use the `admin` role instead.
+
+```shell
+ncp config profile create packer
+ncp config set service-account-key path/to/service_account.json
+ncp config set endpoint api.ai.nebius.cloud:443
+export PKR_VAR_nebius_token=$(ncp iam create-token)
+```
+
+## Build images
+
+```shell
+export PKR_VAR_nebius_folder_id=...
+export PKR_VAR_nebius_subnet_id=...
+# no CUDA
+packer build -only yandex.nebius -var image_version=0.4rc3 .
+# with CUDA
+packer build -only yandex.nebius-cuda -var image_version=0.4rc3 .
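+# or build both variants in a single run (what the CI workflow does)
+packer build -only yandex.nebius,yandex.nebius-cuda -var image_version=0.4rc3 .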
+``` diff --git a/runner/ami/packer/aws-image-cuda.json b/packer/aws-image-cuda.json similarity index 100% rename from runner/ami/packer/aws-image-cuda.json rename to packer/aws-image-cuda.json diff --git a/runner/ami/packer/aws-image.json b/packer/aws-image.json similarity index 100% rename from runner/ami/packer/aws-image.json rename to packer/aws-image.json diff --git a/runner/ami/packer/aws-vars-prod.json b/packer/aws-vars-prod.json similarity index 100% rename from runner/ami/packer/aws-vars-prod.json rename to packer/aws-vars-prod.json diff --git a/runner/ami/packer/azure-image-cuda.json b/packer/azure-image-cuda.json similarity index 100% rename from runner/ami/packer/azure-image-cuda.json rename to packer/azure-image-cuda.json diff --git a/runner/ami/packer/azure-image.json b/packer/azure-image.json similarity index 100% rename from runner/ami/packer/azure-image.json rename to packer/azure-image.json diff --git a/packer/build-cuda-image.pkr.hcl b/packer/build-cuda-image.pkr.hcl new file mode 100644 index 000000000..44d0506ba --- /dev/null +++ b/packer/build-cuda-image.pkr.hcl @@ -0,0 +1,41 @@ +build { + source "source.yandex.nebius" { + name = "nebius-cuda" + image_description = "Ubuntu 22.04 with CUDA, Docker and dstackai/base:cuda images" + image_family = "dstack-cuda" + image_name = "${local.image_name}-cuda" + } + # TODO(egor-s) add other sources + + provisioner "shell" { + inline = ["cloud-init status --long --wait"] + } + + provisioner "shell" { + scripts = ["provisioners/kernel/apt-upgrade.sh", "provisioners/kernel/apt-daily.sh", "provisioners/kernel/apt-packages.sh", "provisioners/kernel/kernel-tuning.sh"] + } + + provisioner "file" { + destination = "/tmp/install-docker.sh" + source = "provisioners/install-docker.sh" + } + + provisioner "file" { + destination = "/tmp/run-docker" + source = "provisioners/run-docker" + } + + provisioner "shell" { + inline = ["cd /tmp", "chmod +x install-docker.sh", "./install-docker.sh --version ${local.docker_version}"] + } + + provisioner "shell" { + environment_vars = ["CUDA_DRIVERS_VERSION=${local.cuda_drivers_version}"] + script = "provisioners/cuda.sh" + } + + provisioner "shell" { + environment_vars = ["IMAGE_VERSION=${var.image_version}"] + script = "provisioners/docker-image-with-cuda.sh" + } +} diff --git a/packer/build-image.pkr.hcl b/packer/build-image.pkr.hcl new file mode 100644 index 000000000..13f8863fc --- /dev/null +++ b/packer/build-image.pkr.hcl @@ -0,0 +1,35 @@ +build { + source "source.yandex.nebius" { + image_description = "Ubuntu 22.04 with Docker and dstackai/base images" + image_family = "dstack" + image_name = local.image_name + } + # TODO(egor-s) add other sources + + provisioner "shell" { + inline = ["cloud-init status --long --wait"] + } + + provisioner "shell" { + scripts = ["provisioners/kernel/apt-upgrade.sh", "provisioners/kernel/apt-daily.sh", "provisioners/kernel/apt-packages.sh", "provisioners/kernel/kernel-tuning.sh"] + } + + provisioner "file" { + destination = "/tmp/install-docker.sh" + source = "provisioners/install-docker.sh" + } + + provisioner "file" { + destination = "/tmp/run-docker" + source = "provisioners/run-docker" + } + + provisioner "shell" { + inline = ["cd /tmp", "chmod +x install-docker.sh", "./install-docker.sh --version ${local.docker_version}"] + } + + provisioner "shell" { + environment_vars = ["IMAGE_VERSION=${var.image_version}"] + script = "provisioners/docker-image-without-cuda.sh" + } +} diff --git a/packer/config.pkr.hcl b/packer/config.pkr.hcl new file mode 100644 index 
000000000..6e5af302a --- /dev/null +++ b/packer/config.pkr.hcl @@ -0,0 +1,8 @@ +packer { + required_plugins { + yandex = { + version = ">= 1.1.2" + source = "github.com/hashicorp/yandex" + } + } +} \ No newline at end of file diff --git a/runner/ami/packer/gcp-image-cuda.json b/packer/gcp-image-cuda.json similarity index 100% rename from runner/ami/packer/gcp-image-cuda.json rename to packer/gcp-image-cuda.json diff --git a/runner/ami/packer/gcp-image.json b/packer/gcp-image.json similarity index 100% rename from runner/ami/packer/gcp-image.json rename to packer/gcp-image.json diff --git a/packer/locals.pkr.hcl b/packer/locals.pkr.hcl new file mode 100644 index 000000000..66a1e9072 --- /dev/null +++ b/packer/locals.pkr.hcl @@ -0,0 +1,6 @@ +locals { + clean_image_version = regex_replace(var.image_version, "[^a-z0-9-]", "-") + image_name = "${var.build_prefix}dstack-${local.clean_image_version}" + docker_version = "20.10.17" + cuda_drivers_version = "535.54.03-1" +} diff --git a/packer/nebius.pkr.hcl b/packer/nebius.pkr.hcl new file mode 100644 index 000000000..e19b6b0ba --- /dev/null +++ b/packer/nebius.pkr.hcl @@ -0,0 +1,12 @@ +source "yandex" "nebius" { + disk_size_gb = 30 + disk_type = "network-ssd" + endpoint = "api.ai.nebius.cloud:443" + folder_id = var.nebius_folder_id + source_image_family = "ubuntu-2204-lts" + ssh_username = "ubuntu" + subnet_id = var.nebius_subnet_id + token = var.nebius_token + use_ipv4_nat = true + zone = "eu-north1-c" +} diff --git a/runner/ami/packer/provisioners/cuda.sh b/packer/provisioners/cuda.sh similarity index 100% rename from runner/ami/packer/provisioners/cuda.sh rename to packer/provisioners/cuda.sh diff --git a/runner/ami/packer/provisioners/docker-image-with-cuda.sh b/packer/provisioners/docker-image-with-cuda.sh similarity index 100% rename from runner/ami/packer/provisioners/docker-image-with-cuda.sh rename to packer/provisioners/docker-image-with-cuda.sh diff --git a/runner/ami/packer/provisioners/docker-image-without-cuda.sh b/packer/provisioners/docker-image-without-cuda.sh similarity index 100% rename from runner/ami/packer/provisioners/docker-image-without-cuda.sh rename to packer/provisioners/docker-image-without-cuda.sh diff --git a/runner/ami/packer/provisioners/install-docker.sh b/packer/provisioners/install-docker.sh similarity index 100% rename from runner/ami/packer/provisioners/install-docker.sh rename to packer/provisioners/install-docker.sh diff --git a/runner/ami/packer/provisioners/kernel/apt-daily.sh b/packer/provisioners/kernel/apt-daily.sh similarity index 100% rename from runner/ami/packer/provisioners/kernel/apt-daily.sh rename to packer/provisioners/kernel/apt-daily.sh diff --git a/runner/ami/packer/provisioners/kernel/apt-packages.sh b/packer/provisioners/kernel/apt-packages.sh similarity index 100% rename from runner/ami/packer/provisioners/kernel/apt-packages.sh rename to packer/provisioners/kernel/apt-packages.sh diff --git a/runner/ami/packer/provisioners/kernel/apt-upgrade.sh b/packer/provisioners/kernel/apt-upgrade.sh similarity index 100% rename from runner/ami/packer/provisioners/kernel/apt-upgrade.sh rename to packer/provisioners/kernel/apt-upgrade.sh diff --git a/runner/ami/packer/provisioners/kernel/kernel-tuning.sh b/packer/provisioners/kernel/kernel-tuning.sh similarity index 100% rename from runner/ami/packer/provisioners/kernel/kernel-tuning.sh rename to packer/provisioners/kernel/kernel-tuning.sh diff --git a/runner/ami/packer/provisioners/run-docker b/packer/provisioners/run-docker similarity index 100% 
rename from runner/ami/packer/provisioners/run-docker rename to packer/provisioners/run-docker diff --git a/packer/variables.pkr.hcl b/packer/variables.pkr.hcl new file mode 100644 index 000000000..f2f3aae4f --- /dev/null +++ b/packer/variables.pkr.hcl @@ -0,0 +1,27 @@ +variable "build_prefix" { + type = string + default = "" +} + +variable "image_version" { + type = string +} + +# Nebius +variable "nebius_folder_id" { + type = string + default = null + sensitive = true +} + +variable "nebius_subnet_id" { + type = string + default = null + sensitive = true +} + +variable "nebius_token" { + type = string + default = null + sensitive = true +} diff --git a/runner/ami/packer/versions.json b/packer/versions.json similarity index 100% rename from runner/ami/packer/versions.json rename to packer/versions.json diff --git a/runner/cmd/shim/main.go b/runner/cmd/shim/main.go index c6b9d0f7a..1b6f7ff7b 100644 --- a/runner/cmd/shim/main.go +++ b/runner/cmd/shim/main.go @@ -30,14 +30,6 @@ func main() { Required: true, Destination: &backendName, EnvVars: []string{"DSTACK_BACKEND"}, - Action: func(c *cli.Context, s string) error { - for _, backend := range []string{"aws", "azure", "gcp", "lambda", "tensordock", "local"} { - if s == backend { - return nil - } - } - return gerrors.Newf("unknown backend %s", s) - }, }, /* Shim Parameters */ &cli.PathFlag{ diff --git a/src/dstack/_internal/core/backends/base/compute.py b/src/dstack/_internal/core/backends/base/compute.py index 29bc81ced..788f1961d 100644 --- a/src/dstack/_internal/core/backends/base/compute.py +++ b/src/dstack/_internal/core/backends/base/compute.py @@ -51,7 +51,11 @@ def create_gateway( def get_user_data( - backend: BackendType, image_name: str, authorized_keys: List[str], registry_auth_required: bool + backend: BackendType, + image_name: str, + authorized_keys: List[str], + registry_auth_required: bool, + cloud_config_kwargs: Optional[dict] = None, ) -> str: commands = get_shim_commands( backend=backend, @@ -62,6 +66,7 @@ def get_user_data( return get_cloud_config( runcmd=[["sh", "-c", " && ".join(commands)]], ssh_authorized_keys=authorized_keys, + **(cloud_config_kwargs or {}), ) diff --git a/src/dstack/_internal/core/backends/nebius/__init__.py b/src/dstack/_internal/core/backends/nebius/__init__.py new file mode 100644 index 000000000..38c3d7efe --- /dev/null +++ b/src/dstack/_internal/core/backends/nebius/__init__.py @@ -0,0 +1,15 @@ +from dstack._internal.core.backends.base import Backend +from dstack._internal.core.backends.nebius.compute import NebiusCompute +from dstack._internal.core.backends.nebius.config import NebiusConfig +from dstack._internal.core.models.backends.base import BackendType + + +class NebiusBackend(Backend): + TYPE: BackendType = BackendType.NEBIUS + + def __init__(self, config: NebiusConfig): + self.config = config + self._compute = NebiusCompute(self.config) + + def compute(self) -> NebiusCompute: + return self._compute diff --git a/src/dstack/_internal/core/backends/nebius/api_client.py b/src/dstack/_internal/core/backends/nebius/api_client.py new file mode 100644 index 000000000..722bc569e --- /dev/null +++ b/src/dstack/_internal/core/backends/nebius/api_client.py @@ -0,0 +1,307 @@ +import time +from typing import Dict, List, Optional + +import jwt +import requests + +from dstack._internal.core.backends.nebius.types import ( + ClientError, + ConflictError, + ForbiddenError, + NebiusError, + NotFoundError, + ResourcesSpec, + ServiceAccount, +) +from dstack._internal.utils.logging import get_logger + +logger = 
get_logger("nebius") +API_URL = "api.ai.nebius.cloud" + + +class NebiusAPIClient: + # Reference: https://nebius.ai/docs/api-design-guide/ + def __init__(self, service_account: ServiceAccount): + self.service_account = service_account + self.s = requests.Session() + self.expires_at = 0 + + def get_token(self): + now = int(time.time()) + if now + 60 < self.expires_at: + return + logger.debug("Refreshing IAM token") + expires_at = now + 3600 + payload = { + "aud": self.url("iam", "/tokens"), + "iss": self.service_account["service_account_id"], + "iat": now, + "exp": expires_at, + } + jwt_token = jwt.encode( + payload, + self.service_account["private_key"], + algorithm="PS256", + headers={"kid": self.service_account["id"]}, + ) + + resp = requests.post(payload["aud"], json={"jwt": jwt_token}) + resp.raise_for_status() + iam_token = resp.json()["iamToken"] + self.s.headers["Authorization"] = f"Bearer {iam_token}" + self.expires_at = expires_at + + def compute_zones_list(self) -> List[dict]: + logger.debug("Fetching compute zones") + self.get_token() + resp = self.s.get(self.url("compute", "/zones")) + self.raise_for_status(resp) + return resp.json()["zones"] + + def resource_manager_folders_create(self, cloud_id: str, name: str, **kwargs) -> dict: + logger.debug("Creating folder %s", name) + self.get_token() + resp = self.s.post( + self.url("resource-manager", "/folders"), + json=omit_none( + cloudId=cloud_id, + name=name, + **kwargs, + ), + ) + self.raise_for_status(resp) + return resp.json() + + def vpc_networks_create(self, folder_id: str, name: str, **kwargs) -> dict: + logger.debug("Creating network %s in %s", name, folder_id) + self.get_token() + resp = self.s.post( + self.url("vpc", "/networks"), + json=omit_none( + folderId=folder_id, + name=name, + **kwargs, + ), + ) + self.raise_for_status(resp) + return resp.json() + + def vpc_networks_list(self, folder_id: str, filter: Optional[str] = None) -> List[dict]: + logger.debug("Fetching networks in %s", folder_id) + return self.list( + "vpc", + "networks", + params=dict( + folderId=folder_id, + filter=filter, + ), + ) + + def vpc_subnets_create( + self, + folder_id: str, + name: str, + network_id: str, + zone: str, + cird_blocks: List[str], + **kwargs, + ) -> dict: + logger.debug("Creating subnet %s in %s", name, network_id) + self.get_token() + resp = self.s.post( + self.url("vpc", "/subnets"), + json=omit_none( + folderId=folder_id, + name=name, + networkId=network_id, + zoneId=zone, + v4CidrBlocks=cird_blocks, + **kwargs, + ), + ) + self.raise_for_status(resp) + return resp.json() + + def vpc_subnets_list(self, folder_id: str, filter: Optional[str] = None) -> List[dict]: + logger.debug("Fetching subnets in %s", folder_id) + return self.list( + "vpc", + "subnets", + params=dict( + folderId=folder_id, + filter=filter, + ), + ) + + def vpc_security_groups_create( + self, folder_id: str, name: str, network_id: str, rule_specs: List[dict], **kwargs + ) -> dict: + logger.debug("Creating security group %s in %s", name, folder_id) + self.get_token() + resp = self.s.post( + self.url("vpc", "/securityGroups"), + json=omit_none( + folderId=folder_id, + name=name, + networkId=network_id, + ruleSpecs=rule_specs, + **kwargs, + ), + ) + self.raise_for_status(resp) + return resp.json() + + def vpc_security_groups_list(self, folder_id: str, filter: Optional[str] = None) -> List[dict]: + logger.debug("Fetching security groups in %s", folder_id) + return self.list( + "vpc", + "securityGroups", + params=dict( + folderId=folder_id, + filter=filter, + ), + ) 
+ + def vpc_security_groups_delete(self, security_group_id: str): + logger.debug("Deleting security group %s", security_group_id) + self.get_token() + resp = self.s.delete(self.url("vpc", f"/securityGroups/{security_group_id}")) + self.raise_for_status(resp) + + def compute_instances_create( + self, + folder_id: str, + name: str, + zone_id: str, + platform_id: str, + resources_spec: ResourcesSpec, + metadata: Optional[Dict[str, str]], + disk_size_gb: int, + image_id: str, + subnet_id: str, + security_group_ids: List[str], + **kwargs, + ) -> dict: + # Reference: https://nebius.ai/docs/api-design-guide/compute/v1/api-ref/Instance/create + logger.debug("Creating instance %s (%s) in %s", name, platform_id, folder_id) + self.get_token() + resp = self.s.post( + self.url("compute", "/instances"), + json=omit_none( + folderId=folder_id, + name=name, + zoneId=zone_id, + platformId=platform_id, + resourcesSpec=resources_spec, + metadata=metadata, + boot_disk_spec=dict( + autoDelete=True, + diskSpec=dict( + typeId="network-ssd", + size=disk_size_gb * 1024 * 1024 * 1024, + imageId=image_id, + ), + ), + networkInterfaceSpecs=[ + dict( + subnetId=subnet_id, + primaryV4AddressSpec=dict( + oneToOneNatSpec=dict( + ipVersion="IPV4", + ), + ), + securityGroupIds=security_group_ids, + ) + ], + **kwargs, + ), + ) + self.raise_for_status(resp) + return resp.json() + + def compute_instances_list( + self, folder_id: str, filter: Optional[str] = None, order_by: Optional[str] = None + ) -> List[dict]: + logger.debug("Fetching instances in %s", folder_id) + return self.list( + "compute", + "instances", + params=dict( + folderId=folder_id, + filter=filter, + orderBy=order_by, + ), + ) + + def compute_instances_delete(self, instance_id: str): + logger.debug("Deleting instance %s", instance_id) + self.get_token() + resp = self.s.delete(self.url("compute", f"/instances/{instance_id}")) + self.raise_for_status(resp) + + def compute_instances_get(self, instance_id: str, full: bool = False) -> dict: + logger.debug("Fetching instance %s", instance_id) + self.get_token() + resp = self.s.get( + self.url("compute", f"/instances/{instance_id}"), + params=dict( + view="FULL" if full else "BASIC", + ), + ) + self.raise_for_status(resp) + return resp.json() + + def compute_images_list( + self, folder_id: str, filter: Optional[str] = None, order_by: Optional[str] = None + ): + logger.debug("Fetching images in %s", folder_id) + return self.list( + "compute", + "images", + params=dict( + folderId=folder_id, + filter=filter, + orderBy=order_by, + ), + ) + + def list(self, service: str, resource: str, params: dict, page_size: int = 1000) -> List[dict]: + page_token = None + output = [] + while True: + self.get_token() + resp = self.s.get( + self.url(service, f"/{resource}"), + params=omit_none( + pageSize=page_size, + pageToken=page_token, + **params, + ), + ) + self.raise_for_status(resp) + data = resp.json() + output += data.get(resource, []) + page_token = data.get("nextPageToken") + if not page_token: + break + return output + + def url(self, service: str, path: str, version="v1") -> str: + return f"https://{service}.{API_URL.rstrip('/')}/{service}/{version}/{path.lstrip('/')}" + + def raise_for_status(self, resp: requests.Response): + if resp.status_code == 400: + raise NebiusError(resp.text) + if resp.status_code == 401: + raise ClientError(resp.text) + if resp.status_code == 403: + raise ForbiddenError(resp.text) + if resp.status_code == 404: + raise NotFoundError(resp.text) + if resp.status_code == 409: + raise 
ConflictError(resp.text) + resp.raise_for_status() + + +def omit_none(**kwargs) -> dict: + return {k: v for k, v in kwargs.items() if v is not None} diff --git a/src/dstack/_internal/core/backends/nebius/compute.py b/src/dstack/_internal/core/backends/nebius/compute.py new file mode 100644 index 000000000..5f1c1d5a8 --- /dev/null +++ b/src/dstack/_internal/core/backends/nebius/compute.py @@ -0,0 +1,195 @@ +import json +import re +import time +from typing import List, Optional + +import dstack.version as version +from dstack._internal.core.backends.base import Compute +from dstack._internal.core.backends.base.compute import get_user_data +from dstack._internal.core.backends.base.offers import get_catalog_offers +from dstack._internal.core.backends.nebius.api_client import NebiusAPIClient +from dstack._internal.core.backends.nebius.config import NebiusConfig +from dstack._internal.core.backends.nebius.types import ( + ForbiddenError, + NotFoundError, + ResourcesSpec, +) +from dstack._internal.core.errors import NoCapacityError +from dstack._internal.core.models.backends.base import BackendType +from dstack._internal.core.models.instances import ( + InstanceAvailability, + InstanceOfferWithAvailability, + LaunchedInstanceInfo, +) +from dstack._internal.core.models.runs import Job, Requirements, Run + +MEGABYTE = 1024**2 +INSTANCE_PULL_INTERVAL = 10 + + +class NebiusCompute(Compute): + def __init__(self, config: NebiusConfig): + self.config = config + self.api_client = NebiusAPIClient(json.loads(self.config.creds.data)) + + def get_offers( + self, requirements: Optional[Requirements] = None + ) -> List[InstanceOfferWithAvailability]: + offers = get_catalog_offers( + backend=BackendType.NEBIUS, + locations=self.config.regions, + requirements=requirements, + ) + # TODO(egor-s) quotas + return [ + InstanceOfferWithAvailability( + **offer.dict(), availability=InstanceAvailability.UNKNOWN + ) + for offer in offers + ] + + def run_job( + self, + run: Run, + job: Job, + instance_offer: InstanceOfferWithAvailability, + project_ssh_public_key: str, + project_ssh_private_key: str, + ) -> LaunchedInstanceInfo: + cuda = len(instance_offer.instance.resources.gpus) > 0 + security_group_id = self._get_security_group_id(project_name=run.project_name) + subnet_id = self._get_subnet_id(zone=instance_offer.region) + image_id = self._get_image_id(cuda=cuda) + + try: + resp = self.api_client.compute_instances_create( + folder_id=self.config.folder_id, + name=job.job_spec.job_name, # TODO(egor-s) make globally unique + zone_id=instance_offer.region, + platform_id=instance_offer.instance.name, + resources_spec=ResourcesSpec( + memory=int(instance_offer.instance.resources.memory_mib * MEGABYTE), + cores=instance_offer.instance.resources.cpus, + coreFraction=100, + gpus=len(instance_offer.instance.resources.gpus), + ), + metadata={ + "user-data": get_user_data( + backend=BackendType.NEBIUS, + image_name=job.job_spec.image_name, + authorized_keys=[ + run.run_spec.ssh_key_pub.strip(), + project_ssh_public_key.strip(), + ], + registry_auth_required=job.job_spec.registry_auth is not None, + ), + }, + disk_size_gb=100, # TODO(egor-s) make configurable + image_id=image_id, + subnet_id=subnet_id, + security_group_ids=[security_group_id], + labels=self._get_labels(project=run.project_name), + ) + except ForbiddenError as e: + if instance_offer.instance.name in e.args[0]: + raise NoCapacityError(json.loads(e.args[0])["message"]) + raise + instance_id = resp["metadata"]["instanceId"] + try: + while True: + instance = 
self.api_client.compute_instances_get(instance_id)
+                if "primaryV4Address" in instance["networkInterfaces"][0]:
+                    break
+                time.sleep(INSTANCE_PULL_INTERVAL)
+        except Exception:
+            self.terminate_instance(instance_id, instance_offer.region)
+            raise
+        return LaunchedInstanceInfo(
+            instance_id=instance_id,
+            ip_address=instance["networkInterfaces"][0]["primaryV4Address"]["oneToOneNat"][
+                "address"
+            ],
+            region=instance_offer.region,
+            username="ubuntu",
+            ssh_port=22,
+            dockerized=True,
+        )
+
+    def terminate_instance(self, instance_id: str, region: str):
+        try:
+            self.api_client.compute_instances_delete(instance_id)
+        except NotFoundError:
+            pass
+
+    def _get_security_group_id(self, project_name: str) -> str:
+        name = project_name
+        security_groups = self.api_client.vpc_security_groups_list(
+            folder_id=self.config.folder_id,
+            filter=f'name="{name}"',
+        )
+        if security_groups:
+            return security_groups[0]["id"]
+        resp = self.api_client.vpc_security_groups_create(
+            folder_id=self.config.folder_id,
+            name=name,
+            network_id=self.config.network_id,
+            rule_specs=[
+                {
+                    "description": "SSH access",
+                    "direction": "INGRESS",
+                    "ports": {"fromPort": 22, "toPort": 22},
+                    "protocolName": "ANY",
+                    "cidrBlocks": {"v4CidrBlocks": ["0.0.0.0/0"]},
+                },
+                {
+                    "description": "Project intranet",
+                    "direction": "INGRESS",
+                    "protocolName": "ANY",
+                    "predefinedTarget": "self_security_group",
+                },
+                {
+                    "description": "Internet access",
+                    "direction": "EGRESS",
+                    "protocolName": "ANY",
+                    "cidrBlocks": {"v4CidrBlocks": ["0.0.0.0/0"]},
+                },
+            ],
+            description="Created by dstack for job instances",
+            labels=self._get_labels(project=project_name),
+        )
+        return resp["response"]["id"]
+
+    def _get_subnet_id(self, zone: str, name: Optional[str] = None) -> str:
+        name = name or f"default-{zone}"
+        subnets = self.api_client.vpc_subnets_list(folder_id=self.config.folder_id)
+        for subnet in subnets:
+            if subnet["name"] == name:
+                return subnet["id"]
+        n = len(subnets)
+        resp = self.api_client.vpc_subnets_create(
+            folder_id=self.config.folder_id,
+            name=name,
+            network_id=self.config.network_id,
+            zone=zone,
+            cidr_blocks=[f"10.{n}.0.0/16"],
+            labels=self._get_labels(),
+        )
+        return resp["response"]["id"]
+
+    def _get_image_id(self, cuda: bool) -> str:
+        image_name = re.sub(r"[^a-z0-9-]", "-", f"dstack-{version.base_image}")
+        if cuda:
+            image_name += "-cuda"
+        images = self.api_client.compute_images_list(
+            # fixed folder that hosts the pre-built dstack images
+            folder_id="bjel82ie37qos4pc6guk", filter=f'name="{image_name}"'
+        )
+        return images[0]["id"]
+
+    def _get_labels(self, **kwargs) -> dict:
+        labels = {
+            "owner": "dstack",
+            **kwargs,
+        }
+        if version.__version__:
+            labels["dstack-version"] = version.__version__.replace(".", "-")
+        return labels
diff --git a/src/dstack/_internal/core/backends/nebius/config.py b/src/dstack/_internal/core/backends/nebius/config.py
new file mode 100644
index 000000000..4bface329
--- /dev/null
+++ b/src/dstack/_internal/core/backends/nebius/config.py
@@ -0,0 +1,6 @@
+from dstack._internal.core.backends.base.config import BackendConfig
+from dstack._internal.core.models.backends.nebius import AnyNebiusCreds, NebiusStoredConfig
+
+
+class NebiusConfig(NebiusStoredConfig, BackendConfig):
+    creds: AnyNebiusCreds
diff --git a/src/dstack/_internal/core/backends/nebius/types.py b/src/dstack/_internal/core/backends/nebius/types.py
new file mode 100644
index 000000000..b59aff736
--- /dev/null
+++ b/src/dstack/_internal/core/backends/nebius/types.py
@@ -0,0 +1,37 @@
+from typing import TypedDict
+
+
+class 
ServiceAccount(TypedDict): + id: str + service_account_id: str + created_at: str + key_algorithm: str + public_key: str + private_key: str + + +class ResourcesSpec(TypedDict): + memory: int + cores: int + coreFraction: int + gpus: int + + +class NebiusError(Exception): + pass + + +class ClientError(NebiusError): + pass + + +class ForbiddenError(NebiusError): + pass + + +class NotFoundError(NebiusError): + pass + + +class ConflictError(NebiusError): + pass diff --git a/src/dstack/_internal/core/models/backends/__init__.py b/src/dstack/_internal/core/models/backends/__init__.py index bfa47bb8f..782fed616 100644 --- a/src/dstack/_internal/core/models/backends/__init__.py +++ b/src/dstack/_internal/core/models/backends/__init__.py @@ -27,6 +27,12 @@ LambdaConfigInfoWithCredsPartial, LambdaConfigValues, ) +from dstack._internal.core.models.backends.nebius import ( + NebiusConfigInfo, + NebiusConfigInfoWithCreds, + NebiusConfigInfoWithCredsPartial, + NebiusConfigValues, +) from dstack._internal.core.models.backends.tensordock import ( TensorDockConfigInfo, TensorDockConfigInfoWithCreds, @@ -45,6 +51,7 @@ AzureConfigInfo, GCPConfigInfo, LambdaConfigInfo, + NebiusConfigInfo, TensorDockConfigInfo, VastAIConfigInfo, DstackConfigInfo, @@ -54,6 +61,7 @@ AzureConfigInfoWithCreds, GCPConfigInfoWithCreds, LambdaConfigInfoWithCreds, + NebiusConfigInfoWithCreds, TensorDockConfigInfoWithCreds, VastAIConfigInfoWithCreds, DstackConfigInfo, @@ -63,6 +71,7 @@ AzureConfigInfoWithCredsPartial, GCPConfigInfoWithCredsPartial, LambdaConfigInfoWithCredsPartial, + NebiusConfigInfoWithCredsPartial, TensorDockConfigInfoWithCredsPartial, VastAIConfigInfoWithCredsPartial, DstackConfigInfo, @@ -75,6 +84,7 @@ AzureConfigValues, GCPConfigValues, LambdaConfigValues, + NebiusConfigValues, TensorDockConfigValues, VastAIConfigValues, DstackConfigValues, diff --git a/src/dstack/_internal/core/models/backends/base.py b/src/dstack/_internal/core/models/backends/base.py index dc4eb6a75..b43920c3d 100644 --- a/src/dstack/_internal/core/models/backends/base.py +++ b/src/dstack/_internal/core/models/backends/base.py @@ -11,6 +11,7 @@ class BackendType(str, enum.Enum): GCP = "gcp" LAMBDA = "lambda" LOCAL = "local" + NEBIUS = "nebius" TENSORDOCK = "tensordock" VASTAI = "vastai" diff --git a/src/dstack/_internal/core/models/backends/nebius.py b/src/dstack/_internal/core/models/backends/nebius.py new file mode 100644 index 000000000..0af948db4 --- /dev/null +++ b/src/dstack/_internal/core/models/backends/nebius.py @@ -0,0 +1,55 @@ +from typing import List, Optional, Union + +from pydantic import BaseModel +from typing_extensions import Literal + +from dstack._internal.core.models.backends.base import ConfigElement, ConfigMultiElement +from dstack._internal.core.models.common import ForbidExtra + + +class NebiusConfigInfo(BaseModel): + type: Literal["nebius"] = "nebius" + cloud_id: str + folder_id: str + network_id: str + regions: Optional[List[str]] = None + + +class NebiusServiceAccountCreds(ForbidExtra): + type: Literal["service_account"] = "service_account" + filename: str + data: str + + +AnyNebiusCreds = NebiusServiceAccountCreds + + +NebiusCreds = AnyNebiusCreds + + +class NebiusConfigInfoWithCreds(NebiusConfigInfo): + creds: AnyNebiusCreds + + +AnyNebiusConfigInfo = Union[NebiusConfigInfo, NebiusConfigInfoWithCreds] + + +class NebiusConfigInfoWithCredsPartial(BaseModel): + type: Literal["nebius"] = "nebius" + creds: Optional[AnyNebiusCreds] + cloud_id: Optional[str] + folder_id: Optional[str] + network_id: Optional[str] + regions: 
Optional[List[str]] + + +class NebiusConfigValues(BaseModel): + type: Literal["nebius"] = "nebius" + cloud_id: Optional[ConfigElement] + folder_id: Optional[ConfigElement] + network_id: Optional[ConfigElement] + regions: Optional[ConfigMultiElement] + + +class NebiusStoredConfig(NebiusConfigInfo): + pass diff --git a/src/dstack/_internal/server/services/backends/__init__.py b/src/dstack/_internal/server/services/backends/__init__.py index 36153d906..a70d98396 100644 --- a/src/dstack/_internal/server/services/backends/__init__.py +++ b/src/dstack/_internal/server/services/backends/__init__.py @@ -63,6 +63,13 @@ except ImportError: pass +try: + from dstack._internal.server.services.backends.configurators.nebius import NebiusConfigurator + + _CONFIGURATOR_CLASSES.append(NebiusConfigurator) +except ImportError: + pass + try: from dstack._internal.server.services.backends.configurators.tensordock import ( TensorDockConfigurator, diff --git a/src/dstack/_internal/server/services/backends/configurators/nebius.py b/src/dstack/_internal/server/services/backends/configurators/nebius.py new file mode 100644 index 000000000..e4c935861 --- /dev/null +++ b/src/dstack/_internal/server/services/backends/configurators/nebius.py @@ -0,0 +1,88 @@ +import json +from typing import List + +import requests + +import dstack._internal.core.backends.nebius.api_client as api_client +from dstack._internal.core.backends.base import Backend +from dstack._internal.core.backends.nebius import NebiusBackend +from dstack._internal.core.backends.nebius.config import NebiusConfig +from dstack._internal.core.models.backends.base import ( + BackendType, + ConfigElementValue, + ConfigMultiElement, +) +from dstack._internal.core.models.backends.nebius import ( + NebiusConfigInfo, + NebiusConfigInfoWithCreds, + NebiusConfigInfoWithCredsPartial, + NebiusConfigValues, + NebiusCreds, + NebiusStoredConfig, +) +from dstack._internal.server.models import BackendModel, ProjectModel +from dstack._internal.server.services.backends import Configurator +from dstack._internal.server.services.backends.configurators.base import ( + raise_invalid_credentials_error, +) + +REGIONS = ["eu-north1-c"] + + +class NebiusConfigurator(Configurator): + TYPE: BackendType = BackendType.NEBIUS + + def get_default_configs(self) -> List[NebiusConfigInfoWithCreds]: + return [] + + def get_config_values(self, config: NebiusConfigInfoWithCredsPartial) -> NebiusConfigValues: + config_values = NebiusConfigValues() + if config.creds is None: + return config_values + self._validate_nebius_creds(config.creds) + # TODO(egor-s) cloud_id + # TODO(egor-s) folder_id + # TODO(egor-s) network_id + config_values.regions = self._get_regions_element(selected=config.regions or []) + return config_values + + def create_backend( + self, project: ProjectModel, config: NebiusConfigInfoWithCreds + ) -> BackendModel: + if config.regions is None: + config.regions = REGIONS + self._validate_nebius_creds(config.creds) + return BackendModel( + project_id=project.id, + type=self.TYPE.value, + config=NebiusStoredConfig.parse_obj(config).json(), + auth=NebiusCreds.parse_obj(config.creds).json(), + ) + + def get_config_info(self, model: BackendModel, include_creds: bool) -> NebiusConfigInfo: + config = self._get_backend_config(model) + if include_creds: + return NebiusConfigInfoWithCreds.parse_obj(config) + return NebiusConfigInfo.parse_obj(config) + + def get_backend(self, model: BackendModel) -> Backend: + config = self._get_backend_config(model) + return NebiusBackend(config=config) + + 
def _get_backend_config(self, model: BackendModel) -> NebiusConfig: + return NebiusConfig( + **json.loads(model.config), + creds=NebiusCreds.parse_raw(model.auth), + ) + + def _validate_nebius_creds(self, creds: NebiusCreds): + try: + api_client.NebiusAPIClient(json.loads(creds.data)).get_token() + except requests.HTTPError: + raise_invalid_credentials_error(fields=[["creds", "data"]]) + + def _get_regions_element(self, selected: List[str]) -> ConfigMultiElement: + element = ConfigMultiElement(selected=selected) + for r in REGIONS: + element.values.append(ConfigElementValue(value=r, label=r)) + return element diff --git a/src/dstack/_internal/server/services/config.py b/src/dstack/_internal/server/services/config.py index d7fdab4bc..f9b797bee 100644 --- a/src/dstack/_internal/server/services/config.py +++ b/src/dstack/_internal/server/services/config.py @@ -45,16 +45,7 @@ class GCPServiceAccountCreds(ForbidExtra): @root_validator def fill_data(cls, values): - if values.get("data") is not None: - return values - if "filename" not in values: - raise ValueError() - try: - with open(Path(values["filename"]).expanduser()) as f: - values["data"] = f.read() - except OSError: - raise ValueError(f"No such file {values['filename']}") - return values + return _fill_data(values) class GCPDefaultCreds(ForbidExtra): @@ -77,6 +68,28 @@ class LambdaConfig(ForbidExtra): creds: AnyLambdaCreds +class NebiusServiceAccountCreds(ForbidExtra): + type: Literal["service_account"] = "service_account" + filename: str + data: Optional[str] = None + + @root_validator + def fill_data(cls, values): + return _fill_data(values) + + +AnyNebiusCreds = Union[NebiusServiceAccountCreds] + + +class NebiusConfig(ForbidExtra): + type: Literal["nebius"] = "nebius" + cloud_id: str + folder_id: str + network_id: str + regions: Optional[List[str]] = None + creds: AnyNebiusCreds + + class TensorDockConfig(ForbidExtra): type: Literal["tensordock"] = "tensordock" regions: Optional[List[str]] = None @@ -98,6 +111,7 @@ class DstackConfig(ForbidExtra): AzureConfig, GCPConfig, LambdaConfig, + NebiusConfig, TensorDockConfig, VastAIConfig, DstackConfig, @@ -251,3 +265,16 @@ def _config_to_internal_config(backend_config: BackendConfig) -> AnyConfigInfoWi if backend_config.type == "azure": config_info.__root__.locations = backend_config.regions return config_info.__root__ + + +def _fill_data(values: dict): + if values.get("data") is not None: + return values + if "filename" not in values: + raise ValueError() + try: + with open(Path(values["filename"]).expanduser()) as f: + values["data"] = f.read() + except OSError: + raise ValueError(f"No such file {values['filename']}") + return values diff --git a/src/tests/_internal/server/routers/test_backends.py b/src/tests/_internal/server/routers/test_backends.py index d971a3be8..450e00558 100644 --- a/src/tests/_internal/server/routers/test_backends.py +++ b/src/tests/_internal/server/routers/test_backends.py @@ -25,7 +25,15 @@ class TestListBackendTypes: def test_returns_backend_types(self): response = client.post("/api/backends/list_types") assert response.status_code == 200, response.json() - assert response.json() == ["aws", "azure", "gcp", "lambda", "tensordock", "vastai"] + assert response.json() == [ + "aws", + "azure", + "gcp", + "lambda", + "nebius", + "tensordock", + "vastai", + ] class TestGetBackendConfigValuesAWS:
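Below is a minimal sketch of exercising the new `NebiusAPIClient` by hand, e.g. to smoke-test credentials. The folder ID is a placeholder, the image name follows the `dstack-<clean version>` scheme from `locals.pkr.hcl`, and the service-account JSON is the same kind of key file the CI job writes from the `NEBIUS_SERVICE_ACCOUNT` secret:

```python
import json

from dstack._internal.core.backends.nebius.api_client import NebiusAPIClient

# Service-account key with the fields declared in the ServiceAccount TypedDict
# (types.py): id, service_account_id, private_key, etc.
with open("service_account.json") as f:
    sa = json.load(f)

client = NebiusAPIClient(sa)
client.get_token()  # signs a PS256 JWT and exchanges it for an IAM token

print(client.compute_zones_list())

# Placeholder folder ID; NebiusAPIClient.list() follows nextPageToken pagination.
images = client.compute_images_list(folder_id="b1gexamplefolder", filter='name="dstack-0-4rc3"')
for image in images:
    print(image["id"], image["name"])
```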