Skip to content

Commit

Permalink
Add Nebius backend (#770)
Browse files Browse the repository at this point in the history
* Nebius configs & API client implementation

* Implement Nebius compute

* Nebius tweaks

* Fix list backends test

* Pre-install docker on CPU Nebius instance

* Run GPU instances in Nebius

* Move packer directory

* Build Nebius images using Packer with HCL2

* Use pre-built VM images for Nebius

* Build Nebius images in CI

* Remove outer JSON quotes

* Create JSON with service account

* Fix create-json version

* Remove extra dash

* Fix Nebius images build

* Build Nebius images in a single job
  • Loading branch information
Egor-S authored Nov 10, 2023
1 parent 005a83e commit 50698b8
Show file tree
Hide file tree
Showing 39 changed files with 974 additions and 26 deletions.
55 changes: 49 additions & 6 deletions .github/workflows/docker.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,10 @@ on:
description: "Build GCP images"
type: boolean
default: true
build_nebius:
description: "Build Nebius images"
type: boolean
default: true

env:
PACKER_VERSION: "1.9.2"
Expand Down Expand Up @@ -66,7 +70,7 @@ jobs:
if: always() && inputs.build_aws && (needs.build-docker.result == 'success' || needs.build-docker.result == 'skipped')
defaults:
run:
working-directory: runner
working-directory: packer
runs-on: ubuntu-latest
env:
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
Expand All @@ -81,7 +85,6 @@ jobs:
wget https://releases.hashicorp.com/packer/${{ env.PACKER_VERSION }}/packer_${{ env.PACKER_VERSION }}_linux_amd64.zip
unzip packer_${{ env.PACKER_VERSION }}_linux_amd64.zip
chmod +x packer
cp -R ami/packer/* .
- name: Run packer
run: |
./packer build -var-file=versions.json $PROD_VARS -var image_version=${{ inputs.image_version }} -var build_prefix=$BUILD_PREFIX aws-image${{ matrix.variant }}.json
Expand All @@ -93,7 +96,7 @@ jobs:
if: always() && inputs.build_azure && (needs.build-docker.result == 'success' || needs.build-docker.result == 'skipped')
defaults:
run:
working-directory: runner
working-directory: packer
runs-on: ubuntu-latest
env:
AZURE_CLIENT_ID: ${{ secrets.AZURE_CLIENT_ID }}
Expand All @@ -115,7 +118,6 @@ jobs:
wget https://releases.hashicorp.com/packer/${{ env.PACKER_VERSION }}/packer_${{ env.PACKER_VERSION }}_linux_amd64.zip
unzip packer_${{ env.PACKER_VERSION }}_linux_amd64.zip
chmod +x packer
cp -R ami/packer/* .
- name: Run packer
run: |
./packer build -var-file=versions.json -var image_version=${{ inputs.image_version }} -var build_prefix=$BUILD_PREFIX azure-image${{ matrix.variant }}.json
Expand All @@ -131,7 +133,7 @@ jobs:
if: always() && inputs.build_gcp && (needs.build-docker.result == 'success' || needs.build-docker.result == 'skipped')
defaults:
run:
working-directory: runner
working-directory: packer
runs-on: ubuntu-latest
strategy:
matrix:
Expand All @@ -154,7 +156,6 @@ jobs:
wget https://releases.hashicorp.com/packer/${{ env.PACKER_VERSION }}/packer_${{ env.PACKER_VERSION }}_linux_amd64.zip
unzip packer_${{ env.PACKER_VERSION }}_linux_amd64.zip
chmod +x packer
cp -R ami/packer/* .
- name: Run packer
run: |
./packer build -var-file=versions.json -var image_version=${{ inputs.image_version }} -var build_prefix=$BUILD_PREFIX gcp-image${{ matrix.variant }}.json
Expand All @@ -164,3 +165,45 @@ jobs:
gcloud compute images add-iam-policy-binding ${BUILD_PREFIX}dstack${{ matrix.variant }}-$IMAGE_VERSION --member='allAuthenticatedUsers' --role='roles/compute.imageUser'
env:
IMAGE_VERSION: ${{ inputs.image_version }}

# Builds the Nebius VM images (plain and CUDA) from the HCL2 Packer templates in `packer/`.
build-nebius-images:
  needs: build-docker
  # `always()` keeps this job eligible even when build-docker was skipped; it still
  # requires the `build_nebius` input and a build-docker result of success or skipped.
  if: always() && inputs.build_nebius && (needs.build-docker.result == 'success' || needs.build-docker.result == 'skipped')
  defaults:
    run:
      # Applies to `run` steps only; `uses` steps resolve paths on their own.
      working-directory: packer
  runs-on: ubuntu-latest
  steps:
    - uses: actions/checkout@v3
    - name: Get Nebius CLI
      run: |
        echo "CLI_VERSION=$CLI_VERSION"
        curl -sSL https://storage.ai.nebius.cloud/ncp/install.sh | bash
        echo "$HOME/nebius-cloud/bin" >> $GITHUB_PATH
      env:
        CLI_VERSION: 0.113.0+Nebius-AI
    - name: Write Nebius credentials
      # NOTE(review): the action reference was restored from an obfuscated
      # placeholder; confirm the pinned tag before merging.
      uses: jsdaniell/create-json@v1.2.2
      with:
        name: "service_account.json"
        json: ${{ secrets.NEBIUS_SERVICE_ACCOUNT }}
        # Written under packer/ explicitly so the following `run` steps (which
        # execute in packer/) can refer to the file by its bare name.
        dir: "packer/"
    - name: Setup Nebius profile
      # The key file is deleted as soon as the profile has been configured.
      run: |
        ncp config profile create packer
        ncp config set endpoint api.ai.nebius.cloud:443
        ncp config set service-account-key service_account.json
        rm service_account.json
    - name: Download packer
      # `packer init` installs the plugins declared in the templates' `required_plugins`.
      run: |
        wget https://releases.hashicorp.com/packer/${{ env.PACKER_VERSION }}/packer_${{ env.PACKER_VERSION }}_linux_amd64.zip
        unzip packer_${{ env.PACKER_VERSION }}_linux_amd64.zip
        chmod +x packer
        ./packer init .
    - name: Run packer (HCL2)
      # Builds both Nebius sources in one invocation; the token is created at run
      # time and handed to Packer through the PKR_VAR_ environment variable.
      run: |
        export PKR_VAR_nebius_token=$(ncp iam create-token)
        ./packer build -only yandex.nebius,yandex.nebius-cuda -var image_version=${{ inputs.image_version }} -var build_prefix=$BUILD_PREFIX .
      env:
        PKR_VAR_nebius_folder_id: ${{ secrets.NEBIUS_FOLDER_ID }}
        PKR_VAR_nebius_subnet_id: ${{ secrets.NEBIUS_SUBNET_ID }}
23 changes: 23 additions & 0 deletions runner/ami/packer/README.MD → packer/README.MD
Original file line number Diff line number Diff line change
Expand Up @@ -54,3 +54,26 @@ Set environment variables.
| AZURE_TENANT_ID | tenant_id |
| AZURE_SUBSCRIPTION_ID | subscription_id |

# Nebius

## Setup Nebius credentials

> `compute.admin` is not sufficient for packer. Use `admin` role instead.
```shell
ncp config profile create packer
ncp config set service-account-key path/to/service_account.json
ncp config set endpoint api.ai.nebius.cloud:443
export PKR_VAR_nebius_token=$(ncp iam create-token)
```

## Build images

```shell
export PKR_VAR_nebius_folder_id=...
export PKR_VAR_nebius_subnet_id=...
# no CUDA
packer build -only yandex.nebius -var image_version=0.4rc3 .
# with CUDA
packer build -only yandex.nebius-cuda -var image_version=0.4rc3 .
```
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
41 changes: 41 additions & 0 deletions packer/build-cuda-image.pkr.hcl
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# Build definition for the CUDA-enabled dstack VM image.
# Provisioners run in the order they are declared below.
build {
# Reuses the shared `yandex.nebius` source under the name "nebius-cuda", so the
# build can be selected with `-only yandex.nebius-cuda`.
source "source.yandex.nebius" {
name = "nebius-cuda"
image_description = "Ubuntu 22.04 with CUDA, Docker and dstackai/base:cuda images"
image_family = "dstack-cuda"
image_name = "${local.image_name}-cuda"
}
# TODO(egor-s) add other sources

# Block until cloud-init has finished on the build instance before provisioning.
provisioner "shell" {
inline = ["cloud-init status --long --wait"]
}

# System preparation scripts from provisioners/kernel, executed in the listed order.
provisioner "shell" {
scripts = ["provisioners/kernel/apt-upgrade.sh", "provisioners/kernel/apt-daily.sh", "provisioners/kernel/apt-packages.sh", "provisioners/kernel/kernel-tuning.sh"]
}

# Upload the Docker installer script to /tmp on the build instance.
provisioner "file" {
destination = "/tmp/install-docker.sh"
source = "provisioners/install-docker.sh"
}

# Upload `run-docker` next to the installer.
# NOTE(review): presumably consumed by install-docker.sh; verify against that script.
provisioner "file" {
destination = "/tmp/run-docker"
source = "provisioners/run-docker"
}

# Run the uploaded installer with the Docker version pinned in locals.
provisioner "shell" {
inline = ["cd /tmp", "chmod +x install-docker.sh", "./install-docker.sh --version ${local.docker_version}"]
}

# Run provisioners/cuda.sh with the driver version pinned in locals exposed as
# CUDA_DRIVERS_VERSION.
provisioner "shell" {
environment_vars = ["CUDA_DRIVERS_VERSION=${local.cuda_drivers_version}"]
script = "provisioners/cuda.sh"
}

# Run provisioners/docker-image-with-cuda.sh with IMAGE_VERSION set to the requested
# image version.
# NOTE(review): presumably pre-pulls the dstackai/base:cuda images; confirm in the script.
provisioner "shell" {
environment_vars = ["IMAGE_VERSION=${var.image_version}"]
script = "provisioners/docker-image-with-cuda.sh"
}
}
35 changes: 35 additions & 0 deletions packer/build-image.pkr.hcl
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# Build definition for the dstack VM image without CUDA.
# Provisioners run in the order they are declared below.
build {
# Uses the shared `yandex.nebius` source under its default name, so the build can
# be selected with `-only yandex.nebius`.
source "source.yandex.nebius" {
image_description = "Ubuntu 22.04 with Docker and dstackai/base images"
image_family = "dstack"
image_name = local.image_name
}
# TODO(egor-s) add other sources

# Block until cloud-init has finished on the build instance before provisioning.
provisioner "shell" {
inline = ["cloud-init status --long --wait"]
}

# System preparation scripts from provisioners/kernel, executed in the listed order.
provisioner "shell" {
scripts = ["provisioners/kernel/apt-upgrade.sh", "provisioners/kernel/apt-daily.sh", "provisioners/kernel/apt-packages.sh", "provisioners/kernel/kernel-tuning.sh"]
}

# Upload the Docker installer script to /tmp on the build instance.
provisioner "file" {
destination = "/tmp/install-docker.sh"
source = "provisioners/install-docker.sh"
}

# Upload `run-docker` next to the installer.
# NOTE(review): presumably consumed by install-docker.sh; verify against that script.
provisioner "file" {
destination = "/tmp/run-docker"
source = "provisioners/run-docker"
}

# Run the uploaded installer with the Docker version pinned in locals.
provisioner "shell" {
inline = ["cd /tmp", "chmod +x install-docker.sh", "./install-docker.sh --version ${local.docker_version}"]
}

# Run provisioners/docker-image-without-cuda.sh with IMAGE_VERSION set to the
# requested image version.
# NOTE(review): presumably pre-pulls the dstackai/base images; confirm in the script.
provisioner "shell" {
environment_vars = ["IMAGE_VERSION=${var.image_version}"]
script = "provisioners/docker-image-without-cuda.sh"
}
}
8 changes: 8 additions & 0 deletions packer/config.pkr.hcl
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Packer settings: declares the plugins these templates need.
# `packer init .` downloads them before the first build.
packer {
required_plugins {
# The Yandex builder plugin backs the `yandex` source used for Nebius images.
yandex = {
version = ">= 1.1.2"
source = "github.com/hashicorp/yandex"
}
}
}
File renamed without changes.
File renamed without changes.
6 changes: 6 additions & 0 deletions packer/locals.pkr.hcl
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Values shared by all build definitions in this directory.
locals {
# image_version with every character outside [a-z0-9-] replaced by "-"
# (e.g. "0.4rc3" becomes "0-4rc3"); uppercase letters are replaced as well.
# NOTE(review): presumably required by the cloud's image naming rules; confirm.
clean_image_version = regex_replace(var.image_version, "[^a-z0-9-]", "-")
# Base image name: optional build prefix + "dstack-" + sanitized version.
image_name = "${var.build_prefix}dstack-${local.clean_image_version}"
# Version passed to install-docker.sh via `--version`.
docker_version = "20.10.17"
# Version exported to provisioners/cuda.sh as CUDA_DRIVERS_VERSION.
cuda_drivers_version = "535.54.03-1"
}
12 changes: 12 additions & 0 deletions packer/nebius.pkr.hcl
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Shared builder configuration for Nebius images, referenced from the build blocks
# as `source.yandex.nebius`. It uses the `yandex` builder pointed at the Nebius API
# endpoint.
source "yandex" "nebius" {
disk_size_gb = 30
disk_type = "network-ssd"
endpoint = "api.ai.nebius.cloud:443"
folder_id = var.nebius_folder_id
# Base OS image the build instance boots from.
source_image_family = "ubuntu-2204-lts"
ssh_username = "ubuntu"
subnet_id = var.nebius_subnet_id
# IAM token; the README and CI obtain it with `ncp iam create-token`.
token = var.nebius_token
# NOTE(review): presumably gives the build instance external connectivity so Packer
# can reach it and the provisioners can download packages; confirm in the plugin docs.
use_ipv4_nat = true
zone = "eu-north1-c"
}
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
27 changes: 27 additions & 0 deletions packer/variables.pkr.hcl
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Optional prefix prepended to the generated image name; empty by default.
variable "build_prefix" {
type = string
default = ""
}

# Version of the image being built. It has no default, so it must be supplied
# (e.g. `-var image_version=...`).
variable "image_version" {
type = string
}

# Nebius
# Nebius folder used by the `yandex.nebius` source.
# Usually provided through the PKR_VAR_nebius_folder_id environment variable.
# NOTE(review): the null default presumably keeps the variable optional for builds
# that do not target Nebius; confirm.
variable "nebius_folder_id" {
type = string
default = null
sensitive = true
}

# Nebius subnet the build instance is attached to.
# Usually provided through the PKR_VAR_nebius_subnet_id environment variable.
variable "nebius_subnet_id" {
type = string
default = null
sensitive = true
}

# Nebius IAM token used by the builder to authenticate.
# Usually exported as PKR_VAR_nebius_token from `ncp iam create-token`.
variable "nebius_token" {
type = string
default = null
sensitive = true
}
File renamed without changes.
8 changes: 0 additions & 8 deletions runner/cmd/shim/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,14 +30,6 @@ func main() {
Required: true,
Destination: &backendName,
EnvVars: []string{"DSTACK_BACKEND"},
Action: func(c *cli.Context, s string) error {
for _, backend := range []string{"aws", "azure", "gcp", "lambda", "tensordock", "local"} {
if s == backend {
return nil
}
}
return gerrors.Newf("unknown backend %s", s)
},
},
/* Shim Parameters */
&cli.PathFlag{
Expand Down
7 changes: 6 additions & 1 deletion src/dstack/_internal/core/backends/base/compute.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,11 @@ def create_gateway(


def get_user_data(
backend: BackendType, image_name: str, authorized_keys: List[str], registry_auth_required: bool
backend: BackendType,
image_name: str,
authorized_keys: List[str],
registry_auth_required: bool,
cloud_config_kwargs: Optional[dict] = None,
) -> str:
commands = get_shim_commands(
backend=backend,
Expand All @@ -62,6 +66,7 @@ def get_user_data(
return get_cloud_config(
runcmd=[["sh", "-c", " && ".join(commands)]],
ssh_authorized_keys=authorized_keys,
**(cloud_config_kwargs or {}),
)


Expand Down
15 changes: 15 additions & 0 deletions src/dstack/_internal/core/backends/nebius/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
from dstack._internal.core.backends.base import Backend
from dstack._internal.core.backends.nebius.compute import NebiusCompute
from dstack._internal.core.backends.nebius.config import NebiusConfig
from dstack._internal.core.models.backends.base import BackendType


class NebiusBackend(Backend):
    """Backend for Nebius that exposes a `NebiusCompute` built from its config."""

    TYPE: BackendType = BackendType.NEBIUS

    def __init__(self, config: NebiusConfig):
        """Keep `config` and create the compute object from it."""
        self._compute = NebiusCompute(config)
        self.config = config

    def compute(self) -> NebiusCompute:
        """Return the `NebiusCompute` instance created in `__init__`."""
        return self._compute
Loading

0 comments on commit 50698b8

Please sign in to comment.