Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feat: Optionally add container cost to container request #1013

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 55 additions & 0 deletions pkg/clients/container_cost.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
package clients

import (
	"bytes"
	"encoding/json"
	"fmt"
	"net/http"
	"time"

	"github.com/beam-cloud/beta9/pkg/types"
)

// ContainerCostResponse is the JSON payload decoded from the container cost
// hook's HTTP response.
type ContainerCostResponse struct {
	// CostPerMs is read from the "cost_per_ms" field of the response body.
	CostPerMs float64 `json:"cost_per_ms"`
}

// ContainerCostClient queries an external HTTP hook for the per-millisecond
// cost of a container request.
type ContainerCostClient struct {
	// client is the HTTP client used to send requests to the hook.
	client *http.Client
	// endpoint is the URL that container requests are POSTed to.
	endpoint string
	// token is sent as a Bearer token in the Authorization header.
	token string
}

// containerCostHookTimeout bounds a single round trip to the container cost
// hook so that an unresponsive endpoint cannot block its caller indefinitely.
const containerCostHookTimeout = 10 * time.Second

// NewContainerCostClient builds a ContainerCostClient from the hook
// configuration, copying the endpoint and token from config.
//
// The underlying HTTP client is created with a request timeout; the zero-value
// http.Client has none and would wait forever on a stalled connection.
func NewContainerCostClient(config types.ContainerCostHookConfig) *ContainerCostClient {
	return &ContainerCostClient{
		client:   &http.Client{Timeout: containerCostHookTimeout},
		endpoint: config.Endpoint,
		token:    config.Token,
	}
}

// GetContainerCostPerMs POSTs the JSON-encoded container request to the
// configured hook endpoint, authenticating with the client's Bearer token, and
// returns the "cost_per_ms" value from the JSON response.
//
// It returns 0 and a non-nil error if the request cannot be encoded, built or
// sent, if the hook answers with a non-2xx status, or if the response body
// cannot be decoded.
func (c *ContainerCostClient) GetContainerCostPerMs(request *types.ContainerRequest) (float64, error) {
	var requestBody bytes.Buffer
	if err := json.NewEncoder(&requestBody).Encode(request); err != nil {
		return 0, fmt.Errorf("encoding container cost request: %w", err)
	}

	req, err := http.NewRequest(http.MethodPost, c.endpoint, &requestBody)
	if err != nil {
		return 0, fmt.Errorf("building container cost request: %w", err)
	}

	req.Header.Set("Content-Type", "application/json")
	req.Header.Set("Authorization", fmt.Sprintf("Bearer %s", c.token))

	resp, err := c.client.Do(req)
	if err != nil {
		return 0, fmt.Errorf("calling container cost hook: %w", err)
	}
	// The body must always be closed, otherwise the underlying connection leaks.
	defer resp.Body.Close()

	// An error response would otherwise decode (or fail to decode) into a
	// misleading cost, so reject anything outside the 2xx range up front.
	if resp.StatusCode < http.StatusOK || resp.StatusCode >= http.StatusMultipleChoices {
		return 0, fmt.Errorf("container cost hook returned status %d", resp.StatusCode)
	}

	var response ContainerCostResponse
	if err := json.NewDecoder(resp.Body).Decode(&response); err != nil {
		return 0, fmt.Errorf("decoding container cost response: %w", err)
	}

	return response.CostPerMs, nil
}
2 changes: 1 addition & 1 deletion pkg/common/config.default.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -251,4 +251,4 @@ abstractions:
bot:
systemPrompt: ""
stepIntervalS: 1
sessionInactivityTimeoutS: 10
sessionInactivityTimeoutS: 10
66 changes: 46 additions & 20 deletions pkg/scheduler/scheduler.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (
"sort"
"time"

"github.com/beam-cloud/beta9/pkg/clients"
"github.com/beam-cloud/beta9/pkg/common"
"github.com/beam-cloud/beta9/pkg/network"
repo "github.com/beam-cloud/beta9/pkg/repository"
Expand All @@ -21,16 +22,17 @@ const (
)

type Scheduler struct {
ctx context.Context
backendRepo repo.BackendRepository
workerRepo repo.WorkerRepository
workerPoolManager *WorkerPoolManager
requestBacklog *RequestBacklog
containerRepo repo.ContainerRepository
workspaceRepo repo.WorkspaceRepository
eventRepo repo.EventRepository
schedulerMetrics SchedulerMetrics
eventBus *common.EventBus
ctx context.Context
backendRepo repo.BackendRepository
workerRepo repo.WorkerRepository
workerPoolManager *WorkerPoolManager
requestBacklog *RequestBacklog
containerRepo repo.ContainerRepository
workspaceRepo repo.WorkspaceRepository
eventRepo repo.EventRepository
schedulerMetrics SchedulerMetrics
eventBus *common.EventBus
containerCostClient *clients.ContainerCostClient
}

func NewScheduler(ctx context.Context, config types.AppConfig, redisClient *common.RedisClient, metricsRepo repo.MetricsRepository, backendRepo repo.BackendRepository, workspaceRepo repo.WorkspaceRepository, tailscale *network.Tailscale) (*Scheduler, error) {
Expand All @@ -44,6 +46,11 @@ func NewScheduler(ctx context.Context, config types.AppConfig, redisClient *comm
schedulerMetrics := NewSchedulerMetrics(metricsRepo)
eventRepo := repo.NewTCPEventClientRepo(config.Monitoring.FluentBit.Events)

var containerCostClient *clients.ContainerCostClient = nil
if config.GatewayService.ContainerCostHook.Endpoint != "" {
containerCostClient = clients.NewContainerCostClient(config.GatewayService.ContainerCostHook)
}

// Load worker pools
workerPoolManager := NewWorkerPoolManager(config.Worker.Failover.Enabled)
for name, pool := range config.Worker.Pools {
Expand Down Expand Up @@ -92,16 +99,17 @@ func NewScheduler(ctx context.Context, config types.AppConfig, redisClient *comm
}

return &Scheduler{
ctx: ctx,
eventBus: eventBus,
backendRepo: backendRepo,
workerRepo: workerRepo,
workerPoolManager: workerPoolManager,
requestBacklog: requestBacklog,
containerRepo: containerRepo,
schedulerMetrics: schedulerMetrics,
eventRepo: eventRepo,
workspaceRepo: workspaceRepo,
ctx: ctx,
eventBus: eventBus,
backendRepo: backendRepo,
workerRepo: workerRepo,
workerPoolManager: workerPoolManager,
requestBacklog: requestBacklog,
containerRepo: containerRepo,
schedulerMetrics: schedulerMetrics,
eventRepo: eventRepo,
workspaceRepo: workspaceRepo,
containerCostClient: containerCostClient,
}, nil
}

Expand Down Expand Up @@ -251,6 +259,9 @@ func (s *Scheduler) StartProcessingRequests() {
continue
}

// Add the container cost per ms to the request if a cost endpoint is provided
s.addContainerCostPerMs(request)

// Find a worker to schedule ContainerRequests on
worker, err := s.selectWorker(request)
if err != nil || worker == nil {
Expand Down Expand Up @@ -495,3 +506,18 @@ func calculateBackoffDelay(retryCount int) time.Duration {
}
return delay
}

// addContainerCostPerMs sets request.CostPerMs from the container cost hook.
//
// It is a no-op when no cost client is configured. If the lookup fails, the
// error is logged and the request is left untouched: the value returned
// alongside an error is meaningless and must not overwrite request.CostPerMs.
func (s *Scheduler) addContainerCostPerMs(request *types.ContainerRequest) {
	if s.containerCostClient == nil {
		return
	}

	costPerMs, err := s.containerCostClient.GetContainerCostPerMs(request)
	if err != nil {
		log.Error().Str("container_id", request.ContainerId).Err(err).Msg("unable to get container cost per ms")
		return
	}

	request.CostPerMs = costPerMs
}
18 changes: 12 additions & 6 deletions pkg/types/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -115,13 +115,19 @@ type StubLimits struct {
MaxGpuCount uint32 `key:"maxGpuCount" json:"max_gpu_count"`
}

// ContainerCostHookConfig holds the settings for the optional container cost
// hook.
type ContainerCostHookConfig struct {
	// Endpoint is the hook address, loaded from the "endpoint" config key.
	Endpoint string `key:"endpoint" json:"endpoint"`
	// Token is the credential for the hook, loaded from the "token" config key.
	Token string `key:"token" json:"token"`
}

type GatewayServiceConfig struct {
Host string `key:"host" json:"host"`
InvokeURLType string `key:"invokeURLType" json:"invoke_url_type"`
GRPC GRPCConfig `key:"grpc" json:"grpc"`
HTTP HTTPConfig `key:"http" json:"http"`
ShutdownTimeout time.Duration `key:"shutdownTimeout" json:"shutdown_timeout"`
StubLimits StubLimits `key:"stubLimits" json:"stub_limits"`
Host string `key:"host" json:"host"`
InvokeURLType string `key:"invokeURLType" json:"invoke_url_type"`
GRPC GRPCConfig `key:"grpc" json:"grpc"`
HTTP HTTPConfig `key:"http" json:"http"`
ShutdownTimeout time.Duration `key:"shutdownTimeout" json:"shutdown_timeout"`
StubLimits StubLimits `key:"stubLimits" json:"stub_limits"`
ContainerCostHook ContainerCostHookConfig `key:"containerCostHook" json:"container_cost_hook"`
}

type FileServiceConfig struct {
Expand Down
1 change: 1 addition & 0 deletions pkg/types/metric.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ var (

// Worker keys
MetricsWorkerContainerDuration = "container_duration_milliseconds"
MetricsWorkerContainerCost = "container_cost_cents"
)

type TaskMetrics struct {
Expand Down
1 change: 1 addition & 0 deletions pkg/types/scheduler.go
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,7 @@ type ContainerRequest struct {
CheckpointEnabled bool `json:"checkpoint_enabled"`
BuildOptions BuildOptions `json:"build_options"`
Ports []uint32 `json:"ports"`
CostPerMs float64 `json:"cost_per_ms"`
}

func (c *ContainerRequest) RequiresGPU() bool {
Expand Down
2 changes: 2 additions & 0 deletions pkg/worker/lifecycle.go
Original file line number Diff line number Diff line change
Expand Up @@ -649,6 +649,8 @@ func (s *Worker) spawn(request *types.ContainerRequest, spec *specs.Spec, output
}

// Log metrics

log.Info().Str("container_id", request.ContainerId).Str("cost_per_ms", fmt.Sprintf("%f", request.CostPerMs)).Msg("container cost per ms")
go s.workerMetrics.EmitContainerUsage(ctx, request)
go s.eventRepo.PushContainerStartedEvent(containerId, s.workerId, request)
defer func() { go s.eventRepo.PushContainerStoppedEvent(containerId, s.workerId, request) }()
Expand Down
17 changes: 17 additions & 0 deletions pkg/worker/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,21 @@ func (wm *WorkerMetrics) metricsContainerDuration(request *types.ContainerReques
}, float64(duration.Milliseconds()))
}

// metricsContainerCost increments the container cost counter by the cost
// accrued over duration, computed as request.CostPerMs multiplied by the
// elapsed whole milliseconds. The counter is labelled with the container,
// worker, stub and workspace identifiers and the request's resource settings.
//
// NOTE(review): cost_per_ms and duration_ms are passed as labels; if the
// metrics backend indexes label values this may create many distinct series —
// confirm this is intended.
func (wm *WorkerMetrics) metricsContainerCost(request *types.ContainerRequest, duration time.Duration) {
	elapsedMs := duration.Milliseconds()
	accruedCost := request.CostPerMs * float64(elapsedMs)

	labels := map[string]interface{}{
		"container_id":   request.ContainerId,
		"worker_id":      wm.workerId,
		"stub_id":        request.StubId,
		"workspace_id":   request.WorkspaceId,
		"cpu_millicores": request.Cpu,
		"mem_mb":         request.Memory,
		"gpu":            request.Gpu,
		"gpu_count":      request.GpuCount,
		"cost_per_ms":    request.CostPerMs,
		"duration_ms":    elapsedMs,
	}

	wm.metricsRepo.IncrementCounter(types.MetricsWorkerContainerCost, labels, accruedCost)
}

// Periodically send metrics to track container duration
func (wm *WorkerMetrics) EmitContainerUsage(ctx context.Context, request *types.ContainerRequest) {
cursorTime := time.Now()
Expand All @@ -57,10 +72,12 @@ func (wm *WorkerMetrics) EmitContainerUsage(ctx context.Context, request *types.
select {
case <-ticker.C:
go wm.metricsContainerDuration(request, time.Since(cursorTime))
go wm.metricsContainerCost(request, time.Since(cursorTime))
cursorTime = time.Now()
case <-ctx.Done():
// Consolidate any remaining time
go wm.metricsContainerDuration(request, time.Since(cursorTime))
go wm.metricsContainerCost(request, time.Since(cursorTime))
return
}
}
Expand Down
Loading