From d69b0470e392c53b9845645f5f1a615dd751179f Mon Sep 17 00:00:00 2001 From: Victor Skvortsov Date: Thu, 10 Aug 2023 11:33:29 +0500 Subject: [PATCH] Fix runner not terminating on max duration exceeded The behaviour was that the runner always stopped instead of respecting the termination policy. Since stopping is not supported for all backends, the instance may continue to run. --- runner/consts/consts.go | 5 +++++ runner/internal/executor/executor.go | 22 +++++++++++----------- 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/runner/consts/consts.go b/runner/consts/consts.go index e66ec05a1..0fabfed53 100644 --- a/runner/consts/consts.go +++ b/runner/consts/consts.go @@ -39,3 +39,8 @@ const DELAY_READ_STATUS = 5 * time.Second const REPO_HTTPS_URL = "https://%s/%s/%s.git" const REPO_GIT_URL = "git@%s:%s/%s.git" + +const ( + TERMINATE_POLICY = "terminate" + STOP_POLICY = "stop" +) diff --git a/runner/internal/executor/executor.go b/runner/internal/executor/executor.go index 8acc8329a..5b194ad21 100644 --- a/runner/internal/executor/executor.go +++ b/runner/internal/executor/executor.go @@ -6,8 +6,6 @@ import ( "encoding/json" "errors" "fmt" - "github.com/docker/go-connections/nat" - "github.com/dstackai/dstack/runner/internal/gateway" "io" "os" "path" @@ -15,6 +13,9 @@ import ( "strconv" "time" + "github.com/docker/go-connections/nat" + "github.com/dstackai/dstack/runner/internal/gateway" + "github.com/dstackai/dstack/runner/internal/backend/base" "github.com/dstackai/dstack/runner/internal/models" @@ -143,6 +144,14 @@ func (ex *Executor) Run(ctx context.Context) error { if err != nil { return gerrors.Wrap(err) } + if job.MaxDurationExceeded() { + log.Info(runCtx, "Job max duration exceeded") + if job.TerminationPolicy == consts.STOP_POLICY { + job.Status = states.Stopping + } else { + job.Status = states.Terminating + } + } if job.Status == states.Stopping { log.Info(runCtx, "Stopped") ex.Stop(false) @@ -160,15 +169,6 @@ func (ex *Executor) Run(ctx context.Context) error { _ = ex.backend.UpdateState(runCtx) return errRun } - if job.MaxDurationExceeded() { - log.Info(runCtx, "Job max duration exceeded. Stopping...") - ex.Stop(false) - log.Info(runCtx, "Waiting job end") - errRun := <-erCh - job.Status = states.Stopped - _ = ex.backend.UpdateState(runCtx) - return errRun - } case <-ctx.Done(): log.Info(runCtx, "Stopped") ex.Stop(true)