From b3f3621cd11d0e77dd7ab368d348ef3288e28068 Mon Sep 17 00:00:00 2001 From: sundargates Date: Mon, 11 Dec 2023 12:59:20 -0800 Subject: [PATCH] Handling failure to initialize job clusters better (#598) * Handling failure to initialize job clusters better * Running testcontainers in a separate build * Increase the number of retries * When end instant is not set in mantis metadata use the default value * Disbaling findbugs * One more bug * Fixing the log line --------- Co-authored-by: Sundaram Ananthanarayanan --- .github/workflows/nebula-ci.yml | 45 ++++++++++++++++++- .../master/JobClustersManagerActor.java | 3 +- .../master/jobcluster/CompletedJobStore.java | 11 ++--- .../master/jobcluster/JobClusterActor.java | 38 +++++++++------- .../jobcluster/proto/JobClusterProto.java | 10 ++--- .../test/java/TestContainerHelloWorld.java | 2 +- 6 files changed, 79 insertions(+), 30 deletions(-) diff --git a/.github/workflows/nebula-ci.yml b/.github/workflows/nebula-ci.yml index 4c430cb7c..4af389c3e 100644 --- a/.github/workflows/nebula-ci.yml +++ b/.github/workflows/nebula-ci.yml @@ -36,7 +36,7 @@ jobs: restore-keys: | - ${{ runner.os }}-gradlewrapper- - name: Build with Gradle - run: ./gradlew --info --stacktrace build --warning-mode=all + run: ./gradlew --info --stacktrace build -x mantis-testcontainers:test --warning-mode=all env: CI_NAME: github_actions CI_BUILD_NUMBER: ${{ github.sha }} @@ -59,3 +59,46 @@ jobs: with: name: Event File path: ${{ github.event_path }} + + integration-test: + runs-on: ubuntu-latest + strategy: + matrix: + # test against JDK 8 + java: [ 8 ] + name: Integration Tests with Java ${{ matrix.java }} + steps: + - uses: actions/checkout@v1 + - name: Setup jdk + uses: actions/setup-java@v1 + with: + java-version: ${{ matrix.java }} + - uses: actions/cache@v1 + id: gradle-cache + with: + path: ~/.gradle/caches + key: ${{ runner.os }}-gradle-${{ hashFiles('**/gradle/dependency-locks/*.lockfile') }} + restore-keys: | + - ${{ runner.os }}-gradle- + - uses: actions/cache@v1 + id: gradle-wrapper-cache + with: + path: ~/.gradle/wrapper + key: ${{ runner.os }}-gradlewrapper-${{ hashFiles('gradle/wrapper/*') }} + restore-keys: | + - ${{ runner.os }}-gradlewrapper- + - name: Build with Gradle + run: ./gradlew --info --stacktrace mantis-testcontainers:test --warning-mode=all + env: + CI_NAME: github_actions + CI_BUILD_NUMBER: ${{ github.sha }} + CI_BUILD_URL: 'https://github.com/${{ github.repository }}' + CI_BRANCH: ${{ github.ref }} + COVERALLS_REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Upload Test Results + uses: actions/upload-artifact@v3 + if: always() + with: + name: Unit Test Results + path: "**/build/test-results/**/*.xml" + diff --git a/mantis-control-plane/mantis-control-plane-server/src/main/java/io/mantisrx/master/JobClustersManagerActor.java b/mantis-control-plane/mantis-control-plane-server/src/main/java/io/mantisrx/master/JobClustersManagerActor.java index a32fa012a..608f00e6e 100644 --- a/mantis-control-plane/mantis-control-plane-server/src/main/java/io/mantisrx/master/JobClustersManagerActor.java +++ b/mantis-control-plane/mantis-control-plane-server/src/main/java/io/mantisrx/master/JobClustersManagerActor.java @@ -369,7 +369,8 @@ private void initialize(JobClustersManagerInitialize initMsg) { List completedJobsList = Lists.newArrayList(); JobClusterProto.InitializeJobClusterRequest req = new JobClusterProto.InitializeJobClusterRequest((JobClusterDefinitionImpl) jobClusterMeta.getJobClusterDefinition(), - jobClusterMeta.isDisabled(), jobClusterMeta.getLastJobCount(), jobList, completedJobsList, "system", getSelf(), false); + jobClusterMeta.isDisabled(), jobClusterMeta.getLastJobCount(), jobList, + "system", getSelf(), false); return jobClusterInfoManager.initializeCluster(jobClusterInfo, req, t); diff --git a/mantis-control-plane/mantis-control-plane-server/src/main/java/io/mantisrx/master/jobcluster/CompletedJobStore.java b/mantis-control-plane/mantis-control-plane-server/src/main/java/io/mantisrx/master/jobcluster/CompletedJobStore.java index 43451d0b2..351927708 100644 --- a/mantis-control-plane/mantis-control-plane-server/src/main/java/io/mantisrx/master/jobcluster/CompletedJobStore.java +++ b/mantis-control-plane/mantis-control-plane-server/src/main/java/io/mantisrx/master/jobcluster/CompletedJobStore.java @@ -30,6 +30,7 @@ import io.mantisrx.server.master.persistence.MantisJobStore; import io.mantisrx.shaded.com.google.common.annotations.VisibleForTesting; import java.io.IOException; +import java.time.Instant; import java.util.Comparator; import java.util.HashMap; import java.util.List; @@ -204,8 +205,8 @@ private CompletedJobEntry fetchJobId(JobId jId) { /** * If job data exists in cache return it else call getArchiveJob * - * @param jId - * @return + * @param jId job id + * @return job metadata if found else empty */ @Override public Optional getJobMetadata(JobId jId) throws IOException { @@ -253,7 +254,7 @@ private CompletedJob fromMantisJobMetadata(IMantisJobMetadata jobMetadata) { jobMetadata.getJobDefinition().getVersion(), jobMetadata.getState(), jobMetadata.getSubmittedAtInstant().toEpochMilli(), - jobMetadata.getEndedAtInstant().get().toEpochMilli(), + jobMetadata.getEndedAtInstant().orElse(Instant.ofEpochMilli(0l)).toEpochMilli(), jobMetadata.getUser(), jobMetadata.getLabels()); } @@ -324,9 +325,9 @@ private void addCompletedJobToCache(CompletedJob completedJob, /** * Bulk add completed jobs to cache * - * @param completedJobsList + * @param completedJobsList list of completed jobs */ - private void addCompletedJobsToCache(List completedJobsList) throws IOException { + private void addCompletedJobsToCache(List completedJobsList) { if (!completedJobsList.isEmpty()) { Map cache = completedJobsList.stream() .flatMap(compJob -> { diff --git a/mantis-control-plane/mantis-control-plane-server/src/main/java/io/mantisrx/master/jobcluster/JobClusterActor.java b/mantis-control-plane/mantis-control-plane-server/src/main/java/io/mantisrx/master/jobcluster/JobClusterActor.java index 9e507a1db..1878ab5fa 100644 --- a/mantis-control-plane/mantis-control-plane-server/src/main/java/io/mantisrx/master/jobcluster/JobClusterActor.java +++ b/mantis-control-plane/mantis-control-plane-server/src/main/java/io/mantisrx/master/jobcluster/JobClusterActor.java @@ -703,7 +703,15 @@ public void onJobClusterInitialize(JobClusterProto.InitializeJobClusterRequest i if(jobClusterMetadata.isDisabled()) { logger.info("Cluster {} initialized but is Disabled", jobClusterMetadata .getJobClusterDefinition().getName()); - jobManager.initialize(); + try { + jobManager.initialize(); + } catch (Exception e) { + sender.tell(new JobClusterProto.InitializeJobClusterResponse(initReq.requestId, CLIENT_ERROR, + String.format("JobCluster %s initialization failed", + initReq.jobClusterDefinition.getName()), + initReq.jobClusterDefinition.getName(), initReq.requestor), getSelf()); + } + int count = 50; if(!initReq.jobList.isEmpty()) { logger.info("Cluster {} is disabled however it has {} active/accepted jobs", @@ -783,7 +791,14 @@ public void onJobClusterInitialize(JobClusterProto.InitializeJobClusterRequest i initRunningJobs(initReq, sender); logger.info("Job expiry check frequency set to {}", expireFrequency); - jobManager.initialize(); + try { + jobManager.initialize(); + } catch (Exception e) { + sender.tell(new JobClusterProto.InitializeJobClusterResponse(initReq.requestId, CLIENT_ERROR, + String.format("JobCluster %s initialization failed", + initReq.jobClusterDefinition.getName()), + initReq.jobClusterDefinition.getName(), initReq.requestor), getSelf()); + } } } @@ -798,14 +813,11 @@ public void onJobClusterInitialize(JobClusterProto.InitializeJobClusterRequest i */ private void initRunningJobs(JobClusterProto.InitializeJobClusterRequest initReq, ActorRef sender) { - List completedJobsList = initReq.completedJobsList; List jobList = initReq.jobList; - logger.info("In _initJobs for cluster {}: {} activeJobs and {} completedJobs", name, jobList.size(), - completedJobsList.size()); + logger.info("In _initJobs for cluster {}: {} activeJobs", name, jobList.size()); if (logger.isDebugEnabled()) { - logger.debug("In _initJobs for cluster {} activeJobs -> {} and completedJobs -> {}", name, jobList, - completedJobsList); + logger.debug("In _initJobs for cluster {} activeJobs -> {}", name, jobList); } Observable.from(jobList) @@ -2609,14 +2621,10 @@ final static class JobManager { this.costsCalculator = costsCalculator; } - void initialize() { - try { - logger.debug("Loading completed jobs for cluster {}", name); - completedJobStore.initialize(); - logger.debug("Initialized completed job store for cluster {}", name); - } catch (IOException e) { - logger.error("Could not initialize completed job store for cluster {}", name, e); - } + void initialize() throws IOException { + logger.debug("Loading completed jobs for cluster {}", name); + completedJobStore.initialize(); + logger.debug("Initialized completed job store for cluster {}", name); } public void onJobClusterDeletion() throws IOException { diff --git a/mantis-control-plane/mantis-control-plane-server/src/main/java/io/mantisrx/master/jobcluster/proto/JobClusterProto.java b/mantis-control-plane/mantis-control-plane-server/src/main/java/io/mantisrx/master/jobcluster/proto/JobClusterProto.java index bf104833e..594b1e528 100644 --- a/mantis-control-plane/mantis-control-plane-server/src/main/java/io/mantisrx/master/jobcluster/proto/JobClusterProto.java +++ b/mantis-control-plane/mantis-control-plane-server/src/main/java/io/mantisrx/master/jobcluster/proto/JobClusterProto.java @@ -23,7 +23,6 @@ import io.mantisrx.master.jobcluster.job.JobState; import io.mantisrx.server.core.JobCompletedReason; import io.mantisrx.server.master.domain.JobClusterDefinitionImpl; -import io.mantisrx.server.master.domain.JobClusterDefinitionImpl.CompletedJob; import io.mantisrx.server.master.domain.JobDefinition; import io.mantisrx.server.master.domain.JobId; import io.mantisrx.shaded.com.google.common.collect.Lists; @@ -48,20 +47,19 @@ public static final class InitializeJobClusterRequest extends BaseRequest { public final long lastJobNumber; public final boolean createInStore; public final List jobList; - public final List completedJobsList; + /** * Invoked directly during bootstrap * @param jobClusterDefinition * @param isDisabled * @param lastJobNumber * @param jobList - * @param completedJobsList * @param user * @param requestor * @param createInStore */ public InitializeJobClusterRequest(final JobClusterDefinitionImpl jobClusterDefinition, boolean isDisabled, long lastJobNumber, - List jobList, List completedJobsList, String user, ActorRef requestor, boolean createInStore) { + List jobList, String user, ActorRef requestor, boolean createInStore) { super(); Preconditions.checkNotNull(jobClusterDefinition, "JobClusterDefn cannot be null"); this.jobClusterDefinition = jobClusterDefinition; @@ -71,7 +69,6 @@ public InitializeJobClusterRequest(final JobClusterDefinitionImpl jobClusterDefi this.isDisabled = isDisabled; this.lastJobNumber = lastJobNumber; this.jobList = jobList; - this.completedJobsList = completedJobsList; } /** @@ -81,7 +78,7 @@ public InitializeJobClusterRequest(final JobClusterDefinitionImpl jobClusterDefi * @param requestor */ public InitializeJobClusterRequest(final JobClusterDefinitionImpl jobClusterDefinition, String user, ActorRef requestor) { - this(jobClusterDefinition, false, 0, Lists.newArrayList(), Lists.newArrayList(), user, requestor, true); + this(jobClusterDefinition, false, 0, Lists.newArrayList(), user, requestor, true); } @@ -95,7 +92,6 @@ public String toString() { ", lastJobNumber=" + lastJobNumber + ", createInStore=" + createInStore + ", jobList=" + jobList + - ", completedJobsList=" + completedJobsList + '}'; } } diff --git a/mantis-testcontainers/src/test/java/TestContainerHelloWorld.java b/mantis-testcontainers/src/test/java/TestContainerHelloWorld.java index 462d355ad..28628b8b3 100644 --- a/mantis-testcontainers/src/test/java/TestContainerHelloWorld.java +++ b/mantis-testcontainers/src/test/java/TestContainerHelloWorld.java @@ -200,7 +200,7 @@ public void testQuickSubmitJob() throws IOException, InterruptedException { if (!ensureJobWorkerStarted( controlPlaneHost, controlPlanePort, - 5, + 10, Duration.ofSeconds(2).toMillis())) { fail("Failed to start job worker."); }