Skip to content

Commit

Permalink
Merge branch 'master' into support-proactive-relaunch-from-worker
Browse files Browse the repository at this point in the history
  • Loading branch information
BalaBalaYi authored Jan 3, 2025
2 parents 013de3d + 22ead8c commit 1d73743
Show file tree
Hide file tree
Showing 186 changed files with 21,928 additions and 593 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
---
name: main test for the project
name: code-check

on:
pull_request:
Expand Down Expand Up @@ -35,7 +35,7 @@ jobs:
- name: Upload coverage reports to Codecov
uses: codecov/[email protected]
with:
token: 043586e5-68b8-4588-9bf4-c333f2692345
token: 3b0503fb-7c5e-4486-9ddf-2903deb77067
slug: intelligent-machine-learning/dlrover
operator-test:
runs-on: ubuntu-latest
Expand Down
13 changes: 1 addition & 12 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
---
exclude: '^atorch/|^tfplus/'
exclude: '^atorch/|^tfplus/|^xpu_timer/'
repos:
- repo: https://github.com/pre-commit/mirrors-isort
rev: v5.10.1
Expand Down Expand Up @@ -67,14 +67,3 @@ repos:
- id: yamllint
exclude:
(dlrover/python/tests/data|operator/config)
# - repo: https://github.com/igorshubovych/markdownlint-cli
# rev: v0.35.0
# hooks:
# - id: markdownlint
# name: markdownlint
# description: "Checks the style of Markdown files."
# entry: markdownlint --fix --config .markdownlint.yaml
# language: node
# types: [markdown]
# exclude: (dlrover/go/brain/vendor)
# args: [--fix]
3 changes: 3 additions & 0 deletions dlrover/go/master/go.mod
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
module github.com/intelligent-machine-learning/dlrover/go/master

go 1.23.4
9 changes: 7 additions & 2 deletions dlrover/go/operator/pkg/controllers/master/master.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ const (
defaultImagePullPolicy = "Always"
envMasterAddrKey = "DLROVER_MASTER_ADDR"
envBrainServiceAddrKey = "DLROVER_BRAIN_SERVICE_ADDR"
defaultBrainServiceAddr = "dlrover-brain.dlrover.svc.cluster.local:50001"
envPodIP = "POD_IP"

// ReplicaTypeJobMaster is the type for DLRover ElasticJob Master replica.
Expand All @@ -60,8 +61,12 @@ func (m *Manager) newJobMaster(
)
pod.Labels[common.LabelReplicaTypeKey] = string(ReplicaTypeJobMaster)
pod.Labels[common.LabelReplicaIndexKey] = fmt.Sprintf("%d", replicaIndex)
if job.Spec.BrainService != "" {
setBrainServiceIntoContainer(&pod.Spec.Containers[0], job.Spec.BrainService)
if job.Spec.OptimizeMode == "cluster" {
brainServiceAddr := defaultBrainServiceAddr
if job.Spec.BrainService != "" {
brainServiceAddr = job.Spec.BrainService
}
setBrainServiceIntoContainer(&pod.Spec.Containers[0], brainServiceAddr)
}
return pod
}
Expand Down
26 changes: 26 additions & 0 deletions dlrover/go/operator/pkg/controllers/master/master_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -77,3 +77,29 @@ func TestCreateMasterPodWithImage(t *testing.T) {
assert.Equal(t, pod.Spec.Containers[0].Image, "dlrover-master:test-v0")
assert.Equal(t, string(pod.Spec.Containers[0].ImagePullPolicy), "Always")
}

// TestCreateMasterPodWithOptimizeMode checks that, for a job whose
// OptimizeMode is "cluster" and whose BrainService is left empty, the master
// pod's first container carries the brain service address environment
// variable set to defaultBrainServiceAddr.
func TestCreateMasterPodWithOptimizeMode(t *testing.T) {
	job := &elasticv1alpha1.ElasticJob{
		ObjectMeta: metav1.ObjectMeta{
			Name:        "test-ps",
			Namespace:   "dlrover",
			Annotations: map[string]string{},
			Labels:      map[string]string{},
		},
	}
	job.Spec.OptimizeMode = "cluster"
	job.Spec.ReplicaSpecs = make(map[commonv1.ReplicaType]*elasticv1alpha1.ReplicaSpec)
	NewMasterTemplateToJob(job, "dlrover-master:test")

	manager := &Manager{}
	pod := manager.newJobMaster(job, initMasterIndex)

	// assert.Equal takes (t, expected, actual); keeping that order makes the
	// failure output label the two values correctly.
	assert.Equal(t, "elasticjob-test-ps-dlrover-master", pod.Name)
	// The job spec itself must stay untouched: only the container env is set.
	assert.Equal(t, "", job.Spec.BrainService)

	// Look up the brain service address among the master container's env
	// vars; it stays empty when the variable is missing, which then fails the
	// final assertion against the non-empty default.
	actualValue := ""
	for _, env := range pod.Spec.Containers[0].Env {
		if env.Name == envBrainServiceAddrKey {
			actualValue = env.Value
			break
		}
	}
	assert.Equal(t, defaultBrainServiceAddr, actualValue)
}
54 changes: 0 additions & 54 deletions dlrover/python/brain/hpsearch/base.py

This file was deleted.

148 changes: 0 additions & 148 deletions dlrover/python/brain/hpsearch/bo.py

This file was deleted.

8 changes: 8 additions & 0 deletions dlrover/python/common/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -358,6 +358,14 @@ class JobConstant(object):
INSUFFICIENT_NODE_TIMEOUT_DEFAULT_MIN = 600
INSUFFICIENT_NODE_TIMEOUT_DEFAULT_MAX = 3600
PENDING_NODE_TIMEOUT_DEFAULT_MIN = 600
# grpc timeout 60s
MASTER_CLIENT_GRPC_DEFAULT_TIMEOUT = 60
# sleep 3s on NetworkFailureReason.WAITING_NODE
MASTER_CLIENT_CHECK_FAULT_TIMEOUT = 3
# sleep 3s on NetworkFailureReason.WAITING_NODE
MASTER_CLIENT_CHECK_STRAGGLER_TIMEOUT = 3
# sleep 5s before next node check round
NODE_CHECK_NEXT_ROUND_TIMEOUT = 5


class Accelerators(object):
Expand Down
Loading

0 comments on commit 1d73743

Please sign in to comment.