Skip to content

Commit

Permalink
Merge branch 'master' into add-no-init-into-waiting
Browse files Browse the repository at this point in the history
  • Loading branch information
Ma Jie Yue committed Jan 15, 2025
2 parents 069ccbf + 7ba180c commit b7a8db6
Show file tree
Hide file tree
Showing 88 changed files with 7,507 additions and 8,213 deletions.
13 changes: 13 additions & 0 deletions .github/actions/elasticjob-controller-test/action.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
---
name: elasticjob-controller-test
description: run gotest to execute go test cases of ElasticJob operator
runs:
using: 'docker'
image: "easydl/dlrover:ci"
args:
- "/bin/bash"
- "-c"
- "rm -rf /usr/local/go && \
wget -q https://go.dev/dl/go1.23.4.linux-amd64.tar.gz && \
tar -C /usr/local -xzf go1.23.4.linux-amd64.tar.gz && \
cd go/elasticjob && go test ./..."
9 changes: 3 additions & 6 deletions .github/actions/go-master-test/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,9 @@ name: go-master-test
description: run gotest to execute go test cases of ElasticJob operator
runs:
using: 'docker'
image: "easydl/dlrover:ci"
image: "golang:1.23.4"
args:
- "/bin/bash"
- "-c"
- "rm -rf /usr/local/go && \
wget -q https://go.dev/dl/go1.23.4.linux-amd64.tar.gz && \
tar -C /usr/local -xzf go1.23.4.linux-amd64.tar.gz && \
go install github.com/onsi/ginkgo/v2/[email protected] &&
cd dlrover/go/master && ginkgo -v ./..."
- "go install github.com/onsi/ginkgo/v2/[email protected] && \
cd go/master && ginkgo -v ./..."
10 changes: 0 additions & 10 deletions .github/actions/operator-test/action.yml

This file was deleted.

4 changes: 2 additions & 2 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -46,14 +46,14 @@ jobs:
# This step references the directory that contains the action.
- name: RUN gotest
uses: ./.github/actions/go-master-test
operator-test:
elasticjob-controller-test:
runs-on: ubuntu-latest
steps:
# This step checks out a copy of your repository.
- uses: actions/checkout@v3
# This step references the directory that contains the action.
- name: RUN gotest
uses: ./.github/actions/operator-test
uses: ./.github/actions/elasticjob-controller-test
brain-test:
runs-on: ubuntu-latest
steps:
Expand Down
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ share/python-wheels/
*.egg
MANIFEST
*.ckpt
dlrover/go/operator/bin/
go/elasticjob/bin/

# PyInstaller
# Usually these files are written by a python script from a template
Expand Down
7 changes: 1 addition & 6 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@ repos:
entry: bash ./scripts/codestyle/clang_format.hook -i
language: system
files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto)$
exclude: dlrover/go/brain/vendor
- repo: local
hooks:
- id: cpplint-cpp-source
Expand All @@ -38,20 +37,16 @@ repos:
entry: bash ./scripts/codestyle/cpplint_precommit.hook
language: system
files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx)$
exclude: dlrover/go/brain/vendor
- repo: https://github.com/dnephin/pre-commit-golang.git
rev: v0.3.3
hooks:
- id: go-fmt
exclude: dlrover/go/brain/vendor
- id: go-lint
exclude: dlrover/go/brain/vendor
- id: no-go-testing
- repo: https://github.com/gruntwork-io/pre-commit
rev: v0.1.8
hooks:
- id: shellcheck
exclude: dlrover/go/brain/vendor
files: \.(sh)$
- repo: local
hooks:
Expand All @@ -66,4 +61,4 @@ repos:
hooks:
- id: yamllint
exclude:
(dlrover/python/tests/data|operator/config)
(dlrover/python/tests/data|go/elasticjob/config)
8,086 changes: 0 additions & 8,086 deletions dlrover/go/operator/config/crd/bases/elastic.iml.github.io_elasticjobs.yaml

This file was deleted.

4 changes: 2 additions & 2 deletions dlrover/python/common/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -368,10 +368,10 @@ class JobConstant(object):
# master_client.check_fault_node timeout
MASTER_CLIENT_CHECK_STRAGGLER_NODE_TIMEOUT = 300

# sleep 3s on NetworkFailureReason.WAITING_NODE
# sleep 1s on NetworkFailureReason.WAITING_NODE
MASTER_CLIENT_CHECK_FAULT_SLEEP_TIMEOUT = 1

# sleep 3s on NetworkFailureReason.WAITING_NODE
# sleep 1s on NetworkFailureReason.WAITING_NODE
MASTER_CLIENT_CHECK_STRAGGLER_SLEEP_TIMEOUT = 1

# sleep 5s before next node check round
Expand Down
4 changes: 3 additions & 1 deletion dlrover/trainer/tests/torch/node_check_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@

import json
import os
import random
import unittest
from datetime import timedelta

Expand All @@ -35,11 +36,12 @@ def tearDown(self):
os.environ.clear()

def test_gpu_node_check(self):
port = random.randint(10000, 40000)
os.environ["RANK"] = "0"
os.environ["WORLD_SIZE"] = "1"
os.environ["LOCAL_RANK"] = "0"
os.environ["MASTER_ADDR"] = "localhost"
os.environ["MASTER_PORT"] = "12345"
os.environ["MASTER_PORT"] = str(port)
t = gpu_main()
self.assertTrue(t > 0)
with open("/tmp/dlrover/network_check/0.txt", "r") as f:
Expand Down
2 changes: 1 addition & 1 deletion docs/blogs/stabilize_llm_training_cn.md
Original file line number Diff line number Diff line change
Expand Up @@ -209,7 +209,7 @@ GO 版本: GO 1.18.

```python
git clone git@github.com:intelligent-machine-learning/dlrover.git
cd dlrover/dlrover/go/operator/
cd dlrover/go/elasticjob
make deploy IMG=easydl/elasticjob-controller:master
```

Expand Down
4 changes: 2 additions & 2 deletions docs/deployment/controller.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,10 @@ installed and run `minikube start`.

```bash
# deploy from local directory
$ kubectl -n dlrover apply -k dlrover/go/operator/config/manifests/bases
$ kubectl -n dlrover apply -k go/elasticjob/config/manifests/bases

# deploy from remote repo
$ deployment="[email protected]:intelligent-machine-learning/dlrover/dlrover/go/operator/config/manifests/bases/?ref=master"
$ deployment="[email protected]:intelligent-machine-learning/dlrover/go/elasticjob/config/manifests/bases/?ref=master"
$ kubectl -n dlrover apply -k $deployment
```

Expand Down
12 changes: 8 additions & 4 deletions docs/developer_guide.md
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ $ helm upgrade -i nvdp nvdp/nvidia-device-plugin \
--version=0.13.0 \
--namespace nvidia-device-plugin \
--create-namespace \
--set-file config.map.config=./dlrover/go/operator/config/gpu/nvidia-device-plugin-gpu-shared.yaml
--set-file config.map.config=./go/elasticjob/config/gpu/nvidia-device-plugin-gpu-shared.yaml
```

Then test your GPU resources by
Expand Down Expand Up @@ -177,21 +177,25 @@ a docker image.
- Run the controller in the terminal.

```bash
cd dlrover/go/operator
cd go/elasticjob
make install
make run
```

- Deploy the controller with GO 1.18.
- Deploy the controller with GO 1.23.4

```bash
make deploy IMG=easydl/elasticjob-controller:master
```

If `curl` cannot download kustomize while running `make deploy`,
download the kustomize binary manually from the [release page](https://github.com/kubernetes-sigs/kustomize/releases)
and move it to `go/elasticjob/bin/`.

### 3. Grant Permission for the DLRover Master to Access CRDs

```bash
kubectl apply -f dlrover/go/operator/config/manifests/bases/default-role.yaml
kubectl apply -f go/elasticjob/config/manifests/bases/default-role.yaml
```

### 4. Build the Image
Expand Down
2 changes: 1 addition & 1 deletion docs/tutorial/check_node_health.md
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ ElasticJob CRD on the cluster by the following steps.

```bash
git clone [email protected]:intelligent-machine-learning/dlrover.git
cd dlrover/dlrover/go/operator/
cd dlrover/go/elasticjob
make deploy IMG=easydl/elasticjob-controller:master # GO 1.23.4
# Grant permission for the DLRover master to Access CRDs.
kubectl -n dlrover apply -f config/manifests/bases/default-role.yaml
Expand Down
2 changes: 1 addition & 1 deletion docs/tutorial/tf_elasticjob_on_k8s.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ make deploy IMG=easydl/elasticjob-controller:v0.1.1
1. Grant permission for the DLRover master to Access CRDs.

```bash
kubectl -n dlrover apply -f dlrover/go/operator/config/rbac/default_role.yaml
kubectl -n dlrover apply -f go/elasticjob/config/rbac/default_role.yaml
```

## Submit an Auto-Scaling Job
Expand Down
2 changes: 1 addition & 1 deletion docs/tutorial/torch_elasticjob_on_k8s.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ git clone [email protected]:intelligent-machine-learning/dlrover.git
2. Deploy the controller on the cluster.

```bash
cd dlrover/dlrover/go/operator/
cd dlrover/go/elasticjob
make deploy IMG=easydl/elasticjob-controller:master # GO 1.23.4
```

Expand Down
File renamed without changes.
File renamed without changes.
2 changes: 1 addition & 1 deletion dlrover/go/operator/Makefile → go/elasticjob/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ ENVTEST ?= $(LOCALBIN)/setup-envtest

## Tool Versions
KUSTOMIZE_VERSION ?= v3.8.7
CONTROLLER_TOOLS_VERSION ?= v0.9.2
CONTROLLER_TOOLS_VERSION ?= v0.14.0

KUSTOMIZE_INSTALL_SCRIPT ?= "https://raw.githubusercontent.com/kubernetes-sigs/kustomize/master/hack/install_kustomize.sh"
.PHONY: kustomize
Expand Down
4 changes: 2 additions & 2 deletions dlrover/go/operator/PROJECT → go/elasticjob/PROJECT
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ domain: iml.github.io
layout:
- go.kubebuilder.io/v3
projectName: operator
repo: github.com/intelligent-machine-learning/easydl/dlrover/go/operator
repo: github.com/intelligent-machine-learning/dlrover/go/elasticjob
resources:
- api:
crdVersion: v1
Expand All @@ -11,6 +11,6 @@ resources:
domain: iml.github.io
group: elastic
kind: ElasticJob
path: github.com/intelligent-machine-learning/easydl/dlrover/go/operator/api/v1alpha1
path: github.com/intelligent-machine-learning/dlrover/go/elasticjob/api/v1alpha1
version: v1alpha1
version: "3"
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ limitations under the License.
package v1alpha1

import (
commonv1 "github.com/intelligent-machine-learning/easydl/dlrover/go/operator/pkg/common/api/v1"
commonv1 "github.com/intelligent-machine-learning/dlrover/go/elasticjob/pkg/common/api/v1"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ limitations under the License.
package v1alpha1

import (
commonv1 "github.com/intelligent-machine-learning/easydl/dlrover/go/operator/pkg/common/api/v1"
commonv1 "github.com/intelligent-machine-learning/dlrover/go/elasticjob/pkg/common/api/v1"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)
Expand All @@ -35,6 +35,7 @@ type ScaleSpec struct {
// {
// "PS": ReplicaResourceSpec,
// "worker": ReplicaResourceSpec,
// "dlrover-master": ReplicaResourceSpec,
// }
ReplicaResourceSpecs map[commonv1.ReplicaType]ReplicaResourceSpec `json:"replicaResourceSpecs,omitempty"`

Expand Down

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit b7a8db6

Please sign in to comment.