Skip to content

Commit

Permalink
pip install -q CI python packages and speed up CI. (#1421)
Browse files Browse the repository at this point in the history
* Speed up installation of the CI Python packages.

* Remove unused code.

* Wait for background processes.

* Fix code format

* remove sudo

* Install CI packages quietly.

* Add a log message when package installation starts.

* Install torch

* Speed up UT

* Fix UT.

* Fix UT.

* Set timeout for python UT

* Set the timeout to 6min.
  • Loading branch information
workingloong authored Jan 5, 2025
1 parent 2a83d5a commit 63a926b
Show file tree
Hide file tree
Showing 5 changed files with 40 additions and 20 deletions.
2 changes: 1 addition & 1 deletion .github/actions/dlrover-python-test/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ runs:
args:
- "/bin/bash"
- "-c"
- "sh scripts/ci_install.sh && python -m grpc_tools.protoc -I. \
- "bash scripts/ci_install.sh && python -m grpc_tools.protoc -I. \
dlrover/proto/*.proto --python_out=. --grpc_python_out=. \
&& ROLE_NAME=dlrover-trainer \
python -m pytest --durations=10 dlrover/python/tests dlrover/trainer/tests \
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ jobs:
- uses: ./.github/actions/pre-commit
dlrover-python-test:
runs-on: ubuntu-latest
timeout-minutes: 6
steps:
# This step checks out a copy of your repository.
- name: checkout branch
Expand Down
9 changes: 6 additions & 3 deletions dlrover/python/tests/test_job_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -370,7 +370,7 @@ def test_get_dead_node_event(self):
params = MockK8sPSJobArgs()
params.initilize()
manager = create_job_manager(params, SpeedMonitor())
manager.start()
manager._init_nodes()
ts = int(time.time())
manager.collect_node_heart_beat(NodeType.WORKER, 0, ts)

Expand All @@ -390,7 +390,7 @@ def test_get_dead_node_event(self):
node.heartbeat_time = (now - timedelta(seconds=1000)).timestamp()
if index == 0:
node.create_time = now - timedelta(seconds=800)
node.start_time = now - timedelta(seconds=600)
node.start_time = now - timedelta(seconds=500)
else:
node.create_time = now - timedelta(seconds=1400)
node.start_time = now - timedelta(seconds=1200)
Expand Down Expand Up @@ -881,9 +881,12 @@ def test_start_and_stop(self):

def test_concurrency_heart_beat_collecting(self):
params = MockK8sAllreduceJobArgs()
worker_size = 10000
worker_size = 1000
params.initilize(worker_size)
manager = create_job_manager(params, SpeedMonitor())
manager._scaler._check_master_service_avaliable = mock.MagicMock(
return_value=True
)
manager.start()

job_nodes = self.job_context.job_nodes()
Expand Down
10 changes: 10 additions & 0 deletions dlrover/python/tests/test_pod_scaler.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
import time
import unittest
from collections import deque
from unittest import mock

from dlrover.python.common.constants import (
DistributionStrategy,
Expand Down Expand Up @@ -46,6 +47,9 @@ def tearDown(self) -> None:
def test_init_pod_template(self):
error_monitor = SimpleErrorMonitor()
scaler = PodScaler("elasticjob-sample", "default", error_monitor)
scaler._check_master_service_avaliable = mock.MagicMock(
return_value=True
)
scaler.start()
self.assertEqual(
scaler._distribution_strategy,
Expand Down Expand Up @@ -118,6 +122,9 @@ def test_periodic_create_pod(self):
def test_create_pod(self):
error_monitor = SimpleErrorMonitor()
scaler = PodScaler("elasticjob-sample", "default", error_monitor)
scaler._check_master_service_avaliable = mock.MagicMock(
return_value=True
)
_dlrover_ctx.config_master_port()

scaler.start()
Expand Down Expand Up @@ -251,6 +258,9 @@ def test_scale(self):

def test_scale_thread(self):
scaler = PodScaler("elasticjob-sample", "default")
scaler._check_master_service_avaliable = mock.MagicMock(
return_value=True
)
scaler.start()
scaler._distribution_strategy = DistributionStrategy.PS
resource = NodeResource(4, 8192)
Expand Down
38 changes: 22 additions & 16 deletions scripts/ci_install.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,19 +12,25 @@
# See the License for the specific language governing permissions and
# limitations under the License.

pip install kubernetes
pip install grpcio-tools
pip install psutil
pip install deprecated
pip install 'ray[default]'
pip install pyhocon
pip install pytest-cov
pip install pytest-ordering
pip install packaging
pip install tensorflow==2.13.0
pip install deepspeed==0.12.6
pip install accelerate==0.29.2
pip install transformers==4.37.2
pip install torch==2.0.1+cpu -f https://download.pytorch.org/whl/torch_stable.html
pip install peft==0.10.0
pip install botorch==0.8.5
echo "Start installing CI python packages."
start_time=$(date +%s)

pip install -q kubernetes
pip install -q grpcio-tools
pip install -q psutil
pip install -q deprecated
pip install -q 'ray[default]'
pip install -q pyhocon
pip install -q pytest-cov
pip install -q pytest-ordering
pip install -q packaging
pip install -q tensorflow==2.13.0
pip install -q torch==2.0.1+cpu -f https://download.pytorch.org/whl/torch_stable.html
pip install -q deepspeed==0.12.6
pip install -q accelerate==0.29.2
pip install -q transformers==4.37.2
pip install -q peft==0.10.0

end_time=$(date +%s)
cost_time=$((end_time-start_time))
echo "pip cost time: $((cost_time/60))min $((cost_time%60))s"

0 comments on commit 63a926b

Please sign in to comment.