From 48abb9ab6e4b350a9efa55bc848b82fe0eeaa0a0 Mon Sep 17 00:00:00 2001
From: zhangxingmeng <103552200+zhangxingmeng@users.noreply.github.com>
Date: Wed, 3 Apr 2024 10:20:30 +0800
Subject: [PATCH] fix gpu docker install error (#1228)

---
Reviewer notes (text between the "---" separator and the first "diff --git"
line is not part of the commit and is skipped when the patch is applied):

  NOTE(review): the line structure of this patch was restored from a copy in
  which every newline had been collapsed into a space. Blank context lines
  were placed so that each hunk's old/new line counts and the diffstat below
  add up exactly.

  NOTE(review): leading indentation on continuation lines (written here as
  four spaces) could not be recovered from that copy. The same holds for the
  "# tensorflow==2.12.0" removed/added pair, which differs only in whitespace;
  the removed line is written with one trailing space as a placeholder. If a
  hunk fails to match, retry with `git apply --ignore-whitespace`.
  TODO: confirm against the upstream commit named in the first header line.

 docker/release/secretflow-gpu.Dockerfile | 35 ++++++++++++++----------
 docs/getting_started/installation.md     |  4 +--
 2 files changed, 23 insertions(+), 16 deletions(-)

diff --git a/docker/release/secretflow-gpu.Dockerfile b/docker/release/secretflow-gpu.Dockerfile
index 901d5f710..8732a7a3b 100644
--- a/docker/release/secretflow-gpu.Dockerfile
+++ b/docker/release/secretflow-gpu.Dockerfile
@@ -1,20 +1,30 @@
 FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04
 
-RUN export DEBIAN_FRONTEND=noninteractive && \
-    apt-get update \
-    && apt-get install -y libcudnn8=8.6.0.163-1+cuda11.8 --allow-downgrades --allow-change-held-packages \
-    && apt-get install -y python3.10 --allow-downgrades --allow-change-held-packages \
-    && apt-get install -y python3-pip --allow-downgrades --allow-change-held-packages --no-install-recommends \
+ENV DEBIAN_FRONTEND=noninteractive
 
-RUN if [ ! -e /usr/bin/python ]; then ln -sf /usr/bin/python3.10 /usr/bin/python; fi
+RUN apt-get update \
+    && apt-get install -y libcudnn8=8.6.0.163-1+cuda11.8 --allow-downgrades --allow-change-held-packages \
+    && apt-get install -y software-properties-common \
+    && add-apt-repository ppa:deadsnakes/ppa \
+    && apt-get update \
+    && apt-get install -y python3.10 python3.10-distutils python3-distutils
 
-RUN if [ ! -e /usr/bin/python3 ]; then ln -sf /usr/bin/python3.10 /usr/bin/python3; fi
+# alter python version to 3.10
+RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1
+
+RUN ln -s /usr/bin/python3 /usr/bin/python
+
+# # install pip
+RUN apt-get update \
+    && apt-get install -y curl \
+    && curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py \
+    && python get-pip.py
 
 #install the dependencies of cuda11
 #you are supposed to add the mirror source of pypi to accelerate installation of nvidia packages of cuda11,
 #if not, the building of images are prone to fail very much
 RUN pip install nvidia-cublas-cu11 nvidia-cuda-cupti-cu11 nvidia-cuda-nvcc-cu11 \
-    nvidia-cuda-nvrtc-cu11 nvidia-cuda-runtime-cu11 install nvidia-cudnn-cu11 \
+    nvidia-cuda-nvrtc-cu11 nvidia-cuda-runtime-cu11 nvidia-cudnn-cu11 \
     nvidia-cufft-cu11 nvidia-curand-cu11 nvidia-cusolver-cu11 \
     nvidia-cusparse-cu11 nvidia-nccl-cu11 nvidia-nvtx-cu11 \
     && rm -rf ~/.cache/pip \
@@ -23,8 +33,8 @@ RUN pip install nvidia-cublas-cu11 nvidia-cuda-cupti-cu11 nvidia-cuda-nvcc-cu11
 # install the gpu version of jax and jaxlib based cuda11
 # the site of https://storage.googleapis.com/jax-releases/jax_cuda_releases.html is very necessary
 # ref to https://github.com/google/jax#pip-installation-gpu-cuda-installed-via-pip-easier
-RUN pip install --upgrade "jax[cuda11_pip]"==0.4.1 -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html \
-    && pip install --upgrade "jaxlib[cuda11_pip]"==0.4.1 -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html \
+RUN pip install --upgrade "jax[cuda11_pip]"==0.4.25 -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html \
+    && pip install --upgrade "jaxlib[cuda11_pip]"==0.4.25 -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html \
     && rm -rf ~/.cache/pip \
     && rm -rf /tmp/*
 
@@ -32,7 +42,7 @@ RUN pip install --upgrade "jax[cuda11_pip]"==0.4.1 -f https://storage.googleapis
 #you are supposed to add the mirror source of pypi to accelerate installation of SecretFlow and accelerate the building of images
 #if not, the building of images are prone to fail very much
 # Now, based on the CUDA11, the best match of TensorFlow, PyTorch and Jax are
-# tensorflow==2.12.0, due to the version of TensorFlow which secretflow requires is 2.11.0, so we install tensorflow==2.12.0 manually. 
+# tensorflow==2.12.0, due to the version of TensorFlow which secretflow requires is 2.11.0, so we install tensorflow==2.12.0 manually.
 # torch==2.0.0
 # jax==0.4.1
 RUN pip install -U secretflow \
@@ -43,9 +53,6 @@ RUN pip install -U secretflow \
     && rm -rf /tmp/*
 
 COPY secretflow_entrypoint.sh /opt/secretflow/
-
 COPY secretflow_entrypoint.py /opt/secretflow/
-
-
 ENTRYPOINT ["sh","/opt/secretflow/secretflow_entrypoint.sh"]
 
diff --git a/docs/getting_started/installation.md b/docs/getting_started/installation.md
index 15ee89cf9..ea864cb16 100644
--- a/docs/getting_started/installation.md
+++ b/docs/getting_started/installation.md
@@ -244,11 +244,11 @@ docker build -f secretflow-gpu.Dockerfile -t secretflow-gpu .
 1. Run a container
 
 ```bash
-docker container run --runtime=nvidia -it --gpus all secretflow-gpu bash
+docker run -it --gpus all secretflow-gpu bash
 ```
 
 > **NOTE**: The following two parameters are necessary:
-> - `--runtime=nvidia`
+> - You should install `nvidia-container-toolkit` and restart docker daemon to enable GPU support.
 > - `--gpus all`
 
 2. After the container is running, you can use the jupyter notebook [GPU Check](../tutorial/GPU_check.ipynb) to check the access of Tensorflow and PyTorch for NVIDIA GPUs inside the container.
\ No newline at end of file