From 48abb9ab6e4b350a9efa55bc848b82fe0eeaa0a0 Mon Sep 17 00:00:00 2001
From: zhangxingmeng <103552200+zhangxingmeng@users.noreply.github.com>
Date: Wed, 3 Apr 2024 10:20:30 +0800
Subject: [PATCH] fix gpu docker install error (#1228)

---
 docker/release/secretflow-gpu.Dockerfile | 35 ++++++++++++++----------
 docs/getting_started/installation.md     |  4 +--
 2 files changed, 23 insertions(+), 16 deletions(-)

diff --git a/docker/release/secretflow-gpu.Dockerfile b/docker/release/secretflow-gpu.Dockerfile
index 901d5f710..8732a7a3b 100644
--- a/docker/release/secretflow-gpu.Dockerfile
+++ b/docker/release/secretflow-gpu.Dockerfile
@@ -1,20 +1,30 @@
 FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04
 
-RUN  export DEBIAN_FRONTEND=noninteractive && \
-     apt-get update \
-     && apt-get install -y libcudnn8=8.6.0.163-1+cuda11.8 --allow-downgrades --allow-change-held-packages  \
-     && apt-get install -y python3.10 --allow-downgrades --allow-change-held-packages   \
-     && apt-get install -y python3-pip --allow-downgrades --allow-change-held-packages --no-install-recommends \
+ENV DEBIAN_FRONTEND=noninteractive
 
-RUN if [ ! -e /usr/bin/python ]; then ln -sf /usr/bin/python3.10 /usr/bin/python; fi
+# Install the pinned cuDNN package, then Python 3.10 from the deadsnakes PPA; drop the apt lists in the same layer to keep it small.
+RUN apt-get update \
+    && apt-get install -y libcudnn8=8.6.0.163-1+cuda11.8 --allow-downgrades --allow-change-held-packages \
+    && apt-get install -y software-properties-common && add-apt-repository -y ppa:deadsnakes/ppa \
+    && apt-get update \
+    && apt-get install -y python3.10 python3.10-distutils python3-distutils && rm -rf /var/lib/apt/lists/*
 
-RUN if [ ! -e /usr/bin/python3 ]; then ln -sf /usr/bin/python3.10 /usr/bin/python3; fi
+# Make python3 resolve to Python 3.10 via update-alternatives
+RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1
+
+RUN ln -s /usr/bin/python3 /usr/bin/python
+
+# Install curl, then bootstrap pip with get-pip.py; -f makes curl fail on HTTP errors instead of saving an error page.
+RUN apt-get update \
+    && apt-get install -y curl \
+    && curl -fsSL https://bootstrap.pypa.io/get-pip.py -o get-pip.py \
+    && python get-pip.py && rm -rf get-pip.py /var/lib/apt/lists/*
 
 #install the dependencies of cuda11
 #you are supposed to  add the mirror source of pypi to accelerate installation of nvidia packages of cuda11,
 #if not, the building of images are prone to fail very much
 RUN pip install nvidia-cublas-cu11 nvidia-cuda-cupti-cu11 nvidia-cuda-nvcc-cu11 \
-    nvidia-cuda-nvrtc-cu11 nvidia-cuda-runtime-cu11 install nvidia-cudnn-cu11 \
+    nvidia-cuda-nvrtc-cu11 nvidia-cuda-runtime-cu11 nvidia-cudnn-cu11 \
     nvidia-cufft-cu11  nvidia-curand-cu11  nvidia-cusolver-cu11 \
     nvidia-cusparse-cu11 nvidia-nccl-cu11  nvidia-nvtx-cu11 \
     && rm -rf  ~/.cache/pip \
@@ -23,8 +33,8 @@ RUN pip install nvidia-cublas-cu11 nvidia-cuda-cupti-cu11 nvidia-cuda-nvcc-cu11
 # install the gpu version of jax and jaxlib based cuda11
 # the site of https://storage.googleapis.com/jax-releases/jax_cuda_releases.html is very necessary
 # ref to https://github.com/google/jax#pip-installation-gpu-cuda-installed-via-pip-easier
-RUN pip install --upgrade "jax[cuda11_pip]"==0.4.1 -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html \
-    && pip install --upgrade "jaxlib[cuda11_pip]"==0.4.1 -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html \
+RUN pip install --upgrade "jax[cuda11_pip]"==0.4.25 -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html \
+    && pip install --upgrade "jaxlib[cuda11_pip]"==0.4.25 -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html \
     && rm -rf  ~/.cache/pip \
     && rm -rf /tmp/*
 
@@ -32,7 +42,7 @@ RUN pip install --upgrade "jax[cuda11_pip]"==0.4.1 -f https://storage.googleapis
 #you are supposed to add the mirror source of pypi to accelerate installation of SecretFlow and accelerate the building of images
 #if not, the building of images are prone to fail very much
 # Now, based on the CUDA11, the best match of TensorFlow, PyTorch and Jax are
-# tensorflow==2.12.0, due to the version of TensorFlow which secretflow  requires is 2.11.0, so we install tensorflow==2.12.0 manually.
+# tensorflow==2.12.0 (secretflow itself requires TensorFlow 2.11.0, so tensorflow==2.12.0 is installed manually).
 # torch==2.0.0
 # jax==0.4.1
 RUN pip install -U secretflow \
@@ -43,9 +53,6 @@ RUN pip install -U secretflow \
     && rm -rf /tmp/*
 
 COPY secretflow_entrypoint.sh /opt/secretflow/
-
 COPY secretflow_entrypoint.py /opt/secretflow/
 
-
-
 ENTRYPOINT ["sh","/opt/secretflow/secretflow_entrypoint.sh"]
diff --git a/docs/getting_started/installation.md b/docs/getting_started/installation.md
index 15ee89cf9..ea864cb16 100644
--- a/docs/getting_started/installation.md
+++ b/docs/getting_started/installation.md
@@ -244,11 +244,11 @@ docker build -f  secretflow-gpu.Dockerfile -t secretflow-gpu .
 1. Run a container
 
 ```bash
-docker container run --runtime=nvidia  -it --gpus all secretflow-gpu bash
+docker run -it --gpus all secretflow-gpu bash
 ```
 
 > **NOTE**: The following two parameters are necessary:
-> - `--runtime=nvidia`
+> - Install `nvidia-container-toolkit` and restart the Docker daemon to enable GPU support.
 > - `--gpus all`
 
 2. After the container is running, you can use the jupyter notebook [GPU Check](../tutorial/GPU_check.ipynb) to check the access of Tensorflow and PyTorch for NVIDIA GPUs inside the container.
\ No newline at end of file