diff --git a/Docker/Dockerfile b/Docker/Dockerfile
index 1dcdb1a6..8e4f854e 100644
--- a/Docker/Dockerfile
+++ b/Docker/Dockerfile
@@ -189,6 +189,9 @@ SHELL ["/bin/bash", "--login", "-c"]
 COPY --from=selected_freesurfer_build_image /opt/freesurfer /opt/freesurfer
 COPY --from=selected_conda_build_image /venv /venv
 
+# Fix for cuda11.8+cudnn8.7 bug+warning: https://github.com/pytorch/pytorch/issues/97041
+RUN if [[ "$DEVICE" == "cu118" ]] ; then cd /venv/python3.10/site-packages/torch/lib && ln -s libnvrtc-*.so.11.2 libnvrtc.so ; fi
+
 # Copy fastsurfer over from the build context and add PYTHONPATH
 COPY . /fastsurfer/
 ENV PYTHONPATH=/fastsurfer:/opt/freesurfer/python/packages \
diff --git a/Docker/README.md b/Docker/README.md
index c2f0d319..d5d12af8 100644
--- a/Docker/README.md
+++ b/Docker/README.md
@@ -139,7 +139,7 @@ As you can see, only the tag of the image is changed from gpu to cpu and the sta
 Here we build an experimental image to test performance when running on AMD GPUs. Note that you need a supported OS and Kernel version and supported GPU for the RocM to work correctly. You need to install the Kernel drivers into your host machine kernel (amdgpu-install --usecase=dkms) for the amd docker to work. For this follow:
-https://docs.amd.com/en/latest/deploy/linux/quick_start.html
+https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/quick-start.html#rocm-install-quick, https://rocm.docs.amd.com/projects/install-on-linux/en/latest/install/amdgpu-install.html#amdgpu-install-dkms and https://rocm.docs.amd.com/projects/install-on-linux/en/latest/how-to/docker.html
 
 ```bash
 PYTHONPATH=
@@ -149,9 +149,8 @@ python build.py --device rocm --tag my_fastsurfer:rocm
 and run segmentation only:
 
 ```bash
-docker run --rm --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \
-           --device=/dev/kfd --device=/dev/dri --group-add video --ipc=host \
-           --shm-size 8G \
+docker run --rm --security-opt seccomp=unconfined \
+           --device=/dev/kfd --device=/dev/dri --group-add video \
            -v /home/user/my_mri_data:/data \
            -v /home/user/my_fastsurfer_analysis:/output \
            my_fastsurfer:rocm \
@@ -159,12 +158,13 @@ docker run --rm --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \
            --sid subjectX --sd /output
 ```
 
-Note, we tested on an AMD Radeon Pro W6600, which is [not officially supported](https://docs.amd.com/en/latest/release/gpu_os_support.html), but setting `HSA_OVERRIDE_GFX_VERSION=10.3.0` [inside docker did the trick](https://en.opensuse.org/AMD_OpenCL#ROCm_-_Running_on_unsupported_hardware):
+In conflict with the official ROCm documentation (above), we also needed to add the `render` group (`--group-add render`) in addition to `--group-add video`.
+
+Note, we tested on an AMD Radeon Pro W6600, which is [not officially supported](https://docs.amd.com/en/latest/release/gpu_os_support.html), but setting `HSA_OVERRIDE_GFX_VERSION=10.3.0` [inside docker did the trick](https://en.opensuse.org/SDB:AMD_GPGPU#Using_CUDA_code_with_ZLUDA_and_ROCm):
 
 ```bash
-docker run --rm --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \
-           --device=/dev/kfd --device=/dev/dri --group-add video --ipc=host \
-           --shm-size 8G \
+docker run --rm --security-opt seccomp=unconfined \
+           --device=/dev/kfd --device=/dev/dri --group-add video --group-add render \
            -v /home/user/my_mri_data:/data \
            -v /home/user/my_fastsurfer_analysis:/output \
            -e HSA_OVERRIDE_GFX_VERSION=10.3.0 \
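For reference, the `cu118` symlink workaround in the Dockerfile hunk above can also be expressed in Python. This is only an illustrative sketch (not part of the patch), assuming a locally installed cu118 torch wheel that ships a hashed `libnvrtc-*.so.11.2` under `torch/lib`:

```python
# Illustrative only: mirrors `ln -s libnvrtc-*.so.11.2 libnvrtc.so` from the
# Dockerfile hunk above, for a locally installed cu118 torch wheel.
from pathlib import Path

import torch

lib_dir = Path(torch.__file__).parent / "lib"
versioned = sorted(lib_dir.glob("libnvrtc-*.so.11.2"))  # hashed soname in the wheel
plain = lib_dir / "libnvrtc.so"
if versioned and not plain.exists():
    # create the plain soname that pytorch/pytorch#97041 reports as missing
    plain.symlink_to(versioned[0].name)
```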
diff --git a/Docker/build.py b/Docker/build.py
index 045a9a0a..6467c7bf 100755
--- a/Docker/build.py
+++ b/Docker/build.py
@@ -30,7 +30,7 @@
 Target = Literal['runtime', 'build_common', 'build_conda', 'build_freesurfer',
                  'build_base', 'runtime_cuda']
 CacheType = Literal["inline", "registry", "local", "gha", "s3", "azblob"]
-AllDeviceType = Literal["cpu", "cuda", "cu118", "cu121", "cu124", "rocm6.1"]
+AllDeviceType = Literal["cpu", "cuda", "cu118", "cu121", "cu124", "rocm", "rocm6.1"]
 DeviceType = Literal["cpu", "cu118", "cu121", "cu124", "rocm6.1"]
 
 CREATE_BUILDER = "Create builder with 'docker buildx create --name fastsurfer'."
@@ -58,6 +58,7 @@ class DEFAULTS:
     # and rocm versions, if pytorch comes with new versions.
     # torch 1.12.0 comes compiled with cu113, cu116, rocm5.0 and rocm5.1.1
     # torch 2.0.1 comes compiled with cu117, cu118, and rocm5.4.2
+    # torch 2.4 comes compiled with cu118, cu121, cu124 and rocm6.1
     MapDeviceType: Dict[AllDeviceType, DeviceType] = dict(
         ((d, d) for d in get_args(DeviceType)),
         rocm="rocm6.1",
@@ -230,6 +231,7 @@ def make_parser() -> argparse.ArgumentParser:
             --cache type=registry,ref=server/fastbuild,mode=max.
             Will default to the environment variable FASTSURFER_BUILD_CACHE:
             {cache_kwargs.get('default', 'N/A')}""",
+        metavar="type={inline,local,...}[,<key>=<value>[,...]]",
         **cache_kwargs,
     )
     parser.add_argument(
diff --git a/Docker/install_env.py b/Docker/install_env.py
index 841431ff..3d03d38d 100644
--- a/Docker/install_env.py
+++ b/Docker/install_env.py
@@ -19,7 +19,7 @@
 def mode(arg: str) -> str:
     if arg in ["base", "cpu"] or \
             re.match("^cu\\d+$", arg) or \
-            re.match("^rocm\\d+\\.\\d+(\\.\\d+)?$"):
+            re.match("^rocm\\d+\\.\\d+(\\.\\d+)?$", arg):
         return arg
     else:
         raise argparse.ArgumentTypeError(f"The mode was '{arg}', but should be "
diff --git a/FastSurferCNN/data_loader/data_utils.py b/FastSurferCNN/data_loader/data_utils.py
index 9f98aa53..dd11302b 100644
--- a/FastSurferCNN/data_loader/data_utils.py
+++ b/FastSurferCNN/data_loader/data_utils.py
@@ -623,7 +623,7 @@ def read_classes_from_lut(lut_file: str | Path):
     if lut_file.suffix == ".csv":
         kwargs["sep"] = ","
     elif lut_file.suffix == ".txt":
-        kwargs["delim_whitespace"] = True
+        kwargs["sep"] = "\\s+"
     else:
         raise RuntimeError(
             f"Unknown LUT file extension {lut_file}, must be csv, txt or tsv."
         )
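Two of the hunks above fix real runtime bugs rather than style: in `install_env.py`, `re.match` was missing its second argument, so any `rocm*` mode raised `TypeError`; and in `data_utils.py`, `delim_whitespace=True` is deprecated since pandas 2.2 in favor of `sep=r"\s+"`. A minimal sketch of the pandas replacement, using made-up LUT content and illustrative column names (not FastSurfer's actual ones):

```python
import io

import pandas as pd

# Made-up whitespace-delimited LUT rows; column names are illustrative only.
lut_txt = "0 Unknown 0 0 0 0\n2 Left-Cerebral-White-Matter 245 245 245 0\n"
names = ["ID", "LabelName", "R", "G", "B", "A"]

# pandas >= 2.2 deprecates delim_whitespace=True; sep=r"\s+" parses identically.
lut = pd.read_csv(io.StringIO(lut_txt), sep=r"\s+", names=names)
print(lut["LabelName"].tolist())  # ['Unknown', 'Left-Cerebral-White-Matter']
```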
diff --git a/FastSurferCNN/inference.py b/FastSurferCNN/inference.py
index cdf5b169..f99fea0a 100644
--- a/FastSurferCNN/inference.py
+++ b/FastSurferCNN/inference.py
@@ -213,7 +213,9 @@ def load_checkpoint(self, ckpt: Union[str, os.PathLike]):
         # make sure the model is, where it is supposed to be
         self.model.to(self.device)
 
-        model_state = torch.load(ckpt, map_location=device)
+        # WARNING: weights_only=False can cause unsafe code execution, but here the
+        # checkpoint can be considered to be from a safe source
+        model_state = torch.load(ckpt, map_location=device, weights_only=False)
         self.model.load_state_dict(model_state["model_state"])
 
         # workaround for mps (move the model back to mps)
diff --git a/FastSurferCNN/utils/checkpoint.py b/FastSurferCNN/utils/checkpoint.py
index 31052bb4..6663a9e0 100644
--- a/FastSurferCNN/utils/checkpoint.py
+++ b/FastSurferCNN/utils/checkpoint.py
@@ -228,7 +228,9 @@ def load_from_checkpoint(
     loaded_epoch : int
         Epoch number.
     """
-    checkpoint = torch.load(checkpoint_path, map_location="cpu")
+    # WARNING: weights_only=False can cause unsafe code execution, but here the
+    # checkpoint can be considered to be from a safe source
+    checkpoint = torch.load(checkpoint_path, map_location="cpu", weights_only=False)
 
     if drop_classifier:
         classifier_conv = ["classifier.conv.weight", "classifier.conv.bias"]
diff --git a/HypVINN/inference.py b/HypVINN/inference.py
index 10134526..07953dae 100644
--- a/HypVINN/inference.py
+++ b/HypVINN/inference.py
@@ -181,7 +181,9 @@ def load_checkpoint(self, ckpt: str):
             of a model.
         """
         logger.info("Loading checkpoint {}".format(ckpt))
-        model_state = torch.load(ckpt, map_location=self.device)
+        # WARNING: weights_only=False can cause unsafe code execution, but here the
+        # checkpoint can be considered to be from a safe source
+        model_state = torch.load(ckpt, map_location=self.device, weights_only=False)
         self.model.load_state_dict(model_state["model_state"])
 
     def get_modelname(self):
diff --git a/env/export_pip-r.sh b/env/export_pip-r.sh
index 49433793..042563e1 100644
--- a/env/export_pip-r.sh
+++ b/env/export_pip-r.sh
@@ -73,5 +73,3 @@ pyversion=$(echo "$out" | head -n 1 | cut -d" " -f2)
   echo ""
   echo "# $out"
 } >> $1
-
-}
\ No newline at end of file
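The three `torch.load` hunks all pin the legacy behavior explicitly: torch 2.4 began warning when `weights_only` is left unset, and newer releases flip the default to `True`, which restricts unpickling to tensors and plain containers. A minimal sketch of the opt-out for trusted files (hypothetical file name and contents, not FastSurfer's actual checkpoint format):

```python
import torch
from torch import nn

# Hypothetical stand-in for a checkpoint with weights plus extra metadata.
model = nn.Linear(4, 2)
torch.save({"model_state": model.state_dict(), "epoch": 3}, "ckpt.pt")

# The explicit weights_only=False keeps full (unsafe) unpickling for files we
# trust, independent of which default the installed torch version uses.
state = torch.load("ckpt.pt", map_location="cpu", weights_only=False)
model.load_state_dict(state["model_state"])
print("restored epoch:", state["epoch"])
```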