EESSI · ocaisa · Dec 19, 2023 · Dec 1, 2023 · Dec 1, 2023 · Dec 1, 2023
diff --git a/EESSI-install-software.sh b/EESSI-install-software.sh
@@ -187,6 +187,22 @@ fi
 # assume there's only one diff file that corresponds to the PR patch file
 pr_diff=$(ls [0-9]*.diff | head -1)
 
+# install any additional required scripts
+# order is important: these are needed to install a full CUDA SDK in host_injections
+# (the pipeline below succeeds only if install_scripts.sh is among the files changed by the PR)
+if grep '^+++' "${pr_diff}" | cut -f2 -d' ' | sed 's@^[a-z]/@@g' | grep '^install_scripts\.sh$' > /dev/null; then
+    # for now, this just reinstalls all scripts. Not the most elegant, but works
+    "${TOPDIR}"/install_scripts.sh --prefix "${EESSI_CVMFS_REPO}"
+fi
+
+# Install full CUDA SDK in host_injections
+# Hardcode the version for now (the installer needs -c VERSION plus explicit EULA acceptance)
+# TODO: We should make a nice yaml and loop over all CUDA versions in that yaml to figure out what to install
+"${EESSI_CVMFS_REPO}"/gpu_support/nvidia/install_cuda_host_injections.sh -c 12.1.1 --accept-cuda-eula
+
+# Install drivers in host_injections
+"${EESSI_CVMFS_REPO}"/gpu_support/nvidia/link_nvidia_host_libraries.sh
+
 # use PR patch file to determine in which easystack files stuff was added
 for easystack_file in $(cat ${pr_diff} | grep '^+++' | cut -f2 -d' ' | sed 's@^[a-z]/@@g' | grep '^easystacks/.*yml$' | egrep -v 'known-issues|missing'); do
 

diff --git a/eessi_container.sh b/eessi_container.sh
@@ -30,8 +30,8 @@
 # -. initial settings & exit codes
 TOPDIR=$(dirname $(realpath $0))
 
-source ${TOPDIR}/scripts/utils.sh
-source ${TOPDIR}/scripts/cfg_files.sh
+source "${TOPDIR}"/scripts/utils.sh
+source "${TOPDIR}"/scripts/cfg_files.sh
 
 # exit codes: bitwise shift codes to allow for combination of exit codes
 # ANY_ERROR_EXITCODE is sourced from ${TOPDIR}/scripts/utils.sh
@@ -46,6 +46,7 @@ SAVE_ERROR_EXITCODE=$((${ANY_ERROR_EXITCODE} << 8))
 HTTP_PROXY_ERROR_EXITCODE=$((${ANY_ERROR_EXITCODE} << 9))
 HTTPS_PROXY_ERROR_EXITCODE=$((${ANY_ERROR_EXITCODE} << 10))
 RUN_SCRIPT_MISSING_EXITCODE=$((${ANY_ERROR_EXITCODE} << 11))
+NVIDIA_MODE_UNKNOWN_EXITCODE=$((${ANY_ERROR_EXITCODE} << 12))
 
 # CernVM-FS settings
 CVMFS_VAR_LIB="var-lib-cvmfs"
@@ -72,12 +73,17 @@ display_help() {
   echo "  -a | --access {ro,rw}  - ro (read-only), rw (read & write) [default: ro]"
   echo "  -c | --container IMG   - image file or URL defining the container to use"
   echo "                           [default: docker://ghcr.io/eessi/build-node:debian11]"
-  echo "  -h | --help            - display this usage information [default: false]"
   echo "  -g | --storage DIR     - directory space on host machine (used for"
   echo "                           temporary data) [default: 1. TMPDIR, 2. /tmp]"
+  echo "  -h | --help            - display this usage information [default: false]"
+  echo "  -i | --host-injections - directory to link to for host_injections "
+  echo "                           [default: /..storage../opt-eessi]"
   echo "  -l | --list-repos      - list available repository identifiers [default: false]"
   echo "  -m | --mode MODE       - with MODE==shell (launch interactive shell) or"
   echo "                           MODE==run (run a script or command) [default: shell]"
+  echo "  -n | --nvidia MODE     - configure the container to work with NVIDIA GPUs,"
+  echo "                           MODE==install for a CUDA installation, MODE==run to"
+  echo "                           attach a GPU, MODE==all for both [default: false]"
   echo "  -r | --repository CFG  - configuration file or identifier defining the"
   echo "                           repository to use [default: EESSI via"
   echo "                           default container, see --container]"
@@ -111,6 +117,8 @@ VERBOSE=0
 STORAGE=
 LIST_REPOS=0
 MODE="shell"
+SETUP_NVIDIA=0
+ADDITIONAL_SINGULARITY_FLAGS=
 REPOSITORY="EESSI"
 RESUME=
 SAVE=
@@ -141,6 +149,10 @@ while [[ $# -gt 0 ]]; do
       display_help
       exit 0
       ;;
+    -i|--host-injections)
+      USER_HOST_INJECTIONS="$2"
+      shift 2
+      ;;
     -l|--list-repos)
       LIST_REPOS=1
       shift 1
@@ -149,6 +161,11 @@ while [[ $# -gt 0 ]]; do
       MODE="$2"
       shift 2
       ;;
+    -n|--nvidia)
+      SETUP_NVIDIA=1
+      NVIDIA_MODE="$2"
+      shift 2
+      ;;
     -r|--repository)
       REPOSITORY="$2"
       shift 2
@@ -224,6 +241,13 @@ if [[ "${MODE}" != "shell" && "${MODE}" != "run" ]]; then
     fatal_error "unknown execution mode '${MODE}'" "${MODE_UNKNOWN_EXITCODE}"
 fi
 
+# Validate the NVIDIA GPU mode, but only when -n|--nvidia was actually given;
+# the only accepted values are: run, install, all
+if [[ ${SETUP_NVIDIA} -eq 1 ]]; then
+    [[ "${NVIDIA_MODE}" =~ ^(run|install|all)$ ]] \
+        || fatal_error "unknown NVIDIA mode '${NVIDIA_MODE}'" "${NVIDIA_MODE_UNKNOWN_EXITCODE}"
+fi
+
 # TODO (arg -r|--repository) check if repository is known
 # REPOSITORY_ERROR_EXITCODE
 if [[ ! -z "${REPOSITORY}" && "${REPOSITORY}" != "EESSI" && ! -r ${EESSI_REPOS_CFG_FILE} ]]; then
@@ -310,12 +334,25 @@ fi
 #      |-overlay-work
 #      |-home
 #      |-repos_cfg
+#      |-opt-eessi (unless otherwise specified for host_injections)
 
 # tmp dir for EESSI
 EESSI_TMPDIR=${EESSI_HOST_STORAGE}
 mkdir -p ${EESSI_TMPDIR}
 [[ ${VERBOSE} -eq 1 ]] && echo "EESSI_TMPDIR=${EESSI_TMPDIR}"
 
+# Set host_injections directory: our default under EESSI_TMPDIR, unless -i|--host-injections was given
+if [[ -z "${USER_HOST_INJECTIONS+x}" ]]; then
+    # Not set, so use our default
+    HOST_INJECTIONS="${EESSI_TMPDIR}/opt-eessi"
+else
+    HOST_INJECTIONS="${USER_HOST_INJECTIONS}"
+fi
+# Make sure the host_injections directory exists and is a folder (quoted, so paths with whitespace work)
+mkdir -p "${HOST_INJECTIONS}" \
+    || fatal_error "host_injections directory ${HOST_INJECTIONS} is either not a directory or cannot be created"
+[[ ${VERBOSE} -eq 1 ]] && echo "HOST_INJECTIONS=${HOST_INJECTIONS}"
+
 # configure Singularity: if SINGULARITY_CACHEDIR is already defined, use that
 #   a global SINGULARITY_CACHEDIR would ensure that we don't consume
 #   storage space again and again for the container & also speed-up
@@ -394,12 +431,34 @@ fi
 [[ ${VERBOSE} -eq 1 ]] && echo "SINGULARITY_HOME=${SINGULARITY_HOME}"
 
 # define paths to add to SINGULARITY_BIND (added later when all BIND mounts are defined)
-BIND_PATHS="${EESSI_CVMFS_VAR_LIB}:/var/lib/cvmfs,${EESSI_CVMFS_VAR_RUN}:/var/run/cvmfs"
+BIND_PATHS="${EESSI_CVMFS_VAR_LIB}:/var/lib/cvmfs,${EESSI_CVMFS_VAR_RUN}:/var/run/cvmfs,${HOST_INJECTIONS}:/opt/eessi"
 # provide a '/tmp' inside the container
 BIND_PATHS="${BIND_PATHS},${EESSI_TMPDIR}:${TMP_IN_CONTAINER}"
 
 [[ ${VERBOSE} -eq 1 ]] && echo "BIND_PATHS=${BIND_PATHS}"
 
+# Configure anything we need for NVIDIA GPUs and CUDA installation
+if [[ ${SETUP_NVIDIA} -eq 1 ]]; then
+    # MODE 'run' or 'all': attach the GPU by giving singularity the appropriate flag
+    if [[ "${NVIDIA_MODE}" =~ ^(run|all)$ ]]; then
+        ADDITIONAL_SINGULARITY_FLAGS="--nv ${ADDITIONAL_SINGULARITY_FLAGS}"
+        [[ ${VERBOSE} -eq 1 ]] && echo "ADDITIONAL_SINGULARITY_FLAGS=${ADDITIONAL_SINGULARITY_FLAGS}"
+    fi
+    # MODE 'install' or 'all': add additional bind mounts to allow CUDA to
+    # install within a container. (Experience tells us that these are
+    # necessary, but we don't know _why_ as the CUDA installer is a black
+    # box. The suspicion is that the CUDA installer gets confused by the
+    # permissions on these directories when inside a container)
+    if [[ "${NVIDIA_MODE}" =~ ^(install|all)$ ]]; then
+        EESSI_VAR_LOG=${EESSI_TMPDIR}/var-log
+        EESSI_USR_LOCAL_CUDA=${EESSI_TMPDIR}/usr-local-cuda
+        mkdir -p ${EESSI_VAR_LOG}
+        mkdir -p ${EESSI_USR_LOCAL_CUDA}
+        BIND_PATHS+=",${EESSI_VAR_LOG}:/var/log,${EESSI_USR_LOCAL_CUDA}:/usr/local/cuda"
+        [[ ${VERBOSE} -eq 1 ]] && echo "BIND_PATHS=${BIND_PATHS}"
+    fi
+fi
+
 # set up repository config (always create directory repos_cfg and populate it with info when
 # arg -r|--repository is used)
 mkdir -p ${EESSI_TMPDIR}/repos_cfg
@@ -562,8 +621,8 @@ if [ ! -z ${EESSI_SOFTWARE_SUBDIR_OVERRIDE} ]; then
 fi
 
 echo "Launching container with command (next line):"
-echo "singularity ${RUN_QUIET} ${MODE} ${EESSI_FUSE_MOUNTS[@]} ${CONTAINER} $@"
-singularity ${RUN_QUIET} ${MODE} "${EESSI_FUSE_MOUNTS[@]}" ${CONTAINER} "$@"
+echo "singularity ${RUN_QUIET} ${MODE} ${ADDITIONAL_SINGULARITY_FLAGS} ${EESSI_FUSE_MOUNTS[@]} ${CONTAINER} $@"
+singularity ${RUN_QUIET} ${MODE} ${ADDITIONAL_SINGULARITY_FLAGS} "${EESSI_FUSE_MOUNTS[@]}" ${CONTAINER} "$@"
 exit_code=$?
 
 # 6. save tmp if requested (arg -s|--save)

diff --git a/gpu_support/nvidia/install_cuda_host_injections.sh b/gpu_support/nvidia/install_cuda_host_injections.sh
@@ -0,0 +1,209 @@
+#!/usr/bin/env bash
+
+# This script can be used to install CUDA under the `.../host_injections` directory.
+# This provides the parts of the CUDA installation that cannot be redistributed as
+# part of EESSI due to license limitations. While GPU-based software from EESSI will
+# _run_ without these, installation of additional CUDA software requires the CUDA
+# installation(s) under `host_injections` to be present.
+#
+# The `host_injections` directory is a variant symlink that by default points to
+# `/opt/eessi`, unless otherwise defined in the local CVMFS configuration (see
+# https://cvmfs.readthedocs.io/en/stable/cpt-repo.html#variant-symlinks). For the
+# installation to be successful, this directory needs to be writeable by the user
+# executing this script.
+
+# Initialise our bash functions (quoted so a checkout path containing whitespace still resolves)
+TOPDIR=$(dirname "$(realpath "${BASH_SOURCE[0]}")")
+source "$TOPDIR"/../../scripts/utils.sh
+
+# Print the usage summary (all supported command line options) to stdout
+show_help() {
+    echo "Usage: $0 [OPTIONS]"
+    echo "Options:"
+    echo "  --help                           Display this help message"
+    echo "  --accept-cuda-eula               You _must_ accept the CUDA EULA to install"
+    echo "                                   CUDA, see the EULA at"
+    echo "                                   https://docs.nvidia.com/cuda/eula/index.html"
+    echo "  -c, --cuda-version CUDA_VERSION  Specify a version of CUDA to install (must"
+    echo "                                   have a corresponding easyconfig in the"
+    echo "                                   EasyBuild release)"
+    echo "  -t, --temp-dir /path/to/tmpdir   Specify a location to use for temporary"
+    echo "                                   storage during the CUDA install"
+    echo "                                   (must have >10GB available)"
+}
+
+# Defaults for the values the option parser below may override
+install_cuda_version=""
+eula_accepted=0
+
+# Walk over the command line, consuming one option (plus its value, if any) per iteration
+while [[ $# -gt 0 ]]; do
+    case "$1" in
+        --help)
+            show_help
+            exit 0
+            ;;
+        -c|--cuda-version)
+            # the version must follow as a separate, non-empty argument
+            if [[ -z "$2" ]]; then
+                echo "Error: Argument required for $1"
+                show_help
+                exit 1
+            fi
+            install_cuda_version="$2"
+            shift 2
+            ;;
+        --accept-cuda-eula)
+            eula_accepted=1
+            shift 1
+            ;;
+        -t|--temp-dir)
+            # the directory must follow as a separate, non-empty argument
+            if [[ -z "$2" ]]; then
+                echo "Error: Argument required for $1"
+                show_help
+                exit 1
+            fi
+            CUDA_TEMP_DIR="$2"
+            shift 2
+            ;;
+        *)
+            show_help
+            fatal_error "Error: Unknown option: $1"
+            ;;
+    esac
+done
+
+# Make sure EESSI is initialised
+check_eessi_initialised
+
+# Make sure the CUDA version supplied is a semantic version
+is_semantic_version() {
+    # Succeeds (returns 0) when $1 consists of exactly three dot-separated
+    # groups of digits (e.g. 12.1.1); returns 1 for anything else.
+    local candidate=$1
+    local semver_pattern='^[0-9]+\.[0-9]+\.[0-9]+$'
+
+    [[ $candidate =~ $semver_pattern ]] && return 0
+
+    return 1
+}
+if ! is_semantic_version "$install_cuda_version"; then
+  show_help
+  error="\nYou must provide a semantic version for CUDA (e.g., 12.1.1) via the appropriate\n"
+  error+="command line option. This script is intended for use with EESSI so the 'correct'\n"
+  error+="version to provide is probably one of those available under\n"
+  error+="$EESSI_SOFTWARE_PATH/software/CUDA\n"
+  fatal_error "${error}"
+fi
+
+# Refuse to continue unless the CUDA EULA was explicitly accepted (--accept-cuda-eula)
+if (( eula_accepted != 1 )); then
+  show_help
+  error="\nYou _must_ accept the CUDA EULA via the appropriate command line option.\n"
+  fatal_error "${error}"
+fi
+
+# As an installation location just use $EESSI_SOFTWARE_PATH but replacing `versions` with `host_injections`
+# (CUDA is a binary installation so no need to worry too much about the EasyBuild setup)
+cuda_install_parent=${EESSI_SOFTWARE_PATH/versions/host_injections}
+
+# Only install CUDA if specified version is not found.
+# (existence of easybuild subdir implies a successful install)
+if [ -d "${cuda_install_parent}"/software/CUDA/"${install_cuda_version}"/easybuild ]; then
+  echo_green "CUDA software found! No need to install CUDA again."
+else
+  # We need to be able to write to the installation space, so let's make sure we can
+  if ! create_directory_structure "${cuda_install_parent}"/software/CUDA ; then
+    fatal_error "No write permissions to directory ${cuda_install_parent}/software/CUDA"
+  fi
+
+  # We need a directory for temporary storage: a fresh mktemp directory when CUDA_TEMP_DIR is empty, else <CUDA_TEMP_DIR>/temp
+  if [[ -z "${CUDA_TEMP_DIR}" ]]; then
+    tmpdir=$(mktemp -d)
+  else
+    tmpdir="${CUDA_TEMP_DIR}"/temp
+    if ! mkdir "$tmpdir" ; then  # plain mkdir (no -p): fails if the directory already exists or its parent is missing
+      fatal_error "Could not create directory ${tmpdir}"
+    fi
+  fi
+
+  required_space_in_tmpdir=50000  # base requirement; presumably in 1K blocks, as it is later compared against df output - TODO confirm
+  # Check whether build and source locations are already defined; if not, put them in the temporary space (and require more room there)
+  if [[ -z "${EASYBUILD_BUILDPATH}" ]]; then
+    export EASYBUILD_BUILDPATH=${tmpdir}/build
+    required_space_in_tmpdir=$((required_space_in_tmpdir + 5000000))
+  fi
+  if [[ -z "${EASYBUILD_SOURCEPATH}" ]]; then
+    export EASYBUILD_SOURCEPATH=${tmpdir}/sources
+    required_space_in_tmpdir=$((required_space_in_tmpdir + 5000000))
+  fi
+
+  # The install is pretty fat, you need lots of space for download/unpack/install (~3*5GB),
+  # need to do a space check before we proceed (-k pins the df output to 1K blocks)
+  avail_space=$(df -k --output=avail "${cuda_install_parent}"/ | tail -n 1 | awk '{print $1}')
+  if (( avail_space < 5000000 )); then
+    fatal_error "Need at least 5GB disk space to install CUDA under ${cuda_install_parent}, exiting now..."
+  fi
+  avail_space=$(df -k --output=avail "${tmpdir}"/ | tail -n 1 | awk '{print $1}')
+  if (( avail_space < required_space_in_tmpdir )); then
+    error="Need at least ${required_space_in_tmpdir} KB disk space under ${tmpdir}.\n"
+    error="${error}Set the environment variable CUDA_TEMP_DIR to a location with adequate space to pass this check.\n"
+    error="${error}You can alternatively set EASYBUILD_BUILDPATH and/or EASYBUILD_SOURCEPATH "
+    error="${error}to reduce this requirement. Exiting now..."
+    fatal_error "${error}"
+  fi
+
+  # Fall back to loading the EasyBuild module when no 'eb' command is available yet
+  if ! command -v eb > /dev/null 2>&1; then
+    echo_yellow "Attempting to load an EasyBuild module to do actual install"
+    # There are some scenarios where this may fail
+    if ! module load EasyBuild; then
+      error="'eb' command not found in your environment and\n"
+      error+="  module load EasyBuild\n"
+      error+="failed for some reason.\n"
+      error+="Please re-run this script with the 'eb' command available."
+      fatal_error "${error}"
+    fi
+  fi
+
+  cuda_easyconfig="CUDA-${install_cuda_version}.eb"
+
+  # Check the easyconfig file is available in the release
+  # (eb search always returns 0, so we need a grep to ensure a usable exit code)
+  # The search patterns are quoted so the shell can neither glob-expand nor split them,
+  # and the "list everything" pattern uses '.*' (regex) rather than a bare '*' (glob)
+  if ! eb --search "^${cuda_easyconfig}" | grep CUDA > /dev/null 2>&1; then
+    eb_version=$(eb --version)
+    available_cuda_easyconfigs=$(eb --search '^CUDA-.*\.eb' | grep CUDA)
+
+    error="The easyconfig ${cuda_easyconfig} was not found in EasyBuild version:\n"
+    error="${error}  ${eb_version}\n"
+    error="${error}You either need to give a different version of CUDA to install _or_ \n"
+    error="${error}use a different version of EasyBuild for the installation.\n"
+    error="${error}\nThe versions available with the current eb command are:\n"
+    error="${error}${available_cuda_easyconfigs}"
+    fatal_error "${error}"
+  fi
+
+  # We need the --rebuild option, as the CUDA module may or may not be on the
+  # `MODULEPATH` yet. Even if it is, we still want to redo this installation
+  # since it will provide the symlinked targets for the parts of the CUDA
+  # installation in the `.../versions/...` prefix
+  # We install the module in our `tmpdir` since we do not need the modulefile,
+  # we only care about providing the targets for the symlinks.
+  # (kept as an array so a tmpdir containing whitespace stays a single argument)
+  extra_args=(--rebuild "--installpath-modules=${tmpdir}")
+
+  # We don't want hooks used in this install, we need a vanilla CUDA installation
+  touch "$tmpdir"/none.py
+  # Run the actual installation; the exit status of eb decides which message we print
+  if eb --prefix="$tmpdir" "${extra_args[@]}" --accept-eula-for=CUDA --hooks="$tmpdir"/none.py --installpath="${cuda_install_parent}"/ "${cuda_easyconfig}"; then
+    echo_green "CUDA installation at ${cuda_install_parent}/software/CUDA/${install_cuda_version} succeeded!"
+  else
+    fatal_error  "CUDA installation failed, please check EasyBuild logs..."
+  fi
+
+  # clean up tmpdir (':?' aborts instead of running rm -rf on an empty path)
+  rm -rf -- "${tmpdir:?}"
+fi