From 3db7a53ace90e3b5191486cc52e3b9c808eb7b02 Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Wed, 18 Oct 2023 14:25:39 +0200 Subject: [PATCH 01/18] Build CUDA under --- .../nvidia/install_cuda_host_injections.sh | 92 +++++++++++++++++++ scripts/utils.sh | 80 +++++++++++++++- 2 files changed, 167 insertions(+), 5 deletions(-) create mode 100644 gpu_support/nvidia/install_cuda_host_injections.sh diff --git a/gpu_support/nvidia/install_cuda_host_injections.sh b/gpu_support/nvidia/install_cuda_host_injections.sh new file mode 100644 index 0000000000..b33efbfedd --- /dev/null +++ b/gpu_support/nvidia/install_cuda_host_injections.sh @@ -0,0 +1,92 @@ +#!/usr/bin/env bash + +# Initialise our bash functions +TOPDIR=$(dirname $(realpath $BASH_SOURCE)) +source "$TOPDIR"/../../scripts/utils.sh + +# Make sure EESSI is initialised +check_eessi_initialised() + +if [[ $# -eq 0 ]] ; then + fatal_error "You must provide the CUDA version as an argument, e.g.:\n $0 11.3.1" +fi +install_cuda_version=$1 +if [[ -z "${EESSI_SOFTWARE_PATH}" ]]; then + fatal_error "This script cannot be used without having first defined EESSI_SOFTWARE_PATH" +else + # As an installation location just use $EESSI_SOFTWARE_PATH but replacing `versions` with `host_injections` + # (CUDA is a binary installation so no need to worry too much about the EasyBuild setup) + cuda_install_parent=${EESSI_SOFTWARE_PATH/versions/host_injections} +fi + +# Only install CUDA if specified version is not found. +# This is only relevant for users, the shipped CUDA installation will +# always be in versions instead of host_injections and have symlinks pointing +# to host_injections for everything we're not allowed to ship +# (existence of easybuild subdir implies a successful install) +if [ -d "${cuda_install_parent}"/software/CUDA/"${install_cuda_version}"/easybuild ]; then + echo_green "CUDA software found! No need to install CUDA again, proceed with testing." +else + # We need to be able write to the installation space so let's make sure we can + if ! create_directory_structure "${cuda_install_parent}"/software/CUDA ; then + fatal_error "No write permissions to directory ${cuda_install_parent}/software/CUDA" + fi + + # we need a directory we can use for temporary storage + if [[ -z "${CUDA_TEMP_DIR}" ]]; then + tmpdir=$(mktemp -d) + else + tmpdir="${CUDA_TEMP_DIR}"/temp + if ! mkdir "$tmpdir" ; then + fatal_error "Could not create directory ${tmpdir}" + fi + fi + + required_space_in_tmpdir=50000 + # Let's see if we have sources and build locations defined if not, we use the temporary space + if [[ -z "${EASYBUILD_BUILDPATH}" ]]; then + export EASYBUILD_BUILDPATH=${tmpdir}/build + required_space_in_tmpdir=$((required_space_in_tmpdir + 5000000)) + fi + if [[ -z "${EASYBUILD_SOURCEPATH}" ]]; then + export EASYBUILD_SOURCEPATH=${tmpdir}/sources + required_space_in_tmpdir=$((required_space_in_tmpdir + 5000000)) + fi + + # The install is pretty fat, you need lots of space for download/unpack/install (~3*5GB), + # need to do a space check before we proceed + avail_space=$(df --output=avail "${cuda_install_parent}"/ | tail -n 1 | awk '{print $1}') + if (( avail_space < 5000000 )); then + fatal_error "Need at least 5GB disk space to install CUDA under ${cuda_install_parent}, exiting now..." 
+ fi + avail_space=$(df --output=avail "${tmpdir}"/ | tail -n 1 | awk '{print $1}') + if (( avail_space < required_space_in_tmpdir )); then + error="Need at least ${required_space_in_tmpdir} disk space under ${tmpdir}.\n" + error="${error}Set the environment variable CUDA_TEMP_DIR to a location with adequate space to pass this check." + error="${error}You can alternatively set EASYBUILD_BUILDPATH and/or EASYBUILD_SOURCEPATH " + error="${error}to reduce this requirement. Exiting now..." + fatal_error "${error}" + fi + + if [[ -z "${EBROOTEASYBUILD}" ]]; then + echo_yellow "Loading EasyBuild module to do actual install" + module load EasyBuild + fi + + # we need the --rebuild option and a (random) dir for the module since we are + # fixing the broken links of the EESSI-shipped installation + extra_args="--rebuild --installpath-modules=${tmpdir}" + + # We don't want hooks used in this install, we need a vanilla CUDA installation + touch "$tmpdir"/none.py + # shellcheck disable=SC2086 # Intended splitting of extra_args + eb --prefix="$tmpdir" ${extra_args} --hooks="$tmpdir"/none.py --installpath="${cuda_install_parent}"/ CUDA-"${install_cuda_version}".eb + ret=$? + if [ $ret -ne 0 ]; then + fatal_error "CUDA installation failed, please check EasyBuild logs..." + else + echo_green "CUDA installation at ${cuda_install_parent}/software/CUDA/${install_cuda_version} succeeded!" + fi + # clean up tmpdir + rm -rf "${tmpdir}" +fi diff --git a/scripts/utils.sh b/scripts/utils.sh index d0da95e87f..07760f0dd0 100644 --- a/scripts/utils.sh +++ b/scripts/utils.sh @@ -14,7 +14,7 @@ ANY_ERROR_EXITCODE=1 function fatal_error() { echo_red "ERROR: $1" >&2 if [[ $# -gt 1 ]]; then - exit $2 + exit "$2" else exit "${ANY_ERROR_EXITCODE}" fi @@ -32,11 +32,81 @@ function check_exit_code { fi } +function check_eessi_initialised() { + if [[ -z "${EESSI_SOFTWARE_PATH}" ]]; then + fatal_error "EESSI has not been initialised!" + else + return 0 + fi +} + +function float_greater_than() { + # Make sure we have two arguments + if [ $# -ne 2 ]; then + echo_red "greater_than_float requires two (float) numbers" >&2 + return $ANY_ERROR_EXITCODE + fi + # Make sure the arguments are numbers + if [[ ! $1 =~ ^[+-]?[0-9]+\.?[0-9]*$ ]]; then + echo_yellow "Input to float_greater_than is not a float, ignoring" + return $ANY_ERROR_EXITCODE + fi + if [[ ! $2 =~ ^[+-]?[0-9]+\.?[0-9]*$ ]]; then + echo_yellow "Input to float_greater_than is not a float, ignoring" + return $ANY_ERROR_EXITCODE + fi + # Now do the actual evaluation + return_code=$ANY_ERROR_EXITCODE + result=$(echo "$1" "$2" | awk '{if ($1 > $2) print "true"}') + if [ "$result" = true ] ; then + return_code=0 + fi + return $return_code +} + +function check_in_prefix_shell() { + # Make sure EPREFIX is defined + if [[ -z "${EPREFIX}" ]]; then + fatal_error "This script cannot be used without having first defined EPREFIX" + fi + if [[ ! ${SHELL} = ${EPREFIX}/bin/bash ]]; then + fatal_error "Not running in Gentoo Prefix environment, run '${EPREFIX}/startprefix' first!" + fi +} + +function create_directory_structure() { + # Ensure we are given a single path argument + if [ $# -ne 1 ]; then + echo_red "Function requires a single (relative or absolute) path argument" >&2 + return $ANY_ERROR_EXITCODE + fi + dir_structure="$1" + + # Attempt to create the directory structure + error_message=$(mkdir -p "$dir_structure" 2>&1) + return_code=$? 
+ # If it fails be explicit about the error + if [ ${return_code} -ne 0 ]; then + real_dir=$(realpath -m "$dir_structure") + echo_red "Creating ${dir_structure} (real path ${real_dir}) failed with:\n ${error_message}" >&2 + else + # If we're creating it, our use case is that we want to be able to write there + # (this is a check in case the directory already existed) + if [ ! -w "${dir_structure}" ]; then + real_dir=$(realpath -m "$dir_structure") + echo_red "You do not have (required) write permissions to ${dir_structure} (real path ${real_dir})!" + return_code=$ANY_ERROR_EXITCODE + fi + fi + + return $return_code +} + function get_path_for_tool { tool_name=$1 tool_envvar_name=$2 - which_out=$(which ${tool_name} 2>&1) + which_out=$(which "${tool_name}" 2>&1) exit_code=$? if [[ ${exit_code} -eq 0 ]]; then echo "INFO: found tool ${tool_name} in PATH (${which_out})" >&2 @@ -68,7 +138,7 @@ function get_host_from_url { url=$1 re="(http|https)://([^/:]+)" if [[ $url =~ $re ]]; then - echo ${BASH_REMATCH[2]} + echo "${BASH_REMATCH[2]}" return 0 else echo "" @@ -80,7 +150,7 @@ function get_port_from_url { url=$1 re="(http|https)://[^:]+:([0-9]+)" if [[ $url =~ $re ]]; then - echo ${BASH_REMATCH[2]} + echo "${BASH_REMATCH[2]}" return 0 else echo "" @@ -90,7 +160,7 @@ function get_port_from_url { function get_ipv4_address { hname=$1 - hipv4=$(grep ${hname} /etc/hosts | grep -v '^[[:space:]]*#' | cut -d ' ' -f 1) + hipv4=$(grep "${hname}" /etc/hosts | grep -v '^[[:space:]]*#' | cut -d ' ' -f 1) # TODO try other methods if the one above does not work --> tool that verifies # what method can be used? echo "${hipv4}" From 82893a136bad5c0a3125c91aaa2bf354f70c7450 Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Wed, 18 Oct 2023 14:32:22 +0200 Subject: [PATCH 02/18] Make script executable --- gpu_support/nvidia/install_cuda_host_injections.sh | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 gpu_support/nvidia/install_cuda_host_injections.sh diff --git a/gpu_support/nvidia/install_cuda_host_injections.sh b/gpu_support/nvidia/install_cuda_host_injections.sh old mode 100644 new mode 100755 From 3797953057ff83e1e82c1be0232ee9aa61e18b1f Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Wed, 18 Oct 2023 14:42:23 +0200 Subject: [PATCH 03/18] Fix calling function --- gpu_support/nvidia/install_cuda_host_injections.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gpu_support/nvidia/install_cuda_host_injections.sh b/gpu_support/nvidia/install_cuda_host_injections.sh index b33efbfedd..d6278ff3d7 100755 --- a/gpu_support/nvidia/install_cuda_host_injections.sh +++ b/gpu_support/nvidia/install_cuda_host_injections.sh @@ -5,7 +5,7 @@ TOPDIR=$(dirname $(realpath $BASH_SOURCE)) source "$TOPDIR"/../../scripts/utils.sh # Make sure EESSI is initialised -check_eessi_initialised() +check_eessi_initialised if [[ $# -eq 0 ]] ; then fatal_error "You must provide the CUDA version as an argument, e.g.:\n $0 11.3.1" From 445048a248bf61ae82bb0456acf242fcb48428ed Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Thu, 19 Oct 2023 15:07:33 +0200 Subject: [PATCH 04/18] Make additional bind mounts for CUDA install to succeed in container --- build_container.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/build_container.sh b/build_container.sh index 23a9e665c9..42f7afade9 100755 --- a/build_container.sh +++ b/build_container.sh @@ -24,12 +24,12 @@ echo "Using $EESSI_TMPDIR as parent for temporary directories..." 
# create temporary directories mkdir -p $EESSI_TMPDIR/{home,overlay-upper,overlay-work} -mkdir -p $EESSI_TMPDIR/{var-lib-cvmfs,var-run-cvmfs} +mkdir -p $EESSI_TMPDIR/{var-lib-cvmfs,var-run-cvmfs,var-log,opt-eessi,usr-local-cuda} # configure Singularity export SINGULARITY_CACHEDIR=$EESSI_TMPDIR/singularity_cache # take into account that $SINGULARITY_BIND may be defined already, to bind additional paths into the build container -BIND_PATHS="$EESSI_TMPDIR/var-run-cvmfs:/var/run/cvmfs,$EESSI_TMPDIR/var-lib-cvmfs:/var/lib/cvmfs,$EESSI_TMPDIR" +BIND_PATHS="$EESSI_TMPDIR/var-log:/var/log,$EESSI_TMPDIR/usr-local-cuda:/usr/local/cuda,$EESSI_TMPDIR/var-run-cvmfs:/var/run/cvmfs,$EESSI_TMPDIR/var-lib-cvmfs:/var/lib/cvmfs,$EESSI_TMPDIR/opt-eessi:/opt/eessi,$EESSI_TMPDIR" if [ -z $SINGULARITY_BIND ]; then export SINGULARITY_BIND="$BIND_PATHS" else From 617c8ada42cf8ca3421e9a8d7f41453d7697309e Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Tue, 31 Oct 2023 16:12:45 +0100 Subject: [PATCH 05/18] Switch GPU support to eessi_container.sh --- build_container.sh | 4 +-- eessi_container.sh | 64 +++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 62 insertions(+), 6 deletions(-) diff --git a/build_container.sh b/build_container.sh index 42f7afade9..23a9e665c9 100755 --- a/build_container.sh +++ b/build_container.sh @@ -24,12 +24,12 @@ echo "Using $EESSI_TMPDIR as parent for temporary directories..." # create temporary directories mkdir -p $EESSI_TMPDIR/{home,overlay-upper,overlay-work} -mkdir -p $EESSI_TMPDIR/{var-lib-cvmfs,var-run-cvmfs,var-log,opt-eessi,usr-local-cuda} +mkdir -p $EESSI_TMPDIR/{var-lib-cvmfs,var-run-cvmfs} # configure Singularity export SINGULARITY_CACHEDIR=$EESSI_TMPDIR/singularity_cache # take into account that $SINGULARITY_BIND may be defined already, to bind additional paths into the build container -BIND_PATHS="$EESSI_TMPDIR/var-log:/var/log,$EESSI_TMPDIR/usr-local-cuda:/usr/local/cuda,$EESSI_TMPDIR/var-run-cvmfs:/var/run/cvmfs,$EESSI_TMPDIR/var-lib-cvmfs:/var/lib/cvmfs,$EESSI_TMPDIR/opt-eessi:/opt/eessi,$EESSI_TMPDIR" +BIND_PATHS="$EESSI_TMPDIR/var-run-cvmfs:/var/run/cvmfs,$EESSI_TMPDIR/var-lib-cvmfs:/var/lib/cvmfs,$EESSI_TMPDIR" if [ -z $SINGULARITY_BIND ]; then export SINGULARITY_BIND="$BIND_PATHS" else diff --git a/eessi_container.sh b/eessi_container.sh index 48c4653ba9..b58bddf309 100755 --- a/eessi_container.sh +++ b/eessi_container.sh @@ -46,6 +46,7 @@ SAVE_ERROR_EXITCODE=$((${ANY_ERROR_EXITCODE} << 8)) HTTP_PROXY_ERROR_EXITCODE=$((${ANY_ERROR_EXITCODE} << 9)) HTTPS_PROXY_ERROR_EXITCODE=$((${ANY_ERROR_EXITCODE} << 10)) RUN_SCRIPT_MISSING_EXITCODE=$((${ANY_ERROR_EXITCODE} << 11)) +NVIDIA_MODE_UNKNOWN_EXITCODE=$((${ANY_ERROR_EXITCODE} << 12)) # CernVM-FS settings CVMFS_VAR_LIB="var-lib-cvmfs" @@ -72,12 +73,17 @@ display_help() { echo " -a | --access {ro,rw} - ro (read-only), rw (read & write) [default: ro]" echo " -c | --container IMG - image file or URL defining the container to use" echo " [default: docker://ghcr.io/eessi/build-node:debian11]" - echo " -h | --help - display this usage information [default: false]" echo " -g | --storage DIR - directory space on host machine (used for" echo " temporary data) [default: 1. TMPDIR, 2. 
/tmp]" + echo " -h | --help - display this usage information [default: false]" + echo " -i | --host-injections - directory to link to for host_injections " + echo " [default: /..storage../opt-eessi]" echo " -l | --list-repos - list available repository identifiers [default: false]" echo " -m | --mode MODE - with MODE==shell (launch interactive shell) or" echo " MODE==run (run a script or command) [default: shell]" + echo " -n | --nvidia MODE - configure the container to work with NVIDIA GPUs," + echo " MODE==install for a CUDA installation, MODE==run to" + echo " attach a GPU, MODE==all for both [default: false]" echo " -r | --repository CFG - configuration file or identifier defining the" echo " repository to use [default: EESSI-pilot via" echo " default container, see --container]" @@ -111,6 +117,8 @@ VERBOSE=0 STORAGE= LIST_REPOS=0 MODE="shell" +SETUP_NVIDIA=0 +ADDITIONAL_SINGULARITY_FLAGS= REPOSITORY="EESSI-pilot" RESUME= SAVE= @@ -141,6 +149,10 @@ while [[ $# -gt 0 ]]; do display_help exit 0 ;; + -i|--host-injections) + USER_HOST_INJECTIONS="$2" + shift 2 + ;; -l|--list-repos) LIST_REPOS=1 shift 1 @@ -149,6 +161,11 @@ while [[ $# -gt 0 ]]; do MODE="$2" shift 2 ;; + -n|--nvidia) + SETUP_NVIDIA=1 + NVIDIA_MODE="$2" + shift 2 + ;; -r|--repository) REPOSITORY="$2" shift 2 @@ -224,6 +241,13 @@ if [[ "${MODE}" != "shell" && "${MODE}" != "run" ]]; then fatal_error "unknown execution mode '${MODE}'" "${MODE_UNKNOWN_EXITCODE}" fi +# Also validate the NVIDIA GPU mode (if present) +if [[ ${SETUP_NVIDIA} -eq 1 ]]; then + if [[ "${NVIDIA_MODE}" != "run" && "${NVIDIA_MODE}" != "install" && "${NVIDIA_MODE}" != "all" ]]; then + fatal_error "unknown NVIDIA mode '${NVIDIA_MODE}'" "${NVIDIA_MODE_UNKNOWN_EXITCODE}" + fi +fi + # TODO (arg -r|--repository) check if repository is known # REPOSITORY_ERROR_EXITCODE if [[ ! -z "${REPOSITORY}" && "${REPOSITORY}" != "EESSI-pilot" && ! -r ${EESSI_REPOS_CFG_FILE} ]]; then @@ -294,6 +318,7 @@ else echo "Using ${EESSI_HOST_STORAGE} as tmp directory (to resume session add '--resume ${EESSI_HOST_STORAGE}')." fi + # if ${RESUME} is a file (assume a tgz), unpack it into ${EESSI_HOST_STORAGE} if [[ ! 
-z ${RESUME} && -f ${RESUME} ]]; then tar xf ${RESUME} -C ${EESSI_HOST_STORAGE} @@ -310,12 +335,25 @@ fi # |-overlay-work # |-home # |-repos_cfg +# |-opt-eessi (unless otherwise specificed for host_injections) # tmp dir for EESSI EESSI_TMPDIR=${EESSI_HOST_STORAGE} mkdir -p ${EESSI_TMPDIR} [[ ${VERBOSE} -eq 1 ]] && echo "EESSI_TMPDIR=${EESSI_TMPDIR}" +# Set host_injections directory and ensure it is a writable directory (if user provided) +if [ -z ${USER_HOST_INJECTIONS+x} ]; then + # Not set, so use our default + HOST_INJECTIONS=${EESSI_TMPDIR}/opt-eessi + mkdir -p $HOST_INJECTIONS +else + # Make sure the host_injections directory specified exists and is a folder + mkdir -p ${USER_HOST_INJECTIONS} || fatal_error "host_injections directory ${USER_HOST_INJECTIONS} is either not a directory or cannot be created" + HOST_INJECTIONS=${USER_HOST_INJECTIONS} +fi +[[ ${VERBOSE} -eq 1 ]] && echo "HOST_INJECTIONS=${HOST_INJECTIONS}" + # configure Singularity: if SINGULARITY_CACHEDIR is already defined, use that # a global SINGULARITY_CACHEDIR would ensure that we don't consume # storage space again and again for the container & also speed-up @@ -394,12 +432,30 @@ fi [[ ${VERBOSE} -eq 1 ]] && echo "SINGULARITY_HOME=${SINGULARITY_HOME}" # define paths to add to SINGULARITY_BIND (added later when all BIND mounts are defined) -BIND_PATHS="${EESSI_CVMFS_VAR_LIB}:/var/lib/cvmfs,${EESSI_CVMFS_VAR_RUN}:/var/run/cvmfs" +BIND_PATHS="${EESSI_CVMFS_VAR_LIB}:/var/lib/cvmfs,${EESSI_CVMFS_VAR_RUN}:/var/run/cvmfs,${HOST_INJECTIONS}:/opt/eessi" # provide a '/tmp' inside the container BIND_PATHS="${BIND_PATHS},${EESSI_TMPDIR}:${TMP_IN_CONTAINER}" [[ ${VERBOSE} -eq 1 ]] && echo "BIND_PATHS=${BIND_PATHS}" +# Configure anything we need for NVIDIA GPUs and CUDA installation +if [[ ${SETUP_NVIDIA} -eq 1 ]]; then + if [[ "${NVIDIA_MODE}" == "run" || "${NVIDIA_MODE}" == "all" ]]; then + # Give singularity the appropriate flag + ADDITIONAL_SINGULARITY_FLAGS="--nv ${ADDITIONAL_SINGULARITY_FLAGS}" + [[ ${VERBOSE} -eq 1 ]] && echo "ADDITIONAL_SINGULARITY_FLAGS=${ADDITIONAL_SINGULARITY_FLAGS}" + fi + if [[ "${NVIDIA_MODE}" == "install" || "${NVIDIA_MODE}" == "all" ]]; then + # Add additional bind mounts to allow CUDA to install within a container + EESSI_VAR_LOG=${EESSI_TMPDIR}/var-log + EESSI_USR_LOCAL_CUDA=${EESSI_TMPDIR}/usr-local-cuda + mkdir -p ${EESSI_VAR_LOG} + mkdir -p ${EESSI_USR_LOCAL_CUDA} + BIND_PATHS="${BIND_PATHS},${EESSI_VAR_LOG}:/var/log,${EESSI_USR_LOCAL_CUDA}:/usr/local/cuda" + [[ ${VERBOSE} -eq 1 ]] && echo "BIND_PATHS=${BIND_PATHS}" + fi +fi + # set up repository config (always create directory repos_cfg and populate it with info when # arg -r|--repository is used) mkdir -p ${EESSI_TMPDIR}/repos_cfg @@ -558,8 +614,8 @@ if [ ! -z ${EESSI_SOFTWARE_SUBDIR_OVERRIDE} ]; then fi echo "Launching container with command (next line):" -echo "singularity ${RUN_QUIET} ${MODE} ${EESSI_FUSE_MOUNTS[@]} ${CONTAINER} $@" -singularity ${RUN_QUIET} ${MODE} "${EESSI_FUSE_MOUNTS[@]}" ${CONTAINER} "$@" +echo "singularity ${RUN_QUIET} ${MODE} ${ADDITIONAL_SINGULARITY_FLAGS} ${EESSI_FUSE_MOUNTS[@]} ${CONTAINER} $@" +singularity ${RUN_QUIET} ${MODE} ${ADDITIONAL_SINGULARITY_FLAGS} "${EESSI_FUSE_MOUNTS[@]}" ${CONTAINER} "$@" exit_code=$? # 6. 
save tmp if requested (arg -s|--save) From 090351864cbc5b97f35498b278d16041a5b97b1e Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Thu, 30 Nov 2023 11:16:55 +0100 Subject: [PATCH 06/18] Address review and improve UI and error reporting --- eessi_container.sh | 8 +- .../nvidia/install_cuda_host_injections.sh | 132 ++++++++++++++++-- scripts/utils.sh | 24 ---- 3 files changed, 125 insertions(+), 39 deletions(-) diff --git a/eessi_container.sh b/eessi_container.sh index b58bddf309..e31808d546 100755 --- a/eessi_container.sh +++ b/eessi_container.sh @@ -30,8 +30,8 @@ # -. initial settings & exit codes TOPDIR=$(dirname $(realpath $0)) -source ${TOPDIR}/scripts/utils.sh -source ${TOPDIR}/scripts/cfg_files.sh +source "${TOPDIR}"/scripts/utils.sh +source "${TOPDIR}"/scripts/cfg_files.sh # exit codes: bitwise shift codes to allow for combination of exit codes # ANY_ERROR_EXITCODE is sourced from ${TOPDIR}/scripts/utils.sh @@ -447,6 +447,10 @@ if [[ ${SETUP_NVIDIA} -eq 1 ]]; then fi if [[ "${NVIDIA_MODE}" == "install" || "${NVIDIA_MODE}" == "all" ]]; then # Add additional bind mounts to allow CUDA to install within a container + # (Experience tells us that these are necessary, but we don't know _why_ + # as the CUDA installer is a black box. The suspicion is that the CUDA + # installer gets confused by the permissions on these directories when + # inside a container) EESSI_VAR_LOG=${EESSI_TMPDIR}/var-log EESSI_USR_LOCAL_CUDA=${EESSI_TMPDIR}/usr-local-cuda mkdir -p ${EESSI_VAR_LOG} diff --git a/gpu_support/nvidia/install_cuda_host_injections.sh b/gpu_support/nvidia/install_cuda_host_injections.sh index d6278ff3d7..2c59a891f8 100755 --- a/gpu_support/nvidia/install_cuda_host_injections.sh +++ b/gpu_support/nvidia/install_cuda_host_injections.sh @@ -1,16 +1,94 @@ #!/usr/bin/env bash +# This script can be used to install CUDA under the `.../host_injections` directory. +# This provides the parts of the CUDA installation that cannot be redistributed as +# part of EESSI due to license limitations. While GPU-based software from EESSI will +# _run_ without these, installation of additional CUDA software requires the CUDA +# installation(s) under `host_injections` to be present. +# +# The `host_injections` directory is a variant symlink that by default points to +# `/opt/eessi`, unless otherwise defined in the local CVMFS configuration (see +# https://cvmfs.readthedocs.io/en/stable/cpt-repo.html#variant-symlinks). For the +# installation to be successful, this directory needs to be writeable by the user +# executing this script. 
+ # Initialise our bash functions TOPDIR=$(dirname $(realpath $BASH_SOURCE)) source "$TOPDIR"/../../scripts/utils.sh +# Function to display help message +show_help() { + echo "Usage: $0 [OPTIONS]" + echo "Options:" + echo " --help Display this help message" + echo " -c, --cuda-version CUDA_VERSION Specify a version o CUDA to install (must" + echo " have a corresponding easyconfig in the" + echo " EasyBuild release)" + echo " -t, --temp-dir /path/to/tmpdir Specify a location to use for temporary" + echo " storage during the CUDA install" + echo " (must have >10GB available)" +} + +# Initialize variables +install_cuda_version="" + +# Parse command-line options +while [[ $# -gt 0 ]]; do + case "$1" in + --help) + show_help + exit 0 + ;; + -c|--cuda-version) + if [ -n "$2" ]; then + install_cuda_version="$2" + shift 2 + else + echo "Error: Argument required for $1" + show_help + exit 1 + fi + ;; + -t|--temp-dir) + if [ -n "$2" ]; then + CUDA_TEMP_DIR="$2" + shift 2 + else + echo "Error: Argument required for $1" + show_help + exit 1 + fi + ;; + *) + show_help + fatal_error "Error: Unknown option: $1" + ;; + esac +done + +# Make sure the CUDA version supplied is a semantic version +is_semantic_version() { + local version=$1 + local regex='^[0-9]+\.[0-9]+\.[0-9]+$' + + if [[ $version =~ $regex ]]; then + return 0 # Return success (0) if it's a semantic version + else + return 1 # Return failure (1) if it's not a semantic version + fi +} +if ! is_semantic_version "$install_cuda_version"; then + show_help + error="\nYou must provide a semantic version for CUDA (e.g., 12.1.1) via the appropriate\n" + error="${error}command line option. This script is intended for use with EESSI so the 'correct'\n" + error="${error}version to provide is probably the one that is available under\n" + error="${error}$EESSI_SOFTWARE_PATH/software/CUDA\n" + fatal_error "${error}" +fi + # Make sure EESSI is initialised check_eessi_initialised -if [[ $# -eq 0 ]] ; then - fatal_error "You must provide the CUDA version as an argument, e.g.:\n $0 11.3.1" -fi -install_cuda_version=$1 if [[ -z "${EESSI_SOFTWARE_PATH}" ]]; then fatal_error "This script cannot be used without having first defined EESSI_SOFTWARE_PATH" else @@ -20,12 +98,9 @@ else fi # Only install CUDA if specified version is not found. -# This is only relevant for users, the shipped CUDA installation will -# always be in versions instead of host_injections and have symlinks pointing -# to host_injections for everything we're not allowed to ship # (existence of easybuild subdir implies a successful install) if [ -d "${cuda_install_parent}"/software/CUDA/"${install_cuda_version}"/easybuild ]; then - echo_green "CUDA software found! No need to install CUDA again, proceed with testing." + echo_green "CUDA software found! No need to install CUDA again." else # We need to be able write to the installation space so let's make sure we can if ! create_directory_structure "${cuda_install_parent}"/software/CUDA ; then @@ -68,19 +143,50 @@ else fatal_error "${error}" fi - if [[ -z "${EBROOTEASYBUILD}" ]]; then - echo_yellow "Loading EasyBuild module to do actual install" + if ! command -v "eb" &>/dev/null; then + echo_yellow "Attempting to load an EasyBuild module to do actual install" module load EasyBuild + # There are some scenarios where this may fail + if [ $? 
-ne 0 ]; then
+            error="'eb' command not found in your environment and\n"
+            error="${error}  module load EasyBuild\n"
+            error="${error}failed for some reason.\n"
+            error="${error}Please re-run this script with the 'eb' command available."
+            fatal_error "${error}"
+        fi
+    fi
+
+    cuda_easyconfig="CUDA-${install_cuda_version}.eb"
+
+    # Check the easyconfig file is available in the release
+    # (eb search always returns 0, so we need a grep to ensure a usable exit code)
+    eb --search ^${cuda_easyconfig}|grep CUDA > /dev/null 2>&1
+    # Check the exit code
+    if [ $? -ne 0 ]; then
+        eb_version=$(eb --version)
+        available_cuda_easyconfigs=$(eb --search ^CUDA-*.eb|grep CUDA)
+
+        error="The easyconfig ${cuda_easyconfig} was not found in EasyBuild version:\n"
+        error="${error}  ${eb_version}\n"
+        error="${error}You either need to give a different version of CUDA to install _or_ \n"
+        error="${error}use a different version of EasyBuild for the installation.\n"
+        error="${error}\nThe versions of CUDA available with the current eb command are:\n"
+        error="${error}${available_cuda_easyconfigs}"
+        fatal_error "${error}"
     fi
 
-    # we need the --rebuild option and a (random) dir for the module since we are
-    # fixing the broken links of the EESSI-shipped installation
+    # We need the --rebuild option, as the CUDA module may or may not be on the
+    # `MODULEPATH` yet. Even if it is, we still want to redo this installation
+    # since it will provide the symlinked targets for the parts of the CUDA
+    # installation in the `.../versions/...` prefix
+    # We install the module in our `tmpdir` since we do not need the modulefile,
+    # we only care about providing the targets for the symlinks.
     extra_args="--rebuild --installpath-modules=${tmpdir}"
 
     # We don't want hooks used in this install, we need a vanilla CUDA installation
     touch "$tmpdir"/none.py
     # shellcheck disable=SC2086 # Intended splitting of extra_args
-    eb --prefix="$tmpdir" ${extra_args} --hooks="$tmpdir"/none.py --installpath="${cuda_install_parent}"/ CUDA-"${install_cuda_version}".eb
+    eb --prefix="$tmpdir" ${extra_args} --hooks="$tmpdir"/none.py --installpath="${cuda_install_parent}"/ "${cuda_easyconfig}"
     ret=$?
     if [ $ret -ne 0 ]; then
         fatal_error "CUDA installation failed, please check EasyBuild logs..."
diff --git a/scripts/utils.sh b/scripts/utils.sh
index 07760f0dd0..b2be3f6221 100644
--- a/scripts/utils.sh
+++ b/scripts/utils.sh
@@ -40,30 +40,6 @@ function check_eessi_initialised() {
     fi
 }
 
-function float_greater_than() {
-    # Make sure we have two arguments
-    if [ $# -ne 2 ]; then
-        echo_red "greater_than_float requires two (float) numbers" >&2
-        return $ANY_ERROR_EXITCODE
-    fi
-    # Make sure the arguments are numbers
-    if [[ ! $1 =~ ^[+-]?[0-9]+\.?[0-9]*$ ]]; then
-        echo_yellow "Input to float_greater_than is not a float, ignoring"
-        return $ANY_ERROR_EXITCODE
-    fi
-    if [[ ! 
$2 =~ ^[+-]?[0-9]+\.?[0-9]*$ ]]; then - echo_yellow "Input to float_greater_than is not a float, ignoring" - return $ANY_ERROR_EXITCODE - fi - # Now do the actual evaluation - return_code=$ANY_ERROR_EXITCODE - result=$(echo "$1" "$2" | awk '{if ($1 > $2) print "true"}') - if [ "$result" = true ] ; then - return_code=0 - fi - return $return_code -} - function check_in_prefix_shell() { # Make sure EPREFIX is defined if [[ -z "${EPREFIX}" ]]; then From 12719eccaabf57112332cc4a4958d0d03e382ef4 Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Thu, 30 Nov 2023 12:56:11 +0100 Subject: [PATCH 07/18] Make sure users are forced to accept CUDA EULA --- .../nvidia/install_cuda_host_injections.sh | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/gpu_support/nvidia/install_cuda_host_injections.sh b/gpu_support/nvidia/install_cuda_host_injections.sh index 2c59a891f8..79af986a0a 100755 --- a/gpu_support/nvidia/install_cuda_host_injections.sh +++ b/gpu_support/nvidia/install_cuda_host_injections.sh @@ -21,6 +21,9 @@ show_help() { echo "Usage: $0 [OPTIONS]" echo "Options:" echo " --help Display this help message" + echo " --accept-cuda-eula You _must_ accept the CUDA EULA to install" + echo " CUDA, see the EULA at" + echo " https://docs.nvidia.com/cuda/eula/index.html" echo " -c, --cuda-version CUDA_VERSION Specify a version o CUDA to install (must" echo " have a corresponding easyconfig in the" echo " EasyBuild release)" @@ -31,6 +34,7 @@ show_help() { # Initialize variables install_cuda_version="" +eula_accepted=0 # Parse command-line options while [[ $# -gt 0 ]]; do @@ -49,6 +53,10 @@ while [[ $# -gt 0 ]]; do exit 1 fi ;; + --accept-cuda-eula) + eula_accepted=1 + shift 1 + ;; -t|--temp-dir) if [ -n "$2" ]; then CUDA_TEMP_DIR="$2" @@ -86,6 +94,13 @@ if ! is_semantic_version "$install_cuda_version"; then fatal_error "${error}" fi +# Make sure they have accepted the CUDA EULA +if [ "$eula_accepted" -ne 1 ]; then + show_help + error="\nYou _must_ accept the CUDA EULA via the appropriate command line option.\n" + fatal_error "${error}" +fi + # Make sure EESSI is initialised check_eessi_initialised @@ -186,7 +201,7 @@ else # We don't want hooks used in this install, we need a vanilla CUDA installation touch "$tmpdir"/none.py # shellcheck disable=SC2086 # Intended splitting of extra_args - eb --prefix="$tmpdir" ${extra_args} --hooks="$tmpdir"/none.py --installpath="${cuda_install_parent}"/ "${cuda_easyconfig}" + eb --prefix="$tmpdir" ${extra_args} --accept-eula-for=CUDA --hooks="$tmpdir"/none.py --installpath="${cuda_install_parent}"/ "${cuda_easyconfig}" ret=$? if [ $ret -ne 0 ]; then fatal_error "CUDA installation failed, please check EasyBuild logs..." 
From b005591069f055c4a5c206216279fd428878b887 Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Thu, 30 Nov 2023 14:50:16 +0100 Subject: [PATCH 08/18] Add script to link in host NVIDIA drivers --- .../nvidia/link_nvidia_host_libraries.sh | 128 ++++++++++++++++++ 1 file changed, 128 insertions(+) create mode 100755 gpu_support/nvidia/link_nvidia_host_libraries.sh diff --git a/gpu_support/nvidia/link_nvidia_host_libraries.sh b/gpu_support/nvidia/link_nvidia_host_libraries.sh new file mode 100755 index 0000000000..eb713fce1b --- /dev/null +++ b/gpu_support/nvidia/link_nvidia_host_libraries.sh @@ -0,0 +1,128 @@ +#!/bin/bash + +# This script links host libraries related to GPU drivers to a location where +# they can be found by the EESSI linker + +# Initialise our bash functions +TOPDIR=$(dirname $(realpath $BASH_SOURCE)) +source "$TOPDIR"/../../scripts/utils.sh + +# We rely on ldconfig to give us the location of the libraries on the host +command_name="ldconfig" +# We cannot use a version of ldconfig that's being shipped under CVMFS +exclude_prefix="/cvmfs" + +found_paths=() +# Always attempt to use /sbin/ldconfig +if [ -x "/sbin/$command_name" ]; then + found_paths+=("$dir/$command_name") +fi +IFS=':' read -ra path_dirs <<< "$PATH" +for dir in "${path_dirs[@]}"; do + if [[ ! "$dir" =~ ^$exclude_prefix ]]; then + if [ -x "$dir/$command_name" ]; then + found_paths+=("$dir/$command_name") + fi + fi +done + +if [ ${#found_paths[@]} -gt 0 ]; then + echo "Found $command_name in the following locations:" + printf -- "- %s\n" "${found_paths[@]}" + echo "Using first version" + host_ldconfig=${found_paths[0]} +else + error="$command_name not found in PATH or only found in paths starting with $exclude_prefix." + fatal_error $error +fi + +# Make sure EESSI is initialised (doesn't matter what version) +check_eessi_initialised + +# Find the CUDA version of the host CUDA drivers +# (making sure that this can still work inside prefix environment inside a container) +nvidia_smi_command="LD_LIBRARY_PATH=/.singularity/libs:$LD_LIBRARY_PATH nvidia-smi --query-gpu=driver_version --format=csv,noheader" +if $nvidia_smi_command; then + host_cuda_version=$($nvidia_smi_command | tail -n1) +else + error="Failed to successfully execute\n $nvidia_smi_command\n" + fatal_error $error +fi + +# Let's make sure the driver libraries are not already in place +link_drivers=1 + +host_injections_nvidia_dir="/cvmfs/pilot.eessi-hpc.org/host_injections/${EESSI_CPU_FAMILY}/nvidia" +host_injection_driver_dir="${host_injections_nvidia_dir}/host" +host_injection_driver_version_file="$host_injection_driver_dir/version.txt" +if [ -e "$host_injection_driver_version_file" ]; then + if grep -q "$host_cuda_version" "$host_injection_driver_version_file"; then + echo_green "The host CUDA driver libraries have already been linked!" + link_drivers=0 + else + # There's something there but it is out of date + echo_yellow "Cleaning out outdated symlinks" + rm $host_injection_driver_dir/* + if [ $? -ne 0 ]; then + error="Unable to remove files under '$host_injection_driver_dir'." + fatal_error $error + fi + fi +fi + +drivers_linked=0 +if [ "$link_drivers" -eq 1 ]; then + if ! 
create_directory_structure "${host_injection_driver_dir}" ; then
+        fatal_error "No write permissions to directory ${host_injection_driver_dir}"
+    fi
+    cd ${host_injection_driver_dir}
+    # Need a small temporary space to hold a couple of files
+    temp_dir=$(mktemp -d)
+
+    # Gather libraries on the host (_must_ be host ldconfig)
+    $host_ldconfig -p | awk '{print $NF}' > "$temp_dir"/libs.txt
+    # Allow for the fact that we may be in a container so the CUDA libs might be in there
+    ls /.singularity.d/libs/* >> "$temp_dir"/libs.txt 2>/dev/null
+
+    # Leverage singularity to find the full list of libraries we should be linking to
+    curl -o "$temp_dir"/nvliblist.conf https://raw.githubusercontent.com/apptainer/apptainer/main/etc/nvliblist.conf
+
+    # Make symlinks to all the interesting libraries
+    grep '.so$' "$temp_dir"/nvliblist.conf | xargs -i grep {} libs.txt | xargs -i ln -s {}
+
+    # Inject CUDA version into dir
+    echo $host_cuda_version > version.txt
+    drivers_linked=1
+
+    # Remove the temporary directory when done
+    rm -r "$temp_dir"
+fi
+
+# Make latest symlink for NVIDIA drivers
+cd $host_injections_nvidia_dir
+symlink="latest"
+if [ -L "$symlink" ]; then
+    # Unless the drivers have been installed, leave the symlink alone
+    if [ "$drivers_linked" -eq 1 ]; then
+        ln -sf host latest
+    fi
+else
+    # No link exists yet
+    ln -s host latest
+fi
+
+# Make sure the libraries can be found by the EESSI linker
+host_injection_linker_dir=${EESSI_EPREFIX/versions/host_injections}
+if [ -L "$host_injection_linker_dir/lib" ]; then
+    target_path=$(readlink -f "$host_injection_linker_dir/lib")
+    if [ "$target_path" != "$host_injections_nvidia_dir/latest" ]; then
+        cd $host_injection_linker_dir
+        ln -sf $host_injections_nvidia_dir/latest lib
+    fi
+else
+    create_directory_structure $host_injection_linker_dir
+    cd $host_injection_linker_dir
+    ln -s $host_injections_nvidia_dir/latest lib
+fi
+
+echo_green "Host NVIDIA gpu drivers linked successfully for EESSI"
\ No newline at end of file
From 9d6e91de4c0536b3552fdee9d1bd9d631a9a8be7 Mon Sep 17 00:00:00 2001
From: Alan O'Cais
Date: Thu, 30 Nov 2023 14:52:13 +0100
Subject: [PATCH 09/18] Typo

---
 gpu_support/nvidia/link_nvidia_host_libraries.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gpu_support/nvidia/link_nvidia_host_libraries.sh b/gpu_support/nvidia/link_nvidia_host_libraries.sh
index eb713fce1b..eb68e8dd69 100755
--- a/gpu_support/nvidia/link_nvidia_host_libraries.sh
+++ b/gpu_support/nvidia/link_nvidia_host_libraries.sh
@@ -15,7 +15,7 @@ exclude_prefix="/cvmfs"
 found_paths=()
 # Always attempt to use /sbin/ldconfig
 if [ -x "/sbin/$command_name" ]; then
-    found_paths+=("$dir/$command_name")
+    found_paths+=("/sbin/$command_name")
 fi
 IFS=':' read -ra path_dirs <<< "$PATH"
 for dir in "${path_dirs[@]}"; do
From cb126492b53e7c77ce896883dd98c47cbf0cd61d Mon Sep 17 00:00:00 2001
From: Alan O'Cais
Date: Thu, 30 Nov 2023 14:54:46 +0100
Subject: [PATCH 10/18] Wrap the command execution because of the envvars

---
 gpu_support/nvidia/link_nvidia_host_libraries.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gpu_support/nvidia/link_nvidia_host_libraries.sh b/gpu_support/nvidia/link_nvidia_host_libraries.sh
index eb68e8dd69..77796ac9c9 100755
--- a/gpu_support/nvidia/link_nvidia_host_libraries.sh
+++ b/gpu_support/nvidia/link_nvidia_host_libraries.sh
@@ -42,7 +42,7 @@ check_eessi_initialised
 
 # Find the CUDA version of the host CUDA drivers
 # (making sure that this can still work inside prefix environment inside a container)
nvidia_smi_command="LD_LIBRARY_PATH=/.singularity/libs:$LD_LIBRARY_PATH nvidia-smi --query-gpu=driver_version --format=csv,noheader" -if $nvidia_smi_command; then +if $($nvidia_smi_command); then host_cuda_version=$($nvidia_smi_command | tail -n1) else error="Failed to successfully execute\n $nvidia_smi_command\n" From f9268e0fd80f726ba6ddf37727d821d1dc231d9c Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Thu, 30 Nov 2023 14:57:07 +0100 Subject: [PATCH 11/18] Move envvar setting outside of command --- gpu_support/nvidia/link_nvidia_host_libraries.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/gpu_support/nvidia/link_nvidia_host_libraries.sh b/gpu_support/nvidia/link_nvidia_host_libraries.sh index 77796ac9c9..56c0b7a2ae 100755 --- a/gpu_support/nvidia/link_nvidia_host_libraries.sh +++ b/gpu_support/nvidia/link_nvidia_host_libraries.sh @@ -41,8 +41,9 @@ check_eessi_initialised # Find the CUDA version of the host CUDA drivers # (making sure that this can still work inside prefix environment inside a container) -nvidia_smi_command="LD_LIBRARY_PATH=/.singularity/libs:$LD_LIBRARY_PATH nvidia-smi --query-gpu=driver_version --format=csv,noheader" -if $($nvidia_smi_command); then +LD_LIBRARY_PATH=/.singularity/libs:$LD_LIBRARY_PATH +nvidia_smi_command="nvidia-smi --query-gpu=driver_version --format=csv,noheader" +if $nvidia_smi_command; then host_cuda_version=$($nvidia_smi_command | tail -n1) else error="Failed to successfully execute\n $nvidia_smi_command\n" From a06f541445b5f9d5a76d659fc07c03d17003e930 Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Thu, 30 Nov 2023 14:58:38 +0100 Subject: [PATCH 12/18] Forgot to add temp location --- gpu_support/nvidia/link_nvidia_host_libraries.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gpu_support/nvidia/link_nvidia_host_libraries.sh b/gpu_support/nvidia/link_nvidia_host_libraries.sh index 56c0b7a2ae..a15c1e98d4 100755 --- a/gpu_support/nvidia/link_nvidia_host_libraries.sh +++ b/gpu_support/nvidia/link_nvidia_host_libraries.sh @@ -89,7 +89,7 @@ if [ "$link_drivers" -eq 1 ]; then curl -o "$temp_dir"/nvliblist.conf https://raw.githubusercontent.com/apptainer/apptainer/main/etc/nvliblist.conf # Make symlinks to all the interesting libraries - grep '.so$' "$temp_dir"/nvliblist.conf | xargs -i grep {} libs.txt | xargs -i ln -s {} + grep '.so$' "$temp_dir"/nvliblist.conf | xargs -i grep {} "$temp_dir"/libs.txt | xargs -i ln -s {} # Inject CUDA version into dir echo $host_cuda_version > version.txt From 600d4b46880d853e62428f94b7ce8912604b29ba Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Thu, 30 Nov 2023 15:01:12 +0100 Subject: [PATCH 13/18] Wrong location under host_injections --- gpu_support/nvidia/link_nvidia_host_libraries.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gpu_support/nvidia/link_nvidia_host_libraries.sh b/gpu_support/nvidia/link_nvidia_host_libraries.sh index a15c1e98d4..b4b8699628 100755 --- a/gpu_support/nvidia/link_nvidia_host_libraries.sh +++ b/gpu_support/nvidia/link_nvidia_host_libraries.sh @@ -53,7 +53,7 @@ fi # Let's make sure the driver libraries are not already in place link_drivers=1 -host_injections_nvidia_dir="/cvmfs/pilot.eessi-hpc.org/host_injections/${EESSI_CPU_FAMILY}/nvidia" +host_injections_nvidia_dir="/cvmfs/pilot.eessi-hpc.org/host_injections/nvidia/${EESSI_CPU_FAMILY}" host_injection_driver_dir="${host_injections_nvidia_dir}/host" host_injection_driver_version_file="$host_injection_driver_dir/version.txt" if [ -e 
"$host_injection_driver_version_file" ]; then From b2664a3943115c75a85099936111e5ab231ebd2c Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Thu, 30 Nov 2023 15:17:13 +0100 Subject: [PATCH 14/18] Export LD_LIBRARY_PATH in script --- gpu_support/nvidia/link_nvidia_host_libraries.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gpu_support/nvidia/link_nvidia_host_libraries.sh b/gpu_support/nvidia/link_nvidia_host_libraries.sh index b4b8699628..c971e4e65e 100755 --- a/gpu_support/nvidia/link_nvidia_host_libraries.sh +++ b/gpu_support/nvidia/link_nvidia_host_libraries.sh @@ -41,7 +41,7 @@ check_eessi_initialised # Find the CUDA version of the host CUDA drivers # (making sure that this can still work inside prefix environment inside a container) -LD_LIBRARY_PATH=/.singularity/libs:$LD_LIBRARY_PATH +export LD_LIBRARY_PATH=/.singularity.d/libs:$LD_LIBRARY_PATH nvidia_smi_command="nvidia-smi --query-gpu=driver_version --format=csv,noheader" if $nvidia_smi_command; then host_cuda_version=$($nvidia_smi_command | tail -n1) From 9854e79edbc5b7a3450aa1ee1484f8e58ad4575d Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Thu, 30 Nov 2023 15:29:19 +0100 Subject: [PATCH 15/18] Also export CUDA version to the links directory --- gpu_support/nvidia/link_nvidia_host_libraries.sh | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/gpu_support/nvidia/link_nvidia_host_libraries.sh b/gpu_support/nvidia/link_nvidia_host_libraries.sh index c971e4e65e..ac826f9572 100755 --- a/gpu_support/nvidia/link_nvidia_host_libraries.sh +++ b/gpu_support/nvidia/link_nvidia_host_libraries.sh @@ -43,8 +43,9 @@ check_eessi_initialised # (making sure that this can still work inside prefix environment inside a container) export LD_LIBRARY_PATH=/.singularity.d/libs:$LD_LIBRARY_PATH nvidia_smi_command="nvidia-smi --query-gpu=driver_version --format=csv,noheader" -if $nvidia_smi_command; then - host_cuda_version=$($nvidia_smi_command | tail -n1) +if $nvidia_smi_command > /dev/null; then + host_driver_version=$($nvidia_smi_command | tail -n1) + host_cuda_version=$(nvidia-smi -q --display=COMPUTE | grep CUDA | awk 'NF>1{print $NF}') else error="Failed to successfully execute\n $nvidia_smi_command\n" fatal_error $error @@ -57,7 +58,7 @@ host_injections_nvidia_dir="/cvmfs/pilot.eessi-hpc.org/host_injections/nvidia/${ host_injection_driver_dir="${host_injections_nvidia_dir}/host" host_injection_driver_version_file="$host_injection_driver_dir/version.txt" if [ -e "$host_injection_driver_version_file" ]; then - if grep -q "$host_cuda_version" "$host_injection_driver_version_file"; then + if grep -q "$host_driver_version" "$host_injection_driver_version_file"; then echo_green "The host CUDA driver libraries have already been linked!" 
link_drivers=0 else @@ -91,8 +92,9 @@ if [ "$link_drivers" -eq 1 ]; then # Make symlinks to all the interesting libraries grep '.so$' "$temp_dir"/nvliblist.conf | xargs -i grep {} "$temp_dir"/libs.txt | xargs -i ln -s {} - # Inject CUDA version into dir - echo $host_cuda_version > version.txt + # Inject driver and CUDA versions into dir + echo $host_driver_version > version.txt + echo $host_cuda_version > cuda_version.txt drivers_linked=1 # Remove the temporary directory when done From 59fee787c0b8a709c05fc8bba379f2cb5e2abcd7 Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Thu, 30 Nov 2023 15:34:49 +0100 Subject: [PATCH 16/18] Also export CUDA version to the links directory --- gpu_support/nvidia/link_nvidia_host_libraries.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/gpu_support/nvidia/link_nvidia_host_libraries.sh b/gpu_support/nvidia/link_nvidia_host_libraries.sh index ac826f9572..e71697c8d7 100755 --- a/gpu_support/nvidia/link_nvidia_host_libraries.sh +++ b/gpu_support/nvidia/link_nvidia_host_libraries.sh @@ -45,6 +45,7 @@ export LD_LIBRARY_PATH=/.singularity.d/libs:$LD_LIBRARY_PATH nvidia_smi_command="nvidia-smi --query-gpu=driver_version --format=csv,noheader" if $nvidia_smi_command > /dev/null; then host_driver_version=$($nvidia_smi_command | tail -n1) + # If the first worked, this should work too host_cuda_version=$(nvidia-smi -q --display=COMPUTE | grep CUDA | awk 'NF>1{print $NF}') else error="Failed to successfully execute\n $nvidia_smi_command\n" @@ -56,7 +57,7 @@ link_drivers=1 host_injections_nvidia_dir="/cvmfs/pilot.eessi-hpc.org/host_injections/nvidia/${EESSI_CPU_FAMILY}" host_injection_driver_dir="${host_injections_nvidia_dir}/host" -host_injection_driver_version_file="$host_injection_driver_dir/version.txt" +host_injection_driver_version_file="$host_injection_driver_dir/driver_version.txt" if [ -e "$host_injection_driver_version_file" ]; then if grep -q "$host_driver_version" "$host_injection_driver_version_file"; then echo_green "The host CUDA driver libraries have already been linked!" 
@@ -93,7 +94,7 @@ if [ "$link_drivers" -eq 1 ]; then grep '.so$' "$temp_dir"/nvliblist.conf | xargs -i grep {} "$temp_dir"/libs.txt | xargs -i ln -s {} # Inject driver and CUDA versions into dir - echo $host_driver_version > version.txt + echo $host_driver_version > driver_version.txt echo $host_cuda_version > cuda_version.txt drivers_linked=1 From 3d0c3dd3eb6fd0e9a80652b477379325a040cdba Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Thu, 30 Nov 2023 15:46:46 +0100 Subject: [PATCH 17/18] Add a comment to explain the download --- gpu_support/nvidia/link_nvidia_host_libraries.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/gpu_support/nvidia/link_nvidia_host_libraries.sh b/gpu_support/nvidia/link_nvidia_host_libraries.sh index e71697c8d7..d714c0ec8b 100755 --- a/gpu_support/nvidia/link_nvidia_host_libraries.sh +++ b/gpu_support/nvidia/link_nvidia_host_libraries.sh @@ -88,6 +88,7 @@ if [ "$link_drivers" -eq 1 ]; then ls /.singularity.d/libs/* >> "$temp_dir"/libs.txt 2>/dev/null # Leverage singularity to find the full list of libraries we should be linking to + echo_yellow "Downloading latest version of nvliblist.conf from Apptainer" curl -o "$temp_dir"/nvliblist.conf https://raw.githubusercontent.com/apptainer/apptainer/main/etc/nvliblist.conf # Make symlinks to all the interesting libraries @@ -129,4 +130,4 @@ else ln -s $host_injections_nvidia_dir/latest lib fi -echo_green "Host NVIDIA gpu drivers linked successfully for EESSI" \ No newline at end of file +echo_green "Host NVIDIA gpu drivers linked successfully for EESSI" From 480d35643c7740c48f5508303e1cccadf04aae76 Mon Sep 17 00:00:00 2001 From: Alan O'Cais Date: Fri, 1 Dec 2023 15:07:34 +0100 Subject: [PATCH 18/18] Address review comments --- .../nvidia/install_cuda_host_injections.sh | 18 +++++++----------- .../nvidia/link_nvidia_host_libraries.sh | 13 ++++++++----- 2 files changed, 15 insertions(+), 16 deletions(-) diff --git a/gpu_support/nvidia/install_cuda_host_injections.sh b/gpu_support/nvidia/install_cuda_host_injections.sh index 79af986a0a..f02f0da02e 100755 --- a/gpu_support/nvidia/install_cuda_host_injections.sh +++ b/gpu_support/nvidia/install_cuda_host_injections.sh @@ -74,6 +74,9 @@ while [[ $# -gt 0 ]]; do esac done +# Make sure EESSI is initialised +check_eessi_initialised + # Make sure the CUDA version supplied is a semantic version is_semantic_version() { local version=$1 @@ -89,7 +92,7 @@ if ! is_semantic_version "$install_cuda_version"; then show_help error="\nYou must provide a semantic version for CUDA (e.g., 12.1.1) via the appropriate\n" error="${error}command line option. 
This script is intended for use with EESSI so the 'correct'\n" - error="${error}version to provide is probably the one that is available under\n" + error="${error}version to provide is probably one of those available under\n" error="${error}$EESSI_SOFTWARE_PATH/software/CUDA\n" fatal_error "${error}" fi @@ -101,16 +104,9 @@ if [ "$eula_accepted" -ne 1 ]; then fatal_error "${error}" fi -# Make sure EESSI is initialised -check_eessi_initialised - -if [[ -z "${EESSI_SOFTWARE_PATH}" ]]; then - fatal_error "This script cannot be used without having first defined EESSI_SOFTWARE_PATH" -else - # As an installation location just use $EESSI_SOFTWARE_PATH but replacing `versions` with `host_injections` - # (CUDA is a binary installation so no need to worry too much about the EasyBuild setup) - cuda_install_parent=${EESSI_SOFTWARE_PATH/versions/host_injections} -fi +# As an installation location just use $EESSI_SOFTWARE_PATH but replacing `versions` with `host_injections` +# (CUDA is a binary installation so no need to worry too much about the EasyBuild setup) +cuda_install_parent=${EESSI_SOFTWARE_PATH/versions/host_injections} # Only install CUDA if specified version is not found. # (existence of easybuild subdir implies a successful install) diff --git a/gpu_support/nvidia/link_nvidia_host_libraries.sh b/gpu_support/nvidia/link_nvidia_host_libraries.sh index d714c0ec8b..26760f0b82 100755 --- a/gpu_support/nvidia/link_nvidia_host_libraries.sh +++ b/gpu_support/nvidia/link_nvidia_host_libraries.sh @@ -19,11 +19,14 @@ if [ -x "/sbin/$command_name" ]; then fi IFS=':' read -ra path_dirs <<< "$PATH" for dir in "${path_dirs[@]}"; do - if [[ ! "$dir" =~ ^$exclude_prefix ]]; then - if [ -x "$dir/$command_name" ]; then - found_paths+=("$dir/$command_name") - fi - fi + if [ "$dir" = "/sbin" ]; then + continue # we've already checked for $command_name in /sbin, don't need to do it twice + fi + if [[ ! "$dir" =~ ^$exclude_prefix ]]; then + if [ -x "$dir/$command_name" ]; then + found_paths+=("$dir/$command_name") + fi + fi done if [ ${#found_paths[@]} -gt 0 ]; then