Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Build CUDA under host_injections and make EESSI aware of host CUDA drivers #368

Merged
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 60 additions & 4 deletions eessi_container.sh
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ SAVE_ERROR_EXITCODE=$((${ANY_ERROR_EXITCODE} << 8))
HTTP_PROXY_ERROR_EXITCODE=$((${ANY_ERROR_EXITCODE} << 9))
HTTPS_PROXY_ERROR_EXITCODE=$((${ANY_ERROR_EXITCODE} << 10))
RUN_SCRIPT_MISSING_EXITCODE=$((${ANY_ERROR_EXITCODE} << 11))
NVIDIA_MODE_UNKNOWN_EXITCODE=$((${ANY_ERROR_EXITCODE} << 12))

# CernVM-FS settings
CVMFS_VAR_LIB="var-lib-cvmfs"
Expand All @@ -72,12 +73,17 @@ display_help() {
echo " -a | --access {ro,rw} - ro (read-only), rw (read & write) [default: ro]"
echo " -c | --container IMG - image file or URL defining the container to use"
echo " [default: docker://ghcr.io/eessi/build-node:debian11]"
echo " -h | --help - display this usage information [default: false]"
echo " -g | --storage DIR - directory space on host machine (used for"
echo " temporary data) [default: 1. TMPDIR, 2. /tmp]"
echo " -h | --help - display this usage information [default: false]"
echo " -i | --host-injections - directory to link to for host_injections "
echo " [default: /..storage../opt-eessi]"
echo " -l | --list-repos - list available repository identifiers [default: false]"
echo " -m | --mode MODE - with MODE==shell (launch interactive shell) or"
echo " MODE==run (run a script or command) [default: shell]"
echo " -n | --nvidia MODE - configure the container to work with NVIDIA GPUs,"
echo " MODE==install for a CUDA installation, MODE==run to"
echo " attach a GPU, MODE==all for both [default: false]"
echo " -r | --repository CFG - configuration file or identifier defining the"
echo " repository to use [default: EESSI-pilot via"
echo " default container, see --container]"
Expand Down Expand Up @@ -111,6 +117,8 @@ VERBOSE=0
STORAGE=
LIST_REPOS=0
MODE="shell"
SETUP_NVIDIA=0
ADDITIONAL_SINGULARITY_FLAGS=
REPOSITORY="EESSI-pilot"
RESUME=
SAVE=
Expand Down Expand Up @@ -141,6 +149,10 @@ while [[ $# -gt 0 ]]; do
display_help
exit 0
;;
-i|--host-injections)
USER_HOST_INJECTIONS="$2"
shift 2
;;
-l|--list-repos)
LIST_REPOS=1
shift 1
Expand All @@ -149,6 +161,11 @@ while [[ $# -gt 0 ]]; do
MODE="$2"
shift 2
;;
-n|--nvidia)
SETUP_NVIDIA=1
NVIDIA_MODE="$2"
shift 2
;;
-r|--repository)
REPOSITORY="$2"
shift 2
Expand Down Expand Up @@ -224,6 +241,13 @@ if [[ "${MODE}" != "shell" && "${MODE}" != "run" ]]; then
fatal_error "unknown execution mode '${MODE}'" "${MODE_UNKNOWN_EXITCODE}"
fi

# When --nvidia was given, its MODE must be one of: run, install, all
if [[ ${SETUP_NVIDIA} -eq 1 ]]; then
  case "${NVIDIA_MODE}" in
    run|install|all)
      # recognised mode, nothing to do here
      ;;
    *)
      fatal_error "unknown NVIDIA mode '${NVIDIA_MODE}'" "${NVIDIA_MODE_UNKNOWN_EXITCODE}"
      ;;
  esac
fi

# TODO (arg -r|--repository) check if repository is known
# REPOSITORY_ERROR_EXITCODE
if [[ ! -z "${REPOSITORY}" && "${REPOSITORY}" != "EESSI-pilot" && ! -r ${EESSI_REPOS_CFG_FILE} ]]; then
Expand Down Expand Up @@ -294,6 +318,7 @@ else
echo "Using ${EESSI_HOST_STORAGE} as tmp directory (to resume session add '--resume ${EESSI_HOST_STORAGE}')."
fi


# if ${RESUME} is a file (assume a tgz), unpack it into ${EESSI_HOST_STORAGE}
if [[ ! -z ${RESUME} && -f ${RESUME} ]]; then
tar xf ${RESUME} -C ${EESSI_HOST_STORAGE}
Expand All @@ -310,12 +335,25 @@ fi
# |-overlay-work
# |-home
# |-repos_cfg
# |-opt-eessi (unless otherwise specified for host_injections)

# tmp dir for EESSI
EESSI_TMPDIR=${EESSI_HOST_STORAGE}
mkdir -p ${EESSI_TMPDIR}
[[ ${VERBOSE} -eq 1 ]] && echo "EESSI_TMPDIR=${EESSI_TMPDIR}"

# Set the host_injections directory and make sure it exists.
# If USER_HOST_INJECTIONS is unset we fall back to our default inside the tmp
# storage; a value that is set (even to the empty string) is used as given.
HOST_INJECTIONS="${USER_HOST_INJECTIONS-${EESSI_TMPDIR}/opt-eessi}"
# Quote the path so that directories containing whitespace are handled as one
# argument, and stop early if the directory cannot be created (it is used as a
# bind mount source later on).
mkdir -p "${HOST_INJECTIONS}" || fatal_error "host_injections directory ${HOST_INJECTIONS} is either not a directory or cannot be created"
[[ ${VERBOSE} -eq 1 ]] && echo "HOST_INJECTIONS=${HOST_INJECTIONS}"

# configure Singularity: if SINGULARITY_CACHEDIR is already defined, use that
# a global SINGULARITY_CACHEDIR would ensure that we don't consume
# storage space again and again for the container & also speed-up
Expand Down Expand Up @@ -394,12 +432,30 @@ fi
[[ ${VERBOSE} -eq 1 ]] && echo "SINGULARITY_HOME=${SINGULARITY_HOME}"

# define paths to add to SINGULARITY_BIND (added later when all BIND mounts are defined)
BIND_PATHS="${EESSI_CVMFS_VAR_LIB}:/var/lib/cvmfs,${EESSI_CVMFS_VAR_RUN}:/var/run/cvmfs"
BIND_PATHS="${EESSI_CVMFS_VAR_LIB}:/var/lib/cvmfs,${EESSI_CVMFS_VAR_RUN}:/var/run/cvmfs,${HOST_INJECTIONS}:/opt/eessi"
# provide a '/tmp' inside the container
BIND_PATHS="${BIND_PATHS},${EESSI_TMPDIR}:${TMP_IN_CONTAINER}"

[[ ${VERBOSE} -eq 1 ]] && echo "BIND_PATHS=${BIND_PATHS}"

# Configure anything we need for NVIDIA GPUs and CUDA installation
if [[ ${SETUP_NVIDIA} -eq 1 ]]; then
  if [[ "${NVIDIA_MODE}" == "run" || "${NVIDIA_MODE}" == "all" ]]; then
    # Give singularity the appropriate flag (prepended to any flags already collected)
    ADDITIONAL_SINGULARITY_FLAGS="--nv ${ADDITIONAL_SINGULARITY_FLAGS}"
    [[ ${VERBOSE} -eq 1 ]] && echo "ADDITIONAL_SINGULARITY_FLAGS=${ADDITIONAL_SINGULARITY_FLAGS}"
  fi
  if [[ "${NVIDIA_MODE}" == "install" || "${NVIDIA_MODE}" == "all" ]]; then
    # Add additional bind mounts to allow CUDA to install within a container:
    # /var/log and /usr/local/cuda are backed by directories in the tmp storage
    EESSI_VAR_LOG="${EESSI_TMPDIR}/var-log"
    EESSI_USR_LOCAL_CUDA="${EESSI_TMPDIR}/usr-local-cuda"
    # Paths are quoted so a tmp directory containing whitespace is not split
    mkdir -p "${EESSI_VAR_LOG}"
    mkdir -p "${EESSI_USR_LOCAL_CUDA}"
    BIND_PATHS="${BIND_PATHS},${EESSI_VAR_LOG}:/var/log,${EESSI_USR_LOCAL_CUDA}:/usr/local/cuda"
    [[ ${VERBOSE} -eq 1 ]] && echo "BIND_PATHS=${BIND_PATHS}"
  fi
fi

# set up repository config (always create directory repos_cfg and populate it with info when
# arg -r|--repository is used)
mkdir -p ${EESSI_TMPDIR}/repos_cfg
Expand Down Expand Up @@ -558,8 +614,8 @@ if [ ! -z ${EESSI_SOFTWARE_SUBDIR_OVERRIDE} ]; then
fi

echo "Launching container with command (next line):"
echo "singularity ${RUN_QUIET} ${MODE} ${EESSI_FUSE_MOUNTS[@]} ${CONTAINER} $@"
singularity ${RUN_QUIET} ${MODE} "${EESSI_FUSE_MOUNTS[@]}" ${CONTAINER} "$@"
echo "singularity ${RUN_QUIET} ${MODE} ${ADDITIONAL_SINGULARITY_FLAGS} ${EESSI_FUSE_MOUNTS[@]} ${CONTAINER} $@"
singularity ${RUN_QUIET} ${MODE} ${ADDITIONAL_SINGULARITY_FLAGS} "${EESSI_FUSE_MOUNTS[@]}" ${CONTAINER} "$@"
exit_code=$?

# 6. save tmp if requested (arg -s|--save)
Expand Down
92 changes: 92 additions & 0 deletions gpu_support/nvidia/install_cuda_host_injections.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
#!/usr/bin/env bash
#
# Install a given CUDA version with EasyBuild underneath the host_injections
# counterpart of ${EESSI_SOFTWARE_PATH}.
#
# Usage:
#   install_cuda_host_injections.sh CUDA_VERSION      (e.g. 11.3.1)
#
# Environment variables read:
#   EESSI_SOFTWARE_PATH  - required; the first 'versions' in it is replaced by
#                          'host_injections' to obtain the installation prefix
#   CUDA_TEMP_DIR        - optional; if set, ${CUDA_TEMP_DIR}/temp is created and
#                          used for temporary storage, otherwise 'mktemp -d'
#   EASYBUILD_BUILDPATH  - optional; defaults to a 'build' dir in the temp storage
#   EASYBUILD_SOURCEPATH - optional; defaults to a 'sources' dir in the temp storage
#   EBROOTEASYBUILD      - if empty, the EasyBuild module is loaded first

# Initialise our bash functions
TOPDIR=$(dirname "$(realpath "${BASH_SOURCE[0]}")")
source "$TOPDIR"/../../scripts/utils.sh

# Make sure EESSI is initialised
check_eessi_initialised

if [[ $# -eq 0 ]] ; then
    fatal_error "You must provide the CUDA version as an argument, e.g.:\n $0 11.3.1"
fi
install_cuda_version=$1
if [[ -z "${EESSI_SOFTWARE_PATH}" ]]; then
    fatal_error "This script cannot be used without having first defined EESSI_SOFTWARE_PATH"
else
    # As an installation location just use $EESSI_SOFTWARE_PATH but replacing `versions` with `host_injections`
    # (CUDA is a binary installation so no need to worry too much about the EasyBuild setup)
    cuda_install_parent=${EESSI_SOFTWARE_PATH/versions/host_injections}
fi

# Only install CUDA if specified version is not found.
# This is only relevant for users, the shipped CUDA installation will
# always be in versions instead of host_injections and have symlinks pointing
# to host_injections for everything we're not allowed to ship
# (existence of easybuild subdir implies a successful install)
if [ -d "${cuda_install_parent}"/software/CUDA/"${install_cuda_version}"/easybuild ]; then
    echo_green "CUDA software found! No need to install CUDA again, proceed with testing."
else
    # We need to be able write to the installation space so let's make sure we can
    if ! create_directory_structure "${cuda_install_parent}"/software/CUDA ; then
        fatal_error "No write permissions to directory ${cuda_install_parent}/software/CUDA"
    fi

    # we need a directory we can use for temporary storage
    if [[ -z "${CUDA_TEMP_DIR}" ]]; then
        # without this check a failed mktemp would leave tmpdir empty and the
        # paths derived from it below would point at the filesystem root
        tmpdir=$(mktemp -d) || fatal_error "Could not create a temporary directory"
    else
        tmpdir="${CUDA_TEMP_DIR}"/temp
        if ! mkdir "$tmpdir" ; then
            fatal_error "Could not create directory ${tmpdir}"
        fi
    fi

    required_space_in_tmpdir=50000
    # Let's see if we have sources and build locations defined if not, we use the temporary space
    if [[ -z "${EASYBUILD_BUILDPATH}" ]]; then
        export EASYBUILD_BUILDPATH=${tmpdir}/build
        required_space_in_tmpdir=$((required_space_in_tmpdir + 5000000))
    fi
    if [[ -z "${EASYBUILD_SOURCEPATH}" ]]; then
        export EASYBUILD_SOURCEPATH=${tmpdir}/sources
        required_space_in_tmpdir=$((required_space_in_tmpdir + 5000000))
    fi

    # The install is pretty fat, you need lots of space for download/unpack/install (~3*5GB),
    # need to do a space check before we proceed
    avail_space=$(df --output=avail "${cuda_install_parent}"/ | tail -n 1 | awk '{print $1}')
    if (( avail_space < 5000000 )); then
        fatal_error "Need at least 5GB disk space to install CUDA under ${cuda_install_parent}, exiting now..."
    fi
    avail_space=$(df --output=avail "${tmpdir}"/ | tail -n 1 | awk '{print $1}')
    if (( avail_space < required_space_in_tmpdir )); then
        error="Need at least ${required_space_in_tmpdir} disk space under ${tmpdir}.\n"
        # each sentence ends in a line break so they do not run into each other
        error="${error}Set the environment variable CUDA_TEMP_DIR to a location with adequate space to pass this check.\n"
        error="${error}You can alternatively set EASYBUILD_BUILDPATH and/or EASYBUILD_SOURCEPATH "
        error="${error}to reduce this requirement. Exiting now..."
        fatal_error "${error}"
    fi

    if [[ -z "${EBROOTEASYBUILD}" ]]; then
        echo_yellow "Loading EasyBuild module to do actual install"
        module load EasyBuild
    fi

    # we need the --rebuild option and a (random) dir for the module since we are
    # fixing the broken links of the EESSI-shipped installation
    extra_args="--rebuild --installpath-modules=${tmpdir}"

    # We don't want hooks used in this install, we need a vanilla CUDA installation
    touch "$tmpdir"/none.py
    # shellcheck disable=SC2086  # Intended splitting of extra_args
    eb --prefix="$tmpdir" ${extra_args} --hooks="$tmpdir"/none.py --installpath="${cuda_install_parent}"/ CUDA-"${install_cuda_version}".eb
    ret=$?
    if [ $ret -ne 0 ]; then
        fatal_error "CUDA installation failed, please check EasyBuild logs..."
    else
        echo_green "CUDA installation at ${cuda_install_parent}/software/CUDA/${install_cuda_version} succeeded!"
    fi
    # clean up tmpdir (only reached after a successful install, fatal_error exits
    # before this point); ':?' refuses to run rm -rf with an empty path
    rm -rf -- "${tmpdir:?}"
fi
80 changes: 75 additions & 5 deletions scripts/utils.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ ANY_ERROR_EXITCODE=1
function fatal_error() {
echo_red "ERROR: $1" >&2
if [[ $# -gt 1 ]]; then
exit $2
exit "$2"
else
exit "${ANY_ERROR_EXITCODE}"
fi
Expand All @@ -32,11 +32,81 @@ function check_exit_code {
fi
}

function check_eessi_initialised() {
    # Succeed when EESSI_SOFTWARE_PATH is non-empty; otherwise report the
    # problem through fatal_error.
    [[ -n "${EESSI_SOFTWARE_PATH}" ]] && return 0
    fatal_error "EESSI has not been initialised!"
}

function float_greater_than() {
    # Compare two decimal numbers; succeed only when $1 is strictly greater than $2.
    #
    # Arguments:
    #   $1 - left-hand number (optional sign, digits, optional '.' and more digits)
    #   $2 - right-hand number (same format)
    # Returns:
    #   0 when $1 > $2; ANY_ERROR_EXITCODE (1 if that variable is unset) when it
    #   is not, when the number of arguments is wrong, or when an argument does
    #   not look like a number.
    local error_code="${ANY_ERROR_EXITCODE:-1}"
    local number_re='^[+-]?[0-9]+\.?[0-9]*$'
    local value

    # Make sure we have two arguments
    if [ $# -ne 2 ]; then
        echo_red "float_greater_than requires two (float) numbers" >&2
        return "$error_code"
    fi
    # Make sure the arguments are numbers
    for value in "$1" "$2"; do
        if [[ ! $value =~ $number_re ]]; then
            echo_yellow "Input to float_greater_than is not a float, ignoring"
            return "$error_code"
        fi
    done
    # bash arithmetic is integer-only, so let awk do the actual comparison;
    # its exit status is the answer, no helper variables leak into the caller
    if awk -v lhs="$1" -v rhs="$2" 'BEGIN { exit (lhs > rhs) ? 0 : 1 }'; then
        return 0
    fi
    return "$error_code"
}

function check_in_prefix_shell() {
    # Abort via fatal_error unless the current shell is the bash of the
    # Gentoo Prefix installation.
    #
    # Reads:
    #   EPREFIX - must be non-empty
    #   SHELL   - must be exactly ${EPREFIX}/bin/bash

    # Make sure EPREFIX is defined
    if [[ -z "${EPREFIX}" ]]; then
        fatal_error "This script cannot be used without having first defined EPREFIX"
    fi
    # The right-hand side is quoted on purpose: unquoted, [[ ]] treats it as a
    # glob pattern and an EPREFIX containing characters such as '[', '*' or '?'
    # would not be compared literally.
    if [[ "${SHELL}" != "${EPREFIX}/bin/bash" ]]; then
        fatal_error "Not running in Gentoo Prefix environment, run '${EPREFIX}/startprefix' first!"
    fi
}

function create_directory_structure() {
    # Create a directory (including missing parents) and verify it is writable.
    #
    # Arguments:
    #   $1 - relative or absolute path of the directory to create
    # Outputs:
    #   error messages are written to stderr via echo_red
    # Returns:
    #   0 when the directory exists afterwards and is writable;
    #   the exit status of mkdir when creation failed;
    #   ANY_ERROR_EXITCODE (1 if that variable is unset) when the number of
    #   arguments is wrong or the directory is not writable.
    local error_code="${ANY_ERROR_EXITCODE:-1}"
    local dir_structure error_message real_dir
    local return_code=0

    # Ensure we are given a single path argument
    if [ $# -ne 1 ]; then
        echo_red "Function requires a single (relative or absolute) path argument" >&2
        return "$error_code"
    fi
    dir_structure="$1"

    # Attempt to create the directory structure (the variable is declared
    # separately above so that the status captured here is mkdir's own)
    error_message=$(mkdir -p "$dir_structure" 2>&1) || return_code=$?
    if [ "${return_code}" -ne 0 ]; then
        # If it fails be explicit about the error
        real_dir=$(realpath -m "$dir_structure")
        echo_red "Creating ${dir_structure} (real path ${real_dir}) failed with:\n ${error_message}" >&2
    elif [ ! -w "${dir_structure}" ]; then
        # If we're creating it, our use case is that we want to be able to write there
        # (this is a check in case the directory already existed)
        real_dir=$(realpath -m "$dir_structure")
        echo_red "You do not have (required) write permissions to ${dir_structure} (real path ${real_dir})!" >&2
        return_code="$error_code"
    fi

    return "$return_code"
}

function get_path_for_tool {
tool_name=$1
tool_envvar_name=$2

which_out=$(which ${tool_name} 2>&1)
which_out=$(which "${tool_name}" 2>&1)
exit_code=$?
if [[ ${exit_code} -eq 0 ]]; then
echo "INFO: found tool ${tool_name} in PATH (${which_out})" >&2
Expand Down Expand Up @@ -68,7 +138,7 @@ function get_host_from_url {
url=$1
re="(http|https)://([^/:]+)"
if [[ $url =~ $re ]]; then
echo ${BASH_REMATCH[2]}
echo "${BASH_REMATCH[2]}"
return 0
else
echo ""
Expand All @@ -80,7 +150,7 @@ function get_port_from_url {
url=$1
re="(http|https)://[^:]+:([0-9]+)"
if [[ $url =~ $re ]]; then
echo ${BASH_REMATCH[2]}
echo "${BASH_REMATCH[2]}"
return 0
else
echo ""
Expand All @@ -90,7 +160,7 @@ function get_port_from_url {

function get_ipv4_address {
hname=$1
hipv4=$(grep ${hname} /etc/hosts | grep -v '^[[:space:]]*#' | cut -d ' ' -f 1)
hipv4=$(grep "${hname}" /etc/hosts | grep -v '^[[:space:]]*#' | cut -d ' ' -f 1)
# TODO try other methods if the one above does not work --> tool that verifies
# what method can be used?
echo "${hipv4}"
Expand Down