Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

{2023.06}[foss/2023a] cuDNN v8.9.2.26 #353

Merged
merged 10 commits into from
May 17, 2024
5 changes: 3 additions & 2 deletions EESSI-install-software.sh
Original file line number Diff line number Diff line change
Expand Up @@ -199,14 +199,15 @@ pr_diff=$(ls [0-9]*.diff | head -1)
# for now, this just reinstalls all scripts. Note the most elegant, but works
${TOPDIR}/install_scripts.sh --prefix ${EESSI_PREFIX}

# Install full CUDA SDK in host_injections
# Install full CUDA SDK and cu* libraries in host_injections
# Hardcode this for now, see if it works
# TODO: We should make a nice yaml and loop over all CUDA versions in that yaml to figure out what to install
# Allow skipping CUDA SDK install in e.g. CI environments
if [ -z "${skip_cuda_install}" ] || [ ! "${skip_cuda_install}" ]; then
${EESSI_PREFIX}/scripts/gpu_support/nvidia/install_cuda_host_injections.sh -c 12.1.1 --accept-cuda-eula
${EESSI_PREFIX}/scripts/gpu_support/nvidia/install_cuDNN_host_injections.sh -c 12.1.1 -d 8.9.2.26
else
echo "Skipping installation of CUDA SDK in host_injections, since the --skip-cuda-install flag was passed"
echo "Skipping installation of CUDA SDK and cu* libraries in host_injections, since the --skip-cuda-install flag was passed"
fi

# Install drivers in host_injections
Expand Down
29 changes: 27 additions & 2 deletions create_lmodsitepackage.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,13 +174,38 @@
end
end

local function eessi_cudnn_enabled_load_hook(t)
local frameStk = require("FrameStk"):singleton()
local mt = frameStk:mt()
local simpleName = string.match(t.modFullName, "(.-)/")
-- If we try to load cuDNN itself, check if the full cuDNN package was installed on the host in host_injections.
-- This is required for end users to build additional cuDNN dependent software. If the full SDK isn't present, refuse
-- to load the cuDNN module and print an informative message on how to set up GPU support for NESSI
local refer_to_docs = "For more information on how to do this, see https://www.eessi.io/docs/gpu/.\\n"
if simpleName == 'cuDNN' then
-- get the full host_injections path
local hostInjections = string.gsub(os.getenv('EESSI_SOFTWARE_PATH') or "", 'versions', 'host_injections')
-- build final path where the cuDNN software should be installed
local cudnnEasyBuildDir = hostInjections .. "/software/" .. t.modFullName .. "/easybuild"
local cudnnDirExists = isDir(cudnnEasyBuildDir)
if not cudnnDirExists then
local advice = "but while the module file exists, the actual software is not entirely shipped with NESSI "
advice = advice .. "due to licencing. You will need to install a full copy of the cuDNN package where NESSI "
advice = advice .. "can find it.\\n"
advice = advice .. refer_to_docs
LmodError("\\nYou requested to load ", simpleName, " ", advice)
end
end
end

-- Combine both functions into a single one, as we can only register one function as load hook in lmod
-- Also: make it non-local, so it can be imported and extended by other lmodrc files if needed
function eessi_load_hook(t)
-- Only apply CUDA hooks if the loaded module is in the NESSI prefix
-- This avoids getting an Lmod Error when trying to load a CUDA module from a local software stack
-- Only apply CUDA and cuDNN hooks if the loaded module is in the NESSI prefix
-- This avoids getting an Lmod Error when trying to load a CUDA or cuDNN module from a local software stack
if from_eessi_prefix(t) then
eessi_cuda_enabled_load_hook(t)
eessi_cudnn_enabled_load_hook(t)
end
end

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,3 +34,4 @@ easyconfigs:
# see https://github.com/easybuilders/easybuild-easyconfigs/pull/19451;
options:
from-pr: 19451
- cuDNN-8.9.2.26-CUDA-12.1.1.eb
76 changes: 76 additions & 0 deletions eb_hooks.py
Original file line number Diff line number Diff line change
Expand Up @@ -688,6 +688,62 @@ def post_sanitycheck_cuda(self, *args, **kwargs):
raise EasyBuildError("CUDA-specific hook triggered for non-CUDA easyconfig?!")


def post_sanitycheck_cuDNN(self, *args, **kwargs):
"""
Remove files from cuDNN installation that we are not allowed to ship,
and replace them with a symlink to a corresponding installation under host_injections.
"""
if self.name == 'cuDNN':
print_msg("Replacing files in cuDNN installation that we can not ship with symlinks to host_injections...")

allowlist = ['LICENSE']

# read cuDNN LICENSE, construct allowlist based on section 2.6 that specifies list of files that can be shipped
license_path = os.path.join(self.installdir, 'LICENSE')
search_string = "2. Distribution. The following portions of the SDK are distributable under the Agreement:"
with open(license_path) as infile:
for line in infile:
if line.strip().startswith(search_string):
# remove search string, split into words, remove trailing
# dots '.' and only retain words starting with a dot '.'
distributable = line[len(search_string):]
for word in distributable.split():
if word[0] == '.':
allowlist.append(word.rstrip('.'))

allowlist = sorted(set(allowlist))
self.log.info("Allowlist for files in cuDNN installation that can be redistributed: " + ', '.join(allowlist))

# iterate over all files in the CUDA installation directory
for dir_path, _, files in os.walk(self.installdir):
for filename in files:
full_path = os.path.join(dir_path, filename)
# we only really care about real files, i.e. not symlinks
if not os.path.islink(full_path):
# check if the current file is part of the allowlist
basename = filename.split('.')[0]
if '.' in filename:
extension = '.' + filename.split('.')[1]
if basename in allowlist:
self.log.debug("%s is found in allowlist, so keeping it: %s", basename, full_path)
elif '.' in filename and extension in allowlist:
self.log.debug("%s is found in allowlist, so keeping it: %s", extension, full_path)
else:
self.log.debug("%s is not found in allowlist, so replacing it with symlink: %s",
filename, full_path)
# if it is not in the allowlist, delete the file and create a symlink to host_injections
host_inj_path = full_path.replace('versions', 'host_injections')
# make sure source and target of symlink are not the same
if full_path == host_inj_path:
raise EasyBuildError("Source (%s) and target (%s) are the same location, are you sure you "
"are using this hook for a NESSI installation?",
full_path, host_inj_path)
remove_file(full_path)
symlink(host_inj_path, full_path)
else:
raise EasyBuildError("cuDNN-specific hook triggered for non-cuDNN easyconfig?!")


def inject_gpu_property(ec):
"""
Add 'gpu' property, via modluafooter easyconfig parameter
Expand All @@ -712,6 +768,25 @@ def inject_gpu_property(ec):
ec[key] = '\n'.join([ec_dict[key], value])
else:
ec[key] = value
# Check if cuDNN is in the dependencies, if so add the 'gpu' Lmod property
if ('cuDNN' in [dep[0] for dep in iter(ec_dict['dependencies'])]):
ec.log.info("Injecting gpu as Lmod arch property and envvar with cuDNN version")
key = 'modluafooter'
value = 'add_property("arch","gpu")'
cudnn_version = 0
for dep in iter(ec_dict['dependencies']):
# Make cuDNN a build dependency only (rpathing saves us from link errors)
if 'cuDNN' in dep[0]:
cudnn_version = dep[1]
ec_dict['dependencies'].remove(dep)
if dep not in ec_dict['builddependencies']:
ec_dict['builddependencies'].append(dep)
value = '\n'.join([value, 'setenv("EESSICUDNNVERSION","%s")' % cudnn_version])
if key in ec_dict:
if not value in ec_dict[key]:
ec[key] = '\n'.join([ec_dict[key], value])
else:
ec[key] = value
return ec


Expand Down Expand Up @@ -768,4 +843,5 @@ def inject_gpu_property(ec):

POST_SANITYCHECK_HOOKS = {
'CUDA': post_sanitycheck_cuda,
'cuDNN': post_sanitycheck_cuDNN,
}
2 changes: 1 addition & 1 deletion install_scripts.sh
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ copy_files_by_list ${TOPDIR}/scripts ${INSTALL_PREFIX}/scripts "${script_files[@

# Copy files for the scripts/gpu_support/nvidia directory
nvidia_files=(
install_cuda_host_injections.sh link_nvidia_host_libraries.sh
install_cuda_host_injections.sh install_cuDNN_host_injections.sh link_nvidia_host_libraries.sh
)
copy_files_by_list ${TOPDIR}/scripts/gpu_support/nvidia ${INSTALL_PREFIX}/scripts/gpu_support/nvidia "${nvidia_files[@]}"

Expand Down
210 changes: 210 additions & 0 deletions scripts/gpu_support/nvidia/install_cuDNN_host_injections.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,210 @@
#!/usr/bin/env bash

# This script can be used to install cuDNN under the `.../host_injections` directory.
# This provides the parts of the cuDNN installation that cannot be redistributed as
# part of NESSI due to license limitations. While GPU-based software from NESSI will
# _run_ without these, installation of additional software that requires the cuDNN
# installation(s) under `host_injections` to be present.
#
# The `host_injections` directory is a variant symlink that by default points to
# `/opt/eessi`, unless otherwise defined in the local CVMFS configuration (see
# https://cvmfs.readthedocs.io/en/stable/cpt-repo.html#variant-symlinks). For the
# installation to be successful, this directory needs to be writeable by the user
# executing this script.

# Initialise our bash functions
TOPDIR=$(dirname $(realpath $BASH_SOURCE))
source "$TOPDIR"/../../utils.sh

# Function to display help message
show_help() {
echo "Usage: $0 [OPTIONS]"
echo "Options:"
echo " --help Display this help message"
echo " -c, --cuda-version CUDA_VERSION Specify a version of CUDA to be used"
echo " when installing cuDNN (must"
echo " have a corresponding easyconfig in the"
echo " EasyBuild release)"
echo " -d, --cudnn-version CUDNN_VERSION Specify a version of cuDNN to install (must"
echo " have a corresponding easyconfig in the"
echo " EasyBuild release)"
echo " -t, --temp-dir /path/to/tmpdir Specify a location to use for temporary"
echo " storage during the cuDNN install"
echo " (must have >10GB available)"
}

# Initialize variables
cuda_version=""
cudnn_version=""

# Parse command-line options
while [[ $# -gt 0 ]]; do
case "$1" in
--help)
show_help
exit 0
;;
-c|--cuda-version)
if [ -n "$2" ]; then
cuda_version="$2"
shift 2
else
echo "Error: Argument required for $1"
show_help
exit 1
fi
;;
-d|--cudnn-version)
if [ -n "$2" ]; then
cudnn_version="$2"
shift 2
else
echo "Error: Argument required for $1"
show_help
exit 1
fi
;;
-t|--temp-dir)
if [ -n "$2" ]; then
CUDA_TEMP_DIR="$2"
shift 2
else
echo "Error: Argument required for $1"
show_help
exit 1
fi
;;
*)
show_help
fatal_error "Error: Unknown option: $1"
;;
esac
done

# Make sure NESSI is initialised
check_eessi_initialised

# Make sure the CUDA version supplied is a semantic version
is_semantic_version() {
local version=$1
local regex='^[0-9]+\.[0-9]+\.[0-9]+$'

if [[ $version =~ $regex ]]; then
return 0 # Return success (0) if it's a semantic version
else
return 1 # Return failure (1) if it's not a semantic version
fi
}
if ! is_semantic_version "$cuda_version"; then
show_help
error="\nYou must provide a semantic version for CUDA (e.g., 12.1.1) via the appropriate\n"
error="${error}command line option. This script is intended for use with NESSI so the 'correct'\n"
error="${error}version to provide is probably one of those available under\n"
error="${error}$EESSI_SOFTWARE_PATH/software/cuDNN\n"
fatal_error "${error}"
fi

# As an installation location just use $EESSI_SOFTWARE_PATH but replacing `versions` with `host_injections`
cudnn_install_parent=${EESSI_SOFTWARE_PATH/versions/host_injections}

# Only install cuDNN if specified version is not found.
# (existence of easybuild subdir implies a successful install)
if [ -d "${cudnn_install_parent}"/software/cuDNN/*-CUDA-"${cuda_version}"/easybuild ]; then
echo_green "cuDNN software found! No need to install cuDNN again."
else
# We need to be able write to the installation space so let's make sure we can
if ! create_directory_structure "${cudnn_install_parent}"/software/cuDNN ; then
fatal_error "No write permissions to directory ${cudnn_install_parent}/software/cuDNN"
fi

# we need a directory we can use for temporary storage
if [[ -z "${CUDA_TEMP_DIR}" ]]; then
tmpdir=$(mktemp -d)
else
tmpdir="${CUDA_TEMP_DIR}"/temp
if ! mkdir "$tmpdir" ; then
fatal_error "Could not create directory ${tmpdir}"
fi
fi

required_space_in_tmpdir=50000
# Let's see if we have sources and build locations defined if not, we use the temporary space
if [[ -z "${EASYBUILD_BUILDPATH}" ]]; then
export EASYBUILD_BUILDPATH=${tmpdir}/build
required_space_in_tmpdir=$((required_space_in_tmpdir + 5000000))
fi
if [[ -z "${EASYBUILD_SOURCEPATH}" ]]; then
export EASYBUILD_SOURCEPATH=${tmpdir}/sources
required_space_in_tmpdir=$((required_space_in_tmpdir + 5000000))
fi

# The install is pretty fat, you need lots of space for download/unpack/install (~3*5GB),
# need to do a space check before we proceed
avail_space=$(df --output=avail "${cudnn_install_parent}"/ | tail -n 1 | awk '{print $1}')
if (( avail_space < 5000000 )); then
fatal_error "Need at least 5GB disk space to install cuDNN under ${cudnn_install_parent}, exiting now..."
fi
avail_space=$(df --output=avail "${tmpdir}"/ | tail -n 1 | awk '{print $1}')
if (( avail_space < required_space_in_tmpdir )); then
error="Need at least ${required_space_in_tmpdir} disk space under ${tmpdir}.\n"
error="${error}Set the environment variable CUDA_TEMP_DIR to a location with adequate space to pass this check."
error="${error}You can alternatively set EASYBUILD_BUILDPATH and/or EASYBUILD_SOURCEPATH "
error="${error}to reduce this requirement. Exiting now..."
fatal_error "${error}"
fi

if ! command -v "eb" &>/dev/null; then
echo_yellow "Attempting to load an EasyBuild module to do actual install"
module load EasyBuild
# There are some scenarios where this may fail
if [ $? -ne 0 ]; then
error="'eb' command not found in your environment and\n"
error="${error} module load EasyBuild\n"
error="${error}failed for some reason.\n"
error="${error}Please re-run this script with the 'eb' command available."
fatal_error "${error}"
fi
fi

cudnn_easyconfig="cuDNN-${cudnn_version}-CUDA-${cuda_version}.eb"

# Check the easyconfig file is available in the release
# (eb search always returns 0, so we need a grep to ensure a usable exit code)
eb --search ^${cudnn_easyconfig}|grep cuDNN > /dev/null 2>&1
# Check the exit code
if [ $? -ne 0 ]; then
eb_version=$(eb --version)
available_cudnn_easyconfigs=$(eb --search ^cuDNN-*.eb|grep cuDNN)

error="The easyconfig ${cudnn_easyconfig} was not found in EasyBuild version:\n"
error="${error} ${eb_version}\n"
error="${error}You either need to give a different version of CUDA to install _or_ \n"
error="${error}use a different version of EasyBuild for the installation.\n"
error="${error}\nThe versions of available with the current eb command are:\n"
error="${error}${available_cudnn_easyconfigs}"
fatal_error "${error}"
fi

# We need the --rebuild option, as the cuDNN module may or may not be on the
# `MODULEPATH` yet. Even if it is, we still want to redo this installation
# since it will provide the symlinked targets for the parts of the cuDNN
# installation in the `.../versions/...` prefix
# We install the module in our `tmpdir` since we do not need the modulefile,
# we only care about providing the targets for the symlinks.
extra_args="--rebuild --installpath-modules=${tmpdir}"

# We don't want hooks used in this install, we need a vanilla cuDNN installation
touch "$tmpdir"/none.py
# shellcheck disable=SC2086 # Intended splitting of extra_args
eb --prefix="$tmpdir" ${extra_args} --hooks="$tmpdir"/none.py --installpath="${cudnn_install_parent}"/ "${cudnn_easyconfig}"
ret=$?
if [ $ret -ne 0 ]; then
eb_last_log=$(unset EB_VERBOSE; eb --last-log)
cp -a ${eb_last_log} .
fatal_error "cuDNN installation failed, please check EasyBuild logs $(basename ${eb_last_log})..."
else
echo_green "cuDNN installation at ${cudnn_install_parent}/software/cuDNN/${cudnn_version}-CUDA-${cuda_version} succeeded!"
fi
# clean up tmpdir
rm -rf "${tmpdir}"
fi
Loading