Skip to content

Commit

Permalink
Merge pull request #151 from slaclab/comp_and_load_drivers-dev
Browse files Browse the repository at this point in the history
overhauling the comp_and_load_drivers.sh script
  • Loading branch information
ruck314 authored Jul 9, 2024
2 parents ff1948d + f7b4fa8 commit 01c044b
Show file tree
Hide file tree
Showing 3 changed files with 37 additions and 65 deletions.
7 changes: 4 additions & 3 deletions data_gpu/driver/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,9 @@
# contained in the LICENSE.txt file.
# ----------------------------------------------------------------------------

# Optional path to NVIDIA drivers, if not specified, use empty string.
# Args to this Makefile
NVIDIA_DRIVERS ?= ""
CC ?= ""

# Define the module name.
NAME := datagpu
Expand Down Expand Up @@ -63,10 +64,10 @@ obj-m := $(NAME).o
# Default target: Display git version and build the module.
all:
@echo $(GITV)
$(MAKE) ARCH=$(ARCH) CROSS_COMPILE=$(CROSS_COMPILE) -C $(KERNELDIR) M=$(HOME) modules
$(MAKE) ARCH=$(ARCH) CROSS_COMPILE=$(CROSS_COMPILE) CC=$(CC) -C $(KERNELDIR) M=$(HOME) modules

# Clean target: Remove built module files and object files.
clean:
$(MAKE) ARCH=$(ARCH) CROSS_COMPILE=$(CROSS_COMPILE) -C $(KERNELDIR) M=$(HOME) clean
$(MAKE) ARCH=$(ARCH) CROSS_COMPILE=$(CROSS_COMPILE) CC=$(CC) -C $(KERNELDIR) M=$(HOME) clean
rm -f $(OBJS)

14 changes: 9 additions & 5 deletions data_gpu/driver/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,14 @@ $ sudo apt-get install nvidia-cuda-toolkit
$ sudo ./comp_and_load_drivers.sh
```

If NVLink support is required, install cuda-drivers-fabricmanager-<driver version> as well. However, as of February 2024, the latest version of fabricmanager is 535.
When it becomes available on Ubuntu, install with:
Disable the X server and nvidia-persistenced so that `rmmod` does not fail with `Module XXX is in use by: YYY`;
this happens because the Nvidia driver gets loaded by default at startup.

```
$ sudo apt-get install cuda-drivers-fabricmanager-545
```
https://forums.developer.nvidia.com/t/cant-install-new-driver-cannot-unload-module/63639

```bash
$ sudo systemctl disable gdm # For GNOME Display Manager
$ sudo systemctl disable lightdm # For LightDM
$ sudo systemctl disable sddm # For SDDM
$ sudo systemctl disable nvidia-persistenced
```
81 changes: 24 additions & 57 deletions data_gpu/driver/comp_and_load_drivers.sh
Original file line number Diff line number Diff line change
@@ -1,48 +1,20 @@
#!/bin/bash

# Provide defaults for CC
[ -z "$CC" ] && CC=gcc

# Function to check if the script is run with sudo
check_sudo() {
if [ "$EUID" -ne 0 ]; then
echo "Error: This script must be run with sudo." >&2
exit 1
fi
}
# Checks that GCC matches what the kernel was built with
check_gcc_version() {
_GCC_VER="$($CC --version | grep -Eo "\s[0-9]+\.[0-9]+\.[0-9]+\s" | awk '{$1=$1};1')"
if ! cat /proc/version | grep -Eoq "gcc version $_GCC_VER"; then
echo "Error: GCC version 'gcc version $_GCC_VER' does not match what the kernel was built with: '$(cat /proc/version | grep -Eo "gcc version [0-9]+\.[0-9]+\.[0-9]+")'"
echo " You can specify an alternative compiler by setting the 'CC' environment variable"
exit 1
fi
}

# Check if the script is run with sudo
check_sudo

# Check that our GCC matches what the kernel was built with
check_gcc_version

# Function to find the latest Nvidia version directory
get_latest_nvidia_path() {
# Navigate to the /usr/src directory
cd /usr/src
if [ "$EUID" -ne 0 ]; then
echo "Error: This script must be run with sudo." >&2
exit 1
fi

# List and sort NVIDIA directories, then get the last one (the latest)
latest_nvidia_path=$(ls -d nvidia-* | sort -V | tail -n 1)
# Get the gcc binary name that the kernel was built with (parsed from /proc/version)
version_content=$(cat /proc/version)
CC=$(echo "$version_content" | grep -oP 'x86_64-linux-gnu-gcc-\d+')
echo "CC: $CC"

# Check if no NVIDIA directory was found
if [ -z "$latest_nvidia_path" ]; then
echo "Error: No NVIDIA directory found in /usr/src" >&2
exit 1
else
# Print the full path of the latest NVIDIA directory
echo "/usr/src/$latest_nvidia_path"
fi
}
# Define Nvidia path
output=$(find /usr -name nv-p2p.h 2>/dev/null)
NVIDIA_PATH=$(echo "$output" | grep -oP '^/usr/src/nvidia-\d+\.\d+\.\d+' | head -n 1)
echo "Using Nvidia path: $NVIDIA_PATH"

# Return directory
RET_DIR=$PWD
Expand All @@ -55,27 +27,22 @@ echo "Using RET_DIR: $RET_DIR"
/usr/sbin/rmmod nvidia-modeset 2>/dev/null
/usr/sbin/rmmod nvidia 2>/dev/null

# Define Nvidia path
NVIDIA_PATH=$(get_latest_nvidia_path)
echo "Using Nvidia path: $NVIDIA_PATH"

cd $NVIDIA_PATH

make
# Go to nvidia path and build Nvidia driver
cd "$NVIDIA_PATH" || { echo "Error: Failed to change directory to $NVIDIA_PATH"; exit 1; }
make CC=$CC

if modinfo ecc >/dev/null 2>&1; then
modprobe ecc || { echo "Error: Failed to insert ecc module."; exit 1; }
fi

/usr/sbin/insmod nvidia.ko NVreg_OpenRmEnableUnsupportedGpus=1 NVreg_EnableStreamMemOPs=1 || { echo "Error: Failed to insert nvidia.ko."; exit 1; }

/usr/sbin/insmod nvidia-modeset.ko || { echo "Error: Failed to insert nvidia-modeset.ko."; exit 1; }

/usr/sbin/insmod nvidia-uvm.ko || { echo "Error: Failed to insert nvidia-uvm.ko."; exit 1; }

/usr/sbin/insmod nvidia-drm.ko modeset=1 || { echo "Error: Failed to insert nvidia-drm.ko."; exit 1; }
# Load the nvidia kernel drivers
/usr/sbin/insmod $NVIDIA_PATH/nvidia.ko NVreg_OpenRmEnableUnsupportedGpus=1 NVreg_EnableStreamMemOPs=1 || { echo "Error: Failed to insert nvidia.ko."; exit 1; }
/usr/sbin/insmod $NVIDIA_PATH/nvidia-modeset.ko || { echo "Error: Failed to insert nvidia-modeset.ko."; exit 1; }
/usr/sbin/insmod $NVIDIA_PATH/nvidia-drm.ko modeset=1 || { echo "Error: Failed to insert nvidia-drm.ko."; exit 1; }
/usr/sbin/insmod $NVIDIA_PATH/nvidia-uvm.ko || { echo "Error: Failed to insert nvidia-uvm.ko."; exit 1; }

cd $RET_DIR
# Return to the original directory, then build and load the datagpu driver
cd "$RET_DIR" || { echo "Error: Failed to change directory to $RET_DIR"; exit 1; }
make CC=$CC NVIDIA_DRIVERS=$NVIDIA_PATH
/usr/sbin/insmod $RET_DIR/datagpu.ko || { echo "Error: Failed to insert datagpu.ko."; exit 1; }

make NVIDIA_DRIVERS=$NVIDIA_PATH
/usr/sbin/insmod datagpu.ko || { echo "Error: Failed to insert datagpu.ko."; exit 1; }

0 comments on commit 01c044b

Please sign in to comment.