diff --git a/data_gpu/driver/README.md b/data_gpu/driver/README.md index 002715e..33a5655 100644 --- a/data_gpu/driver/README.md +++ b/data_gpu/driver/README.md @@ -1,18 +1,14 @@ # GPU Enabled Driver -To build this driver you need to have the NVIDA Open GPU Kernel Modules installed. This driver will not compile gainst the CUDA toolkit drivers. +To build this driver you need to have the NVIDIA Open GPU Kernel Modules installed. This driver will not compile against the CUDA toolkit drivers. https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html See section 5. -A script in this directory 'comp_and_load_drivers.sh' is provided to compile and load the nvidia drivers as well as the driver in this directory. Edit this file and update the NVIDIA_PATH value at the top to the install directory for the nvidia drivers. + -```bash -$ sudo apt-get install nvidia-kernel-source-545-open -$ sudo apt-get install nvidia-cuda-toolkit -$ sudo ./comp_and_load_drivers.sh -``` +# System Configurations Disable the Xserver and nvidia-persistenced to prevent rmmod due to Module XXX is in use by: YYY because the Nvidia driver gets loaded by default at startup @@ -25,3 +21,37 @@ $ sudo systemctl disable lightdm # For LightDM $ sudo systemctl disable sddm # For SDDM $ sudo systemctl disable nvidia-persistenced ``` + +Install the nvidia 555 drivers and the cuda 12.3 toolkit: + +```bash +$ wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.0-1_all.deb +$ sudo dpkg -i cuda-keyring_1.0-1_all.deb +$ sudo /usr/sbin/rmmod datagpu; sudo /usr/sbin/rmmod nvidia-drm; sudo /usr/sbin/rmmod nvidia-uvm; sudo /usr/sbin/rmmod nvidia-modeset; sudo /usr/sbin/rmmod nvidia; sudo /usr/sbin/rmmod nouveau +$ sudo apt update +$ sudo apt-get purge nvidia-* -y +$ sudo apt autoremove -y +$ sudo apt install cuda-toolkit-12-3 nvidia-kernel-source-555-open libnvidia-compute-555 nvidia-firmware-555-555.42.06 -y +$ sudo reboot +``` + +Next, add `iommu=off 
nouveau.modeset=0 rd.driver.blacklist=nouveau` to GRUB_CMDLINE_LINUX: + +```bash +$ sudo nano /etc/default/grub + GRUB_CMDLINE_LINUX="iommu=off nouveau.modeset=0 rd.driver.blacklist=nouveau" +$ sudo update-grub +$ sudo reboot +``` + + + +# How to build and load the nvidia and datagpu drivers + +After you have completed all the "System Configurations" steps above, run the following script to build and load the nvidia and datagpu drivers: + +```bash +$ sudo ./comp_and_load_drivers.sh +``` + + diff --git a/data_gpu/driver/comp_and_load_drivers.sh b/data_gpu/driver/comp_and_load_drivers.sh index 7a38c13..b5dbe91 100755 --- a/data_gpu/driver/comp_and_load_drivers.sh +++ b/data_gpu/driver/comp_and_load_drivers.sh @@ -1,4 +1,5 @@ #!/bin/bash +# vi: et sw=4 ts=4 # Check if the script is run with sudo if [ "$EUID" -ne 0 ]; then @@ -6,14 +7,35 @@ if [ "$EUID" -ne 0 ]; then exit 1 fi -# Get the gcc that kernel was built with -version_content=$(cat /proc/version) -CC=$(echo "$version_content" | grep -oP 'x86_64-linux-gnu-gcc-\d+') +# Determine the Linux distribution +if [ -f /etc/os-release ]; then + . /etc/os-release + distro=$ID +else + echo "Error: Cannot determine the Linux distribution." >&2 + exit 1 +fi + +# Set the CC variable based on the distribution +if [ "$distro" == "ubuntu" ]; then + # Get the gcc that kernel was built with + version_content=$(cat /proc/version) + CC=$(echo "$version_content" | grep -oP 'x86_64-linux-gnu-gcc-\d+') +elif [ "$distro" == "rhel" ] || [ "$distro" == "rocky" ]; then + CC="x86_64-redhat-linux-gcc" +else + echo "Error: Unsupported Linux distribution." >&2 + exit 1 +fi echo "CC: $CC" # Define Nvidia path output=$(find /usr -name nv-p2p.h 2>/dev/null) -NVIDIA_PATH=$(echo "$output" | grep -oP '^/usr/src/nvidia-\d+\.\d+\.\d+' | head -n 1) +NVIDIA_PATH=$(echo "$output" | grep -oP '^/usr/src/nvidia(-open)?-\d+\.\d+\.\d+' | head -n 1) +if [ -z "$NVIDIA_PATH" ]; then + echo "Could not find NVIDIA open drivers in /usr/src. Is it installed?" 
+ exit 1 +fi echo "Using Nvidia path: $NVIDIA_PATH" # Return directory @@ -21,15 +43,24 @@ RET_DIR=$PWD echo "Using RET_DIR: $RET_DIR" # Remove existing Nvidia modules (if any) -/usr/sbin/rmmod datagpu 2>/dev/null -/usr/sbin/rmmod nvidia-drm 2>/dev/null -/usr/sbin/rmmod nvidia-uvm 2>/dev/null -/usr/sbin/rmmod nvidia-modeset 2>/dev/null -/usr/sbin/rmmod nvidia 2>/dev/null +modules=("datagpu" "nvidia-drm" "nvidia-uvm" "nvidia-modeset" "nvidia" "nouveau") +for module in "${modules[@]}"; do + output=$(/usr/sbin/rmmod $module 2>&1) + status=$? + if [[ $status -ne 0 ]]; then + if [[ $output == *"Module $module is in use"* ]]; then + echo "Error: Module $module is in use. Stopping script." + exit 1 + fi + fi +done # Go to nvidia path and build Nvidia driver cd "$NVIDIA_PATH" || { echo "Error: Failed to change directory to $NVIDIA_PATH"; exit 1; } -make CC=$CC +# Clean previous builds +make clean +# Build Nvidia driver +make CC=$CC -j $(nproc) if modinfo ecc >/dev/null 2>&1; then modprobe ecc || { echo "Error: Failed to insert ecc module."; exit 1; } @@ -43,6 +74,8 @@ fi # Go to nvidia path and build Nvidia driver cd "$RET_DIR" || { echo "Error: Failed to change directory to $RET_DIR"; exit 1; } +# Clean previous builds +make clean +# Build datagpu driver make CC=$CC NVIDIA_DRIVERS=$NVIDIA_PATH /usr/sbin/insmod $RET_DIR/datagpu.ko || { echo "Error: Failed to insert datagpu.ko."; exit 1; } - diff --git a/data_gpu/driver/src/data_gpu_top.c b/data_gpu/driver/src/data_gpu_top.c index 7dd425a..803325f 100755 --- a/data_gpu/driver/src/data_gpu_top.c +++ b/data_gpu/driver/src/data_gpu_top.c @@ -310,12 +310,12 @@ void DataGpu_Remove(struct pci_dev *pcidev) { /* Decrement the global count of DMA devices. */ gDmaDevCount--; - /* Disable the PCI device to prevent further access. */ - pci_disable_device(pcidev); - /* Clean up DMA resources specific to this device. */ Dma_Clean(dev); + /* Disable the PCI device to prevent further access. 
*/ + pci_disable_device(pcidev); + /* Log the successful removal of the device. */ pr_info("%s: Remove: Driver is unloaded.\n", MOD_NAME); } diff --git a/rce_stream/driver/src/rce_top.c b/rce_stream/driver/src/rce_top.c index 966d41f..4e41c32 100755 --- a/rce_stream/driver/src/rce_top.c +++ b/rce_stream/driver/src/rce_top.c @@ -30,7 +30,11 @@ #include #include #include +#include + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 15, 0) #include +#endif // Init Configuration values int cfgTxCount0 = 8;