Skip to content

Commit

Permalink
feat(gpu): add shared nvidia boost clock logic (#2014)
Browse files Browse the repository at this point in the history
* Add shared nvidia boost clock logic

* Use systemd unit for gpu clocks in al2

* Use ConditionPathExists to start unit
  • Loading branch information
ndbaker1 authored Nov 12, 2024
1 parent 34f19d0 commit 6b5a944
Show file tree
Hide file tree
Showing 7 changed files with 52 additions and 38 deletions.
4 changes: 4 additions & 0 deletions templates/al2/provisioners/install-worker.sh
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,10 @@ fi
################################################################################

sudo mv "${WORKING_DIR}/runtime.slice" /etc/systemd/system/runtime.slice
# Install and enable the GPU clock unit on every variant. This is safe because
# the unit is gated by ConditionPathExists=/usr/bin/nvidia-smi, so it will not
# run if the required binaries are not present.
# WORKING_DIR is quoted so a path containing whitespace or glob characters is
# passed to mv as a single argument.
sudo mv "${WORKING_DIR}/set-nvidia-clocks.service" /etc/systemd/system/set-nvidia-clocks.service
sudo systemctl enable set-nvidia-clocks.service

###############################################################################
### Containerd setup ##########################################################
Expand Down
14 changes: 14 additions & 0 deletions templates/al2/runtime/set-nvidia-clocks.service
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# systemd unit that runs /usr/bin/set-nvidia-clocks once at boot to configure
# NVIDIA GPU clock rates.
[Unit]
Description=Configure NVIDIA GPU clock rate
# Start only after nvidia-persistenced.service, and pull it in as a hard
# dependency.
After=nvidia-persistenced.service
Requires=nvidia-persistenced.service

# Skip this unit (without marking it failed) when nvidia-smi is not installed,
# which lets the unit be enabled on images that ship no NVIDIA driver.
ConditionPathExists=/usr/bin/nvidia-smi

[Service]
# The script runs to completion once; RemainAfterExit keeps the unit reported
# as active after the process has exited.
Type=oneshot
ExecStart=/usr/bin/set-nvidia-clocks
RemainAfterExit=true

[Install]
# `systemctl enable` hooks the unit into multi-user.target.
WantedBy=multi-user.target
1 change: 0 additions & 1 deletion templates/al2023/provisioners/install-nvidia-driver.sh
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,6 @@ archive-proprietary-kmod
################################################################################

sudo mv ${WORKING_DIR}/gpu/nvidia-kmod-load.sh /etc/eks/
sudo mv ${WORKING_DIR}/gpu/set-nvidia-clocks.sh /etc/eks/
sudo mv ${WORKING_DIR}/gpu/nvidia-kmod-load.service /etc/systemd/system/nvidia-kmod-load.service
sudo mv ${WORKING_DIR}/gpu/set-nvidia-clocks.service /etc/systemd/system/set-nvidia-clocks.service
sudo systemctl daemon-reload
Expand Down
4 changes: 3 additions & 1 deletion templates/al2023/runtime/gpu/set-nvidia-clocks.service
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,11 @@ Description=Configure NVIDIA GPU clock rate
After=nvidia-persistenced.service
Requires=nvidia-persistenced.service

ConditionPathExists=/usr/bin/nvidia-smi

[Service]
Type=oneshot
ExecStart=/etc/eks/set-nvidia-clocks.sh
ExecStart=/usr/bin/set-nvidia-clocks
RemainAfterExit=true

[Install]
Expand Down
36 changes: 0 additions & 36 deletions templates/al2023/runtime/gpu/set-nvidia-clocks.sh

This file was deleted.

30 changes: 30 additions & 0 deletions templates/shared/runtime/bin/set-nvidia-clocks
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#!/usr/bin/env bash
#
# Enables persistence mode, disables auto boost, and pins the application
# clocks of the NVIDIA GPUs to the first (highest) memory and graphics clock
# values reported by nvidia-smi.
#
# Requires: nvidia-smi on PATH. State-changing calls go through sudo.
# Exit status: non-zero if nvidia-smi is unusable, a clock value cannot be
# determined, or any nvidia-smi command fails.

set -o errexit
set -o pipefail
set -o nounset

# nvidia-smi is required for this script.
# The probe output is captured in a variable instead of a fixed file under
# /tmp: a predictable temp path written by a privileged process can be
# pre-created or symlinked by another local user.
if ! NVIDIA_SMI_CHECK=$(nvidia-smi -q); then
  echo >&2 "ERROR: nvidia-smi check failed!"
  printf '%s\n' "${NVIDIA_SMI_CHECK}" >&2
  exit 1
fi

#######################################
# Prints the first value nvidia-smi lists for a supported-clock domain.
# Arguments: $1 - clock domain to query ("memory" or "graphics")
# Outputs:   the clock value, without a trailing newline, on stdout
# Returns:   non-zero if the query fails or the first row is not an integer
#######################################
first_supported_clock() {
  local domain=$1
  local listing first_row

  # Capture the whole listing and cut the first row in-shell. Piping into
  # `head -n 1` can kill nvidia-smi with SIGPIPE, which pipefail would then
  # report as a failure. errexit is not inherited by command substitutions,
  # so the failure is propagated explicitly.
  listing=$(nvidia-smi --query-supported-clocks="${domain}" --format=csv,noheader,nounits) || return
  first_row=${listing%%$'\n'*}

  # Accept only a bare integer (surrounding whitespace tolerated) so that an
  # empty result or a placeholder such as "[N/A]" fails here with a clear
  # message instead of being handed to --applications-clocks.
  if [[ ! "${first_row}" =~ ^[[:space:]]*([0-9]+)[[:space:]]*$ ]]; then
    echo >&2 "ERROR: could not determine supported ${domain} clock (got: '${first_row}')"
    return 1
  fi
  printf '%s' "${BASH_REMATCH[1]}"
}

# it's generally recommended to manually set clocks using max performance in
# order to get predictable performance.
# see: https://developer.nvidia.com/blog/advanced-api-performance-setstablepowerstate/
# see: https://developer.nvidia.com/blog/increase-performance-gpu-boost-k80-autoboost/

# persist device power states so that you don't incur a long start-up delay
# when initializing new contexts on a CUDA GPU.
sudo nvidia-smi --persistence-mode=1

# query the highest speed for both memory and graphics GPU clocks
# NOTE: su permissions are not required for queries
MEMORY_CLOCK=$(first_supported_clock memory)
GRAPHICS_CLOCK=$(first_supported_clock graphics)

# disable automatic clock boosts and specify desired maximum clock values
sudo nvidia-smi --auto-boost-default=0
sudo nvidia-smi --applications-clocks "${MEMORY_CLOCK},${GRAPHICS_CLOCK}"
1 change: 1 addition & 0 deletions templates/test/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -17,5 +17,6 @@ COPY al2/runtime/kubelet-kubeconfig /var/lib/kubelet/kubeconfig
COPY al2/runtime/ecr-credential-provider-config.json /etc/eks/image-credential-provider/config.json
COPY test/entrypoint.sh /entrypoint.sh
COPY al2/runtime/bin/* /usr/bin/
COPY shared/runtime/bin/* /usr/bin/
COPY test/mocks/ /sbin/
ENTRYPOINT ["/entrypoint.sh"]

0 comments on commit 6b5a944

Please sign in to comment.