Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update driver-container for SUSE SLE #126

Open
wants to merge 10 commits into
base: main
Choose a base branch
from
6 changes: 3 additions & 3 deletions sle15/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
ARG SLES_VERSION
FROM nvcr.io/nvidia/cuda:12.6.2-base-ubi8 as license
FROM nvcr.io/nvidia/cuda:12.6.0-base-ubi9 as license

FROM registry.suse.com/bci/golang:1.17 as build
FROM registry.suse.com/bci/golang:1.23 as build

RUN zypper --non-interactive install -y git wget tar gzip

Expand All @@ -12,7 +12,7 @@ RUN git clone https://github.com/NVIDIA/gpu-driver-container driver && \
go build -o vgpu-util && \
mv vgpu-util /work

FROM registry.suse.com/suse/sle15:$SLES_VERSION
FROM registry.suse.com/bci/bci-base:$SLES_VERSION

#ARG BASE_URL=http://us.download.nvidia.com/XFree86/Linux-x86_64
ARG BASE_URL=https://us.download.nvidia.com/tesla
Expand Down
108 changes: 96 additions & 12 deletions sle15/nvidia-driver
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,15 @@ NVIDIA_MODULE_PARAMS=()
NVIDIA_UVM_MODULE_PARAMS=()
NVIDIA_MODESET_MODULE_PARAMS=()

# Select the directory of the extracted driver package that the kernel modules
# are built from. KERNEL_TYPE is used below both as the sub-directory of
# /usr/src/nvidia-${DRIVER_VERSION} and as the value passed to
# `nvidia-installer -m=`.
#   OPEN_KERNEL_MODULES_ENABLED=true (default) -> kernel-open
#   anything else                              -> kernel
OPEN_KERNEL_MODULES_ENABLED=${OPEN_KERNEL_MODULES_ENABLED:-true}
if [[ "${OPEN_KERNEL_MODULES_ENABLED}" == "true" ]]; then
    KERNEL_TYPE=kernel-open
else
    KERNEL_TYPE=kernel
fi

_update_package_cache() {
if [ "${PACKAGE_TAG:-}" != "builtin" ]; then
echo "Updating the package cache..."
FLAVOR="$(echo ${KERNEL_VERSION} | cut -d- -f3)"
if [ "$FLAVOR" == "azure" ]; then
# consumed by container-suseconnect when calling `zypper refresh`
export ADDITIONAL_MODULES="sle-module-public-cloud"
fi
if ! zypper refresh; then
Expand Down Expand Up @@ -66,10 +70,13 @@ _install_prerequisites() (

echo "Installing Linux kernel source..."
local version_without_flavor=$(echo ${KERNEL_VERSION} | cut -d- -f-2)
export ZYPP_MODALIAS_SYSFS=$(mktemp /tmp/modalias-XXXX)
if ! zypper --non-interactive in -y --no-recommends --capability kernel-${FLAVOR} = ${version_without_flavor} kernel-${FLAVOR}-devel = ${version_without_flavor} ; then
echo "FATAL: failed to install kernel packages. Ensure SLES subscription is available."
rm -f ${ZYPP_MODALIAS_SYSFS}
exit 1
fi
rm -f ${ZYPP_MODALIAS_SYSFS}; unset ZYPP_MODALIAS_SYSFS

echo "Generating Linux kernel version string..."
extract-vmlinux /boot/vmlinuz-${KERNEL_VERSION} | strings | grep -E '^Linux version' | sed 's/^\(.*\)\s\+(.*)$/\1/' > version
Expand All @@ -96,8 +103,8 @@ _kernel_requires_package() {

echo "Checking NVIDIA driver packages..."

[[ ! -d /usr/src/nvidia-${DRIVER_VERSION}/kernel ]] && return 0
cd /usr/src/nvidia-${DRIVER_VERSION}/kernel
[[ ! -d /usr/src/nvidia-${DRIVER_VERSION}/${KERNEL_TYPE} ]] && return 0
cd /usr/src/nvidia-${DRIVER_VERSION}/${KERNEL_TYPE}

proc_mount_arg="--proc-mount-point /lib/modules/${KERNEL_VERSION}/proc"
for pkg_name in $(ls -d -1 precompiled/** 2> /dev/null); do
Expand All @@ -120,7 +127,7 @@ _create_driver_package() (
trap "make -s -j ${MAX_THREADS} SYSSRC=/lib/modules/${KERNEL_VERSION}/source clean > /dev/null" EXIT

echo "Compiling NVIDIA driver kernel modules..."
cd /usr/src/nvidia-${DRIVER_VERSION}/kernel
cd /usr/src/nvidia-${DRIVER_VERSION}/${KERNEL_TYPE}
make -s -j ${MAX_THREADS} SYSSRC=/lib/modules/${KERNEL_VERSION}/source nv-linux.o nv-modeset-linux.o > /dev/null

echo "Relinking NVIDIA driver kernel modules..."
Expand Down Expand Up @@ -205,6 +212,25 @@ _get_module_params() {

# Load the kernel modules and start persistenced.
_load_driver() {
local nv_fw_search_path="$RUN_DIR/driver/lib/firmware"
local set_fw_path="true"
local fw_path_config_file="/sys/module/firmware_class/parameters/path"
for param in "${NVIDIA_MODULE_PARAMS[@]}"; do
if [[ "$param" == "NVreg_EnableGpuFirmware=0" ]]; then
set_fw_path="false"
fi
done

if [[ "$set_fw_path" == "true" ]]; then
echo "Configuring the following firmware search path in '$fw_path_config_file': $nv_fw_search_path"
if [[ ! -z $(grep '[^[:space:]]' $fw_path_config_file) ]]; then
echo "WARNING: A search path is already configured in $fw_path_config_file"
echo " Retaining the current configuration"
else
echo -n "$nv_fw_search_path" > $fw_path_config_file || echo "WARNING: Failed to configure the firmware search path"
fi
fi

echo "Parsing kernel module parameters..."
_get_module_params

Expand Down Expand Up @@ -245,9 +271,11 @@ _load_driver() {
_unload_driver() {
local rmmod_args=()
local nvidia_deps=0
local nvidia_modeset_deps=0
local nvidia_refs=0
local nvidia_uvm_refs=0
local nvidia_modeset_refs=0
local nvidia_drm_refs=0

echo "Stopping NVIDIA persistence daemon..."
if [ -f /var/run/nvidia-persistenced/nvidia-persistenced.pid ]; then
Expand Down Expand Up @@ -295,6 +323,11 @@ _unload_driver() {
fi

echo "Unloading NVIDIA driver kernel modules..."
if [ -f /sys/module/nvidia_drm/refcnt ]; then
nvidia_drm_refs=$(< /sys/module/nvidia_drm/refcnt)
rmmod_args+=("nvidia-drm")
((++nvidia_modeset_deps))
fi
if [ -f /sys/module/nvidia_modeset/refcnt ]; then
nvidia_modeset_refs=$(< /sys/module/nvidia_modeset/refcnt)
rmmod_args+=("nvidia-modeset")
Expand All @@ -309,7 +342,7 @@ _unload_driver() {
nvidia_refs=$(< /sys/module/nvidia/refcnt)
rmmod_args+=("nvidia")
fi
if [ ${nvidia_refs} -gt ${nvidia_deps} ] || [ ${nvidia_uvm_refs} -gt 0 ] || [ ${nvidia_modeset_refs} -gt 0 ]; then
if [ ${nvidia_refs} -gt ${nvidia_deps} ] || [ ${nvidia_uvm_refs} -gt 0 ] || [ ${nvidia_modeset_refs} -gt ${nvidia_modeset_deps} ] || [ ${nvidia_drm_refs} -gt 0 ]; then
echo "Could not unload NVIDIA driver kernel modules, driver is in use" >&2
return 1
fi
Expand All @@ -331,7 +364,7 @@ _install_driver() {
if [ "${ACCEPT_LICENSE}" = "yes" ]; then
install_args+=("--accept-license")
fi
nvidia-installer --kernel-module-only --no-drm --ui=none --no-nouveau-check ${install_args[@]+"${install_args[@]}"}
IGNORE_CC_MISMATCH=1 nvidia-installer --kernel-module-only --ui=none --no-nouveau-check -m=${KERNEL_TYPE} --no-rebuild-initramfs ${install_args[@]+"${install_args[@]}"} --skip-module-load # --no-drm
}

# Mount the driver rootfs into the run directory with the exception of sysfs.
Expand All @@ -341,6 +374,16 @@ _mount_rootfs() {
mount --make-private /sys
mkdir -p ${RUN_DIR}/driver
mount --rbind / ${RUN_DIR}/driver

echo "Check SELinux status"
if [ -e /sys/fs/selinux ]; then
echo "SELinux is enabled"
echo "Change device files security context for selinux compatibility"
chcon -R -t container_file_t ${RUN_DIR}/driver/dev
else
echo "SELinux is disabled, skipping..."
fi

}

# Unmount the driver rootfs from the run directory.
Expand Down Expand Up @@ -419,7 +462,7 @@ _start_vgpu_topology_daemon() {
nvidia-topologyd
}

init() {
_prepare() {
if [ "${DRIVER_TYPE}" = "vgpu" ]; then
_find_vgpu_driver_version || exit 1
fi
Expand All @@ -429,11 +472,15 @@ init() {
cd NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION && \
sh /tmp/install.sh nvinstall && \
mkdir -p /usr/src/nvidia-$DRIVER_VERSION && \
mv LICENSE mkprecompiled kernel /usr/src/nvidia-$DRIVER_VERSION && \
mv LICENSE mkprecompiled $KERNEL_TYPE /usr/src/nvidia-$DRIVER_VERSION && \
sed '9,${/^\(kernel\|LICENSE\)/!d}' .manifest > /usr/src/nvidia-$DRIVER_VERSION/.manifest

echo -e "\n========== NVIDIA Software Installer ==========\n"
echo -e "Starting installation of NVIDIA driver version ${DRIVER_VERSION} for Linux kernel version ${KERNEL_VERSION}\n"
}

_prepare_exclusive() {
_prepare

exec 3> ${PID_FILE}
if ! flock -n 3; then
Expand All @@ -447,22 +494,35 @@ init() {

_unload_driver || exit 1
_unmount_rootfs
}

_build() {
local cleanup=false

# Install dependencies
if _kernel_requires_package; then
_update_package_cache
_resolve_kernel_version || exit 1
_install_prerequisites
_create_driver_package
#_remove_prerequisites
_cleanup_package_cache
cleanup=true
fi

# Build the driver
# Build the driver - rootfs needs to be mounted as the build magic attempts to
# load the driver.
_install_driver
_load_driver || exit 1
if $cleanup; then
# Do not call _remove_prerequisites as this will delete depmod information
_cleanup_package_cache
fi
}

_load() {
_mount_rootfs
# Something in the build process may have decided to load drivers that happened to be installed.
# Make sure they are unloaded.
lsmod | grep -q nvidia && { _unload_driver || exit 1; } || true
_load_driver || exit 1
_write_kernel_update_hook

echo "Done, now waiting for signal"
Expand All @@ -473,6 +533,26 @@ init() {
exit 0
}

# "init" command: run the three phases back to back, in this order:
# _prepare_exclusive, then _build, then _load.
init() {
    _prepare_exclusive
    _build
    _load
}

# "build" command: run _prepare (the non-exclusive preparation step) and then
# _build. Unlike init, this does not call _load.
build() {
    _prepare
    _build
}

# "load" command: run _prepare_exclusive and then _load, skipping the _build
# phase that init performs in between.
load() {
    _prepare_exclusive
    _load
}

update() {
exec 3>&2
if exec 2> /dev/null 4< ${PID_FILE}; then
Expand Down Expand Up @@ -511,7 +591,7 @@ update() {
if _kernel_requires_package; then
_create_driver_package
fi
#_remove_prerequisites
# Do not call _remove_prerequisites as this will delete depmod information
_cleanup_package_cache

echo "Done"
Expand All @@ -524,6 +604,8 @@ Usage: $0 COMMAND [ARG...]

Commands:
init [-a | --accept-license] [-m | --max-threads MAX_THREADS]
build [-a | --accept-license] [-m | --max-threads MAX_THREADS]
load
update [-k | --kernel VERSION] [-s | --sign KEYID] [-t | --tag TAG] [-m | --max-threads MAX_THREADS]
EOF
exit 1
Expand All @@ -535,6 +617,8 @@ fi
# Take the sub-command off the argument list, then let getopt(1) normalize the
# remaining arguments for that sub-command into ${options}. Any other
# sub-command falls through to usage.
command=$1; shift
case "${command}" in
    init) options=$(getopt -l accept-license,max-threads: -o am: -- "$@") ;;
    # -a / --accept-license is a plain flag (same as for init): it must not be
    # followed by ':' in the short-option spec, otherwise "-a" would consume
    # the next word (e.g. "-m") as its argument.
    build) options=$(getopt -l accept-license,tag:,max-threads: -o at:m: -- "$@") ;;
    load) options="" ;;
    update) options=$(getopt -l kernel:,sign:,tag:,max-threads: -o k:s:t:m: -- "$@") ;;
    *) usage ;;
esac
Expand Down