Skip to content

Commit

Permalink
integrate gpu-driver-util into the driver images
Browse files Browse the repository at this point in the history
Signed-off-by: Tariq Ibrahim <[email protected]>
  • Loading branch information
tariq1890 committed Jan 7, 2025
1 parent f7b59de commit 1ffa0da
Show file tree
Hide file tree
Showing 12 changed files with 203 additions and 29 deletions.
10 changes: 7 additions & 3 deletions rhel8/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,10 @@ ENV PATH /usr/local/go/bin:$PATH
WORKDIR /work

RUN git clone https://github.com/NVIDIA/gpu-driver-container driver && \
cd driver/vgpu/src && \
go build -o vgpu-util && \
mv vgpu-util /work
go build -C driver/vgpu/src -o vgpu-util && \
mv driver/vgpu/src/vgpu-util /work && \
go build -C driver/gpu-driver-util -o gpu-driver-util && \
mv driver/gpu-driver-util/gpu-driver-util /work

FROM nvcr.io/nvidia/cuda:12.6.3-base-ubi8

Expand Down Expand Up @@ -57,6 +58,9 @@ RUN sh /tmp/install.sh depinstall && \
chmod +x /usr/local/bin/donkey /usr/local/bin/extract-vmlinux && \
ln -s /sbin/ldconfig /sbin/ldconfig.real

# Install the nvidia-driver-assistant package to get the latest supported-gpus.json file.
# The dnf caches are cleaned in the same layer so they are not persisted in the image.
RUN dnf install -y nvidia-driver-assistant && \
    dnf clean all

# Copy the local drivers path into the image. COPY is used because none of ADD's extra
# behaviours (URL fetch, archive extraction) are wanted here.
# NOTE(review): assumes `drivers` is a plain directory in the build context - confirm.
COPY drivers drivers/

# Fetch the installer automatically for passthrough/baremetal types
Expand Down
24 changes: 22 additions & 2 deletions rhel8/nvidia-driver
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,7 @@ DNF_RELEASEVER=${DNF_RELEASEVER:-""}
RHEL_VERSION=${RHEL_VERSION:-""}
RHEL_MAJOR_VERSION=8

OPEN_KERNEL_MODULES_ENABLED=${OPEN_KERNEL_MODULES_ENABLED:-false}
[[ "${OPEN_KERNEL_MODULES_ENABLED}" == "true" ]] && KERNEL_TYPE=kernel-open || KERNEL_TYPE=kernel
KERNEL_MODULE_TYPE=${KERNEL_MODULE_TYPE:-auto}

DRIVER_ARCH=${TARGETARCH/amd64/x86_64} && DRIVER_ARCH=${DRIVER_ARCH/arm64/aarch64}
echo "DRIVER_ARCH is $DRIVER_ARCH"
Expand Down Expand Up @@ -577,6 +576,26 @@ _start_vgpu_topology_daemon() {
nvidia-topologyd
}

# Resolve KERNEL_TYPE from the KERNEL_MODULE_TYPE setting.
#
#   proprietary -> KERNEL_TYPE=kernel
#   open        -> KERNEL_TYPE=kernel-open
#   auto        -> KERNEL_TYPE is set to the output of
#                  `gpu-driver-util get-kernel-module-type -b "${DRIVER_BRANCH}"`
#
# Returns 0 on success. Returns 1, after printing a message, when KERNEL_MODULE_TYPE
# holds an unrecognised value, when gpu-driver-util exits non-zero, or when it
# prints nothing.
_resolve_kernel_type() {
    case "${KERNEL_MODULE_TYPE}" in
        proprietary)
            KERNEL_TYPE=kernel
            ;;
        open)
            KERNEL_TYPE=kernel-open
            ;;
        auto)
            # Test the exit status of the command directly rather than reading $? afterwards,
            # and reject empty output so a blank KERNEL_TYPE never reaches the callers.
            if ! KERNEL_TYPE=$(gpu-driver-util get-kernel-module-type -b "${DRIVER_BRANCH}") || [ -z "${KERNEL_TYPE}" ]; then
                echo "cannot autodetect the kernel module type, please check /var/log/gpu-driver-util.log for more details..."
                return 1
            fi
            ;;
        *)
            echo "invalid value for the KERNEL_MODULE_TYPE variable: ${KERNEL_MODULE_TYPE}"
            return 1
            ;;
    esac
    return 0
}

_prepare() {
if [ "${DRIVER_TYPE}" = "vgpu" ]; then
_find_vgpu_driver_version || exit 1
Expand Down Expand Up @@ -797,5 +816,6 @@ if [ $# -ne 0 ]; then
fi

_resolve_rhel_version || exit 1
_resolve_kernel_type || exit 1

$command
3 changes: 3 additions & 0 deletions rhel9/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,9 @@ RUN sh /tmp/install.sh depinstall && \
chmod +x /usr/local/bin/donkey /usr/local/bin/extract-vmlinux && \
ln -s /sbin/ldconfig /sbin/ldconfig.real

# Install the nvidia-driver-assistant package to get the latest supported-gpus.json file.
# The dnf caches are cleaned in the same layer so they are not persisted in the image.
RUN dnf install -y nvidia-driver-assistant && \
    dnf clean all

# Copy the local drivers path into the image. COPY is used because none of ADD's extra
# behaviours (URL fetch, archive extraction) are wanted here.
# NOTE(review): assumes `drivers` is a plain directory in the build context - confirm.
COPY drivers drivers/

# Fetch the installer automatically for passthrough/baremetal types
Expand Down
24 changes: 22 additions & 2 deletions rhel9/nvidia-driver
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,7 @@ DNF_RELEASEVER=${DNF_RELEASEVER:-""}
RHEL_VERSION=${RHEL_VERSION:-""}
RHEL_MAJOR_VERSION=9

OPEN_KERNEL_MODULES_ENABLED=${OPEN_KERNEL_MODULES_ENABLED:-false}
[[ "${OPEN_KERNEL_MODULES_ENABLED}" == "true" ]] && KERNEL_TYPE=kernel-open || KERNEL_TYPE=kernel
KERNEL_MODULE_TYPE=${KERNEL_MODULE_TYPE:-auto}

DRIVER_ARCH=${TARGETARCH/amd64/x86_64} && DRIVER_ARCH=${DRIVER_ARCH/arm64/aarch64}
echo "DRIVER_ARCH is $DRIVER_ARCH"
Expand Down Expand Up @@ -571,6 +570,26 @@ _find_vgpu_driver_version() {
return 0
}

# Resolve KERNEL_TYPE from the KERNEL_MODULE_TYPE setting.
#
#   proprietary -> KERNEL_TYPE=kernel
#   open        -> KERNEL_TYPE=kernel-open
#   auto        -> KERNEL_TYPE is set to the output of
#                  `gpu-driver-util get-kernel-module-type -b "${DRIVER_BRANCH}"`
#
# Returns 0 on success. Returns 1, after printing a message, when KERNEL_MODULE_TYPE
# holds an unrecognised value, when gpu-driver-util exits non-zero, or when it
# prints nothing.
_resolve_kernel_type() {
    case "${KERNEL_MODULE_TYPE}" in
        proprietary)
            KERNEL_TYPE=kernel
            ;;
        open)
            KERNEL_TYPE=kernel-open
            ;;
        auto)
            # Test the exit status of the command directly rather than reading $? afterwards,
            # and reject empty output so a blank KERNEL_TYPE never reaches the callers.
            if ! KERNEL_TYPE=$(gpu-driver-util get-kernel-module-type -b "${DRIVER_BRANCH}") || [ -z "${KERNEL_TYPE}" ]; then
                echo "cannot autodetect the kernel module type, please check /var/log/gpu-driver-util.log for more details..."
                return 1
            fi
            ;;
        *)
            echo "invalid value for the KERNEL_MODULE_TYPE variable: ${KERNEL_MODULE_TYPE}"
            return 1
            ;;
    esac
    return 0
}

_start_vgpu_topology_daemon() {
type nvidia-topologyd > /dev/null 2>&1 || return 0
echo "Starting nvidia-topologyd.."
Expand Down Expand Up @@ -797,5 +816,6 @@ if [ $# -ne 0 ]; then
fi

_resolve_rhel_version || exit 1
_resolve_kernel_type || exit 1

$command
13 changes: 10 additions & 3 deletions ubuntu20.04/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,10 @@ ENV PATH /usr/local/go/bin:$PATH
WORKDIR /work

RUN git clone https://github.com/NVIDIA/gpu-driver-container driver && \
cd driver/vgpu/src && \
go build -o vgpu-util && \
mv vgpu-util /work
go build -C driver/vgpu/src -o vgpu-util && \
mv driver/vgpu/src/vgpu-util /work && \
go build -C driver/gpu-driver-util -o gpu-driver-util && \
mv driver/gpu-driver-util/gpu-driver-util /work

FROM nvcr.io/nvidia/cuda:12.6.3-base-ubuntu20.04

Expand Down Expand Up @@ -69,9 +70,15 @@ RUN /tmp/install.sh reposetup && /tmp/install.sh depinstall && \
curl -fsSL -o /usr/local/bin/donkey https://github.com/3XX0/donkey/releases/download/v1.1.0/donkey && \
chmod +x /usr/local/bin/donkey

# Install nvidia-driver-assistant to obtain the latest supported-gpus.json file.
# The apt package lists are removed again within the same layer.
RUN apt-get update \
    && apt-get install --no-install-recommends -y nvidia-driver-assistant \
    && rm -rf /var/lib/apt/lists/*

# Driver entrypoint script from the build context.
COPY nvidia-driver /usr/local/bin

# Helper binaries produced in the `build` stage.
COPY --from=build /work/vgpu-util /usr/local/bin
COPY --from=build /work/gpu-driver-util /usr/local/bin

# Copy the local drivers path into the image. COPY is used because none of ADD's extra
# behaviours (URL fetch, archive extraction) are wanted here.
# NOTE(review): assumes `drivers` is a plain directory in the build context - confirm.
COPY drivers drivers/

Expand Down
27 changes: 25 additions & 2 deletions ubuntu20.04/nvidia-driver
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,7 @@ NVIDIA_MODESET_MODULE_PARAMS=()
NVIDIA_PEERMEM_MODULE_PARAMS=()
TARGETARCH=${TARGETARCH:?"Missing TARGETARCH env"}

OPEN_KERNEL_MODULES_ENABLED=${OPEN_KERNEL_MODULES_ENABLED:-false}
[[ "${OPEN_KERNEL_MODULES_ENABLED}" == "true" ]] && KERNEL_TYPE=kernel-open || KERNEL_TYPE=kernel
KERNEL_MODULE_TYPE=${KERNEL_MODULE_TYPE:-auto}

export DEBIAN_FRONTEND=noninteractive

Expand Down Expand Up @@ -477,6 +476,26 @@ _shutdown() {
return 1
}

# Resolve KERNEL_TYPE from the KERNEL_MODULE_TYPE setting.
#
#   proprietary -> KERNEL_TYPE=kernel
#   open        -> KERNEL_TYPE=kernel-open
#   auto        -> KERNEL_TYPE is set to the output of
#                  `gpu-driver-util get-kernel-module-type -b "${DRIVER_BRANCH}"`
#
# Returns 0 on success. Returns 1, after printing a message, when KERNEL_MODULE_TYPE
# holds an unrecognised value, when gpu-driver-util exits non-zero, or when it
# prints nothing.
_resolve_kernel_type() {
    case "${KERNEL_MODULE_TYPE}" in
        proprietary)
            KERNEL_TYPE=kernel
            ;;
        open)
            KERNEL_TYPE=kernel-open
            ;;
        auto)
            # Test the exit status of the command directly rather than reading $? afterwards,
            # and reject empty output so a blank KERNEL_TYPE never reaches the callers.
            if ! KERNEL_TYPE=$(gpu-driver-util get-kernel-module-type -b "${DRIVER_BRANCH}") || [ -z "${KERNEL_TYPE}" ]; then
                echo "cannot autodetect the kernel module type, please check /var/log/gpu-driver-util.log for more details..."
                return 1
            fi
            ;;
        *)
            echo "invalid value for the KERNEL_MODULE_TYPE variable: ${KERNEL_MODULE_TYPE}"
            return 1
            ;;
    esac
    return 0
}

_find_vgpu_driver_version() {
local count=""
local version=""
Expand Down Expand Up @@ -520,6 +539,8 @@ init() {
_find_vgpu_driver_version || exit 1
fi

_resolve_kernel_type || exit 1

# Install the userspace components and copy the kernel module sources.
sh NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION.run -x && \
cd NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION && \
Expand Down Expand Up @@ -592,6 +613,8 @@ update() {
fi
exec 3>&-

_resolve_kernel_type || exit 1

# vgpu driver version is chosen dynamically during runtime, so pre-compile modules for
# only non-vgpu driver types
if [ "${DRIVER_TYPE}" != "vgpu" ]; then
Expand Down
13 changes: 10 additions & 3 deletions ubuntu22.04/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,10 @@ ENV PATH /usr/local/go/bin:$PATH
WORKDIR /work

RUN git clone https://github.com/NVIDIA/gpu-driver-container driver && \
cd driver/vgpu/src && \
go build -o vgpu-util && \
mv vgpu-util /work
go build -C driver/vgpu/src -o vgpu-util && \
mv driver/vgpu/src/vgpu-util /work && \
go build -C driver/gpu-driver-util -o gpu-driver-util && \
mv driver/gpu-driver-util/gpu-driver-util /work

FROM nvcr.io/nvidia/cuda:12.6.3-base-ubuntu22.04

Expand Down Expand Up @@ -69,9 +70,15 @@ RUN /tmp/install.sh reposetup && /tmp/install.sh depinstall && \
curl -fsSL -o /usr/local/bin/donkey https://github.com/3XX0/donkey/releases/download/v1.1.0/donkey && \
chmod +x /usr/local/bin/donkey

# Install nvidia-driver-assistant to obtain the latest supported-gpus.json file.
# The apt package lists are removed again within the same layer.
RUN apt-get update \
    && apt-get install --no-install-recommends -y nvidia-driver-assistant \
    && rm -rf /var/lib/apt/lists/*

# Driver entrypoint script from the build context.
COPY nvidia-driver /usr/local/bin

# Helper binaries produced in the `build` stage.
COPY --from=build /work/vgpu-util /usr/local/bin
COPY --from=build /work/gpu-driver-util /usr/local/bin

# Copy the local drivers path into the image. COPY is used because none of ADD's extra
# behaviours (URL fetch, archive extraction) are wanted here.
# NOTE(review): assumes `drivers` is a plain directory in the build context - confirm.
COPY drivers drivers/

Expand Down
28 changes: 25 additions & 3 deletions ubuntu22.04/nvidia-driver
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,7 @@ NVIDIA_UVM_MODULE_PARAMS=()
NVIDIA_MODESET_MODULE_PARAMS=()
NVIDIA_PEERMEM_MODULE_PARAMS=()
TARGETARCH=${TARGETARCH:?"Missing TARGETARCH env"}

OPEN_KERNEL_MODULES_ENABLED=${OPEN_KERNEL_MODULES_ENABLED:-false}
[[ "${OPEN_KERNEL_MODULES_ENABLED}" == "true" ]] && KERNEL_TYPE=kernel-open || KERNEL_TYPE=kernel
KERNEL_MODULE_TYPE=${KERNEL_MODULE_TYPE:-auto}

export DEBIAN_FRONTEND=noninteractive

Expand Down Expand Up @@ -481,6 +479,26 @@ _shutdown() {
return 1
}

# Resolve KERNEL_TYPE from the KERNEL_MODULE_TYPE setting.
#
#   proprietary -> KERNEL_TYPE=kernel
#   open        -> KERNEL_TYPE=kernel-open
#   auto        -> KERNEL_TYPE is set to the output of
#                  `gpu-driver-util get-kernel-module-type -b "${DRIVER_BRANCH}"`
#
# Returns 0 on success. Returns 1, after printing a message, when KERNEL_MODULE_TYPE
# holds an unrecognised value, when gpu-driver-util exits non-zero, or when it
# prints nothing.
_resolve_kernel_type() {
    case "${KERNEL_MODULE_TYPE}" in
        proprietary)
            KERNEL_TYPE=kernel
            ;;
        open)
            KERNEL_TYPE=kernel-open
            ;;
        auto)
            # Test the exit status of the command directly rather than reading $? afterwards,
            # and reject empty output so a blank KERNEL_TYPE never reaches the callers.
            if ! KERNEL_TYPE=$(gpu-driver-util get-kernel-module-type -b "${DRIVER_BRANCH}") || [ -z "${KERNEL_TYPE}" ]; then
                echo "cannot autodetect the kernel module type, please check /var/log/gpu-driver-util.log for more details..."
                return 1
            fi
            ;;
        *)
            echo "invalid value for the KERNEL_MODULE_TYPE variable: ${KERNEL_MODULE_TYPE}"
            return 1
            ;;
    esac
    return 0
}

_find_vgpu_driver_version() {
local count=""
local version=""
Expand Down Expand Up @@ -524,6 +542,8 @@ init() {
_find_vgpu_driver_version || exit 1
fi

_resolve_kernel_type || exit 1

# Install the userspace components and copy the kernel module sources.
sh NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION.run -x && \
cd NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION && \
Expand Down Expand Up @@ -596,6 +616,8 @@ update() {
fi
exec 3>&-

_resolve_kernel_type || exit 1

# vgpu driver version is chosen dynamically during runtime, so pre-compile modules for
# only non-vgpu driver types
if [ "${DRIVER_TYPE}" != "vgpu" ]; then
Expand Down
25 changes: 23 additions & 2 deletions ubuntu22.04/precompiled/nvidia-driver
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
set -eu

KERNEL_VERSION=$(uname -r)
OPEN_KERNEL_MODULES_ENABLED="${OPEN_KERNEL_MODULES_ENABLED:-false}"
KERNEL_MODULE_TYPE=${KERNEL_MODULE_TYPE:-auto}
RUN_DIR=/run/nvidia
PID_FILE=${RUN_DIR}/${0##*/}.pid
DRIVER_BRANCH=${DRIVER_BRANCH:?"Missing driver version"}
Expand Down Expand Up @@ -96,6 +96,26 @@ _get_module_params() {
fi
}

# Resolve KERNEL_TYPE from the KERNEL_MODULE_TYPE setting.
#
#   proprietary -> KERNEL_TYPE=kernel
#   open        -> KERNEL_TYPE=kernel-open
#   auto        -> KERNEL_TYPE is set to the output of
#                  `gpu-driver-util get-kernel-module-type -b "${DRIVER_BRANCH}"`
#
# Returns 0 on success. Returns 1, after printing a message, when KERNEL_MODULE_TYPE
# holds an unrecognised value, when gpu-driver-util exits non-zero, or when it
# prints nothing.
_resolve_kernel_type() {
    case "${KERNEL_MODULE_TYPE}" in
        proprietary)
            KERNEL_TYPE=kernel
            ;;
        open)
            KERNEL_TYPE=kernel-open
            ;;
        auto)
            # Test the exit status of the command directly rather than reading $? afterwards,
            # and reject empty output so a blank KERNEL_TYPE never reaches the callers.
            if ! KERNEL_TYPE=$(gpu-driver-util get-kernel-module-type -b "${DRIVER_BRANCH}") || [ -z "${KERNEL_TYPE}" ]; then
                echo "cannot autodetect the kernel module type, please check /var/log/gpu-driver-util.log for more details..."
                return 1
            fi
            ;;
        *)
            echo "invalid value for the KERNEL_MODULE_TYPE variable: ${KERNEL_MODULE_TYPE}"
            return 1
            ;;
    esac
    return 0
}

# Load the kernel modules and start persistenced.
_load_driver() {
echo "Parsing kernel module parameters..."
Expand Down Expand Up @@ -245,7 +265,7 @@ _install_driver() {
xserver-xorg-video-nvidia-${DRIVER_BRANCH}-server

# Now install the precompiled kernel module packages signed by Canonical
if [ "$OPEN_KERNEL_MODULES_ENABLED" = true ]; then
if [ "${KERNEL_TYPE}" == "kernel-open" ]; then
echo "Installing Open NVIDIA driver kernel modules..."
apt-get install --no-install-recommends -y \
linux-signatures-nvidia-${KERNEL_VERSION} \
Expand Down Expand Up @@ -293,6 +313,7 @@ init() {
_unload_driver || exit 1
_unmount_rootfs

_resolve_kernel_type || exit 1
_install_driver
_load_driver || exit 1
_mount_rootfs
Expand Down
15 changes: 10 additions & 5 deletions ubuntu24.04/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,6 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
git && \
rm -rf /var/lib/apt/lists/*



# download appropriate binary based on the target architecture for multi-arch builds
RUN OS_ARCH=${TARGETARCH/x86_64/amd64} && OS_ARCH=${OS_ARCH/aarch64/arm64} && \
curl https://storage.googleapis.com/golang/go${GOLANG_VERSION}.linux-${OS_ARCH}.tar.gz \
Expand All @@ -30,9 +28,10 @@ ENV PATH /usr/local/go/bin:$PATH
WORKDIR /work

RUN git clone https://github.com/NVIDIA/gpu-driver-container driver && \
cd driver/vgpu/src && \
go build -o vgpu-util && \
mv vgpu-util /work
go build -C driver/vgpu/src -o vgpu-util && \
mv driver/vgpu/src/vgpu-util /work && \
go build -C driver/gpu-driver-util -o gpu-driver-util && \
mv driver/gpu-driver-util/gpu-driver-util /work

FROM nvcr.io/nvidia/cuda:12.6.3-base-ubuntu24.04

Expand Down Expand Up @@ -66,9 +65,15 @@ ADD install.sh /tmp
RUN usermod -o -u 0 -g 0 _apt && \
/tmp/install.sh depinstall && /tmp/install.sh setup_cuda_repo

# Install nvidia-driver-assistant to obtain the latest supported-gpus.json file.
# The apt package lists are removed again within the same layer.
RUN apt-get update \
    && apt-get install --no-install-recommends -y nvidia-driver-assistant \
    && rm -rf /var/lib/apt/lists/*

# Driver entrypoint script from the build context.
COPY nvidia-driver /usr/local/bin

# Helper binaries produced in the `build` stage.
COPY --from=build /work/vgpu-util /usr/local/bin
COPY --from=build /work/gpu-driver-util /usr/local/bin

# Copy the local drivers path into the image. COPY is used because none of ADD's extra
# behaviours (URL fetch, archive extraction) are wanted here.
# NOTE(review): assumes `drivers` is a plain directory in the build context - confirm.
COPY drivers drivers/

Expand Down
Loading

0 comments on commit 1ffa0da

Please sign in to comment.