Skip to content

Commit

Permalink
skip old architecture version GPU settings time slice
Browse files Browse the repository at this point in the history
Signed-off-by: wawa0210 <[email protected]>
  • Loading branch information
wawa0210 committed Feb 1, 2024
1 parent b6c7aae commit a672dd7
Showing 1 changed file with 22 additions and 1 deletion.
23 changes: 22 additions & 1 deletion cmd/nvidia-dra-plugin/sharing.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ import (
"text/template"
"time"

"golang.org/x/mod/semver"
appsv1 "k8s.io/api/apps/v1"
"k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
Expand Down Expand Up @@ -101,6 +102,16 @@ func (t *TimeSlicingManager) SetTimeSlice(devices *PreparedDevices, config *nasc
return fmt.Errorf("setting a TimeSlice duration on MIG devices is unsupported")
}

var supportTimeSliceIDs []string
for _, gpu := range devices.Gpu.Devices {
isSupportTimeSlice := detectSupportTimeSliceByCudaComputeCapability(gpu.cudaComputeCapability)
if isSupportTimeSlice {
supportTimeSliceIDs = append(supportTimeSliceIDs, gpu.uuid)
continue
}
klog.InfoS("the current card does not support setting time slices and will be ignored.", "arch", gpu.architecture, "uuid", gpu.uuid, "cudaComputeCapability", gpu.cudaComputeCapability)
}

timeSlice := nascrd.DefaultTimeSlice
if config != nil && config.TimeSlice != nil {
timeSlice = *config.TimeSlice
Expand All @@ -111,7 +122,7 @@ func (t *TimeSlicingManager) SetTimeSlice(devices *PreparedDevices, config *nasc
return fmt.Errorf("error setting compute mode: %w", err)
}

err = t.nvdevlib.setTimeSlice(devices.UUIDs(), timeSlice.Int())
err = t.nvdevlib.setTimeSlice(supportTimeSliceIDs, timeSlice.Int())
if err != nil {
return fmt.Errorf("error setting time slice: %w", err)
}
Expand Down Expand Up @@ -389,3 +400,13 @@ func (m *MpsControlDaemon) Stop(ctx context.Context) error {

return nil
}

// detectSupportTimeSliceByCudaComputeCapability reports whether a GPU with the
// given CUDA compute capability supports setting a time slice.
//
// Time-slicing is believed to be available on Volta and newer architectures,
// which corresponds to cudaComputeCapability >= 7.0; see
// https://github.com/NVIDIA/k8s-dra-driver/pull/58#discussion_r1469338562.
func detectSupportTimeSliceByCudaComputeCapability(cudaComputeCapability string) bool {
	// Lowest compute capability (Volta) on which time slices can be set.
	const minSupported = "v7.0"

	// Per https://github.com/NVIDIA/go-nvlib/blob/main/pkg/nvlib/device/device.go#L149
	// the CUDA major and minor versions are joined with `.`, so the value can
	// be compared as a shorthand semantic version. Prepend the "v" that semver
	// expects; a single existing "v" prefix is stripped first so it is not
	// doubled.
	version := "v" + strings.TrimPrefix(cudaComputeCapability, "v")

	return semver.Compare(version, minSupported) >= 0
}

0 comments on commit a672dd7

Please sign in to comment.