From 531a905884b682389922af9bef977edb79539c64 Mon Sep 17 00:00:00 2001 From: Patrick Ohly Date: Fri, 10 Jan 2025 16:49:44 +0100 Subject: [PATCH] DRA: add admin controlled device attributes --- keps/prod-readiness/sig-node/5027.yaml | 6 + .../README.md | 868 ++++++++++++++++++ .../kep.yaml | 40 + 3 files changed, 914 insertions(+) create mode 100644 keps/prod-readiness/sig-node/5027.yaml create mode 100644 keps/sig-node/5027-dra-admin-controlled-device-attributes/README.md create mode 100644 keps/sig-node/5027-dra-admin-controlled-device-attributes/kep.yaml diff --git a/keps/prod-readiness/sig-node/5027.yaml b/keps/prod-readiness/sig-node/5027.yaml new file mode 100644 index 00000000000..47cdd47b8f3 --- /dev/null +++ b/keps/prod-readiness/sig-node/5027.yaml @@ -0,0 +1,6 @@ +# The KEP must have an approver from the +# "prod-readiness-approvers" group +# of http://git.k8s.io/enhancements/OWNERS_ALIASES +kep-number: 5027 +alpha: + approver: "@johnbelamaric" diff --git a/keps/sig-node/5027-dra-admin-controlled-device-attributes/README.md b/keps/sig-node/5027-dra-admin-controlled-device-attributes/README.md new file mode 100644 index 00000000000..2a5ecfa8611 --- /dev/null +++ b/keps/sig-node/5027-dra-admin-controlled-device-attributes/README.md @@ -0,0 +1,868 @@ + +# [KEP-5027](https://github.com/kubernetes/enhancements/issues/5027): DRA: admin-controlled device attributes (device health, maintenance, priority) + + + +- [Release Signoff Checklist](#release-signoff-checklist) +- [Summary](#summary) +- [Motivation](#motivation) + - [Goals](#goals) + - [Non-Goals](#non-goals) +- [Proposal](#proposal) + - [User Stories (Optional)](#user-stories-optional) + - [Story 1](#story-1) + - [Story 2](#story-2) + - [Notes/Constraints/Caveats (Optional)](#notesconstraintscaveats-optional) + - [Risks and Mitigations](#risks-and-mitigations) +- [Design Details](#design-details) + - [Test Plan](#test-plan) + - [Prerequisite testing updates](#prerequisite-testing-updates) + - [Unit 
tests](#unit-tests) + - [Integration tests](#integration-tests) + - [e2e tests](#e2e-tests) + - [Graduation Criteria](#graduation-criteria) + - [Upgrade / Downgrade Strategy](#upgrade--downgrade-strategy) + - [Version Skew Strategy](#version-skew-strategy) +- [Production Readiness Review Questionnaire](#production-readiness-review-questionnaire) + - [Feature Enablement and Rollback](#feature-enablement-and-rollback) + - [Rollout, Upgrade and Rollback Planning](#rollout-upgrade-and-rollback-planning) + - [Monitoring Requirements](#monitoring-requirements) + - [Dependencies](#dependencies) + - [Scalability](#scalability) + - [Troubleshooting](#troubleshooting) +- [Implementation History](#implementation-history) +- [Drawbacks](#drawbacks) +- [Alternatives](#alternatives) +- [Infrastructure Needed (Optional)](#infrastructure-needed-optional) + + +## Release Signoff Checklist + + + +Items marked with (R) are required *prior to targeting to a milestone / release*. + +- [ ] (R) Enhancement issue in release milestone, which links to KEP dir in [kubernetes/enhancements] (not the initial KEP PR) +- [ ] (R) KEP approvers have approved the KEP status as `implementable` +- [ ] (R) Design details are appropriately documented +- [ ] (R) Test plan is in place, giving consideration to SIG Architecture and SIG Testing input (including test refactors) + - [ ] e2e Tests for all Beta API Operations (endpoints) + - [ ] (R) Ensure GA e2e tests meet requirements for [Conformance Tests](https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/conformance-tests.md) + - [ ] (R) Minimum Two Week Window for GA e2e tests to prove flake free +- [ ] (R) Graduation criteria is in place + - [ ] (R) [all GA Endpoints](https://github.com/kubernetes/community/pull/1806) must be hit by [Conformance Tests](https://github.com/kubernetes/community/blob/master/contributors/devel/sig-architecture/conformance-tests.md) +- [ ] (R) Production readiness review completed +- [ ] 
(R) Production readiness review approved +- [ ] "Implementation History" section is up-to-date for milestone +- [ ] User-facing documentation has been created in [kubernetes/website], for publication to [kubernetes.io] +- [ ] Supporting documentation—e.g., additional design documents, links to mailing list discussions/SIG meetings, relevant PRs/issues, release notes + + + +[kubernetes.io]: https://kubernetes.io/ +[kubernetes/enhancements]: https://git.k8s.io/enhancements +[kubernetes/kubernetes]: https://git.k8s.io/kubernetes +[kubernetes/website]: https://git.k8s.io/website + +## Summary + + + +With Dynamic Resource Allocation (DRA), DRA drivers publish information about +the devices that they manage in ResourceSlices. This information is used by the +scheduler when selecting devices for user requests in ResourceClaims. + +This KEP adds a Kubernetes API that privileged users, typically cluster +administrators, can use to override or extend that information. This can be +permanent as part of the installation of a DRA driver to adapt the driver to +the cluster or temporary as part of cluster maintenance. + +This generic mechanism can be used to mark devices as offline, regardless +whether it is because of device health as observed by some component other than +the driver itself or because the cluster administrator is doing some manual +maintenance work. + +The other usage is to influence which devices are picked when there are +multiple viable alternatives. This is a first step towards providing a more +comprehensive [scoring](https://github.com/kubernetes/enhancements/issues/4970) +solution. + +For both usages this KEP standardizes device attributes that need to be checked +by the scheduler before using a device for a ResourceClaim. + +## Motivation + +### Goals + +- Enable preventing usage of certain devices without having to reconfigure the + DRA driver. 
+
+- Enable configuring a cluster where some devices are local and others are
+  attached via some fabric such that local devices are preferred by the
+  scheduler (https://github.com/kubernetes/kubernetes/issues/124042#issuecomment-2550962390).
+
+### Non-Goals
+
+- A complete scoring solution which also considers user preferences
+  and picks the "best" device.
+
+## Proposal
+
+The intent to override device attributes must be recorded persistently so that
+it is preserved even when a ResourceSlice gets removed or updated. To achieve
+this, a new cluster-scoped ResourceSliceOverride type gets added. A single
+ResourceSliceOverride object specifies device attributes that apply to all
+devices matching a CEL expression, i.e. the same way as users select devices in
+a ResourceClaim.
+
+The scheduler must merge these additional attributes with the ones provided by
+the DRA drivers. The "kubernetes.io/offline" string attribute contains a
+free-form explanation why the device is not currently available. Such a device
+must be ignored by the scheduler. The "kubernetes.io/priority" integer defines
+which devices the scheduler should look at first when searching for devices
+matching the user request.
+
+```
+<<[UNRESOLVED @pohly ]>>
+As defined now, a device has exactly one "kubernetes.io/priority" value.
+It might make sense to rename this to "kubernetes.io/admin-priority" (name?) to
+make it clearer that this is the intent of the cluster admin. That leaves
+room for defining other priorities.
+<<[/UNRESOLVED]>>
+```
+
+DRA drivers may also set these attributes directly in their ResourceSlices.
+"kubernetes.io/offline" then replaces the current approach of removing a device
+that is offline from its ResourceSlice. Setting "kubernetes.io/priority" makes
+less sense because it is a cluster policy decision, but isn't prevented. 
+ +### User Stories + +#### External Health Monitoring + +As cluster admin, I am deploying a vendor-provided DRA driver together with a +separate monitoring component for hardware aspects that are not available or +not supported by that DRA driver. When that component detects problems, it can +check its policy configuration and decide to take devices offline by creating +a ResourceSliceOverride with "kubernetes.io/offline" for affected devices. + +#### Composable Disaggregated Infrastructure + +As a cluster admin, I want to improve hardware utilization by making devices +available on nodes on demand. Some GPUs are plugged into a local PCI slot of +some nodes and advertised by the local DRA driver on the node, others are +connected dynamically through a PCI switch and advertised by a control plane +component. For performance reasons I want to ensure that the scheduler prefers +local devices, so I create a ResourceSliceOverride matching local devices with +a higher "kubernetes.io/priority". + +### Notes/Constraints/Caveats + +Users who look at ResourceSlices to figure out which devices are available also +need to consider ResourceSliceOverrides to get the full picture. Copying from +the ResourceSliceOverride spec into the ResourceSlice status could help here, +but would not be instantaneous and potentially cause write amplification (one +ResourceSliceOverride affecting many different devices) and therefore is not +part of this proposal. + +Perhaps `kubectl describe resourceslices` can be extended to include the +additional information. For now this is out of scope. + +Creating a ResourceSliceOverride is racing with on-going scheduling attempts, +which is unavoidable. + +### Risks and Mitigations + + + +## Design Details + +### API + +The ResourceSliceOverride is a cluster-scoped type in the `resource.k8s.io` API +group, initially in `v1alpha3` (the alpha version in Kubernetes 1.32). 
+
+```Go
+type ResourceSliceOverride struct {
+	metav1.TypeMeta
+	// Standard object metadata
+	// +optional
+	metav1.ObjectMeta
+
+	// Changing the spec automatically increments the metadata.generation number.
+	Spec ResourceSliceOverrideSpec
+}
+
+type ResourceSliceOverrideSpec struct {
+	// Devices defines how to override device attributes.
+	Devices DeviceOverride
+}
+
+type DeviceOverride struct {
+	// Selectors define criteria which must be satisfied by a specific
+	// device, otherwise the override is ignored for it.
+	// All selectors must be satisfied.
+	//
+	// +optional
+	// +listType=atomic
+	Selectors []DeviceSelector
+
+	// If a ResourceSlice and a DeviceOverride define the same attribute or
+	// capacity, the value of the DeviceOverride is used. If multiple
+	// different DeviceOverrides match the same device, then the one with
+	// the highest rank wins. If the ranks are the same, it is non-deterministic
+	// which override is used.
+	Rank int
+
+	// Attributes defines the set of attributes to override for matching devices.
+	// The name of each attribute must be unique in that set and
+	// include the domain prefix.
+	//
+	// The maximum number of attributes and capacities combined is 32.
+	//
+	// +optional
+	Attributes map[FullyQualifiedName]DeviceAttribute
+
+	// Capacity defines the set of capacities to override for matching devices.
+	// The name of each capacity must be unique in that set and
+	// include the domain prefix.
+	//
+	// The maximum number of attributes and capacities combined is 32.
+	//
+	// +optional
+	Capacity map[QualifiedName]DeviceCapacity
+}
+
+// AttributeNameDevicePriority is a standardized attribute name. Its value must be an integer.
+// It may be positive or negative, with 0 the default if not set for a device.
+// When looking for devices that match a request, the scheduler will try to
+// use devices with a higher priority first.
+//
+// This is just a hint. 
It may be ignored by the scheduler and/or the exact
+// behavior may evolve over time. This scheduler behavior is controlled by
+// the DRAAdminControlledDeviceAttributes feature gate.
+const AttributeNameDevicePriority = "kubernetes.io/priority"
+
+// AttributeNameOffline is a standardized attribute name. Its value must be a string.
+// The string itself should be a free-form human-readable explanation. If this
+// attribute is set for a device, the scheduler will ignore it when looking
+// for devices that match a request.
+//
+// This scheduler behavior is controlled by the DRAAdminControlledDeviceAttributes
+// feature gate.
+const AttributeNameOffline = "kubernetes.io/offline"
+```
+
+Validation of these standardized attributes rejects invalid values when the
+feature gate is enabled, but only when creating new ResourceSlices or
+ResourceSliceOverrides. Existing objects might have been created before these
+attributes were standardized or while their validation was turned off.
+
+```
+<<[UNRESOLVED @pohly]>>
+Should validation reject unknown attribute names? This may be possible on creation.
+For updates, whatever was already stored must remain valid to support downgrades or
+disabling features like DRAAdminControlledDeviceAttributes.
+<<[/UNRESOLVED]>>
+```
+
+
+### Test Plan
+
+[X] I/we understand the owners of the involved components may require updates to
+existing tests to make this code solid enough prior to committing the changes necessary
+to implement this enhancement.
+
+##### Prerequisite testing updates
+
+None.
+
+##### Unit tests
+
+
+
+v1.32.0:
+
+- `k8s.io/dynamic-resource-allocation/structured`: 91.3%
+- `k8s.io/kubernetes/pkg/apis/resource/validation`: 98.6%
+
+
+##### Integration tests
+
+
+
+Additional scenarios will be added to `test/integration/scheduler_perf`, not
+just for correctness but also to evaluate a potential performance impact. 
+ +- : + +##### e2e tests + + + +One E2E test scenario is to mark all devices as offline and then verify that +pods don't get scheduled. Another is to set different priorities and check that +the scheduler picks the device with the highest one. Some care will be needed +to avoid flakes because the creation of a ResourceSliceOverride will not be +immediately visible to the scheduler. + +- : + +### Graduation Criteria + +#### Alpha + +- Feature implemented behind a feature flag +- Initial e2e tests completed and enabled + +#### Beta + +- Gather feedback from developers and surveys +- Additional tests are in Testgrid and linked in KEP + +#### GA + +- 3 examples of real-world usage +- Allowing time for feedback +- [Conformance tests] + +[conformance tests]: https://git.k8s.io/community/contributors/devel/sig-architecture/conformance-tests.md + +### Upgrade / Downgrade Strategy + + + +### Version Skew Strategy + + + +## Production Readiness Review Questionnaire + + + +### Feature Enablement and Rollback + + + +###### How can this feature be enabled / disabled in a live cluster? + + + +- [ ] Feature gate (also fill in values in `kep.yaml`) + - Feature gate name: + - Components depending on the feature gate: +- [ ] Other + - Describe the mechanism: + - Will enabling / disabling the feature require downtime of the control + plane? + - Will enabling / disabling the feature require downtime or reprovisioning + of a node? + +###### Does enabling the feature change any default behavior? + + + +###### Can the feature be disabled once it has been enabled (i.e. can we roll back the enablement)? + + + +###### What happens if we reenable the feature if it was previously rolled back? + +###### Are there any tests for feature enablement/disablement? + + + +### Rollout, Upgrade and Rollback Planning + + + +###### How can a rollout or rollback fail? Can it impact already running workloads? + + + +###### What specific metrics should inform a rollback? 
+ + + +###### Were upgrade and rollback tested? Was the upgrade->downgrade->upgrade path tested? + + + +###### Is the rollout accompanied by any deprecations and/or removals of features, APIs, fields of API types, flags, etc.? + + + +### Monitoring Requirements + + + +###### How can an operator determine if the feature is in use by workloads? + + + +###### How can someone using this feature know that it is working for their instance? + + + +- [ ] Events + - Event Reason: +- [ ] API .status + - Condition name: + - Other field: +- [ ] Other (treat as last resort) + - Details: + +###### What are the reasonable SLOs (Service Level Objectives) for the enhancement? + + + +###### What are the SLIs (Service Level Indicators) an operator can use to determine the health of the service? + + + +- [ ] Metrics + - Metric name: + - [Optional] Aggregation method: + - Components exposing the metric: +- [ ] Other (treat as last resort) + - Details: + +###### Are there any missing metrics that would be useful to have to improve observability of this feature? + + + +### Dependencies + + + +###### Does this feature depend on any specific services running in the cluster? + + + +### Scalability + + + +###### Will enabling / using this feature result in any new API calls? + + + +###### Will enabling / using this feature result in introducing new API types? + + + +###### Will enabling / using this feature result in any new calls to the cloud provider? + + + +###### Will enabling / using this feature result in increasing size or count of the existing API objects? + + + +###### Will enabling / using this feature result in increasing time taken by any operations covered by existing SLIs/SLOs? + + + +###### Will enabling / using this feature result in non-negligible increase of resource usage (CPU, RAM, disk, IO, ...) in any components? + + + +###### Can enabling / using this feature result in resource exhaustion of some node resources (PIDs, sockets, inodes, etc.)? 
+
+### Troubleshooting
+
+
+
+###### How does this feature react if the API server and/or etcd is unavailable?
+
+###### What are other known failure modes?
+
+
+
+###### What steps should be taken if SLOs are not being met to determine the problem?
+
+## Implementation History
+
+
+
+## Drawbacks
+
+
+
+## Alternatives
+
+Instead of standardized attributes, top-level fields could be introduced. But
+the ability to override all kinds of attributes may be useful by itself. Once
+that is implemented, adding attributes with special semantic is simpler than
+dedicated fields.
+
+Instead of ResourceSliceOverride as a separate type, new fields in the
+ResourceSlice status could be modified by an admin. That has the problem that
+the ResourceSlice object might get deleted while doing cluster maintenance like
+a driver update, in which case the admin intent would get lost. A driver would
+not be able to publish a new ResourceSlice where a device is immediately marked
+as offline because creating a ResourceSlice strips the status.
diff --git a/keps/sig-node/5027-dra-admin-controlled-device-attributes/kep.yaml b/keps/sig-node/5027-dra-admin-controlled-device-attributes/kep.yaml
new file mode 100644
index 00000000000..0e3ebd2d03a
--- /dev/null
+++ b/keps/sig-node/5027-dra-admin-controlled-device-attributes/kep.yaml
@@ -0,0 +1,40 @@
+title: "DRA: admin-controlled device attributes"
+kep-number: 5027
+authors:
+  - "@pohly"
+owning-sig: sig-node
+participating-sigs:
+  - sig-scheduling
+status: implementable
+creation-date: 2025-01-10
+reviewers:
+  - TBD
+approvers:
+  - TBD
+
+see-also:
+  - "/keps/sig-node/4381-dra-structured-parameters"
+
+# The target maturity stage in the current dev cycle for this KEP.
+stage: alpha
+
+# The most recent milestone for which work toward delivery of this KEP has been
+# done. This can be the current (upcoming) milestone, if it is being actively
+# worked on.
+latest-milestone: "v1.33"
+
+# The milestone at which this feature was, or is targeted to be, at each stage. 
+milestone: + alpha: "v1.33" + +# The following PRR answers are required at alpha release +# List the feature gate name and the components for which it must be enabled +feature-gates: + - name: DRAAdminControlledDeviceAttributes + components: + - kube-apiserver + - kube-scheduler +disable-supported: true + +# The following PRR answers are required at beta release +metrics: