diff --git a/cmd/containerd-shim-runhcs-v1/task_hcs.go b/cmd/containerd-shim-runhcs-v1/task_hcs.go index a59618137b..b984981395 100644 --- a/cmd/containerd-shim-runhcs-v1/task_hcs.go +++ b/cmd/containerd-shim-runhcs-v1/task_hcs.go @@ -931,7 +931,11 @@ func (ht *hcsTask) updateTaskContainerResources(ctx context.Context, data interf func (ht *hcsTask) updateWCOWContainerCPU(ctx context.Context, cpu *specs.WindowsCPUResources) error { // if host is 20h2+ then we can make a request directly to hcs if osversion.Get().Build >= osversion.V20H2 { + // Count/Maximum/Shares live on the HCS Processor schema. Only send a modify + // request when at least one of them is set, so an affinity-only update does + // not push an empty (no-op) request to HCS. req := &hcsschema.Processor{} + hasRateControl := false if cpu.Count != nil { procCount := int32(*cpu.Count) hostProcs := processorinfo.ProcessorCount() @@ -939,23 +943,73 @@ func (ht *hcsTask) updateWCOWContainerCPU(ctx context.Context, cpu *specs.Window hostProcs = ht.host.ProcessorCount() } req.Count = hcsoci.NormalizeProcessorCount(ctx, ht.id, procCount, hostProcs) + hasRateControl = true } if cpu.Maximum != nil { req.Maximum = int32(*cpu.Maximum) + hasRateControl = true } if cpu.Shares != nil { req.Weight = int32(*cpu.Shares) + hasRateControl = true } - return ht.requestUpdateContainer(ctx, resourcepaths.SiloProcessorResourcePath, req) + if hasRateControl { + if err := ht.requestUpdateContainer(ctx, resourcepaths.SiloProcessorResourcePath, req); err != nil { + return err + } + } + + // CPU affinity is not part of the HCS Processor schema, so it has to be + // applied out of band (the silo's job object for Argon). A no-op when unset. + if len(cpu.Affinity) > 0 { + return ht.updateWCOWContainerCPUAffinity(ctx, cpu.Affinity) + } + return nil } return errdefs.ErrNotImplemented } +// updateWCOWContainerCPUAffinity honors a post-start change to +// spec.Windows.Resources.CPU.Affinity for an HCS-backed WCOW container. 
+// +// For process-isolated (Argon) containers this re-pins the silo's job object, using +// the same race-free mechanism as create-time: the Windows kernel re-applies the new +// mask to every process already in the silo and to every future joiner. +// +// Hypervisor-isolated (Xenon) containers require swapping the UVM's CPU group instead; +// that is not yet implemented, so this returns ErrNotImplemented rather than silently +// dropping the request. +func (ht *hcsTask) updateWCOWContainerCPUAffinity(ctx context.Context, affinity []specs.WindowsCPUGroupAffinity) error { + validated, err := hcsoci.ValidateCPUAffinityEntries(affinity) + if err != nil { + return err + } + if len(validated) == 0 { + return nil + } + + if ht.host != nil { + // Xenon: UVM-level CPU-group swap is out of scope here (Track A). + return fmt.Errorf("cpu affinity update for hypervisor-isolated containers is not supported: %w", errdefs.ErrNotImplemented) + } + + system, ok := ht.c.(*hcs.System) + if !ok { + return fmt.Errorf("cpu affinity update requires an HCS-backed container, got %T", ht.c) + } + return system.SetSiloCPUGroupAffinities(ctx, hcsoci.ToJobObjectAffinities(validated)) +} + func isValidWindowsCPUResources(c *specs.WindowsCPUResources) bool { - return (c.Count != nil && (c.Shares == nil && c.Maximum == nil)) || + // Exactly one of the mutually-exclusive rate controls (Count/Shares/Maximum). + exactlyOneRateControl := (c.Count != nil && (c.Shares == nil && c.Maximum == nil)) || (c.Shares != nil && (c.Count == nil && c.Maximum == nil)) || (c.Maximum != nil && (c.Count == nil && c.Shares == nil)) + // An affinity-only update carries no rate control; accept it on its own so that + // CPU affinity can be changed after the container has started. 
+ affinityOnly := len(c.Affinity) > 0 && c.Count == nil && c.Shares == nil && c.Maximum == nil + return exactlyOneRateControl || affinityOnly } func (ht *hcsTask) updateWCOWResources(ctx context.Context, resources *specs.WindowsResources, annotations map[string]string) error { diff --git a/cmd/containerd-shim-runhcs-v1/task_hcs_test.go b/cmd/containerd-shim-runhcs-v1/task_hcs_test.go index beb58ffc50..d922d8e2f3 100644 --- a/cmd/containerd-shim-runhcs-v1/task_hcs_test.go +++ b/cmd/containerd-shim-runhcs-v1/task_hcs_test.go @@ -4,12 +4,14 @@ package main import ( "context" + "errors" "math/rand" "reflect" "strconv" "testing" "time" + "github.com/Microsoft/hcsshim/internal/uvm" "github.com/Microsoft/hcsshim/pkg/annotations" "github.com/containerd/errdefs" "github.com/opencontainers/runtime-spec/specs-go" @@ -506,3 +508,45 @@ func Test_handleProcessArgsForIsolatedJobContainer(t *testing.T) { }) } } + +func u64(v uint64) *uint64 { return &v } +func u16(v uint16) *uint16 { return &v } + +func Test_isValidWindowsCPUResources(t *testing.T) { + affinity := []specs.WindowsCPUGroupAffinity{{Group: 0, Mask: 0x3}} + for _, tt := range []struct { + name string + c *specs.WindowsCPUResources + want bool + }{ + {"count only", &specs.WindowsCPUResources{Count: u64(2)}, true}, + {"shares only", &specs.WindowsCPUResources{Shares: u16(100)}, true}, + {"maximum only", &specs.WindowsCPUResources{Maximum: u16(5000)}, true}, + {"count and shares", &specs.WindowsCPUResources{Count: u64(2), Shares: u16(100)}, false}, + {"affinity only", &specs.WindowsCPUResources{Affinity: affinity}, true}, + {"affinity with count", &specs.WindowsCPUResources{Count: u64(2), Affinity: affinity}, true}, + {"empty", &specs.WindowsCPUResources{}, false}, + } { + t.Run(tt.name, func(t *testing.T) { + if got := isValidWindowsCPUResources(tt.c); got != tt.want { + t.Fatalf("isValidWindowsCPUResources(%+v) = %v, want %v", tt.c, got, tt.want) + } + }) + } +} + +func 
Test_hcsTask_updateWCOWContainerCPUAffinity_NoAffinity(t *testing.T) { + ht := &hcsTask{id: t.Name()} + // An empty affinity slice is a no-op and must not require an HCS-backed container. + if err := ht.updateWCOWContainerCPUAffinity(context.Background(), nil); err != nil { + t.Fatalf("expected nil error for empty affinity, got %v", err) + } +} + +func Test_hcsTask_updateWCOWContainerCPUAffinity_XenonNotImplemented(t *testing.T) { + ht := &hcsTask{id: t.Name(), host: &uvm.UtilityVM{}} + err := ht.updateWCOWContainerCPUAffinity(context.Background(), []specs.WindowsCPUGroupAffinity{{Group: 0, Mask: 0x1}}) + if !errors.Is(err, errdefs.ErrNotImplemented) { + t.Fatalf("expected ErrNotImplemented for hypervisor-isolated container, got %v", err) + } +} diff --git a/internal/hcs/system.go b/internal/hcs/system.go index 869a5f3e7a..c50771cc4e 100644 --- a/internal/hcs/system.go +++ b/internal/hcs/system.go @@ -424,6 +424,20 @@ func (computeSystem *System) Properties(ctx context.Context, types ...schema1.Pr return properties, nil } +// openSilo opens the container's server silo job object by its well-known name +// (`\Container_` followed by the container ID). HCS owns the silo; the only way to open it from the shim is +// by name, and only while running as SYSTEM. The caller owns the returned handle and +// must Close it. +// +// In the future we can make use of some new functionality in HCS that allows you to +// pass a job object for HCS to use for the container. +func (computeSystem *System) openSilo(ctx context.Context) (*jobobject.JobObject, error) { + return jobobject.Open(ctx, &jobobject.Options{ + UseNTVariant: true, + Name: siloNameFmt(computeSystem.id), + }) +} + // queryInProc handles querying for container properties without reaching out to HCS. `props` // will be updated to contain any data returned from the queries present in `types`. If any properties // failed to be queried they will be tallied up and returned in as the first return value. 
Failures on @@ -434,14 +448,7 @@ func (computeSystem *System) queryInProc( props *hcsschema.Properties, types []hcsschema.PropertyType, ) ([]hcsschema.PropertyType, error) { - // In the future we can make use of some new functionality in the HCS that allows you - // to pass a job object for HCS to use for the container. Currently, the only way we'll - // be able to open the job/silo is if we're running as SYSTEM. - jobOptions := &jobobject.Options{ - UseNTVariant: true, - Name: siloNameFmt(computeSystem.id), - } - job, err := jobobject.Open(ctx, jobOptions) + job, err := computeSystem.openSilo(ctx) if err != nil { return nil, err } @@ -535,6 +542,35 @@ func (computeSystem *System) statisticsInProc(job *jobobject.JobObject) (*hcssch }, nil } +// SetSiloCPUGroupAffinities pins the container's server silo to the given processor +// group affinities. HCS does not expose a CPU-affinity field on the container Processor +// schema, so for process-isolated (Argon) containers we set the affinity directly on the +// silo's job object via SetInformationJobObject(JobObjectGroupInformationEx). +// +// HCS owns the silo; we only open a transient handle (by the silo's well-known job name, +// the same way queryInProc does) to record the affinity property. The kernel enforces +// it on every process that joins the silo via AssignProcessToJobObject — including the init +// process at Start and any descendants it spawns. +// +// For create-time pinning, call this after the compute system is created but before it is +// started, so the affinity is already recorded on the job when HCS assigns the init process. +// Applying it to an already-running silo (the container update path) is also safe: the kernel +// re-applies the mask to current members and migrates threads at the next scheduling dispatch. 
+func (computeSystem *System) SetSiloCPUGroupAffinities(ctx context.Context, affinities []jobobject.GroupAffinity) (err error) { + operation := "hcs::System::SetSiloCPUGroupAffinities" + + job, err := computeSystem.openSilo(ctx) + if err != nil { + return makeSystemError(computeSystem, operation, err, nil) + } + defer job.Close() + + if err := job.SetCPUGroupAffinities(affinities); err != nil { + return makeSystemError(computeSystem, operation, err, nil) + } + return nil +} + // hcsPropertiesV2Query is a helper to make a HcsGetComputeSystemProperties call using the V2 schema property types. func (computeSystem *System) hcsPropertiesV2Query(ctx context.Context, types []hcsschema.PropertyType) (*hcsschema.Properties, error) { operation := "hcs::System::PropertiesV2" diff --git a/internal/hcsoci/cpuaffinity.go b/internal/hcsoci/cpuaffinity.go new file mode 100644 index 0000000000..0db6955758 --- /dev/null +++ b/internal/hcsoci/cpuaffinity.go @@ -0,0 +1,99 @@ +//go:build windows +// +build windows + +package hcsoci + +import ( + "errors" + "fmt" + + specs "github.com/opencontainers/runtime-spec/specs-go" + + "github.com/Microsoft/hcsshim/internal/jobobject" + "github.com/Microsoft/hcsshim/osversion" +) + +// Shared, container-kind-agnostic CPU affinity helpers. These are used by every +// Windows container shape that honors spec.Windows.Resources.CPU.Affinity: +// HostProcess (internal/jobcontainers) and Argon (this package). Keeping them +// here, rather than in a kind-specific file, avoids duplicating the validation +// and conversion logic across packages. + +// Sentinel errors returned by ValidateCPUAffinity / ValidateCPUAffinityEntries. +var ( + // ErrCPUAffinityMultipleGroupsNotSupported is returned when multiple processor-group + // affinity entries are requested on a host older than Windows Server 2022 (build 20348), + // which does not support multi-group affinity for job object silos. 
+ // On Windows Server 2022+, multiple processor groups are fully supported. + ErrCPUAffinityMultipleGroupsNotSupported = errors.New("cpu affinity with multiple processor groups requires Windows Server 2022 or later") + // ErrCPUAffinityNonZeroGroupNotSupported is returned when a non-zero processor group is + // requested on a host older than Windows Server 2022 (build 20348). + // On Windows Server 2022+, non-zero processor groups are fully supported. + ErrCPUAffinityNonZeroGroupNotSupported = errors.New("cpu affinity with a non-zero processor group requires Windows Server 2022 or later") + // ErrCPUAffinityMaskZero is returned when an affinity entry has a zero bitmask, + // which would select no processors and is always invalid. + ErrCPUAffinityMaskZero = errors.New("cpu affinity mask must be non-zero") +) + +// ValidateCPUAffinity handles the logic of validating the container's CPU affinity +// specified in the OCI spec. +// +// Returns the validated affinity entries (nil if not specified) and any validation error. +// Multiple processor groups and non-zero group numbers require Windows Server 2022 +// (build 20348) or later; on older hosts only a single entry for group 0 is accepted. +func ValidateCPUAffinity(spec *specs.Spec) ([]specs.WindowsCPUGroupAffinity, error) { + if spec.Windows == nil || spec.Windows.Resources == nil || spec.Windows.Resources.CPU == nil { + return nil, nil + } + return ValidateCPUAffinityEntries(spec.Windows.Resources.CPU.Affinity) +} + +// ValidateCPUAffinityEntries validates a set of OCI CPU affinity entries directly, +// applying the same rules as ValidateCPUAffinity. It is used on the container update +// path, where the affinity is supplied as a bare slice rather than a full spec. +// +// Returns the validated entries (nil if empty) and any validation error. 
+func ValidateCPUAffinityEntries(affinity []specs.WindowsCPUGroupAffinity) ([]specs.WindowsCPUGroupAffinity, error) { + if len(affinity) == 0 { + return nil, nil + } + + // Zero masks are never valid regardless of OS version. + for i, a := range affinity { + if a.Mask == 0 { + return nil, fmt.Errorf("%w: entry %d has zero mask", ErrCPUAffinityMaskZero, i) + } + } + + // Determine whether multi-group features are needed: either multiple entries, + // or a single entry targeting a non-zero processor group. + multiGroup := len(affinity) > 1 || affinity[0].Group != 0 + + // Multiple processor groups are only supported on Windows Server 2022+. + if multiGroup && osversion.Build() < osversion.LTSC2022 { + if len(affinity) > 1 { + return nil, fmt.Errorf("%w: %d entries", ErrCPUAffinityMultipleGroupsNotSupported, len(affinity)) + } + return nil, fmt.Errorf("%w: group %d", ErrCPUAffinityNonZeroGroupNotSupported, affinity[0].Group) + } + + return affinity, nil +} + +// ToJobObjectAffinities converts validated OCI CPU affinity entries into the +// jobobject.GroupAffinity representation used by the Win32 job-object APIs. +// +// The input is expected to have been validated by ValidateCPUAffinity or ValidateCPUAffinityEntries; Group is narrowed to uint16 here without a range check. 
+func ToJobObjectAffinities(affinities []specs.WindowsCPUGroupAffinity) []jobobject.GroupAffinity { + if len(affinities) == 0 { + return nil + } + out := make([]jobobject.GroupAffinity, len(affinities)) + for i, a := range affinities { + out[i] = jobobject.GroupAffinity{ + Mask: a.Mask, + Group: uint16(a.Group), + } + } + return out +} diff --git a/internal/hcsoci/cpuaffinity_argon.go b/internal/hcsoci/cpuaffinity_argon.go new file mode 100644 index 0000000000..2deea3ef70 --- /dev/null +++ b/internal/hcsoci/cpuaffinity_argon.go @@ -0,0 +1,39 @@ +//go:build windows +// +build windows + +package hcsoci + +import ( + "context" + "fmt" + + "github.com/Microsoft/hcsshim/internal/hcs" + "github.com/Microsoft/hcsshim/internal/log" +) + +// applyArgonCPUAffinity honors spec.Windows.Resources.CPU.Affinity for a +// process-isolated (Argon) container by pinning the container's server silo. +// +// HCS ignores CPU affinity on the container Processor schema (Count/Maximum/Weight), +// so instead we set the affinity on the silo's job object directly. This must run +// after the compute system is created but before it is started, so the affinity is +// already recorded on the job when HCS assigns the init process to the silo. See +// (*hcs.System).SetSiloCPUGroupAffinities for the race-free timeline. +// +// If the spec requests no affinity this is a no-op. 
+func applyArgonCPUAffinity(ctx context.Context, system *hcs.System, coi *createOptionsInternal) error { + affinities, err := ValidateCPUAffinity(coi.Spec) + if err != nil { + return err + } + if len(affinities) == 0 { + return nil + } + + if err := system.SetSiloCPUGroupAffinities(ctx, ToJobObjectAffinities(affinities)); err != nil { + return fmt.Errorf("apply CPU affinity to container silo: %w", err) + } + + log.G(ctx).WithField("affinities", affinities).Debug("applied CPU affinity to Argon container silo") + return nil +} diff --git a/internal/hcsoci/cpuaffinity_test.go b/internal/hcsoci/cpuaffinity_test.go new file mode 100644 index 0000000000..c74c63d3e4 --- /dev/null +++ b/internal/hcsoci/cpuaffinity_test.go @@ -0,0 +1,83 @@ +//go:build windows +// +build windows + +package hcsoci + +import ( + "errors" + "testing" + + specs "github.com/opencontainers/runtime-spec/specs-go" + + "github.com/Microsoft/hcsshim/internal/jobobject" +) + +func TestValidateCPUAffinityEntries(t *testing.T) { + // A zero mask is invalid on every OS version, so this case is host-independent. + if _, err := ValidateCPUAffinityEntries([]specs.WindowsCPUGroupAffinity{{Group: 0, Mask: 0}}); !errors.Is(err, ErrCPUAffinityMaskZero) { + t.Fatalf("zero mask: got %v, want %v", err, ErrCPUAffinityMaskZero) + } + + // Empty input validates to no entries (no affinity requested). + got, err := ValidateCPUAffinityEntries(nil) + if err != nil || got != nil { + t.Fatalf("nil input: got (%v, %v), want (nil, nil)", got, err) + } + + // A single group-0 entry with a non-zero mask is valid regardless of OS version. 
+ in := []specs.WindowsCPUGroupAffinity{{Group: 0, Mask: 0x3}} + got, err = ValidateCPUAffinityEntries(in) + if err != nil { + t.Fatalf("group-0 single entry: unexpected error %v", err) + } + if len(got) != 1 || got[0] != in[0] { + t.Fatalf("group-0 single entry: got %+v, want %+v", got, in) + } +} + +func TestToJobObjectAffinities(t *testing.T) { + for _, tc := range []struct { + name string + in []specs.WindowsCPUGroupAffinity + want []jobobject.GroupAffinity + }{ + { + name: "nil", + in: nil, + want: nil, + }, + { + name: "empty", + in: []specs.WindowsCPUGroupAffinity{}, + want: nil, + }, + { + name: "single group", + in: []specs.WindowsCPUGroupAffinity{{Group: 0, Mask: 0b1011}}, + want: []jobobject.GroupAffinity{{Group: 0, Mask: 0b1011}}, + }, + { + name: "multiple groups", + in: []specs.WindowsCPUGroupAffinity{ + {Group: 0, Mask: 0xff}, + {Group: 1, Mask: 0x1}, + }, + want: []jobobject.GroupAffinity{ + {Group: 0, Mask: 0xff}, + {Group: 1, Mask: 0x1}, + }, + }, + } { + t.Run(tc.name, func(t *testing.T) { + got := ToJobObjectAffinities(tc.in) + if len(got) != len(tc.want) { + t.Fatalf("got %d entries, want %d", len(got), len(tc.want)) + } + for i := range got { + if got[i] != tc.want[i] { + t.Errorf("entry %d: got %+v, want %+v", i, got[i], tc.want[i]) + } + } + }) + } +} diff --git a/internal/hcsoci/create.go b/internal/hcsoci/create.go index 5288932fa1..5a3082012e 100644 --- a/internal/hcsoci/create.go +++ b/internal/hcsoci/create.go @@ -357,6 +357,17 @@ func CreateContainer(ctx context.Context, createOptions *CreateOptions) (_ cow.C if err != nil { return nil, r, err } + + // Process-isolated (Argon) containers run in a server silo on the host. HCS does not + // honor CPU affinity on the container Processor schema, so pin the silo's job object + // directly, after create but before the caller starts the container. Xenon (UVM-backed) + // containers are handled separately at the UVM layer and are skipped here. 
+ if coi.HostingSystem == nil { + if err := applyArgonCPUAffinity(ctx, system, coi); err != nil { + return nil, r, err + } + } + return system, r, nil } diff --git a/internal/hcsoci/hcsdoc_wcow.go b/internal/hcsoci/hcsdoc_wcow.go index 52f01e2ab6..b14e6190aa 100644 --- a/internal/hcsoci/hcsdoc_wcow.go +++ b/internal/hcsoci/hcsdoc_wcow.go @@ -32,21 +32,8 @@ import ( const createContainerSubdirectoryForProcessDumpSuffix = "{container_id}" -// Sentinel errors returned by ValidateCPUAffinity. -var ( - // ErrCPUAffinityMultipleGroupsNotSupported is returned when multiple processor-group - // affinity entries are requested on a host older than Windows Server 2022 (build 20348), - // which does not support multi-group affinity for job object silos. - // On Windows Server 2022+, multiple processor groups are fully supported. - ErrCPUAffinityMultipleGroupsNotSupported = errors.New("cpu affinity with multiple processor groups requires Windows Server 2022 or later") - // ErrCPUAffinityNonZeroGroupNotSupported is returned when a non-zero processor group is - // requested on a host older than Windows Server 2022 (build 20348). - // On Windows Server 2022+, non-zero processor groups are fully supported. - ErrCPUAffinityNonZeroGroupNotSupported = errors.New("cpu affinity with a non-zero processor group requires Windows Server 2022 or later") - // ErrCPUAffinityMaskZero is returned when an affinity entry has a zero bitmask, - // which would select no processors and is always invalid. - ErrCPUAffinityMaskZero = errors.New("cpu affinity mask must be non-zero") -) +// CPU affinity validation (ValidateCPUAffinity / ValidateCPUAffinityEntries) and its +// sentinel errors live in cpuaffinity.go, shared with the HostProcess container path. // A simple wrapper struct around the container mount configs that should be added to the // container. 
@@ -111,41 +98,6 @@ func createMountsConfig(ctx context.Context, coi *createOptionsInternal) (*mount return &config, nil } -// ValidateCPUAffinity handles the logic of validating the container's CPU affinity -// specified in the OCI spec. -// -// Returns the validated affinity entries (nil if not specified) and any validation error. -// Multiple processor groups and non-zero group numbers require Windows Server 2022 -// (build 20348) or later; on older hosts only a single entry for group 0 is accepted. -func ValidateCPUAffinity(spec *specs.Spec) ([]specs.WindowsCPUGroupAffinity, error) { - if spec.Windows == nil || spec.Windows.Resources == nil || spec.Windows.Resources.CPU == nil || len(spec.Windows.Resources.CPU.Affinity) == 0 { - return nil, nil - } - - affinity := spec.Windows.Resources.CPU.Affinity - - // Zero masks are never valid regardless of OS version. - for i, a := range affinity { - if a.Mask == 0 { - return nil, fmt.Errorf("%w: entry %d has zero mask", ErrCPUAffinityMaskZero, i) - } - } - - // Determine whether multi-group features are needed: either multiple entries, - // or a single entry targeting a non-zero processor group. - multiGroup := len(affinity) > 1 || affinity[0].Group != 0 - - // Multiple processor groups are only supported on Windows Server 2022+. - if multiGroup && osversion.Build() < osversion.LTSC2022 { - if len(affinity) > 1 { - return nil, fmt.Errorf("%w: %d entries", ErrCPUAffinityMultipleGroupsNotSupported, len(affinity)) - } - return nil, fmt.Errorf("%w: group %d", ErrCPUAffinityNonZeroGroupNotSupported, affinity[0].Group) - } - - return affinity, nil -} - // ConvertCPULimits handles the logic of converting and validating the containers CPU limits // specified in the OCI spec to what HCS expects. 
// diff --git a/internal/jobcontainers/oci.go b/internal/jobcontainers/oci.go index b0b07927dc..fa0b1bd276 100644 --- a/internal/jobcontainers/oci.go +++ b/internal/jobcontainers/oci.go @@ -46,16 +46,7 @@ func specToLimits(ctx context.Context, cid string, s *specs.Spec) (*jobobject.Jo if err != nil { return nil, err } - var groupAffinities []jobobject.GroupAffinity - if len(affinities) > 0 { - groupAffinities = make([]jobobject.GroupAffinity, len(affinities)) - for i, a := range affinities { - groupAffinities[i] = jobobject.GroupAffinity{ - Mask: a.Mask, - Group: uint16(a.Group), - } - } - } + groupAffinities := hcsoci.ToJobObjectAffinities(affinities) realCPULimit, realCPUWeight := uint32(cpuLimit), uint32(cpuWeight) if cpuCount != 0 { diff --git a/test/functional/container_affinity_test.go b/test/functional/container_affinity_test.go new file mode 100644 index 0000000000..efe1edffc6 --- /dev/null +++ b/test/functional/container_affinity_test.go @@ -0,0 +1,255 @@ +//go:build windows && functional +// +build windows,functional + +package functional + +import ( + "context" + "errors" + "testing" + "unsafe" + + "github.com/containerd/containerd/v2/core/containers" + ctrdoci "github.com/containerd/containerd/v2/pkg/oci" + "github.com/opencontainers/runtime-spec/specs-go" + "golang.org/x/sys/windows" + + "github.com/Microsoft/hcsshim/internal/jobobject" + "github.com/Microsoft/hcsshim/osversion" + + testcontainer "github.com/Microsoft/hcsshim/test/internal/container" + testlayers "github.com/Microsoft/hcsshim/test/internal/layers" + testoci "github.com/Microsoft/hcsshim/test/internal/oci" + "github.com/Microsoft/hcsshim/test/internal/util" + "github.com/Microsoft/hcsshim/test/pkg/require" +) + +// Test_Container_CPUAffinity_Argon is the CI-gating functional test for honoring +// spec.Windows.Resources.CPU.Affinity on process-isolated (Argon) WCOW containers +// (commit "hcsoci,hcs,shim: honor CPU affinity for Argon containers"). 
+// +// It asserts the three layers from the validation strategy, all reachable from this +// one in-process test (the functional suite runs in-process with internal/jobobject +// and as SYSTEM, so it can open the silo job by name): +// +// Layer 1 — the PR wrote the affinity to the silo's job object in the +// create→start window. This is the real regression gate: it fails if +// applyArgonCPUAffinity / SetSiloCPUGroupAffinities regresses. +// Layer 2 — the host's view matches. The NT-variant silo job IS the host object, +// so the same GetCPUGroupAffinities read-back doubles as the host view; +// no second tool is needed. +// Layer 3 — the init process is actually constrained. This is a kernel guarantee +// (the kernel propagates the silo job's affinity onto silo members), not +// hcsshim code. If the affinity cannot be read (OpenProcess / +// GetProcessGroupAffinity fail) the check is skipped, but a genuine +// mismatch is a hard failure: with Layer 1 passing, it points at the +// kernel/silo plumbing rather than this PR. +func Test_Container_CPUAffinity_Argon(t *testing.T) { + requireFeatures(t, featureWCOW) + // Affinity is applied via the silo job object on 20H2+ (the same floor as the + // rest of the WCOW resource-update path). + require.Build(t, osversion.V20H2) + + ctx := util.Context(namespacedContext(context.Background()), t) + + // Group 0 / single-mask works on any host, so it is the default CI case. + t.Run("Group0SingleMask", func(t *testing.T) { + want := []jobobject.GroupAffinity{{Group: 0, Mask: 0x3}} // CPUs 0 and 1. + runArgonAffinityTest(ctx, t, want) + }) + + // A genuine multi-group pin needs a confirmed >1-processor-group host and + // Windows Server 2022+; skip otherwise rather than assert against a topology + // the runner does not have. 
+ t.Run("MultiGroup", func(t *testing.T) { + require.Build(t, osversion.LTSC2022) + if n := activeProcessorGroupCount(t); n < 2 { + t.Skipf("multi-group affinity requires a host with >1 processor group, got %d", n) + } + want := []jobobject.GroupAffinity{ + {Group: 0, Mask: 0x1}, + {Group: 1, Mask: 0x1}, + } + runArgonAffinityTest(ctx, t, want) + }) +} + +// runArgonAffinityTest creates an Argon container pinned to want, then asserts the +// three validation layers. +func runArgonAffinityTest(ctx context.Context, t *testing.T, want []jobobject.GroupAffinity) { + t.Helper() + + cID := testName(t, "container") + scratch := testlayers.WCOWScratchDir(ctx, t, "") + spec := testoci.CreateWindowsSpec(ctx, t, cID, + testoci.DefaultWindowsSpecOpts(cID, + ctrdoci.WithProcessCommandLine(testoci.PingSelfCmd), + testoci.WithWindowsLayerFolders(append(windowsImageLayers(ctx, t), scratch)), + withCPUAffinity(want), + )...) + + // nil host => process-isolated (Argon). Create runs the PR's applyArgonCPUAffinity + // between HCS-create and HCS-start. + c, _, cleanup := testcontainer.Create(ctx, t, nil, spec, cID, hcsOwner) + t.Cleanup(cleanup) + + // Layers 1 & 2, pre-start gate: the affinity is already recorded on the silo job + // before the init process runs, proving "set after create, before start". + assertSiloJobAffinity(ctx, t, cID, want) + + init := testcontainer.StartWithSpec(ctx, t, c, spec.Process, nil) + t.Cleanup(func() { + testcontainer.Kill(ctx, t, c) + testcontainer.Wait(ctx, t, c) + }) + + // Layers 1 & 2 again, now that the silo has a running member. + assertSiloJobAffinity(ctx, t, cID, want) + + // Layer 3 (kernel assertion): the init process inherited the pin. Skipped if the + // affinity cannot be read; a real mismatch fails the test. + assertProcessGroupAffinity(t, uint32(init.Process.Pid()), want) +} + +// withCPUAffinity returns a SpecOpt that sets spec.Windows.Resources.CPU.Affinity. 
+func withCPUAffinity(affinities []jobobject.GroupAffinity) ctrdoci.SpecOpts { + return func(_ context.Context, _ ctrdoci.Client, _ *containers.Container, s *specs.Spec) error { + if s.Windows == nil { + s.Windows = &specs.Windows{} + } + if s.Windows.Resources == nil { + s.Windows.Resources = &specs.WindowsResources{} + } + if s.Windows.Resources.CPU == nil { + s.Windows.Resources.CPU = &specs.WindowsCPUResources{} + } + oci := make([]specs.WindowsCPUGroupAffinity, len(affinities)) + for i, a := range affinities { + oci[i] = specs.WindowsCPUGroupAffinity{Group: uint32(a.Group), Mask: a.Mask} + } + s.Windows.Resources.CPU.Affinity = oci + return nil + } +} + +// assertSiloJobAffinity opens the container's server silo job object by its +// well-known name (`\Container_` + cID) and asserts its CPU group affinities equal want. +// This is the host-side view of the object the PR wrote to (Layers 1 & 2). +func assertSiloJobAffinity(ctx context.Context, t *testing.T, cID string, want []jobobject.GroupAffinity) { + t.Helper() + + job, err := jobobject.Open(ctx, &jobobject.Options{ + UseNTVariant: true, + Name: `\Container_` + cID, + }) + if err != nil { + t.Fatalf("open silo job for %q: %v", cID, err) + } + defer job.Close() + + got, err := job.GetCPUGroupAffinities() + if err != nil { + t.Fatalf("get silo job cpu group affinities: %v", err) + } + assertAffinitiesEqual(t, "silo job object", got, want) +} + +// assertProcessGroupAffinity reads the processor groups the kernel placed on the init +// process and checks that every group in want is present; masks are not compared. The PR +// only writes the job object; propagation onto silo members is a kernel guarantee. If the +// affinity cannot be read the check is skipped (logged, not failed), but a successful read +// that omits a pinned group is a hard failure. 
+func assertProcessGroupAffinity(t *testing.T, pid uint32, want []jobobject.GroupAffinity) { + t.Helper() + + h, err := windows.OpenProcess(windows.PROCESS_QUERY_LIMITED_INFORMATION, false, pid) + if err != nil { + t.Logf("Layer 3 (kernel) skipped: OpenProcess(%d): %v", pid, err) + return + } + defer windows.CloseHandle(h) //nolint:errcheck + + got, err := getProcessGroupAffinity(h) + if err != nil { + t.Logf("Layer 3 (kernel) skipped: GetProcessGroupAffinity(%d): %v", pid, err) + return + } + + // The process reports the set of groups it may run on; assert every group we + // pinned shows up. We do not compare masks here: the kernel reports the group's + // active-processor mask for the process, not necessarily the bits we requested. + wantGroups := make(map[uint16]struct{}, len(want)) + for _, a := range want { + wantGroups[a.Group] = struct{}{} + } + gotGroups := make(map[uint16]struct{}, len(got)) + for _, g := range got { + gotGroups[g] = struct{}{} + } + for g := range wantGroups { + if _, ok := gotGroups[g]; !ok { + t.Errorf("Layer 3 (kernel): init process not constrained to group %d; process groups = %v", g, got) + } + } +} + +func assertAffinitiesEqual(t *testing.T, what string, got, want []jobobject.GroupAffinity) { + t.Helper() + + // Order-independent compare keyed by group: the OS does not promise to return + // entries in the order they were set. 
+ if len(got) != len(want) { + t.Fatalf("%s affinity: got %+v, want %+v (length mismatch)", what, got, want) + } + byGroup := make(map[uint16]uint64, len(got)) + for _, g := range got { + byGroup[g.Group] = g.Mask + } + for _, w := range want { + mask, ok := byGroup[w.Group] + if !ok { + t.Fatalf("%s affinity: missing group %d; got %+v, want %+v", what, w.Group, got, want) + } + if mask != w.Mask { + t.Fatalf("%s affinity: group %d mask = %#x, want %#x", what, w.Group, mask, w.Mask) + } + } +} + +var ( + kernel32 = windows.NewLazySystemDLL("kernel32.dll") + procGetProcessGroupAffinity = kernel32.NewProc("GetProcessGroupAffinity") + procGetActiveProcessorGroupCnt = kernel32.NewProc("GetActiveProcessorGroupCount") +) + +// getProcessGroupAffinity wraps kernel32!GetProcessGroupAffinity, which is not bound +// in golang.org/x/sys/windows. It returns the processor groups the process may run on. +func getProcessGroupAffinity(h windows.Handle) ([]uint16, error) { + // Probe with a small buffer; the call sets count to the required size and fails + // with ERROR_INSUFFICIENT_BUFFER if it is too small. + groups := make([]uint16, 4) + count := uint16(len(groups)) + for { + r1, _, e := procGetProcessGroupAffinity.Call( + uintptr(h), + uintptr(unsafe.Pointer(&count)), + uintptr(unsafe.Pointer(&groups[0])), + ) + if r1 != 0 { + return groups[:count], nil + } + if errors.Is(e, windows.ERROR_INSUFFICIENT_BUFFER) && int(count) > len(groups) { + groups = make([]uint16, count) + continue + } + return nil, e + } +} + +// activeProcessorGroupCount returns the number of active processor groups on the host, +// used to decide whether a multi-group affinity test can run. +func activeProcessorGroupCount(t *testing.T) int { + t.Helper() + r1, _, _ := procGetActiveProcessorGroupCnt.Call() + return int(uint16(r1)) +}