From 5e2b46cab66bc0280f57a34714816fd60b206f8b Mon Sep 17 00:00:00 2001 From: Shreyansh Sancheti Date: Tue, 31 Mar 2026 22:06:54 +0530 Subject: [PATCH 1/2] guest: use OCIBundlePath as sandbox root source of truth Replace heuristic sandbox path derivation (hard-coded /run/gcs/c prefix + ID) with host-provided OCIBundlePath as the canonical sandbox root directory. This change prepares the guest-side GCS for Shim v2 and multi-pod UVM support, where the host may use a different path layout than the legacy /run/gcs/c/. Key changes: - Add sandboxRoots mapping on Host to store resolved sandbox root per sandbox ID - Sandbox containers: register OCIBundlePath as sandbox root - Virtual pods: derive sandbox root from OCIBundlePath parent + /virtual-pods/ - Workload containers: resolve sandbox root from Host mapping (fallback to legacy) - Standalone containers: use OCIBundlePath directly as root - Container.Delete: use stored sandboxRoot for cleanup paths - Remove duplicate setup functions (setupVirtualPod* merged into unified setup*) The refactor produces identical paths when the old shim sends OCIBundlePath in the legacy format, ensuring zero behavior change for existing deployments. Security: virtualPodID is validated against path traversal before use. Signed-off-by: Shreyansh Sancheti --- internal/guest/runtime/hcsv2/container.go | 25 +- .../guest/runtime/hcsv2/sandbox_container.go | 34 +- .../guest/runtime/hcsv2/sandbox_root_test.go | 362 ++++++++++++++++++ .../runtime/hcsv2/standalone_container.go | 58 +-- internal/guest/runtime/hcsv2/uvm.go | 203 +++++----- .../guest/runtime/hcsv2/workload_container.go | 43 +-- internal/guest/spec/spec.go | 65 ++++ 7 files changed, 596 insertions(+), 194 deletions(-) create mode 100644 internal/guest/runtime/hcsv2/sandbox_root_test.go diff --git a/internal/guest/runtime/hcsv2/container.go b/internal/guest/runtime/hcsv2/container.go index 6c4be709ca..0ce930dd65 100644 --- a/internal/guest/runtime/hcsv2/container.go +++ b/internal/guest/runtime/hcsv2/container.go @@ -29,7 +29,6 @@ import ( "github.com/Microsoft/hcsshim/internal/oc" "github.com/Microsoft/hcsshim/internal/protocol/guestrequest" "github.com/Microsoft/hcsshim/internal/protocol/guestresource" - "github.com/Microsoft/hcsshim/pkg/annotations" ) // containerStatus has been introduced to enable parallel container creation @@ -76,6 +75,10 @@ type Container struct { // of this container is located. Usually, this is either `/run/gcs/c/` or // `/run/gcs/c//container_` if scratch is shared with UVM scratch. scratchDirPath string + + // sandboxRoot is the root directory of the pod within the guest. + // Used during cleanup to unmount sandbox-specific paths. + sandboxRoot string } func (c *Container) Start(ctx context.Context, conSettings stdio.ConnectionSettings) (_ int, err error) { @@ -228,25 +231,19 @@ func (c *Container) Kill(ctx context.Context, signal syscall.Signal) error { func (c *Container) Delete(ctx context.Context) error { entity := log.G(ctx).WithField(logfields.ContainerID, c.id) entity.Info("opengcs::Container::Delete") - if c.isSandbox { - // Check if this is a virtual pod - virtualSandboxID := "" - if c.spec != nil && c.spec.Annotations != nil { - virtualSandboxID = c.spec.Annotations[annotations.VirtualPodID] - } - - // remove user mounts in sandbox container - use virtual pod aware paths - if err := storage.UnmountAllInPath(ctx, specGuest.VirtualPodAwareSandboxMountsDir(c.id, virtualSandboxID), true); err != nil { + if c.isSandbox && c.sandboxRoot != "" { + // remove user mounts in sandbox container + if err := storage.UnmountAllInPath(ctx, specGuest.SandboxMountsDirFromRoot(c.sandboxRoot), true); err != nil { entity.WithError(err).Error("failed to unmount sandbox mounts") } - // remove user mounts in tmpfs sandbox container - use virtual pod aware paths - if err := storage.UnmountAllInPath(ctx, specGuest.VirtualPodAwareSandboxTmpfsMountsDir(c.id, virtualSandboxID), true); err != nil { + // remove tmpfs mounts in sandbox container + if err := storage.UnmountAllInPath(ctx, specGuest.SandboxTmpfsMountsDirFromRoot(c.sandboxRoot), true); err != nil { entity.WithError(err).Error("failed to unmount tmpfs sandbox mounts") } - // remove hugepages mounts in sandbox container - use virtual pod aware paths - if err := storage.UnmountAllInPath(ctx, specGuest.VirtualPodAwareHugePagesMountsDir(c.id, virtualSandboxID), true); err != nil { + // remove hugepages mounts in sandbox container + if err := storage.UnmountAllInPath(ctx, specGuest.SandboxHugePagesMountsDirFromRoot(c.sandboxRoot), true); err != nil { entity.WithError(err).Error("failed to unmount hugepages mounts") } } diff --git a/internal/guest/runtime/hcsv2/sandbox_container.go b/internal/guest/runtime/hcsv2/sandbox_container.go index 471a28bfe1..d30163cf7a 100644 --- a/internal/guest/runtime/hcsv2/sandbox_container.go +++ b/internal/guest/runtime/hcsv2/sandbox_container.go @@ -20,35 +20,30 @@ import ( "github.com/Microsoft/hcsshim/pkg/annotations" ) -func getSandboxHostnamePath(id, virtualSandboxID string) string { - return filepath.Join(specGuest.VirtualPodAwareSandboxRootDir(id, virtualSandboxID), "hostname") +func getSandboxHostnamePath(sandboxRoot string) string { + return filepath.Join(sandboxRoot, "hostname") } -func getSandboxHostsPath(id, virtualSandboxID string) string { - return filepath.Join(specGuest.VirtualPodAwareSandboxRootDir(id, virtualSandboxID), "hosts") +func getSandboxHostsPath(sandboxRoot string) string { + return filepath.Join(sandboxRoot, "hosts") } -func getSandboxResolvPath(id, virtualSandboxID string) string { - return filepath.Join(specGuest.VirtualPodAwareSandboxRootDir(id, virtualSandboxID), "resolv.conf") +func getSandboxResolvPath(sandboxRoot string) string { + return filepath.Join(sandboxRoot, "resolv.conf") } -func setupSandboxContainerSpec(ctx context.Context, id string, spec *oci.Spec) (err error) { +func setupSandboxContainerSpec(ctx context.Context, id, sandboxRoot string, spec *oci.Spec) (err error) { ctx, span := oc.StartSpan(ctx, "hcsv2::setupSandboxContainerSpec") defer span.End() defer func() { oc.SetSpanStatus(span, err) }() span.AddAttributes(trace.StringAttribute("cid", id)) - // Check if this is a virtual pod to use appropriate root directory - virtualSandboxID := spec.Annotations[annotations.VirtualPodID] - - // Generate the sandbox root dir - virtual pod aware - rootDir := specGuest.VirtualPodAwareSandboxRootDir(id, virtualSandboxID) - if err := os.MkdirAll(rootDir, 0755); err != nil { - return errors.Wrapf(err, "failed to create sandbox root directory %q", rootDir) + if err := os.MkdirAll(sandboxRoot, 0755); err != nil { + return errors.Wrapf(err, "failed to create sandbox root directory %q", sandboxRoot) } defer func() { if err != nil { - _ = os.RemoveAll(rootDir) + _ = os.RemoveAll(sandboxRoot) } }() @@ -62,19 +57,20 @@ func setupSandboxContainerSpec(ctx context.Context, id string, spec *oci.Spec) ( } } - sandboxHostnamePath := getSandboxHostnamePath(id, virtualSandboxID) + sandboxHostnamePath := getSandboxHostnamePath(sandboxRoot) if err := os.WriteFile(sandboxHostnamePath, []byte(hostname+"\n"), 0644); err != nil { return errors.Wrapf(err, "failed to write hostname to %q", sandboxHostnamePath) } // Write the hosts sandboxHostsContent := network.GenerateEtcHostsContent(ctx, hostname) - sandboxHostsPath := getSandboxHostsPath(id, virtualSandboxID) + sandboxHostsPath := getSandboxHostsPath(sandboxRoot) if err := os.WriteFile(sandboxHostsPath, []byte(sandboxHostsContent), 0644); err != nil { return errors.Wrapf(err, "failed to write sandbox hosts to %q", sandboxHostsPath) } // Check if this is a virtual pod sandbox container by comparing container ID with virtual pod ID + virtualSandboxID := spec.Annotations[annotations.VirtualPodID] isVirtualPodSandbox := virtualSandboxID != "" && id == virtualSandboxID if strings.EqualFold(spec.Annotations[annotations.SkipPodNetworking], "true") || isVirtualPodSandbox { ns := GetOrAddNetworkNamespace(specGuest.GetNetworkNamespaceID(spec)) @@ -97,7 +93,7 @@ func setupSandboxContainerSpec(ctx context.Context, id string, spec *oci.Spec) ( if err != nil { return errors.Wrap(err, "failed to generate sandbox resolv.conf content") } - sandboxResolvPath := getSandboxResolvPath(id, virtualSandboxID) + sandboxResolvPath := getSandboxResolvPath(sandboxRoot) if err := os.WriteFile(sandboxResolvPath, []byte(resolvContent), 0644); err != nil { return errors.Wrap(err, "failed to write sandbox resolv.conf") } @@ -125,10 +121,8 @@ func setupSandboxContainerSpec(ctx context.Context, id string, spec *oci.Spec) ( // Set cgroup path - check if this is a virtual pod if virtualSandboxID != "" { - // Virtual pod sandbox gets its own cgroup under /containers/virtual-pods using the virtual pod ID spec.Linux.CgroupsPath = "/containers/virtual-pods/" + virtualSandboxID } else { - // Traditional sandbox goes under /containers spec.Linux.CgroupsPath = "/containers/" + id } diff --git a/internal/guest/runtime/hcsv2/sandbox_root_test.go b/internal/guest/runtime/hcsv2/sandbox_root_test.go new file mode 100644 index 0000000000..7ef84f47a1 --- /dev/null +++ b/internal/guest/runtime/hcsv2/sandbox_root_test.go @@ -0,0 +1,362 @@ +//go:build linux +// +build linux + +package hcsv2 + +import ( + "path/filepath" + "testing" + + specGuest "github.com/Microsoft/hcsshim/internal/guest/spec" +) + +func TestRegisterAndResolveSandboxRoot(t *testing.T) { + h := &Host{ + sandboxRoots: make(map[string]string), + } + + h.registerSandboxRoot("sandbox-1", "/run/gcs/c/sandbox-1", "") + got := h.resolveSandboxRoot("sandbox-1") + if got != "/run/gcs/c/sandbox-1" { + t.Fatalf("expected /run/gcs/c/sandbox-1, got %s", got) + } +} + +func TestResolveSandboxRootFallback(t *testing.T) { + h := &Host{ + sandboxRoots: make(map[string]string), + } + + // No registration — should fall back to legacy derivation + got := h.resolveSandboxRoot("sandbox-missing") + want := specGuest.SandboxRootDir("sandbox-missing") + if got != want { + t.Fatalf("expected fallback %s, got %s", want, got) + } +} + +func TestUnregisterSandboxRoot(t *testing.T) { + h := &Host{ + sandboxRoots: make(map[string]string), + } + + h.registerSandboxRoot("sandbox-1", "/some/path", "") + h.unregisterSandboxRoot("sandbox-1") + + // After unregister, should fall back to legacy + got := h.resolveSandboxRoot("sandbox-1") + want := specGuest.SandboxRootDir("sandbox-1") + if got != want { + t.Fatalf("expected fallback %s after unregister, got %s", want, got) + } +} + +func TestSandboxRootFromOCIBundlePath(t *testing.T) { + // Regular sandbox: sandboxRoot == OCIBundlePath + ociBundlePath := "/run/gcs/c/my-sandbox-id" + sandboxRoot := ociBundlePath + if sandboxRoot != "/run/gcs/c/my-sandbox-id" { + t.Fatalf("expected sandbox root to equal OCIBundlePath, got %s", sandboxRoot) + } +} + +func TestVirtualPodRootFromOCIBundlePath(t *testing.T) { + tests := []struct { + name string + ociBundlePath string + virtualPodID string + want string + }{ + { + name: "legacy shim path", + ociBundlePath: "/run/gcs/c/container-abc", + virtualPodID: "vpod-123", + want: "/run/gcs/c/virtual-pods/vpod-123", + }, + { + name: "new shim path", + ociBundlePath: "/new/layout/sandboxes/container-abc", + virtualPodID: "vpod-456", + want: "/new/layout/sandboxes/virtual-pods/vpod-456", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := filepath.Join(filepath.Dir(tt.ociBundlePath), "virtual-pods", tt.virtualPodID) + if got != tt.want { + t.Fatalf("got %s, want %s", got, tt.want) + } + }) + } +} + +func TestVirtualPodRootMatchesLegacy(t *testing.T) { + // When OCIBundlePath uses the legacy prefix, the derived virtual pod root + // must match what VirtualPodRootDir() would have produced. + ociBundlePath := "/run/gcs/c/container-id" + virtualPodID := "vpod-abc" + + derived := filepath.Join(filepath.Dir(ociBundlePath), "virtual-pods", virtualPodID) + legacy := specGuest.VirtualPodRootDir(virtualPodID) + + if derived != legacy { + t.Fatalf("derived %q != legacy %q — backwards compatibility broken", derived, legacy) + } +} + +func TestSubdirectoryPaths(t *testing.T) { + sandboxRoot := "/run/gcs/c/sandbox-xyz" + checks := map[string]string{ + "sandboxMounts": filepath.Join(sandboxRoot, "sandboxMounts"), + "sandboxTmpfsMounts": filepath.Join(sandboxRoot, "sandboxTmpfsMounts"), + "hugepages": filepath.Join(sandboxRoot, "hugepages"), + "logs": filepath.Join(sandboxRoot, "logs"), + } + + for name, want := range checks { + got := filepath.Join(sandboxRoot, name) + if got != want { + t.Fatalf("subdir %s: got %s, want %s", name, got, want) + } + } + + // Verify these match what spec.go functions produce + if filepath.Join(sandboxRoot, "sandboxMounts") != specGuest.SandboxMountsDir("sandbox-xyz") { + t.Fatal("sandboxMounts path doesn't match legacy SandboxMountsDir") + } + if filepath.Join(sandboxRoot, "hugepages") != specGuest.HugePagesMountsDir("sandbox-xyz") { + t.Fatal("hugepages path doesn't match legacy HugePagesMountsDir") + } +} + +// TestOldVsNewPathParity exhaustively compares every path the old code +// would have produced (via spec.go functions with hard-coded prefix + ID) +// against the new code (resolved sandboxRoot + inline filepath.Join). +// If any pair diverges, backwards compatibility is broken. +func TestOldVsNewPathParity(t *testing.T) { + type pathCase struct { + name string + oldPath string // what the old code produced + newPath string // what the new code produces + } + + // Simulate old shim: OCIBundlePath = /run/gcs/c/ + sandboxID := "abc-123-sandbox" + ociBundlePath := "/run/gcs/c/" + sandboxID + sandboxRoot := ociBundlePath // new code: sandboxRoot = OCIBundlePath + + regularCases := []pathCase{ + { + name: "sandbox root", + oldPath: specGuest.SandboxRootDir(sandboxID), + newPath: sandboxRoot, + }, + { + name: "sandboxMounts dir", + oldPath: specGuest.SandboxMountsDir(sandboxID), + newPath: filepath.Join(sandboxRoot, "sandboxMounts"), + }, + { + name: "sandboxTmpfsMounts dir", + oldPath: specGuest.SandboxTmpfsMountsDir(sandboxID), + newPath: filepath.Join(sandboxRoot, "sandboxTmpfsMounts"), + }, + { + name: "hugepages dir", + oldPath: specGuest.HugePagesMountsDir(sandboxID), + newPath: filepath.Join(sandboxRoot, "hugepages"), + }, + { + name: "logs dir", + oldPath: specGuest.SandboxLogsDir(sandboxID, ""), + newPath: filepath.Join(sandboxRoot, "logs"), + }, + { + name: "log file path", + oldPath: specGuest.SandboxLogPath(sandboxID, "", "container.log"), + newPath: filepath.Join(sandboxRoot, "logs", "container.log"), + }, + { + name: "hostname file", + oldPath: filepath.Join(specGuest.SandboxRootDir(sandboxID), "hostname"), + newPath: filepath.Join(sandboxRoot, "hostname"), + }, + { + name: "hosts file", + oldPath: filepath.Join(specGuest.SandboxRootDir(sandboxID), "hosts"), + newPath: filepath.Join(sandboxRoot, "hosts"), + }, + { + name: "resolv.conf file", + oldPath: filepath.Join(specGuest.SandboxRootDir(sandboxID), "resolv.conf"), + newPath: filepath.Join(sandboxRoot, "resolv.conf"), + }, + } + + t.Run("regular_sandbox", func(t *testing.T) { + for _, tc := range regularCases { + if tc.oldPath != tc.newPath { + t.Errorf("%s: old=%q new=%q", tc.name, tc.oldPath, tc.newPath) + } + } + }) + + // Virtual pod: old code used VirtualPodRootDir(vpodID), + // new code uses filepath.Join(filepath.Dir(ociBundlePath), "virtual-pods", vpodID) + vpodID := "vpod-456" + vpodOCIBundlePath := "/run/gcs/c/" + sandboxID // container's own bundle + vpodSandboxRoot := filepath.Join(filepath.Dir(vpodOCIBundlePath), "virtual-pods", vpodID) + + vpodCases := []pathCase{ + { + name: "virtual pod root", + oldPath: specGuest.VirtualPodRootDir(vpodID), + newPath: vpodSandboxRoot, + }, + { + name: "virtual pod sandboxMounts", + oldPath: specGuest.VirtualPodMountsDir(vpodID), + newPath: filepath.Join(vpodSandboxRoot, "sandboxMounts"), + }, + { + name: "virtual pod tmpfs mounts", + oldPath: specGuest.VirtualPodTmpfsMountsDir(vpodID), + newPath: filepath.Join(vpodSandboxRoot, "sandboxTmpfsMounts"), + }, + { + name: "virtual pod hugepages", + oldPath: specGuest.VirtualPodHugePagesMountsDir(vpodID), + newPath: filepath.Join(vpodSandboxRoot, "hugepages"), + }, + { + name: "virtual pod logs", + oldPath: specGuest.SandboxLogsDir(sandboxID, vpodID), + newPath: filepath.Join(vpodSandboxRoot, "logs"), + }, + { + name: "virtual pod hostname", + oldPath: filepath.Join(specGuest.VirtualPodRootDir(vpodID), "hostname"), + newPath: filepath.Join(vpodSandboxRoot, "hostname"), + }, + { + name: "virtual pod hosts", + oldPath: filepath.Join(specGuest.VirtualPodRootDir(vpodID), "hosts"), + newPath: filepath.Join(vpodSandboxRoot, "hosts"), + }, + { + name: "virtual pod resolv.conf", + oldPath: filepath.Join(specGuest.VirtualPodRootDir(vpodID), "resolv.conf"), + newPath: filepath.Join(vpodSandboxRoot, "resolv.conf"), + }, + } + + t.Run("virtual_pod", func(t *testing.T) { + for _, tc := range vpodCases { + if tc.oldPath != tc.newPath { + t.Errorf("%s: old=%q new=%q", tc.name, tc.oldPath, tc.newPath) + } + } + }) + + // Workload container: sandbox root comes from mapping, not OCIBundlePath + t.Run("workload_uses_sandbox_root", func(t *testing.T) { + h := &Host{sandboxRoots: make(map[string]string)} + h.registerSandboxRoot(sandboxID, ociBundlePath, "") + + workloadSandboxRoot := h.resolveSandboxRoot(sandboxID) + if workloadSandboxRoot != ociBundlePath { + t.Fatalf("workload got sandboxRoot=%q, want %q", workloadSandboxRoot, ociBundlePath) + } + // Networking mount: hostname from sandbox root, not workload's own bundle + workloadBundle := "/run/gcs/c/workload-container-999" + hostsOld := filepath.Join(specGuest.SandboxRootDir(sandboxID), "hosts") + hostsNew := filepath.Join(workloadSandboxRoot, "hosts") + if hostsOld != hostsNew { + t.Errorf("workload hosts mount: old=%q new=%q", hostsOld, hostsNew) + } + // Verify it's NOT derived from workload's own bundle + if hostsNew == filepath.Join(workloadBundle, "hosts") { + t.Error("workload hosts incorrectly derived from its own bundle path") + } + }) + + // Standalone: sandboxRoot = OCIBundlePath directly + t.Run("standalone", func(t *testing.T) { + standaloneBundle := "/run/gcs/c/standalone-789" + standaloneRoot := standaloneBundle // new code: sandboxRoot = OCIBundlePath + oldRoot := specGuest.SandboxRootDir("standalone-789") + + if standaloneRoot != oldRoot { + t.Errorf("standalone root: old=%q new=%q", oldRoot, standaloneRoot) + } + }) +} + +// TestMultiPodIsolation simulates two virtual pods sharing a UVM and verifies +// that each gets its own isolated sandbox root, mounts, and networking paths. +func TestMultiPodIsolation(t *testing.T) { + h := &Host{sandboxRoots: make(map[string]string)} + + // Simulate two virtual pod sandboxes created in the same UVM. + // Each has its own OCIBundlePath and virtual pod ID. + pod1BundlePath := "/run/gcs/c/sandbox-pod1" + pod1VPodID := "vpod-aaa" + pod1Root := filepath.Join(filepath.Dir(pod1BundlePath), "virtual-pods", pod1VPodID) + + pod2BundlePath := "/run/gcs/c/sandbox-pod2" + pod2VPodID := "vpod-bbb" + pod2Root := filepath.Join(filepath.Dir(pod2BundlePath), "virtual-pods", pod2VPodID) + + h.registerSandboxRoot("sandbox-pod1", pod1Root, "") + h.registerSandboxRoot("sandbox-pod2", pod2Root, "") + + // Verify roots are distinct + if pod1Root == pod2Root { + t.Fatalf("pod roots must be different: both are %q", pod1Root) + } + + // Verify each resolves correctly + if got := h.resolveSandboxRoot("sandbox-pod1"); got != pod1Root { + t.Errorf("pod1: got %q, want %q", got, pod1Root) + } + if got := h.resolveSandboxRoot("sandbox-pod2"); got != pod2Root { + t.Errorf("pod2: got %q, want %q", got, pod2Root) + } + + // Verify subdirectories are isolated + pod1Hosts := filepath.Join(pod1Root, "hosts") + pod2Hosts := filepath.Join(pod2Root, "hosts") + if pod1Hosts == pod2Hosts { + t.Error("hosts files should be in different directories for different pods") + } + + pod1Mounts := filepath.Join(pod1Root, "sandboxMounts") + pod2Mounts := filepath.Join(pod2Root, "sandboxMounts") + if pod1Mounts == pod2Mounts { + t.Error("sandboxMounts should be in different directories for different pods") + } + + // A workload referencing pod1 should get pod1's root, not pod2's + workloadRoot := h.resolveSandboxRoot("sandbox-pod1") + if workloadRoot != pod1Root { + t.Errorf("workload resolved to %q, want pod1 root %q", workloadRoot, pod1Root) + } + workloadHosts := filepath.Join(workloadRoot, "hosts") + if workloadHosts != pod1Hosts { + t.Errorf("workload hosts %q should match pod1 hosts %q", workloadHosts, pod1Hosts) + } + + // Unregister pod1, pod2 still works + h.unregisterSandboxRoot("sandbox-pod1") + if got := h.resolveSandboxRoot("sandbox-pod2"); got != pod2Root { + t.Errorf("pod2 after pod1 removal: got %q, want %q", got, pod2Root) + } + + // pod1 now falls back to legacy + fallback := h.resolveSandboxRoot("sandbox-pod1") + legacy := specGuest.SandboxRootDir("sandbox-pod1") + if fallback != legacy { + t.Errorf("pod1 fallback: got %q, want legacy %q", fallback, legacy) + } +} diff --git a/internal/guest/runtime/hcsv2/standalone_container.go b/internal/guest/runtime/hcsv2/standalone_container.go index 1a23d276fc..768adda344 100644 --- a/internal/guest/runtime/hcsv2/standalone_container.go +++ b/internal/guest/runtime/hcsv2/standalone_container.go @@ -14,42 +14,28 @@ import ( "github.com/Microsoft/hcsshim/internal/guest/network" specGuest "github.com/Microsoft/hcsshim/internal/guest/spec" - "github.com/Microsoft/hcsshim/internal/guestpath" "github.com/Microsoft/hcsshim/internal/oc" "github.com/Microsoft/hcsshim/pkg/annotations" ) -func getStandaloneRootDir(id, virtualSandboxID string) string { - if virtualSandboxID != "" { - // Standalone container in virtual pod gets its own subdir - return filepath.Join(guestpath.LCOWRootPrefixInUVM, "virtual-pods", virtualSandboxID, id) - } - return filepath.Join(guestpath.LCOWRootPrefixInUVM, id) -} - -func getStandaloneHostnamePath(id, virtualSandboxID string) string { - return filepath.Join(getStandaloneRootDir(id, virtualSandboxID), "hostname") +func getStandaloneHostnamePath(rootDir string) string { + return filepath.Join(rootDir, "hostname") } -func getStandaloneHostsPath(id, virtualSandboxID string) string { - return filepath.Join(getStandaloneRootDir(id, virtualSandboxID), "hosts") +func getStandaloneHostsPath(rootDir string) string { + return filepath.Join(rootDir, "hosts") } -func getStandaloneResolvPath(id, virtualSandboxID string) string { - return filepath.Join(getStandaloneRootDir(id, virtualSandboxID), "resolv.conf") +func getStandaloneResolvPath(rootDir string) string { + return filepath.Join(rootDir, "resolv.conf") } -func setupStandaloneContainerSpec(ctx context.Context, id string, spec *oci.Spec) (err error) { +func setupStandaloneContainerSpec(ctx context.Context, id, rootDir string, spec *oci.Spec) (err error) { ctx, span := oc.StartSpan(ctx, "hcsv2::setupStandaloneContainerSpec") defer span.End() defer func() { oc.SetSpanStatus(span, err) }() span.AddAttributes(trace.StringAttribute("cid", id)) - // Check if this is a virtual pod (unlikely for standalone) - virtualSandboxID := spec.Annotations[annotations.VirtualPodID] - - // Generate the standalone root dir - virtual pod aware - rootDir := getStandaloneRootDir(id, virtualSandboxID) if err := os.MkdirAll(rootDir, 0755); err != nil { return errors.Wrapf(err, "failed to create container root directory %q", rootDir) } @@ -70,15 +56,15 @@ func setupStandaloneContainerSpec(ctx context.Context, id string, spec *oci.Spec // Write the hostname if !specGuest.MountPresent("/etc/hostname", spec.Mounts) { - standaloneHostnamePath := getStandaloneHostnamePath(id, virtualSandboxID) - if err := os.WriteFile(standaloneHostnamePath, []byte(hostname+"\n"), 0644); err != nil { - return errors.Wrapf(err, "failed to write hostname to %q", standaloneHostnamePath) + hostnamePath := getStandaloneHostnamePath(rootDir) + if err := os.WriteFile(hostnamePath, []byte(hostname+"\n"), 0644); err != nil { + return errors.Wrapf(err, "failed to write hostname to %q", hostnamePath) } mt := oci.Mount{ Destination: "/etc/hostname", Type: "bind", - Source: getStandaloneHostnamePath(id, virtualSandboxID), + Source: hostnamePath, Options: []string{"bind"}, } if specGuest.IsRootReadonly(spec) { @@ -89,16 +75,16 @@ func setupStandaloneContainerSpec(ctx context.Context, id string, spec *oci.Spec // Write the hosts if !specGuest.MountPresent("/etc/hosts", spec.Mounts) { - standaloneHostsContent := network.GenerateEtcHostsContent(ctx, hostname) - standaloneHostsPath := getStandaloneHostsPath(id, virtualSandboxID) - if err := os.WriteFile(standaloneHostsPath, []byte(standaloneHostsContent), 0644); err != nil { - return errors.Wrapf(err, "failed to write standalone hosts to %q", standaloneHostsPath) + hostsContent := network.GenerateEtcHostsContent(ctx, hostname) + hostsPath := getStandaloneHostsPath(rootDir) + if err := os.WriteFile(hostsPath, []byte(hostsContent), 0644); err != nil { + return errors.Wrapf(err, "failed to write standalone hosts to %q", hostsPath) } mt := oci.Mount{ Destination: "/etc/hosts", Type: "bind", - Source: getStandaloneHostsPath(id, virtualSandboxID), + Source: hostsPath, Options: []string{"bind"}, } if specGuest.IsRootReadonly(spec) { @@ -115,15 +101,15 @@ func setupStandaloneContainerSpec(ctx context.Context, id string, spec *oci.Spec if err != nil { return errors.Wrap(err, "failed to generate standalone resolv.conf content") } - standaloneResolvPath := getStandaloneResolvPath(id, virtualSandboxID) - if err := os.WriteFile(standaloneResolvPath, []byte(resolvContent), 0644); err != nil { + resolvPath := getStandaloneResolvPath(rootDir) + if err := os.WriteFile(resolvPath, []byte(resolvContent), 0644); err != nil { return errors.Wrap(err, "failed to write standalone resolv.conf") } mt := oci.Mount{ Destination: "/etc/resolv.conf", Type: "bind", - Source: getStandaloneResolvPath(id, virtualSandboxID), + Source: resolvPath, Options: []string{"bind"}, } if specGuest.IsRootReadonly(spec) { @@ -132,13 +118,11 @@ func setupStandaloneContainerSpec(ctx context.Context, id string, spec *oci.Spec spec.Mounts = append(spec.Mounts, mt) } - // Set cgroup path - check if this is part of a virtual pod (unlikely for standalone) + // Set cgroup path + virtualSandboxID := spec.Annotations[annotations.VirtualPodID] if virtualSandboxID != "" { - // Standalone container in virtual pod goes under /containers/virtual-pods/{virtualSandboxID}/{containerID} - // Each virtualSandboxID creates its own pod-level cgroup for all containers in that virtual pod spec.Linux.CgroupsPath = "/containers/virtual-pods/" + virtualSandboxID + "/" + id } else { - // Traditional standalone container goes under /containers spec.Linux.CgroupsPath = "/containers/" + id } diff --git a/internal/guest/runtime/hcsv2/uvm.go b/internal/guest/runtime/hcsv2/uvm.go index e872d1333d..1bc9673b2b 100644 --- a/internal/guest/runtime/hcsv2/uvm.go +++ b/internal/guest/runtime/hcsv2/uvm.go @@ -99,6 +99,12 @@ type Host struct { containerToVirtualPod map[string]string // containerID -> virtualSandboxID virtualPodsCgroupManager cgroup.Manager // Parent cgroup for all virtual pods + // sandboxRoots maps sandboxID to the resolved sandbox root directory. + // Populated via RegisterSandboxRoot during sandbox creation using + // the host-provided OCIBundlePath as source of truth. + sandboxRootsMutex sync.RWMutex + sandboxRoots map[string]string + rtime runtime.Runtime vsock transport.Transport devNullTransport transport.Transport @@ -123,6 +129,7 @@ func NewHost(rtime runtime.Runtime, vsock transport.Transport, initialEnforcer s externalProcesses: make(map[int]*externalProcess), virtualPods: make(map[string]*VirtualPod), containerToVirtualPod: make(map[string]string), + sandboxRoots: make(map[string]string), rtime: rtime, vsock: vsock, devNullTransport: &transport.DevNullTransport{}, @@ -131,6 +138,64 @@ func NewHost(rtime runtime.Runtime, vsock transport.Transport, initialEnforcer s } } +// registerSandboxRoot stores the resolved sandbox root directory for a given sandbox ID. +// For virtual pods, it derives the shared root from OCIBundlePath's parent directory. +func (h *Host) registerSandboxRoot(sandboxID, ociBundlePath, virtualPodID string) string { + var sandboxRoot string + + if virtualPodID != "" { + // Validate virtualPodID to prevent path traversal. + cleanID := filepath.Clean(virtualPodID) + if filepath.IsAbs(cleanID) || strings.Contains(cleanID, "..") { + logrus.WithField("virtualPodID", virtualPodID).Error("invalid virtual pod ID") + sandboxRoot = ociBundlePath + } else { + // Derive virtual pod root from the OCIBundlePath parent, + // preserving the virtual-pods/ subdirectory layout. + sandboxRoot = filepath.Join(filepath.Dir(ociBundlePath), "virtual-pods", cleanID) + } + } else { + sandboxRoot = ociBundlePath + } + + h.sandboxRootsMutex.Lock() + defer h.sandboxRootsMutex.Unlock() + h.sandboxRoots[sandboxID] = sandboxRoot + + logrus.WithFields(logrus.Fields{ + "sandboxID": sandboxID, + "sandboxRoot": sandboxRoot, + }).Debug("registered sandbox root") + + return sandboxRoot +} + +// resolveSandboxRoot returns the resolved sandbox root for the given sandbox ID. +// Falls back to legacy path derivation if no mapping exists. +func (h *Host) resolveSandboxRoot(sandboxID string) string { + h.sandboxRootsMutex.RLock() + root, ok := h.sandboxRoots[sandboxID] + h.sandboxRootsMutex.RUnlock() + if ok { + return root + } + // Fallback to legacy derivation for backwards compatibility. + // TODO: remove fallback after shim v1 sunset + fallback := specGuest.SandboxRootDir(sandboxID) + logrus.WithFields(logrus.Fields{ + "sandboxID": sandboxID, + "fallback": fallback, + }).Warn("sandbox root not found in mapping, falling back to legacy path derivation") + return fallback +} + +// unregisterSandboxRoot removes the sandbox root mapping for a given sandbox ID. +func (h *Host) unregisterSandboxRoot(sandboxID string) { + h.sandboxRootsMutex.Lock() + defer h.sandboxRootsMutex.Unlock() + delete(h.sandboxRoots, sandboxID) +} + func (h *Host) SecurityPolicyEnforcer() securitypolicy.SecurityPolicyEnforcer { return h.securityOptions.PolicyEnforcer } @@ -171,6 +236,9 @@ func (h *Host) RemoveContainer(id string) { } delete(h.containers, id) + + // Clean up the sandbox root mapping if this ID was registered. + h.unregisterSandboxRoot(id) } func (h *Host) GetCreatedContainer(id string) (*Container, error) { @@ -199,10 +267,11 @@ func (h *Host) AddContainer(id string, c *Container) error { return nil } -func setupSandboxMountsPath(id string) (err error) { - mountPath := specGuest.SandboxMountsDir(id) +// setupSandboxMountsPath creates the sandboxMounts directory from a resolved root. +func setupSandboxMountsPath(sandboxRoot string) (err error) { + mountPath := specGuest.SandboxMountsDirFromRoot(sandboxRoot) if err := os.MkdirAll(mountPath, 0755); err != nil { - return errors.Wrapf(err, "failed to create sandboxMounts dir in sandbox %v", id) + return errors.Wrapf(err, "failed to create sandboxMounts dir at %v", mountPath) } defer func() { if err != nil { @@ -213,10 +282,11 @@ func setupSandboxMountsPath(id string) (err error) { return storage.MountRShared(mountPath) } -func setupSandboxTmpfsMountsPath(id string) (err error) { - tmpfsDir := specGuest.SandboxTmpfsMountsDir(id) +// setupSandboxTmpfsMountsPath creates the sandbox tmpfs mounts directory from a resolved root. +func setupSandboxTmpfsMountsPath(sandboxRoot string) (err error) { + tmpfsDir := specGuest.SandboxTmpfsMountsDirFromRoot(sandboxRoot) if err := os.MkdirAll(tmpfsDir, 0755); err != nil { - return errors.Wrapf(err, "failed to create sandbox tmpfs mounts dir in sandbox %v", id) + return errors.Wrapf(err, "failed to create sandbox tmpfs mounts dir at %v", tmpfsDir) } defer func() { @@ -237,26 +307,21 @@ func setupSandboxTmpfsMountsPath(id string) (err error) { return storage.MountRShared(tmpfsDir) } -func setupSandboxHugePageMountsPath(id string) error { - mountPath := specGuest.HugePagesMountsDir(id) +// setupSandboxHugePageMountsPath creates the hugepages mounts directory from a resolved root. +func setupSandboxHugePageMountsPath(sandboxRoot string) error { + mountPath := specGuest.SandboxHugePagesMountsDirFromRoot(sandboxRoot) if err := os.MkdirAll(mountPath, 0755); err != nil { - return errors.Wrapf(err, "failed to create hugepage Mounts dir in sandbox %v", id) + return errors.Wrapf(err, "failed to create hugepage mounts dir at %v", mountPath) } return storage.MountRShared(mountPath) } -// setupSandboxLogDir creates the directory to house all redirected stdio logs from containers. -// -// Virtual pod aware. -func setupSandboxLogDir(sandboxID, virtualSandboxID string) error { - mountPath := specGuest.SandboxLogsDir(sandboxID, virtualSandboxID) +// setupSandboxLogDir creates the directory to house all redirected stdio logs from a resolved root. +func setupSandboxLogDir(sandboxRoot string) error { + mountPath := specGuest.SandboxLogsDirFromRoot(sandboxRoot) if err := mkdirAllModePerm(mountPath); err != nil { - id := sandboxID - if virtualSandboxID != "" { - id = virtualSandboxID - } - return errors.Wrapf(err, "failed to create sandbox logs dir in sandbox %v", id) + return errors.Wrapf(err, "failed to create sandbox logs dir at %v", mountPath) } return nil } @@ -406,7 +471,11 @@ func (h *Host) CreateContainer(ctx context.Context, id string, settings *prot.VM // Capture namespaceID if any because setupSandboxContainerSpec clears the Windows section. namespaceID = specGuest.GetNetworkNamespaceID(settings.OCISpecification) - err = setupSandboxContainerSpec(ctx, id, settings.OCISpecification) + // Resolve the sandbox root from OCIBundlePath. + sandboxRoot := h.registerSandboxRoot(id, settings.OCIBundlePath, virtualPodID) + c.sandboxRoot = sandboxRoot + + err = setupSandboxContainerSpec(ctx, id, sandboxRoot, settings.OCISpecification) if err != nil { return nil, err } @@ -416,30 +485,16 @@ func (h *Host) CreateContainer(ctx context.Context, id string, settings *prot.VM } }() - if isVirtualPod { - // For virtual pods, create virtual pod specific paths - if err = setupVirtualPodMountsPath(virtualPodID); err != nil { - return nil, err - } - if err = setupVirtualPodTmpfsMountsPath(virtualPodID); err != nil { - return nil, err - } - if err = setupVirtualPodHugePageMountsPath(virtualPodID); err != nil { - return nil, err - } - } else { - // Traditional sandbox setup - if err = setupSandboxMountsPath(id); err != nil { - return nil, err - } - if err = setupSandboxTmpfsMountsPath(id); err != nil { - return nil, err - } - if err = setupSandboxHugePageMountsPath(id); err != nil { - return nil, err - } + if err = setupSandboxMountsPath(sandboxRoot); err != nil { + return nil, err + } + if err = setupSandboxTmpfsMountsPath(sandboxRoot); err != nil { + return nil, err + } + if err = setupSandboxHugePageMountsPath(sandboxRoot); err != nil { + return nil, err } - if err = setupSandboxLogDir(id, virtualPodID); err != nil { + if err = setupSandboxLogDir(sandboxRoot); err != nil { return nil, err } @@ -457,7 +512,9 @@ func (h *Host) CreateContainer(ctx context.Context, id string, settings *prot.VM if !ok || sid == "" { return nil, errors.Errorf("unsupported 'io.kubernetes.cri.sandbox-id': '%s'", sid) } - if err = setupWorkloadContainerSpec(ctx, sid, id, settings.OCISpecification, settings.OCIBundlePath); err != nil { + sandboxRoot := h.resolveSandboxRoot(sid) + c.sandboxRoot = sandboxRoot + if err = setupWorkloadContainerSpec(ctx, sid, id, sandboxRoot, settings.OCISpecification, settings.OCIBundlePath); err != nil { return nil, err } @@ -484,7 +541,9 @@ func (h *Host) CreateContainer(ctx context.Context, id string, settings *prot.VM } else { // Capture namespaceID if any because setupStandaloneContainerSpec clears the Windows section. namespaceID = specGuest.GetNetworkNamespaceID(settings.OCISpecification) - if err := setupStandaloneContainerSpec(ctx, id, settings.OCISpecification); err != nil { + // Standalone uses OCIBundlePath directly as its root. + c.sandboxRoot = settings.OCIBundlePath + if err := setupStandaloneContainerSpec(ctx, id, settings.OCIBundlePath, settings.OCISpecification); err != nil { return nil, err } defer func() { @@ -504,7 +563,7 @@ func (h *Host) CreateContainer(ctx context.Context, id string, settings *prot.VM settings.OCISpecification.Mounts = append(settings.OCISpecification.Mounts, specs.Mount{ Destination: logDirMount, Type: "bind", - Source: specGuest.SandboxLogsDir(sandboxID, virtualPodID), + Source: specGuest.SandboxLogsDirFromRoot(c.sandboxRoot), Options: []string{"bind"}, }) } @@ -559,9 +618,10 @@ func (h *Host) CreateContainer(ctx context.Context, id string, settings *prot.VM return nil, errors.Errorf("teeing container stdio to log path %q denied due to policy not allowing stdio access", logPath) } - c.logPath = specGuest.SandboxLogPath(sandboxID, virtualPodID, logPath) + c.logPath = filepath.Join(specGuest.SandboxLogsDirFromRoot(c.sandboxRoot), logPath) // verify the logpath is still under the correct directory - if !strings.HasPrefix(c.logPath, specGuest.SandboxLogsDir(sandboxID, virtualPodID)) { + logsDir := specGuest.SandboxLogsDirFromRoot(c.sandboxRoot) + if !strings.HasPrefix(c.logPath, logsDir) { return nil, errors.Errorf("log path %v is not within sandbox's log dir", c.logPath) } @@ -1591,50 +1651,3 @@ func (h *Host) cleanupVirtualPod(virtualSandboxID string) { logrus.WithField("virtualSandboxID", virtualSandboxID).Info("Virtual pod cleaned up") } } - -// setupVirtualPodMountsPath creates mount directories for virtual pods -func setupVirtualPodMountsPath(virtualSandboxID string) (err error) { - // Create virtual pod specific mount path using the new path generation functions - mountPath := specGuest.VirtualPodMountsDir(virtualSandboxID) - if err := os.MkdirAll(mountPath, 0755); err != nil { - return errors.Wrapf(err, "failed to create virtual pod mounts dir in sandbox %v", virtualSandboxID) - } - defer func() { - if err != nil { - _ = os.RemoveAll(mountPath) - } - }() - - return storage.MountRShared(mountPath) -} - -func setupVirtualPodTmpfsMountsPath(virtualSandboxID string) (err error) { - tmpfsDir := specGuest.VirtualPodTmpfsMountsDir(virtualSandboxID) - if err := os.MkdirAll(tmpfsDir, 0755); err != nil { - return errors.Wrapf(err, "failed to create virtual pod tmpfs mounts dir in sandbox %v", virtualSandboxID) - } - - defer func() { - if err != nil { - _ = os.RemoveAll(tmpfsDir) - } - }() - - // mount a tmpfs at the tmpfsDir - // this ensures that the tmpfsDir is a mount point and not just a directory - // we don't care if it is already mounted, so ignore EBUSY - if err := unix.Mount("tmpfs", tmpfsDir, "tmpfs", 0, ""); err != nil && !errors.Is(err, unix.EBUSY) { - return errors.Wrapf(err, "failed to mount tmpfs at %s", tmpfsDir) - } - - return storage.MountRShared(tmpfsDir) -} - -func setupVirtualPodHugePageMountsPath(virtualSandboxID string) error { - mountPath := specGuest.VirtualPodHugePagesMountsDir(virtualSandboxID) - if err := os.MkdirAll(mountPath, 0755); err != nil { - return errors.Wrapf(err, "failed to create virtual pod hugepage mounts dir %v", virtualSandboxID) - } - - return storage.MountRShared(mountPath) -} diff --git a/internal/guest/runtime/hcsv2/workload_container.go b/internal/guest/runtime/hcsv2/workload_container.go index 683c076e2c..d118a4d35c 100644 --- a/internal/guest/runtime/hcsv2/workload_container.go +++ b/internal/guest/runtime/hcsv2/workload_container.go @@ -7,7 +7,6 @@ import ( "context" "fmt" "os" - "path/filepath" "strings" "github.com/opencontainers/runc/libcontainer/devices" @@ -32,9 +31,9 @@ func mkdirAllModePerm(target string) error { return os.MkdirAll(target, os.ModePerm) } -func updateSandboxMounts(sbid string, spec *oci.Spec) error { - // Check if this is a virtual pod - virtualSandboxID := spec.Annotations[annotations.VirtualPodID] +func updateSandboxMounts(sandboxRoot string, spec *oci.Spec) error { + mountsDir := specGuest.SandboxMountsDirFromRoot(sandboxRoot) + tmpfsMountsDir := specGuest.SandboxTmpfsMountsDirFromRoot(sandboxRoot) for i, m := range spec.Mounts { if !strings.HasPrefix(m.Source, guestpath.SandboxMountPrefix) && @@ -43,25 +42,16 @@ func updateSandboxMounts(sbid string, spec *oci.Spec) error { } var sandboxSource string - // if using `sandbox-tmp://` prefix, we mount a tmpfs in sandboxTmpfsMountsDir if strings.HasPrefix(m.Source, guestpath.SandboxTmpfsMountPrefix) { - // Use virtual pod aware mount source - sandboxSource = specGuest.VirtualPodAwareSandboxTmpfsMountSource(sbid, virtualSandboxID, m.Source) - expectedMountsDir := specGuest.VirtualPodAwareSandboxTmpfsMountsDir(sbid, virtualSandboxID) + sandboxSource = specGuest.SandboxTmpfsMountSourceFromRoot(sandboxRoot, m.Source) - // filepath.Join cleans the resulting path before returning, so it would resolve the relative path if one was given. - // Hence, we need to ensure that the resolved path is still under the correct directory - if !strings.HasPrefix(sandboxSource, expectedMountsDir) { + if !strings.HasPrefix(sandboxSource, tmpfsMountsDir) { return errors.Errorf("mount path %v for mount %v is not within sandbox's tmpfs mounts dir", sandboxSource, m.Source) } } else { - // Use virtual pod aware mount source - sandboxSource = specGuest.VirtualPodAwareSandboxMountSource(sbid, virtualSandboxID, m.Source) - expectedMountsDir := specGuest.VirtualPodAwareSandboxMountsDir(sbid, virtualSandboxID) + sandboxSource = specGuest.SandboxMountSourceFromRoot(sandboxRoot, m.Source) - // filepath.Join cleans the resulting path before returning, so it would resolve the relative path if one was given. - // Hence, we need to ensure that the resolved path is still under the correct directory - if !strings.HasPrefix(sandboxSource, expectedMountsDir) { + if !strings.HasPrefix(sandboxSource, mountsDir) { return errors.Errorf("mount path %v for mount %v is not within sandbox's mounts dir", sandboxSource, m.Source) } } @@ -78,24 +68,21 @@ func updateSandboxMounts(sbid string, spec *oci.Spec) error { return nil } -func updateHugePageMounts(sbid string, spec *oci.Spec) error { - // Check if this is a virtual pod - virtualSandboxID := spec.Annotations[annotations.VirtualPodID] +func updateHugePageMounts(sandboxRoot string, spec *oci.Spec) error { + hugepagesDir := specGuest.SandboxHugePagesMountsDirFromRoot(sandboxRoot) for i, m := range spec.Mounts { if !strings.HasPrefix(m.Source, guestpath.HugePagesMountPrefix) { continue } - // Use virtual pod aware hugepages directory - mountsDir := specGuest.VirtualPodAwareHugePagesMountsDir(sbid, virtualSandboxID) subPath := strings.TrimPrefix(m.Source, guestpath.HugePagesMountPrefix) pageSize := strings.Split(subPath, string(os.PathSeparator))[0] - hugePageMountSource := filepath.Join(mountsDir, subPath) + hugePageMountSource := specGuest.HugePagesMountSourceFromRoot(sandboxRoot, m.Source) // filepath.Join cleans the resulting path before returning so it would resolve the relative path if one was given. // Hence, we need to ensure that the resolved path is still under the correct directory - if !strings.HasPrefix(hugePageMountSource, mountsDir) { + if !strings.HasPrefix(hugePageMountSource, hugepagesDir) { return errors.Errorf("mount path %v for mount %v is not within hugepages's mounts dir", hugePageMountSource, m.Source) } @@ -178,7 +165,7 @@ func specHasGPUDevice(spec *oci.Spec) bool { return false } -func setupWorkloadContainerSpec(ctx context.Context, sbid, id string, spec *oci.Spec, ociBundlePath string) (err error) { +func setupWorkloadContainerSpec(ctx context.Context, sbid, id, sandboxRoot string, spec *oci.Spec, ociBundlePath string) (err error) { ctx, span := oc.StartSpan(ctx, "hcsv2::setupWorkloadContainerSpec") defer span.End() defer func() { oc.SetSpanStatus(span, err) }() @@ -192,11 +179,11 @@ func setupWorkloadContainerSpec(ctx context.Context, sbid, id string, spec *oci. } // update any sandbox mounts with the sandboxMounts directory path and create files - if err = updateSandboxMounts(sbid, spec); err != nil { + if err = updateSandboxMounts(sandboxRoot, spec); err != nil { return errors.Wrapf(err, "failed to update sandbox mounts for container %v in sandbox %v", id, sbid) } - if err = updateHugePageMounts(sbid, spec); err != nil { + if err = updateHugePageMounts(sandboxRoot, spec); err != nil { return errors.Wrapf(err, "failed to update hugepages mounts for container %v in sandbox %v", id, sbid) } @@ -210,7 +197,7 @@ func setupWorkloadContainerSpec(ctx context.Context, sbid, id string, spec *oci. // Add default mounts for container networking (e.g. /etc/hostname, /etc/hosts), // if spec didn't override them explicitly. - networkingMounts := specGuest.GenerateWorkloadContainerNetworkMounts(sbid, spec) + networkingMounts := specGuest.GenerateWorkloadContainerNetworkMountsFromRoot(sandboxRoot, spec) spec.Mounts = append(spec.Mounts, networkingMounts...) // TODO: JTERRY75 /dev/shm is not properly setup for LCOW I believe. CRI diff --git a/internal/guest/spec/spec.go b/internal/guest/spec/spec.go index 4273ffa1c0..b78ddac616 100644 --- a/internal/guest/spec/spec.go +++ b/internal/guest/spec/spec.go @@ -230,6 +230,71 @@ func SandboxLogPath(sandboxID, virtualSandboxID, path string) string { return filepath.Join(SandboxLogsDir(sandboxID, virtualSandboxID), path) } +// Root-based path helpers. These accept a pre-resolved sandbox root directory +// and are used by the guest-side sandbox root mapping for Shim v2 / multi-pod support. + +// SandboxMountsDirFromRoot returns the sandbox mounts directory for a given root. +func SandboxMountsDirFromRoot(sandboxRoot string) string { + return filepath.Join(sandboxRoot, "sandboxMounts") +} + +// SandboxTmpfsMountsDirFromRoot returns the sandbox tmpfs mounts directory for a given root. +func SandboxTmpfsMountsDirFromRoot(sandboxRoot string) string { + return filepath.Join(sandboxRoot, "sandboxTmpfsMounts") +} + +// SandboxHugePagesMountsDirFromRoot returns the hugepages directory for a given root. +func SandboxHugePagesMountsDirFromRoot(sandboxRoot string) string { + return filepath.Join(sandboxRoot, "hugepages") +} + +// SandboxLogsDirFromRoot returns the logs directory for a given root. +func SandboxLogsDirFromRoot(sandboxRoot string) string { + return filepath.Join(sandboxRoot, "logs") +} + +// SandboxMountSourceFromRoot resolves a sandbox:// mount source from a given root. +func SandboxMountSourceFromRoot(sandboxRoot, mountPath string) string { + subPath := strings.TrimPrefix(mountPath, guestpath.SandboxMountPrefix) + return filepath.Join(SandboxMountsDirFromRoot(sandboxRoot), subPath) +} + +// SandboxTmpfsMountSourceFromRoot resolves a sandbox-tmp:// mount source from a given root. +func SandboxTmpfsMountSourceFromRoot(sandboxRoot, mountPath string) string { + subPath := strings.TrimPrefix(mountPath, guestpath.SandboxTmpfsMountPrefix) + return filepath.Join(SandboxTmpfsMountsDirFromRoot(sandboxRoot), subPath) +} + +// HugePagesMountSourceFromRoot resolves a hugepages:// mount source from a given root. +func HugePagesMountSourceFromRoot(sandboxRoot, mountPath string) string { + subPath := strings.TrimPrefix(mountPath, guestpath.HugePagesMountPrefix) + return filepath.Join(SandboxHugePagesMountsDirFromRoot(sandboxRoot), subPath) +} + +// GenerateWorkloadContainerNetworkMountsFromRoot generates networking mounts +// using a pre-resolved sandbox root directory. +func GenerateWorkloadContainerNetworkMountsFromRoot(sandboxRoot string, spec *oci.Spec) []oci.Mount { + var nMounts []oci.Mount + for _, mountPath := range networkingMountPaths() { + if MountPresent(mountPath, spec.Mounts) { + continue + } + options := []string{"bind"} + if spec.Root != nil && spec.Root.Readonly { + options = append(options, "ro") + } + trimmedMountPath := strings.TrimPrefix(mountPath, "/etc/") + mt := oci.Mount{ + Destination: mountPath, + Type: "bind", + Source: filepath.Join(sandboxRoot, trimmedMountPath), + Options: options, + } + nMounts = append(nMounts, mt) + } + return nMounts +} + // GetNetworkNamespaceID returns the `ToLower` of // `spec.Windows.Network.NetworkNamespace` or `""`. func GetNetworkNamespaceID(spec *oci.Spec) string { From 18fb5d8d35b3071cf020c990a85bb8a225946b08 Mon Sep 17 00:00:00 2001 From: Shreyansh Sancheti Date: Tue, 7 Apr 2026 00:06:44 +0530 Subject: [PATCH 2/2] guest: unify pod model for V1, virtual pod, and V2 shim support Replace VirtualPod with a generic uvmPod struct that serves all three sandbox modes (V1 shim, virtual pod annotation, V2 native Sandbox API). Key changes: - VirtualPod (exported, complex) -> uvmPod (unexported, simpler) - Host.virtualPods/containerToVirtualPod/virtualPodsCgroupParent -> Host.pods - createPodInUVM: unified pod creation under /pods/ cgroup - Container.sandboxID: every container tracks its sandbox for cleanup - RemoveContainer: uses sandboxID + pod lookup instead of annotation checks - Cgroup layout: /pods// for all CRI containers - cmd/gcs/main.go: /containers/virtual-pods cgroup -> /pods cgroup - Remove InitializeVirtualPodSupport and all VirtualPod management methods Signed-off-by: Shreyansh Sancheti --- cmd/gcs/main.go | 27 +- internal/guest/runtime/hcsv2/container.go | 1 + .../runtime/hcsv2/container_stats_test.go | 13 - .../guest/runtime/hcsv2/sandbox_container.go | 8 +- .../runtime/hcsv2/standalone_container.go | 10 +- internal/guest/runtime/hcsv2/uvm.go | 375 +++++------------- .../guest/runtime/hcsv2/workload_container.go | 13 +- 7 files changed, 122 insertions(+), 325 deletions(-) diff --git a/cmd/gcs/main.go b/cmd/gcs/main.go index e45fa55d03..0741b14eb1 100644 --- a/cmd/gcs/main.go +++ b/cmd/gcs/main.go @@ -362,17 +362,16 @@ func main() { } defer containersControl.Delete() //nolint:errcheck - // Create virtual-pods cgroup hierarchy for multi-pod support - // This will be the parent for all virtual pod cgroups: /containers/virtual-pods/{virtualSandboxID} - virtualPodsControl, err := cgroup.NewManager("/containers/virtual-pods", &oci.LinuxResources{ + // Create /pods cgroup hierarchy for all pods (sandbox, virtual pod, and v2). + podsControl, err := cgroup.NewManager("/pods", &oci.LinuxResources{ Memory: &oci.LinuxMemory{ - Limit: &containersLimit, // Share the same limit as containers + Limit: &containersLimit, }, }) if err != nil { - logrus.WithError(err).Fatal("failed to create containers/virtual-pods cgroup") + logrus.WithError(err).Fatal("failed to create pods cgroup") } - defer virtualPodsControl.Delete() //nolint:errcheck + defer podsControl.Delete() //nolint:errcheck gcsControl, err := cgroup.NewManager("/gcs", &oci.LinuxResources{}) if err != nil { @@ -394,10 +393,6 @@ func main() { EnableV4: *v4, } h := hcsv2.NewHost(rtime, tport, initialEnforcer, logWriter) - // Initialize virtual pod support in the host - if err := h.InitializeVirtualPodSupport(virtualPodsControl); err != nil { - logrus.WithError(err).Warn("Virtual pod support initialization failed") - } b.AssignHandlers(mux, h) var bridgeIn io.ReadCloser @@ -433,13 +428,13 @@ func main() { oomFile := os.NewFile(oom, "cefd") defer oomFile.Close() - // Setup OOM monitoring for virtual-pods cgroup - virtualPodsOom, err := virtualPodsControl.OOMEventFD() + // Setup OOM monitoring for pods cgroup + podsOom, err := podsControl.OOMEventFD() if err != nil { - logrus.WithError(err).Fatal("failed to retrieve the virtual-pods cgroups oom eventfd") + logrus.WithError(err).Fatal("failed to retrieve the pods cgroups oom eventfd") } - virtualPodsOomFile := os.NewFile(virtualPodsOom, "vp-oomfd") - defer virtualPodsOomFile.Close() + podsOomFile := os.NewFile(podsOom, "pods-oomfd") + defer podsOomFile.Close() // time synchronization service if !(*disableTimeSync) { @@ -450,7 +445,7 @@ func main() { go readMemoryEvents(startTime, gefdFile, "/gcs", int64(*gcsMemLimitBytes), gcsControl) go readMemoryEvents(startTime, oomFile, "/containers", containersLimit, containersControl) - go readMemoryEvents(startTime, virtualPodsOomFile, "/containers/virtual-pods", containersLimit, virtualPodsControl) + go readMemoryEvents(startTime, podsOomFile, "/pods", containersLimit, podsControl) err = b.ListenAndServe(bridgeIn, bridgeOut) if err != nil { logrus.WithFields(logrus.Fields{ diff --git a/internal/guest/runtime/hcsv2/container.go b/internal/guest/runtime/hcsv2/container.go index 0ce930dd65..556aebf4bb 100644 --- a/internal/guest/runtime/hcsv2/container.go +++ b/internal/guest/runtime/hcsv2/container.go @@ -52,6 +52,7 @@ type Container struct { spec *oci.Spec ociBundlePath string + sandboxID string // ID of the sandbox/pod this container belongs to isSandbox bool container runtime.Container diff --git a/internal/guest/runtime/hcsv2/container_stats_test.go b/internal/guest/runtime/hcsv2/container_stats_test.go index c1986ca2b1..fa44a4451a 100644 --- a/internal/guest/runtime/hcsv2/container_stats_test.go +++ b/internal/guest/runtime/hcsv2/container_stats_test.go @@ -494,16 +494,3 @@ func TestConvertV2StatsToV1_NilInput(t *testing.T) { t.Error("ConvertV2StatsToV1(nil) should return empty metrics with all nil fields") } } - -func TestHost_InitializeVirtualPodSupport_ErrorCases(t *testing.T) { - host := &Host{} - - // Test with nil input - err := host.InitializeVirtualPodSupport(nil) - if err == nil { - t.Error("Expected error for nil input") - } - if err != nil && err.Error() != "no valid cgroup manager provided for virtual pod support" { - t.Errorf("Unexpected error message: %s", err.Error()) - } -} diff --git a/internal/guest/runtime/hcsv2/sandbox_container.go b/internal/guest/runtime/hcsv2/sandbox_container.go index d30163cf7a..921229ea8f 100644 --- a/internal/guest/runtime/hcsv2/sandbox_container.go +++ b/internal/guest/runtime/hcsv2/sandbox_container.go @@ -119,12 +119,8 @@ func setupSandboxContainerSpec(ctx context.Context, id, sandboxRoot string, spec // also has a concept of a sandbox/shm file when the IPC NamespaceMode != // NODE. - // Set cgroup path - check if this is a virtual pod - if virtualSandboxID != "" { - spec.Linux.CgroupsPath = "/containers/virtual-pods/" + virtualSandboxID - } else { - spec.Linux.CgroupsPath = "/containers/" + id - } + // Set cgroup path under the pod's cgroup. + spec.Linux.CgroupsPath = "/pods/" + id // Clear the windows section as we dont want to forward to runc spec.Windows = nil diff --git a/internal/guest/runtime/hcsv2/standalone_container.go b/internal/guest/runtime/hcsv2/standalone_container.go index 768adda344..0a90ceb308 100644 --- a/internal/guest/runtime/hcsv2/standalone_container.go +++ b/internal/guest/runtime/hcsv2/standalone_container.go @@ -15,7 +15,6 @@ import ( "github.com/Microsoft/hcsshim/internal/guest/network" specGuest "github.com/Microsoft/hcsshim/internal/guest/spec" "github.com/Microsoft/hcsshim/internal/oc" - "github.com/Microsoft/hcsshim/pkg/annotations" ) func getStandaloneHostnamePath(rootDir string) string { @@ -118,13 +117,8 @@ func setupStandaloneContainerSpec(ctx context.Context, id, rootDir string, spec spec.Mounts = append(spec.Mounts, mt) } - // Set cgroup path - virtualSandboxID := spec.Annotations[annotations.VirtualPodID] - if virtualSandboxID != "" { - spec.Linux.CgroupsPath = "/containers/virtual-pods/" + virtualSandboxID + "/" + id - } else { - spec.Linux.CgroupsPath = "/containers/" + id - } + // Set cgroup path. Standalone containers go under /containers. + spec.Linux.CgroupsPath = "/containers/" + id // Clear the windows section as we dont want to forward to runc spec.Windows = nil diff --git a/internal/guest/runtime/hcsv2/uvm.go b/internal/guest/runtime/hcsv2/uvm.go index 1bc9673b2b..9cf6375bfb 100644 --- a/internal/guest/runtime/hcsv2/uvm.go +++ b/internal/guest/runtime/hcsv2/uvm.go @@ -56,35 +56,31 @@ import ( // for V2 where the specific message is targeted at the UVM itself. const UVMContainerID = "00000000-0000-0000-0000-000000000000" -// Prevent path traversal via malformed container / sandbox IDs. Container IDs -// can be either UVMContainerID, or a 64 character hex string. This is also used -// to check that sandbox IDs (which is also used in paths) are valid, which has -// the same format. +// validContainerIDRegexRaw matches 64-char hex IDs used for container and sandbox IDs. +// Used by security policy enforcement to prevent path traversal. const validContainerIDRegexRaw = `[0-9a-fA-F]{64}` var validContainerIDRegex = regexp.MustCompile("^" + validContainerIDRegexRaw + "$") -// idType just changes the error message func checkValidContainerID(id string, idType string) error { if id == UVMContainerID || validContainerIDRegex.MatchString(id) { return nil } - return errors.Errorf("invalid %s id: %s (must match %s)", idType, id, validContainerIDRegex.String()) } -// VirtualPod represents a virtual pod that shares a UVM/Sandbox with other pods -type VirtualPod struct { - VirtualSandboxID string - MasterSandboxID string - NetworkNamespace string - CgroupPath string - CgroupControl cgroup.Manager // Unified cgroup manager (v1 or v2) - Containers map[string]bool // containerID -> exists - CreatedAt time.Time +// uvmPod represents a pod within the UVM. +// Used for V1 sandbox containers, virtual pods, and V2 native sandboxes. +type uvmPod struct { + sandboxID string + networkNamespace string + cgroupPath string + cgroupControl cgroup.Manager + containers map[string]bool + createdAt time.Time } -// Host is the structure tracking all UVM host state including all containers +// Host is the structure tracking all UVM host state including all pods and containers // and processes. type Host struct { containersMutex sync.Mutex @@ -93,11 +89,9 @@ type Host struct { externalProcessesMutex sync.Mutex externalProcesses map[int]*externalProcess - // Virtual pod support for multi-pod scenarios - virtualPodsMutex sync.Mutex - virtualPods map[string]*VirtualPod // virtualSandboxID -> VirtualPod - containerToVirtualPod map[string]string // containerID -> virtualSandboxID - virtualPodsCgroupManager cgroup.Manager // Parent cgroup for all virtual pods + // pods tracks all pods (sandboxes) in this UVM. + podsMutex sync.Mutex + pods map[string]*uvmPod // sandboxRoots maps sandboxID to the resolved sandbox root directory. // Populated via RegisterSandboxRoot during sandbox creation using @@ -125,16 +119,15 @@ func NewHost(rtime runtime.Runtime, vsock transport.Transport, initialEnforcer s logWriter, ) return &Host{ - containers: make(map[string]*Container), - externalProcesses: make(map[int]*externalProcess), - virtualPods: make(map[string]*VirtualPod), - containerToVirtualPod: make(map[string]string), - sandboxRoots: make(map[string]string), - rtime: rtime, - vsock: vsock, - devNullTransport: &transport.DevNullTransport{}, - hostMounts: newHostMounts(), - securityOptions: securityPolicyOptions, + containers: make(map[string]*Container), + externalProcesses: make(map[int]*externalProcess), + pods: make(map[string]*uvmPod), + sandboxRoots: make(map[string]string), + rtime: rtime, + vsock: vsock, + devNullTransport: &transport.DevNullTransport{}, + hostMounts: newHostMounts(), + securityOptions: securityPolicyOptions, } } @@ -217,22 +210,32 @@ func (h *Host) RemoveContainer(id string) { return } - // Check if this container is part of a virtual pod - virtualPodID, isVirtualPod := c.spec.Annotations[annotations.VirtualPodID] - if isVirtualPod { - // Remove from virtual pod tracking - h.RemoveContainerFromVirtualPod(id) - // Network namespace cleanup is handled in virtual pod cleanup when last container is removed. - logrus.WithFields(logrus.Fields{ - "containerID": id, - "virtualPodID": virtualPodID, - }).Info("Container removed from virtual pod") - } else { - // delete the network namespace for standalone and sandbox containers - criType, isCRI := c.spec.Annotations[annotations.KubernetesContainerType] - if !isCRI || criType == "sandbox" { - _ = RemoveNetworkNamespace(context.Background(), id) + // Delete the network namespace for standalone and sandbox containers. + criType, isCRI := c.spec.Annotations[annotations.KubernetesContainerType] + if !isCRI || criType == "sandbox" { + _ = RemoveNetworkNamespace(context.Background(), id) + } + + // Update the pod tracking. + if c.sandboxID != "" { + h.podsMutex.Lock() + if pod, exists := h.pods[c.sandboxID]; exists { + if criType == "container" { + // For a workload container, just remove the reference from the pod. + delete(pod.containers, c.id) + } + if criType == "sandbox" || criType == "" { + // For a sandbox container, delete the pod's cgroup and the pod itself. + if pod.cgroupControl != nil { + if err := pod.cgroupControl.Delete(); err != nil { + logrus.WithError(err).WithField("sandboxID", c.sandboxID). + Warn("failed to delete pod cgroup") + } + } + delete(h.pods, c.sandboxID) + } } + h.podsMutex.Unlock() } delete(h.containers, id) @@ -377,31 +380,24 @@ func checkContainerSettings(sandboxID, containerID string, settings *prot.VMHost func (h *Host) CreateContainer(ctx context.Context, id string, settings *prot.VMHostedContainerSettingsV2) (_ *Container, err error) { criType, isCRI := settings.OCISpecification.Annotations[annotations.KubernetesContainerType] - // Check for virtual pod annotation - virtualPodID, isVirtualPod := settings.OCISpecification.Annotations[annotations.VirtualPodID] - - if h.HasSecurityPolicy() { - if err = checkValidContainerID(id, "container"); err != nil { - return nil, err - } - if virtualPodID != "" { - if err = checkValidContainerID(virtualPodID, "virtual pod"); err != nil { - return nil, err - } - } - } - - // Special handling for virtual pod sandbox containers: - // The first container in a virtual pod (containerID == virtualPodID) should be treated as a sandbox - // even if the CRI annotation might indicate otherwise due to host-side UVM setup differences - if isVirtualPod && id == virtualPodID { + // For virtual pods, the first container (containerID == virtualPodID) is the sandbox. + virtualPodID := settings.OCISpecification.Annotations[annotations.VirtualPodID] + if virtualPodID != "" && id == virtualPodID { criType = "sandbox" isCRI = true - logrus.WithFields(logrus.Fields{ - "containerID": id, - "virtualPodID": virtualPodID, - "originalCriType": settings.OCISpecification.Annotations[annotations.KubernetesContainerType], - }).Info("Virtual pod first container detected - treating as sandbox container") + } + + // Determine the sandbox ID. For sandbox containers it's the container's own ID. + // For virtual pods it's the virtual pod ID. For workloads it comes from the annotation. + sandboxID := id + if criType == "container" { + sid := settings.OCISpecification.Annotations[annotations.KubernetesSandboxID] + if sid == "" { + return nil, errors.Errorf("unsupported 'io.kubernetes.cri.sandbox-id': '%s'", sid) + } + sandboxID = sid + } else if virtualPodID != "" { + sandboxID = virtualPodID } c := &Container{ @@ -409,6 +405,7 @@ func (h *Host) CreateContainer(ctx context.Context, id string, settings *prot.VM vsock: h.vsock, spec: settings.OCISpecification, ociBundlePath: settings.OCIBundlePath, + sandboxID: sandboxID, isSandbox: criType == "sandbox", exitType: prot.NtUnexpectedExit, processes: make(map[uint32]*containerProcess), @@ -425,54 +422,20 @@ func (h *Host) CreateContainer(ctx context.Context, id string, settings *prot.VM } }() - // Handle virtual pod logic - if isVirtualPod && isCRI { - logrus.WithFields(logrus.Fields{ - "containerID": id, - "virtualPodID": virtualPodID, - "criType": criType, - }).Info("Processing container for virtual pod") - - if criType == "sandbox" { - // This is a virtual pod sandbox - create the virtual pod if it doesn't exist - if _, exists := h.GetVirtualPod(virtualPodID); !exists { - // Use the network namespace ID from the current container spec - // Virtual pods share the same network namespace - networkNamespace := specGuest.GetNetworkNamespaceID(settings.OCISpecification) - if networkNamespace == "" { - networkNamespace = fmt.Sprintf("virtual-pod-%s", virtualPodID) - } - - if err := h.CreateVirtualPod(ctx, virtualPodID, virtualPodID, networkNamespace, settings.OCISpecification); err != nil { - return nil, errors.Wrapf(err, "failed to create virtual pod %s", virtualPodID) - } - } - } - - // Add this container to the virtual pod - if err := h.AddContainerToVirtualPod(id, virtualPodID); err != nil { - return nil, errors.Wrapf(err, "failed to add container %s to virtual pod %s", id, virtualPodID) - } - } - - // Normally we would be doing policy checking here at the start of our - // "policy gated function". However, we can't for create container as we - // need a properly correct sandboxID which might be changed by the code - // below that determines the sandboxID. This is a bit of future proofing - // as currently for our single use case, the sandboxID is the same as the - // container id - var namespaceID string - // for sandbox container sandboxID is same as container id - sandboxID := id if isCRI { switch criType { case "sandbox": - // Capture namespaceID if any because setupSandboxContainerSpec clears the Windows section. + // Capture namespaceID before setupSandboxContainerSpec clears the Windows section. namespaceID = specGuest.GetNetworkNamespaceID(settings.OCISpecification) - // Resolve the sandbox root from OCIBundlePath. - sandboxRoot := h.registerSandboxRoot(id, settings.OCIBundlePath, virtualPodID) + // Create the pod within the UVM. + if err := h.createPodInUVM(sandboxID, settings.OCISpecification, namespaceID); err != nil { + return nil, err + } + + // Register the sandbox root from OCIBundlePath. + sandboxRoot := h.registerSandboxRoot(sandboxID, settings.OCIBundlePath, virtualPodID) c.sandboxRoot = sandboxRoot err = setupSandboxContainerSpec(ctx, id, sandboxRoot, settings.OCISpecification) @@ -502,19 +465,12 @@ func (h *Host) CreateContainer(ctx context.Context, id string, settings *prot.VM return nil, err } case "container": - sid, ok := settings.OCISpecification.Annotations[annotations.KubernetesSandboxID] - sandboxID = sid - if h.HasSecurityPolicy() { - if err = checkValidContainerID(sid, "sandbox"); err != nil { - return nil, err - } - } - if !ok || sid == "" { - return nil, errors.Errorf("unsupported 'io.kubernetes.cri.sandbox-id': '%s'", sid) - } - sandboxRoot := h.resolveSandboxRoot(sid) + // Add this container to the pod's tracking. + h.addContainerToPod(sandboxID, id) + + sandboxRoot := h.resolveSandboxRoot(sandboxID) c.sandboxRoot = sandboxRoot - if err = setupWorkloadContainerSpec(ctx, sid, id, sandboxRoot, settings.OCISpecification, settings.OCIBundlePath); err != nil { + if err = setupWorkloadContainerSpec(ctx, sandboxID, id, sandboxRoot, settings.OCISpecification, settings.OCIBundlePath); err != nil { return nil, err } @@ -1480,174 +1436,51 @@ func isPrivilegedContainerCreationRequest(ctx context.Context, spec *specs.Spec) return oci.ParseAnnotationsBool(ctx, spec.Annotations, annotations.LCOWPrivileged, false) } -// Virtual Pod Management Methods +// Pod Management Methods -// InitializeVirtualPodSupport sets up the parent cgroup for virtual pods -func (h *Host) InitializeVirtualPodSupport(mgr cgroup.Manager) error { - h.virtualPodsMutex.Lock() - defer h.virtualPodsMutex.Unlock() +// createPodInUVM creates a new pod with its own cgroup. +func (h *Host) createPodInUVM(sid string, pSpec *specs.Spec, nsID string) error { + h.podsMutex.Lock() + defer h.podsMutex.Unlock() - if mgr == nil { - return errors.New("no valid cgroup manager provided for virtual pod support") + if _, exists := h.pods[sid]; exists { + return fmt.Errorf("pod with id %s already exists", sid) } - h.virtualPodsCgroupManager = mgr - logrus.Info("Virtual pod support initialized") - return nil -} - -// CreateVirtualPod creates a new virtual pod with its own cgroup and network namespace -func (h *Host) CreateVirtualPod(ctx context.Context, virtualSandboxID, masterSandboxID, networkNamespace string, pSpec *specs.Spec) error { - h.virtualPodsMutex.Lock() - defer h.virtualPodsMutex.Unlock() + cgroupPath := path.Join("/pods", sid) - // Check if virtual pod already exists - if _, exists := h.virtualPods[virtualSandboxID]; exists { - return fmt.Errorf("virtual pod %s already exists", virtualSandboxID) - } - - // Extract resource limits if provided resources := &specs.LinuxResources{} if pSpec != nil && pSpec.Linux != nil && pSpec.Linux.Resources != nil { resources = pSpec.Linux.Resources - logrus.WithFields(logrus.Fields{ - "virtualSandboxID": virtualSandboxID, - }).Info("Creating virtual pod with specified resources") - } else { - logrus.WithField("virtualSandboxID", virtualSandboxID).Info("Creating pod cgroup with default resources as none were specified") } - if h.virtualPodsCgroupManager == nil { - return fmt.Errorf("no virtual pod cgroup manager available") - } - - cgroupPath := path.Join("/containers/virtual-pods", virtualSandboxID) cgroupControl, err := cgroup.NewManager(cgroupPath, resources) if err != nil { - return errors.Wrapf(err, "failed to create cgroup for virtual pod %s", virtualSandboxID) + return errors.Wrapf(err, "failed to create cgroup for pod %s", sid) } - logrus.WithField("cgroupPath", cgroupPath).Info("Created virtual pod cgroup") - // Create virtual pod structure - virtualPod := &VirtualPod{ - VirtualSandboxID: virtualSandboxID, - MasterSandboxID: masterSandboxID, - NetworkNamespace: networkNamespace, - CgroupPath: cgroupPath, - CgroupControl: cgroupControl, - Containers: make(map[string]bool), - CreatedAt: time.Now(), + h.pods[sid] = &uvmPod{ + sandboxID: sid, + networkNamespace: nsID, + cgroupPath: cgroupPath, + cgroupControl: cgroupControl, + containers: make(map[string]bool), + createdAt: time.Now(), } - h.virtualPods[virtualSandboxID] = virtualPod - logrus.WithFields(logrus.Fields{ - "virtualSandboxID": virtualSandboxID, - "masterSandboxID": masterSandboxID, - "cgroupPath": cgroupPath, - "networkNamespace": networkNamespace, - }).Info("Virtual pod created successfully") + "sandboxID": sid, + "cgroupPath": cgroupPath, + }).Info("pod created in UVM") return nil } -// CreateVirtualPodWithoutMemoryLimit creates a virtual pod without memory limits (backward compatibility) -func (h *Host) CreateVirtualPodWithoutMemoryLimit(ctx context.Context, virtualSandboxID, masterSandboxID, networkNamespace string) error { - return h.CreateVirtualPod(ctx, virtualSandboxID, masterSandboxID, networkNamespace, nil) -} - -// GetVirtualPod retrieves a virtual pod by its virtualSandboxID -func (h *Host) GetVirtualPod(virtualSandboxID string) (*VirtualPod, bool) { - h.virtualPodsMutex.Lock() - defer h.virtualPodsMutex.Unlock() - - vp, exists := h.virtualPods[virtualSandboxID] - return vp, exists -} - -// AddContainerToVirtualPod associates a container with a virtual pod -func (h *Host) AddContainerToVirtualPod(containerID, virtualSandboxID string) error { - h.virtualPodsMutex.Lock() - defer h.virtualPodsMutex.Unlock() - - // Check if virtual pod exists - vp, exists := h.virtualPods[virtualSandboxID] - if !exists { - return fmt.Errorf("virtual pod %s does not exist", virtualSandboxID) - } - - // Add container to virtual pod - vp.Containers[containerID] = true - h.containerToVirtualPod[containerID] = virtualSandboxID - - logrus.WithFields(logrus.Fields{ - "containerID": containerID, - "virtualSandboxID": virtualSandboxID, - }).Info("Container added to virtual pod") - - return nil -} - -// RemoveContainerFromVirtualPod removes a container from a virtual pod -func (h *Host) RemoveContainerFromVirtualPod(containerID string) { - h.virtualPodsMutex.Lock() - defer h.virtualPodsMutex.Unlock() - - virtualSandboxID, exists := h.containerToVirtualPod[containerID] - if !exists { - return // Container not in any virtual pod - } - - // Remove from virtual pod - if vp, vpExists := h.virtualPods[virtualSandboxID]; vpExists { - delete(vp.Containers, containerID) - - // If this is the sandbox container, delete the network namespace - if containerID == virtualSandboxID && vp.NetworkNamespace != "" { - if err := RemoveNetworkNamespace(context.Background(), vp.NetworkNamespace); err != nil { - logrus.WithError(err).WithField("virtualSandboxID", virtualSandboxID). - Warn("Failed to remove virtual pod network namespace (sandbox container removal)") - } - } - - // If this was the last container, cleanup the virtual pod - if len(vp.Containers) == 0 { - h.cleanupVirtualPod(virtualSandboxID) - } - } - - delete(h.containerToVirtualPod, containerID) - - logrus.WithFields(logrus.Fields{ - "containerID": containerID, - "virtualSandboxID": virtualSandboxID, - }).Info("Container removed from virtual pod") -} - -// cleanupVirtualPod removes a virtual pod and its cgroup (should be called with mutex held) -func (h *Host) cleanupVirtualPod(virtualSandboxID string) { - if vp, exists := h.virtualPods[virtualSandboxID]; exists { - // Delete the cgroup - if vp.CgroupControl != nil { - if err := vp.CgroupControl.Delete(); err != nil { - logrus.WithError(err).WithField("virtualSandboxID", virtualSandboxID). - Warn("Failed to delete virtual pod cgroup") - } - } - - // Clean up network namespace if this is the last virtual pod using it - // Only remove if this virtual pod was managing the network namespace - if vp.NetworkNamespace != "" { - // For virtual pods, the network namespace is shared, so we only clean it up - // when the virtual pod itself is being destroyed - if err := RemoveNetworkNamespace(context.Background(), vp.NetworkNamespace); err != nil { - logrus.WithError(err).WithField("virtualSandboxID", virtualSandboxID). - Warn("Failed to remove virtual pod network namespace") - } - } - - delete(h.virtualPods, virtualSandboxID) - - logrus.WithField("virtualSandboxID", virtualSandboxID).Info("Virtual pod cleaned up") +// addContainerToPod adds a container reference to the pod. +func (h *Host) addContainerToPod(sandboxID, containerID string) { + h.podsMutex.Lock() + defer h.podsMutex.Unlock() + if pod, exists := h.pods[sandboxID]; exists { + pod.containers[containerID] = true } } diff --git a/internal/guest/runtime/hcsv2/workload_container.go b/internal/guest/runtime/hcsv2/workload_container.go index d118a4d35c..0a89bfcbcc 100644 --- a/internal/guest/runtime/hcsv2/workload_container.go +++ b/internal/guest/runtime/hcsv2/workload_container.go @@ -224,17 +224,8 @@ func setupWorkloadContainerSpec(ctx context.Context, sbid, id, sandboxRoot strin } } - // Check if this is a virtual pod container - virtualPodID := spec.Annotations[annotations.VirtualPodID] - - // Set cgroup path - check if this is a virtual pod container - if virtualPodID != "" { - // Virtual pod containers go under /containers/virtual-pods/virtualPodID/containerID - spec.Linux.CgroupsPath = "/containers/virtual-pods/" + virtualPodID + "/" + id - } else { - // Regular containers go under /containers - spec.Linux.CgroupsPath = "/containers/" + id - } + // Set cgroup path under the pod's cgroup. + spec.Linux.CgroupsPath = "/pods/" + sbid + "/" + id if spec.Windows != nil { // we only support Nvidia gpus right now