Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions runsc/cgroup/cgroup.go
Original file line number Diff line number Diff line change
Expand Up @@ -430,6 +430,32 @@ func new(pid, cgroupsPath string, useSystemd bool) (Cgroup, error) {
return cg, nil
}

// InstallSubcontainerCompatDir makes sure the cgroup directory that cg
// resolves to exists on the host, so that cAdvisor (and other inotify-based
// tools) can discover subcontainers running inside a gVisor sandbox. If res is
// non-nil, the limit/spec interface files that cAdvisor exposes as
// container_spec_* (memory.max, cpu.max, cpu.weight, pids.max, ...) are filled
// in from it on a best-effort basis. The directory is recorded on cg, so
// cg.Uninstall() removes it when the container is destroyed.
//
// Dispatch depends on the concrete cgroup implementation:
//
//   - systemd v2: the dbus StartTransientUnit path is bypassed. A process-less
//     transient unit is not needed for inotify discovery and causes lifecycle
//     conflicts with systemd.
//   - cgroupfs (v1 or non-systemd v2): the call is forwarded to Install(res),
//     which already creates the directory and writes the spec files through
//     the per-controller set() methods.
//
// The compat cgroup intentionally holds no processes, so limits written here
// have no kernel-side accounting effect. They exist only so that cAdvisor's
// container_spec_* series are populated for runsc pods, matching what runc
// produces. Runtime counter series (container_*_total) remain zero by design;
// see #13067.
func InstallSubcontainerCompatDir(cg Cgroup, res *specs.LinuxResources) error {
	switch impl := cg.(type) {
	case *cgroupSystemd:
		return impl.installCompatDir(res)
	default:
		return cg.Install(res)
	}
}

// CgroupJSON is a wrapper for Cgroup that can be encoded to JSON.
type CgroupJSON struct {
Cgroup Cgroup
Expand Down
78 changes: 78 additions & 0 deletions runsc/cgroup/systemd.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,20 @@ import (
systemdDbus "github.com/coreos/go-systemd/v22/dbus"
dbus "github.com/godbus/dbus/v5"
specs "github.com/opencontainers/runtime-spec/specs-go"
"golang.org/x/sys/unix"
"gvisor.dev/gvisor/pkg/cleanup"
"gvisor.dev/gvisor/pkg/log"
)

// compatDirControllers lists the cgroup v2 controllers that installCompatDir
// populates on the compat cgroup directory for cAdvisor's container_spec_*
// series. Only controllers whose interface files cAdvisor reads as spec values
// are included (memory.max, cpu.max / cpu.weight, pids.max). cpuset, io and
// hugetlb are deliberately left out: they do not surface as container_spec_*,
// and writing them would widen the failure surface on hosts where they are not
// enabled in the parent slice's cgroup.subtree_control.
var compatDirControllers = []string{
	"cpu",
	"memory",
	"pids",
}

var (
// ErrBadResourceSpec indicates that a cgroupSystemd function was
// passed a specs.LinuxResources object that is impossible or illegal
Expand Down Expand Up @@ -92,6 +102,74 @@ func newCgroupV2Systemd(cgv2 *cgroupV2) (*cgroupSystemd, error) {
return cg, err
}

// installCompatDir creates this cgroup's directory under the parent slice
// without registering a transient systemd unit. The directory is recorded in
// c.Own (at most once) so that the embedded cgroupV2.Uninstall removes it at
// container destroy.
//
// If res is non-nil, the controllers listed in compatDirControllers are asked
// to write their interface files (memory.max, cpu.max, cpu.weight, pids.max)
// into the directory. Those writes are best-effort: an error accepted by
// isCompatDirIgnorableErr is logged at debug level and skipped, while any
// other error is returned to the caller.
//
// The purpose is to let tools that discover containers via inotify on
// /sys/fs/cgroup (notably cAdvisor) see subcontainers that run inside a gVisor
// sandbox and so have no real cgroup placement of their own. Install() on
// systemd v2 only stages dbus properties; the directory would otherwise be
// created by Join() via StartTransientUnit, which would inappropriately
// register a process-less unit for compat purposes.
//
// Callers that only need the host-side cgroup directory to exist (optionally
// with its spec files populated) should use this instead of Install(...).
func (c *cgroupSystemd) installCompatDir(res *specs.LinuxResources) error {
	dir := c.MakePath("")
	if err := os.MkdirAll(dir, 0o755); err != nil {
		return fmt.Errorf("creating compat cgroup dir %q: %w", dir, err)
	}

	// Record ownership only once so that repeated calls do not pile up
	// duplicate entries in c.Own.
	tracked := false
	for i := 0; i < len(c.Own) && !tracked; i++ {
		tracked = c.Own[i] == dir
	}
	if !tracked {
		c.Own = append(c.Own, dir)
	}

	// Without resources there is nothing to write; the directory alone is
	// enough for discovery.
	if res == nil {
		return nil
	}

	// Best-effort spec-file population. Controllers that are not enabled in
	// the parent slice's cgroup.subtree_control have no leaf interface files,
	// so the write fails with ENOENT and is skipped. The compat path must not
	// block container start (#6657 precedent).
	for _, name := range compatDirControllers {
		ctrlr, known := controllers2[name]
		if !known {
			continue
		}
		err := ctrlr.set(res, dir)
		switch {
		case err == nil:
			// Spec files for this controller were written.
		case isCompatDirIgnorableErr(err):
			log.Debugf("Skipping %q spec-file population for compat cgroup %q: %v", name, dir, err)
		default:
			return fmt.Errorf("populating %q spec files for compat cgroup %q: %w", name, dir, err)
		}
	}
	return nil
}

// isCompatDirIgnorableErr reports whether err, returned by a best-effort
// spec-file write on the compat cgroup, may be safely ignored. Three
// conditions qualify: the interface file does not exist (controller not in
// subtree_control), permission is denied (rootless, restricted user
// namespace), or the cgroup mount is read-only (sandboxed environments). None
// of these should block container start.
func isCompatDirIgnorableErr(err error) bool {
	ignorable := [...]error{os.ErrNotExist, os.ErrPermission, unix.EROFS}
	for _, target := range ignorable {
		if errors.Is(err, target) {
			return true
		}
	}
	return false
}

// Install configures the properties for a scope unit but does not start the
// unit.
func (c *cgroupSystemd) Install(res *specs.LinuxResources) error {
Expand Down
224 changes: 224 additions & 0 deletions runsc/cgroup/systemd_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@ package cgroup

import (
"errors"
"os"
"path/filepath"
"strconv"
"strings"
"testing"

systemdDbus "github.com/coreos/go-systemd/v22/dbus"
Expand Down Expand Up @@ -269,6 +272,227 @@ func TestInstall(t *testing.T) {
}
}

// newCompatDirTestCgroup builds a cgroupSystemd rooted at a fresh temporary
// mountpoint whose parent slice directory already exists, mirroring what
// kubelet+systemd arrange on a real host. The leaf scope directory is
// deliberately left absent, because creating it is the job of
// installCompatDir, the unit under test. The second return value is the leaf
// directory path as resolved by MakePath("").
func newCompatDirTestCgroup(t *testing.T) (*cgroupSystemd, string) {
	t.Helper()

	root := t.TempDir()
	if err := os.MkdirAll(filepath.Join(root, "/parent.slice"), 0o755); err != nil {
		t.Fatalf("mkdir parent slice: %v", err)
	}

	// newCgroupV2Systemd sets Path to the expanded slice + unit name; do the
	// same here by hand.
	scopePath := filepath.Join(expandSlice("parent.slice"), "cri-containerd-abc.scope")
	cg := &cgroupSystemd{
		Name:        "abc",
		ScopePrefix: "cri-containerd",
		Parent:      "parent.slice",
		cgroupV2: cgroupV2{
			Mountpoint: root,
			Path:       scopePath,
		},
	}
	return cg, cg.MakePath("")
}

// seedCompatLeafFiles creates leafDir (and any missing parents) and then
// creates an empty file for each entry in names inside it. This stands in for
// the v2 controller interface files that the kernel would auto-create on a
// real cgroupfs mount when the corresponding controllers are enabled in the
// parent's cgroup.subtree_control. Without these files, setValue (which uses
// O_WRONLY|O_TRUNC and does not create) returns ENOENT, which is the
// controller-not-enabled path.
//
// Existing files are truncated. Any failure to create the directory, or to
// create or close a file, aborts the test via t.Fatalf.
func seedCompatLeafFiles(t *testing.T, leafDir string, names ...string) {
	t.Helper()
	if err := os.MkdirAll(leafDir, 0o755); err != nil {
		t.Fatalf("mkdir leaf dir: %v", err)
	}
	for _, name := range names {
		f, err := os.Create(filepath.Join(leafDir, name))
		if err != nil {
			t.Fatalf("create %s: %v", name, err)
		}
		// Check Close too, so a failed seed surfaces here instead of as a
		// confusing assertion failure later in the test.
		if err := f.Close(); err != nil {
			t.Fatalf("close %s: %v", name, err)
		}
	}
}

// readCgroupFile reads the cgroup interface file called name under leafDir
// and returns its contents with leading and trailing whitespace removed.
// Trimming lets callers compare against bare values whether or not the file
// ends in a newline. A read failure aborts the test via t.Fatalf.
func readCgroupFile(t *testing.T, leafDir, name string) string {
	t.Helper()
	b, err := os.ReadFile(filepath.Join(leafDir, name))
	if err != nil {
		t.Fatalf("read %s: %v", name, err)
	}
	return strings.TrimSpace(string(b))
}

// TestInstallCompatDir exercises the directory lifecycle of installCompatDir
// with nil resources:
//
//  1. the leaf directory is created at the resolved scope path under the
//     parent slice (so cAdvisor can discover it via inotify);
//  2. the path is recorded exactly once in c.Own, even across repeated calls;
//  3. the embedded cgroupV2.Uninstall removes the directory again.
//
// This is the cgroup v2 + systemd counterpart of #6500 / PR #6657, which
// created cAdvisor compat directories on cgroup v1.
func TestInstallCompatDir(t *testing.T) {
	cg, wantDir := newCompatDirTestCgroup(t)

	expected := filepath.Join(cg.Mountpoint, "/parent.slice", "cri-containerd-abc.scope")
	if wantDir != expected {
		t.Fatalf("MakePath() = %q, want %q", wantDir, expected)
	}
	if _, err := os.Stat(wantDir); !os.IsNotExist(err) {
		t.Fatalf("compat dir already exists or unexpected error before install: %v", err)
	}

	// First install: directory appears and is tracked.
	if err := cg.installCompatDir(nil); err != nil {
		t.Fatalf("installCompatDir(nil) error: %v", err)
	}
	switch info, err := os.Stat(wantDir); {
	case err != nil:
		t.Fatalf("compat dir not created: %v", err)
	case !info.IsDir():
		t.Fatalf("compat dir %q is not a directory", wantDir)
	}
	// Uninstall relies on Own to know what to clean up.
	if len(cg.Own) != 1 || cg.Own[0] != wantDir {
		t.Fatalf("c.Own = %v, want exactly [%q]", cg.Own, wantDir)
	}

	// Second install: must succeed on the existing directory and must not
	// add a duplicate Own entry (no leaked entries on retries).
	if err := cg.installCompatDir(nil); err != nil {
		t.Fatalf("installCompatDir(nil) second call error: %v", err)
	}
	if n := len(cg.Own); n != 1 {
		t.Fatalf("len(c.Own) after second installCompatDir = %d, want 1", n)
	}

	// Teardown: Uninstall has to delete what installCompatDir created.
	if err := cg.Uninstall(); err != nil {
		t.Fatalf("Uninstall() error: %v", err)
	}
	if _, err := os.Stat(wantDir); !os.IsNotExist(err) {
		t.Fatalf("compat dir still exists after Uninstall: %v", err)
	}
}

// TestInstallCompatDirSpecFiles checks that installCompatDir, given non-nil
// LinuxResources, writes the expected values into the cgroup interface files
// (memory.max, memory.swap.max, cpu.max, cpu.weight, pids.max). The leaf
// files are created up front to simulate what the kernel auto-creates on a
// real cgroupfs mount when the corresponding controllers are enabled in the
// parent slice's cgroup.subtree_control.
func TestInstallCompatDirSpecFiles(t *testing.T) {
	cg, leafDir := newCompatDirTestCgroup(t)
	seedCompatLeafFiles(t, leafDir,
		"memory.max", "memory.swap.max", "memory.low",
		"cpu.max", "cpu.weight",
		"pids.max",
	)

	var (
		limitBytes  int64  = 536870912  // 512 MiB
		memPlusSwap int64  = 1073741824 // 1 GiB combined memory+swap (runc-style)
		quota       int64  = 50000
		period      uint64 = 100000
		shares      uint64 = 2048
	)
	maxPids := int64(100)

	mem := &specs.LinuxMemory{Limit: &limitBytes, Swap: &memPlusSwap}
	cpu := &specs.LinuxCPU{Quota: &quota, Period: &period, Shares: &shares}
	pids := &specs.LinuxPids{Limit: maxPids}
	res := &specs.LinuxResources{Memory: mem, CPU: cpu, Pids: pids}

	if err := cg.installCompatDir(res); err != nil {
		t.Fatalf("installCompatDir(res) error: %v", err)
	}

	const wantMemMax = "536870912"
	if got := readCgroupFile(t, leafDir, "memory.max"); got != wantMemMax {
		t.Errorf("memory.max = %q, want %q", got, wantMemMax)
	}
	// On v2 the swap file holds the swap-only amount (memorySwap - memory).
	const wantSwapMax = "536870912"
	if got := readCgroupFile(t, leafDir, "memory.swap.max"); got != wantSwapMax {
		t.Errorf("memory.swap.max = %q, want %q", got, wantSwapMax)
	}
	const wantCPUMax = "50000 100000"
	if got := readCgroupFile(t, leafDir, "cpu.max"); got != wantCPUMax {
		t.Errorf("cpu.max = %q, want %q", got, wantCPUMax)
	}
	// cpu.shares=2048 is translated to cpu.weight via the runc-compatible
	// formula; compute the expectation with the same helper.
	wantWeight := strconv.FormatUint(convertCPUSharesToCgroupV2Value(shares), 10)
	if got := readCgroupFile(t, leafDir, "cpu.weight"); got != wantWeight {
		t.Errorf("cpu.weight = %q, want %q", got, wantWeight)
	}
	const wantPidsMax = "100"
	if got := readCgroupFile(t, leafDir, "pids.max"); got != wantPidsMax {
		t.Errorf("pids.max = %q, want %q", got, wantPidsMax)
	}
}

// TestInstallCompatDirBestEffort covers the best-effort contract of
// installCompatDir. With no controller interface files present (as when the
// controllers are not enabled in the parent slice's cgroup.subtree_control on
// a real host), the missing-file errors from setValue must be swallowed: the
// directory is still created and tracked, and the call reports success. This
// preserves the #6657 invariant that the compat path must never block
// container start.
func TestInstallCompatDirBestEffort(t *testing.T) {
	cg, leafDir := newCompatDirTestCgroup(t)

	limit := int64(536870912)
	res := &specs.LinuxResources{Memory: &specs.LinuxMemory{Limit: &limit}}

	// No leaf interface files are seeded on purpose: every set() attempt
	// runs into ENOENT, which installCompatDir has to tolerate.
	err := cg.installCompatDir(res)
	if err != nil {
		t.Fatalf("installCompatDir(res) with no leaf files: got error %v, want nil (best-effort)", err)
	}

	if _, statErr := os.Stat(leafDir); statErr != nil {
		t.Fatalf("compat dir not created: %v", statErr)
	}
	if len(cg.Own) != 1 || cg.Own[0] != leafDir {
		t.Fatalf("c.Own = %v, want exactly [%q]", cg.Own, leafDir)
	}
}

// TestInstallSubcontainerCompatDirSystemd checks that the public dispatcher
// sends systemd cgroups through installCompatDir (rather than the dbus
// Install path) and that the resources it is handed end up in the leaf's spec
// files.
//
// Uninstall() is covered by TestInstallCompatDir, where no spec files are
// written and unix.Rmdir therefore succeeds on the empty leaf. It is
// deliberately not called here: on a real cgroupfs mount the kernel
// atomically removes the auto-generated interface files when an empty cgroup
// directory is rmdir-ed, whereas in an ordinary tmpdir the spec files written
// by this test are real files that block rmdir. That is a test-environment
// artifact, not a behavior difference that production code should encode.
func TestInstallSubcontainerCompatDirSystemd(t *testing.T) {
	cg, leafDir := newCompatDirTestCgroup(t)

	// Seed memory.max only; it is enough to show end-to-end that resources
	// given to the dispatcher reach the per-controller set() methods.
	seedCompatLeafFiles(t, leafDir, "memory.max")

	limit := int64(123456789)
	res := &specs.LinuxResources{Memory: &specs.LinuxMemory{Limit: &limit}}

	err := InstallSubcontainerCompatDir(cg, res)
	if err != nil {
		t.Fatalf("InstallSubcontainerCompatDir(systemd, res) error: %v", err)
	}
	if _, statErr := os.Stat(leafDir); statErr != nil {
		t.Fatalf("compat dir not created via InstallSubcontainerCompatDir: %v", statErr)
	}

	const wantMemMax = "123456789"
	if got := readCgroupFile(t, leafDir, "memory.max"); got != wantMemMax {
		t.Errorf("memory.max = %q, want %q", got, wantMemMax)
	}
}

// filterProperties filters the list of properties in got to ones with
// the names of properties specified in want.
func filterProperties(got []systemdDbus.Property, want []systemdDbus.Property) []systemdDbus.Property {
Expand Down
23 changes: 21 additions & 2 deletions runsc/container/container.go
Original file line number Diff line number Diff line change
Expand Up @@ -1961,8 +1961,27 @@ func (c *Container) setupCgroupForSubcontainer(conf *config.Config, spec *specs.
if cg == nil || err != nil {
return nil, err
}
// Use empty resources, just want the directory structure created.
return cgroupInstall(conf, cg, &specs.LinuxResources{})
// Want the directory structure created so cAdvisor (and other
// inotify-based tools) can discover the subcontainer, and the limit
// files populated from the OCI resources so cAdvisor's
// container_spec_* series report real values. The compat cgroup is
// process-less, so these limits have no kernel-side accounting effect;
// they exist solely for cAdvisor compatibility. On systemd v2,
// Install({}) is a no-op for directory creation -- the dir is otherwise
// only created in Join() via StartTransientUnit, which would
// inappropriately register a process-less unit for compat purposes.
// InstallSubcontainerCompatDir handles that case while preserving the
// existing cgroupfs (v1 / non-systemd v2) behavior.
if err := cgroup.InstallSubcontainerCompatDir(cg, spec.Linux.Resources); err != nil {
switch {
case (errors.Is(err, unix.EACCES) || errors.Is(err, unix.EROFS)) && conf.Rootless:
log.Warningf("Skipping subcontainer cgroup configuration in rootless mode: %v", err)
return nil, nil
default:
return nil, fmt.Errorf("configuring subcontainer cgroup: %v", err)
}
}
return cg, nil
}

// cgroupInstall creates cgroups dir structure and sets their respective
Expand Down