Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 12 additions & 1 deletion cmd/gcs/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -334,9 +334,20 @@ func main() {
// Write 1 to memory.use_hierarchy on the root cgroup to enable hierarchy
// support. This needs to be set before we create any cgroups as the write
// will fail otherwise. This is only needed for cgroup v1.
//
// On kernels where CONFIG_MEMCG_V1 is disabled (e.g. 6.18+), the v1
// memory controller does not exist and this path will be absent even
// though init fell back to v1 for other controllers. Without the memory
// controller, container memory limits cannot be enforced and a workload
// could OOM the entire UVM, so we still crash but with a diagnostic
// message pointing at the kernel config.
if !cgroup.IsCgroupV2() {
if err := os.WriteFile("/sys/fs/cgroup/memory/memory.use_hierarchy", []byte("1"), 0644); err != nil {
logrus.WithError(err).Fatal("failed to enable hierarchy support for root cgroup")
if os.IsNotExist(err) {
logrus.WithError(err).Fatal("v1 memory controller not available; the kernel may lack CONFIG_MEMCG_V1 -- enable it or use a kernel with cgroup v2 + CONFIG_CGROUP_BPF")
} else {
logrus.WithError(err).Fatal("failed to enable hierarchy support for root cgroup")
}
}
}

Expand Down
95 changes: 76 additions & 19 deletions init/init.c
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@
#include <sys/wait.h>
#include <unistd.h>
#include <dirent.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

#ifdef MODULES
#include <ftw.h>
Expand Down Expand Up @@ -312,7 +314,8 @@ void init_fs(const struct InitOp* ops, size_t count) {
}
}

// Check if cgroup v2 should be disabled via kernel parameter
// Check if cgroup v2 should be disabled via kernel/init parameter.
// Recognizes "hcsshim.cgroup=v1" to force v1.
bool is_cgroup_v2_disabled() {
FILE* f = fopen("/proc/cmdline", "r");
if (f == NULL) {
Expand All @@ -324,9 +327,9 @@ bool is_cgroup_v2_disabled() {
// Kernel cmdline is space-separated; tokenize and match exactly.
char* token = strtok(cmdline, " \n");
while (token) {
if (strcmp(token, "cgroup_no_v2=all") == 0) {
if (strcmp(token, "hcsshim.cgroup=v1") == 0) {
fclose(f);
dmesgInfo("cgroup v2: disabled by kernel parameter cgroup_no_v2=all\n");
dmesgInfo("cgroup v2: disabled by kernel parameter hcsshim.cgroup=v1\n");
return true;
}
token = strtok(NULL, " \n");
Expand Down Expand Up @@ -392,16 +395,51 @@ bool try_init_cgroups_v2() {
}

// Check if controllers file appeared
if (access("/sys/fs/cgroup/cgroup.controllers", F_OK) == 0) {
dmesgInfo("cgroup v2: test mount successful, controllers found\n");
enable_subtree_control();
return true; // Leave it mounted, cgroup v2 is ready
if (access("/sys/fs/cgroup/cgroup.controllers", F_OK) != 0) {
dmesgInfo("cgroup v2: test mount succeeded but no controllers\n");
umount("/sys/fs/cgroup");
return false;
}

// Mount worked but no controllers - clean up
dmesgInfo("cgroup v2: test mount succeeded but no controllers\n");
umount("/sys/fs/cgroup");
return false;
dmesgInfo("cgroup v2: test mount successful, controllers found\n");

// Probe BPF cgroup support. runc's cgroup v2 device controller requires
// BPF_PROG_QUERY on BPF_CGROUP_DEVICE. Detect this early and fall back
// to cgroup v1 if BPF cgroup is not available.
//
// Possible error codes from bpf(BPF_PROG_QUERY):
// ENOSYS — CONFIG_BPF_SYSCALL not set (no BPF syscall at all)
// EINVAL — BPF syscall exists but CONFIG_CGROUP_BPF not set, or
// BPF_CGROUP_DEVICE attach type not recognized
// EPERM — BPF cgroup works, just no permission (means it's available)
// 0 — success, BPF cgroup works
//
// Only EPERM or success indicate BPF_CGROUP_DEVICE is functional.
int cgfd = open("/sys/fs/cgroup", O_RDONLY | O_DIRECTORY);
if (cgfd < 0) {
dmesgWarn("cgroup v2: cannot open /sys/fs/cgroup for BPF probe, falling back to v1\n");
umount("/sys/fs/cgroup");
return false;
}
{
union bpf_attr attr = {0};
attr.query.target_fd = cgfd;
attr.query.attach_type = BPF_CGROUP_DEVICE;
long ret = syscall(__NR_bpf, BPF_PROG_QUERY, &attr, sizeof(attr));
// Capture errno before close(), which may clobber it.
int err = errno;
close(cgfd);
if (ret < 0 && (err == ENOSYS || err == EINVAL)) {
dmesgWarn("cgroup v2: BPF cgroup not supported, falling back to v1\n");
umount("/sys/fs/cgroup");
return false;
}
// EPERM or success both mean BPF cgroup is functional.
dmesgInfo("cgroup v2: BPF cgroup probe ok\n");
}

enable_subtree_control();
return true; // Leave it mounted, cgroup v2 is ready
}

void init_cgroups_v1() {
Expand All @@ -423,6 +461,7 @@ void init_cgroups_v1() {
break;
}
}
int mounted = 0;
for (;;) {
static const char base_path[] = "/sys/fs/cgroup/";
char path[sizeof(base_path) - 1 + 64];
Expand All @@ -444,18 +483,23 @@ void init_cgroups_v1() {
if (mount(name, path, "cgroup", MS_NODEV | MS_NOSUID | MS_NOEXEC, name) < 0) {
die2("mount", path);
}
mounted++;
}
}
fclose(f);
if (mounted == 0) {
die("cgroup init failed: v2 unavailable and no v1 controllers enabled");
}
}

void init_cgroups() {
if (try_init_cgroups_v2()) {
dmesgInfo("Using cgroup v2\n");
} else {
dmesgInfo("Using cgroup v1\n");
init_cgroups_v1();
return;
}

dmesgInfo("Using cgroup v1\n");
init_cgroups_v1();
}

void init_network(const char* iface, int domain) {
Expand All @@ -481,16 +525,25 @@ void init_network(const char* iface, int domain) {
close(s);
}

// inject boot-time entropy after reading it from a vsock port
// inject boot-time entropy after reading it from a vsock port.
// Failures are non-fatal: entropy seeding improves randomness quality
// but the UVM can still function without it. On some kernels (e.g.,
// Ubuntu 6.17), the vsock read may fail with ENOMEM after hv_sock
// module load if the transport buffers are not yet fully initialized.
void init_entropy(int port) {
int s = openvsock(VMADDR_CID_HOST, port);
if (s < 0) {
die("openvsock entropy");
warn("openvsock entropy");
dmesgWarn("entropy: failed to open vsock, skipping entropy seeding\n");
return;
}

int e = open("/dev/random", O_RDWR);
if (e < 0) {
die("open /dev/random");
warn("open /dev/random");
dmesgWarn("entropy: failed to open /dev/random, skipping\n");
close(s);
return;
}

struct {
Expand All @@ -502,7 +555,9 @@ void init_entropy(int port) {
for (;;) {
ssize_t n = read(s, buf.buf, sizeof(buf.buf));
if (n < 0) {
die("read entropy");
warn("read entropy");
dmesgWarn("entropy: vsock read failed, continuing without full entropy\n");
break;
}

if (n == 0) {
Expand All @@ -512,7 +567,9 @@ void init_entropy(int port) {
buf.entropy_count = n * 8; // in bits
buf.buf_size = n; // in bytes
if (ioctl(e, RNDADDENTROPY, &buf) < 0) {
die("ioctl(RNDADDENTROPY)");
warn("ioctl(RNDADDENTROPY)");
dmesgWarn("entropy: RNDADDENTROPY ioctl failed, continuing\n");
break;
}
}

Expand Down