Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
137 changes: 115 additions & 22 deletions src/runtime/fork-state.c
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
#include "debug/log.h"
#include "syscall/abi.h"
#include "syscall/internal.h"
#include "syscall/mem.h"
#include "syscall/proc.h"

int fork_ipc_write_all(int fd, const void *buf, size_t len)
Expand Down Expand Up @@ -494,7 +495,9 @@ static int fork_ipc_drain_bytes(int ipc_fd, uint32_t len)
return 0;
}

static int fork_ipc_recv_backing_fds(int ipc_fd, guest_t *g)
static int fork_ipc_recv_backing_fds(int ipc_fd,
guest_t *g,
const bool *parent_had_fd)
{
uint32_t nbacking;
if (fork_ipc_read_all(ipc_fd, &nbacking, sizeof(nbacking)) < 0) {
Expand All @@ -518,19 +521,59 @@ static int fork_ipc_recv_backing_fds(int ipc_fd, guest_t *g)
.msg_controllen = cmsg_sz,
};
ssize_t nr = recvmsg(ipc_fd, &msg, 0);
if (nr > 0) {
struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
if (cmsg && cmsg->cmsg_level == SOL_SOCKET &&
cmsg->cmsg_type == SCM_RIGHTS) {
int *region_fds = (int *) CMSG_DATA(cmsg);
uint32_t fi = 0;
for (int i = 0; i < g->nregions && fi < nbacking; i++) {
if (!(g->regions[i].flags & LINUX_MAP_ANONYMOUS) &&
g->regions[i].offset != (uint64_t) -1) {
g->regions[i].backing_fd = region_fds[fi++];
}
}
}
if (nr <= 0) {
free(cmsg_buf);
return -1;
}

struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
if (msg.msg_flags & MSG_CTRUNC) {
log_error("fork-child: backing fd SCM_RIGHTS payload truncated");
free(cmsg_buf);
return -1;
}
if (!cmsg || cmsg->cmsg_level != SOL_SOCKET ||
cmsg->cmsg_type != SCM_RIGHTS) {
log_error("fork-child: missing backing fd SCM_RIGHTS payload");
free(cmsg_buf);
return -1;
}
if (cmsg->cmsg_len < CMSG_LEN(0)) {
free(cmsg_buf);
return -1;
}

int *region_fds = (int *) CMSG_DATA(cmsg);
uint32_t nreceived =
(uint32_t) ((cmsg->cmsg_len - CMSG_LEN(0)) / sizeof(int));
uint32_t fi = 0;

/* Sender (fork_ipc_send_backing_fds) iterates regions and sends one fd per
* region with backing_fd >= 0. The receiver must iterate in the same order
* over regions that had backing_fd in the parent. parent_had_fd[i] is
* captured by the caller before backing_fd is cleared.
*
* The original filter (!MAP_ANONYMOUS && offset != -1) matched extra
* regions like the shim and ELF text, so the first received fd was
* misassigned and the actual file-backed region was left without
* backing_fd.
*/
for (int i = 0; i < g->nregions && fi < nreceived; i++) {
if (parent_had_fd && parent_had_fd[i])
g->regions[i].backing_fd = region_fds[fi++];
}

/* Close any received fds that did not get assigned: avoids leaking host fds
* into the child's process table when a mismatch occurs.
*/
while (fi < nreceived)
close(region_fds[fi++]);

if (nreceived != nbacking) {
log_error("fork-child: expected %u backing fds but received %u",
nbacking, nreceived);
free(cmsg_buf);
return -1;
}
free(cmsg_buf);
return 0;
Expand Down Expand Up @@ -618,23 +661,73 @@ int fork_ipc_recv_process_state(int ipc_fd, guest_t *g, signal_state_t *sig)
return -1;
}
g->nregions = (int) num_guest_regions;

/* Capture parent state before clearing the inherited overlay/backing fd
* fields. parent_had_fd lets fork_ipc_recv_backing_fds assign the received
* fds in the same order the sender used (regions with backing_fd >= 0);
* parent_active and the parent_ovl_start/parent_ovl_end arrays tell
* mmap_fork_restore_overlays which regions to re-install and with what
* overlay span. The arrays are heap-allocated to avoid pushing hundreds of
* KiB onto the recv stack frame.
*/
bool *parent_had_fd = NULL;
bool *parent_active = NULL;
uint64_t *parent_ovl_start = NULL;
uint64_t *parent_ovl_end = NULL;
if (g->nregions > 0) {
parent_had_fd = calloc((size_t) g->nregions, sizeof(*parent_had_fd));
parent_active = calloc((size_t) g->nregions, sizeof(*parent_active));
parent_ovl_start =
calloc((size_t) g->nregions, sizeof(*parent_ovl_start));
parent_ovl_end = calloc((size_t) g->nregions, sizeof(*parent_ovl_end));
if (!parent_had_fd || !parent_active || !parent_ovl_start ||
!parent_ovl_end) {
log_error("fork-child: parent overlay buffer alloc failed");
free(parent_had_fd);
free(parent_active);
free(parent_ovl_start);
free(parent_ovl_end);
return -1;
}
for (int i = 0; i < g->nregions; i++) {
parent_had_fd[i] = (g->regions[i].backing_fd >= 0);
parent_active[i] = g->regions[i].overlay_active;
parent_ovl_start[i] = g->regions[i].overlay_start;
parent_ovl_end[i] = g->regions[i].overlay_end;
}
}

for (int i = 0; i < g->nregions; i++) {
g->regions[i].backing_fd = -1;
/* Demote inherited overlays: the child does not yet re-establish
* host MAP_FIXED|MAP_SHARED mappings from the parent's overlay
* fds, so msync, MADV_DONTNEED and friends must use the
* snapshot-style emulation. The CoW path's pre-fork sync of
* overlay bytes into shm_fd already gave the child snapshot the
* correct content at fork time. Live cross-fork MAP_SHARED
* coherence is the next P1 TODO item.
/* Drop inherited overlay metadata; the host MAP_FIXED|MAP_SHARED
* mapping does not exist yet in the child. Re-establishment runs after
* fork_ipc_recv_backing_fds populates backing_fd from the
* parent-supplied SCM_RIGHTS bundle.
*/
g->regions[i].overlay_active = false;
g->regions[i].overlay_start = 0;
g->regions[i].overlay_end = 0;
}

if (fork_ipc_recv_backing_fds(ipc_fd, g) < 0)
if (fork_ipc_recv_backing_fds(ipc_fd, g, parent_had_fd) < 0) {
free(parent_had_fd);
free(parent_active);
free(parent_ovl_start);
free(parent_ovl_end);
return -1;
}

/* Re-install MAP_SHARED overlays for every region the parent had as
* overlay_active and that now carries a backing fd. Failures here fall back
* to snapshot semantics for the affected region; the child still boots and
* can run.
*/
if (g->nregions > 0)
(void) mmap_fork_restore_overlays(g, parent_active, parent_ovl_start,
parent_ovl_end);
free(parent_had_fd);
free(parent_active);
free(parent_ovl_start);
free(parent_ovl_end);

if (fork_ipc_read_all(ipc_fd, sig, sizeof(*sig)) < 0) {
log_error("fork-child: failed to read signal state");
Expand Down
80 changes: 52 additions & 28 deletions src/runtime/forkipc.c
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@

#include "syscall/abi.h"
#include "syscall/internal.h"
#include "syscall/mem.h"
#include "syscall/net.h" /* absock namespace IPC state */
#include "syscall/poll.h" /* wakeup_pipe_signal */
#include "syscall/proc.h"
Expand Down Expand Up @@ -89,8 +90,8 @@ int fork_child_main(int ipc_fd, bool verbose, int timeout_sec)
absock_set_namespace_id(hdr.absock_namespace_id);
proc_set_session(hdr.sid, hdr.pgid);

/* Create guest memory before receiving state so all incoming offsets can
* be bounds-checked against the negotiated guest size.
/* Create guest memory before receiving state so all incoming offsets can be
* bounds-checked against the negotiated guest size.
*/
guest_t g;

Expand Down Expand Up @@ -176,6 +177,7 @@ int fork_child_main(int ipc_fd, bool verbose, int timeout_sec)
guest_destroy(&g);
return 1;
}

/* POSIX: "Signals pending to the parent shall not be pending to the child."
* Clear pending bitmask and RT queue before applying state.
* signal_set_state() is deferred until after thread_register_main()
Expand Down Expand Up @@ -218,17 +220,17 @@ int fork_child_main(int ipc_fd, bool verbose, int timeout_sec)
HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_TPIDR_EL0, regs.tpidr_el0));

/* Enable MMU directly (page tables already in guest memory from IPC).
* SCTLR must include MMU-enable (M), caches (C, I), RES1 bits,
* and EL0 cache maintenance access (UCI, UCT) for JIT translators.
* SCTLR must include MMU-enable (M), caches (C, I), RES1 bits, and EL0
* cache maintenance access (UCI, UCT) for JIT translators.
*/
uint64_t sctlr_with_mmu = SCTLR_RES1 | SCTLR_M | SCTLR_C | SCTLR_I |
SCTLR_DZE | SCTLR_UCT | SCTLR_UCI;
HV_CHECK(hv_vcpu_set_sys_reg(vcpu, HV_SYS_REG_SCTLR_EL1, sctlr_with_mmu));

/* Restore all 31 GPRs from parent state, then override X0=0 (child
* clone return value). This preserves X1-X30 exactly as they were when
* the parent called clone(), which is required by the Linux syscall ABI
* (especially callee-saved X19-X28, FP=X29, LR=X30).
/* Restore all 31 GPRs from parent state, then override X0=0 (child clone
* return value). This preserves X1-X30 exactly as they were when the parent
* called clone(), which is required by the Linux syscall ABI (especially
* callee-saved X19-X28, FP=X29, LR=X30).
*/
vcpu_restore_gprs(vcpu, regs.x);
vcpu_set_gpr(vcpu, 0, 0); /* Child gets 0 from clone */
Expand All @@ -246,14 +248,14 @@ int fork_child_main(int ipc_fd, bool verbose, int timeout_sec)

/* Register the fork child's main thread in the thread table.
* Without this, current_thread is NULL and any syscall handler that
* accesses per-thread state (signal masks, ptrace, CLONE_THREAD)
* will dereference NULL.
* accesses per-thread state (signal masks, ptrace, CLONE_THREAD) will
* dereference NULL.
*/
thread_register_main(vcpu, vexit, hdr.child_pid, regs.sp_el1);

/* Now that current_thread is set, apply signal state. This must happen
* after thread_register_main() so the per-thread blocked mask and
* altstack are properly restored to the thread entry.
* after thread_register_main() so the per-thread blocked mask and altstack
* are properly restored to the thread entry.
*/
signal_set_state(&sig);

Expand Down Expand Up @@ -921,6 +923,22 @@ int64_t sys_clone(hv_vcpu_t vcpu,
*/
thread_quiesce_siblings();

mmap_fork_anon_shared_txn_t *anon_shared_txn = NULL;
guest_region_t *regions_snapshot = NULL;

/* Convert MAP_SHARED|MAP_ANONYMOUS regions that have no backing fd
* into memfd-backed overlay regions. The conversion seeds a private
* temp file with the current bytes and installs a host
* MAP_SHARED|MAP_FIXED overlay on the parent. The child receives the
* fd via SCM_RIGHTS and re-installs its own overlay so subsequent
* writes from either side flow through the kernel page cache and
* reach the other. File-backed MAP_SHARED regions already carry a
* backing fd and are unaffected. Misaligned shared regions
* (snapshot-style) remain incoherent across fork by design.
*/
if (mmap_fork_prepare_anon_shared(g, &anon_shared_txn) < 0)
goto fail_snapshot;

/* Determine if elfuse can use the CoW (shm) fast path.
* If shm_fd >= 0, elfuse freezes a snapshot via MAP_PRIVATE and sends the
* shm fd to the child. Otherwise fall back to region-by-region copy.
Expand All @@ -947,8 +965,6 @@ int64_t sys_clone(hv_vcpu_t vcpu,
* but before sibling vCPUs resume. Declared up front so all goto paths to
* fail_snapshot can free it unconditionally.
*/
guest_region_t *regions_snapshot = NULL;

/* Header */
ipc_header_t hdr = {
.magic = IPC_MAGIC_HEADER,
Expand Down Expand Up @@ -1064,25 +1080,25 @@ int64_t sys_clone(hv_vcpu_t vcpu,
if (nregions_snapshot > 0) {
regions_snapshot = malloc(snap_sz);
if (!regions_snapshot) {
thread_resume_siblings();
close(ipc_sock);
return -LINUX_ENOMEM;
goto fail_snapshot;
}
memcpy(regions_snapshot, g->regions, snap_sz);
}

if (fork_ipc_send_fd_table(ipc_sock) < 0)
goto fail_snapshot;

/* Resume sibling vCPUs now that the memory snapshot, semantic region
* snapshot, and FD snapshot have been serialized.
*/
thread_resume_siblings();

uint32_t num_guest_regions = (uint32_t) nregions_snapshot;
if (fork_ipc_send_process_state(ipc_sock, regions_snapshot,
num_guest_regions) < 0)
goto fail_ipc;
goto fail_snapshot;

/* The process-state payload includes the SCM_RIGHTS handoff for region
* backing fds. Keep siblings quiesced until that send completes so a
* concurrent munmap/remap cannot close or recycle the captured fd numbers.
*/
thread_resume_siblings();
mmap_fork_commit_anon_shared(&anon_shared_txn);

close(ipc_sock);

Expand Down Expand Up @@ -1112,13 +1128,21 @@ int64_t sys_clone(hv_vcpu_t vcpu,
free(regions_snapshot);
return child_guest_pid;

fail_ipc:
free(regions_snapshot);
close(ipc_sock);
return -LINUX_ENOMEM;

fail_snapshot:
free(regions_snapshot);
/* Roll back the in-place anon-shared overlay conversion while
* siblings are still parked. A partial rollback failure (e.g.,
* region drift past the quiesce timeout) leaves the parent in a
* mixed state. The originating fork-IPC error remains the
* user-visible one, but abort failures are logged so a post-mortem
* can spot the lingering overlay without grepping for behavioral
* symptoms.
*/
int abort_rc = mmap_fork_abort_anon_shared(g, &anon_shared_txn);
if (abort_rc < 0)
log_warn(
"clone: anon-shared rollback partial failure (%d); parent "
"may have stale memfd-backed regions",
abort_rc);
thread_resume_siblings();
close(ipc_sock);
return -LINUX_ENOMEM;
Expand Down
Loading
Loading