Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 72 additions & 0 deletions src/core/guest.c
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,47 @@ static pthread_mutex_t pt_lock = PTHREAD_MUTEX_INITIALIZER; /* Lock order: 2 */
/* Track whether the 80% warning has been emitted (avoid log spam) */
static bool pt_pool_warned = false;

/* Return the host VM page size, probing sysconf(_SC_PAGESIZE) only on the
 * first call and caching the answer in a function-local static.
 * Falls back to GUEST_PAGE_SIZE when sysconf reports an error.
 * NOTE(review): the lazily-written static assumes first use happens under
 * the callers' existing locking, or that a duplicate identical write is
 * benign — confirm if new multi-threaded paths start calling this.
 */
static size_t guest_host_page_size_cached(void)
{
    static size_t host_page_size;

    if (host_page_size == 0) {
        long probed = sysconf(_SC_PAGESIZE);
        if (probed > 0)
            host_page_size = (size_t) probed;
        else
            host_page_size = GUEST_PAGE_SIZE;
    }
    return host_page_size;
}

/* Drop a region's host file overlay: zero the window bounds and mark the
 * overlay inactive so msync/refresh paths fall back to snapshot emulation.
 */
static void guest_region_clear_overlay(guest_region_t *r)
{
    r->overlay_start = 0;
    r->overlay_end = 0;
    r->overlay_active = false;
}

/* Re-clamp a region's overlay window after the region's [start, end) bounds
 * changed (trim or split). The overlay window may legitimately extend past
 * the guest-visible range up to host-page granularity — only part of a host
 * page may be guest-visible — so the clamp targets the host-page-rounded
 * region bounds rather than the exact ones. If the region itself is empty,
 * or the clamped window collapses, the overlay is deactivated entirely.
 */
static void guest_region_clip_overlay(guest_region_t *r)
{
    if (!r->overlay_active || r->end <= r->start) {
        guest_region_clear_overlay(r);
        return;
    }

    size_t hps = guest_host_page_size_cached();
    uint64_t lo = ALIGN_DOWN(r->start, hps);
    uint64_t hi = ALIGN_UP(r->end, hps);

    /* Clamp [overlay_start, overlay_end) into [lo, hi). */
    uint64_t new_start = r->overlay_start;
    if (new_start < lo)
        new_start = lo;

    uint64_t new_end = r->overlay_end;
    if (new_end > hi)
        new_end = hi;

    if (new_start >= new_end) {
        guest_region_clear_overlay(r);
        return;
    }

    r->overlay_start = new_start;
    r->overlay_end = new_end;
}

/* Allocate a zeroed 4KiB page from the page table pool.
* Returns GPA of the page, or 0 on pool exhaustion.
* Acquires pt_lock internally. Caller typically holds mmap_lock.
Expand Down Expand Up @@ -304,6 +345,12 @@ int guest_init(guest_t *g, uint64_t size, uint32_t ipa_bits)
return -1;
}

/* Seed HVF segment list with one entry covering the whole slab.
* sys_mmap may later split this for MAP_SHARED file overlays.
*/
g->segments[0] = (hvf_segment_t) {.ipa = GUEST_IPA_BASE, .len = size};
g->n_segments = 1;

return 0;
}

Expand Down Expand Up @@ -371,6 +418,13 @@ int guest_init_from_shm(guest_t *g,
return -1;
}

/* Seed HVF segment list. The child re-establishes any per-region file
* overlays the parent had after this call (handled by fork-state
* deserialization).
*/
g->segments[0] = (hvf_segment_t) {.ipa = GUEST_IPA_BASE, .len = size};
g->n_segments = 1;

log_debug(
"guest: CoW fork: mapped %llu GiB from shm "
"(ipa=%u bits)",
Expand All @@ -390,6 +444,13 @@ void guest_destroy(guest_t *g)
hv_vcpu_destroy(g->vcpu);
g->vcpu = 0;
}
/* Unmap each HVF segment. hv_vm_destroy releases all stage-2 state
* regardless, but unmapping explicitly keeps invariants clean for
* downstream tools (Instruments, leak detectors).
*/
for (int i = 0; i < g->n_segments; i++)
hv_vm_unmap(g->segments[i].ipa, g->segments[i].len);
g->n_segments = 0;
hv_vm_destroy();
if (g->host_base) {
munmap(g->host_base, g->guest_size);
Expand Down Expand Up @@ -901,6 +962,8 @@ static bool regions_mergeable(const guest_region_t *a, const guest_region_t *b)
*/
if (a->noreserve != b->noreserve)
return false;
if (a->overlay_active || b->overlay_active)
return false;
if (strcmp(a->name, b->name) != 0)
return false;

Expand Down Expand Up @@ -1014,6 +1077,7 @@ int guest_region_add_ex_owned(guest_t *g,
r->backing_fd = owned_backing_fd;
r->shared = (flags & 0x01) != 0; /* LINUX_MAP_SHARED = 0x01 */
r->noreserve = (flags & 0x4000) != 0; /* LINUX_MAP_NORESERVE = 0x4000 */
guest_region_clear_overlay(r);
if (name) {
str_copy_trunc(r->name, name, sizeof(r->name));
} else {
Expand Down Expand Up @@ -1062,13 +1126,15 @@ void guest_region_remove(guest_t *g, uint64_t start, uint64_t end)
uint64_t trimmed = end - r->start;
r->offset += trimmed;
r->start = end;
guest_region_clip_overlay(r);
i++;
continue;
}

/* Partial overlap: removal range cuts the end */
if (r->start < start && r->end > start && r->end <= end) {
r->end = start;
guest_region_clip_overlay(r);
i++;
continue;
}
Expand Down Expand Up @@ -1117,6 +1183,8 @@ void guest_region_remove(guest_t *g, uint64_t start, uint64_t end)

/* Left half keeps the original entry and shortens its end. */
r->end = start;
guest_region_clip_overlay(r);
guest_region_clip_overlay(right);

g->nregions++;
i += 2; /* skip both halves */
Expand Down Expand Up @@ -1173,6 +1241,7 @@ void guest_region_set_prot(guest_t *g, uint64_t start, uint64_t end, int prot)
g->nregions++;
/* Left half keeps original prot and backing_fd */
g->regions[i].end = start;
guest_region_clip_overlay(&g->regions[i]);
/* Right half will be processed next iteration */
g->regions[i + 1].offset += (start - g->regions[i + 1].start);
g->regions[i + 1].start = start;
Expand All @@ -1185,6 +1254,7 @@ void guest_region_set_prot(guest_t *g, uint64_t start, uint64_t end, int prot)
"split: %s",
strerror(errno));
}
guest_region_clip_overlay(&g->regions[i + 1]);
i++; /* advance to the right half */
r = &g->regions[i];
}
Expand Down Expand Up @@ -1213,6 +1283,7 @@ void guest_region_set_prot(guest_t *g, uint64_t start, uint64_t end, int prot)
/* Left half: [r->start, end) with new prot */
g->regions[i].end = end;
g->regions[i].prot = prot;
guest_region_clip_overlay(&g->regions[i]);
/* Right half: [end, old_end) keeps original prot */
g->regions[i + 1].offset += (end - g->regions[i + 1].start);
g->regions[i + 1].start = end;
Expand All @@ -1225,6 +1296,7 @@ void guest_region_set_prot(guest_t *g, uint64_t start, uint64_t end, int prot)
"end-split: %s",
strerror(errno));
}
guest_region_clip_overlay(&g->regions[i + 1]);
if (first_modified < 0)
first_modified = i;
last_modified = i;
Expand Down
41 changes: 40 additions & 1 deletion src/core/guest.h
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,26 @@ typedef struct {
*/
#define GUEST_MAX_REGIONS 4096

/* HVF stage-2 mapping segment. The slab is mapped to HVF in pieces so that
* file-backed MAP_SHARED regions can have real host-VA overlays applied via
* mmap MAP_FIXED|MAP_SHARED of a file fd. HVF requires hv_vm_unmap to target
* an exactly-previously-mapped range; sub-range unmap of a larger map fails
* with HV_BAD_ARGUMENT. To allow a 2 MiB-aligned middle range to be unmapped +
* remapped (refreshing HVF stage-2 caching after a host mmap MAP_FIXED), the
* slab is split into 2 MiB-aligned segments around each affected block. All
* segments are 2 MiB-aligned and 2 MiB-sized at minimum.
*
* 256 segments is generous: each MAP_SHARED file mmap costs at most 2 new
* segments (left/right of the carved block), and most workloads keep that count
* well under 50.
*/
/* One contiguous hv_vm_map'd stage-2 piece of the guest slab.
 * Both fields are kept 2 MiB-aligned by construction (see the rationale
 * comment above): HVF can only unmap exactly-previously-mapped ranges,
 * so the slab is carved at 2 MiB granularity around overlay blocks. */
typedef struct {
uint64_t ipa; /* 2 MiB-aligned IPA start */
uint64_t len; /* 2 MiB-aligned length */
} hvf_segment_t;

/* Capacity of guest_t.segments[]; each MAP_SHARED file mmap adds at most
 * two segments, so 256 leaves ample headroom for typical workloads. */
#define GUEST_MAX_HVF_SEGMENTS 256

/* A semantic memory region tracked for munmap/mprotect and /proc/self/maps.
* Distinct from mem_region_t which is used purely for page table construction.
* Regions are kept sorted by start address in guest_t.regions[].
Expand All @@ -120,7 +140,17 @@ typedef struct {
int backing_fd; /* Duplicated host fd for file-backed mappings, or -1 */
bool shared; /* MAP_SHARED (writes should propagate) */
bool noreserve; /* MAP_NORESERVE: PTEs deferred until fault */
char name[64]; /* Label: "[heap]", "[stack]", ELF path, etc. */
bool overlay_active; /* Region has a live host MAP_FIXED|MAP_SHARED overlay
* of backing_fd at host_base+start. The kernel's
* page cache keeps it coherent with the file and
* with peer overlays of the same file, so msync
* skips the snapshot-style pwrite-the-diff and
* refresh-from-file paths for these regions. */
uint64_t overlay_start; /* Host-page-aligned overlay start. May extend
* outside [start, end) when only part of a host
* page is guest-visible. */
uint64_t overlay_end; /* Host-page-aligned overlay end (exclusive). */
char name[64]; /* Label: "[heap]", "[stack]", ELF path, etc. */
} guest_region_t;

/* Guest state. */
Expand Down Expand Up @@ -160,6 +190,15 @@ typedef struct {
/* Semantic region tracking for munmap/mprotect/proc-self-maps */
guest_region_t regions[GUEST_MAX_REGIONS];
int nregions; /* Number of active regions */

/* HVF stage-2 segment list: the union of segments[0..n_segments) covers the
* live IPA range that is currently hv_vm_map'd to HVF. Sorted by ipa.
* Initially one segment spans the whole slab. See guest.h header comment on
* hvf_segment_t for the rationale.
*/
hvf_segment_t segments[GUEST_MAX_HVF_SEGMENTS];
int n_segments;

/* Page table generation counter: incremented on every PT modification.
* Used by the per-thread GVA TLB cache to detect stale entries.
* 64-bit to avoid wrap-around stale hits over long-running sessions.
Expand Down
14 changes: 13 additions & 1 deletion src/runtime/fork-state.c
Original file line number Diff line number Diff line change
Expand Up @@ -618,8 +618,20 @@ int fork_ipc_recv_process_state(int ipc_fd, guest_t *g, signal_state_t *sig)
return -1;
}
g->nregions = (int) num_guest_regions;
for (int i = 0; i < g->nregions; i++)
for (int i = 0; i < g->nregions; i++) {
g->regions[i].backing_fd = -1;
/* Demote inherited overlays: the child does not yet re-establish
* host MAP_FIXED|MAP_SHARED mappings from the parent's overlay
* fds, so msync, MADV_DONTNEED and friends must use the
* snapshot-style emulation. The CoW path's pre-fork sync of
* overlay bytes into shm_fd already gave the child snapshot the
* correct content at fork time. Live cross-fork MAP_SHARED
* coherence is the next P1 TODO item.
*/
g->regions[i].overlay_active = false;
g->regions[i].overlay_start = 0;
g->regions[i].overlay_end = 0;
}

if (fork_ipc_recv_backing_fds(ipc_fd, g) < 0)
return -1;
Expand Down
39 changes: 38 additions & 1 deletion src/runtime/forkipc.c
Original file line number Diff line number Diff line change
Expand Up @@ -984,8 +984,45 @@ int64_t sys_clone(hv_vcpu_t vcpu,
goto fail_snapshot;
}

/* CoW path: send shm fd to child via SCM_RIGHTS */
/* CoW path: sync MAP_SHARED file overlays back into shm_fd before
* sending it to the child. The parent's host VA at each overlay
* region maps the overlay file, not shm_fd, so shm_fd's content at
* those IPAs is stale (typically zero). The child's MAP_PRIVATE
* snapshot would expose that stale data at the overlay IPAs. Copy
* the live overlay bytes into shm_fd at the matching offsets so the
* child snapshot reflects the parent's view at fork time. Live
* cross-fork MAP_SHARED coherence (parent and child both seeing
* subsequent writes through the same file) is left to the cross-fork
* coherence TODO; this fix only avoids the stale-snapshot regression.
*/
if (use_shm) {
for (int i = 0; i < g->nregions; i++) {
const guest_region_t *r = &g->regions[i];
if (!r->overlay_active)
continue;
uint64_t len = r->end - r->start;
const uint8_t *src = (const uint8_t *) g->host_base + r->start;
uint64_t off = r->start;
while (len > 0) {
size_t chunk = len > (uint64_t) SSIZE_MAX ? (size_t) SSIZE_MAX
: (size_t) len;
ssize_t nw = pwrite(g->shm_fd, src, chunk, (off_t) off);
if (nw < 0) {
if (errno == EINTR)
continue;
log_error("clone: shm overlay sync pwrite failed: %s",
strerror(errno));
goto fail_snapshot;
}
if (nw == 0) {
log_error("clone: shm overlay sync pwrite returned 0");
goto fail_snapshot;
}
src += nw;
off += (uint64_t) nw;
len -= (uint64_t) nw;
}
}
if (fork_ipc_send_fds(ipc_sock, &g->shm_fd, 1) < 0) {
log_error("clone: failed to send shm fd");
goto fail_snapshot;
Expand Down
Loading
Loading