Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1,666 changes: 1,063 additions & 603 deletions src/runtime/procemu.c

Large diffs are not rendered by default.

19 changes: 19 additions & 0 deletions src/runtime/procemu.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@

#include <stddef.h>
#include <sys/stat.h>
#include <sys/uio.h>
#include "core/guest.h"

/* Sentinel return value: path was not intercepted, caller should fall through
Expand Down Expand Up @@ -53,6 +54,24 @@ int proc_intercept_write(int guest_fd,
int use_pwrite,
ssize_t *written_out);

/* Intercept reads from synthetic proc files that must reflect shared state on
 * every read rather than the per-open temp-file snapshot.
 *
 * guest_fd: guest file descriptor being read.
 * buf:      destination buffer for the synthesized contents.
 * count:    capacity of buf in bytes.
 * offset:   file offset for the read.
 *           NOTE(review): how a non-positional read() is signalled (e.g. a
 *           negative offset) is not visible from this header -- confirm
 *           against the implementation.
 * read_out: receives the number of bytes produced when the call is handled.
 *
 * Returns 1 if handled (with *read_out set), 0 if not intercepted, or -1 on
 * error with errno set.
 */
int proc_intercept_read(int guest_fd,
void *buf,
size_t count,
int64_t offset,
ssize_t *read_out);

/* Vector form of proc_intercept_read for readv/preadv.
 *
 * iov/iovcnt describe the destination buffers in place of the single
 * buf/count pair taken by proc_intercept_read.
 * NOTE(review): presumably shares proc_intercept_read's return convention
 * (1 handled, 0 not intercepted, -1 with errno set) -- confirm against the
 * implementation.
 */
int proc_intercept_readv(int guest_fd,
const struct iovec *iov,
int iovcnt,
int64_t offset,
ssize_t *read_out);

/* Get the /dev/shm emulation directory path (creating it on first call).
* Used by sys_unlinkat to rewrite /dev/shm/<name> paths.
*/
Expand Down
150 changes: 117 additions & 33 deletions src/syscall/fd.c
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,53 @@ static int timerfd_alloc(void)
return sfd_alloc_slot(timerfd_state, TIMERFD_MAX, sizeof(timerfd_state[0]));
}

/* Fold expirations still queued on the timer's kqueue into the slot's
 * accumulator. Caller must hold sfd_lock.
 *
 * timerfd_read calls this before consuming the counter, and
 * timerfd_fdinfo_snapshot calls it before reporting it; skipping the drain
 * would leave fdinfo behind the real fire count by however many ticks were
 * still pending in the kqueue.
 */
static void timerfd_drain_pending_locked(int slot)
{
    struct timespec no_wait = {0, 0};
    struct kevent ev;

    /* A zero timeout turns this into a non-blocking poll of the kqueue. */
    if (kevent(timerfd_state[slot].kq_fd, NULL, 0, &ev, 1, &no_wait) <= 0)
        return;

    /* A delivered event counts as at least one expiration, even when the
     * reported count in ev.data is zero.
     */
    uint64_t fired = (uint64_t) ev.data;
    timerfd_state[slot].expirations += fired ? fired : 1;
}

/* Compute the time left until the timer in `slot` next fires.
 * Caller must hold sfd_lock.
 *
 * slot:   index into timerfd_state.
 * now_ns: current time in nanoseconds (callers pass a CLOCK_MONOTONIC
 *         reading).
 *
 * Returns nanoseconds until the next expiration, or 0 when the timer is
 * disarmed or a one-shot timer has already expired. For a repeating timer
 * whose first expiration has passed, the result is in (0, interval_ns].
 */
static int64_t timerfd_remaining_ns_locked(int slot, int64_t now_ns)
{
    if (!timerfd_state[slot].armed)
        return 0;

    /* Clamp so a now_ns sampled before the arm time reads as "nothing
     * elapsed" instead of inflating the remaining time.
     */
    int64_t elapsed = now_ns - timerfd_state[slot].arm_time_ns;
    if (elapsed < 0)
        elapsed = 0;

    int64_t initial = timerfd_state[slot].initial_ns;
    int64_t interval = timerfd_state[slot].interval_ns;

    if (interval <= 0) {
        /* One-shot: count down from the initial value and stop at zero. */
        int64_t remaining = initial - elapsed;
        return remaining > 0 ? remaining : 0;
    }

    /* Repeating, first expiration still ahead. */
    if (elapsed < initial)
        return initial - elapsed;

    /* Repeating, past the first expiration. (elapsed - initial) is
     * non-negative and interval is positive, so the modulo lies in
     * [0, interval) and the difference in (0, interval]; a zero result is
     * impossible and needs no special case.
     */
    return interval - ((elapsed - initial) % interval);
}

int64_t sys_timerfd_create(int clockid, int flags)
{
if (clockid != LINUX_CLOCK_REALTIME && clockid != LINUX_CLOCK_MONOTONIC)
Expand Down Expand Up @@ -203,8 +250,7 @@ int64_t sys_timerfd_settime(guest_t *g,
struct timespec now;
clock_gettime(CLOCK_MONOTONIC, &now);
int64_t now_ns = now.tv_sec * NS_PER_SEC + now.tv_nsec;
int64_t elapsed = now_ns - timerfd_state[slot].arm_time_ns;
int64_t remaining = timerfd_state[slot].initial_ns - elapsed;
int64_t remaining = timerfd_remaining_ns_locked(slot, now_ns);
if (remaining > 0) {
old.it_value_sec = remaining / NS_PER_SEC;
old.it_value_nsec = remaining % NS_PER_SEC;
Expand Down Expand Up @@ -319,27 +365,10 @@ int64_t sys_timerfd_gettime(guest_t *g, int fd, uint64_t curr_value_gva)
its.it_interval_sec = timerfd_state[slot].interval_ns / NS_PER_SEC;
its.it_interval_nsec = timerfd_state[slot].interval_ns % NS_PER_SEC;

/* Compute actual remaining time from arm time + initial value */
struct timespec now;
clock_gettime(CLOCK_MONOTONIC, &now);
int64_t now_ns = now.tv_sec * NS_PER_SEC + now.tv_nsec;
int64_t elapsed = now_ns - timerfd_state[slot].arm_time_ns;
int64_t remaining;

if (timerfd_state[slot].interval_ns > 0) {
/* Repeating timer: remaining = interval - (elapsed % interval) */
int64_t total = timerfd_state[slot].initial_ns;
if (elapsed >= total) {
int64_t since_first = elapsed - total;
remaining = timerfd_state[slot].interval_ns -
(since_first % timerfd_state[slot].interval_ns);
} else {
remaining = total - elapsed;
}
} else {
/* One-shot: remaining = initial - elapsed */
remaining = timerfd_state[slot].initial_ns - elapsed;
}
int64_t remaining = timerfd_remaining_ns_locked(slot, now_ns);

if (remaining <= 0) {
/* Timer already expired (one-shot) */
Expand Down Expand Up @@ -374,18 +403,8 @@ int64_t timerfd_read(int guest_fd, guest_t *g, uint64_t buf_gva, uint64_t count)

int kq = timerfd_state[slot].kq_fd;

/* Collect pending timer events via kevent(). The data field contains
* the number of times the timer fired since the last kevent() call.
*/
struct kevent kev;
struct timespec ts_zero = {0, 0};
int nev = kevent(kq, NULL, 0, &kev, 1, &ts_zero);
if (nev > 0) {
uint64_t fires = (uint64_t) kev.data;
if (fires == 0)
fires = 1; /* At least one expiration */
timerfd_state[slot].expirations += fires;
}
/* Collect pending timer events into the slot's accumulator. */
timerfd_drain_pending_locked(slot);

if (timerfd_state[slot].expirations == 0) {
/* No events yet; check if non-blocking */
Expand All @@ -408,8 +427,9 @@ int64_t timerfd_read(int guest_fd, guest_t *g, uint64_t buf_gva, uint64_t count)
* kevent() returns EBADF in that case, and the code re-validates the
* slot.
*/
struct kevent kev;
pthread_mutex_unlock(&sfd_lock);
nev = kevent(kq, NULL, 0, &kev, 1, NULL);
int nev = kevent(kq, NULL, 0, &kev, 1, NULL);
pthread_mutex_lock(&sfd_lock);
/* Re-validate: slot may have been freed by timerfd_close() */
if (timerfd_state[slot].guest_fd != guest_fd) {
Expand Down Expand Up @@ -1073,3 +1093,67 @@ void signalfd_notify(int signum)
}
pthread_mutex_unlock(&sfd_lock);
}

/* /proc/self/fdinfo type-specific snapshots. Each takes sfd_lock to prevent
* tearing across concurrent read/write/settime; lock order is fd_lock(3)
* -> sfd_lock(5a), and these accessors take only sfd_lock so the procemu
* caller is free to drop fd_lock between fd_snapshot and the lookup here.
*/

/* Copy the counter of the eventfd behind guest_fd into *count_out.
 * The lookup and the read both happen under sfd_lock.
 *
 * Returns true on success; returns false and leaves *count_out untouched
 * when eventfd_find() does not locate a slot for guest_fd.
 */
bool eventfd_fdinfo_snapshot(int guest_fd, uint64_t *count_out)
{
    bool found = false;

    pthread_mutex_lock(&sfd_lock);
    int idx = eventfd_find(guest_fd);
    if (idx >= 0) {
        *count_out = eventfd_state[idx].counter;
        found = true;
    }
    pthread_mutex_unlock(&sfd_lock);

    return found;
}

/* Copy the signal mask of the signalfd behind guest_fd into *mask_out.
 * The lookup and the read both happen under sfd_lock.
 *
 * Returns true on success; returns false and leaves *mask_out untouched
 * when signalfd_find() does not locate a slot for guest_fd.
 */
bool signalfd_fdinfo_snapshot(int guest_fd, uint64_t *mask_out)
{
    bool found = false;

    pthread_mutex_lock(&sfd_lock);
    int idx = signalfd_find(guest_fd);
    if (idx >= 0) {
        *mask_out = signalfd_state[idx].mask;
        found = true;
    }
    pthread_mutex_unlock(&sfd_lock);

    return found;
}

/* Snapshot the state of the timerfd behind guest_fd for
 * /proc/self/fdinfo. Everything is read under sfd_lock.
 *
 * clockid_out:     receives the slot's clock id.
 * ticks_out:       receives the accumulated expiration count, including
 *                  fires that were still pending on the kqueue.
 * value_ns_out:    receives nanoseconds until the next expiration, or 0
 *                  when the timer is disarmed.
 * interval_ns_out: receives the repeat interval in nanoseconds.
 *
 * Returns true on success; returns false and writes nothing when
 * timerfd_find() does not locate a slot for guest_fd.
 */
bool timerfd_fdinfo_snapshot(int guest_fd,
                             int *clockid_out,
                             uint64_t *ticks_out,
                             int64_t *value_ns_out,
                             int64_t *interval_ns_out)
{
    pthread_mutex_lock(&sfd_lock);
    int slot = timerfd_find(guest_fd);
    if (slot < 0) {
        pthread_mutex_unlock(&sfd_lock);
        return false;
    }

    /* Fold any pending kqueue fires into expirations before exporting,
     * matching what timerfd_read does. Without this, fdinfo lags by
     * however many ticks were sitting on the kqueue.
     */
    timerfd_drain_pending_locked(slot);

    *clockid_out = timerfd_state[slot].clockid;
    *ticks_out = timerfd_state[slot].expirations;
    *interval_ns_out = timerfd_state[slot].interval_ns;

    /* Only sample the clock when armed; a disarmed timer reports 0. */
    int64_t value_ns = 0;
    if (timerfd_state[slot].armed) {
        struct timespec now;
        clock_gettime(CLOCK_MONOTONIC, &now);
        int64_t now_ns = (int64_t) now.tv_sec * NS_PER_SEC + now.tv_nsec;
        value_ns = timerfd_remaining_ns_locked(slot, now_ns);
    }
    *value_ns_out = value_ns;

    pthread_mutex_unlock(&sfd_lock);
    return true;
}
12 changes: 12 additions & 0 deletions src/syscall/fd.h
Original file line number Diff line number Diff line change
Expand Up @@ -66,3 +66,15 @@ int64_t timerfd_read(int guest_fd,
* writes a byte to make poll/epoll see readability.
*/
void signalfd_notify(int signum);

/* Snapshot per-fd state for /proc/self/fdinfo. Each accessor returns true when
 * the guest_fd refers to a live instance of that special-fd type. The values
 * are read under sfd_lock so concurrent read/write/settime cannot tear them.
 * On a false return the output parameters are left unwritten.
 */

/* Reports the eventfd's current counter in *count_out. */
bool eventfd_fdinfo_snapshot(int guest_fd, uint64_t *count_out);

/* Reports the signalfd's signal mask in *mask_out. */
bool signalfd_fdinfo_snapshot(int guest_fd, uint64_t *mask_out);

/* Reports the timerfd's clock id, accumulated expiration count (pending
 * kqueue fires are folded in first), nanoseconds until the next expiration
 * (0 when disarmed), and repeat interval in nanoseconds.
 */
bool timerfd_fdinfo_snapshot(int guest_fd,
int *clockid_out,
uint64_t *ticks_out,
int64_t *value_ns_out,
int64_t *interval_ns_out);
18 changes: 13 additions & 5 deletions src/syscall/fs.c
Original file line number Diff line number Diff line change
Expand Up @@ -66,17 +66,15 @@ static const char *proc_virtual_dir_path(const char *path,

static const char *proc_stateful_file_path(const char *path)
{
if (!path || strncmp(path, "/proc", 5) != 0)
if (!path || strncmp(path, "/proc/", 6) != 0)
return NULL;

if (!strcmp(path, "/proc/self/oom_score_adj") ||
!strcmp(path, "/proc/self/oom_adj")) {
!strcmp(path, "/proc/self/oom_adj") ||
!strcmp(path, "/proc/self/oom_score")) {
return path;
}

if (strncmp(path, "/proc/", 6) != 0)
return NULL;

char *endp;
long pid = strtol(path + 6, &endp, 10);
if (endp == path + 6 || pid != (long) proc_get_pid())
Expand All @@ -86,6 +84,8 @@ static const char *proc_stateful_file_path(const char *path)
return "/proc/self/oom_score_adj";
if (!strcmp(endp, "/oom_adj"))
return "/proc/self/oom_adj";
if (!strcmp(endp, "/oom_score"))
return "/proc/self/oom_score";

return NULL;
}
Expand Down Expand Up @@ -117,9 +117,14 @@ static const char *proc_virtual_dir_path(const char *path,
virt = "/proc";
} else if (!strcmp(path, "/proc/self") || !strcmp(path, "/proc/self/")) {
virt = "/proc/self";
} else if (!strcmp(path, "/proc/net") || !strcmp(path, "/proc/net/")) {
virt = "/proc/net";
} else if (!strcmp(path, "/proc/self/fd") ||
!strcmp(path, "/proc/self/fd/")) {
virt = "/proc/self/fd";
} else if (!strcmp(path, "/proc/self/fdinfo") ||
!strcmp(path, "/proc/self/fdinfo/")) {
virt = "/proc/self/fdinfo";
} else if (!strcmp(path, "/proc/self/task") ||
!strcmp(path, "/proc/self/task/")) {
virt = "/proc/self/task";
Expand All @@ -137,6 +142,9 @@ static const char *proc_virtual_dir_path(const char *path,
if (endp != path + 6 && pid == (long) proc_get_pid() &&
(*endp == '\0' || !strcmp(endp, "/"))) {
virt = "/proc/self";
} else if (endp != path + 6 && pid == (long) proc_get_pid() &&
(!strcmp(endp, "/fdinfo") || !strcmp(endp, "/fdinfo/"))) {
virt = "/proc/self/fdinfo";
} else if (endp != path + 6 && pid == (long) proc_get_pid() &&
!strcmp(endp, "/fd")) {
virt = "/proc/self/fd";
Expand Down
Loading
Loading