-
Notifications
You must be signed in to change notification settings - Fork 243
Expand file tree
/
Copy pathstats-ring.c
More file actions
306 lines (277 loc) · 9.79 KB
/
stats-ring.c
File metadata and controls
306 lines (277 loc) · 9.79 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
/*
* Per-child stats ring buffer + parent-side aggregate.
*
* Children produce stats deltas into their own ring (write-only-by-owner);
* the parent drains every ring once per main_loop iteration and applies
* the deltas to a parent-private struct stats_aggregate that lives in
* MAP_PRIVATE memory invisible to the kernel. The kernel can no longer
* scribble those counters via a wild syscall arg pointer because the
* authoritative copy is not at any kernel-visible address.
*
* The mirror page (struct stats_published) carries the small subset of
* the aggregate that children also need to read -- currently just
* fleet_op_count for the strategy rotation clock and the syscalls_todo
* termination check. Republished once per drain.
*/
#include <errno.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <string.h>
#include <sys/mman.h>
#include "arch.h" /* page_size, PAGE_MASK */
#include "child.h"
#include "pids.h"
#include "shm.h"
#include "spsc-ring.h"
#include "stats_ring.h"
#include "trinity.h"
#include "utils.h"
/* Parent-side aggregate: apply_slot() and stats_ring_drain() accumulate
 * every drained delta into this struct. */
struct stats_aggregate parent_stats;
/* Mirror page that stats_publish_locked() republishes from parent_stats.
 * NULL until stats_published_init() has allocated it via alloc_shared(). */
struct stats_published *shm_published;
/*
 * Prepare a stats ring for use: clear the entire slot array, then
 * initialise the embedded SPSC base via spsc_ring_init().
 *
 * ring must be non-NULL; it is dereferenced unconditionally.
 */
void stats_ring_init(struct stats_ring *ring)
{
	memset(&ring->slots[0], 0, sizeof ring->slots);

	spsc_ring_init(&ring->base);
}
bool stats_ring_enqueue(struct stats_ring *ring, enum stats_field field,
uint16_t aux, uint32_t delta)
{
struct stats_ring_slot slot = {
.field_id = (uint16_t)field,
.aux = aux,
.delta = delta,
._reserved = 0,
};
if (ring == NULL)
return false;
return spsc_ring_try_enqueue(&ring->base, ring->slots, STATS_RING_SIZE,
sizeof(ring->slots[0]), &slot);
}
bool stats_ring_enqueue_call_complete(struct stats_ring *ring,
uint16_t category,
enum stats_result_class result)
{
struct stats_ring_slot slot = {
.field_id = (uint16_t)STATS_FIELD_CALL_COMPLETE,
.aux = category,
.delta = 1,
._reserved = (uint64_t)(uint8_t)result,
};
if (ring == NULL)
return false;
return spsc_ring_try_enqueue(&ring->base, ring->slots, STATS_RING_SIZE,
sizeof(ring->slots[0]), &slot);
}
/*
 * Drain callback handed to spsc_ring_drain(): fold one ring slot into
 * parent_stats.
 *
 * Slot contents are not trusted.  Children run a hostile fuzzed
 * workload, and a wild value-result syscall buffer that scribbled a
 * slot can leave any field at any value, so the field_id / aux pair is
 * validated before aux is ever used as an array index.
 *
 * p:   the slot to apply (a struct stats_ring_slot).
 * ctx: unused.
 */
static void apply_slot(const void *p, void *ctx __unused__)
{
	const struct stats_ring_slot *slot = p;
	enum stats_field which = (enum stats_field)slot->field_id;
	uint16_t idx = slot->aux;
	unsigned long inc = slot->delta;

	switch (which) {
	/* Plain scalar counters: add the delta, aux is ignored. */
	case STATS_FIELD_OP_COUNT:
		parent_stats.op_count += inc;
		break;
	case STATS_FIELD_FAULT_INJECTED:
		parent_stats.fault_injected += inc;
		break;
	case STATS_FIELD_FAULT_CONSUMED:
		parent_stats.fault_consumed += inc;
		break;
	case STATS_FIELD_SHARED_BUFFER_REDIRECTED:
		parent_stats.shared_buffer_redirected += inc;
		break;
	case STATS_FIELD_LIBC_HEAP_REDIRECTED:
		parent_stats.libc_heap_redirected += inc;
		break;
	case STATS_FIELD_LIBC_HEAP_EMBEDDED_REDIRECTED:
		parent_stats.libc_heap_embedded_redirected += inc;
		break;
	case STATS_FIELD_RANGE_OVERLAPS_SHARED_REJECTS:
		parent_stats.range_overlaps_shared_rejects += inc;
		break;
	case STATS_FIELD_GET_WRITABLE_SCRIBBLED_SHM_RANGE:
		parent_stats.get_writable_address_scribbled_shm_range += inc;
		break;
	case STATS_FIELD_GET_WRITABLE_SCRIBBLED_MPROTECT_MMAP:
		parent_stats.get_writable_address_scribbled_mprotect_mmap += inc;
		break;
	case STATS_FIELD_GET_WRITABLE_SCRIBBLED_MPROTECT_SHM:
		parent_stats.get_writable_address_scribbled_mprotect_shm += inc;
		break;
	case STATS_FIELD_GET_WRITABLE_SCRIBBLED_POSTMP_MMAP:
		parent_stats.get_writable_address_scribbled_postmp_mmap += inc;
		break;
	case STATS_FIELD_GET_WRITABLE_SCRIBBLED_POSTMP_SHM:
		parent_stats.get_writable_address_scribbled_postmp_shm += inc;
		break;
	case STATS_FIELD_GET_WRITABLE_ENOMEM_EXHAUSTED:
		parent_stats.get_writable_address_enomem_exhausted += inc;
		break;
	case STATS_FIELD_CHILDREN_RECYCLED_ON_STORM:
		parent_stats.children_recycled_on_storm += inc;
		break;
	case STATS_FIELD_UNSHARE_NEWNET_THROTTLED:
		parent_stats.unshare_newnet_throttled += inc;
		break;

	/* Per-syscall arrays: an out-of-range aux drops the sample. */
	case STATS_FIELD_RANGE_REJECTS_PER_SYSCALL_64:
		if (idx < MAX_NR_SYSCALL)
			parent_stats.range_overlaps_shared_rejects_per_syscall_64[idx] += inc;
		break;
	case STATS_FIELD_RANGE_REJECTS_PER_SYSCALL_32:
		if (idx < MAX_NR_SYSCALL)
			parent_stats.range_overlaps_shared_rejects_per_syscall_32[idx] += inc;
		break;

	case STATS_FIELD_POST_HANDLER_CORRUPT_PTR:
		parent_stats.post_handler_corrupt_ptr += inc;
		break;
	case STATS_FIELD_DEFERRED_FREE_REJECT:
		parent_stats.deferred_free_reject += inc;
		break;
	case STATS_FIELD_DEFERRED_FREE_REJECT_PATHNAME:
		parent_stats.deferred_free_reject_pathname += inc;
		break;
	case STATS_FIELD_DEFERRED_FREE_REJECT_IOVEC:
		parent_stats.deferred_free_reject_iovec += inc;
		break;
	case STATS_FIELD_DEFERRED_FREE_REJECT_SOCKADDR:
		parent_stats.deferred_free_reject_sockaddr += inc;
		break;
	case STATS_FIELD_DEFERRED_FREE_REJECT_OTHER:
		parent_stats.deferred_free_reject_other += inc;
		break;
	case STATS_FIELD_SNAPSHOT_NON_HEAP_REJECT:
		parent_stats.snapshot_non_heap_reject += inc;
		break;
	case STATS_FIELD_RING_EVICTION_CORRUPT:
		parent_stats.ring_eviction_corrupt += inc;
		break;
	case STATS_FIELD_DEFERRED_FREE_CORRUPT_PTR:
		parent_stats.deferred_free_corrupt_ptr += inc;
		break;

	case STATS_FIELD_CALL_COMPLETE: {
		/*
		 * A single slot carries three logical bumps:
		 *
		 *  - op_count, unconditionally.  The slot would not have
		 *    made it past spsc_ring_drain without head/tail
		 *    ordering, so its arrival IS the proof that a child
		 *    dispatched a syscall.
		 *  - the per-category count, only when aux < NR_SYSCAT; a
		 *    scribbled aux loses just the category bump for this
		 *    slot.
		 *  - successes or failures, only for a known result
		 *    class.  Any other byte value in _reserved is treated
		 *    as INCOMPLETE, so a scribbled slot cannot fabricate a
		 *    success/failure attribution.
		 */
		uint8_t outcome = (uint8_t)slot->_reserved;

		parent_stats.op_count += inc;

		if (idx < NR_SYSCAT)
			parent_stats.syscall_category_count[idx] += inc;

		if (outcome == STATS_RESULT_SUCCESS)
			parent_stats.successes += inc;
		else if (outcome == STATS_RESULT_FAILURE)
			parent_stats.failures += inc;
		break;
	}

	case STATS_FIELD_NR:
	default:
		/*
		 * Unknown or out-of-range field_id: drop it silently.  A
		 * scribbled slot can carry any value, and the surrounding
		 * ring overflow counter already conveys "we lost samples".
		 */
		break;
	}
}
/*
 * Drain one ring into parent_stats.
 *
 * Every slot that spsc_ring_drain() delivers is passed to apply_slot().
 * The overflow count it reports through its out-parameter is added to
 * parent_stats.ring_overflow_total.
 *
 * Returns 0 for a NULL ring, otherwise the count returned by
 * spsc_ring_drain().
 */
unsigned int stats_ring_drain(struct stats_ring *ring)
{
	uint32_t dropped = 0;
	uint32_t applied;

	if (ring == NULL)
		return 0;

	applied = spsc_ring_drain(&ring->base, ring->slots, STATS_RING_SIZE,
				  sizeof(ring->slots[0]),
				  apply_slot, NULL, &dropped);

	parent_stats.ring_overflow_total += dropped;

	return applied;
}
/*
* Republish the mirror page from parent_stats. Caller must have already
* thawed the global-obj freeze (so the parent can write through to the
* mprotected page) and will refreeze afterwards.
*
* Mirror integrity is verified separately by shm_is_corrupt(): between
* this publish and the next iteration's read-back, nothing should write
* to the mirror, so a mismatch there flags a scribble.
*/
static void stats_publish_locked(void)
{
if (shm_published == NULL)
return;
shm_published->fleet_op_count = parent_stats.op_count;
}
/*
 * Drain the stats ring of every child, then republish the mirror page.
 *
 * Child slots and their ring pointers are read with acquire loads.
 * A child slot that is NULL, or a child with no ring attached, is
 * skipped.  Returns without publishing when the children array itself
 * does not exist yet.
 */
void stats_ring_drain_all(void)
{
	unsigned int i;

	if (children == NULL)
		return;

	for_each_child(i) {
		struct childdata *child =
			__atomic_load_n(&children[i], __ATOMIC_ACQUIRE);

		if (child != NULL) {
			struct stats_ring *ring =
				__atomic_load_n(&child->stats_ring, __ATOMIC_ACQUIRE);

			/* The drained-slot count is of no interest here. */
			if (ring != NULL)
				(void) stats_ring_drain(ring);
		}
	}

	stats_publish_locked();
}
/*
 * Allocate the mirror page with alloc_shared() and zero it, so every
 * published field starts from 0.
 */
void stats_published_init(void)
{
	shm_published = alloc_shared(sizeof(*shm_published));

	memset(shm_published, 0, sizeof(struct stats_published));
}
/*
* Per-child mprotect freeze of the shm_published mirror page. The
* mirror is parent-write / child-read: children read fleet_op_count
* off it on the cold path (maybe_rotate_strategy()'s rotation clock
* in random-syscall.c and the syscalls_todo termination check in
* child_process()), and the parent's stats_publish_locked() inside
* stats_ring_drain_all() is the sole writer. The mirror-integrity
* sample in shm_is_corrupt() (main.c) already documents the
* PROT_READ contract -- "republish-time we wrote ... and then
* mprotected the page PROT_READ" -- but the matching mprotect()
* call was missing, leaving the contract as comment only. A wild
* kernel store through a fuzzed syscall arg pointer could scribble
* fleet_op_count between publishes, perturbing the rotation clock
* and syscalls_todo progress; the integrity check would only flag
* the damage post-hoc.
*
* Called from the per-child post-fork init hook so the freeze
* applies in child address space. mprotect is per-process, so the
* parent's mapping stays PROT_READ|PROT_WRITE and the drain's
* publish keeps writing through; only children see the read-only
* view.
*
* Best-effort on failure: log via the canonical helper and continue.
* mprotect can ENOMEM if the kernel runs out of VMA slots splitting
* the mapping that backs the mirror (same failure mode as the
* healer/edgepair freeze helpers and the freeze_sibling_childdata
* sweep) and turning a transient kernel limit into a fleet-wide
* crash would be worse than leaving the mirror RW for the lifetime
* of the affected child.
*/
void stats_published_freeze(void)
{
size_t bytes;
if (shm_published == NULL)
return;
bytes = sizeof(struct stats_published);
bytes = (bytes + page_size - 1) & PAGE_MASK;
if (mprotect(shm_published, bytes, PROT_READ) != 0)
log_mprotect_failure(shm_published, bytes, PROT_READ,
__builtin_return_address(0), errno);
}