Swately · Swately · May 21, 2026 · May 21, 2026 · May 21, 2026 · May 21, 2026
@@ -169,9 +169,11 @@ jobs:
           bash scripts/lint_docs.sh
 
   # ─── Reference-doc sync check ─────────────────────────────────────────────
-  # Branch-level coverage check: if framework/<pillar>/include/ changed, then
+  # Coverage check: if framework/<pillar>/include/ changed, then
   # docs/reference/<pillar>.md must change too (or a [skip-doc-sync] token must
-  # be present). A reminder, not a signature parser. See docs/planning/DOC_AUDIT.md.
+  # be present). A reminder, not a signature parser. Fast (git only — no build).
+  # Works on PRs (diff vs target) AND on direct pushes to main (diffs exactly the
+  # commits just pushed, via github.event.before). See docs/planning/DOC_AUDIT.md.
   doc-sync:
     name: doc-sync
     runs-on: ubuntu-24.04
@@ -184,6 +186,16 @@ jobs:
 
       - name: Run check_doc_sync.sh
         run: |
-          BASE="origin/${{ github.base_ref }}"
-          if [ -z "${{ github.base_ref }}" ]; then BASE="origin/main"; fi
+          if [ "${{ github.event_name }}" = "pull_request" ]; then
+            # PR: diff the whole branch against its target.
+            BASE="origin/${{ github.base_ref }}"
+          else
+            # Direct push (e.g. to main): diff exactly the commits just pushed.
+            BASE="${{ github.event.before }}"
+            # New branch / no parent (all-zero SHA): fall back to the parent commit.
+            if [ -z "$BASE" ] || [ "$BASE" = "0000000000000000000000000000000000000000" ]; then
+              BASE="HEAD~1"
+            fi
+          fi
+          echo "doc-sync base: $BASE"
           bash scripts/check_doc_sync.sh "$BASE"
@@ -110,6 +110,8 @@ framework/render/vulkan/third_party/ffx/)"
     OFF)
 option(PHYRIAD_BUILD_AYAMA      "Build Ayama showcase application"          OFF)
 option(PHYRIAD_BUILD_BENCHMARKS "Build perf micro-benchmarks (bench/*)"     OFF)
+option(PHYRIAD_BUILD_STRESS_TESTS
+    "Build heavy stochastic stress/soak tests (gated OFF — not a CI gate)"  OFF)
 option(PHYRIAD_STANDALONE       "Build only the Phyriad framework, no apps"   OFF)
 option(PHYRIAD_ENABLE_LTO       "Enable cross-pillar LTO in Release builds" ON)
 option(PHYRIAD_PGO_INSTRUMENT   "Build instrumented binaries to collect PGO profile" OFF)

@@ -162,14 +162,17 @@ notes.
 
 **3. The reference-doc sync check (CI)**
 
-`scripts/check_doc_sync.sh` is a branch-level CI check: if a PR changes a
-pillar's public headers (`framework/<pillar>/include/`) it must also touch that
-pillar's `docs/reference/<pillar>.md`. It is a **coverage reminder, not a
-signature parser** — it ensures the doc was not *forgotten*, not that it is
-correct. For a header change that genuinely needs no reference-doc update (a
-comment, an internal/private detail, a pure refactor), put `[skip-doc-sync]` in
-a commit message on the branch. It runs in CI only (a branch may change a header
-in one commit and the doc in another), not in the pre-commit hook.
+`scripts/check_doc_sync.sh` is a CI check: if a change touches a pillar's public
+headers (`framework/<pillar>/include/`) it must also touch that pillar's
+`docs/reference/<pillar>.md`. It runs on **pull requests** (diff vs the target
+branch) **and on direct pushes to `main`** (it diffs exactly the commits just
+pushed) — so a solo, push-to-main workflow is covered too. It is fast (git only,
+no build/tests). It is a **coverage reminder, not a signature parser** — it
+ensures the doc was not *forgotten*, not that it is correct. For a header change
+that genuinely needs no reference-doc update (a comment, an internal/private
+detail, a pure refactor), put `[skip-doc-sync]` in a commit message. It runs in
+CI only (a change may touch a header in one commit and the doc in another), not
+in the pre-commit hook.
 
 ---
 

@@ -155,12 +155,16 @@ excluded; a `lint-docs-allow` marker is the per-line escape hatch).
 appear in honest narrative and retraction notes.
 
 **Status (2):** shipped on 2026-05-21 as the *lightweight coverage check* —
-`scripts/check_doc_sync.sh` + a `doc-sync` CI job. If a PR changes
+`scripts/check_doc_sync.sh` + a `doc-sync` CI job. If a change touches
 `framework/<pillar>/include/`, it must also touch `docs/reference/<pillar>.md`,
 else CI fails (bypass: `[skip-doc-sync]` in a commit message for doc-irrelevant
-header changes). It is CI-only (branch-level), not in the pre-commit hook, and
-is a reminder — it checks the doc was not *forgotten*, not that it is correct.
-The full libclang signature differ stays out of scope.
+header changes). It runs on PRs (diff vs target) **and on direct pushes to
+`main`** (diffs the commits just pushed, via `github.event.before`), so a
+solo push-to-main workflow is covered; it is fast (git only — no build/tests).
+It is CI-only (a change may touch a header in one commit and the doc in
+another), not in the pre-commit hook, and is a reminder — it checks the doc was
+not *forgotten*, not that it is correct. The full libclang signature differ
+stays out of scope.
 
 ---
 

@@ -98,5 +98,30 @@ if(BUILD_TESTING OR PHYRIAD_BUILD_TESTS)
     add_executable(pool_integration_test tests/pool_integration_test.cpp)
     target_link_libraries(pool_integration_test PRIVATE phyriad_pool)
     target_compile_features(pool_integration_test PRIVATE cxx_std_23)
+    # DIAGNOSTIC: keep minimal debug info + export symbols to the dynamic table
+    # so the in-test backtrace handler (execinfo) resolves named frames in the CI
+    # log when §10 SegFaults on the 2-vCPU runner. No effect on test behaviour.
+    # Linux only — the execinfo backtrace handler and -rdynamic are glibc/ELF
+    # specific; the handler compiles to a no-op elsewhere (e.g. Windows MinGW).
+    if(CMAKE_SYSTEM_NAME STREQUAL "Linux")
+        target_compile_options(pool_integration_test PRIVATE -g1)
+        target_link_options(pool_integration_test PRIVATE -rdynamic)
+    endif()
     add_test(NAME pool_integration_test COMMAND pool_integration_test)
+
+    # ── Gated stress/soak (NOT a CI gate) ────────────────────────────────────
+    # Heavy stochastic saturation soak for CPU_ANY (lossless-under-backpressure).
+    # Built only with -DPHYRIAD_BUILD_STRESS_TESTS=ON; labelled "stress" so it can
+    # be selected/excluded explicitly. See pool_cpu_any_stress_test.cpp header.
+    if(PHYRIAD_BUILD_STRESS_TESTS)
+        add_executable(pool_cpu_any_stress tests/pool_cpu_any_stress_test.cpp)
+        target_link_libraries(pool_cpu_any_stress PRIVATE phyriad_pool)
+        target_compile_features(pool_cpu_any_stress PRIVATE cxx_std_23)
+        if(CMAKE_SYSTEM_NAME STREQUAL "Linux")
+            target_compile_options(pool_cpu_any_stress PRIVATE -g1)
+            target_link_options(pool_cpu_any_stress PRIVATE -rdynamic)
+        endif()
+        add_test(NAME pool_cpu_any_stress COMMAND pool_cpu_any_stress)
+        set_tests_properties(pool_cpu_any_stress PROPERTIES LABELS "stress")
+    endif()
 endif()
@@ -60,7 +60,8 @@ struct alignas(hal::kDestructivePad) SubmitterSlot {
 class SubmitterRegistry {
 public:
     explicit SubmitterRegistry(uint32_t max_slots) noexcept
-        : max_slots_(std::min(max_slots == 0u ? kMaxSubmitters : max_slots,
+        : instance_id_(hal::stat_fetch_add_relaxed(s_next_instance_id_, uint64_t{1}) + 1u)
+        , max_slots_(std::min(max_slots == 0u ? kMaxSubmitters : max_slots,
                               kMaxSubmittersCeiling))
         , slots_(std::make_unique<SubmitterSlot[]>(max_slots_))
     {
@@ -84,20 +85,33 @@ class SubmitterRegistry {
     SubmitterRegistry& operator=(const SubmitterRegistry&) = delete;
 
     // Acquire or return the already-claimed slot for the calling thread.
-    // Fast path: O(1) with two pointer comparisons + one relaxed atomic load.
+    // Fast path: O(1) — one instance-id compare + a non-null check.
     [[nodiscard]] SubmitterSlot* acquire_current() noexcept {
         auto& cache = tls_cache_;
-        const uint64_t tid = thread_id_u64_();
 
-        // Validate: same registry instance AND the slot is still claimed by us.
-        // The owning_thread_id check guards against allocator address reuse, where
-        // a new PoolRuntime at the same address has reset all slots to 0.
-        if (cache.registry == this && cache.slot &&
-            hal::stat_load_relaxed(cache.slot->owning_thread_id) == tid)
+        // Validate against this registry's unique instance id. A registry is
+        // identified by a monotonic id assigned at construction — NOT by its
+        // address. This is the crucial correctness point: the allocator readily
+        // places a freshly-created PoolRuntime at the SAME address as a just-
+        // destroyed one, so a `registry == this` test passes for a *different*
+        // registry instance, and the previous registry's freed slots_ array
+        // often still holds this (long-lived) thread's tid — so the old
+        // `registry == this && cache.slot->owning_thread_id == tid` guard would
+        // wrongly accept the cache and return a slot dangling into freed memory,
+        // whose `ring` pointer (since overwritten) was then dereferenced. That
+        // was the §10 pool_integration_test SegFault on the ci-linux runner.
+        //
+        // The instance id never repeats, so a cache populated by a destroyed
+        // registry can never match a live one — we never dereference a stale
+        // slot pointer. Within one live instance the slot stays ours (slots are
+        // never released), so a match is sufficient to return it directly.
+        if (cache.instance_id == instance_id_ && cache.slot)
             return cache.slot;
 
-        // Cache miss or stale — reset and claim a slot.
-        cache = {nullptr, nullptr};
+        // Cache miss or a different (possibly recycled-address) registry —
+        // reset and claim a fresh slot in THIS instance.
+        const uint64_t tid = thread_id_u64_();
+        cache = {0u, nullptr};
 
         for (uint32_t i = 0; i < max_slots_; ++i) {
             if (!slots_[i].ring) continue;   // allocation failed at construction
@@ -108,7 +122,7 @@ class SubmitterRegistry {
                     expected, tid, std::memory_order_acq_rel,  // HAL: explicit ordering — see surrounding context
                     std::memory_order_acquire))  // HAL: acquire ordering — paired with explicit release fence
             {
-                cache = {this, &slots_[i]};
+                cache = {instance_id_, &slots_[i]};
                 return cache.slot;
             }
         }
@@ -131,17 +145,27 @@ class SubmitterRegistry {
         return v;
     }
 
+    // Cache holds the registry INSTANCE id it was populated for (not a raw
+    // pointer): a destroyed-then-recreated registry at the same address gets a
+    // different id, so the cache cannot be mistaken as still valid. See
+    // acquire_current() for the SegFault this prevents.
     struct TlsCache {
-        SubmitterRegistry* registry{nullptr};
-        SubmitterSlot*     slot{nullptr};
+        uint64_t       instance_id{0};   // 0 = empty (ids start at 1)
+        SubmitterSlot* slot{nullptr};
     };
 
+    // Monotonic, never-reused id assigned per construction. `this` address is
+    // NOT a stable identity (the allocator recycles it across pool create/
+    // destroy), so the TLS cache keys on this instead.
+    const uint64_t                    instance_id_;
     uint32_t                          max_slots_;
     std::unique_ptr<SubmitterSlot[]>  slots_;
 
+    static std::atomic<uint64_t> s_next_instance_id_;
     static thread_local TlsCache tls_cache_;
 };
 
+inline std::atomic<uint64_t>          SubmitterRegistry::s_next_instance_id_{0};
 inline thread_local SubmitterRegistry::TlsCache SubmitterRegistry::tls_cache_{};
 
 } // namespace phyriad::pool::detail
@@ -0,0 +1,125 @@
+// framework/pool/tests/pool_cpu_any_stress_test.cpp
+// GATED stress/soak test — built only with -DPHYRIAD_BUILD_STRESS_TESTS=ON, NOT
+// part of the default CI gate. Companion to §10 of pool_integration_test.cpp.
+//
+// Why separate (see docs/planning/TEST_VALIDATION_PLAN.md):
+//   §10 of the integration test is now a DETERMINISTIC unit test (it proves the
+//   CPU_ANY candidate set spans both CPU groups via affinity_hint + worker_id).
+//   This file keeps the heavy, stochastic, open-loop SATURATION workload as a
+//   stress test, because saturation is a different thing to exercise: it stresses
+//   the lossless-under-backpressure path (the D-2 task-loss regression guard) and
+//   the classifier's fill%-balanced rotation across both groups under real
+//   oversubscription. A stress test may be stochastic; it must NOT be a flaky CI
+//   gate, so it is gated off by default and asserts only a DETERMINISTIC property.
+//
+// What it asserts (deterministic): under a saturating CPU_ANY burst, EVERY
+// accepted task completes — nothing is silently lost. (The submit loop is
+// lossless: it spins on POOL_FULL, so `accepted == total`, and the test then
+// waits for all completions.) It does NOT assert a sampled fill distribution —
+// that timing-dependent claim is what made the old §10 flaky.
+//
+// Reproduce / soak:
+//   cmake -S . -B build-stress -G Ninja -DCMAKE_BUILD_TYPE=Release \
+//         -DPHYRIAD_STANDALONE=ON -DPHYRIAD_BUILD_TESTS=ON \
+//         -DPHYRIAD_BUILD_STRESS_TESTS=ON
+//   cmake --build build-stress --target pool_cpu_any_stress
+//   # tighten the screws on a 2-vCPU box or with taskset:
+//   taskset -c 0,1 ./build-stress/framework/pool/pool_cpu_any_stress
+//
+#include <phyriad/pool/PoolRuntime.hpp>
+#include <phyriad/pool/types/Task.hpp>
+#include <phyriad/pool/types/TaskResult.hpp>
+#include <phyriad/pool/types/PoolConfig.hpp>
+#include <phyriad/hal/MemoryOrder.hpp>
+#include "test_crash_handler.hpp"
+#include <atomic>
+#include <chrono>
+#include <cstdint>
+#include <cstdio>
+#include <thread>
+#include <vector>
+
+using namespace phyriad::pool;
+using namespace std::chrono_literals;
+namespace hal = phyriad::hal;
+
+// Completion counter + a sink so the work loop can't be optimized away. The fn
+// ignores ctx (so many in-flight tasks don't race on a shared payload); ~1 µs of
+// real arithmetic per task so a saturating burst actually builds ring fill.
+static std::atomic<uint64_t> g_any_done{0};
+static std::atomic<uint64_t> g_any_sink{0};
+static void cpu_any_work_fn(void*) noexcept {
+    uint64_t acc = hal::stat_load_relaxed(g_any_done) + 0x9E3779B97F4A7C15ULL;
+    for (int i = 0; i < 4000; ++i)
+        acc = acc * 6364136223846793005ULL + 1442695040888963407ULL;
+    hal::stat_fetch_add_relaxed(g_any_sink, acc);     // sink → not DCE'd
+    hal::stat_fetch_add_relaxed(g_any_done, uint64_t{1});
+}
+
+int main() {
+    phyriad_install_crash_handler();
+    std::setvbuf(stdout, nullptr, _IONBF, 0);
+    std::printf("[pool_cpu_any_stress] saturating CPU_ANY burst — lossless soak\n");
+
+    PoolConfig cfg{};
+    cfg.cpu_short_workers = 2u;
+    cfg.cpu_long_workers  = 2u;
+    cfg.io_workers        = 1u;
+    auto r = PoolRuntime::create(cfg);
+    if (!r) { std::printf("[FAIL] pool create failed\n"); return 1; }
+    auto& p = *r;
+
+    constexpr int kT = 8;                                  // oversubscribe 4 CPU workers
+    constexpr int kN = 3000;
+    const uint64_t total = static_cast<uint64_t>(kT) * kN;
+    hal::stat_store_relaxed(g_any_done, uint64_t{0});
+
+    // Informational only (NOT asserted): sample which worker rings ever showed
+    // fill, to eyeball the fill%-balanced rotation across both groups.
+    uint32_t          busy_mask = 0;
+    std::atomic<bool> sampling{true};
+    std::thread sampler([&]{
+        while (hal::stat_load_relaxed(sampling)) {
+            auto m = p->metrics_snapshot();
+            for (uint32_t w = 0; w < 4u; ++w)
+                if (m.ring_fill_pct[w] > 0) busy_mask |= (1u << w);
+            std::this_thread::sleep_for(50us);
+        }
+    });
+
+    std::vector<std::thread> threads;
+    threads.reserve(kT);
+    for (int t = 0; t < kT; ++t) {
+        threads.emplace_back([&]{
+            Task work{};
+            work.fn   = cpu_any_work_fn;
+            work.kind = TaskKind::CPU_ANY;
+            work.ctx  = nullptr;
+            for (int i = 0; i < kN; ++i)
+                for (;;) { auto id = p->submit(work); if (id) break; }  // lossless: spin on POOL_FULL
+        });
+    }
+    for (auto& th : threads) th.join();
+    while (hal::stat_load_relaxed(g_any_done) < total)
+        std::this_thread::sleep_for(100us);
+    hal::stat_store_relaxed(sampling, false);
+    sampler.join();
+
+    const uint64_t done = hal::stat_load_relaxed(g_any_done);
+    const bool short_used = (busy_mask & 0b0011u) != 0;
+    const bool long_used  = (busy_mask & 0b1100u) != 0;
+    std::printf("  completed %llu / %llu   (sampled groups: short=%d long=%d)\n",
+                static_cast<unsigned long long>(done),
+                static_cast<unsigned long long>(total),
+                short_used ? 1 : 0, long_used ? 1 : 0);
+
+    // DETERMINISTIC assertion: nothing lost under saturation (D-2 guard).
+    if (done != total) {
+        std::printf("[FAIL] lost tasks under saturation: %llu / %llu\n",
+                    static_cast<unsigned long long>(done),
+                    static_cast<unsigned long long>(total));
+        return 1;
+    }
+    std::printf("[OK] lossless under saturation\n");
+    return 0;
+}