54 changes: 54 additions & 0 deletions .changeset/unify-owner-evaluate-agent-stop-legacy-write.md
@@ -0,0 +1,54 @@
---
---

PR 3 of the #4247 unification stack. Two coupled changes:

**Stop the dual write for owner runs.** `evaluate_agent_quality` no longer
calls `agentContextDb.recordTest()` when the caller owns the agent — that
path was the dual-write bug #4247 is closing. The legacy `recordTest` call
is retained ONLY for third-party runs so a stranger who tests someone
else's agent still has a session-scoped audit trail in their own
`agent_test_history`. Owner-triggered runs persist exclusively to canonical
state via PR #4250's runtime path going forward.
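The routing rule above can be sketched in isolation. This is a minimal in-memory sketch, not the real handler (which lives in `server/src/addie/mcp/member-tools.ts`); `canonicalRuns` and `legacyTestHistory` are hypothetical stand-ins for `agent_compliance_runs` and `agent_test_history`:

```typescript
// Hypothetical in-memory stand-ins for the two tables.
interface TestResult { agentUrl: string; passed: boolean; }

const canonicalRuns: TestResult[] = [];
const legacyTestHistory: TestResult[] = [];

function persistTestRun(result: TestResult, isAgentOwner: boolean): void {
  if (isAgentOwner) {
    // Owner run: canonical state only (PR #4250's runtime path). No legacy
    // write — that dual write is the bug #4247 closes.
    canonicalRuns.push(result);
  } else {
    // Third-party run: session-scoped legacy audit trail only.
    legacyTestHistory.push(result);
  }
}

persistTestRun({ agentUrl: 'https://example.com/agent', passed: true }, true);
persistTestRun({ agentUrl: 'https://example.com/agent', passed: false }, false);
```

Each run lands in exactly one sink, so the audit trail is never split across tables.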

**Operator-runnable backfill script for historical owner-triggered rows.**
`server/src/scripts/backfill-owner-test-history.ts` copies every
`agent_test_history` row with a `user_id` into `agent_compliance_runs` as
`triggered_by = 'owner_test'`. This is NOT a migration: running the backfill
as release-command DDL against a prod-sized `agent_test_history` would hold
locks for the entire run and make the release a single point of failure. The
script ships in `server/src/scripts/` per the repo's prod-runnable convention.

Script properties:

- **Chunked** by primary-key id ascending, default 1000 rows per chunk, 100ms
sleep between chunks. Each chunk is its own short transaction so heartbeat
and runtime writes never wait on a long-running lock. Tune via
`--chunk-size N` / `--sleep-ms N` for prod-sized tables.
- **Cutover guard**: skips any `agent_test_history` row whose `started_at`
is at or after the earliest `triggered_by='owner_test'` row in
`agent_compliance_runs`. That cutover point is the live-write boundary —
rows after it would already have been written by PR #4250's runtime path.
Without the guard, owner runs inside the
PR-#4250-deployed-but-script-not-yet-run window would be inserted twice.
Fresh deploys with no canonical `owner_test` rows yet treat the cutover
as `+infinity` and backfill all eligible rows.
- **Idempotent**: each inserted row carries `observations_json.backfill_source_id`
pointing back to the source `agent_test_history.id`; re-runs use
`WHERE NOT EXISTS` against that field. A partial run resumes cleanly.
- **Does NOT touch `agent_compliance_status`**. The runtime canonical write
path maintains the current-status row; backfilling historical history rows
shouldn't retroactively change "current" status. Resolves the
contract-drift concern with PR #4250's `compliance-db-last-write-wins`
test — the script's only writes go to the history table, leaving the
status table's last-write-wins runtime contract intact.
- **Dry-run** via `--dry-run` counts eligible rows per chunk without writing.
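Taken together, the guards above reduce to one eligibility predicate plus an id-ascending chunk loop. A self-contained in-memory sketch (types and names are hypothetical; the real script expresses the same logic in SQL):

```typescript
// Hypothetical in-memory model of an agent_test_history row.
interface HistoryRow { id: number; userId: string | null; startedAt: number; }

function isEligible(
  row: HistoryRow,
  cutover: number | null,   // earliest canonical owner_test timestamp, or null on fresh deploys
  backfilled: Set<number>,  // ids already carrying backfill_source_id
): boolean {
  return (
    row.userId !== null &&                           // owner-triggered rows only
    (cutover === null || row.startedAt < cutover) && // cutover guard: skip live-written rows
    !backfilled.has(row.id)                          // idempotency guard: skip already-copied rows
  );
}

// Chunked pass: ids ascending, chunkSize at a time; marking rows as
// backfilled makes a re-run after partial failure resume cleanly.
function backfill(
  rows: HistoryRow[],
  cutover: number | null,
  backfilled: Set<number>,
  chunkSize: number,
): number[] {
  const inserted: number[] = [];
  const sorted = [...rows].sort((a, b) => a.id - b.id);
  for (let i = 0; i < sorted.length; i += chunkSize) {
    for (const row of sorted.slice(i, i + chunkSize)) {
      if (isEligible(row, cutover, backfilled)) {
        backfilled.add(row.id); // real script: observations_json.backfill_source_id
        inserted.push(row.id);
      }
    }
  }
  return inserted;
}
```

A second invocation with the same `backfilled` set inserts nothing, which is the re-run property the `WHERE NOT EXISTS` clause provides in SQL.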

**Out of scope** (deferred to a follow-up after PR #4264 lands):

- Drop `agent_test_history` table — gated on S3 cold-storage export of the
remaining (`user_id IS NULL`) third-party rows.
- Collapse `agent_contexts.last_test_*` into a derived view — PR 4 of
the #4247 stack (#4268).

**Stacked on** #4263 (merged) → #4250 (merged).
62 changes: 34 additions & 28 deletions server/src/addie/mcp/member-tools.ts
@@ -3624,35 +3624,41 @@ export function createMemberToolHandlers(
}
}

-      // Legacy session-scoped audit trail. Retained until Emma's #4247 PR 3
-      // backfills + drops `agent_test_history`. NOT a public-state write —
-      // the previous dual-write to `recordComplianceRun(..., 'manual')` was
-      // dropped because (a) it was gated only on `agent_contexts` row
-      // existence (which `save_agent` lets any org create for any URL — no
-      // ownership check), so any user could publish a `manual` verdict on
-      // someone else's agent; and (b) the owner-test branch above already
-      // updates canonical state with proper ownership checking, so the
-      // legacy write has no remaining function for owners. Non-owner runs
-      // continue to land here in `agent_test_history` only.
-      try {
-        const context = await agentContextDb.getByOrgAndUrl(organizationId, resolved.resolvedUrl);
-        if (context) {
-          await agentContextDb.recordTest({
-            agent_context_id: context.id,
-            scenario: 'quality_evaluation',
-            overall_passed: result.overall_status === 'passing',
-            steps_passed: result.summary.tracks_passed,
-            steps_failed: result.summary.tracks_failed,
-            total_duration_ms: result.total_duration_ms,
-            summary: result.summary.headline,
-            dry_run: true,
-            triggered_by: 'user',
-            user_id: memberContext?.workos_user?.workos_user_id,
-            agent_profile_json: result.agent_profile,
-          });
-        }
-      } catch (error) {
-        logger.debug({ error }, 'Could not record quality evaluation result');
-      }
+      // Legacy write to agent_contexts + agent_test_history. Retained ONLY
+      // for non-owner runs so a third-party who runs evaluate_agent_quality
+      // against someone else's agent still has a session-scoped audit trail
+      // (their own org's agent_test_history). Owner runs already wrote
+      // canonical state above (PR #4250 / merged); writing twice for owners
+      // would split the audit and re-introduce the dual-write bug PR #4247
+      // is closing. The previous unconditional `recordComplianceRun(...,
+      // 'manual')` dual-write was already dropped on main because it had
+      // no ownership check; this PR completes the picture by gating the
+      // remaining recordTest call on `!isAgentOwner`.
+      //
+      // PR 4 of #4247 collapses agent_contexts.last_test_* into a derived
+      // view, after which this legacy block (and recordTest itself) drop
+      // entirely.
+      if (!isAgentOwner) {
+        try {
+          const context = await agentContextDb.getByOrgAndUrl(organizationId, resolved.resolvedUrl);
+          if (context) {
+            await agentContextDb.recordTest({
+              agent_context_id: context.id,
+              scenario: 'quality_evaluation',
+              overall_passed: result.overall_status === 'passing',
+              steps_passed: result.summary.tracks_passed,
+              steps_failed: result.summary.tracks_failed,
+              total_duration_ms: result.total_duration_ms,
+              summary: result.summary.headline,
+              dry_run: true,
+              triggered_by: 'user',
+              user_id: memberContext?.workos_user?.workos_user_id,
+              agent_profile_json: result.agent_profile,
+            });
+          }
+        } catch (error) {
+          logger.debug({ error }, 'Could not record quality evaluation result');
+        }
+      }

251 changes: 251 additions & 0 deletions server/src/scripts/backfill-owner-test-history.ts
@@ -0,0 +1,251 @@
/**
* Backfill historical owner-triggered `agent_test_history` rows into
* `agent_compliance_runs` as `triggered_by='owner_test'`. Part of the #4247
* compliance-state unification — PR #4250 (merged) made
* `evaluate_agent_quality` write canonical going forward; this script
* backfills the pre-PR-#4250 rows so the compliance API and dashboard
* reflect the full owner-test history.
*
* ## Usage
*
* # Dry run (counts what would land, no writes):
* DATABASE_URL=… npx tsx server/src/scripts/backfill-owner-test-history.ts --dry-run
*
* # Commit, default chunk size:
* DATABASE_URL=… npx tsx server/src/scripts/backfill-owner-test-history.ts
*
* # Commit, custom chunk + sleep (for prod-sized tables):
* DATABASE_URL=… npx tsx server/src/scripts/backfill-owner-test-history.ts \
* --chunk-size 500 --sleep-ms 250
*
* ## Env
*
* DATABASE_URL required
*
* ## Safety / chunking strategy
*
* - Cutover guard: skips any `agent_test_history` row whose `started_at` is at
* or after the earliest `triggered_by='owner_test'` row in
* `agent_compliance_runs`. That earliest row is the live-write cutover
* point — anything after that timestamp would already have been written
* by the runtime canonical path in PR #4250 and re-inserting it here
* would duplicate. When no `owner_test` rows exist yet (fresh deploy),
* the cutover is treated as `+infinity` and every row is eligible.
*
* - Idempotency: each row carries its source `agent_test_history.id` in
* `observations_json.backfill_source_id`. The chunk SELECT filters with
* `WHERE NOT EXISTS (… backfill_source_id = ath.id)`. Re-running the
* script after a partial failure resumes cleanly — already-backfilled
* rows are skipped without duplicate writes.
*
* - Chunks by primary-key id ascending, --chunk-size at a time. A short
* `--sleep-ms` between chunks lets the heartbeat and other writers
* keep moving. Default chunk 1000 / sleep 100ms — tune for table size.
*
* - Does NOT update `agent_compliance_status`. The runtime canonical
* write path in PR #4250 already maintains that row; backfilling
* historical runs into the history table should not retroactively
* change "current" status.
*/

import { initializeDatabase, closeDatabase, getPool } from '../db/client.js';
import { createLogger } from '../logger.js';

const logger = createLogger('backfill-owner-test-history');

interface Args {
chunkSize: number;
sleepMs: number;
dryRun: boolean;
}

function parseArgs(argv: string[]): Args {
const args: Args = { chunkSize: 1000, sleepMs: 100, dryRun: false };
for (let i = 2; i < argv.length; i += 1) {
const a = argv[i];
if (a === '--chunk-size') args.chunkSize = parseInt(argv[++i], 10);
else if (a === '--sleep-ms') args.sleepMs = parseInt(argv[++i], 10);
else if (a === '--dry-run') args.dryRun = true;
else if (a === '--help' || a === '-h') {
console.log('Usage: backfill-owner-test-history.ts [--chunk-size N] [--sleep-ms N] [--dry-run]');
process.exit(0);
} else throw new Error(`Unknown arg: ${a}`);
}
if (!Number.isInteger(args.chunkSize) || args.chunkSize <= 0) throw new Error('--chunk-size must be a positive integer');
if (!Number.isInteger(args.sleepMs) || args.sleepMs < 0) throw new Error('--sleep-ms must be a non-negative integer');
return args;
}

const sleep = (ms: number) => new Promise<void>((resolve) => setTimeout(resolve, ms));

async function main(): Promise<void> {
const args = parseArgs(process.argv);
if (!process.env.DATABASE_URL) throw new Error('DATABASE_URL not set');

initializeDatabase({ connectionString: process.env.DATABASE_URL });
const pool = getPool();

// Cutover: earliest live owner_test write in agent_compliance_runs.
// Anything in agent_test_history with started_at >= cutover would
// duplicate a row PR #4250's runtime path already inserted.
const cutoverRow = await pool.query<{ cutover: Date | null }>(
`SELECT MIN(tested_at) AS cutover FROM agent_compliance_runs WHERE triggered_by = 'owner_test'`,
);
const cutover: Date | null = cutoverRow.rows[0]?.cutover ?? null;
logger.info(
{ cutover: cutover ? cutover.toISOString() : null, mode: args.dryRun ? 'dry-run' : 'commit' },
`Backfill starting (cutover = ${cutover ? cutover.toISOString() : 'no canonical owner_test rows yet — backfilling all eligible'})`,
);

// Eligible-row count for visibility before commit.
const eligibleCount = await pool.query<{ count: string }>(
`SELECT COUNT(*)::text AS count
FROM agent_test_history ath
JOIN agent_contexts ac ON ac.id = ath.agent_context_id
WHERE ath.user_id IS NOT NULL
AND ($1::timestamptz IS NULL OR ath.started_at < $1::timestamptz)
AND NOT EXISTS (
SELECT 1 FROM agent_compliance_runs acr
WHERE acr.observations_json->>'backfill_source_id' = ath.id::text
)`,
[cutover],
);
const totalEligible = parseInt(eligibleCount.rows[0]?.count ?? '0', 10);
logger.info({ totalEligible, chunkSize: args.chunkSize }, `${totalEligible} row(s) eligible for backfill`);

if (totalEligible === 0) {
logger.info('Nothing to backfill — exiting cleanly.');
await closeDatabase();
return;
}

let totalInserted = 0;
let lastId: string | null = null;

// Chunked loop: pull the next `chunkSize` eligible rows by id ASC, insert
// them into agent_compliance_runs, advance the cursor. Each chunk is its
// own short transaction so heartbeat / runtime writes never wait on a
// long-running backfill lock.
while (true) {
const insertResult = await pool.query<{ inserted_count: number; max_id: string | null }>(
`WITH eligible AS (
SELECT ath.*, ac.agent_url
FROM agent_test_history ath
JOIN agent_contexts ac ON ac.id = ath.agent_context_id
WHERE ath.user_id IS NOT NULL
AND ($1::timestamptz IS NULL OR ath.started_at < $1::timestamptz)
AND ($2::uuid IS NULL OR ath.id > $2::uuid)
AND NOT EXISTS (
SELECT 1 FROM agent_compliance_runs acr
WHERE acr.observations_json->>'backfill_source_id' = ath.id::text
)
ORDER BY ath.id ASC
LIMIT $3
),
inserted AS (
INSERT INTO agent_compliance_runs (
agent_url,
lifecycle_stage,
overall_status,
headline,
total_duration_ms,
tracks_json,
tracks_passed,
tracks_failed,
tracks_skipped,
tracks_partial,
agent_profile_json,
observations_json,
triggered_by,
dry_run,
tested_at
)
SELECT
eligible.agent_url,
COALESCE(arm.lifecycle_stage, 'production'),
CASE WHEN eligible.overall_passed THEN 'passing' ELSE 'failing' END,
eligible.summary,
eligible.total_duration_ms,
'[]'::jsonb,
COALESCE(eligible.steps_passed, 0),
COALESCE(eligible.steps_failed, 0),
0,
0,
eligible.agent_profile_json,
jsonb_build_object(
'backfill_source', 'agent_test_history',
'backfill_source_id', eligible.id::text,
'backfill_script', 'backfill-owner-test-history',
'original_scenario', eligible.scenario
),
'owner_test',
FALSE,
eligible.started_at
FROM eligible
LEFT JOIN agent_registry_metadata arm ON arm.agent_url = eligible.agent_url
WHERE NOT $4::boolean
RETURNING 1
)
SELECT
(SELECT COUNT(*)::int FROM inserted) AS inserted_count,
(SELECT id::text FROM eligible ORDER BY id DESC LIMIT 1) AS max_id`,
[cutover, lastId, args.chunkSize, args.dryRun],
);

const chunk = insertResult.rows[0];
const inserted = chunk?.inserted_count ?? 0;
const maxId = chunk?.max_id ?? null;

if (!maxId) break;

if (args.dryRun) {
// Dry-run path: cursor still needs to advance via the eligible CTE's
// max id, but no rows were inserted. Count what WOULD have landed.
const wouldInsert = await pool.query<{ count: string }>(
`SELECT COUNT(*)::text AS count
FROM agent_test_history ath
JOIN agent_contexts ac ON ac.id = ath.agent_context_id
WHERE ath.user_id IS NOT NULL
AND ($1::timestamptz IS NULL OR ath.started_at < $1::timestamptz)
AND ($2::uuid IS NULL OR ath.id > $2::uuid)
AND ath.id <= $3::uuid
AND NOT EXISTS (
SELECT 1 FROM agent_compliance_runs acr
WHERE acr.observations_json->>'backfill_source_id' = ath.id::text
)`,
[cutover, lastId, maxId],
);
const wouldCount = parseInt(wouldInsert.rows[0]?.count ?? '0', 10);
totalInserted += wouldCount;
logger.info(
{ chunkInserted: wouldCount, totalInserted, lastId: maxId },
`[dry-run] would insert ${wouldCount} (cumulative ${totalInserted})`,
);
} else {
totalInserted += inserted;
logger.info(
{ chunkInserted: inserted, totalInserted, lastId: maxId },
`chunk landed ${inserted} (cumulative ${totalInserted})`,
);
}

lastId = maxId;

if (args.sleepMs > 0) await sleep(args.sleepMs);
}

logger.info(
{ totalInserted, mode: args.dryRun ? 'dry-run' : 'commit' },
`Backfill ${args.dryRun ? 'dry-run' : 'commit'} complete — ${totalInserted} row(s) ${args.dryRun ? 'would have been' : 'were'} inserted`,
);

await closeDatabase();
}

main()
.then(() => process.exit(0))
.catch((err: unknown) => {
logger.error({ err }, 'backfill-owner-test-history failed');
console.error(err);
process.exit(1);
});