Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "@tangle-network/agent-eval",
"version": "0.30.1",
"version": "0.31.0",
"description": "Substrate for self-improving agents: traces, verifiable rewards, preferences, GEPA / reflective mutation, auto-research, replay, sequential anytime-valid stats, and release gates.",
"homepage": "https://github.com/tangle-network/agent-eval#readme",
"repository": {
Expand Down
1 change: 1 addition & 0 deletions src/errors.ts
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ export type AgentEvalErrorCode =
| 'judge'
| 'verification'
| 'replay'
| 'backend_integrity'

export class AgentEvalError extends Error {
/** Stable string code. Survives minification; safe to switch on. */
Expand Down
10 changes: 10 additions & 0 deletions src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,16 @@ export {
integrationManifestResolvedPayload,
integrationManifestValidatedPayload,
} from './integration-gates'
// ── Backend-integrity guard ───────────────────────────────────────────
// Distinguish "agent failed" from "eval ran blind against a stub or
// unconfigured backend." Required after every canonical eval so a 0/N
// pass-rate never silently masks a misconfigured runtime.
export type { BackendIntegrityReport } from './integrity/backend-integrity'
export {
assertRealBackend,
BackendIntegrityError,
summarizeBackendIntegrity,
} from './integrity/backend-integrity'
export {
adversarialJudge,
codeExecutionJudge,
Expand Down
183 changes: 183 additions & 0 deletions src/integrity/backend-integrity.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,183 @@
/**
* Backend-integrity guard: distinguish "agent failed" from "eval ran against
* a stub / unconfigured backend." Without this guard a canonical eval can
* silently report `0/N passed` and look like an agent-quality problem when
* the LLM was never actually called — the failure mode we just hit running
* the 4-vertical parallel eval (legal-sandbox-stub returned hard-coded 33-104
* char strings; gtm/creative defaulted to a cli-bridge that wasn't running).
*
* The shape:
*
* const report = summarizeBackendIntegrity(records)
* assertRealBackend(records) // throws BackendIntegrityError if 100% stub
*
* A record is "stub-mode" if its `tokenUsage.input === 0 && tokenUsage.output === 0`.
* (`costUsd` alone is unreliable — some backends successfully call LLMs but
* don't propagate pricing, producing real tokens with $0 cost.)
*
* Verdicts:
* - `real` — no record is stub-mode (every record shows token usage)
* - `stub` — every record is stub-mode, or there are no records at all
* (eval ran blind)
* - `mixed` — some records real, some stub (partial backend failure;
* often the 429-cascade or auth-half-failed case)
*/

import { AgentEvalError } from '../errors'
import type { RunRecord } from '../run-record'

/**
 * Aggregate result of inspecting a batch of run records for signs that the
 * LLM backend was never actually exercised. Produced by
 * `summarizeBackendIntegrity` and attached to `BackendIntegrityError`.
 */
export interface BackendIntegrityReport {
/** Total records inspected (the length of the input batch). */
totalRecords: number
/** Records whose token usage is input === 0 AND output === 0 (a stub fingerprint). */
stubRecords: number
/** Records that are not stub-mode, i.e. input or output token usage is nonzero. */
realRecords: number
/**
 * Records where output > 0 but costUsd === 0 (real LLM, broken cost ledger).
 * These records are also counted in `realRecords`.
 */
uncostedRecords: number
/** Sum of input tokens across all records. */
totalInputTokens: number
/** Sum of output tokens across all records. */
totalOutputTokens: number
/** Sum of costUsd across all records. */
totalCostUsd: number
/**
 * Worst-case integrity verdict: `stub` when the batch is empty or every
 * record is stub-mode, `real` when no record is stub-mode, `mixed` otherwise.
 */
verdict: 'real' | 'mixed' | 'stub'
/** Human-readable diagnosis suitable for terminal output. */
diagnosis: string
}

/**
 * Raised when a backend-integrity assertion fails. The error is constructed
 * with the `'backend_integrity'` code, so callers can tell it apart from
 * other failures via `code === 'backend_integrity'` or an `instanceof` check.
 * The report that triggered the failure travels with the error.
 */
export class BackendIntegrityError extends AgentEvalError {
  /** Integrity report describing why the assertion was rejected. */
  public readonly report: BackendIntegrityReport

  /**
   * @param message Human-readable explanation of the failure.
   * @param report Integrity summary of the inspected records.
   */
  constructor(message: string, report: BackendIntegrityReport) {
    super('backend_integrity', message)
    this.report = report
  }
}

/**
 * A record is stub-mode only when it reports zero input tokens AND zero
 * output tokens. A record with either side nonzero counts as real activity.
 */
function isStubRecord(rec: RunRecord): boolean {
  const usage = rec.tokenUsage
  const noInput = usage.input === 0
  return noInput && usage.output === 0
}

/**
 * A record is "uncosted" when it produced output tokens yet carries a cost
 * of exactly zero, meaning tokens were reported but no price was attached.
 */
function isUncostedRecord(rec: RunRecord): boolean {
  const producedOutput = rec.tokenUsage.output > 0
  if (!producedOutput) return false
  return rec.costUsd === 0
}

/**
 * Build a {@link BackendIntegrityReport} for a batch of run records.
 *
 * Performs no I/O and no logging; it only tallies the records and derives a
 * verdict, leaving the reaction (warn, throw, gate CI, ...) to the caller.
 *
 * @param records Run records to inspect; may be empty.
 * @returns Counts, token/cost totals, the verdict, and a readable diagnosis.
 */
export function summarizeBackendIntegrity(
  records: ReadonlyArray<RunRecord>,
): BackendIntegrityReport {
  const counts = {
    totalRecords: records.length,
    stubRecords: 0,
    realRecords: 0,
    uncostedRecords: 0,
    totalInputTokens: 0,
    totalOutputTokens: 0,
    totalCostUsd: 0,
  }

  for (const record of records) {
    counts.totalInputTokens += record.tokenUsage.input
    counts.totalOutputTokens += record.tokenUsage.output
    counts.totalCostUsd += record.costUsd

    // Each record lands in exactly one of the stub / real buckets.
    if (isStubRecord(record)) {
      counts.stubRecords += 1
    } else {
      counts.realRecords += 1
    }
    // "Uncosted" is tracked independently of the stub / real split.
    if (isUncostedRecord(record)) {
      counts.uncostedRecords += 1
    }
  }

  let verdict: BackendIntegrityReport['verdict']
  if (counts.totalRecords === 0 || counts.stubRecords === counts.totalRecords) {
    // An empty batch is treated like an all-stub batch.
    verdict = 'stub'
  } else if (counts.stubRecords === 0) {
    verdict = 'real'
  } else {
    verdict = 'mixed'
  }

  const summary = { ...counts, verdict }
  return { ...summary, diagnosis: buildDiagnosis(summary) }
}

/**
 * Turn the numeric part of an integrity report into a single-line,
 * human-readable explanation.
 *
 * @param r Report fields without the diagnosis itself.
 * @returns One sentence group describing the verdict and likely causes.
 */
function buildDiagnosis(r: Omit<BackendIntegrityReport, 'diagnosis'>): string {
  const { totalRecords, stubRecords, uncostedRecords, totalInputTokens, totalOutputTokens } = r

  // The empty batch gets its own message, ahead of any verdict handling.
  if (totalRecords === 0) {
    return 'no records — eval produced zero runs; backend likely failed before first turn'
  }

  // Whole-number percentage of the batch, as a string.
  const shareOfTotal = (count: number): string => ((count / totalRecords) * 100).toFixed(0)

  switch (r.verdict) {
    case 'stub': {
      const sentences = [
        `all ${totalRecords} records have zero token usage — the LLM backend was never called.`,
        'common causes: --backend sandbox without a sandbox bridge running; stub model returning hard-coded strings;',
        'auth misconfigured so requests were silently dropped before the LLM. Re-run with --backend tcloud and TANGLE_API_KEY set,',
        'or boot the cli-bridge / sandbox before invoking the eval.',
      ]
      return sentences.join(' ')
    }
    case 'mixed': {
      const pct = shareOfTotal(stubRecords)
      const sentences = [
        `${stubRecords}/${totalRecords} records (${pct}%) have zero token usage — the backend partially failed.`,
        'common causes: rate-limit cascade (429s after the first N personas);',
        'transient auth expiry mid-run; provider outage. Treat the affected records as missing data, not agent failures.',
      ]
      return sentences.join(' ')
    }
    default: {
      // Remaining verdict is 'real'; only the cost ledger can still be off.
      if (uncostedRecords > 0) {
        const pct = shareOfTotal(uncostedRecords)
        const sentences = [
          `${totalRecords} records with real LLM activity (in=${totalInputTokens}, out=${totalOutputTokens} tokens).`,
          `${uncostedRecords} (${pct}%) have output tokens but costUsd=0 — cost ledger is mis-wired (no input-token`,
          'propagation from the runtime stream into RunRecord).',
        ]
        return sentences.join(' ')
      }
      return `${totalRecords} records with real LLM activity (in=${totalInputTokens}, out=${totalOutputTokens} tokens, $${r.totalCostUsd.toFixed(4)}).`
    }
  }
}

/**
 * Summarize the records and throw when the backend looks fake.
 *
 * A `stub` verdict (every record shows zero token usage, or there are no
 * records) always throws. A `mixed` verdict is tolerated by default; strict
 * callers such as CI gates can pass `{ allowMixed: false }` to reject it too.
 *
 * @param records Run records produced by the eval.
 * @param opts `allowMixed` defaults to `true` when omitted.
 * @returns The integrity report when the assertion passes.
 * @throws BackendIntegrityError carrying the report when the verdict is
 *   rejected.
 */
export function assertRealBackend(
  records: ReadonlyArray<RunRecord>,
  opts: { allowMixed?: boolean } = {},
): BackendIntegrityReport {
  const report = summarizeBackendIntegrity(records)
  const rejectMixed = !(opts.allowMixed ?? true)

  let failure: string | undefined
  if (report.verdict === 'stub') {
    failure = `backend-integrity: ran against a stub or unconfigured backend — ${report.diagnosis}`
  } else if (rejectMixed && report.verdict === 'mixed') {
    failure = `backend-integrity: partial backend failure rejected — ${report.diagnosis}`
  }

  if (failure !== undefined) {
    throw new BackendIntegrityError(failure, report)
  }
  return report
}
136 changes: 136 additions & 0 deletions tests/backend-integrity.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
import { describe, expect, it } from 'vitest'
import {
assertRealBackend,
BackendIntegrityError,
summarizeBackendIntegrity,
} from '../src/integrity/backend-integrity'
import type { RunRecord } from '../src/run-record'

/**
 * Test fixture: build a run record whose token usage and cost are supplied
 * by the caller. Every other field is a fixed placeholder, except `runId`,
 * which gets a random suffix.
 */
function makeRecord(input: number, output: number, costUsd: number): RunRecord {
  const runId = `r-${Math.random()}`
  const tokenUsage = { input, output }
  const outcome = { holdoutScore: 0.5, raw: {} }

  const record: RunRecord = {
    runId,
    experimentId: 'exp',
    candidateId: 'cand',
    seed: 0,
    model: 'test-model@2026-01',
    promptHash: 'a'.repeat(64),
    configHash: 'b'.repeat(64),
    commitSha: 'c'.repeat(40),
    wallMs: 100,
    costUsd,
    tokenUsage,
    outcome,
    splitTag: 'holdout',
    scenarioId: 'scn',
  }
  return record
}

// Suite for the backend-integrity guard. The first group pins how
// summarizeBackendIntegrity classifies batches; the second pins when
// assertRealBackend throws and what the thrown error carries.
// makeRecord arguments are (inputTokens, outputTokens, costUsd).
describe('backend-integrity', () => {
describe('summarizeBackendIntegrity', () => {
// Every record has input=0 and output=0, so the whole batch is stub-mode.
it('classifies all-zero token usage as stub', () => {
const r = summarizeBackendIntegrity([
makeRecord(0, 0, 0),
makeRecord(0, 0, 0),
makeRecord(0, 0, 0),
])
expect(r.verdict).toBe('stub')
expect(r.stubRecords).toBe(3)
expect(r.realRecords).toBe(0)
expect(r.diagnosis).toContain('LLM backend was never called')
})

// Both records here carry token usage; no stub record is present.
it('classifies any real activity as real', () => {
const r = summarizeBackendIntegrity([
makeRecord(500, 1000, 0.01),
makeRecord(600, 1200, 0.012),
])
expect(r.verdict).toBe('real')
expect(r.realRecords).toBe(2)
expect(r.stubRecords).toBe(0)
expect(r.totalInputTokens).toBe(1100)
expect(r.totalOutputTokens).toBe(2200)
})

// One real record plus two stub records yields 'mixed'; the diagnosis
// reports the stub share as a rounded whole percentage.
it('classifies partial stub-mode as mixed', () => {
const r = summarizeBackendIntegrity([
makeRecord(500, 1000, 0.01),
makeRecord(0, 0, 0),
makeRecord(0, 0, 0),
])
expect(r.verdict).toBe('mixed')
expect(r.stubRecords).toBe(2)
expect(r.realRecords).toBe(1)
expect(r.diagnosis).toContain('2/3 records (67%) have zero')
})

// Tokens were produced but costUsd is 0: still 'real', flagged as uncosted.
it('flags output-tokens-without-cost as uncosted', () => {
const r = summarizeBackendIntegrity([
makeRecord(500, 1000, 0),
makeRecord(500, 1000, 0),
])
expect(r.verdict).toBe('real')
expect(r.uncostedRecords).toBe(2)
expect(r.diagnosis).toContain('cost ledger is mis-wired')
})

// An empty batch is reported as 'stub' with a dedicated diagnosis.
it('handles empty input', () => {
const r = summarizeBackendIntegrity([])
expect(r.verdict).toBe('stub')
expect(r.totalRecords).toBe(0)
expect(r.diagnosis).toContain('no records')
})

// Stub-mode requires BOTH counters to be zero; output alone makes it real.
it('does not count input=0+output>0 as stub (partial usage propagation)', () => {
const r = summarizeBackendIntegrity([makeRecord(0, 1000, 0)])
expect(r.verdict).toBe('real')
expect(r.stubRecords).toBe(0)
expect(r.uncostedRecords).toBe(1)
})
})

describe('assertRealBackend', () => {
it('throws on pure-stub verdict', () => {
expect(() => assertRealBackend([makeRecord(0, 0, 0)])).toThrow(BackendIntegrityError)
})

// Empty input summarizes to 'stub', so it is rejected as well.
it('throws on empty input', () => {
expect(() => assertRealBackend([])).toThrow(BackendIntegrityError)
})

it('passes through on real verdict', () => {
const r = assertRealBackend([makeRecord(500, 1000, 0.01)])
expect(r.verdict).toBe('real')
})

// With no options, a mixed batch is returned rather than thrown.
it('allows mixed by default', () => {
const r = assertRealBackend([
makeRecord(500, 1000, 0.01),
makeRecord(0, 0, 0),
])
expect(r.verdict).toBe('mixed')
})

// Strict mode: the same mixed batch is rejected once allowMixed is false.
it('rejects mixed when allowMixed=false', () => {
expect(() =>
assertRealBackend(
[makeRecord(500, 1000, 0.01), makeRecord(0, 0, 0)],
{ allowMixed: false },
),
).toThrow(BackendIntegrityError)
})

// Manual try/catch so the code and report attached to the thrown error
// can be inspected directly.
it('thrown error carries the report and the right code', () => {
try {
assertRealBackend([makeRecord(0, 0, 0), makeRecord(0, 0, 0)])
expect.fail('should have thrown')
} catch (e) {
expect(e).toBeInstanceOf(BackendIntegrityError)
if (e instanceof BackendIntegrityError) {
expect(e.code).toBe('backend_integrity')
expect(e.report.verdict).toBe('stub')
expect(e.report.totalRecords).toBe(2)
}
}
})
})
})
Loading