tangle-network · tangletools · May 20, 2026 · May 20, 2026
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,62 @@
 # Changelog
 
+## 0.31.0 — 2026-05-20
+
+### `JudgeScoresRecord` on `RunRecord.outcome` — substrate-blessed ensemble shape
+
+Multi-judge consumers (forge-chat in agent-builder, and four sibling
+product agents on the same trajectory) compute per-judge per-dimension
+scores per cell, then collapse to a single composite for the gate. The
+substrate's `RunOutcome` only had a slot for the composite plus a free
+`raw: Record<string, number>` bag. Consumers were either dropping the
+breakdown on the floor or smuggling it through stringly-typed `raw`
+keys like `judge_kimi_helpfulness` — neither survives a corpus-IRR run
+(0.27.2's `corpusInterRaterAgreement` expects structured per-judge
+per-dim records, not parsed strings).
+
+This release ships the typed slot so every product agent speaks the
+same shape, and the inter-rater primitives consume it without a
+per-consumer adapter.
+
+### Added
+
+- **`JudgeScoresRecord`** (`src/run-record.ts`) — `perJudge[judgeId][dim]`
+  is the canonical store; `perDimMean` and `composite` are precomputed
+  projections so reporters and IRR primitives don't repeat the
+  aggregation; `failedJudges?: string[]` records dead-judge ids
+  explicitly (no inferring partial-failure from missing keys);
+  `notes?: string` carries panel prose.
+- **`RunOutcome.judgeScores?: JudgeScoresRecord`** — optional.
+  Single-judge or scalar-only runs leave it unset; ensemble runs populate it.
+- **`CampaignRunOutcome.judgeScores?: JudgeScoresRecord`** — runners
+  return it on the per-cell outcome; `runEvalCampaign` threads it onto
+  the resulting `RunRecord.outcome.judgeScores` without coercion.
+
+### Validator extended
+
+`validateRunRecord` validates `outcome.judgeScores` when present.
+Every `perJudge[judge][dim]`, every `perDimMean[dim]`, and the
+`composite` must be a finite number — the NaN-as-silent-zero bug class
+banned by `CLAUDE.md` cannot pass the boundary. `failedJudges` must be
+an array of non-empty strings; `notes` must be a string. Round-trip
+tested in `tests/run-record.test.ts`.
+
+### Fail-loud contract
+
+A judge that throws lands in `failedJudges` by id, not as a silent zero
+in `perJudge`. The composite is computed over surviving judges only;
+the partial-failure signal is preserved through to the gate.
+`tests/eval-campaign.test.ts` covers the four shapes (full, partial,
+missing, with notes) plus an explicit fail-loud case where one judge
+throws and the run record carries `failedJudges: ['glm-5.1@...']`.
+
+### Consumer contract
+
+`tests/consumer-contract.test.ts` pins `JudgeScoresRecord` as a
+type-level export at the root entry. The 0.30.0 surface is preserved —
+the new field is additive on `RunOutcome` and the new type is a new
+export, so existing consumers stay green.
+
 ## 0.29.0 — 2026-05-19
 
 ### Analyst kinds + cross-run findings context

diff --git a/clients/python/pyproject.toml b/clients/python/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "agent-eval-rpc"
-version = "0.30.0"
+version = "0.31.0"
 description = "Python RPC client for @tangle-network/agent-eval — judge content against rubrics over HTTP or stdio RPC. Eval logic runs in the Node runtime; this package is a thin wire client."
 readme = "README.md"
 requires-python = ">=3.10"

diff --git a/clients/python/src/agent_eval_rpc/__init__.py b/clients/python/src/agent_eval_rpc/__init__.py
@@ -48,7 +48,7 @@
 try:
     __version__ = version("agent-eval-rpc")
 except PackageNotFoundError:
-    __version__ = "0.30.0"
+    __version__ = "0.31.0"
 
 __all__ = [
     "Client",

diff --git a/package.json b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@tangle-network/agent-eval",
-  "version": "0.30.0",
+  "version": "0.31.0",
   "description": "Substrate for self-improving agents: traces, verifiable rewards, preferences, GEPA / reflective mutation, auto-research, replay, sequential anytime-valid stats, and release gates.",
   "homepage": "https://github.com/tangle-network/agent-eval#readme",
   "repository": {

diff --git a/src/eval-campaign.ts b/src/eval-campaign.ts
@@ -41,6 +41,7 @@
 import { assertLlmRoute, type LlmClientOptions, type LlmRouteRequirements } from './llm-client'
 import { canonicalize, hashJson } from './pre-registration'
 import type {
+  JudgeScoresRecord,
   RunJudgeMetadata,
   RunOutcome,
   RunRecord,
@@ -120,6 +121,12 @@ export interface CampaignRunOutcome {
   failureMode?: string
   /** Optional judge metadata when a judge was used. */
   judgeMetadata?: RunJudgeMetadata
+  /**
+   * Optional per-judge / per-dim breakdown for ensemble-judged runs.
+   * Propagated to `outcome.judgeScores` on the resulting `RunRecord`.
+   * Single-judge or scalar-only runs leave this unset.
+   */
+  judgeScores?: JudgeScoresRecord
 }
 
 export type CampaignRunner<V> = (ctx: CampaignRunContext<V>) => Promise<CampaignRunOutcome>
@@ -457,6 +464,7 @@ export async function runEvalCampaign<V>(
     }
     if (splitTag === 'holdout') recordOutcome.holdoutScore = outcome.score
     else recordOutcome.searchScore = outcome.score
+    if (outcome.judgeScores !== undefined) recordOutcome.judgeScores = outcome.judgeScores
 
     const record: RunRecord = {
       runId,

diff --git a/src/index.ts b/src/index.ts
@@ -1087,6 +1087,7 @@ export { CallbackResearcher, NoopResearcher } from './researcher'
 // tournaments, adversarial, compute curves, auto-research — live on the
 // dedicated subpath: @tangle-network/agent-eval/rl
 export type {
+  JudgeScoresRecord,
   RunJudgeMetadata,
   RunOutcome,
   RunRecord,

diff --git a/src/run-record.ts b/src/run-record.ts
@@ -44,6 +44,42 @@ export interface RunJudgeMetadata {
   fallback: boolean
 }
 
+/**
+ * Per-judge / per-dimension breakdown for runs scored by an ensemble of
+ * judges over a multi-dimensional rubric.
+ *
+ * The collapsed `outcome.searchScore` / `holdoutScore` carries the
+ * composite the gate uses. The full breakdown belongs here so consumers
+ * can answer "which judge disagreed?", "which dimension dragged the
+ * composite down?", and "did half the panel fail?" without re-running.
+ *
+ * `perJudge[judgeId][dim]` is the canonical source; `perDimMean` and
+ * `composite` are convenience projections — derivable but precomputed so
+ * downstream IRR primitives (`interRaterReliability`,
+ * `corpusInterRaterAgreement`) and reporters don't have to repeat the
+ * aggregation.
+ *
+ * Fail-loud discipline: judges that errored out land in `failedJudges`
+ * by id. A missing key in `perJudge` is ambiguous (silent zero vs not
+ * run); the explicit list ensures a partial failure is recorded as such.
+ */
+export interface JudgeScoresRecord {
+  /** Scores keyed by judge id, then dimension: `{ "kimi-k2.6": { helpfulness: 0.8, clarity: 0.7 }, ... }`. */
+  perJudge: Record<string, Record<string, number>>
+  /** Mean score per dimension across judges. Convenience — derivable from `perJudge`. */
+  perDimMean: Record<string, number>
+  /** Composite mean across all dims and judges. Mirrors the score
+   *  the gate sees on `outcome.searchScore` / `holdoutScore`. */
+  composite: number
+  /** Judges that errored or returned an unparseable verdict. Recorded
+   *  by id (e.g. `['glm-5.1']`) so a partial-failure case is explicit,
+   *  not inferred from missing keys in `perJudge`. */
+  failedJudges?: string[]
+  /** Free-form notes the judges emitted (joined across judges or
+   *  first-judge only — consumer's choice). */
+  notes?: string
+}
+
 export interface RunOutcome {
   /** Score on the search/optimization split. Optional because a
    *  holdout-only evaluation only fills `holdoutScore`. */
@@ -55,6 +91,12 @@ export interface RunOutcome {
    *  pass/fail counters, latency stats, etc. Numeric only — keeps
    *  reporters honest. */
   raw: Record<string, number>
+  /** Per-judge / per-dim breakdown. Consumers writing ensemble
+   *  judgements populate this; substrate primitives like
+   *  `interRaterReliability` and `corpusInterRaterAgreement` accept
+   *  these records as input. Optional — single-judge or scalar-only
+   *  runs leave it unset. */
+  judgeScores?: JudgeScoresRecord
 }
 
 /**
@@ -242,6 +284,11 @@ export function validateRunRecord(input: unknown): RunRecord {
     expectFiniteNumber(v, `outcome.raw.${k}`)
   }
 
+  // Per-judge / per-dim breakdown, optional.
+  if (outRec.judgeScores !== undefined) {
+    validateJudgeScores(outRec.judgeScores, 'outcome.judgeScores')
+  }
+
   // Failure mode optional.
   if (obj.failureMode !== undefined) expectString(obj.failureMode, 'failureMode')
 
@@ -298,6 +345,61 @@ function expectFiniteNumber(value: unknown, path: string): void {
   }
 }
 
+/**
+ * Validates an `outcome.judgeScores` value. Arrays are rejected wherever a keyed
+ * object is required, because `typeof [] === 'object'` would otherwise let them through.
+ * @param value - Unvalidated candidate for `judgeScores`.
+ * @param path - Prefix for the location attached to each validation error.
+ * @throws RunRecordValidationError on the first malformed field.
+ */
+function validateJudgeScores(value: unknown, path: string): void {
+  if (value === null || typeof value !== 'object' || Array.isArray(value)) {
+    throw new RunRecordValidationError('judgeScores must be an object', path)
+  }
+  const rec = value as Record<string, unknown>
+  const perJudge = rec.perJudge
+  if (perJudge === null || typeof perJudge !== 'object' || Array.isArray(perJudge)) {
+    throw new RunRecordValidationError('perJudge must be an object', `${path}.perJudge`)
+  }
+  for (const [judgeId, dims] of Object.entries(perJudge as Record<string, unknown>)) {
+    if (dims === null || typeof dims !== 'object' || Array.isArray(dims)) {
+      throw new RunRecordValidationError(
+        'per-judge entry must be an object of dimension scores',
+        `${path}.perJudge.${judgeId}`,
+      )
+    }
+    for (const [dim, score] of Object.entries(dims as Record<string, unknown>)) {
+      expectFiniteNumber(score, `${path}.perJudge.${judgeId}.${dim}`)
+    }
+  }
+
+  const perDimMean = rec.perDimMean
+  if (perDimMean === null || typeof perDimMean !== 'object' || Array.isArray(perDimMean)) {
+    throw new RunRecordValidationError('perDimMean must be an object', `${path}.perDimMean`)
+  }
+  for (const [dim, mean] of Object.entries(perDimMean as Record<string, unknown>)) {
+    expectFiniteNumber(mean, `${path}.perDimMean.${dim}`)
+  }
+  expectFiniteNumber(rec.composite, `${path}.composite`)
+
+  if (rec.failedJudges !== undefined) {
+    const listPath = `${path}.failedJudges`
+    if (!Array.isArray(rec.failedJudges)) {
+      throw new RunRecordValidationError('failedJudges must be an array of strings', listPath)
+    }
+    for (const [i, id] of rec.failedJudges.entries()) {
+      if (typeof id !== 'string' || id.length === 0) {
+        const msg = 'failedJudges entry must be a non-empty string'
+        throw new RunRecordValidationError(msg, `${listPath}[${i}]`)
+      }
+    }
+  }
+
+  if (rec.notes !== undefined && typeof rec.notes !== 'string') {
+    throw new RunRecordValidationError('notes must be a string', `${path}.notes`)
+  }
+}
+
 /**
  * Heuristic snapshot check. Accepts:
  *   - `name@YYYY-MM-DD` (Anthropic style: `claude-sonnet-4-6@2025-04-15`)

diff --git a/tests/consumer-contract.test.ts b/tests/consumer-contract.test.ts
@@ -2,6 +2,7 @@ import { describe, expect, it } from 'vitest'
 import * as builderEval from '../src/builder-eval/index'
 import * as agentEval from '../src/index'
 import * as rl from '../src/rl/index'
+import type { JudgeScoresRecord, RunOutcome } from '../src/index'
 
 /**
  * Public-surface contract for `@tangle-network/agent-eval`.
@@ -109,4 +110,23 @@ describe('public-surface contract for consumers', () => {
       expect(proto instanceof Error, `${name} must extend Error`).toBe(true)
     }
   })
+
+  it('exposes JudgeScoresRecord as the canonical ensemble shape on RunOutcome', () => {
+    // Compile-time contract: the value below is annotated as
+    // `JudgeScoresRecord` and then placed in `RunOutcome.judgeScores`.
+    // Renaming the interface or removing the field from `RunOutcome`
+    // breaks the build — which shields forge-chat / multi-judge consumers.
+    const breakdown: JudgeScoresRecord = {
+      composite: 0.75,
+      perDimMean: { helpfulness: 0.8, clarity: 0.7 },
+      perJudge: { 'kimi-k2.6': { helpfulness: 0.8, clarity: 0.7 } },
+    }
+    const runOutcome: RunOutcome = {
+      raw: {},
+      holdoutScore: 0.75,
+      judgeScores: breakdown,
+    }
+    expect(runOutcome.judgeScores).toBe(breakdown)
+    expect(runOutcome.judgeScores?.composite).toBe(0.75)
+  })
 })