Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion clients/python/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "hatchling.build"

[project]
name = "agent-eval-rpc"
version = "0.29.1"
version = "0.30.0"
description = "Python RPC client for @tangle-network/agent-eval — judge content against rubrics over HTTP or stdio RPC. Eval logic runs in the Node runtime; this package is a thin wire client."
readme = "README.md"
requires-python = ">=3.10"
Expand Down
2 changes: 1 addition & 1 deletion clients/python/src/agent_eval_rpc/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@
try:
__version__ = version("agent-eval-rpc")
except PackageNotFoundError:
__version__ = "0.29.1"
__version__ = "0.30.0"

__all__ = [
"Client",
Expand Down
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "@tangle-network/agent-eval",
"version": "0.29.1",
"version": "0.30.0",
"description": "Substrate for self-improving agents: traces, verifiable rewards, preferences, GEPA / reflective mutation, auto-research, replay, sequential anytime-valid stats, and release gates.",
"homepage": "https://github.com/tangle-network/agent-eval#readme",
"repository": {
Expand Down
20 changes: 19 additions & 1 deletion src/analyst/finding-signature.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,32 @@
*/

import { z } from 'zod'
import { parseFindingSubject } from './finding-subject'

export const ANALYST_SEVERITIES = ['critical', 'high', 'medium', 'low', 'info'] as const

export const RawAnalystFindingSchema = z
.object({
severity: z.enum(ANALYST_SEVERITIES),
claim: z.string().min(1).max(2000),
subject: z.string().max(400).optional(),
/**
* Subject locus the finding is about. Validated at parse time
* against the documented grammar (`finding-subject.ts`). Findings
* with a malformed subject are rejected — they would have been
* silently skipped by every downstream adapter, so failing loud at
* parse time turns a hidden no-op into a kind-prompt audit signal.
*
* Optional because purely descriptive findings (no actionable
* locus) are legitimate; they just don't route through the
* KnowledgeAdapter / ImprovementAdapter.
*/
subject: z
.string()
.max(400)
.refine((s) => parseFindingSubject(s) !== null, {
message: 'subject does not match the finding-subject grammar',
})
.optional(),
evidence_uri: z.string().min(1).max(2000),
evidence_excerpt: z.string().max(2000).optional(),
confidence: z.number().min(0).max(1),
Expand Down
272 changes: 272 additions & 0 deletions src/analyst/finding-subject.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,272 @@
import { describe, expect, it } from 'vitest'
import {
FINDING_SUBJECT_KINDS,
type FindingSubject,
KIND_EXPECTED_SUBJECTS,
parseFindingSubject,
renderFindingSubject,
} from './finding-subject'

/**
 * `agent-knowledge:*` subjects: each well-formed input must parse to the
 * exact structure listed next to it, and malformed slugs / headings must
 * yield `null`.
 */
describe('parseFindingSubject — knowledge loci', () => {
  // [test title, raw subject, expected parse result]
  const accepted = [
    [
      'parses agent-knowledge:wiki:<slug>',
      'agent-knowledge:wiki:invoice-shape',
      { kind: 'knowledge.wiki', slug: 'invoice-shape' },
    ],
    [
      'parses agent-knowledge:wiki:<slug>#<heading>',
      'agent-knowledge:wiki:invoice-shape#line-items',
      { kind: 'knowledge.wiki', slug: 'invoice-shape', heading: 'line-items' },
    ],
    [
      'parses agent-knowledge:claim:<topic>',
      'agent-knowledge:claim:cap-table-shape',
      { kind: 'knowledge.claim', topic: 'cap-table-shape' },
    ],
    [
      'parses agent-knowledge:raw:<source-id>',
      'agent-knowledge:raw:irs-pub-501-2024',
      { kind: 'knowledge.raw', sourceId: 'irs-pub-501-2024' },
    ],
    [
      'parses agent-knowledge:stale:<slug>',
      'agent-knowledge:stale:old-vat-rates',
      { kind: 'knowledge.stale', slug: 'old-vat-rates' },
    ],
  ] as const

  for (const [title, raw, parsed] of accepted) {
    it(title, () => {
      expect(parseFindingSubject(raw)).toEqual(parsed)
    })
  }

  // [test title, raw subjects that must all be rejected]
  const rejected = [
    [
      'rejects malformed wiki slug (uppercase / underscore)',
      ['agent-knowledge:wiki:InvoiceShape', 'agent-knowledge:wiki:invoice_shape'],
    ],
    ['rejects malformed wiki anchor heading', ['agent-knowledge:wiki:slug#Heading_With_Caps']],
  ] as const

  for (const [title, raws] of rejected) {
    it(title, () => {
      for (const raw of raws) {
        expect(parseFindingSubject(raw)).toBeNull()
      }
    })
  }
})

/**
 * Runtime-surface subjects (`system-prompt:`, `tool-doc:`, `new-tool:`,
 * `rag:`, `memory:`, `scaffolding:`, `output-schema:`): accepted inputs
 * with their expected parse results, followed by inputs that must be
 * rejected with `null`.
 */
describe('parseFindingSubject — runtime surfaces', () => {
  // [test title, raw subject, expected parse result]
  const accepted = [
    [
      'parses system-prompt:<section> with kebab section',
      'system-prompt:request-classification',
      { kind: 'system-prompt', section: 'request-classification' },
    ],
    [
      'parses system-prompt:<section> with free-form section text',
      'system-prompt:Tool Selection',
      { kind: 'system-prompt', section: 'Tool Selection' },
    ],
    ['parses tool-doc:<tool>', 'tool-doc:list_invoices', { kind: 'tool-doc', tool: 'list_invoices' }],
    [
      'parses tool-doc:<tool>:<aspect>',
      'tool-doc:list_invoices:examples',
      { kind: 'tool-doc', tool: 'list_invoices', aspect: 'examples' },
    ],
    ['parses new-tool:<name>', 'new-tool:diff_csv', { kind: 'new-tool', name: 'diff_csv' }],
    [
      'parses rag:<corpus>:<doc>',
      'rag:irs-rulings:rev-rul-2024-12',
      { kind: 'rag', corpus: 'irs-rulings', docId: 'rev-rul-2024-12' },
    ],
    ['parses memory:<key>', 'memory:last-customer-id', { kind: 'memory', key: 'last-customer-id' }],
    [
      'parses scaffolding:<concern>',
      'scaffolding:retry-policy',
      { kind: 'scaffolding', concern: 'retry-policy' },
    ],
    [
      'parses output-schema:<field>',
      'output-schema:filing_year',
      { kind: 'output-schema', field: 'filing_year' },
    ],
  ] as const

  for (const [title, raw, parsed] of accepted) {
    it(title, () => {
      expect(parseFindingSubject(raw)).toEqual(parsed)
    })
  }

  // [test title, raw subjects that must all be rejected]
  const rejected = [
    ['rejects tool-doc with uppercase tool name', ['tool-doc:ListInvoices']],
    ['rejects new-tool with empty name', ['new-tool:']],
    ['rejects rag without corpus or doc id', ['rag:irs-rulings', 'rag:irs-rulings:']],
  ] as const

  for (const [title, raws] of rejected) {
    it(title, () => {
      for (const raw of raws) {
        expect(parseFindingSubject(raw)).toBeNull()
      }
    })
  }
})

/**
 * Stale-signal subjects: `websearch:outdated:<topic>` and
 * `prior-run-summary:<topic>` must each parse to a `{ kind, topic }` pair.
 */
describe('parseFindingSubject — stale signals', () => {
  // [test title, raw subject, expected parse result]
  const accepted = [
    [
      'parses websearch:outdated:<topic>',
      'websearch:outdated:capital-gains-rates-2023',
      { kind: 'websearch.outdated', topic: 'capital-gains-rates-2023' },
    ],
    [
      'parses prior-run-summary:<topic>',
      'prior-run-summary:cost-basis-method',
      { kind: 'prior-run-summary', topic: 'cost-basis-method' },
    ],
  ] as const

  for (const [title, raw, parsed] of accepted) {
    it(title, () => {
      expect(parseFindingSubject(raw)).toEqual(parsed)
    })
  }
})

/**
 * Bare (prefix-less) subjects are treated as cluster labels: kebab-case
 * labels parse to `{ kind: 'cluster', label }`, while labels containing
 * whitespace or uppercase letters, or an 81-character label, are rejected.
 */
describe('parseFindingSubject — cluster labels (failure-mode)', () => {
  // [test title, label that must be accepted verbatim]
  const accepted = [
    ['parses a kebab-case cluster label', 'tool-call-loop'],
    ['parses a long but valid label', 'auth-revoked-mid-run'],
  ] as const

  for (const [title, label] of accepted) {
    it(title, () => {
      expect(parseFindingSubject(label)).toEqual({
        kind: 'cluster',
        label,
      })
    })
  }

  // [test title, input that must be rejected]
  const rejected = [
    ['rejects a cluster label with whitespace', 'tool call loop'],
    ['rejects a cluster label with uppercase letters', 'ToolCallLoop'],
    ['rejects an overly long label', 'a'.repeat(81)],
  ] as const

  for (const [title, raw] of rejected) {
    it(title, () => {
      expect(parseFindingSubject(raw)).toBeNull()
    })
  }
})

/**
 * Degenerate inputs: missing values, blank strings, prose, and unknown
 * prefixes must all yield `null`; surrounding whitespace on an otherwise
 * valid subject must be ignored.
 */
describe('parseFindingSubject — boundary cases', () => {
  // [test title, input that must produce null]
  const nullCases = [
    ['returns null for undefined', undefined],
    ['returns null for null', null],
    ['returns null for empty string', ''],
    ['returns null for whitespace-only string', ' '],
    ['returns null for prose subject ("fix the prompt")', 'fix the prompt'],
    ['returns null for unknown prefix', 'unknown-prefix:foo'],
  ] as const

  for (const [title, input] of nullCases) {
    it(title, () => {
      expect(parseFindingSubject(input)).toBeNull()
    })
  }

  it('trims leading/trailing whitespace before parsing', () => {
    const padded = ' system-prompt:request-classification '
    const parsed = parseFindingSubject(padded)
    expect(parsed).toEqual({
      kind: 'system-prompt',
      section: 'request-classification',
    })
  })
})

/**
 * Round-trip property: rendering a structured subject and parsing the
 * rendered string must give back a structure equal to the original.
 */
describe('renderFindingSubject', () => {
  it('round-trips every parseable subject', () => {
    // One sample per subject shape exercised by this round-trip check.
    const subjects: FindingSubject[] = [
      { kind: 'knowledge.wiki', slug: 'invoice-shape' },
      { kind: 'knowledge.wiki', slug: 'invoice-shape', heading: 'line-items' },
      { kind: 'knowledge.claim', topic: 'cap-table-shape' },
      { kind: 'knowledge.raw', sourceId: 'irs-pub-501-2024' },
      { kind: 'knowledge.stale', slug: 'old-vat-rates' },
      { kind: 'system-prompt', section: 'request-classification' },
      { kind: 'tool-doc', tool: 'list_invoices' },
      { kind: 'tool-doc', tool: 'list_invoices', aspect: 'examples' },
      { kind: 'new-tool', name: 'diff_csv' },
      { kind: 'rag', corpus: 'irs-rulings', docId: 'rev-rul-2024-12' },
      { kind: 'memory', key: 'last-customer-id' },
      { kind: 'scaffolding', concern: 'retry-policy' },
      { kind: 'output-schema', field: 'filing_year' },
      { kind: 'websearch.outdated', topic: 'capital-gains-rates-2023' },
      { kind: 'prior-run-summary', topic: 'cost-basis-method' },
      { kind: 'cluster', label: 'tool-call-loop' },
    ]

    subjects.forEach((subject) => {
      const roundTripped = parseFindingSubject(renderFindingSubject(subject))
      expect(roundTripped).toEqual(subject)
    })
  })
})

/**
 * Invariants of the `KIND_EXPECTED_SUBJECTS` table, which maps an analyst
 * kind id to the subject variants that kind is expected to emit.
 */
describe('KIND_EXPECTED_SUBJECTS', () => {
  it('covers every emitted kind in DEFAULT_TRACE_ANALYST_KINDS', () => {
    // NOTE(review): the expected ids are hard-coded here rather than derived
    // from DEFAULT_TRACE_ANALYST_KINDS, so this list must be kept in sync by
    // hand when a kind is added or removed — confirm against that constant.
    expect(Object.keys(KIND_EXPECTED_SUBJECTS).sort()).toEqual(
      ['failure-mode', 'improvement', 'knowledge-gap', 'knowledge-poisoning'].sort(),
    )
  })

  it('failure-mode is the ONLY kind that emits cluster subjects', () => {
    for (const [kindId, allowed] of Object.entries(KIND_EXPECTED_SUBJECTS)) {
      if (kindId === 'failure-mode') {
        expect(allowed).toContain('cluster')
      } else {
        expect(allowed).not.toContain('cluster')
      }
    }
  })

  it('every expected variant is a known FindingSubject kind', () => {
    for (const allowed of Object.values(KIND_EXPECTED_SUBJECTS)) {
      for (const variant of allowed) {
        expect(FINDING_SUBJECT_KINDS).toContain(variant)
      }
    }
  })

  it('improvement does not include websearch.outdated / prior-run-summary (stale signals are a knowledge-poisoning concern)', () => {
    const improvement = KIND_EXPECTED_SUBJECTS.improvement
    // Assert presence explicitly instead of silencing the checker with `!`:
    // a missing `improvement` entry then fails here with a clear message
    // rather than surfacing as an opaque matcher error on `undefined`.
    expect(improvement).toBeDefined()
    expect(improvement).not.toContain('websearch.outdated')
    expect(improvement).not.toContain('prior-run-summary')
  })
})
Loading
Loading