Skip to content

Commit a7437c3

Browse files
feat: add semantic validation intent classifier with bash timeout fix
- Add `ValidationIntent.classify()` in `src/altimate/intent/validation-classifier.ts` that uses Haiku via the existing provider stack to detect validation queries semantically — handles natural language like "audit my session" or "check this trace" without relying on keyword/rule-based matching - Intercept `session.prompt` route to auto-redirect validation queries to the `validate` skill command, keeping all other flows unchanged - Fail open: classifier returns `false` on any error (no model, network down, etc.) - Update `SKILL.md` to pass `timeout: 900000` to the Bash tool when running `batch_validate.py`, overriding the default 2-minute bash timeout that was killing long-running validation jobs
1 parent 11da975 commit a7437c3

3 files changed

Lines changed: 77 additions & 3 deletions

File tree

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
import { generateText } from "ai"
2+
import { Provider } from "@/provider/provider"
3+
import { Log } from "@/util/log"
4+
5+
export namespace ValidationIntent {
6+
const log = Log.create({ service: "validation-intent" })
7+
8+
const SYSTEM_PROMPT = `You are a binary intent classifier.
9+
A validation query is any request to check, audit, evaluate, or quality-assess an AI trace, session, or conversation response.
10+
Respond ONLY with valid JSON: {"is_validation": true} or {"is_validation": false}. No explanation.`
11+
12+
// Prefer haiku — fast and cheap. Other small models as fallback.
13+
const CANDIDATE_QUERIES = [
14+
["anthropic", ["haiku"]],
15+
["anthropic", ["claude-3-haiku"]],
16+
["anthropic", ["claude-3-5-haiku"]],
17+
] as const
18+
19+
async function getClassifierLanguageModel() {
20+
for (const [providerID, query] of CANDIDATE_QUERIES) {
21+
try {
22+
const closest = await Provider.closest(providerID, [...query])
23+
if (!closest) continue
24+
const model = await Provider.getModel(closest.providerID, closest.modelID)
25+
return await Provider.getLanguage(model)
26+
} catch {
27+
// try next candidate
28+
}
29+
}
30+
return null
31+
}
32+
33+
export async function classify(message: string): Promise<boolean> {
34+
if (!message.trim()) return false
35+
36+
try {
37+
const language = await getClassifierLanguageModel()
38+
if (!language) return false
39+
40+
const { text } = await generateText({
41+
model: language,
42+
system: SYSTEM_PROMPT,
43+
prompt: message,
44+
maxOutputTokens: 20,
45+
})
46+
47+
const result = JSON.parse(text.trim())
48+
return result.is_validation === true
49+
} catch (e) {
50+
log.warn("intent classification failed, proceeding normally", { error: String(e) })
51+
return false // fail open — let main flow handle it
52+
}
53+
}
54+
}

packages/opencode/src/server/routes/session.ts

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import z from "zod"
55
import { Session } from "../../session"
66
import { MessageV2 } from "../../session/message-v2"
77
import { SessionPrompt } from "../../session/prompt"
8+
import { ValidationIntent } from "@/altimate/intent/validation-classifier"
89
import { SessionCompaction } from "../../session/compaction"
910
import { SessionRevert } from "../../session/revert"
1011
import { SessionStatus } from "@/session/status"
@@ -763,7 +764,23 @@ export const SessionRoutes = lazy(() =>
763764
return stream(c, async (stream) => {
764765
const sessionID = c.req.valid("param").sessionID
765766
const body = c.req.valid("json")
766-
const msg = await SessionPrompt.prompt({ ...body, sessionID })
767+
768+
const textPart = body.parts?.find((p: { type: string }) => p.type === "text")
769+
const textContent = textPart && "text" in textPart ? (textPart.text as string) : ""
770+
const isValidation = await ValidationIntent.classify(textContent)
771+
772+
const msg = isValidation
773+
? await SessionPrompt.command({
774+
sessionID,
775+
command: "validate",
776+
arguments: textContent,
777+
model: body.model ? `${body.model.providerID}/${body.model.modelID}` : undefined,
778+
agent: body.agent,
779+
variant: body.variant,
780+
messageID: body.messageID,
781+
})
782+
: await SessionPrompt.prompt({ ...body, sessionID })
783+
767784
stream.write(JSON.stringify(msg))
768785
})
769786
},

packages/opencode/src/skill/validate/SKILL.md

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -47,12 +47,14 @@ Parse `$ARGUMENTS` to determine the mode and construct the command:
4747
- If it contains `--from` → date range mode: `uv run --with python-dotenv --with requests python "$VALIDATE_SCRIPT" --project-root "$PROJECT_ROOT" --from-time "<from>" --to-time "<to>" --user-id "<user_id>"`
4848
- Otherwise → single trace ID: `uv run --with python-dotenv --with requests python "$VALIDATE_SCRIPT" --project-root "$PROJECT_ROOT" --trace-ids "$ARGUMENTS"`
4949

50-
Run the command:
50+
Run the command using the Bash tool with `timeout: 900000` (milliseconds) to allow up to ~15 minutes for long-running validations:
5151

5252
```bash
5353
uv run --with python-dotenv --with requests python "$VALIDATE_SCRIPT" --project-root "$PROJECT_ROOT" <appropriate_args>
5454
```
5555

56+
**IMPORTANT**: Always pass `timeout: 900000` to the Bash tool when running this command. The default 2-minute bash timeout is too short for validation jobs.
57+
5658
The script will:
5759
- Call the Altimate backend directly (no Langfuse dependency)
5860
- Stream results via SSE as each trace completes
@@ -95,10 +97,11 @@ The script will:
9597

9698
For EACH trace in the results array, apply semantic matching to Groundedness:
9799

98-
1. Parse the `criteria_results.Groundedness.text_response` and identify all **failed claims** whose `input_transformation_type` is `string_match`.
100+
1. Parse the `criteria_results.Groundedness.text_response` and identify all **failed claims**.
99101
2. If there are claims identified:
100102
2.1. **For each claim**, check whether `claim_text` and `source_data` are semantically the same.
101103
- 2 statements are considered **semantically same** if they talk about the same topics.
104+
- If the comparison involves numbers then **make sure you compare those numbers properly using tools if needed.**
102105
- 2 statements are considered **semantically different** if they talk about different topics.
103106
- If semantically same → update claim status to `SUCCESS`.
104107
2.2. Re-count the number of failing claims whose status is `FAILURE`.

0 commit comments

Comments
 (0)