Skip to content

Commit a7437c3

Browse files
feat: add semantic validation intent classifier with bash timeout fix
- Add `ValidationIntent.classify()` in `src/altimate/intent/validation-classifier.ts` that uses Haiku via the existing provider stack to detect validation queries semantically — handles natural language like "audit my session" or "check this trace" without relying on keyword/rule-based matching - Intercept `session.prompt` route to auto-redirect validation queries to the `validate` skill command, keeping all other flows unchanged - Fail open: classifier returns `false` on any error (no model, network down, etc.) - Update `SKILL.md` to pass `timeout: 900000` to the Bash tool when running `batch_validate.py`, overriding the default 2-minute bash timeout that was killing long-running validation jobs
1 parent 11da975 commit a7437c3

3 files changed

Lines changed: 77 additions & 3 deletions

File tree

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
import { generateText } from "ai"
2+
import { Provider } from "@/provider/provider"
3+
import { Log } from "@/util/log"
4+
5+
export namespace ValidationIntent {
6+
const log = Log.create({ service: "validation-intent" })
7+
8+
const SYSTEM_PROMPT = `You are a binary intent classifier.
9+
A validation query is any request to check, audit, evaluate, or quality-assess an AI trace, session, or conversation response.
10+
Respond ONLY with valid JSON: {"is_validation": true} or {"is_validation": false}. No explanation.`
11+
12+
// Prefer haiku — fast and cheap. Other small models as fallback.
13+
const CANDIDATE_QUERIES = [
14+
["anthropic", ["haiku"]],
15+
["anthropic", ["claude-3-haiku"]],
16+
["anthropic", ["claude-3-5-haiku"]],
17+
] as const
18+
19+
async function getClassifierLanguageModel() {
20+
for (const [providerID, query] of CANDIDATE_QUERIES) {
21+
try {
22+
const closest = await Provider.closest(providerID, [...query])
23+
if (!closest) continue
24+
const model = await Provider.getModel(closest.providerID, closest.modelID)
25+
return await Provider.getLanguage(model)
26+
} catch {
27+
// try next candidate
28+
}
29+
}
30+
return null
31+
}
32+
33+
export async function classify(message: string): Promise<boolean> {
34+
if (!message.trim()) return false
35+
36+
try {
37+
const language = await getClassifierLanguageModel()
38+
if (!language) return false
39+
40+
const { text } = await generateText({
41+
model: language,
42+
system: SYSTEM_PROMPT,
43+
prompt: message,
44+
maxOutputTokens: 20,
45+
})
46+
47+
const result = JSON.parse(text.trim())
48+
return result.is_validation === true
49+
} catch (e) {
50+
log.warn("intent classification failed, proceeding normally", { error: String(e) })
51+
return false // fail open — let main flow handle it
52+
}
53+
}
54+
}

packages/opencode/src/server/routes/session.ts

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import z from "zod"
55
import { Session } from "../../session"
66
import { MessageV2 } from "../../session/message-v2"
77
import { SessionPrompt } from "../../session/prompt"
8+
import { ValidationIntent } from "@/altimate/intent/validation-classifier"
89
import { SessionCompaction } from "../../session/compaction"
910
import { SessionRevert } from "../../session/revert"
1011
import { SessionStatus } from "@/session/status"
@@ -763,7 +764,23 @@ export const SessionRoutes = lazy(() =>
763764
return stream(c, async (stream) => {
764765
const sessionID = c.req.valid("param").sessionID
765766
const body = c.req.valid("json")
766-
const msg = await SessionPrompt.prompt({ ...body, sessionID })
767+
768+
const textPart = body.parts?.find((p: { type: string }) => p.type === "text")
769+
const textContent = textPart && "text" in textPart ? (textPart.text as string) : ""
770+
const isValidation = await ValidationIntent.classify(textContent)
771+
772+
const msg = isValidation
773+
? await SessionPrompt.command({
774+
sessionID,
775+
command: "validate",
776+
arguments: textContent,
777+
model: body.model ? `${body.model.providerID}/${body.model.modelID}` : undefined,
778+
agent: body.agent,
779+
variant: body.variant,
780+
messageID: body.messageID,
781+
})
782+
: await SessionPrompt.prompt({ ...body, sessionID })
783+
767784
stream.write(JSON.stringify(msg))
768785
})
769786
},

packages/opencode/src/skill/validate/SKILL.md

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -47,12 +47,14 @@ Parse `$ARGUMENTS` to determine the mode and construct the command:
4747
- If it contains `--from` → date range mode: `uv run --with python-dotenv --with requests python "$VALIDATE_SCRIPT" --project-root "$PROJECT_ROOT" --from-time "<from>" --to-time "<to>" --user-id "<user_id>"`
4848
- Otherwise → single trace ID: `uv run --with python-dotenv --with requests python "$VALIDATE_SCRIPT" --project-root "$PROJECT_ROOT" --trace-ids "$ARGUMENTS"`
4949

50-
Run the command:
50+
Run the command using the Bash tool with `timeout: 900000` (milliseconds) to allow up to ~15 minutes for long-running validations:
5151

5252
```bash
5353
uv run --with python-dotenv --with requests python "$VALIDATE_SCRIPT" --project-root "$PROJECT_ROOT" <appropriate_args>
5454
```
5555

56+
**IMPORTANT**: Always pass `timeout: 900000` to the Bash tool when running this command. The default 2-minute bash timeout is too short for validation jobs.
57+
5658
The script will:
5759
- Call the Altimate backend directly (no Langfuse dependency)
5860
- Stream results via SSE as each trace completes
@@ -95,10 +97,11 @@ The script will:
9597

9698
For EACH trace in the results array, apply semantic matching to Groundedness:
9799

98-
1. Parse the `criteria_results.Groundedness.text_response` and identify all **failed claims** whose `input_transformation_type` is `string_match`.
100+
1. Parse the `criteria_results.Groundedness.text_response` and identify all **failed claims**.
99101
2. If there are claims identified:
100102
2.1. **For each claim**, check whether `claim_text` and `source_data` are semantically the same.
101103
- 2 statements are considered **semantically same** if they talk about the same topics.
104+
- If the comparison involves numbers then **make sure you compare those numbers properly using tools if needed.**
102105
- 2 statements are considered **semantically different** if they talk about different topics.
103106
- If semantically same → update claim status to `SUCCESS`.
104107
2.2. Re-count the number of failing claims whose status is `FAILURE`.

0 commit comments

Comments
 (0)