AgentWorkforce · khaliqgant · May 9, 2026 · May 8, 2026 · May 8, 2026 · May 9, 2026
diff --git a/.github/workflows/ricky-evals.yml b/.github/workflows/ricky-evals.yml
@@ -0,0 +1,114 @@
+name: Ricky Evals
+
+# Triggers: pull requests and pushes to main that touch the workflow itself, the eval
+# suites/scripts, docs and specs, the listed source directories, or the npm manifests.
+on:
+  pull_request:
+    paths:
+      - ".github/workflows/ricky-evals.yml"
+      - "evals/**"
+      - "scripts/evals/**"
+      - "AGENTS.md"
+      - "README.md"
+      - "SPEC.md"
+      - "docs/**"
+      - "specs/**"
+      - "src/cloud/**"
+      - "src/local/**"
+      - "src/product/**"
+      - "src/runtime/**"
+      - "src/surfaces/**"
+      - "src/shared/**"
+      - "workflows/shared/**"
+      - "workflows/meta/spec/**"
+      - "package.json"
+      - "package-lock.json"
+  push:
+    branches:
+      - main
+    # Same path filter as the pull_request trigger above; keep the two lists in sync.
+    paths:
+      - ".github/workflows/ricky-evals.yml"
+      - "evals/**"
+      - "scripts/evals/**"
+      - "AGENTS.md"
+      - "README.md"
+      - "SPEC.md"
+      - "docs/**"
+      - "specs/**"
+      - "src/cloud/**"
+      - "src/local/**"
+      - "src/product/**"
+      - "src/runtime/**"
+      - "src/surfaces/**"
+      - "src/shared/**"
+      - "workflows/shared/**"
+      - "workflows/meta/spec/**"
+      - "package.json"
+      - "package-lock.json"
+  # Manual trigger; no inputs are declared.
+  workflow_dispatch:
+
+# One concurrency group per ref: a newer run on the same ref cancels the run in progress.
+concurrency:
+  group: ricky-evals-${{ github.ref }}
+  cancel-in-progress: true
+
+permissions:
+  contents: read
+  # Requested for the "Comment human-review cases" step, which passes the workflow token
+  # to scripts/evals/ci-review-comment.mjs so it can create or update a PR comment.
+  pull-requests: write
+
+env:
+  # Consumed by the "Setup Node.js" step via env.NODE_VERSION.
+  NODE_VERSION: "22.14.0"
+  NPM_CONFIG_FUND: "false"
+  # NOTE(review): presumably read by the eval runner to pick the OpenRouter model — confirm in scripts/evals.
+  RICKY_EVAL_OPENROUTER_MODEL: openai/gpt-oss-120b:free
+
+# Single job: install, run the provider-backed evals, then always summarize, comment and upload.
+jobs:
+  evals:
+    name: Provider-backed evals
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Setup Node.js
+        uses: actions/setup-node@v4
+        with:
+          node-version: ${{ env.NODE_VERSION }}
+          cache: npm
+          cache-dependency-path: package-lock.json
+
+      - name: Install dependencies
+        run: npm ci
+
+      # Runs for non-PR events and for PRs whose head repository is this repository.
+      # NOTE(review): presumably skipped for forks because secrets are not exposed there — confirm.
+      - name: Require OpenRouter API key
+        if: github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name == github.repository
+        env:
+          OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
+        # Fail with an explicit message when the secret is empty.
+        run: |
+          if [ -z "${OPENROUTER_API_KEY}" ]; then
+            echo "OPENROUTER_API_KEY GitHub secret is required for provider-backed Ricky evals."
+            exit 1
+          fi
+
+      # Same condition as the key check above; the extra arguments are appended to the
+      # `evals:provider` npm script.
+      - name: Run provider evals
+        if: github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name == github.repository
+        env:
+          OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
+        run: npm run evals:provider -- --trials 1
+
+      # always(): still runs when an earlier step failed.
+      - name: Summarize evals
+        if: always()
+        run: node scripts/evals/ci-summary.mjs
+
+      # Pull requests only; the script reads GITHUB_TOKEN and PR_NUMBER to upsert its comment.
+      - name: Comment human-review cases
+        if: always() && github.event_name == 'pull_request'
+        env:
+          GITHUB_TOKEN: ${{ github.token }}
+          PR_NUMBER: ${{ github.event.pull_request.number }}
+        run: node scripts/evals/ci-review-comment.mjs
+
+      # Uploads the run directory; a missing directory is not an error (if-no-files-found: ignore).
+      - name: Upload eval artifacts
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: ricky-eval-run
+          path: .ricky/evals/runs/
+          retention-days: 14
+          if-no-files-found: ignore
diff --git a/package.json b/package.json
@@ -44,6 +44,7 @@
     "test": "npm run bundle && vitest run",
     "evals:compile": "node scripts/evals/compile-ricky-evals.mjs",
     "evals": "npm run evals:compile && node scripts/evals/run-ricky-evals.mjs",
+    "evals:provider": "npm run build && npm run evals -- --provider --executor openrouter",
     "evals:opencode": "npm run evals:compile && node scripts/evals/run-ricky-evals.mjs --provider --executor opencode",
     "evals:list": "npm run evals:compile && node scripts/evals/run-ricky-evals.mjs --list",
     "evals:summary": "node scripts/evals/summarize-ricky-evals.mjs",

diff --git a/scripts/evals/ci-review-comment.mjs b/scripts/evals/ci-review-comment.mjs
@@ -0,0 +1,262 @@
+#!/usr/bin/env node
+
+import { appendFileSync, existsSync, readdirSync, readFileSync } from 'node:fs';
+import path from 'node:path';
+import process from 'node:process';
+import { fileURLToPath } from 'node:url';
+
+const ROOT = path.resolve(path.dirname(fileURLToPath(import.meta.url)), '../..');
+const RUNS_DIR = path.join(ROOT, '.ricky', 'evals', 'runs');
+const MARKER = '<!-- ricky-eval-human-review -->';
+const MAX_COMMENT_CHARS = 60000;
+const MAX_OUTPUT_CHARS = 1200;
+
+const runDir = findLatestRunDir();
+if (!runDir) {
+  console.log('No Ricky eval run found; skipping PR comment.');
+  process.exit(0);
+}
+
+const result = readResultJson(path.join(runDir, 'result.json'));
+const comment = renderComment({ result, runDir });
+
+if (process.env.GITHUB_STEP_SUMMARY) {
+  appendFileSync(process.env.GITHUB_STEP_SUMMARY, [
+    '',
+    '## Ricky Eval Review Comment',
+    '',
+    'A detailed human-review comment was generated for this PR.',
+    '',
+  ].join('\n'));
+}
+
+if (process.env.GITHUB_TOKEN && process.env.GITHUB_REPOSITORY && process.env.PR_NUMBER) {
+  await upsertPullRequestComment(comment);
+} else {
+  console.log(comment);
+}
+
+function renderComment({ result, runDir }) {
+  const failed = result.tests.filter((test) => test.status === 'failed');
+  const skipped = result.tests.filter((test) => test.status === 'skipped');
+  const needsHuman = result.tests.filter((test) => test.status === 'needs-human');
+  const reviewableNeedsHuman = needsHuman.filter(hasCapturedOutput);
+  const missingOutputNeedsHuman = needsHuman.filter((test) => !hasCapturedOutput(test));
+  const lines = [
+    MARKER,
+    '# Ricky Eval Review',
+    '',
+    `Run: \`${path.relative(ROOT, runDir)}\``,
+    `Mode: \`${result.mode}\``,
+    `Git SHA: \`${result.git_sha}\``,
+    '',
+    `**Passed:** ${result.passed} | **Needs human:** ${result.needs_human} | **Reviewable:** ${reviewableNeedsHuman.length} | **Missing output:** ${missingOutputNeedsHuman.length} | **Failed:** ${result.failed} | **Skipped:** ${result.skipped}`,
+    '',
+  ];
+
+  if (failed.length > 0 || skipped.length > 0) {
+    lines.push('## Blocking Cases', '');
+    for (const test of [...failed, ...skipped]) {
+      appendCaseDetails(lines, test, { forceOpen: true });
+    }
+  }
+
+  if (reviewableNeedsHuman.length > 0) {
+    lines.push(
+      '## Human Review Cases',
+      '',
+      'These cases passed deterministic checks and include captured Ricky output for a human verdict against their `Must` / `Must Not` rubric.',
+      '',
+    );
+    for (const test of reviewableNeedsHuman) {
+      appendCaseDetails(lines, test, { forceOpen: false });
+    }
+  } else {
+    lines.push('## Human Review Cases', '', 'No reviewable human-review cases captured Ricky output.', '');
+  }
+
+  if (missingOutputNeedsHuman.length > 0) {
+    lines.push(
+      '## Cases Missing Ricky Output',
+      '',
+      'These cases are not expanded because there is no candidate Ricky response to judge. Change them to `Executor: openrouter`, run with `--executor openrouter`, or provide `Candidate Output`, before treating them as human-review evidence.',
+      '',
+    );
+    for (const test of missingOutputNeedsHuman) {
+      lines.push(`- \`${test.id}\` (${test.suite}/${test.executor})`);
+    }
+    lines.push('');
+  }
+
+  const body = `${lines.join('\n')}\n`;
+  if (body.length <= MAX_COMMENT_CHARS) return body;
+  return `${body.slice(0, MAX_COMMENT_CHARS - 2000)}\n\n---\n\n_Comment truncated to stay within GitHub limits. Download the \`ricky-eval-run\` artifact for the full \`human-review.md\`._\n`;
+}
+
+function appendCaseDetails(lines, test, { forceOpen }) {
+  const summaryStatus = test.status === 'failed' ? 'FAIL' : test.status === 'skipped' ? 'SKIP' : 'REVIEW';
+  lines.push(`<details${forceOpen ? ' open' : ''}>`);
+  lines.push(`<summary><strong>${summaryStatus}</strong> <code>${escapeHtml(test.id)}</code> (${escapeHtml(test.suite)}/${escapeHtml(test.executor)})</summary>`);
+  lines.push('');
+
+  if (test.input?.message) {
+    lines.push('**User message**', '');
+    lines.push(blockquote(String(test.input.message)));
+    lines.push('');
+  }
+
+  appendRickyOutput(lines, test);
+  appendRubricList(lines, 'Must', test.expected?.must);
+  appendRubricList(lines, 'Must Not', test.expected?.mustNot);
+
+  const deterministicChecks = (test.checks ?? []).filter((check) => !String(check.name).startsWith('human:'));
+  if (deterministicChecks.length > 0) {
+    lines.push('**Deterministic checks**', '');
+    for (const check of deterministicChecks) {
+      lines.push(`- ${check.passed ? 'PASS' : 'FAIL'} \`${check.name}\`: ${check.message ?? ''}`);
+    }
+    lines.push('');
+  }
+
+  if (test.error) {
+    lines.push('**Error**', '');
+    lines.push('```text');
+    lines.push(String(test.error).slice(0, MAX_OUTPUT_CHARS));
+    lines.push('```', '');
+  }
+
+  lines.push('</details>', '');
+}
+
+function appendRickyOutput(lines, test) {
+  const actualContent = getCapturedOutput(test).trim();
+  lines.push('**Ricky output**', '');
+  if (actualContent.length > 0) {
+    const preview = actualContent.length > MAX_OUTPUT_CHARS
+      ? `${actualContent.slice(0, MAX_OUTPUT_CHARS)}\n...[truncated]`
+      : actualContent;
+    lines.push('```text');
+    lines.push(preview);
+    lines.push('```', '');
+  } else {
+    lines.push(`_No Ricky output captured for this case. Executor: \`${test.executor}\`._`, '');
+  }
+}
+
+function hasCapturedOutput(test) {
+  return getCapturedOutput(test).trim().length > 0;
+}
+
+function getCapturedOutput(test) {
+  return String(
+    test.actual?.content ??
+      test.candidate_output ??
+      test.candidateOutput ??
+      test.candidate?.content ??
+      '',
+  );
+}
+
+function appendRubricList(lines, title, items) {
+  if (!Array.isArray(items) || items.length === 0) return;
+  lines.push(`**${title}**`, '');
+  for (const item of items) {
+    lines.push(`- ${String(item)}`);
+  }
+  lines.push('');
+}
+
+function blockquote(text) {
+  return text.split('\n').map((line) => `> ${line}`).join('\n');
+}
+
+function escapeHtml(value) {
+  return String(value)
+    .replace(/&/g, '&amp;')
+    .replace(/</g, '&lt;')
+    .replace(/>/g, '&gt;');
+}
+
+function findLatestRunDir() {
+  if (!existsSync(RUNS_DIR)) return null;
+  const runs = readdirSync(RUNS_DIR)
+    .map((dir) => path.join(RUNS_DIR, dir))
+    .filter((dir) => existsSync(path.join(dir, 'result.json')))
+    .flatMap((dir) => {
+      const result = safeReadResultJson(path.join(dir, 'result.json'));
+      return result ? [{ dir, result }] : [];
+    })
+    .sort((a, b) => String(b.result.timestamp).localeCompare(String(a.result.timestamp)));
+  return runs[0]?.dir ?? null;
+}
+
+function readResultJson(filePath) {
+  const result = safeReadResultJson(filePath);
+  if (!result) {
+    throw new Error(`Could not parse Ricky eval result: ${path.relative(ROOT, filePath)}`);
+  }
+  return result;
+}
+
+function safeReadResultJson(filePath) {
+  try {
+    return JSON.parse(readFileSync(filePath, 'utf8'));
+  } catch (error) {
+    console.warn(`Skipping malformed Ricky eval result ${path.relative(ROOT, filePath)}: ${error instanceof Error ? error.message : String(error)}`);
+    return null;
+  }
+}
+
+async function upsertPullRequestComment(body) {
+  const [owner, repo] = process.env.GITHUB_REPOSITORY.split('/');
+  if (!owner || !repo) {
+    throw new Error(`Invalid GITHUB_REPOSITORY format: expected "owner/repo", got "${process.env.GITHUB_REPOSITORY}"`);
+  }
+  const prNumber = process.env.PR_NUMBER;
+  const headers = {
+    accept: 'application/vnd.github+json',
+    authorization: `Bearer ${process.env.GITHUB_TOKEN}`,
+    'content-type': 'application/json',
+    'x-github-api-version': '2022-11-28',
+  };
+  const commentsUrl = `https://api.github.com/repos/${owner}/${repo}/issues/${prNumber}/comments`;
+  const existing = await findExistingReviewComment(commentsUrl, headers);
+
+  const method = existing?.url ? 'PATCH' : 'POST';
+  const url = existing?.url ?? commentsUrl;
+  const response = await globalThis.fetch(url, {
+    method,
+    headers,
+    body: JSON.stringify({ body }),
+  });
+  if (!response.ok) {
+    throw new Error(`Failed to ${method === 'PATCH' ? 'update' : 'create'} PR comment: ${response.status} ${await response.text()}`);
+  }
+  console.log(`${method === 'PATCH' ? 'Updated' : 'Created'} Ricky eval review comment.`);
+}
+
+async function findExistingReviewComment(commentsUrl, headers) {
+  let url = `${commentsUrl}?per_page=100`;
+  while (url) {
+    const response = await globalThis.fetch(url, { headers });
+    if (!response.ok) {
+      throw new Error(`Failed to list PR comments: ${response.status} ${await response.text()}`);
+    }
+    const comments = await response.json();
+    if (Array.isArray(comments)) {
+      const existing = comments.find((comment) => typeof comment.body === 'string' && comment.body.includes(MARKER));
+      if (existing) return existing;
+    }
+    url = getNextLink(response.headers.get('link'));
+  }
+  return undefined;
+}
+
+function getNextLink(linkHeader) {
+  if (!linkHeader) return null;
+  for (const part of linkHeader.split(',')) {
+    const match = /<([^>]+)>;\s*rel="next"/.exec(part.trim());
+    if (match) return match[1];
+  }
+  return null;
+}