Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,17 @@
# Changelog

## 2026-06-01

### Configuration
- **`ai.agents.navigator.verifyAttempts`** — How many assertion checks the Navigator runs when verifying a claim before deciding pass/fail. Lower it to make verification faster, raise it for more confidence. Default: `3`.
- **`ai.agents.navigator.verifyTimeout`** — Timeout in milliseconds for each verification assertion, so a check that won't match fails fast instead of waiting the full page timeout. Default: `1500`.

### Changes
- [Navigator] Verification is faster — it stops as soon as the outcome is decided instead of running every check, runs fewer assertions, and gives up quickly on checks that won't match rather than waiting the full timeout.
- [Navigator] Reuses an earlier verification result on the same page instead of checking the same claim again — including when the new claim is worded differently but means the same thing.
- [Pilot] A scenario whose goal was not actually performed this run no longer passes. Reaching a page, tab, or prompt is treated as a milestone, not success. Scenarios that cannot proceed because a prerequisite is missing — a required control is absent, an integration is not connected, or only a setup/empty-state prompt is shown — are now marked skipped instead of passed.
- [Reporter] Local HTML and markdown reports are no longer produced automatically — turn them on with `reporter.html: true` and `reporter.markdown: true`. The run group is no longer a hardcoded "Explorbot <date>" default. `explorbot init` now writes a `reporter` block (HTML on, markdown on, and a date-based run group) into the generated config, so report output is visible and editable instead of assumed.

## 2026-05-25

### Changes
Expand Down
2 changes: 2 additions & 0 deletions CLAUDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,8 @@ Avoid repetitive code patterns
Avoid ternary operators!
Never use `...(condition ? { key: value } : {})` spread pattern — use a plain `if` statement instead
Avoid creating extra functions that were not explicitly requested
Private methods must be placed after public methods
Avoid `=== null` / `=== undefined` comparisons when not needed — prefer shorter `if (...)` or `if (!...)` when applicable
Use dedent when formatting prompts
Use `mdq()` from `src/utils/markdown-query.ts` for all markdown manipulation (find sections, replace tables, extract text). Never do manual line-splitting/counting on markdown.
Put types at the end of the file
Expand Down
10 changes: 5 additions & 5 deletions bun.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@
"@opentelemetry/sdk-trace-base": "^2.2.0",
"@opentelemetry/semantic-conventions": "^1.38.0",
"@scalar/openapi-parser": "^0.25.6",
"@testomatio/reporter": "^2.7.9-beta.3-markdown",
"@testomatio/reporter": "^2.8.4",
"ai": "^6.0.6",
"axe-core": "^4.11.1",
"bash-tool": "^1.3.15",
Expand All @@ -91,7 +91,7 @@
"micromatch": "^4.0.8",
"ora-classic": "^5.4.2",
"parse5": "^8.0.0",
"playwright": "^1.59.0",
"playwright": "^1.60",
"react": "^19.1.1",
"strip-ansi": "^7.1.2",
"turndown": "^7.2.1",
Expand Down
11 changes: 11 additions & 0 deletions src/action-result.ts
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,17 @@ export class ActionResult implements ActionResultData {
this.verifications[assertion] = passed;
}

/**
 * Looks up the outcome of a verification previously stored in `this.verifications`.
 *
 * @param message - Exact claim text, or a pattern that is tested against every stored claim.
 *   For a pattern, the first stored claim that matches (in insertion order) wins.
 * @returns The stored pass/fail flag for the matching claim, or `null` when nothing is
 *   stored or no claim matches.
 */
getVerification(message: string | RegExp): boolean | null {
  if (!this.verifications) return null;
  if (typeof message === 'string') {
    // Only own keys count: a claim such as "constructor" or "toString" must not resolve
    // to an inherited Object.prototype member and be mistaken for a cached result.
    if (!Object.prototype.hasOwnProperty.call(this.verifications, message)) return null;
    return this.verifications[message] ?? null;
  }
  for (const [assertion, passed] of Object.entries(this.verifications)) {
    // Patterns with the `g` or `y` flag carry `lastIndex` over between `.test()` calls;
    // rewind so every claim is matched from its start.
    message.lastIndex = 0;
    if (message.test(assertion)) {
      // Leave the caller's pattern in a clean state for its next use.
      message.lastIndex = 0;
      return passed;
    }
  }
  return null;
}

isSameUrl(state: WebPageState): boolean {
if (!this.url || this.url === '') {
return false;
Expand Down
143 changes: 104 additions & 39 deletions src/ai/navigator.ts
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,14 @@ class Navigator implements Agent {
this.hooksRunner = new HooksRunner(explorer, explorer.getConfig());
}

/**
 * Value of `ai.agents.navigator.verifyAttempts` from the explorer config.
 * Falls back to 3 when the option (or any parent section) is not set.
 */
private get verifyAttempts(): number {
  const navigatorConfig = this.explorer.getConfig().ai?.agents?.navigator;
  return navigatorConfig?.verifyAttempts ?? 3;
}

/**
 * Value of `ai.agents.navigator.verifyTimeout` from the explorer config.
 * Falls back to 1500 when the option (or any parent section) is not set.
 */
private get verifyTimeout(): number {
  const navigatorConfig = this.explorer.getConfig().ai?.agents?.navigator;
  return navigatorConfig?.verifyTimeout ?? 1500;
}

private getBaseOrigin(): string | null {
const baseUrl = this.explorer.getConfig().playwright.url;
try {
Expand Down Expand Up @@ -623,6 +631,12 @@ class Navigator implements Agent {
tag('info').log('AI Navigator verifying state at', actionResult.url);
debugLog('Verification message:', message);

const cachedVerification = actionResult.getVerification(message);
if (cachedVerification !== null) {
tag('substep').log(`Reusing cached verification: ${cachedVerification ? 'PASS' : 'FAIL'}`);
return { verified: cachedVerification, successfulCodes: [], assertionSteps: [], totalAttempted: 0 };
}

let knowledge = '';
let experience = '';

Expand All @@ -645,6 +659,21 @@ class Navigator implements Agent {
}
}

const priorVerifications = Object.entries(actionResult.verifications ?? {});
let verificationContext = '';
if (priorVerifications.length > 0) {
const lines = priorVerifications.map(([claim, passed]) => `- "${claim}" → ${passed ? 'passed' : 'failed'}`).join('\n');
verificationContext = dedent`
<already_verified>
These claims were already checked on this page:
${lines}

If the claim to verify has the same meaning as one above (even if worded differently), do NOT write any assertion code.
Respond with a single line and nothing else: ALREADY_VERIFIED: <exact text of the matching claim>
</already_verified>
`;
}

const prompt = dedent`
<message>
${message}
Expand All @@ -658,11 +687,13 @@ class Navigator implements Agent {
</page_html>
</page>

${verificationContext}

<task>
Identify what assertion the user wants to verify on the page.
Propose different CodeceptJS assertion code blocks to verify the expected state.
Propose 2-3 strong, distinct CodeceptJS assertion code blocks that each directly prove the claim.
Use only data from the <page> context to plan the verification.
Try various locators and approaches to verify the assertion.
Prefer the fewest, most specific assertions over many variants of the same locator.

IMPORTANT: Each code block must verify the SPECIFIC claim in the message, not just a generic aspect of it.
Bad: I.seeElement({"role":"button","aria-pressed":"true"}) — matches ANY button, not the specific one
Expand All @@ -684,64 +715,98 @@ class Navigator implements Agent {
const conversation = this.provider.startConversation(this.systemPrompt, 'navigator');
conversation.addUserText(prompt);

let alreadyVerified = false;
const tools = this.buildExperienceTools();

let codeBlocks: string[] = [];
const successfulCodes: string[] = [];
const assertionSteps: Array<{ name: string; args: any[] }> = [];

const action = this.explorer.createAction();
let failures = 0;

await loop(
async ({ stop, iteration }) => {
if (codeBlocks.length === 0) {
const result = await this.provider.invokeConversation(conversation, tools);
if (!result) return;
const aiResponse = result?.response?.text;
debugLog('Received AI response:', aiResponse?.length ?? 0, 'characters');
tag('step').log('Verifying assertion...');
codeBlocks = extractCodeBlocks(aiResponse ?? '');
}
const page = this.explorer.playwrightHelper?.page;
const originalTimeout = this.explorer.playwrightHelper?.options?.timeout ?? 3000;
page?.setDefaultTimeout(this.verifyTimeout);

if (codeBlocks.length === 0) {
return;
}
try {
await loop(
async ({ stop, iteration }) => {
if (codeBlocks.length === 0) {
const result = await this.provider.invokeConversation(conversation, tools);
if (!result) return;
const aiResponse = result?.response?.text ?? '';
debugLog('Received AI response:', aiResponse.length, 'characters');
tag('step').log('Verifying assertion...');

if (this.checkAlreadyVerified(aiResponse, actionResult)) {
alreadyVerified = true;
stop();
return;
}

const codeBlock = codeBlocks[iteration - 1];
if (!codeBlock) {
stop();
return;
}
codeBlocks = extractCodeBlocks(aiResponse);
}

await this.explorer.switchToMainFrame();
if (codeBlocks.length === 0) {
return;
}

const verified = await action.attempt(codeBlock, message, false);
const codeBlock = codeBlocks[iteration - 1];
if (!codeBlock) {
stop();
return;
}

if (verified) {
tag('success').log('Verification passed');
successfulCodes.push(codeBlock);
assertionSteps.push(...action.assertionSteps);
}
},
{
maxAttempts: this.MAX_ATTEMPTS,
observability: {
agent: 'navigator',
},
catch: async (error) => {
debugLog(error);
await this.explorer.switchToMainFrame();

const verified = await action.attempt(codeBlock, message, false);

if (verified) {
tag('success').log('Verification passed');
successfulCodes.push(codeBlock);
assertionSteps.push(...action.assertionSteps);
} else {
failures++;
}

const target = Math.min(codeBlocks.length, this.verifyAttempts);
const majorityNeeded = Math.floor(target / 2) + 1;
if (successfulCodes.length >= majorityNeeded || failures > target - majorityNeeded) {
stop();
}
},
}
);
{
maxAttempts: this.verifyAttempts,
observability: {
agent: 'navigator',
},
catch: async (error) => {
debugLog(error);
},
}
);
} finally {
page?.setDefaultTimeout(originalTimeout);
}

const totalAttempted = Math.min(codeBlocks.length, this.MAX_ATTEMPTS);
const verified = totalAttempted <= 1 ? successfulCodes.length > 0 : successfulCodes.length > totalAttempted / 2;
const totalAttempted = Math.min(codeBlocks.length, this.verifyAttempts);
const majorityNeeded = Math.floor(totalAttempted / 2) + 1;
let verified = successfulCodes.length >= majorityNeeded;
if (alreadyVerified) verified = true;

actionResult.addVerification(message, verified);
this.explorer.getStateManager().updateState(actionResult);

return { verified, successfulCodes, assertionSteps, totalAttempted };
}

/**
 * Detects an `ALREADY_VERIFIED: <claim>` marker in the AI response and confirms it
 * against the verifications stored on the action result.
 *
 * @param aiResponse - Raw text returned by the model.
 * @param actionResult - Result object whose stored verifications are consulted.
 * @returns `true` only when the marker is present and the cited claim (with one
 *   surrounding quote stripped from each end) is stored as passed; `false` otherwise.
 */
private checkAlreadyVerified(aiResponse: string, actionResult: ActionResult): boolean {
  const marker = /ALREADY_VERIFIED:\s*(.+)/i.exec(aiResponse);
  if (!marker) return false;

  const citedClaim = marker[1].trim().replace(/^["']|["']$/g, '');
  const previousOutcome = actionResult.getVerification(citedClaim);
  return previousOutcome === true;
}
}

export { Navigator };
6 changes: 6 additions & 0 deletions src/ai/pilot.ts
Original file line number Diff line number Diff line change
Expand Up @@ -322,6 +322,12 @@ export class Pilot implements Agent {
- "Delete X" → X must be gone. Clicking delete is NOT enough.
- "Edit X" → updated value must be persisted (visible in list/detail). Opening edit is NOT enough; redirect after save with the new value visible IS enough.
- Negative tests ("without a name", "invalid", "duplicate", "unauthorized") → success means the system PREVENTED the action with validation/error.
- Navigation-prefixed titles ("Access/Open/Go to X to <do Y>") → the goal is <do Y>; reaching X is
only a milestone. A satisfied milestone (tab active, panel/prompt visible, list shown) is NEVER a pass
if <do Y> did not occur this run.
- If the page reveals the goal cannot be performed here — required control absent, integration not
connected, or only a setup/connect/empty-state prompt is shown — vote "skipped" (prerequisites unmet),
never "pass".

PROVENANCE: the entity you cite as proof must appear by name in <notes> or
<session_log> tool inputs for THIS run. Name absent from tester activity = stale
Expand Down
9 changes: 9 additions & 0 deletions src/commands/init-command.ts
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,15 @@ const config = {
// agentic model for decision making
agenticModel: openrouter('minimax/minimax-m2.5:nitro'),
},

reporter: {
// Save a local HTML report after each run.
html: true,
// Save a local markdown report after each run.
markdown: true,
// Group runs by title in Testomat.io / HTML reports. Defaults to today's date — customize or remove.
runGroup: new Date().toISOString().slice(0, 10),
},
};

export default config;
Expand Down
2 changes: 2 additions & 0 deletions src/config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,8 @@ interface PilotAgentConfig extends AgentConfig {
/**
 * Settings for the Navigator agent, read from `ai.agents.navigator` in the config.
 */
interface NavigatorAgentConfig extends AgentConfig {
addHtmlOnTry?: number;
maxAttempts?: number;
/** How many assertion checks the Navigator runs when verifying a claim. Defaults to 3 when unset. */
verifyAttempts?: number;
/** Timeout in milliseconds applied to each verification assertion. Defaults to 1500 when unset. */
verifyTimeout?: number;
}

type HealFn = (ctx: { I: any }) => Promise<void> | void;
Expand Down
11 changes: 4 additions & 7 deletions src/reporter.ts
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ export class Reporter {
this.reporterEnabled = Reporter.resolveEnabled(config);
this.stateManager = stateManager;

if (this.reporterEnabled && (!process.env.TESTOMATIO || config?.html)) {
if (this.reporterEnabled && config?.html) {
this.configureHtmlPipe();
}

Expand Down Expand Up @@ -63,6 +63,7 @@ export class Reporter {
/**
 * Decides whether reporting is active.
 *
 * Precedence: an explicit boolean `enabled` wins; otherwise requesting an HTML or
 * markdown report turns reporting on; otherwise it depends on whether the
 * `TESTOMATIO` environment variable is set to a non-empty value.
 *
 * @param config - Optional reporter configuration.
 * @returns `true` when reporting should run.
 */
static resolveEnabled(config?: ReporterConfig): boolean {
  const explicit = config?.enabled;
  if (typeof explicit === 'boolean') return explicit;

  const wantsLocalReport = Boolean(config?.html || config?.markdown);
  if (wantsLocalReport) return true;

  return Boolean(process.env.TESTOMATIO);
}

Expand All @@ -88,12 +89,8 @@ export class Reporter {

/**
 * Exposes the configured run group title through `TESTOMATIO_RUNGROUP_TITLE`.
 *
 * A title already present in the environment is never overwritten. When no run group
 * is configured (`null`, `undefined` or an empty string) the variable is left unset;
 * no date-based default title is generated.
 *
 * @param runGroup - Run group title from the reporter config, if any.
 */
private configureRunGroup(runGroup: string | null | undefined): void {
  if (process.env.TESTOMATIO_RUNGROUP_TITLE) return;
  if (!runGroup) return;
  process.env.TESTOMATIO_RUNGROUP_TITLE = runGroup;
}

async startRun(): Promise<void> {
Expand Down
Loading
Loading