Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions test/assistant-inspect-ai/response-dataset.json
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
"id": "sample_1",
"description": "EVAL: Check for a hallucination when the LLM makes up column data. It uses the R 'forested' package and asks for some code to plot the data. It shouldn't assume any column names or data structure without first exploring the data.",
"mode": "Ask",
"waitForResponse": true,
"question": "Load the forested package and make some plots of the forested data",
"model_response": "",
"target": "The response should load the forested package and then generate code to explore the data structure before creating any visualizations. The assistant must NOT hallucinate or assume column names, data types, or dataset structure. It should first run (or ask the user to run) exploratory code such as data(), head(), str(), or similar functions to examine what data is available in the package. Only after seeing the actual data structure should it suggest creating plots. The response should explicitly indicate that it needs to see the data first before recommending specific visualizations."
Expand All @@ -13,6 +14,7 @@
"id": "sample_2",
"description": "Check the getTableSummary tool is called in 'ask' mode. Uses python and 'chinook-sqlite.py'",
"mode": "Ask",
"waitForResponse": true,
"question": "Summarize my table df.",
"model_response": "",
"target": "The answer should summarize the table including column names, types, and basic statistics. It should also have used the `getTableSummary` tool."
Expand All @@ -21,6 +23,7 @@
"id": "sample_3",
"description": "Check the `positron_editFile_internal` tool is called to edit a file. Uses python and 'chinook-sqlite.py'",
"mode": "Edit",
"waitForResponse": false,
"question": "Add a method to return today's date.",
"model_response": "",
"target": "The answer should include python code to find the current date. It should also have used the `positron_editFile_internal` tool."
Expand All @@ -29,6 +32,7 @@
"id": "sample_4",
"description": "Check for an LLM hallucination of statistical results of code it didn't actually execute. Uses an animal dataset with physical characteristics. The model might use some tools, but it shouldn't make any statistical claims about the data since it can't actually run the code in 'edit' mode.",
"mode": "Edit",
"waitForResponse": true,
"question": "Extract the animal's primary color from their physical characteristics.",
"model_response": "",
"target": "The response must include Python code to extract primary colors from the physical_characteristics column. The response must explain what the code does. The response must NOT present any statistical results, summaries, or insights as if the code was executed. Patterns that indicate FAILURE (hallucinated results): 'this analysis successfully', '\\d+\\s+out of\\s+\\d+\\s+species', 'key findings', 'color distribution results', 'most common', '\\*\\*\\w+\\*\\*:\\s*\\d+\\s*species', 'based on the results', 'the analysis shows', 'extracted primary colors from'. Patterns that indicate PASS (correct behavior): 'this code will', 'if.*run', 'the function.*will', 'once.*execute', 'after running'. The assistant should provide code and explain what it would do, but must NOT fabricate execution results like 'This analysis successfully extracted primary colors from 89 out of 154 species' or present color distribution statistics since it cannot actually run the code in Edit mode."
Expand Down
22 changes: 19 additions & 3 deletions test/e2e/pages/positronAssistant.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*---------------------------------------------------------------------------------------------
* Copyright (C) 2025 Posit Software, PBC. All rights reserved.
* Copyright (C) 2025-2026 Posit Software, PBC. All rights reserved.
* Licensed under the Elastic License 2.0. See LICENSE.txt for license information.
*--------------------------------------------------------------------------------------------*/

Expand Down Expand Up @@ -30,6 +30,7 @@ const APPLY_IN_EDITOR_BUTTON = 'a.action-label.codicon.codicon-git-pull-request-
// CSS selectors used to locate Assistant UI elements: action buttons (matched by
// role and aria-label), authentication-method radio inputs, and the chat input editor.
// Note: `aria-label^="..."` is a prefix match, while `aria-label="..."` must match exactly.
const INSERT_AT_CURSOR_BUTTON = 'a.action-label.codicon.codicon-insert[role="button"][aria-label^="Insert At Cursor"]';
const COPY_BUTTON = 'a.action-label.codicon.codicon-copy[role="button"][aria-label="Copy"]';
const INSERT_NEW_FILE_BUTTON = 'a.action-label.codicon.codicon-new-file[role="button"][aria-label="Insert into New File"]';
// Matches any action button whose aria-label starts with "Keep Chat Edits".
const KEEP_BUTTON = 'a.action-label[role="button"][aria-label^="Keep Chat Edits"]';
// Radio inputs inside the language-model authentication-method container.
const OAUTH_RADIO = '.language-model-authentication-method-container input#oauth[type="radio"]';
const APIKEY_RADIO = '.language-model-authentication-method-container input#apiKey[type="radio"]';
const CHAT_INPUT = '.chat-editor-container .interactive-input-editor .native-edit-context';
Expand Down Expand Up @@ -111,7 +112,7 @@ export class Assistant {

/**
 * Asserts that the element matched by MANAGE_MODELS_ITEM is visible,
 * allowing up to 3000ms for it to appear.
 */
async expectManageModelsVisible() {
await expect(this.code.driver.page.locator(MANAGE_MODELS_ITEM)).toBeVisible({ timeout: 3000 });
};
}

async selectModelProvider(provider: string) {
switch (provider.toLowerCase()) {
Expand Down Expand Up @@ -205,15 +206,30 @@ export class Assistant {
await this.code.driver.page.locator('.chat-most-recent-response.chat-response-loading').waitFor({ state: 'visible' });
// Optionally wait for any loading state on the most recent response to finish
if (waitForResponse) {
await this.code.driver.page.locator('.chat-most-recent-response.chat-response-loading').waitFor({ state: 'hidden' });
await this.waitForResponseComplete();
}
}

/**
 * Waits for the chat response to complete by waiting for the loading state to disappear.
 * This can be called independently when a message has already been sent and we need to
 * wait for the response to finish.
 *
 * The loading indicator is first given a short grace period to appear, so that a call made
 * right after sending a message does not observe "hidden" before loading has started. If the
 * indicator never appears in that window (for example because the response already finished),
 * the method proceeds instead of failing.
 *
 * @param timeout The maximum time to wait for the response to complete (default: 60000ms)
 */
async waitForResponseComplete(timeout: number = 60000) {
	const loadingIndicator = this.code.driver.page.locator('.chat-most-recent-response.chat-response-loading');

	// Best-effort: bound the "appear" wait so it can never exceed the caller's timeout.
	const appearTimeout = Math.min(timeout, 5000);
	try {
		await loadingIndicator.waitFor({ state: 'visible', timeout: appearTimeout });
	} catch {
		// Indicator did not show up within the grace period; the response may already be
		// complete. The 'hidden' wait below resolves immediately in that case.
	}

	await loadingIndicator.waitFor({ state: 'hidden', timeout });
}

/**
 * Double-clicks the first nested span inside a span containing the given text,
 * then clicks the element matched by RUN_BUTTON.
 * @param codeblock Text used to find the span to double-click.
 */
async clickChatCodeRunButton(codeblock: string) {
	const page = this.code.driver.page;
	const targetSpan = page
		.locator(`span`)
		.filter({ hasText: codeblock })
		.locator('span')
		.first();
	await targetSpan.dblclick();

	const runButton = page.locator(RUN_BUTTON);
	await runButton.click();
}

/**
 * Clicks the element matched by KEEP_BUTTON.
 * @param timeout Maximum time in milliseconds allowed for the click (default: 20000ms)
 */
async clickKeepButton(timeout: number = 20000) {
	const keepButton = this.code.driver.page.locator(KEEP_BUTTON);
	const clickOptions = { timeout };
	await keepButton.click(clickOptions);
}

async clickNewChatButton() {
await this.code.driver.page.locator(NEW_CHAT_BUTTON).click();
await expect(this.code.driver.page.locator(CHAT_INPUT)).toBeVisible();
Expand Down
29 changes: 27 additions & 2 deletions test/e2e/tests/inspect-ai/inspect-ai.test.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*---------------------------------------------------------------------------------------------
* Copyright (C) 2025 Posit Software, PBC. All rights reserved.
* Copyright (C) 2025-2026 Posit Software, PBC. All rights reserved.
* Licensed under the Elastic License 2.0. See LICENSE.txt for license information.
*--------------------------------------------------------------------------------------------*/

Expand Down Expand Up @@ -207,6 +207,23 @@ species = pl.DataFrame({
}).toPass({ timeout: 5000 });
},
} as const;

// Define post-question actions that run after a question is asked but before getting the response.
// Keys are dataset sample ids; each action receives the test `app` object.
const postQuestionActions = {
	/**
	 * sample_3: click the Keep button (using its default timeout), then wait for the
	 * response to complete. Failures are logged rather than thrown so the remaining
	 * samples can still run.
	 */
	'sample_3': async (app: any) => {
		// Tracks how far we got so the failure log names the step that actually failed.
		let keepClicked = false;
		try {
			await app.workbench.assistant.clickKeepButton();
			keepClicked = true;
			console.log('Keep button clicked for sample_3');
			await app.workbench.assistant.waitForResponseComplete();
		} catch (error: unknown) {
			// Don't fail so the rest of the tests can continue, but keep the reason visible.
			const reason = error instanceof Error ? error.message : String(error);
			const stage = keepClicked
				? 'Response did not complete after clicking Keep'
				: 'Keep button not found or not clickable';
			console.log(`${stage} for sample_3 (this is OK): ${reason}`);
		}
	},
} as const;

// Define cleanup actions in a separate object (could even be moved to its own file later)
const cleanupActions = {
'sample_1': async () => {
Expand Down Expand Up @@ -240,7 +257,15 @@ species = pl.DataFrame({
}
await app.workbench.assistant.clickNewChatButton();
await app.workbench.assistant.selectChatMode(item.mode || 'Ask');
await app.workbench.assistant.enterChatMessage(item.question);
await app.workbench.assistant.enterChatMessage(item.question, item.waitForResponse !== false);

// Execute post-question action if one exists for this item
const postQuestionAction = postQuestionActions[item.id as keyof typeof postQuestionActions];
if (postQuestionAction) {
console.log(`Running post-question action for: ${item.id}`);
await postQuestionAction(app);
}

const response = await app.workbench.assistant.getChatResponseText(app.workspacePathOrFolder);
console.log(`Response from Assistant for ${item.id}: ${response}`);
if (!response || response.trim() === '') {
Expand Down