Skip to content

Commit 3a2d08a

Browse files
committed
Fix context pruner to more accurately count tokens including all fields
1 parent fb12460 commit 3a2d08a

File tree

2 files changed

+218
-35
lines changed

2 files changed

+218
-35
lines changed

.agents/__tests__/context-pruner.test.ts

Lines changed: 195 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1041,6 +1041,201 @@ describe('context-pruner saved run state overflow', () => {
10411041
})
10421042
})
10431043

1044+
describe('context-pruner token counting accuracy', () => {
1045+
let mockAgentState: any
1046+
1047+
beforeEach(() => {
1048+
mockAgentState = {
1049+
messageHistory: [] as Message[],
1050+
}
1051+
})
1052+
1053+
const runHandleSteps = (messages: Message[], maxContextLength?: number) => {
1054+
mockAgentState.messageHistory = messages
1055+
const mockLogger = {
1056+
debug: () => {},
1057+
info: () => {},
1058+
warn: () => {},
1059+
error: () => {},
1060+
}
1061+
const generator = contextPruner.handleSteps!({
1062+
agentState: mockAgentState,
1063+
logger: mockLogger,
1064+
params: maxContextLength ? { maxContextLength } : {},
1065+
})
1066+
const results: any[] = []
1067+
let result = generator.next()
1068+
while (!result.done) {
1069+
if (typeof result.value === 'object') {
1070+
results.push(result.value)
1071+
}
1072+
result = generator.next()
1073+
}
1074+
return results
1075+
}
1076+
1077+
test('accurately counts tokens for message with large text content', () => {
1078+
// Create a message with large content that would be significantly undercounted
1079+
// if we only counted metadata fields without the content
1080+
const largeText = 'x'.repeat(90000) // ~30k tokens
1081+
1082+
const messageWithLargeContent: Message = {
1083+
role: 'user',
1084+
content: [
1085+
{
1086+
type: 'text',
1087+
text: largeText,
1088+
},
1089+
],
1090+
}
1091+
1092+
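// With a 50k token limit and a ~30k token message, should NOT be pruned.
1093+
// Note: an undercounting bug (content ignored, ~0 tokens) would also leave this message unpruned,
1094+
// so this case only guards against overcounting or mangling content; the pruning tests below catch undercounting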
1095+
const results = runHandleSteps([messageWithLargeContent], 50000)
1096+
1097+
expect(results).toHaveLength(1)
1098+
// Message should be preserved (under limit)
1099+
expect(results[0].input.messages).toHaveLength(1)
1100+
expect(results[0].input.messages[0].content[0].text).toBe(largeText)
1101+
})
1102+
1103+
test('prunes when large content exceeds token limit', () => {
1104+
// Create multiple messages with large content that should trigger pruning
1105+
const largeText = 'x'.repeat(60000) // ~20k tokens each
1106+
1107+
const messages: Message[] = [
1108+
{
1109+
role: 'user',
1110+
content: [{ type: 'text', text: `First: ${largeText}` }],
1111+
},
1112+
{
1113+
role: 'assistant',
1114+
content: [{ type: 'text', text: `Second: ${largeText}` }],
1115+
},
1116+
{
1117+
role: 'user',
1118+
content: [{ type: 'text', text: `Third: ${largeText}` }],
1119+
},
1120+
{
1121+
role: 'assistant',
1122+
content: [{ type: 'text', text: `Fourth: ${largeText}` }],
1123+
},
1124+
]
1125+
1126+
// ~80k tokens total, with 50k limit should trigger pruning
1127+
// If content wasn't being counted, it would see ~0 tokens and NOT prune
1128+
const results = runHandleSteps(messages, 50000)
1129+
1130+
expect(results).toHaveLength(1)
1131+
const resultMessages = results[0].input.messages
1132+
1133+
// Should have pruned some messages (either removed or replaced with placeholder)
1134+
// If token counting was broken, all 4 messages would remain
1135+
const hasReplacementMessage = resultMessages.some(
1136+
(m: any) =>
1137+
Array.isArray(m.content) &&
1138+
m.content.some(
1139+
(part: any) =>
1140+
part.type === 'text' &&
1141+
part.text.includes('Previous message(s) omitted'),
1142+
),
1143+
)
1144+
expect(hasReplacementMessage).toBe(true)
1145+
})
1146+
1147+
test('many small messages have JSON structure overhead counted correctly', () => {
1148+
// When there are MANY small messages, the JSON structure overhead becomes significant:
1149+
// Each message has ~50 chars of structure: {"role":"user","content":[{"type":"text","text":""}]}
1150+
// With 500 messages, that's ~25k chars = ~8k tokens just in structure
1151+
// The old code would undercount because it only counted content parts + minimal metadata
1152+
1153+
const smallMessages: Message[] = []
1154+
for (let i = 0; i < 500; i++) {
1155+
smallMessages.push({
1156+
role: i % 2 === 0 ? 'user' : 'assistant',
1157+
content: [{ type: 'text', text: `msg${i}` }], // ~5 chars of actual content each
1158+
})
1159+
}
1160+
1161+
// Calculate expected tokens:
1162+
// Each message stringified is roughly: {"role":"user","content":[{"type":"text","text":"msg123"}]}
1163+
// That's about 60-65 chars per message = ~20 tokens per message
1164+
// 500 messages * 20 tokens = ~10k tokens
1165+
// With a 5k limit, should trigger pruning
1166+
// If structure wasn't counted (only content parts), we'd see ~500 * 5 chars / 3 = ~800 tokens
1167+
// which wouldn't trigger pruning
1168+
1169+
const results = runHandleSteps(smallMessages, 5000)
1170+
1171+
expect(results).toHaveLength(1)
1172+
const resultMessages = results[0].input.messages
1173+
1174+
// Should have pruned - either fewer messages or replacement placeholders
1175+
// If JSON structure wasn't being counted, all 500 messages would fit in 5k tokens
1176+
const hasReplacementMessage = resultMessages.some(
1177+
(m: any) =>
1178+
Array.isArray(m.content) &&
1179+
m.content.some(
1180+
(part: any) =>
1181+
part.type === 'text' &&
1182+
part.text.includes('Previous message(s) omitted'),
1183+
),
1184+
)
1185+
// Either we have fewer messages OR we have replacement messages
1186+
const wasPruned = resultMessages.length < 500 || hasReplacementMessage
1187+
expect(wasPruned).toBe(true)
1188+
})
1189+
1190+
test('tool message with large result content is counted correctly', () => {
1191+
// Tool message with large content that must be counted
1192+
const largeResult = 'y'.repeat(90000) // ~30k tokens
1193+
1194+
const toolCallMessage: Message = {
1195+
role: 'assistant',
1196+
content: [
1197+
{
1198+
type: 'tool-call',
1199+
toolCallId: 'test-large-result',
1200+
toolName: 'read_files',
1201+
input: { paths: ['big-file.ts'] },
1202+
},
1203+
],
1204+
}
1205+
1206+
const toolResultMessage: ToolMessage = {
1207+
role: 'tool',
1208+
toolCallId: 'test-large-result',
1209+
toolName: 'read_files',
1210+
content: [
1211+
{
1212+
type: 'json',
1213+
value: { content: largeResult },
1214+
},
1215+
],
1216+
}
1217+
1218+
// With 50k limit and ~30k token tool result, should not trigger message-level pruning
1219+
// but may trigger large tool result simplification (>1000 chars)
1220+
const results = runHandleSteps([toolCallMessage, toolResultMessage], 50000)
1221+
1222+
expect(results).toHaveLength(1)
1223+
// Both tool call and result should be present (may be simplified but paired)
1224+
const resultMessages = results[0].input.messages
1225+
const hasToolCall = resultMessages.some(
1226+
(m: any) =>
1227+
m.role === 'assistant' &&
1228+
Array.isArray(m.content) &&
1229+
m.content.some((c: any) => c.toolCallId === 'test-large-result'),
1230+
)
1231+
const hasToolResult = resultMessages.some(
1232+
(m: any) => m.role === 'tool' && m.toolCallId === 'test-large-result',
1233+
)
1234+
expect(hasToolCall).toBe(true)
1235+
expect(hasToolResult).toBe(true)
1236+
})
1237+
})
1238+
10441239
describe('context-pruner edge cases', () => {
10451240
let mockAgentState: any
10461241

.agents/context-pruner.ts

Lines changed: 23 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -43,46 +43,34 @@ const definition: AgentDefinition = {
4343

4444
// Count tokens for a message, handling media content specially
4545
const countMessageTokens = (message: Message): number => {
46-
// For tool messages, check if content contains media type
47-
if (message.role === 'tool' && Array.isArray(message.content)) {
48-
let tokens = 0
49-
for (const part of message.content) {
50-
if (part.type === 'media') {
51-
// Use fixed token count for images since we compress them
52-
tokens += TOKENS_PER_IMAGE
53-
} else {
54-
tokens += countTokensJson(part)
55-
}
56-
}
57-
// Add overhead for message metadata
58-
tokens += countTokensJson({
59-
role: message.role,
60-
toolCallId: message.toolCallId,
61-
toolName: message.toolName,
62-
})
63-
return tokens
64-
}
46+
// For messages with images/media, we need special handling to avoid counting base64 data
47+
if (Array.isArray(message.content)) {
48+
// Check if there are any images or media
49+
const hasImagesOrMedia = message.content.some(
50+
(part: any) => part.type === 'image' || part.type === 'media',
51+
)
6552

66-
// For user/assistant messages, check content array for images
67-
if (
68-
(message.role === 'user' || message.role === 'assistant') &&
69-
Array.isArray(message.content)
70-
) {
71-
let tokens = 0
72-
for (const part of message.content) {
73-
if (part.type === 'image') {
74-
// Use fixed token count for images
75-
tokens += TOKENS_PER_IMAGE
76-
} else {
77-
tokens += countTokensJson(part)
53+
if (hasImagesOrMedia) {
54+
let tokens = 0
55+
56+
// Count content parts, charging a fixed token cost for each image/media part
57+
for (const part of message.content) {
58+
if (part.type === 'image' || part.type === 'media') {
59+
tokens += TOKENS_PER_IMAGE
60+
} else {
61+
tokens += countTokensJson(part)
62+
}
7863
}
64+
65+
// Count the rest of the message fields (role, toolCallId, toolName, tags, etc.)
66+
const { content, ...rest } = message
67+
tokens += countTokensJson(rest)
68+
69+
return tokens
7970
}
80-
// Add overhead for message metadata
81-
tokens += countTokensJson({ role: message.role })
82-
return tokens
8371
}
8472

85-
// Fallback to JSON-based counting
73+
// No images/media, just count the whole message
8674
return countTokensJson(message)
8775
}
8876

0 commit comments

Comments
 (0)