@@ -1041,6 +1041,201 @@ describe('context-pruner saved run state overflow', () => {
10411041 } )
10421042} )
10431043
/**
 * Verifies that the context pruner's token estimate reflects the full
 * serialized size of the message history (text content, tool results and the
 * per-message JSON structure), by checking whether pruning does or does not
 * kick in for histories of known approximate size.
 */
describe('context-pruner token counting accuracy', () => {
  let mockAgentState: any

  beforeEach(() => {
    // Fresh agent state per test so histories never leak between cases.
    mockAgentState = {
      messageHistory: [] as Message[],
    }
  })

  /**
   * Runs `contextPruner.handleSteps` to completion over `messages` and
   * collects every object the generator yields.
   *
   * @param messages History to install on the mock agent state.
   * @param maxContextLength Optional limit forwarded as `params.maxContextLength`;
   *   when omitted, `params` is an empty object.
   * @returns The yielded object values, in order. Non-object yields are ignored.
   */
  const runHandleSteps = (messages: Message[], maxContextLength?: number) => {
    mockAgentState.messageHistory = messages
    const mockLogger = {
      debug: () => {},
      info: () => {},
      warn: () => {},
      error: () => {},
    }
    const generator = contextPruner.handleSteps!({
      agentState: mockAgentState,
      logger: mockLogger,
      // Compare against undefined rather than truthiness so an explicit limit
      // of 0 is forwarded instead of being silently dropped.
      params: maxContextLength !== undefined ? { maxContextLength } : {},
    })
    const results: any[] = []
    for (let step = generator.next(); !step.done; step = generator.next()) {
      // `typeof null` is also 'object'; exclude it so callers can safely
      // dereference the collected values.
      if (typeof step.value === 'object' && step.value !== null) {
        results.push(step.value)
      }
    }
    return results
  }

  /**
   * Returns true when any message carries a text part containing the
   * 'Previous message(s) omitted' marker that these tests treat as evidence
   * of pruning.
   */
  const hasOmittedPlaceholder = (messages: any[]): boolean =>
    messages.some(
      (m: any) =>
        Array.isArray(m.content) &&
        m.content.some(
          (part: any) =>
            part.type === 'text' &&
            part.text.includes('Previous message(s) omitted'),
        ),
    )

  test('accurately counts tokens for message with large text content', () => {
    // A single large message: 90,000 chars is roughly 30k tokens.
    const largeText = 'x'.repeat(90000) // ~30k tokens

    const messageWithLargeContent: Message = {
      role: 'user',
      content: [
        {
          type: 'text',
          text: largeText,
        },
      ],
    }

    // With a 50k token limit and a ~30k token message, nothing should be pruned.
    // NOTE: this case only guards against pruning a history that is under the
    // limit. An estimator that ignored content entirely would also leave the
    // message alone, so undercounting is covered by the
    // 'prunes when large content exceeds token limit' test instead.
    const results = runHandleSteps([messageWithLargeContent], 50000)

    expect(results).toHaveLength(1)
    // Message should be preserved (under limit)
    expect(results[0].input.messages).toHaveLength(1)
    expect(results[0].input.messages[0].content[0].text).toBe(largeText)
  })

  test('prunes when large content exceeds token limit', () => {
    // Create multiple messages with large content that should trigger pruning
    const largeText = 'x'.repeat(60000) // ~20k tokens each

    const messages: Message[] = [
      {
        role: 'user',
        content: [{ type: 'text', text: `First: ${largeText} ` }],
      },
      {
        role: 'assistant',
        content: [{ type: 'text', text: `Second: ${largeText} ` }],
      },
      {
        role: 'user',
        content: [{ type: 'text', text: `Third: ${largeText} ` }],
      },
      {
        role: 'assistant',
        content: [{ type: 'text', text: `Fourth: ${largeText} ` }],
      },
    ]

    // ~80k tokens total, with 50k limit should trigger pruning
    // If content wasn't being counted, it would see ~0 tokens and NOT prune
    const results = runHandleSteps(messages, 50000)

    expect(results).toHaveLength(1)
    const resultMessages = results[0].input.messages

    // Should have pruned some messages (replaced with placeholder).
    // If token counting was broken, all 4 messages would remain untouched.
    expect(hasOmittedPlaceholder(resultMessages)).toBe(true)
  })

  test('many small messages have JSON structure overhead counted correctly', () => {
    // When there are MANY small messages, the JSON structure overhead becomes significant:
    // Each message has ~50 chars of structure: {"role":"user","content":[{"type":"text","text":""}]}
    // With 500 messages, that's ~25k chars = ~8k tokens just in structure
    // The old code would undercount because it only counted content parts + minimal metadata
    const messageCount = 500

    const smallMessages: Message[] = Array.from(
      { length: messageCount },
      (_, i): Message => ({
        role: i % 2 === 0 ? 'user' : 'assistant',
        content: [{ type: 'text', text: `msg${i} ` }], // ~5 chars of actual content each
      }),
    )

    // Calculate expected tokens:
    // Each message stringified is roughly: {"role":"user","content":[{"type":"text","text":"msg123"}]}
    // That's about 60-65 chars per message = ~20 tokens per message
    // 500 messages * 20 tokens = ~10k tokens
    // With a 5k limit, should trigger pruning
    // If structure wasn't counted (only content parts), we'd see ~500 * 5 chars / 3 = ~800 tokens
    // which wouldn't trigger pruning

    const results = runHandleSteps(smallMessages, 5000)

    expect(results).toHaveLength(1)
    const resultMessages = results[0].input.messages

    // Should have pruned - either fewer messages or replacement placeholders.
    // If JSON structure wasn't being counted, all 500 messages would fit in 5k tokens.
    const wasPruned =
      resultMessages.length < messageCount ||
      hasOmittedPlaceholder(resultMessages)
    expect(wasPruned).toBe(true)
  })

  test('tool message with large result content is counted correctly', () => {
    // Tool message with large content that must be counted
    const largeResult = 'y'.repeat(90000) // ~30k tokens

    const toolCallMessage: Message = {
      role: 'assistant',
      content: [
        {
          type: 'tool-call',
          toolCallId: 'test-large-result',
          toolName: 'read_files',
          input: { paths: ['big-file.ts'] },
        },
      ],
    }

    const toolResultMessage: ToolMessage = {
      role: 'tool',
      toolCallId: 'test-large-result',
      toolName: 'read_files',
      content: [
        {
          type: 'json',
          value: { content: largeResult },
        },
      ],
    }

    // With 50k limit and ~30k token tool result, should not trigger message-level pruning
    // but may trigger large tool result simplification (>1000 chars)
    const results = runHandleSteps([toolCallMessage, toolResultMessage], 50000)

    expect(results).toHaveLength(1)
    // Both tool call and result should be present (may be simplified but paired)
    const resultMessages = results[0].input.messages
    const hasToolCall = resultMessages.some(
      (m: any) =>
        m.role === 'assistant' &&
        Array.isArray(m.content) &&
        m.content.some((c: any) => c.toolCallId === 'test-large-result'),
    )
    const hasToolResult = resultMessages.some(
      (m: any) => m.role === 'tool' && m.toolCallId === 'test-large-result',
    )
    expect(hasToolCall).toBe(true)
    expect(hasToolResult).toBe(true)
  })
})
1238+
10441239describe ( 'context-pruner edge cases' , ( ) => {
10451240 let mockAgentState : any
10461241
0 commit comments