53 changes: 37 additions & 16 deletions bun.lock

Some generated files are not rendered by default.

2 changes: 2 additions & 0 deletions package.json
@@ -12,9 +12,11 @@
"@ai-sdk/anthropic": "^2.0.56",
"@ai-sdk/google": "^2.0.49",
"@ai-sdk/openai": "^2.0.88",
"@anthropic-ai/tokenizer": "^0.0.4",
"@getzep/zep-cloud": "^3.13.0",
"ai": "^5.0.115",
"drizzle-orm": "^0.45.1",
"js-tiktoken": "^1.0.21",
"mem0ai": "^2.1.38",
"supermemory": "^4.0.0",
"zod": "^3.24.4"
18 changes: 17 additions & 1 deletion src/orchestrator/phases/answer.ts
@@ -14,6 +14,7 @@ import { buildDefaultAnswerPrompt } from "../../prompts/defaults"
import { buildContextString } from "../../types/prompts"
import { ConcurrentExecutor } from "../concurrent"
import { resolveConcurrency } from "../../types/concurrency"
import { countTokens } from "../../utils/tokens"

type LanguageModel =
| ReturnType<typeof createOpenAI>
@@ -118,8 +119,16 @@ export async function runAnswerPhase(
const context: unknown[] = searchData.results || []
const questionDate = checkpoint.questions[question.questionId]?.questionDate

// Build prompts to count tokens separately
const basePrompt = buildAnswerPrompt(question.question, [], questionDate, provider)
const contextStr = buildContextString(context)
const prompt = buildAnswerPrompt(question.question, context, questionDate, provider)

// Count tokens separately for better analytics
const basePromptTokens = countTokens(basePrompt, modelConfig)
const contextTokens = countTokens(contextStr, modelConfig)
const promptTokens = countTokens(prompt, modelConfig)

const params: Record<string, unknown> = {
model: client(modelConfig.id),
prompt,
@@ -136,11 +145,18 @@
checkpointManager.updatePhase(checkpoint, question.questionId, "answer", {
status: "completed",
hypothesis: text.trim(),
promptTokens,
basePromptTokens,
contextTokens,
completedAt: new Date().toISOString(),
durationMs,
})

logger.progress(index + 1, total, `Answered ${question.questionId} (${durationMs}ms)`)
logger.progress(
index + 1,
total,
`Answered ${question.questionId} (${durationMs}ms, ${promptTokens} tokens: ${basePromptTokens} base + ${contextTokens} context)`
)
return { questionId: question.questionId, durationMs }
} catch (e) {
const error = e instanceof Error ? e.message : String(e)
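
The new block counts the base prompt, the context string, and the full prompt independently ("for better analytics"). Note that BPE token counts are not strictly additive across concatenation, so promptTokens need not exactly equal basePromptTokens + contextTokens; a minimal standalone sketch of that effect with js-tiktoken (the example strings are made up):

import { Tiktoken } from "js-tiktoken"
import o200k_base from "js-tiktoken/ranks/o200k_base"

const enc = new Tiktoken(o200k_base)
const base = "Answer the question using the context below."
const context = "\n\nContext:\n- user said hello"

// Summing piecewise counts can differ slightly from counting the
// concatenation, because merges may span the boundary between the strings.
console.log(enc.encode(base).length + enc.encode(context).length)
console.log(enc.encode(base + context).length)
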
39 changes: 39 additions & 0 deletions src/orchestrator/phases/report.ts
@@ -9,6 +9,7 @@ import type {
QuestionTypeStats,
RetrievalMetrics,
RetrievalAggregates,
TokenMetrics,
} from "../../types/unified"
import { logger } from "../../utils/logger"

@@ -185,6 +186,43 @@ export function generateReport(benchmark: Benchmark, checkpoint: RunCheckpoint):

const overallRetrieval = aggregateRetrievalMetrics(allRetrievalMetrics)

// Aggregate token metrics
let tokenMetrics: TokenMetrics | undefined
const allPromptTokens: number[] = []
const allBasePromptTokens: number[] = []
const allContextTokens: number[] = []

for (const question of questions) {
const qCheckpoint = checkpoint.questions[question.questionId]
if (!qCheckpoint) continue

const answerPhase = qCheckpoint.phases.answer
if (answerPhase.status === "completed") {
if (answerPhase.promptTokens) allPromptTokens.push(answerPhase.promptTokens)
if (answerPhase.basePromptTokens) allBasePromptTokens.push(answerPhase.basePromptTokens)
if (answerPhase.contextTokens) allContextTokens.push(answerPhase.contextTokens)
}
}

if (allPromptTokens.length > 0) {
const totalTokens = allPromptTokens.reduce((a, b) => a + b, 0)
const totalBasePromptTokens = allBasePromptTokens.reduce((a, b) => a + b, 0)
const totalContextTokens = allContextTokens.reduce((a, b) => a + b, 0)

tokenMetrics = {
totalTokens,
basePromptTokens: totalBasePromptTokens,
contextTokens: totalContextTokens,
avgTokensPerQuestion: Math.round(totalTokens / allPromptTokens.length),
avgBasePromptTokens: allBasePromptTokens.length > 0
? Math.round(totalBasePromptTokens / allBasePromptTokens.length)
: 0,
avgContextTokens: allContextTokens.length > 0
? Math.round(totalContextTokens / allContextTokens.length)
: 0,
}
}

const totalQuestions = evaluations.length
const correctCount = evaluations.filter((e) => e.score === 1).length
const accuracy = totalQuestions > 0 ? correctCount / totalQuestions : 0
@@ -210,6 +248,7 @@
evaluate: calculateLatencyStats(evaluateDurations),
total: calculateLatencyStats(totalDurations),
},
tokens: tokenMetrics,
retrieval: overallRetrieval,
byQuestionType,
questionTypeRegistry: benchmark.getQuestionTypes(),
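
For orientation, the aggregated object this code attaches under tokens in the report would look roughly like the following (an illustrative sketch; the numbers are invented and assume a 200-question run):

const tokens: TokenMetrics = {
  totalTokens: 400_000,
  basePromptTokens: 60_000,
  contextTokens: 340_000,
  avgTokensPerQuestion: 2_000, // 400_000 / 200 answered questions
  avgBasePromptTokens: 300,
  avgContextTokens: 1_700,
}
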
3 changes: 3 additions & 0 deletions src/types/checkpoint.ts
@@ -55,6 +55,9 @@ export interface SearchPhaseCheckpoint {
export interface AnswerPhaseCheckpoint {
status: PhaseStatus
hypothesis?: string
promptTokens?: number
basePromptTokens?: number
contextTokens?: number
startedAt?: string
completedAt?: string
durationMs?: number
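
With these optional fields in place, a completed answer-phase entry in a checkpoint might look roughly like this (an illustrative sketch, not output from a real run):

const answerPhase: AnswerPhaseCheckpoint = {
  status: "completed",
  hypothesis: "The user moved to Berlin in March.",
  promptTokens: 1_843,    // full prompt
  basePromptTokens: 312,  // prompt without retrieved context
  contextTokens: 1_520,   // retrieved context only
  completedAt: "2025-01-01T12:00:00.000Z",
  durationMs: 1200,
}
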
10 changes: 10 additions & 0 deletions src/types/unified.ts
@@ -91,6 +91,15 @@ export interface QuestionTypeStats {
retrieval?: RetrievalAggregates
}

export interface TokenMetrics {
totalTokens: number
basePromptTokens: number
contextTokens: number
avgTokensPerQuestion: number
avgBasePromptTokens: number
avgContextTokens: number
}

export interface BenchmarkResult {
provider: string
benchmark: string
@@ -112,6 +121,7 @@
evaluate: LatencyStats
total: LatencyStats
}
tokens?: TokenMetrics
retrieval?: RetrievalAggregates
byQuestionType: Record<string, QuestionTypeStats>
questionTypeRegistry?: QuestionTypeRegistry
54 changes: 54 additions & 0 deletions src/utils/tokens.ts
@@ -0,0 +1,54 @@
import { Tiktoken } from "js-tiktoken"
import cl100k_base from "js-tiktoken/ranks/cl100k_base"
import o200k_base from "js-tiktoken/ranks/o200k_base"
import { countTokens as countAnthropicTokens } from "@anthropic-ai/tokenizer"
import type { ModelConfig } from "./models"

/**
 * Count tokens in a text string based on the model being used
 */
export function countTokens(text: string, modelConfig: ModelConfig): number {
  const provider = modelConfig.provider

  if (provider === "openai") {
    return countOpenAITokens(text, modelConfig.id)
  } else if (provider === "anthropic") {
    return countAnthropicTokens(text)
  } else if (provider === "google") {
    // Google doesn't have a standard tokenizer for JS
    // Use approximation: ~4 characters per token
    return Math.ceil(text.length / 4)
  }

  // Fallback approximation
  return Math.ceil(text.length / 4)
}

/**
 * Count tokens for OpenAI models using tiktoken
 */
function countOpenAITokens(text: string, modelId: string): number {
  // Determine which encoding to use based on model
  // o200k_base is used for GPT-4o and newer models
  // cl100k_base is used for GPT-4, GPT-3.5-turbo
  try {
    let encoding: Tiktoken

    if (
      modelId.includes("gpt-4o") ||
      modelId.includes("gpt-4.1") ||
      modelId.includes("gpt-5")
    ) {
      encoding = new Tiktoken(o200k_base)
    } else {
      // Default to cl100k_base for other GPT-4 models
      encoding = new Tiktoken(cl100k_base)
    }

    const tokens = encoding.encode(text)
    return tokens.length
  } catch (error) {
    // Fallback to approximation if encoding fails
    return Math.ceil(text.length / 4)
  }
}
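
A quick usage sketch for the new helper (the ModelConfig literals below are assumptions made for illustration; real configs come from ./models):

import { countTokens } from "./tokens"
import type { ModelConfig } from "./models"

// Hypothetical configs; the actual model registry may use different fields/ids.
const openaiModel = { provider: "openai", id: "gpt-4o-mini" } as ModelConfig
const anthropicModel = { provider: "anthropic", id: "claude-sonnet-4" } as ModelConfig

const text = "How many tokens is this sentence?"
console.log(countTokens(text, openaiModel))    // exact count via o200k_base
console.log(countTokens(text, anthropicModel)) // via @anthropic-ai/tokenizer

Worth noting: @anthropic-ai/tokenizer implements the legacy Claude tokenizer, so counts for Claude 3+ models are approximate, and the Google/fallback path is a rough characters-per-token heuristic rather than a real tokenizer.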