31 changes: 31 additions & 0 deletions packages/types/src/codebase-index.ts
@@ -12,6 +12,21 @@ export const CODEBASE_INDEX_DEFAULTS = {
MAX_SEARCH_SCORE: 1,
DEFAULT_SEARCH_MIN_SCORE: 0.4,
SEARCH_SCORE_STEP: 0.05,
// Embedding batch size settings
MIN_EMBEDDING_BATCH_SIZE: 10,
MAX_EMBEDDING_BATCH_SIZE: 200,
DEFAULT_EMBEDDING_BATCH_SIZE: 60,
EMBEDDING_BATCH_SIZE_STEP: 10,
// Max chunk size settings (characters per code chunk)
MIN_MAX_CHUNK_SIZE: 200,
MAX_MAX_CHUNK_SIZE: 5000,
DEFAULT_MAX_CHUNK_SIZE: 1000,
MAX_CHUNK_SIZE_STEP: 100,
// Parsing concurrency settings (concurrent file parsing)
MIN_PARSING_CONCURRENCY: 1,
MAX_PARSING_CONCURRENCY: 50,
DEFAULT_PARSING_CONCURRENCY: 10,
PARSING_CONCURRENCY_STEP: 1,
} as const

/**
@@ -42,6 +57,22 @@ export const codebaseIndexConfigSchema = z.object({
.min(CODEBASE_INDEX_DEFAULTS.MIN_SEARCH_RESULTS)
.max(CODEBASE_INDEX_DEFAULTS.MAX_SEARCH_RESULTS)
.optional(),
// Advanced indexing parameters
codebaseIndexEmbeddingBatchSize: z
.number()
.min(CODEBASE_INDEX_DEFAULTS.MIN_EMBEDDING_BATCH_SIZE)
.max(CODEBASE_INDEX_DEFAULTS.MAX_EMBEDDING_BATCH_SIZE)
.optional(),
codebaseIndexMaxChunkSize: z
.number()
.min(CODEBASE_INDEX_DEFAULTS.MIN_MAX_CHUNK_SIZE)
.max(CODEBASE_INDEX_DEFAULTS.MAX_MAX_CHUNK_SIZE)
.optional(),
codebaseIndexParsingConcurrency: z
.number()
.min(CODEBASE_INDEX_DEFAULTS.MIN_PARSING_CONCURRENCY)
.max(CODEBASE_INDEX_DEFAULTS.MAX_PARSING_CONCURRENCY)
.optional(),
// OpenAI Compatible specific fields
codebaseIndexOpenAiCompatibleBaseUrl: z.string().optional(),
codebaseIndexOpenAiCompatibleModelDimension: z.number().optional(),
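
Note on the schema additions: the three new fields reuse the min/max bounds from CODEBASE_INDEX_DEFAULTS, so out-of-range values are rejected at validation time rather than silently clamped. A minimal sketch of that behavior, assuming codebaseIndexConfigSchema is exported from @roo-code/types and that the schema fields outside this hunk are likewise optional:

import { codebaseIndexConfigSchema } from "@roo-code/types" // assumed export path

// Within bounds (10..200 batch size, 200..5000 chunk size, 1..50 concurrency): accepted
const ok = codebaseIndexConfigSchema.safeParse({
	codebaseIndexEmbeddingBatchSize: 100,
	codebaseIndexMaxChunkSize: 1500,
	codebaseIndexParsingConcurrency: 20,
})
console.log(ok.success) // true, provided the remaining schema fields are optional

// Out of bounds: rejected by .max(MAX_EMBEDDING_BATCH_SIZE)
const bad = codebaseIndexConfigSchema.safeParse({ codebaseIndexEmbeddingBatchSize: 500 })
console.log(bad.success) // false
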
45 changes: 44 additions & 1 deletion src/services/code-index/config-manager.ts
@@ -2,7 +2,13 @@ import { ApiHandlerOptions } from "../../shared/api"
import { ContextProxy } from "../../core/config/ContextProxy"
import { EmbedderProvider } from "./interfaces/manager"
import { CodeIndexConfig, PreviousConfigSnapshot } from "./interfaces/config"
import { DEFAULT_SEARCH_MIN_SCORE, DEFAULT_MAX_SEARCH_RESULTS } from "./constants"
import {
DEFAULT_SEARCH_MIN_SCORE,
DEFAULT_MAX_SEARCH_RESULTS,
BATCH_SEGMENT_THRESHOLD,
MAX_BLOCK_CHARS,
PARSING_CONCURRENCY,
} from "./constants"
import { getDefaultModelId, getModelDimension, getModelScoreThreshold } from "../../shared/embeddingModels"

/**
@@ -26,6 +32,10 @@ export class CodeIndexConfigManager {
private qdrantApiKey?: string
private searchMinScore?: number
private searchMaxResults?: number
// Advanced indexing parameters
private embeddingBatchSize?: number
private maxChunkSize?: number
private parsingConcurrency?: number

constructor(private readonly contextProxy: ContextProxy) {
// Initialize with current configuration to avoid false restart triggers
@@ -87,6 +97,11 @@ export class CodeIndexConfigManager {
this.searchMinScore = codebaseIndexSearchMinScore
this.searchMaxResults = codebaseIndexSearchMaxResults

// Load advanced indexing parameters
this.embeddingBatchSize = codebaseIndexConfig.codebaseIndexEmbeddingBatchSize
this.maxChunkSize = codebaseIndexConfig.codebaseIndexMaxChunkSize
this.parsingConcurrency = codebaseIndexConfig.codebaseIndexParsingConcurrency

// Validate and set model dimension
const rawDimension = codebaseIndexConfig.codebaseIndexEmbedderModelDimension
if (rawDimension !== undefined && rawDimension !== null) {
@@ -460,6 +475,10 @@
qdrantApiKey: this.qdrantApiKey,
searchMinScore: this.currentSearchMinScore,
searchMaxResults: this.currentSearchMaxResults,
// Advanced indexing parameters
embeddingBatchSize: this.currentEmbeddingBatchSize,
maxChunkSize: this.currentMaxChunkSize,
parsingConcurrency: this.currentParsingConcurrency,
}
}

@@ -541,4 +560,28 @@
public get currentSearchMaxResults(): number {
return this.searchMaxResults ?? DEFAULT_MAX_SEARCH_RESULTS
}

/**
* Gets the configured embedding batch size.
* Returns user setting if configured, otherwise returns default.
*/
public get currentEmbeddingBatchSize(): number {
return this.embeddingBatchSize ?? BATCH_SEGMENT_THRESHOLD
}

/**
* Gets the configured max chunk size (characters per code chunk).
* Returns user setting if configured, otherwise returns default.
*/
public get currentMaxChunkSize(): number {
return this.maxChunkSize ?? MAX_BLOCK_CHARS
}

/**
* Gets the configured parsing concurrency (concurrent file parsing).
* Returns user setting if configured, otherwise returns default.
*/
public get currentParsingConcurrency(): number {
return this.parsingConcurrency ?? PARSING_CONCURRENCY
}
}
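
The new getters follow the same pattern as currentSearchMinScore and currentSearchMaxResults: return the user's setting when present, otherwise fall back to the shared constant, so omitting the settings leaves behavior identical to before this change. A sketch of what a caller sees (the declare lines stand in for values assumed to exist; the real call sites are outside this diff):

import { CodeIndexConfigManager } from "./config-manager" // path illustrative

declare const configManager: CodeIndexConfigManager // assumed: an initialized manager

const batchSize = configManager.currentEmbeddingBatchSize // user setting, else BATCH_SEGMENT_THRESHOLD (60)
const chunkSize = configManager.currentMaxChunkSize // user setting, else MAX_BLOCK_CHARS (1000)
const concurrency = configManager.currentParsingConcurrency // user setting, else PARSING_CONCURRENCY (10)

// Presumably these feed the parser/scanner construction elsewhere in the service (not shown in this PR).
console.log({ batchSize, chunkSize, concurrency })
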
6 changes: 3 additions & 3 deletions src/services/code-index/constants/index.ts
@@ -1,7 +1,7 @@
import { CODEBASE_INDEX_DEFAULTS } from "@roo-code/types"

/**Parser */
export const MAX_BLOCK_CHARS = 1000
export const MAX_BLOCK_CHARS = CODEBASE_INDEX_DEFAULTS.DEFAULT_MAX_CHUNK_SIZE
export const MIN_BLOCK_CHARS = 50
export const MIN_CHUNK_REMAINDER_CHARS = 200 // Minimum characters for the *next* chunk after a split
export const MAX_CHARS_TOLERANCE_FACTOR = 1.15 // 15% tolerance for max chars
@@ -16,10 +16,10 @@ export const MAX_FILE_SIZE_BYTES = 1 * 1024 * 1024 // 1MB

/**Directory Scanner */
export const MAX_LIST_FILES_LIMIT_CODE_INDEX = 50_000
export const BATCH_SEGMENT_THRESHOLD = 60 // Number of code segments to batch for embeddings/upserts
export const BATCH_SEGMENT_THRESHOLD = CODEBASE_INDEX_DEFAULTS.DEFAULT_EMBEDDING_BATCH_SIZE // Number of code segments to batch for embeddings/upserts
export const MAX_BATCH_RETRIES = 3
export const INITIAL_RETRY_DELAY_MS = 500
export const PARSING_CONCURRENCY = 10
export const PARSING_CONCURRENCY = CODEBASE_INDEX_DEFAULTS.DEFAULT_PARSING_CONCURRENCY
export const MAX_PENDING_BATCHES = 20 // Maximum number of batches to accumulate before waiting

/**OpenAI Embedder */
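
This hunk only changes where the numbers come from: the literals 1000, 60 and 10 are replaced by references to the shared defaults, which carry the same values, so existing behavior is unchanged. A quick sanity check (the import path is illustrative):

import { MAX_BLOCK_CHARS, BATCH_SEGMENT_THRESHOLD, PARSING_CONCURRENCY } from "./constants"

// The refactor moves the source of truth to CODEBASE_INDEX_DEFAULTS without changing the values.
console.assert(MAX_BLOCK_CHARS === 1000)
console.assert(BATCH_SEGMENT_THRESHOLD === 60)
console.assert(PARSING_CONCURRENCY === 10)
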
4 changes: 4 additions & 0 deletions src/services/code-index/interfaces/config.ts
@@ -21,6 +21,10 @@ export interface CodeIndexConfig {
qdrantApiKey?: string
searchMinScore?: number
searchMaxResults?: number
// Advanced indexing parameters
embeddingBatchSize?: number
maxChunkSize?: number
parsingConcurrency?: number
}

/**
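
CodeIndexConfig gains three optional fields, so existing config objects keep compiling. A small sketch of layering user overrides onto an existing config (baseConfig is assumed to satisfy the fields not shown in this hunk; the import path is illustrative):

import type { CodeIndexConfig } from "../interfaces/config" // path illustrative

declare const baseConfig: CodeIndexConfig // assumed: covers the required fields outside this hunk

const withOverrides: CodeIndexConfig = {
	...baseConfig,
	embeddingBatchSize: 100,
	maxChunkSize: 2000,
	parsingConcurrency: 5,
}
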
18 changes: 15 additions & 3 deletions src/services/code-index/processors/file-watcher.ts
@@ -18,8 +18,8 @@ import {
IVectorStore,
PointStruct,
BatchProcessingSummary,
ICodeParser,
} from "../interfaces"
import { codeParser } from "./parser"
import { CacheManager } from "../cache-manager"
import { generateNormalizedAbsolutePath, generateRelativeFilePath } from "../shared/get-relative-path"
import { isPathInIgnoredDirectory } from "../../glob/ignore-utils"
@@ -68,9 +68,13 @@ export class FileWatcher implements IFileWatcher {
* Creates a new file watcher
* @param workspacePath Path to the workspace
* @param context VS Code extension context
* @param cacheManager Cache manager
* @param embedder Optional embedder
* @param vectorStore Optional vector store
* @param cacheManager Cache manager
* @param ignoreInstance Optional ignore instance
* @param ignoreController Optional ignore controller
* @param batchSegmentThreshold Optional batch segment threshold
* @param codeParser Code parser for parsing files
*/
constructor(
private workspacePath: string,
@@ -81,6 +85,7 @@
ignoreInstance?: Ignore,
ignoreController?: RooIgnoreController,
batchSegmentThreshold?: number,
private readonly codeParser?: ICodeParser,
) {
this.ignoreController = ignoreController || new RooIgnoreController(workspacePath)
if (ignoreInstance) {
@@ -557,7 +562,14 @@
}

// Parse file
const blocks = await codeParser.parseFile(filePath, { content, fileHash: newHash })
if (!this.codeParser) {
return {
path: filePath,
status: "local_error" as const,
error: new Error("No code parser configured"),
}
}
const blocks = await this.codeParser.parseFile(filePath, { content, fileHash: newHash })

// Prepare points for batch processing
let pointsToUpsert: PointStruct[] = []
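
FileWatcher previously imported the module-level codeParser singleton; it now receives an ICodeParser through its constructor, and a missing parser surfaces as a per-file local_error result rather than a hard module dependency. One practical consequence is that callers can supply a parser configured with the user's chunk size, and tests can supply a stub. A rough sketch (import paths illustrative; the stub is cast loosely because only parseFile is visible in this diff):

import { CodeParser } from "./parser" // path illustrative
import type { ICodeParser } from "../interfaces"
import { CodeIndexConfigManager } from "../config-manager"

declare const configManager: CodeIndexConfigManager // assumed in scope

// Production-style injection: a parser built from the user's chunk size setting.
const injectedParser: ICodeParser = new CodeParser(configManager.currentMaxChunkSize)

// Test-style injection: a minimal stub that yields no code blocks, cast loosely because
// ICodeParser may declare more members than the parseFile method shown in this diff.
const stubParser = {
	parseFile: async (_filePath: string, _opts?: { content?: string; fileHash?: string }) => [],
} as unknown as ICodeParser
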
23 changes: 16 additions & 7 deletions src/services/code-index/processors/parser.ts
@@ -17,9 +17,18 @@ import { sanitizeErrorMessage } from "../shared/validation-helpers"
export class CodeParser implements ICodeParser {
private loadedParsers: LanguageParser = {}
private pendingLoads: Map<string, Promise<LanguageParser>> = new Map()
private readonly maxBlockChars: number
// Markdown files are now supported using the custom markdown parser
// which extracts headers and sections for semantic indexing

/**
* Creates a new CodeParser instance
* @param maxBlockChars Maximum characters per code chunk (default: MAX_BLOCK_CHARS from constants)
*/
constructor(maxBlockChars?: number) {
this.maxBlockChars = maxBlockChars ?? MAX_BLOCK_CHARS
}

/**
* Parses a code file into code blocks
* @param filePath Path to the file to parse
@@ -179,7 +188,7 @@
// Check if the node meets the minimum character requirement
if (currentNode.text.length >= MIN_BLOCK_CHARS) {
// If it also exceeds the maximum character limit, try to break it down
if (currentNode.text.length > MAX_BLOCK_CHARS * MAX_CHARS_TOLERANCE_FACTOR) {
if (currentNode.text.length > this.maxBlockChars * MAX_CHARS_TOLERANCE_FACTOR) {
if (currentNode.children.filter((child) => child !== null).length > 0) {
// If it has children, process them instead
queue.push(...currentNode.children.filter((child) => child !== null))
@@ -244,7 +253,7 @@
let currentChunkLines: string[] = []
let currentChunkLength = 0
let chunkStartLineIndex = 0 // 0-based index within the `lines` array
const effectiveMaxChars = MAX_BLOCK_CHARS * MAX_CHARS_TOLERANCE_FACTOR
const effectiveMaxChars = this.maxBlockChars * MAX_CHARS_TOLERANCE_FACTOR

const finalizeChunk = (endLineIndex: number) => {
if (currentChunkLength >= MIN_BLOCK_CHARS && currentChunkLines.length > 0) {
@@ -314,10 +323,10 @@
let remainingLineContent = line
let currentSegmentStartChar = 0
while (remainingLineContent.length > 0) {
const segment = remainingLineContent.substring(0, MAX_BLOCK_CHARS)
remainingLineContent = remainingLineContent.substring(MAX_BLOCK_CHARS)
const segment = remainingLineContent.substring(0, this.maxBlockChars)
remainingLineContent = remainingLineContent.substring(this.maxBlockChars)
createSegmentBlock(segment, originalLineNumber, currentSegmentStartChar)
currentSegmentStartChar += MAX_BLOCK_CHARS
currentSegmentStartChar += this.maxBlockChars
}
// Update chunkStartLineIndex to continue processing from the next line
chunkStartLineIndex = i + 1
@@ -425,8 +434,8 @@

// Check if content needs chunking (either total size or individual line size)
const needsChunking =
content.length > MAX_BLOCK_CHARS * MAX_CHARS_TOLERANCE_FACTOR ||
lines.some((line) => line.length > MAX_BLOCK_CHARS * MAX_CHARS_TOLERANCE_FACTOR)
content.length > this.maxBlockChars * MAX_CHARS_TOLERANCE_FACTOR ||
lines.some((line) => line.length > this.maxBlockChars * MAX_CHARS_TOLERANCE_FACTOR)

if (needsChunking) {
// Apply chunking for large content or oversized lines
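
Inside the parser, every chunking comparison that used the module constant now reads this.maxBlockChars, so two parser instances can chunk at different sizes while the zero-argument constructor preserves the old behavior. A short usage sketch (file path, content and hash are illustrative; the options shape matches the file watcher's call above):

import { CodeParser } from "./parser" // path illustrative

declare const sourceText: string // illustrative file content
declare const contentHash: string // illustrative content hash

const defaultParser = new CodeParser() // chunks at MAX_BLOCK_CHARS (1000 characters), as before
const coarseParser = new CodeParser(3000) // larger chunks for this instance only

const blocks = await coarseParser.parseFile("src/example.ts", {
	content: sourceText, // content and fileHash supplied as in file-watcher.ts
	fileHash: contentHash,
})
console.log(blocks.length)
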
6 changes: 5 additions & 1 deletion src/services/code-index/processors/scanner.ts
@@ -33,6 +33,7 @@ import { Package } from "../../../shared/package"

export class DirectoryScanner implements IDirectoryScanner {
private readonly batchSegmentThreshold: number
private readonly parsingConcurrencyLimit: number

constructor(
private readonly embedder: IEmbedder,
@@ -41,6 +42,7 @@ export class DirectoryScanner implements IDirectoryScanner {
private readonly cacheManager: CacheManager,
private readonly ignoreInstance: Ignore,
batchSegmentThreshold?: number,
parsingConcurrencyLimit?: number,
) {
// Get the configurable batch size from VSCode settings, fallback to default
// If not provided in constructor, try to get from VSCode settings
@@ -56,6 +58,8 @@ export class DirectoryScanner implements IDirectoryScanner {
this.batchSegmentThreshold = BATCH_SEGMENT_THRESHOLD
}
}
// Set parsing concurrency (default from constants if not provided)
this.parsingConcurrencyLimit = parsingConcurrencyLimit ?? PARSING_CONCURRENCY
}

/**
@@ -109,7 +113,7 @@
let skippedCount = 0

// Initialize parallel processing tools
const parseLimiter = pLimit(PARSING_CONCURRENCY) // Concurrency for file parsing
const parseLimiter = pLimit(this.parsingConcurrencyLimit) // Concurrency for file parsing
const batchLimiter = pLimit(BATCH_PROCESSING_CONCURRENCY) // Concurrency for batch processing
const mutex = new Mutex()

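
The scanner's parse limiter now uses the injected concurrency instead of the fixed constant, with PARSING_CONCURRENCY as the fallback. The limiter pattern itself is unchanged; a standalone sketch of what pLimit(this.parsingConcurrencyLimit) does (the file list and parse callback are placeholders):

import pLimit from "p-limit"

// At most `concurrency` parse operations run at once; the rest wait in a queue.
async function parseAll<T>(filePaths: string[], parse: (filePath: string) => Promise<T>, concurrency = 10) {
	const limiter = pLimit(concurrency)
	return Promise.all(filePaths.map((filePath) => limiter(() => parse(filePath))))
}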