volcengine · ByteDanceLiuYang · May 20, 2026 · May 20, 2026 · May 21, 2026 · May 21, 2026
diff --git a/benchmark/retrieval/grep/vikingdb_bm25/step1_generate.py b/benchmark/retrieval/grep/vikingdb_bm25/step1_generate.py
@@ -0,0 +1,166 @@
+#!/usr/bin/env python3
+"""Generate benchmark data for grep bm25 vs fs comparison.
+
+Produces ~80,000 markdown files (~50KB each, ~4GB total) in a 4-level
+directory tree:
+  level0: 10 dirs
+  level1: 10 dirs per level0  (100 total)
+  level2: 10 dirs per level1  (1,000 total)
+  level3: 8 dirs per level2   (8,000 total)
+  files:  10 per level3 dir   (80,000 total)
+
+Target keywords appear in ~1% of files each, simulating a realistic
+large-scale codebase where bm25 recall dramatically reduces search scope.
+"""
+
+import os
+import random
+
+# Root directory that receives the generated tree.
+BASE_DIR = os.path.expanduser("~/.openviking/data/benchmark")
+
+# Fan-out of the four directory levels (level0 is the top level, level3 the
+# leaf level that holds the files). Each count is per parent directory; the
+# level3 count is the one marked as flexible.
+LEVEL0_DIRS, LEVEL1_DIRS, LEVEL2_DIRS, LEVEL3_DIRS = 10, 10, 10, 8
+FILES_PER_DIR = 10  # markdown files written into every level3 directory
+
+# With these defaults:
+#   files     10 * 10 * 10 * 8 * 10 = 80,000
+#   size      80,000 * 50KB ≈ 4GB
+#   per top-level dir  10 * 10 * 8 * 10 * 50KB = 400MB
+
+# Length of every generated file, in characters (~50KB).
+TARGET_FILE_SIZE = 50_000
+
+# Search terms that get injected into a small fraction of the files.
+TARGET_KEYWORDS = [
+    "VikingDB",
+    "FullText",
+    "bm25",
+    "search_by_keywords",
+]
+
+# Vocabulary for all filler text: section titles, sentences and padding lines
+# are drawn from this list. Order matters because the generators index into it
+# through the seeded random module.
+# fmt: off
+FILLER_WORDS = [
+    "configuration", "deployment", "architecture", "implementation",
+    "performance", "optimization", "integration", "middleware",
+    "authentication", "authorization", "encryption", "validation",
+    "monitoring", "logging", "caching", "serialization",
+    "concurrency", "scalability", "reliability", "observability",
+    "throughput", "latency", "availability", "consistency",
+    "partitioning", "replication", "failover", "loadbalancing",
+    "containerization", "orchestration", "provisioning", "lifecycle",
+]
+# fmt: on
+
+# Seed the shared random generator with a constant so keyword placement and
+# file content are the same on every run.
+random.seed(42)
+
+total_files = LEVEL0_DIRS * LEVEL1_DIRS * LEVEL2_DIRS * LEVEL3_DIRS * FILES_PER_DIR
+# 1% of all files, but never fewer than one (800 with the default tree).
+keyword_hit_count = max(1, total_files // 100)
+file_indices = list(range(total_files))
+# keyword -> set of global file indices that must contain it. Every keyword is
+# sampled independently, so one file can be selected for several keywords.
+keyword_files = {
+    kw: set(random.sample(file_indices, keyword_hit_count)) for kw in TARGET_KEYWORDS
+}
+
+
+def generate_section(title_level):
+    """Build one markdown section made of a heading and filler paragraphs.
+
+    The heading uses ``title_level`` ``#`` characters followed by three distinct
+    filler words in title case. It is followed by 2-5 paragraphs, each holding
+    3-8 sentences of 8-15 filler words. Paragraphs are separated by a blank
+    line and the section ends with a blank line.
+    """
+    hashes = "#" * title_level
+    heading_text = " ".join(random.sample(FILLER_WORDS, 3)).title()
+
+    body = []
+    paragraph_count = random.randint(2, 5)
+    for _ in range(paragraph_count):
+        sentence_count = random.randint(3, 8)
+        # The generator is consumed in order, so the random draws happen
+        # sentence by sentence: first the word count, then the words.
+        body.append(
+            " ".join(
+                " ".join(random.choices(FILLER_WORDS, k=random.randint(8, 15))).capitalize() + "."
+                for _ in range(sentence_count)
+            )
+        )
+
+    return f"{hashes} {heading_text}\n\n" + "\n\n".join(body) + "\n\n"
+
+
+def generate_file(file_idx):
+    """Generate the markdown text of one benchmark file.
+
+    The body consists of 3-5 h1 sections, each followed by 5-10 h2 sections.
+    For every target keyword whose selected index set contains ``file_idx``, a
+    two-sentence paragraph mentioning that keyword is appended to the third
+    generated section. Content shorter than ``TARGET_FILE_SIZE`` is extended
+    with an "Appendix" of filler sentences, and the result is finally cut to
+    ``TARGET_FILE_SIZE`` characters.
+
+    Args:
+        file_idx: Global zero-based index of the file; looked up in
+            ``keyword_files`` to decide which keywords to inject.
+
+    Returns:
+        The file content as a string of ``TARGET_FILE_SIZE`` characters.
+    """
+    parts = []
+    num_h1 = random.randint(3, 5)
+    for _ in range(num_h1):
+        parts.append(generate_section(1))
+        num_h2 = random.randint(5, 10)
+        for _ in range(num_h2):
+            parts.append(generate_section(2))
+
+    # Inject every keyword this file was selected for. parts[0] is the first h1
+    # section, parts[1] its first h2 and parts[2] its second h2, so the text
+    # lands at the end of the second h2 section. That is early in the file, so
+    # with the current section size limits the final truncation cannot cut it.
+    for kw, indices in keyword_files.items():
+        if file_idx in indices:
+            injection = (
+                f"\nThis module provides {kw} integration for advanced search capabilities. "
+                f"The {kw} feature enables efficient keyword-based retrieval across large datasets.\n\n"
+            )
+            parts[2] += injection
+
+    content = "".join(parts)
+
+    # Pad to the target size if needed. The padding length is tracked
+    # incrementally instead of re-joining the whole list on every iteration.
+    shortfall = TARGET_FILE_SIZE - len(content)
+    if shortfall > 0:
+        padding_parts = []
+        padding_len = 0
+        while padding_len < shortfall:
+            words = random.choices(FILLER_WORDS, k=20)
+            line = " ".join(words).capitalize() + ".\n"
+            padding_parts.append(line)
+            padding_len += len(line)
+        content += "\n\n## Appendix\n\n" + "".join(padding_parts)
+
+    return content[:TARGET_FILE_SIZE]
+
+
+def _ensure_dir(parent, name):
+    """Create ``parent/name`` if it is missing and return the joined path."""
+    path = os.path.join(parent, name)
+    os.makedirs(path, exist_ok=True)
+    return path
+
+
+# Print a summary of what is about to be generated.
+banner = [
+    f"Generating {total_files} markdown files under {BASE_DIR}...",
+    f"  Tree: level0={LEVEL0_DIRS} x level1={LEVEL1_DIRS} x level2={LEVEL2_DIRS} x level3={LEVEL3_DIRS}",
+    f"  Files per leaf dir: {FILES_PER_DIR}",
+    f"  Target keywords: {TARGET_KEYWORDS}",
+    (
+        f"  Each keyword appears in ~{keyword_hit_count} files out of {total_files} "
+        f"(~{keyword_hit_count / total_files * 100:.1f}%)"
+    ),
+    f"  Estimated total size: ~{total_files * TARGET_FILE_SIZE / 1e9:.1f} GB",
+]
+print("\n".join(banner))
+
+# Running global file index; it is what generate_file() matches against the
+# per-keyword index sets, so files must be generated in this fixed order.
+file_idx = 0
+os.makedirs(BASE_DIR, exist_ok=True)
+
+for n0 in range(LEVEL0_DIRS):
+    level0_dir = _ensure_dir(BASE_DIR, f"level0_{n0:02d}")
+    for n1 in range(LEVEL1_DIRS):
+        level1_dir = _ensure_dir(level0_dir, f"level1_{n1:02d}")
+        for n2 in range(LEVEL2_DIRS):
+            level2_dir = _ensure_dir(level1_dir, f"level2_{n2:02d}")
+            for n3 in range(LEVEL3_DIRS):
+                leaf_dir = _ensure_dir(level2_dir, f"level3_{n3:02d}")
+                for doc_no in range(FILES_PER_DIR):
+                    target_path = os.path.join(leaf_dir, f"doc_{doc_no:04d}.md")
+                    text = generate_file(file_idx)
+                    with open(target_path, "w") as out:
+                        out.write(text)
+                    file_idx += 1
+                    # Progress report every 10,000 files.
+                    if file_idx % 10000 == 0:
+                        print(f"  ... {file_idx} files written")
+
+print(f"Done! {file_idx} files generated under {BASE_DIR}")
diff --git a/benchmark/retrieval/grep/vikingdb_bm25/step2_quick_add_resource.py b/benchmark/retrieval/grep/vikingdb_bm25/step2_quick_add_resource.py
@@ -0,0 +1,123 @@
+#!/usr/bin/env python3
+"""Step 2: Quick upload — import benchmark files skipping VLM+embedding.
+
+Walks the benchmark directory and uploads each file via the OpenViking Python SDK
+with build_index=False, which skips VLM summarization and embedding. This makes
+the upload phase fast and avoids circuit-breaker issues from VLM failures.
+
+After all files are uploaded, run step3_build_index.py to trigger VLM+embedding
+in a controlled batch, then step4_benchmark.py to measure grep performance.
+
+Supports resume: a progress file (.add_resource_progress) tracks completed files.
+If interrupted, re-run to automatically skip already-imported files.
+
+Usage:
+  python3 step2_quick_add_resource.py [--no-resume] [--max-failures N]
+"""
+
+import argparse
+import os
+import sys
+
+# Root against which uploaded files are made relative; the relative path is
+# both the resume key and the basis of the parent URI.
+DATA_DIR = os.path.expanduser("~/.openviking/data")
+# Directory that is walked for the benchmark markdown files.
+BASE_DIR = os.path.expanduser("~/.openviking/data/benchmark")
+# Resume state: one imported relative path per line.
+PROGRESS_FILE = os.path.join(BASE_DIR, ".add_resource_progress")
+
+
+def load_progress() -> set:
+    """Return the relative paths recorded in the progress file.
+
+    Each line is stripped of surrounding whitespace and blank lines are
+    ignored. A missing progress file yields an empty set.
+    """
+    if not os.path.exists(PROGRESS_FILE):
+        return set()
+
+    with open(PROGRESS_FILE) as progress:
+        entries = (raw.strip() for raw in progress)
+        return {entry for entry in entries if entry}
+
+
+def save_progress(rel_path: str) -> None:
+    """Record ``rel_path`` as imported by appending it to the progress file.
+
+    The line is flushed and fsynced before the file is closed, so the entry is
+    handed to the operating system for writing before this function returns.
+    """
+    with open(PROGRESS_FILE, "a") as progress:
+        progress.writelines((rel_path, "\n"))
+        progress.flush()
+        os.fsync(progress.fileno())
+
+
+def main():
+    """Upload every benchmark markdown file with index building disabled.
+
+    Walks ``BASE_DIR`` in sorted order, skips files already listed in the
+    progress file (unless ``--no-resume`` is given) and uploads the rest via
+    ``client.add_resource(..., build_index=False, wait=False)``. Each file that
+    uploads without raising is appended to the progress file.
+
+    Command-line options:
+        --no-resume: Discard the existing progress file and upload everything.
+        --max-failures N: Exit with status 1 once N uploads have failed.
+    """
+    parser = argparse.ArgumentParser(
+        description="Step 2: Quick upload benchmark files (skip VLM+embedding)"
+    )
+    parser.add_argument(
+        "--no-resume", action="store_true", help="Disable auto-resume, start from scratch"
+    )
+    parser.add_argument(
+        "--max-failures", type=int, default=10, help="Abort after N failures (default: 10)"
+    )
+    args = parser.parse_args()
+
+    # Imported after argument parsing, so --help and argument errors are
+    # handled before the SDK is loaded.
+    from openviking.sync_client import SyncOpenViking
+
+    client = SyncOpenViking()
+    client.initialize()
+
+    # Collect all files first (deterministic order). Sorting ``dirs`` in place
+    # also fixes the order in which os.walk descends into subdirectories.
+    all_files = []
+    for root, dirs, files in os.walk(BASE_DIR):
+        dirs.sort()
+        all_files.extend(
+            os.path.join(root, fname) for fname in sorted(files) if fname.endswith(".md")
+        )
+
+    # Load resume state
+    done_set = set()
+    if args.no_resume:
+        # "Start from scratch" must also drop the old progress file. Otherwise
+        # an interrupted --no-resume run followed by a normal run would skip
+        # files based on entries left over from earlier runs.
+        try:
+            os.remove(PROGRESS_FILE)
+        except FileNotFoundError:
+            pass
+    else:
+        done_set = load_progress()
+        if done_set:
+            print(f"Resuming: {len(done_set)} files already imported (from {PROGRESS_FILE})")
+
+    total = len(all_files)
+    count = 0  # files uploaded successfully in this run
+    skipped = 0  # files already present in the progress file
+    failed = 0  # files whose upload (or progress write) raised
+
+    for idx, filepath in enumerate(all_files, start=1):
+        rel = os.path.relpath(filepath, DATA_DIR)
+
+        # Skip already-imported files
+        if rel in done_set:
+            skipped += 1
+            continue
+
+        # URIs use forward slashes regardless of the local path separator.
+        rel_dir = os.path.dirname(rel).replace(os.sep, "/")
+        parent_uri = f"viking://resources/{rel_dir}"
+
+        print(f"[{idx}/{total}] Uploading {rel} ...", end=" ", flush=True)
+
+        try:
+            # NOTE(review): with wait=False the call presumably returns before
+            # server-side processing finishes; confirm that returning without
+            # an exception is enough to treat the file as imported.
+            client.add_resource(
+                path=filepath,
+                parent=parent_uri,
+                build_index=False,
+                wait=False,
+                create_parent=True,
+            )
+            print("OK")
+            save_progress(rel)
+        except Exception as e:
+            # Broad on purpose: the SDK's exception types are not known here,
+            # and a single bad file must not stop the whole import.
+            print(f"FAILED: {e}")
+            failed += 1
+            if failed >= args.max_failures:
+                print(f"\nToo many failures ({failed}), aborting. Re-run to resume.")
+                sys.exit(1)
+        else:
+            # Only successful uploads count, so the totals below are accurate.
+            count += 1
+            if count % 100 == 0:
+                print(f"  ... {count} files uploaded this run ({failed} failed, {skipped} skipped)")
+
+    print(f"\nDone! {count} uploaded, {skipped} skipped, {failed} failed")
+    if failed == 0:
+        print("Next step: run step3_build_index.py to trigger VLM+embedding")
+
+
+if __name__ == "__main__":
+    main()