AdityaVG13 · AdityaVG13 · Apr 7, 2026 · Apr 6, 2026 · Apr 7, 2026 · Apr 7, 2026
diff --git a/benchmark/baseline-v041-benchmark.json b/benchmark/baseline-v041-benchmark.json
diff --git a/benchmark/baseline-v041-metrics.json b/benchmark/baseline-v041-metrics.json
@@ -0,0 +1,212 @@
+{
+  "aggregate": {
+    "avg_recall": 0.101,
+    "avg_precision": 0.597,
+    "avg_f1": 0.153,
+    "macro_recall": 0.083,
+    "total_relevant_in_db": 1842,
+    "total_found": 152
+  },
+  "results": [
+    {
+      "query": "token optimization settings",
+      "db_relevant": 41,
+      "found": 13,
+      "returned": 10,
+      "returned_relevant": 4,
+      "recall": 0.317,
+      "precision": 0.4,
+      "f1": 0.354
+    },
+    {
+      "query": "cache expiry guard hook",
+      "db_relevant": 20,
+      "found": 1,
+      "returned": 1,
+      "returned_relevant": 1,
+      "recall": 0.05,
+      "precision": 1.0,
+      "f1": 0.095
+    },
+    {
+      "query": "RTK path fix bashrc",
+      "db_relevant": 84,
+      "found": 11,
+      "returned": 6,
+      "returned_relevant": 1,
+      "recall": 0.131,
+      "precision": 0.167,
+      "f1": 0.147
+    },
+    {
+      "query": "CCMeter analytics dashboard",
+      "db_relevant": 132,
+      "found": 15,
+      "returned": 9,
+      "returned_relevant": 6,
+      "recall": 0.114,
+      "precision": 0.667,
+      "f1": 0.194
+    },
+    {
+      "query": "browser cleanup playwright",
+      "db_relevant": 33,
+      "found": 6,
+      "returned": 6,
+      "returned_relevant": 6,
+      "recall": 0.182,
+      "precision": 1.0,
+      "f1": 0.308
+    },
+    {
+      "query": "uv python package management",
+      "db_relevant": 65,
+      "found": 15,
+      "returned": 7,
+      "returned_relevant": 4,
+      "recall": 0.231,
+      "precision": 0.571,
+      "f1": 0.329
+    },
+    {
+      "query": "never use em-dashes",
+      "db_relevant": 187,
+      "found": 6,
+      "returned": 7,
+      "returned_relevant": 1,
+      "recall": 0.032,
+      "precision": 0.143,
+      "f1": 0.052
+    },
+    {
+      "query": "cortex recall before investigation",
+      "db_relevant": 107,
+      "found": 12,
+      "returned": 4,
+      "returned_relevant": 2,
+      "recall": 0.112,
+      "precision": 0.5,
+      "f1": 0.183
+    },
+    {
+      "query": "codex agent contributions",
+      "db_relevant": 181,
+      "found": 10,
+      "returned": 5,
+      "returned_relevant": 2,
+      "recall": 0.055,
+      "precision": 0.4,
+      "f1": 0.097
+    },
+    {
+      "query": "gemini agent decisions",
+      "db_relevant": 173,
+      "found": 5,
+      "returned": 2,
+      "returned_relevant": 2,
+      "recall": 0.029,
+      "precision": 1.0,
+      "f1": 0.056
+    },
+    {
+      "query": "factory droid builds",
+      "db_relevant": 129,
+      "found": 15,
+      "returned": 6,
+      "returned_relevant": 3,
+      "recall": 0.116,
+      "precision": 0.5,
+      "f1": 0.189
+    },
+    {
+      "query": "multi-agent shared state",
+      "db_relevant": 159,
+      "found": 3,
+      "returned": 2,
+      "returned_relevant": 0,
+      "recall": 0.019,
+      "precision": 0.0,
+      "f1": 0.0
+    },
+    {
+      "query": "boot compiler capsule system",
+      "db_relevant": 45,
+      "found": 5,
+      "returned": 2,
+      "returned_relevant": 1,
+      "recall": 0.111,
+      "precision": 0.5,
+      "f1": 0.182
+    },
+    {
+      "query": "conflict detection jaccard cosine",
+      "db_relevant": 29,
+      "found": 3,
+      "returned": 4,
+      "returned_relevant": 1,
+      "recall": 0.103,
+      "precision": 0.25,
+      "f1": 0.146
+    },
+    {
+      "query": "embedding engine MiniLM",
+      "db_relevant": 38,
+      "found": 1,
+      "returned": 2,
+      "returned_relevant": 1,
+      "recall": 0.026,
+      "precision": 0.5,
+      "f1": 0.05
+    },
+    {
+      "query": "crystal cluster formation",
+      "db_relevant": 111,
+      "found": 10,
+      "returned": 3,
+      "returned_relevant": 1,
+      "recall": 0.09,
+      "precision": 0.333,
+      "f1": 0.142
+    },
+    {
+      "query": "user writing voice style",
+      "db_relevant": 40,
+      "found": 2,
+      "returned": 2,
+      "returned_relevant": 2,
+      "recall": 0.05,
+      "precision": 1.0,
+      "f1": 0.095
+    },
+    {
+      "query": "self improvement engine goals",
+      "db_relevant": 86,
+      "found": 4,
+      "returned": 3,
+      "returned_relevant": 3,
+      "recall": 0.047,
+      "precision": 1.0,
+      "f1": 0.089
+    },
+    {
+      "query": "tauri dashboard control center",
+      "db_relevant": 61,
+      "found": 10,
+      "returned": 6,
+      "returned_relevant": 6,
+      "recall": 0.164,
+      "precision": 1.0,
+      "f1": 0.282
+    },
+    {
+      "query": "job applicator skill",
+      "db_relevant": 121,
+      "found": 5,
+      "returned": 3,
+      "returned_relevant": 3,
+      "recall": 0.041,
+      "precision": 1.0,
+      "f1": 0.079
+    }
+  ]
+}
diff --git a/benchmark/baseline-v041.md b/benchmark/baseline-v041.md
@@ -0,0 +1,51 @@
+# Cortex v0.4.1 Recall Baseline
+**Date:** 2026-04-06T18:39:53
+**Nodes:** 544 (271 memories, 273 decisions)
+**Embeddings:** MiniLM-L6 384-dim, has_embeddings=false (health embeddings=562, status=available)
+
+## Aggregate Metrics
+| Metric | Value | Source |
+|--------|-------|--------|
+| Ground Truth Precision | 0.552 | benchmark-v2 |
+| Keyword Precision | 0.335 | benchmark-v2 |
+| MRR | 0.692 | benchmark-v2 |
+| Hit Rate | 0.900 | benchmark-v2 |
+| Avg Latency (ms) | 97.5 | benchmark-v2 |
+| Avg Recall | 0.101 | metric |
+| Avg Precision | 0.597 | metric |
+| Avg F1 | 0.153 | metric |
+| Macro Recall (total found / total relevant in DB) | 0.083 | metric |
+| Total Relevant in DB | 1842 | metric |
+
+## By Category (benchmark-v2)
+| Category | Queries | GT Precision | MRR | Avg ms | Avg Tokens |
+|----------|---------|-------------|-----|--------|------------|
+| project_decisions | 4 | 0.400 | 0.583 | 104.3 | 324.2 |
+| feedback_rules | 4 | 0.595 | 0.750 | 87.2 | 249.8 |
+| cross_agent | 4 | 0.475 | 0.750 | 98.5 | 136.0 |
+| architecture | 4 | 0.291 | 0.375 | 99.9 | 134.5 |
+| user_context | 4 | 1.000 | 1.000 | 97.8 | 177.8 |
+
+## Worst Queries (GT precision < 0.40)
+| Query | GT Precision | MRR | Category | Failure Mode |
+|-------|-------------|-----|----------|-------------|
+| cache expiry guard hook | 0.333 | 1.000 | project_decisions | GIGO |
+| RTK path fix bashrc | 0.200 | 0.500 | project_decisions | RANKING |
+| never use em-dashes | 0.143 | 0.500 | feedback_rules | RANKING |
+| multi-agent shared state | 0.000 | 0.000 | cross_agent | SPARSE |
+| conflict detection jaccard cosine | 0.333 | 0.500 | architecture | RANKING |
+| embedding engine MiniLM | 0.000 | 0.000 | architecture | SPARSE |
+| crystal cluster formation | 0.333 | 0.500 | architecture | RANKING |
+
+## Delta from 2026-04-05 Run
+| Metric | 2026-04-05 | Today | Delta |
+|--------|-----------|-------|-------|
+| GT Precision | 0.587 | 0.552 | -0.035 |
+| MRR | 0.742 | 0.692 | -0.050 |
+| Avg Latency | 105.5ms | 97.5ms | -8.0ms |
+
+## Notes
+- Health reported embedding_status=available and 562 embeddings, but benchmark-v2 recorded has_embeddings=false because the /embed probe did not return a vector. Recall still ran through the current /recall endpoint.
+- GT precision dropped by 0.035 (-6.0% relative) and MRR dropped by 0.050 (-6.7% relative) from the 2026-04-05 run, while average latency improved by 8.0ms (-7.6% relative).
+- At measurement time the store held 544 active memory/decision nodes, more than the node count given in the prompt context; this growth likely contributes to the changed precision and recall totals.
+- `cache expiry guard hook` is labeled GIGO even though the top two results were relevant; the precision loss comes from irrelevant tail results after the relevant hits.