future-agi · SuhaniNagpal7 · May 8, 2026
diff --git a/src/lib/navigation.ts b/src/lib/navigation.ts
@@ -824,7 +824,7 @@ export const tabNavigation: NavTab[] = [
             items: [
               { title: 'End-to-End with Falcon AI: Trace → Debug → Evaluate → Dataset → Fix in One Workflow', href: '/docs/cookbook/falcon-ai/end-to-end' },
               { title: 'Context-Aware Trace Debugging with Falcon AI', href: '/docs/cookbook/falcon-ai/context-aware-debugging' },
-              { title: 'Building Golden Datasets from Production Traces with Falcon AI', href: '/docs/cookbook/falcon-ai/eval-datasets-from-traces' },
+              { title: 'Building Golden Datasets from Production Traces with Falcon AI', href: '/docs/cookbook/falcon-ai/golden-datasets-from-traces' },
             ]
           },
           {

diff --git a/src/pages/docs/cookbook/falcon-ai/context-aware-debugging.mdx b/src/pages/docs/cookbook/falcon-ai/context-aware-debugging.mdx
@@ -157,7 +157,7 @@ You went from a failing trace to a verified prompt fix in three Falcon AI turns.
   <Card title="End-to-End with Falcon AI" icon="sparkles" href="/docs/cookbook/falcon-ai/end-to-end">
     The full lifecycle: trace, debug, evaluate, dataset, fix in one workflow
   </Card>
-  <Card title="Building Golden Datasets from Production Traces" icon="database" href="/docs/cookbook/falcon-ai/eval-datasets-from-traces">
+  <Card title="Building Golden Datasets from Production Traces" icon="database" href="/docs/cookbook/falcon-ai/golden-datasets-from-traces">
     Once you've fixed one trace, lock the failure pattern in as a regression dataset
   </Card>
   <Card title="Error Feed" icon="bug" href="/docs/error-feed">

diff --git a/src/pages/docs/cookbook/falcon-ai/end-to-end.mdx b/src/pages/docs/cookbook/falcon-ai/end-to-end.mdx
@@ -188,7 +188,7 @@ You went from a noisy traced project to a fixed agent and a reusable regression
   <Card title="Context-Aware Trace Debugging" icon="zap" href="/docs/cookbook/falcon-ai/context-aware-debugging">
     From a single bad trace to a paste-ready prompt fix in minutes
   </Card>
-  <Card title="Building Golden Datasets from Production Traces" icon="database" href="/docs/cookbook/falcon-ai/eval-datasets-from-traces">
+  <Card title="Building Golden Datasets from Production Traces" icon="database" href="/docs/cookbook/falcon-ai/golden-datasets-from-traces">
     Curate balanced golden datasets from real traces with `/build-dataset`
   </Card>
   <Card title="Falcon AI Skills" icon="bolt" href="/docs/falcon-ai/features/skills">

diff --git a/src/pages/docs/cookbook/falcon-ai/eval-datasets-from-traces.mdx b/src/pages/docs/cookbook/falcon-ai/golden-datasets-from-traces.mdx
@@ -4,8 +4,8 @@ description: "Turn production traces into a curated, ground-truthed golden datas
 ---
 
 <div style={{display: "flex", gap: "8px", flexWrap: "wrap", margin: "0.5rem 0 1rem"}}>
-<a href="https://colab.research.google.com/github/future-agi/cookbooks/blob/cookbook/falcon-ai-page/falcon-ai/eval-datasets-from-traces.ipynb" target="_blank" style={{display: "inline-flex"}}><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open in Colab" style={{height: "28px"}} /></a>
-<a href="https://github.com/future-agi/cookbooks/blob/cookbook/falcon-ai-page/falcon-ai/eval-datasets-from-traces.ipynb" target="_blank" style={{display: "inline-flex"}}><img src="https://img.shields.io/badge/View_on_GitHub-181717?logo=github&logoColor=white" alt="GitHub" style={{height: "28px"}} /></a>
+<a href="https://colab.research.google.com/github/future-agi/cookbooks/blob/cookbook/falcon-ai-page/falcon-ai/golden-datasets-from-traces.ipynb" target="_blank" style={{display: "inline-flex"}}><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open in Colab" style={{height: "28px"}} /></a>
+<a href="https://github.com/future-agi/cookbooks/blob/cookbook/falcon-ai-page/falcon-ai/golden-datasets-from-traces.ipynb" target="_blank" style={{display: "inline-flex"}}><img src="https://img.shields.io/badge/View_on_GitHub-181717?logo=github&logoColor=white" alt="GitHub" style={{height: "28px"}} /></a>
 </div>
 
 | Time | Difficulty | Package |
@@ -109,7 +109,7 @@ Open the Falcon AI sidebar on the project. The context chip should show the proj
 
 Falcon AI returns a category histogram and flags traces where the category looks off given the email content (your wording and counts will vary).
 
-<img src="https://fi-cookbook-assets.s3.ap-south-1.amazonaws.com/falcon-ai/eval-datasets-from-traces/step-4-explore-failures.png" alt="Falcon AI sidebar showing the per-category distribution and flagged misclassifications for the email-triage-prod project" style={{width: "100%", borderRadius: "0.75rem", border: "1px solid var(--color-border-default)"}} />
+<img src="https://fi-cookbook-assets.s3.ap-south-1.amazonaws.com/falcon-ai/golden-datasets-from-traces/step-4-explore-failures.png" alt="Falcon AI sidebar showing the per-category distribution and flagged misclassifications for the email-triage-prod project" style={{width: "100%", borderRadius: "0.75rem", border: "1px solid var(--color-border-default)"}} />
 
 These flagged misclassifications are a strong starting point, not ground truth. You'll confirm them in a later step.
 
@@ -127,7 +127,7 @@ These flagged misclassifications are a strong starting point, not ground truth.
 
 Falcon AI orchestrates the underlying dataset tools (such as `create_dataset`, `add_columns`, `add_dataset_rows`) against the traces in context and returns a completion card with a link to the new dataset.
 
-<img src="https://fi-cookbook-assets.s3.ap-south-1.amazonaws.com/falcon-ai/eval-datasets-from-traces/step-5-build-dataset.png" alt="Falcon AI completion card for the email-triage-eval-v1 dataset showing per-category coverage and the flagged misclassifications that were included" style={{width: "100%", borderRadius: "0.75rem", border: "1px solid var(--color-border-default)"}} />
+<img src="https://fi-cookbook-assets.s3.ap-south-1.amazonaws.com/falcon-ai/golden-datasets-from-traces/step-5-build-dataset.png" alt="Falcon AI completion card for the email-triage-eval-v1 dataset showing per-category coverage and the flagged misclassifications that were included" style={{width: "100%", borderRadius: "0.75rem", border: "1px solid var(--color-border-default)"}} />
 
 A dataset that is 90% successes won't catch regressions; one that is 90% failures won't catch false positives. The "at least 2 from each category plus the misclassifications" rule gives both classes meaningful coverage.
 
@@ -140,7 +140,7 @@ A dataset that is 90% successes won't catch regressions; one that is 90% failure
 
 Falcon AI populates both columns per row. Expect a split between confident `expected_category` values and a few rows tagged `NEEDS_REVIEW`.
 
-<img src="https://fi-cookbook-assets.s3.ap-south-1.amazonaws.com/falcon-ai/eval-datasets-from-traces/step-6-ground-truth-column.png" alt="Falcon AI per-row preview of the new expected_category and review_note columns with NEEDS_REVIEW flags on the genuinely ambiguous rows" style={{width: "100%", borderRadius: "0.75rem", border: "1px solid var(--color-border-default)"}} />
+<img src="https://fi-cookbook-assets.s3.ap-south-1.amazonaws.com/falcon-ai/golden-datasets-from-traces/step-6-ground-truth-column.png" alt="Falcon AI per-row preview of the new expected_category and review_note columns with NEEDS_REVIEW flags on the genuinely ambiguous rows" style={{width: "100%", borderRadius: "0.75rem", border: "1px solid var(--color-border-default)"}} />
 
 Open the dataset in **Datasets → email-triage-eval-v1**, click each `NEEDS_REVIEW` row, and decide based on your team's routing rules. Edit the rows in the UI or ask Falcon AI to update them.
 
@@ -151,7 +151,7 @@ Open the dataset in **Datasets → email-triage-eval-v1**, click each `NEEDS_REV
 
 > Run an evaluation on `email-triage-eval-v1` that checks whether `predicted_category` exactly matches `expected_category` for each row. Use the eval template from this workspace that best fits a string-equality check between two columns.
 
-<img src="https://fi-cookbook-assets.s3.ap-south-1.amazonaws.com/falcon-ai/eval-datasets-from-traces/step-7-run-evaluations.png" alt="Falcon AI eval run output showing the per-row predicted vs expected category and pass/fail/skip verdict for email-triage-eval-v1" style={{width: "100%", borderRadius: "0.75rem", border: "1px solid var(--color-border-default)"}} />
+<img src="https://fi-cookbook-assets.s3.ap-south-1.amazonaws.com/falcon-ai/golden-datasets-from-traces/step-7-run-evaluations.png" alt="Falcon AI eval run output showing the per-row predicted vs expected category and pass/fail/skip verdict for email-triage-eval-v1" style={{width: "100%", borderRadius: "0.75rem", border: "1px solid var(--color-border-default)"}} />
 
 Both the pass pattern and the fail pattern are what you want. A regression test where every row passes is not testing anything; one where every row fails is just noisy. The dataset now has compounding value: any future prompt change can be re-scored against it in one chat message.