Binary file added public/images/docs/simulation/search-evals.png
7 changes: 7 additions & 0 deletions src/lib/navigation.ts
@@ -295,6 +295,10 @@ export const tabNavigation: NavTab[] = [
{ title: 'Understanding Evaluation', href: '/docs/evaluation/concepts/understanding-evaluation' },
{ title: 'Eval Types', href: '/docs/evaluation/concepts/eval-types' },
{ title: 'Eval Templates', href: '/docs/evaluation/concepts/eval-templates' },
{ title: 'Output Types', href: '/docs/evaluation/concepts/output-types' },
{ title: 'Data Injection', href: '/docs/evaluation/concepts/data-injection' },
{ title: 'Composite Evals', href: '/docs/evaluation/concepts/composite-evals' },
{ title: 'Versioning', href: '/docs/evaluation/concepts/versioning' },
{ title: 'Judge Models', href: '/docs/evaluation/concepts/judge-models' },
{ title: 'Eval Results', href: '/docs/evaluation/concepts/eval-results' },
]
@@ -305,6 +309,9 @@ export const tabNavigation: NavTab[] = [
{ title: 'Built-in Evals', href: '/docs/evaluation/builtin' },
{ title: 'Evaluate via Platform & SDK', href: '/docs/evaluation/features/evaluate' },
{ title: 'Create Custom Evals', href: '/docs/evaluation/features/custom' },
{ title: 'Test Playground', href: '/docs/evaluation/features/test-playground' },
{ title: 'Ground Truth', href: '/docs/evaluation/features/ground-truth' },
{ title: 'Error Localization', href: '/docs/evaluation/features/error-localization' },
{ title: 'Use Custom Models', href: '/docs/evaluation/features/custom-models' },
{ title: 'Future AGI Models', href: '/docs/evaluation/features/futureagi-models' },
{ title: 'Evaluate CI/CD Pipeline', href: '/docs/evaluation/features/cicd' },
53 changes: 53 additions & 0 deletions src/pages/docs/evaluation/builtin/accuracy.mdx
@@ -0,0 +1,53 @@
---
title: "Accuracy: Built-in Evaluation"
description: "Computes classification accuracy by comparing predicted labels against expected labels. Accepts single values or JSON arrays of labels. Case-insensitive comp..."
---

Computes classification accuracy by comparing predicted labels against expected labels. Accepts single values or JSON arrays of labels. Case-insensitive comparison.

<CodeGroup>

```python Python
result = evaluator.evaluate(
eval_templates="accuracy",
inputs={
"output": "The capital of France is Paris.",
"expected": "Paris"
},
)

print(result.eval_results[0].output)
print(result.eval_results[0].reason)
```

```typescript JS/TS
import { Evaluator } from "@future-agi/ai-evaluation";

const evaluator = new Evaluator();

const result = await evaluator.evaluate(
"accuracy",
{
output: "The capital of France is Paris.",
expected: "Paris"
}
);

console.log(result);
```

</CodeGroup>
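
Because the template also accepts JSON arrays of labels, a whole batch of predictions can be scored in one call. A minimal sketch, assuming the array form mirrors the single-value inputs above (the labels here are hypothetical):

```python
import json

# Hypothetical batch: three predicted labels vs. three expected labels.
predicted = ["cat", "Dog", "bird"]
expected = ["cat", "dog", "fish"]

result = evaluator.evaluate(
    eval_templates="accuracy",
    inputs={
        # JSON-encoded arrays, per "accepts JSON arrays of labels" above.
        "output": json.dumps(predicted),
        "expected": json.dumps(expected),
    },
)

# Comparison is case-insensitive, so "Dog" still matches "dog":
# 2 of 3 labels agree, for a score of roughly 0.67.
print(result.eval_results[0].output)
```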

| **Input** | | | |
| ------ | --------- | ---- | ----------- |
| | **Required Input** | **Type** | **Description** |
| | `output` | `string` | The predicted label, or a JSON array of predicted labels. |
| | `expected` | `string` | The expected label, or a JSON array of expected labels. |

| **Output** | | |
| ------ | ----- | ----------- |
| | **Field** | **Description** |
| | **Result** | Returns a numeric score between 0 and 1, plus a reason explaining the verdict. |
| | **Reason** | A plain-language explanation of the verdict. |

**Tags:** `NLP Metrics`, `Output Validation`
53 changes: 53 additions & 0 deletions src/pages/docs/evaluation/builtin/answer-similarity.mdx
@@ -0,0 +1,53 @@
---
title: "Answer Similarity: Built-in Evaluation"
description: "Evaluates the similarity between the expected and actual responses"
---

Evaluates the similarity between the expected and actual responses.

<CodeGroup>

```python Python
result = evaluator.evaluate(
eval_templates="answer_similarity",
inputs={
"expected_response": "...",
"response": "..."
},
)

print(result.eval_results[0].output)
print(result.eval_results[0].reason)
```

```typescript JS/TS
import { Evaluator } from "@future-agi/ai-evaluation";

const evaluator = new Evaluator();

const result = await evaluator.evaluate(
"answer_similarity",
{
expected_response: "...",
response: "..."
}
);

console.log(result);
```

</CodeGroup>

| **Input** | | | |
| ------ | --------- | ---- | ----------- |
| | **Required Input** | **Type** | **Description** |
| | `expected_response` | `string` | The reference response the output is compared against. |
| | `response` | `string` | The actual response being evaluated. |

| **Output** | | |
| ------ | ----- | ----------- |
| | **Field** | **Description** |
| | **Result** | Returns a numeric score between 0 and 1, plus a reason explaining the verdict. |
| | **Reason** | A plain-language explanation of the verdict. |

**Tags:** `NLP Metrics`, `Output Validation`
50 changes: 50 additions & 0 deletions src/pages/docs/evaluation/builtin/api-call.mdx
@@ -0,0 +1,50 @@
---
title: "Api Call: Built-in Evaluation"
description: "Makes an API call and evaluates the response"
---

Makes an API call and evaluates the response.

<CodeGroup>

```python Python
result = evaluator.evaluate(
eval_templates="api_call",
inputs={
"response": "..."
},
)

print(result.eval_results[0].output)
print(result.eval_results[0].reason)
```

```typescript JS/TS
import { Evaluator } from "@future-agi/ai-evaluation";

const evaluator = new Evaluator();

const result = await evaluator.evaluate(
"api_call",
{
response: "..."
}
);

console.log(result);
```

</CodeGroup>

| **Input** | | | |
| ------ | --------- | ---- | ----------- |
| | **Required Input** | **Type** | **Description** |
| | `response` | `string` | The response to be evaluated by the API call. |

| **Output** | | |
| ------ | ----- | ----------- |
| | **Field** | **Description** |
| | **Result** | Returns `Passed` or `Failed` per row, with a reason explaining the verdict. |
| | **Reason** | A plain-language explanation of the verdict. |

**Tags:** `Code`, `Output Validation`
53 changes: 53 additions & 0 deletions src/pages/docs/evaluation/builtin/balanced-accuracy.mdx
@@ -0,0 +1,53 @@
---
title: "Balanced Accuracy: Built-in Evaluation"
description: "Computes balanced accuracy (average recall per class). Handles imbalanced datasets better than standard accuracy"
---

Computes balanced accuracy (average recall per class). Handles imbalanced datasets better than standard accuracy.

<CodeGroup>

```python Python
result = evaluator.evaluate(
eval_templates="balanced_accuracy",
inputs={
"output": "The capital of France is Paris.",
"expected": "Paris"
},
)

print(result.eval_results[0].output)
print(result.eval_results[0].reason)
```

```typescript JS/TS
import { Evaluator } from "@future-agi/ai-evaluation";

const evaluator = new Evaluator();

const result = await evaluator.evaluate(
"balanced_accuracy",
{
output: "The capital of France is Paris.",
expected: "Paris"
}
);

console.log(result);
```

</CodeGroup>
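
To see why this handles skew better than plain accuracy, here is a plain-Python illustration of the metric itself (average recall per class). It is a conceptual sketch, not the SDK's internal implementation:

```python
from collections import defaultdict

def balanced_accuracy(expected, predicted):
    """Average recall per class: a conceptual sketch, not the SDK internals."""
    hits = defaultdict(int)     # correct predictions per true class
    support = defaultdict(int)  # samples per true class
    for e, p in zip(expected, predicted):
        support[e] += 1
        if e == p:
            hits[e] += 1
    recalls = [hits[c] / support[c] for c in support]
    return sum(recalls) / len(recalls)

# Imbalanced data: four "spam" samples, one "ham" sample.
expected = ["spam", "spam", "spam", "spam", "ham"]
predicted = ["spam", "spam", "spam", "spam", "spam"]

# Plain accuracy is 4/5 = 0.8, but balanced accuracy is
# (recall_spam + recall_ham) / 2 = (1.0 + 0.0) / 2 = 0.5,
# exposing that the minority class is never predicted correctly.
print(balanced_accuracy(expected, predicted))  # 0.5
```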

| **Input** | | | |
| ------ | --------- | ---- | ----------- |
| | **Required Input** | **Type** | **Description** |
| | `output` | `string` | The predicted label(s). |
| | `expected` | `string` | The expected (ground-truth) label(s). |

| **Output** | | |
| ------ | ----- | ----------- |
| | **Field** | **Description** |
| | **Result** | Returns a numeric score between 0 and 1, plus a reason explaining the verdict. |
| | **Reason** | A plain-language explanation of the verdict. |

**Tags:** `NLP Metrics`, `Output Validation`
4 changes: 2 additions & 2 deletions src/pages/docs/evaluation/builtin/bleu.mdx
@@ -42,8 +42,8 @@ console.log(result);
| **Input** | | | |
| ------ | --------- | ---- | ----------- |
| | **Required Input** | **Type** | **Description** |
-| | `reference` | `string` | Model-generated output to be evaluated. |
-| | `hypothesis` | `string` or `List[string]` | One or more reference texts. |
+| | `reference` | `string` | The reference / ground-truth text the output is being compared against. |
+| | `hypothesis` | `string` | The model-generated output being evaluated. |

| **Output** | | |
| ------ | ----- | ----------- |
53 changes: 53 additions & 0 deletions src/pages/docs/evaluation/builtin/character-error-rate.mdx
@@ -0,0 +1,53 @@
---
title: "Character Error Rate: Built-in Evaluation"
description: "Computes Character Error Rate (CER) for ASR/OCR evaluation. CER measures character-level edit distance between reference and hypothesis. Returns 1-CER as sco..."
---

Computes Character Error Rate (CER) for ASR/OCR evaluation. CER measures character-level edit distance between reference and hypothesis. Returns 1-CER as score (higher=better).

<CodeGroup>

```python Python
result = evaluator.evaluate(
eval_templates="character_error_rate",
inputs={
"reference": "The capital of France is Paris.",
"hypothesis": "Paris is the capital of France."
},
)

print(result.eval_results[0].output)
print(result.eval_results[0].reason)
```

```typescript JS/TS
import { Evaluator } from "@future-agi/ai-evaluation";

const evaluator = new Evaluator();

const result = await evaluator.evaluate(
"character_error_rate",
{
reference: "The capital of France is Paris.",
hypothesis: "Paris is the capital of France."
}
);

console.log(result);
```

</CodeGroup>
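
For intuition, CER is the character-level edit (Levenshtein) distance divided by the reference length, and the reported score is 1 - CER. A self-contained sketch of that computation (illustrative only, not the SDK's implementation):

```python
def cer_score(reference: str, hypothesis: str) -> float:
    """Returns 1 - CER, where CER = character edit distance / len(reference)."""
    m, n = len(reference), len(hypothesis)
    # Classic dynamic-programming Levenshtein distance over characters.
    prev = list(range(n + 1))
    for i in range(1, m + 1):
        curr = [i] + [0] * n
        for j in range(1, n + 1):
            cost = 0 if reference[i - 1] == hypothesis[j - 1] else 1
            curr[j] = min(
                prev[j] + 1,         # deletion
                curr[j - 1] + 1,     # insertion
                prev[j - 1] + cost,  # substitution (or match)
            )
        prev = curr
    cer = prev[n] / m if m else 0.0
    return max(0.0, 1.0 - cer)  # clamp, since CER can exceed 1

print(cer_score("recognize speech", "recognise speech"))  # one edit: ~0.94
```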

| **Input** | | | |
| ------ | --------- | ---- | ----------- |
| | **Required Input** | **Type** | **Description** |
| | `reference` | `string` | The reference / ground-truth text the output is being compared against. |
| | `hypothesis` | `string` | The model-generated output being evaluated. |

| **Output** | | |
| ------ | ----- | ----------- |
| | **Field** | **Description** |
| | **Result** | Returns a numeric score between 0 and 1, plus a reason explaining the verdict. |
| | **Reason** | A plain-language explanation of the verdict. |

**Tags:** `NLP Metrics`, `Audio`
53 changes: 53 additions & 0 deletions src/pages/docs/evaluation/builtin/chrf-score.mdx
@@ -0,0 +1,53 @@
---
title: "Chrf Score: Built-in Evaluation"
description: "Computes ChrF score (character n-gram F-score). More robust than BLEU for morphologically rich languages and short texts. Uses character-level n-grams up to ..."
---

Computes ChrF score (character n-gram F-score). More robust than BLEU for morphologically rich languages and short texts. Uses character-level n-grams up to order 6 with recall-weighted F-score.

<CodeGroup>

```python Python
result = evaluator.evaluate(
eval_templates="chrf_score",
inputs={
"reference": "The capital of France is Paris.",
"hypothesis": "Paris is the capital of France."
},
)

print(result.eval_results[0].output)
print(result.eval_results[0].reason)
```

```typescript JS/TS
import { Evaluator } from "@future-agi/ai-evaluation";

const evaluator = new Evaluator();

const result = await evaluator.evaluate(
"chrf_score",
{
reference: "The capital of France is Paris.",
hypothesis: "Paris is the capital of France."
}
);

console.log(result);
```

</CodeGroup>
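
Conceptually, the score averages an F-score over character n-gram orders 1 through 6, with recall weighted more heavily than precision (β = 2 in the common formulation). A simplified sketch of the idea, not the exact scoring the template uses:

```python
from collections import Counter

def char_ngrams(text: str, n: int) -> Counter:
    """Character n-gram counts (whitespace stripped, as chrF typically does)."""
    s = text.replace(" ", "")
    return Counter(s[i:i + n] for i in range(len(s) - n + 1))

def chrf(reference: str, hypothesis: str, max_order: int = 6, beta: float = 2.0) -> float:
    """Simplified chrF: mean F-beta over character n-gram orders 1..max_order."""
    f_scores = []
    for n in range(1, max_order + 1):
        ref, hyp = char_ngrams(reference, n), char_ngrams(hypothesis, n)
        if not ref or not hyp:
            continue
        overlap = sum((ref & hyp).values())  # clipped n-gram matches
        precision = overlap / sum(hyp.values())
        recall = overlap / sum(ref.values())
        if precision + recall == 0:
            f_scores.append(0.0)
            continue
        b2 = beta ** 2  # beta > 1 weights recall more heavily
        f_scores.append((1 + b2) * precision * recall / (b2 * precision + recall))
    return sum(f_scores) / len(f_scores) if f_scores else 0.0

print(chrf("The capital of France is Paris.", "Paris is the capital of France."))
```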

| **Input** | | | |
| ------ | --------- | ---- | ----------- |
| | **Required Input** | **Type** | **Description** |
| | `reference` | `string` | The reference / ground-truth text the output is being compared against. |
| | `hypothesis` | `string` | The model-generated output being evaluated. |

| **Output** | | |
| ------ | ----- | ----------- |
| | **Field** | **Description** |
| | **Result** | Returns a numeric score between 0 and 1, plus a reason explaining the verdict. |
| | **Reason** | A plain-language explanation of the verdict. |

**Tags:** `NLP Metrics`, `Text`