Corbell-AI · himmi-01 · May 2, 2026 · May 1, 2026
diff --git a/evalmonkey/evals/asset_generator.py b/evalmonkey/evals/asset_generator.py
@@ -23,6 +23,7 @@
 
 import json
 import os
+import re
 import textwrap
 from dataclasses import dataclass, field
 from datetime import datetime
@@ -139,6 +140,12 @@ def generate_improvement_evals(self, n: int = 5) -> List[dict]:
                 response_format={"type": "json_object"},
             )
             content = response.choices[0].message.content
+            # Strip markdown code fences — some providers (Anthropic)
+            # wrap JSON in ```json ... ``` even with response_format
+            if content and "```" in content:
+                match = re.search(r"```(?:json)?\s*\n?(.*?)\n?\s*```", content, re.DOTALL)
+                if match:
+                    content = match.group(1).strip()
             # LLM sometimes wraps the array in {"evals": [...]}
             parsed = json.loads(content)
             if isinstance(parsed, list):

diff --git a/evalmonkey/evals/runner.py b/evalmonkey/evals/runner.py
@@ -1,7 +1,22 @@
 import os
 import json
+import re
 from evalmonkey.utils.llm import call_llm
 
+def _strip_code_fences(text: str) -> str:
+    """Return *text* with a wrapping markdown code fence removed.
+
+    Some providers (notably Anthropic via litellm) wrap JSON in ```json ... ```
+    even when response_format=json_object is requested, breaking json.loads().
+    Text that already starts with "{" or "[" is returned untouched, because
+    backticks there belong to a JSON string value, not to a wrapper.
+    """
+    if not text or "```" not in text or text.lstrip()[:1] in ("{", "["):
+        return text
+    match = re.search(r"```(?:json)?\s*\n?(.*?)\n?\s*```", text, re.DOTALL | re.IGNORECASE)
+    return match.group(1).strip() if match else text
+
+
 class LLMJudgeProvider:
     """
     LLMJudgeProvider uses litellm to abstract all common backend API LLM providers.
@@ -32,6 +47,7 @@ def score_run(self, rubric: str, agent_output: str) -> dict:
                 response_format={"type": "json_object"}
             )
             content = response.choices[0].message.content
+            content = _strip_code_fences(content)
             return json.loads(content)
         except Exception as e:
             # Fallback if there's a JSON parse error or API issue