Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions evalmonkey/evals/asset_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@

import json
import os
import re
import textwrap
from dataclasses import dataclass, field
from datetime import datetime
Expand Down Expand Up @@ -139,6 +140,12 @@ def generate_improvement_evals(self, n: int = 5) -> List[dict]:
response_format={"type": "json_object"},
)
content = response.choices[0].message.content
# Strip markdown code fences — some providers (Anthropic)
# wrap JSON in ```json ... ``` even with response_format
if content and "```" in content:
match = re.search(r"```(?:json)?\s*\n?(.*?)\n?\s*```", content, re.DOTALL)
if match:
content = match.group(1).strip()
# LLM sometimes wraps the array in {"evals": [...]}
parsed = json.loads(content)
if isinstance(parsed, list):
Expand Down
16 changes: 16 additions & 0 deletions evalmonkey/evals/runner.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,22 @@
import os
import json
import re
from evalmonkey.utils.llm import call_llm

def _strip_code_fences(text: str) -> str:
"""Strip markdown code fences from LLM output.

Some providers (notably Anthropic via litellm) wrap JSON responses in
```json ... ``` code blocks even when response_format=json_object is
requested. This causes json.loads() to fail with a parse error.
"""
if text and "```" in text:
match = re.search(r"```(?:json)?\s*\n?(.*?)\n?\s*```", text, re.DOTALL)
if match:
return match.group(1).strip()
return text


class LLMJudgeProvider:
"""
LLMJudgeProvider uses litellm to abstract all common backend API LLM providers.
Expand Down Expand Up @@ -32,6 +47,7 @@ def score_run(self, rubric: str, agent_output: str) -> dict:
response_format={"type": "json_object"}
)
content = response.choices[0].message.content
content = _strip_code_fences(content)
return json.loads(content)
except Exception as e:
# Fallback if there's a JSON parse error or API issue
Expand Down
Loading