Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -210,4 +210,7 @@ GPT4o_MINI/
.DS_Store
tools/*
tmp_*/*
vlmeval/dataset/SciCode/eval/data/*.h5
vlmeval/dataset/SciCode/eval/data/*.h5

# EarthLink file
variables_embedding.jsonl

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The .gitignore entry variables_embedding.jsonl is added as the last line of the file without a trailing newline, which can cause some git clients or tools to handle the final entry incorrectly. It's good practice to end the file with a newline.

variables_embedding.jsonl

35 changes: 31 additions & 4 deletions agent_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@

from tqdm import tqdm

from scieval.agents import create_agent, get_available_agents
from scieval.agents.records import EvalRecord, TrajectoryStore
from scieval.agents.smolagents import SmolAgentsAgent

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The import from scieval.agents.smolagents import SmolAgentsAgent is removed, but SmolAgentsAgent is still referenced as a default class name in _build_agent_from_config. While the new create_agent factory handles this, it might be cleaner to explicitly import SmolAgentsAgent if it's intended to be a default, or ensure the default is a string that create_agent can resolve without needing a direct import here.

from scieval.dataset import build_dataset
from scieval.smp import dump, get_logger, load, timestr, githash, ls

Expand All @@ -32,11 +32,38 @@ def _build_dataset_from_config(cfg: Dict[str, Any], dataset_name: str):


def _build_agent_from_config(cfg: Dict[str, Any], agent_name: str):
"""
Build an agent from configuration.

Args:
cfg: Configuration dictionary
agent_name: Name of the agent in the config

Returns:
Agent instance

Raises:
ValueError: If agent class is not supported
ImportError: If agent dependencies are not installed
"""
config = copy.deepcopy(cfg[agent_name])
cls_name = config.pop("class", "SmolAgentsAgent")
if cls_name not in ["SmolAgentsAgent", "smolagents"]:
raise ValueError(f"Unsupported agent class: {cls_name}")
return SmolAgentsAgent(**config)

# Handle legacy name mapping
if cls_name == "smolagents":
cls_name = "SmolAgentsAgent"

try:
return create_agent(cls_name, **config)
except ImportError as e:
available = get_available_agents()
available_list = [name for name, avail in available.items() if avail]
raise ImportError(
f"Failed to create agent '{cls_name}'. "
f"Required dependencies may not be installed.\n"
f"Available agents: {', '.join(available_list) if available_list else 'None'}\n"
f"Error: {e}"
Comment on lines +61 to +65

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The error message for ImportError is quite long and includes a list of available agents. While informative, it might be better to keep the initial error message concise and suggest checking documentation or a specific command for dependency installation, rather than listing all available agents directly in the error message. This can make the error message less overwhelming.

) from e


def _run_one_sample(
Expand Down
81 changes: 81 additions & 0 deletions scieval/agents/EarthLink/agent/data_check.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
import json
from .. import config as CFG
from ..tools.data import ALL_DATA_TOOLS
from ..utils.common import extract_code_blocks
from ..utils.agent import Agent


def create_data_check_agent(logger=None):
    """Build the agent that judges whether available data satisfies a plan.

    The agent is given every data-lookup tool in ``ALL_DATA_TOOLS`` and a
    system prompt asking it to reply with a JSON object containing ``pass``
    and ``reason``.

    Args:
        logger: Optional logger forwarded unchanged to ``Agent``.

    Returns:
        The configured ``Agent`` instance named "Data Check Agent".
    """
    # The prompt text is runtime behavior sent to the model; it is kept
    # verbatim (including its original spelling) so model inputs do not change.
    system_prompt = "".join((
        "You are a geoscience experiment agent. ",
        "You understand the user's needs and call related functions to confirm current available CMIP data and observational data information. ",
        "Then judge whether the current available data can meet the user's needs. \n",
        "## Note: \n",
        "1. Do not guess the available data information, you should call the function to obtain the available data information.\n",
        "2. Only judge based on data information obtained from the function, we can not use any external web data.\n",
        "3. You can use the corresponding function tools to find the variable name abbreviations in CMIP, CMIP model names, names of variable that can be derived, avaiable reference datasets, etc.\n",
        "(note that the variable names in the observation datasets have been processed to be the same as those in CMIP). \n",
        "4. If it can or partially can (for example, there are multiple solutions in the plan, and some solutions can be met), the check passes and the reason can be concise.\n",
        "If it cannot (for example, the necessary observation data is missing or all solutions in all plans cannot be met), the check fails and you need to give as detailed a reason as possible.\n",
        "## Ouput format:\n",
        "```json\n{\"pass\": true or false, \"reason\": \"...\"}\n```",
    ))

    return Agent(
        name="Data Check Agent",
        model_settings=CFG.DATA_CHECK_MODEL_SETTING,
        system_prompt=system_prompt,
        tools=ALL_DATA_TOOLS,
        max_agent_iterations=CFG.DATA_CHECK_MAX_AGENT_ITERS,
        logger=logger,
    )


async def chat_data_check_agent(run_info: dict, save_round: int = 0) -> dict:

user_request = run_info["user_request"]
experiment_plan = run_info["experiment_plan"]
logger = run_info['logger']

data_check_agent = create_data_check_agent(logger=logger)

data_check_input = (
f"<user_request>\n\n{user_request}\n\n</user_request>\n\n"
f"<experiment_plan>\n\n{experiment_plan}\n\n</experiment_plan>\n\n"
"Now, please judge whether the available data can meet the user's needs. "
)

max_try = 20
cur_try = 0
while True:
if cur_try >= max_try:
raise RuntimeError(
f"Failed to get valid JSON output from the data check agent after {max_try} tries."
)

cur_try += 1

result = await data_check_agent.chat(data_check_input)

try:
code = extract_code_blocks(result.content, language='json')
if code is not None:
output = json.loads(code)
else:
output = json.loads(result.content)
if ("pass" not in output):
raise ValueError("Output JSON must contain 'pass' and 'reason' keys.")
Comment on lines +68 to +69

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The ValueError message Output JSON must contain 'pass' and 'reason' keys. is slightly misleading. The reason key is only required if pass is false. The current check if ("pass" not in output) covers the first part, but the second if (not output['pass']) and ("reason" not in output) is more specific. Consider rephrasing the first error message to reflect that pass is always required, and reason is conditional.

Suggested change
if ("pass" not in output):
raise ValueError("Output JSON must contain 'pass' and 'reason' keys.")
if "pass" not in output:
raise ValueError("Output JSON must contain 'pass' key.")

if (not output['pass']) and ("reason" not in output):
raise ValueError("If 'pass' is false, output JSON must contain 'reason' key.")
break
except Exception as e:
data_check_input = (
f"Output is not a valid JSON or does not contain the required keys. Error: \n{e}\n"
"Please output in the following format:\n"
"```json\n{\"pass\": true or false, \"reason\": \"...\"}\n```"
)
data_check_agent.save_messages(f"{run_info['root']}/agent_logs/data_check_agent_round_{save_round}.json")

return output
177 changes: 177 additions & 0 deletions scieval/agents/EarthLink/agent/plan.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,177 @@
import os
import asyncio

from .. import config as CFG
from ..tools.web_search import web_search
from ..tools.data import ALL_DATA_TOOLS
from ..utils.agent import Agent

from .data_check import chat_data_check_agent


# Numbered guidance shared by the system prompts of both planning agents
# (it is interpolated by create_plan_agent and create_plan_aggregation_agent).
# The text is sent to the model as-is, so edits here change agent behavior.
PLAN_PROMPT_NOTE = (
"## Note: \n"
"1. Experiments can only use CMIP (Coupled Model Intercomparison Project Phase) datasets and some observation datasets.\n"
"2. The datasets used should be based on the user's request.\n"
"3. It is not mandatory to use CMIP data. If there is no special instruction and the observation data can meet the requirements, the observation data should be used.\n"
"4. If the required observation datasets are not available, you can use CMIP datasets instead. \n"
"5. You can use the web search tool to search for relevant scientific definitions and calculation steps.\n"
"6. You can use corresponding function tools to find the available cmip data, available observation data, CMIP model names, variable name abbreviations in CMIP, names of variable that can be derived, etc. "
"(note that the variable names in the observation datasets have been processed to be the same as those in CMIP). \n"
"7. If the variables required to complete the user task do not exist in CMIP, the corresponding variables should be calculated using existing variables.\n"
"8. If not specified, monthly data is preferred.\n"
"9. The plan should be as detailed and specific as possible, such as the time period, variable name, unit, etc. of the data used. "
"However, it should not include path names, specific parameter configurations (such as color map or line thickness for plotting, etc.), specific execution operations or codes (such as what packages are used for data processing and plotting), etc.\n"
"10. The plan should not include data download, sensitivity experiments, reproducibility, documentation, or future considerations.\n"
"11. The plan cannot contain any preconceived conclusions. \n"
)
# Count of the numbered notes in PLAN_PROMPT_NOTE. Each agent prompt appends
# one extra item numbered PLAN_NOTE_CNT + 1, so this value must be updated by
# hand whenever a note is added to or removed from the text above.
PLAN_NOTE_CNT = 11


# Tool set handed to both planning agents: web search plus every data tool.
PLAN_AGENT_TOOLS = [
web_search
] + ALL_DATA_TOOLS


def create_plan_agent(logger=None):
    """Build the agent that drafts one experimental plan for a user request.

    The system prompt is the role description, followed by the shared
    ``PLAN_PROMPT_NOTE`` list and one final numbered instruction.

    Args:
        logger: Optional logger forwarded unchanged to ``Agent``.

    Returns:
        The configured ``Agent`` instance named "Plan Agent".
    """
    # The closing instruction continues the numbering of PLAN_PROMPT_NOTE.
    closing_item_number = PLAN_NOTE_CNT + 1
    system_prompt = (
        "You are a geoscience experiment agent who is good at experiment planning. "
        "You understand the user's needs and output the corresponding experimental plan. "
        "Your plan should include what data to use, what preprocessing needs to be done on the data, what calculations to perform, what kind of figures to draw, etc. \n"
        f"{PLAN_PROMPT_NOTE}"
        f"{closing_item_number}. Output the plan directly, don't output anything else."
    )

    return Agent(
        name="Plan Agent",
        model_settings=CFG.PLAN_MODEL_SETTING,
        system_prompt=system_prompt,
        tools=PLAN_AGENT_TOOLS,
        max_agent_iterations=CFG.PLAN_MAX_AGENT_ITERS,
        logger=logger,
        verbose=False,
    )


def create_plan_aggregation_agent(logger=None):
    """Build the agent that reviews candidate plans and emits an improved one.

    It shares the tool set and the ``PLAN_PROMPT_NOTE`` guidance with the plan
    agent, but uses its own model settings and role description.

    Args:
        logger: Optional logger forwarded unchanged to ``Agent``.

    Returns:
        The configured ``Agent`` instance named "Plan Aggregation Agent".
    """
    # The closing instruction continues the numbering of PLAN_PROMPT_NOTE.
    closing_item_number = PLAN_NOTE_CNT + 1
    system_prompt = (
        "You are a geoscience experiment agent who is good at checking and making experimental plans. "
        "You understand the user's needs and check the rationality and feasibility of the user's experimental plans and provide an improved plan. \n"
        f"{PLAN_PROMPT_NOTE}"
        f"{closing_item_number}. Directly output your improved complete experimental plan, don't output anything else."
    )

    return Agent(
        name="Plan Aggregation Agent",
        model_settings=CFG.PLAN_AGGREGATION_MODEL_SETTING,
        system_prompt=system_prompt,
        tools=PLAN_AGENT_TOOLS,
        max_agent_iterations=CFG.PLAN_MAX_AGENT_ITERS,
        logger=logger,
    )


async def _single_plan(run_info, user_request, idx, plan_templates):

plan_input = (
f"<user_request>\n{user_request}\n</user_request>\n\n"
)
if plan_templates is not None:
plan_input = (
f"<some_reference_plans>\n"
f"{plan_templates}\n\n"
f"</some_reference_plans>\n\n"
) + plan_input + (
f"The above content starts with some reference experimental plans for other possible similar tasks, "
"followed by the current user's request. \n"
f"Please provide an experimental plan according to the user's request.\n"
)
else:
plan_input += (
f"The above is the user's request. \n"
f"Please provide an experimental plan according to the user's request.\n"
)


plan_agent = create_plan_agent()

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The create_plan_agent() call here does not pass the logger from run_info. This means the _single_plan function's agent will not log to the main run logger, potentially making debugging harder for individual plan generation steps.

Suggested change
plan_agent = create_plan_agent()
plan_agent = create_plan_agent(logger=run_info['logger'])


result = await plan_agent.chat(plan_input)
cur_plan = result.content

with open(f"{run_info['root']}/experiment_plans/plan_{idx}.md", "w", encoding='utf-8') as f:
f.write(cur_plan)

plan_agent.save_messages(f"{run_info['root']}/agent_logs/plan_agent_{idx}.json")

return cur_plan


async def chat_plan_agent(run_info: dict):
    """Generate candidate plans, aggregate them, and validate data availability.

    Steps:
      1. Launch ``CFG.MAX_PLANS`` independent ``_single_plan`` coroutines
         concurrently.
      2. Feed all candidate plans to the plan aggregation agent.
      3. Run the data check agent on the aggregated plan; when the check
         fails, send its reason back to the aggregation agent and retry, for
         at most ``CFG.MAX_PLAN_DEBUG_ROUND`` rounds.
      4. Write the accepted plan to ``<root>/experiment_plans/final_plan.md``.

    Args:
        run_info: Run state. Reads ``"user_request"``, ``"logger"`` and
            ``"root"``; ``"experiment_plan"`` is overwritten on every round.

    Returns:
        The same ``run_info`` dict, with ``"experiment_plan"`` holding the
        accepted plan text.

    Raises:
        RuntimeError: If any candidate plan fails to generate (the original
            exception is attached as ``__cause__``), or if the data check
            still fails after the maximum number of rounds.
    """
    user_request = run_info["user_request"]
    logger = run_info['logger']
    root = run_info['root']

    os.makedirs(f"{root}/experiment_plans", exist_ok=True)

    plan_templates = None
    tasks = [
        _single_plan(run_info, user_request, i, plan_templates)
        for i in range(CFG.MAX_PLANS)
    ]

    try:
        plan_list = await asyncio.gather(*tasks, return_exceptions=False)
    except Exception as e:
        # Chain the cause so the failing plan's traceback is not lost.
        raise RuntimeError(f"Error in plan generation: {e}") from e

    assert len(plan_list) == CFG.MAX_PLANS

    # Wrap every candidate plan in begin/end markers and concatenate them.
    experiment_plan_strings = "".join(
        f"\n\n{'-'*5} Begin of plan {i} {'-'*5}\n\n"
        f"{plan}"
        f"\n\n{'-'*5} End of plan {i} {'-'*5}\n\n"
        for i, plan in enumerate(plan_list)
    )

    plan_aggregation_input = (
        f"<user_request>\n\n{user_request}\n\n</user_request>\n\n"
        f"<experimental_plans>\n\n"f"{experiment_plan_strings}\n\n</experimental_plans>\n\n"
        "The above are the user's request and some experimental plans.\n"
        "Now, please provide an improved experimental plan according to the user's request."
    )

    plan_aggregation_agent = create_plan_aggregation_agent(logger=logger)

    # Round indices are zero-based and are also used in the saved log names.
    for round_idx in range(CFG.MAX_PLAN_DEBUG_ROUND):
        result = await plan_aggregation_agent.chat(plan_aggregation_input)

        plan_aggregation_agent.save_messages(f"{root}/agent_logs/plan_aggregation_agent_round_{round_idx}.json")

        run_info["experiment_plan"] = result.content
        check_result = await chat_data_check_agent(run_info, save_round=round_idx)
        if check_result['pass']:
            break

        # Feed the failure reason back so the next round can repair the plan.
        plan_aggregation_input = (
            f"{check_result['reason']}\n\n"
            "The data availability check failed due to the above reasons. "
            "You can use corresponding function tools to confirm the data information. "
            "Please provide a modified experimental plan."
        )
    else:
        # Every allowed round ran (or none were allowed) without a passing check.
        raise RuntimeError(f"Data availablility check failed after maximum plan debug rounds ({CFG.MAX_PLAN_DEBUG_ROUND}).")

    with open(f"{root}/experiment_plans/final_plan.md", "w", encoding='utf-8') as f:
        f.write(result.content)

    return run_info
24 changes: 24 additions & 0 deletions scieval/agents/EarthLink/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Module-level configuration for the EarthLink agents. Other modules read
# these values via ``from .. import config as CFG``.

# Credentials / endpoint for the main model. None by default.
# NOTE(review): presumably populated at runtime before agents are built — confirm.
API_KEY = None
BASE_URL = None

# Embedding model name and its credentials / endpoint.
# NOTE(review): consumers are not visible here — confirm where these are read.
EMBEDDING_MODEL = "Qwen/Qwen3-Embedding-8B"
EMBEDDING_API_KEY = None
EMBEDDING_BASE_URL = None

# NOTE(review): presumably the key for the web search tool — confirm.
TAVILY_API_KEY = None

# Default model name and provider.
DEFAULT_MODEL = "gpt-5"
DEFAULT_MODEL_PROVIDER = "openai"


# Number of candidate plans chat_plan_agent generates concurrently.
MAX_PLANS = 3
# NOTE(review): not referenced by the visible plan code — confirm usage.
PLAN_MODEL = None
# Passed as ``model_settings`` to the Plan Agent.
PLAN_MODEL_SETTING = {}
# ``max_agent_iterations`` for both the Plan Agent and the Plan Aggregation Agent.
PLAN_MAX_AGENT_ITERS = 60

# Passed as ``model_settings`` to the Plan Aggregation Agent.
PLAN_AGGREGATION_MODEL_SETTING = {}

# ``model_settings`` and ``max_agent_iterations`` for the Data Check Agent.
DATA_CHECK_MODEL_SETTING = {"reasoning_effort": "low"}
DATA_CHECK_MAX_AGENT_ITERS = 60

# Maximum aggregate-then-data-check rounds in chat_plan_agent before it raises.
MAX_PLAN_DEBUG_ROUND = 2
Loading