2 changes: 1 addition & 1 deletion README.ipynb
@@ -375,7 +375,7 @@
"for benchmark in benchmarks[:2]:\n",
" print(\"--------------------------------\")\n",
" print(f\"Bot: {benchmark.name}\")\n",
" print(f\"Score: {benchmark.average_inverse_expected_log_score}\") # Lower is better\n",
" print(f\"Score: {benchmark.average_expected_baseline_score}\") # Lower is better\n",
" print(f\"Num reports in benchmark: {len(benchmark.forecast_reports)}\")\n",
" print(f\"Time: {benchmark.time_taken_in_minutes}min\")\n",
" print(f\"Cost: ${benchmark.total_cost}\")"
2 changes: 1 addition & 1 deletion code_tests/low_cost_or_live_api_tests/test_benchmarker.py
@@ -110,5 +110,5 @@ def assert_all_benchmark_object_fields_are_not_none(
len(benchmark.forecast_reports) == num_questions
), "Forecast reports is not set"
assert (
benchmark.average_inverse_expected_log_score > 0
benchmark.average_expected_baseline_score > 0
), "Average inverse expected log score is not set"
14 changes: 7 additions & 7 deletions code_tests/unit_tests/test_forecasting/test_forecast_bot.py
@@ -9,7 +9,7 @@
from forecasting_tools.data_models.forecast_report import ReasonedPrediction
from forecasting_tools.data_models.questions import BinaryQuestion
from forecasting_tools.forecast_bots.bot_lists import (
get_all_official_bot_classes,
get_all_important_bot_classes,
)
from forecasting_tools.forecast_bots.forecast_bot import (
ForecastBot,
@@ -41,10 +41,10 @@ async def mock_research(*args, **kwargs):
)
assert len(results) == 2
assert isinstance(results[0], ForecastReport)
assert isinstance(results[1], RuntimeError)
assert isinstance(results[1], Exception)
assert "Test error" in str(results[1])

with pytest.raises(RuntimeError, match="Test error"):
with pytest.raises(Exception):
await bot.forecast_questions(test_questions, return_exceptions=False)


@@ -58,10 +58,10 @@ async def mock_research(*args, **kwargs):
bot.run_research = mock_research

result = await bot.forecast_question(test_question, return_exceptions=True)
assert isinstance(result, RuntimeError)
assert isinstance(result, Exception)
assert "Test error" in str(result)

with pytest.raises(RuntimeError, match="Test error"):
with pytest.raises(Exception):
await bot.forecast_question(test_question, return_exceptions=False)


@@ -114,7 +114,7 @@ async def mock_forecast(*args, **kwargs):

bot._run_forecast_on_binary = mock_forecast

with pytest.raises(RuntimeError):
with pytest.raises(Exception):
await bot.forecast_question(test_question)


@@ -219,7 +219,7 @@ async def count_research(*args, **kwargs):
await bot.forecast_question(forecasted_question)


@pytest.mark.parametrize("bot", get_all_official_bot_classes())
@pytest.mark.parametrize("bot", get_all_important_bot_classes())
def test_bot_has_config(bot: type[ForecastBot]):
probable_minimum_number_of_bot_params = 3
bot_config = bot().get_config()
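The exception-handling tests above pin down the contract that forecast_questions(..., return_exceptions=True) returns per-question failures as Exception instances mixed in with the successful reports, while return_exceptions=False lets the error propagate. A minimal sketch of how calling code might split the two; the run_safely helper and the bare TemplateBot() setup are illustrative, not taken from this diff:

# Sketch only: assumes the forecast_questions(questions, return_exceptions=...)
# call exercised in the tests above; bot construction here is illustrative.
from forecasting_tools import TemplateBot

async def run_safely(questions):
    bot = TemplateBot()
    results = await bot.forecast_questions(questions, return_exceptions=True)
    reports = [r for r in results if not isinstance(r, Exception)]
    failures = [r for r in results if isinstance(r, Exception)]
    for error in failures:
        print(f"Question failed: {error}")
    return reports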
(additional file in this diff; file name not shown in the capture)
@@ -50,14 +50,16 @@ async def test_aggregate_predictions() -> None:
await BinaryReport.aggregate_predictions([1.1, 0.5], question)


def test_inversed_expected_log_score() -> None:
def test_expected_baseline_score() -> None:
# Test with valid community prediction
report = ForecastingTestManager.get_fake_forecast_report(
prediction=0.6, community_prediction=0.7
)
score = report.inversed_expected_log_score
score = report.expected_baseline_score
assert score is not None
expected_score = -1 * (0.7 * np.log2(0.6) + 0.3 * np.log2(0.4))
expected_score = 100.0 * (
0.7 * (np.log(0.6) + 1.0) + (1.0 - 0.7) * (np.log(1.0 - 0.6) + 1.0)
)
assert score == pytest.approx(expected_score)
assert score > 0

@@ -68,17 +70,17 @@ def test_inversed_expected_log_score() -> None:
worse_report = ForecastingTestManager.get_fake_forecast_report(
prediction=0.4, community_prediction=0.7
)
better_score = better_report.inversed_expected_log_score
worse_score = worse_report.inversed_expected_log_score
better_score = better_report.expected_baseline_score
worse_score = worse_report.expected_baseline_score
assert better_score is not None
assert worse_score is not None
assert better_score < worse_score
assert better_score > worse_score

# Test with None community prediction
report = ForecastingTestManager.get_fake_forecast_report(
prediction=0.6, community_prediction=None
)
assert report.inversed_expected_log_score is None
assert report.expected_baseline_score is None


def test_deviation_points() -> None:
@@ -110,7 +112,7 @@ def test_calculate_average_expected_log_score() -> None:
),
]

average_score = BinaryReport.calculate_average_inverse_expected_log_score(
average_score = BinaryReport.calculate_average_expected_baseline_score(
reports
)
assert isinstance(average_score, float)
Expand All @@ -122,7 +124,7 @@ def test_calculate_average_expected_log_score() -> None:
)
]
with pytest.raises(AssertionError):
BinaryReport.calculate_average_inverse_expected_log_score(
BinaryReport.calculate_average_expected_baseline_score(
reports_with_none
)

9 changes: 9 additions & 0 deletions forecasting_tools/__init__.py
@@ -73,6 +73,15 @@
ForecastBot as ForecastBot,
)
from forecasting_tools.forecast_bots.main_bot import MainBot as MainBot
from forecasting_tools.forecast_bots.official_bots.q1_template_bot import (
Q1TemplateBot2025 as Q1TemplateBot2025,
)
from forecasting_tools.forecast_bots.official_bots.q3_template_bot import (
Q3TemplateBot2024 as Q3TemplateBot2024,
)
from forecasting_tools.forecast_bots.official_bots.q4_template_bot import (
Q4TemplateBot2024 as Q4TemplateBot2024,
)
from forecasting_tools.forecast_bots.template_bot import (
TemplateBot as TemplateBot,
)
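The X as X re-exports added above make the renamed template bots importable from the package root. A small hedged example of what that enables, assuming only the re-exports shown here plus the no-argument construction and get_config() call used in test_bot_has_config:

# Assumes the re-exports above and that get_config() returns a dict-like
# mapping of bot parameters, as the config test in this PR suggests.
from forecasting_tools import (
    Q1TemplateBot2025,
    Q3TemplateBot2024,
    Q4TemplateBot2024,
)

for bot_class in (Q1TemplateBot2025, Q3TemplateBot2024, Q4TemplateBot2024):
    config = bot_class().get_config()
    print(bot_class.__name__, len(config))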
42 changes: 40 additions & 2 deletions forecasting_tools/ai_models/general_llm.py
@@ -55,6 +55,32 @@ class GeneralLlm(
"""

_model_trackers: dict[str, ModelTracker] = {}
_defaults: dict[str, Any] = {
"gpt-4o": {
"timeout": 40,
},
"gpt-4o-mini": {
"timeout": 40,
},
"o1-preview": {
"timeout": 80,
},
"o1": {
"timeout": 80,
},
"o3-mini": {
"timeout": 80,
},
"perplexity/": {
"timeout": 120,
},
"claude": {
"timeout": 40,
},
"deepseek/": {
"timeout": 80,
},
}

def __init__(
self,
@@ -114,10 +140,12 @@ def __init__(
if self._use_metaculus_proxy
else model
)
default_timeout = self._get_default_timeout(self._litellm_model)

self.litellm_kwargs = kwargs
self.litellm_kwargs["model"] = self._litellm_model
self.litellm_kwargs["temperature"] = temperature
self.litellm_kwargs["timeout"] = timeout
self.litellm_kwargs["timeout"] = timeout or default_timeout

if self._use_metaculus_proxy:
assert (
@@ -153,6 +181,14 @@ def __init__(

self._give_cost_tracking_warning_if_needed()

@classmethod
def _get_default_timeout(cls, model: str) -> int:
all_keys = cls._defaults.keys()
matching_keys = [key for key in all_keys if key in model]
if not matching_keys:
return 60
return cls._defaults[matching_keys[0]]["timeout"]

def _give_cost_tracking_warning_if_needed(self) -> None:
model = self._litellm_model
model_tracker = self._model_trackers.get(model)
@@ -215,7 +251,9 @@ async def _mockable_direct_call_to_model(
choices = response.choices
choices = typeguard.check_type(choices, list[Choices])
answer = choices[0].message.content
assert isinstance(answer, str)
assert isinstance(
answer, str
), f"Answer is not a string and is of type: {type(answer)}. Answer: {answer}"
usage = response.usage # type: ignore
assert isinstance(usage, Usage)
prompt_tokens = usage.prompt_tokens
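To make the new _defaults table concrete: _get_default_timeout does a substring match of each key against the litellm model string, takes the first matching key, and falls back to 60 seconds when nothing matches; a timeout passed by the caller still takes precedence via the timeout or default_timeout expression in __init__. A standalone restatement of that selection logic (the model strings below are purely illustrative):

# Standalone restatement of the lookup above, for illustration only.
DEFAULT_TIMEOUTS = {
    "gpt-4o": 40,
    "gpt-4o-mini": 40,
    "o1-preview": 80,
    "o1": 80,
    "o3-mini": 80,
    "perplexity/": 120,
    "claude": 40,
    "deepseek/": 80,
}

def default_timeout_for(model: str) -> int:
    matching_keys = [key for key in DEFAULT_TIMEOUTS if key in model]
    return DEFAULT_TIMEOUTS[matching_keys[0]] if matching_keys else 60  # first match wins

assert default_timeout_for("perplexity/sonar-pro") == 120
assert default_timeout_for("claude-3-5-sonnet") == 40
assert default_timeout_for("some-other-model") == 60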
(additional file in this diff; file name not shown in the capture)
@@ -25,14 +25,19 @@ def __init__(self, resources_used: int, time: datetime) -> None:
class RefreshingBucketRateLimiter:
"""
The refreshing bucket rate limiter is a way of limiting resource use over time.
- It sets up a capacity and a refresh rate.
- Refresh rate is per second.
- It will wait to fill up when the bucket is emptied.


For example:
- requests per minute
- tokens per second
- etc.

It sets up a capacity and a refresh rate. Refresh rate is per second.
The capacity acts as a burst limit. You can spend X resources in a short interval before being forced to slow down.
The capacity acts as a burst limit. You can spend X resources before
being forced to slow down as the bottom of the bucket is hit and you
can't use more resources.
If you use only X capacity, it will take the (refresh rate * X) to get back to full capacity.
If you reach the bottom of the bucket, the bucket will fill all the way up before you can use resources again.
This is to make sure something like a "requests per minute" limit is not exceeded even after a burst
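As a rough illustration of the behaviour the updated docstring describes (not the class's actual interface, which this diff does not show): with capacity X and a refresh rate of r units per second, a burst can drain the bucket, and once the bottom is hit the limiter waits for a full refill, roughly X / r seconds, before resources can be spent again.

# Toy model of a refreshing bucket limiter; the real class's method names and
# exact refill behaviour may differ from this sketch.
import time

class ToyRefreshingBucket:
    def __init__(self, capacity: float, refresh_rate_per_second: float) -> None:
        self.capacity = capacity
        self.refresh_rate = refresh_rate_per_second
        self.available = capacity
        self.last_update = time.monotonic()

    def _refill(self) -> None:
        now = time.monotonic()
        elapsed = now - self.last_update
        self.available = min(self.capacity, self.available + elapsed * self.refresh_rate)
        self.last_update = now

    def spend(self, amount: float) -> None:
        # Assumes amount <= capacity.
        self._refill()
        if self.available < amount:
            # Per the docstring: once the bucket is emptied, wait for a full refill.
            time.sleep((self.capacity - self.available) / self.refresh_rate)
            self._refill()
        self.available -= amount

# e.g. bursts of up to 5 requests, refilling at 10 requests per minute:
limiter = ToyRefreshingBucket(capacity=5, refresh_rate_per_second=10 / 60)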
4 changes: 2 additions & 2 deletions forecasting_tools/data_models/benchmark_for_bot.py
@@ -24,11 +24,11 @@ class BenchmarkForBot(BaseModel, Jsonable):
forecast_reports: list[BinaryReport | NumericReport | MultipleChoiceReport]

@property
def average_inverse_expected_log_score(self) -> float:
def average_expected_baseline_score(self) -> float:
reports = typeguard.check_type(
self.forecast_reports,
list[ForecastReport],
)
return ForecastReport.calculate_average_inverse_expected_log_score(
return ForecastReport.calculate_average_expected_baseline_score(
reports
)
14 changes: 11 additions & 3 deletions forecasting_tools/data_models/binary_report.py
@@ -53,15 +53,23 @@ def make_readable_prediction(cls, prediction: float) -> str:
def community_prediction(self) -> float | None:
return self.question.community_prediction_at_access_time

@property
def expected_baseline_score(self) -> float | None:
c = self.community_prediction
p = self.prediction
if c is None:
return None
return 100.0 * (
c * (np.log(p) + 1.0) + (1.0 - c) * (np.log(1.0 - p) + 1.0)
)

@property
def inversed_expected_log_score(self) -> float | None:
c = self.community_prediction
p = self.prediction
if c is None:
return None
# TODO: Run a simulation using ln rather than log2
# since Metaculus uses ln for log scores https://www.metaculus.com/help/scores-faq/#log-score
expected_log_score = c * np.log2(p) + (1 - c) * np.log2(1 - p)
expected_log_score = c * np.log(p) + (1 - c) * np.log(1 - p)
inversed_expected_log_score = -1 * expected_log_score
return inversed_expected_log_score

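To make the new expected_baseline_score property concrete, here is the same arithmetic as the updated unit test, worked through for a prediction of 0.6 against a community prediction of 0.7; the helper below simply restates the formula from the property:

import numpy as np

def expected_baseline_score(p: float, c: float) -> float:
    # Same expression as BinaryReport.expected_baseline_score above.
    return 100.0 * (c * (np.log(p) + 1.0) + (1.0 - c) * (np.log(1.0 - p) + 1.0))

print(round(expected_baseline_score(0.6, 0.7), 1))  # prints 36.8
# Higher is better: the forecast closer to the community prediction scores
# higher, matching the ordering asserted in the test above.
print(expected_baseline_score(0.8, 0.7) > expected_baseline_score(0.4, 0.7))  # True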
32 changes: 13 additions & 19 deletions forecasting_tools/data_models/forecast_report.py
@@ -61,36 +61,30 @@ def forecast_rationales(self) -> str:
return self._get_section_content(index=2, expected_word="forecast")

@property
def inversed_expected_log_score(self) -> float | None:
def expected_baseline_score(self) -> float | None:
"""
Expected log score is evaluated to correlate closest to the baseline score
when assuming the community prediction is the true probability.
(see scripts/simulate_a_tournament.ipynb).
We invert the expected log score so it behaves like a brier score
(where it is positive and lower is better).

The score for a perfect predictor for a set of binary questions with
community predictions distributed
- Uniformly between 0 and 1 is 0.723
- Closer to 0 or 1 is 0.553
- Closer to 0.5 is 0.932
- Uniformly between 0.1 and 0.9 is 0.834
- Uniformly between (0 and 0.1) union (0.9 and 1) is 0.270

Someone who predicts 0.5 always gets a value of 1
Uses the community prediction to calculate the expected value of the baseline score
by assuming the community prediction is the true probability. Can be used as
a proxy score for comparing forecasters on the same set of questions, enabling
faster feedback loops.

Higher is better.

See https://www.metaculus.com/help/scores-faq/#baseline-score
and scripts/simulate_a_tournament.ipynb for more details.
"""
raise NotImplementedError("Not implemented")
raise NotImplementedError("Not yet implemented")

@property
def community_prediction(self) -> Any | None:
raise NotImplementedError("Not implemented")

@staticmethod
def calculate_average_inverse_expected_log_score(
def calculate_average_expected_baseline_score(
reports: list[ForecastReport],
) -> float:
deviation_scores: list[float | None] = [
report.inversed_expected_log_score for report in reports
report.expected_baseline_score for report in reports
]
validated_deviation_scores: list[float] = []
for score in deviation_scores:
2 changes: 1 addition & 1 deletion forecasting_tools/data_models/multiple_choice_report.py
@@ -19,7 +19,7 @@ class MultipleChoiceReport(ForecastReport):
prediction: PredictedOptionList

@property
def inversed_expected_log_score(self) -> float | None:
def expected_baseline_score(self) -> float | None:
raise NotImplementedError("Not implemented")

@property
14 changes: 7 additions & 7 deletions forecasting_tools/forecast_bots/bot_lists.py
@@ -9,24 +9,24 @@
from forecasting_tools.forecast_bots.forecast_bot import ForecastBot
from forecasting_tools.forecast_bots.main_bot import MainBot
from forecasting_tools.forecast_bots.official_bots.q1_template_bot import (
Q1TemplateBot,
Q1TemplateBot2025,
)
from forecasting_tools.forecast_bots.official_bots.q3_template_bot import (
Q3TemplateBot,
Q3TemplateBot2024,
)
from forecasting_tools.forecast_bots.official_bots.q4_template_bot import (
Q4TemplateBot,
Q4TemplateBot2024,
)
from forecasting_tools.forecast_bots.template_bot import TemplateBot


def get_all_official_bot_classes() -> list[type[ForecastBot]]:
def get_all_important_bot_classes() -> list[type[ForecastBot]]:
return [
MainBot,
TemplateBot,
Q1TemplateBot,
Q3TemplateBot,
Q4TemplateBot,
Q1TemplateBot2025,
Q3TemplateBot2024,
Q4TemplateBot2024,
Q4VeritasBot,
Q1VeritasBot,
]
4 changes: 2 additions & 2 deletions forecasting_tools/forecast_bots/community/q4_veritas_bot.py
@@ -8,14 +8,14 @@
MetaculusQuestion,
)
from forecasting_tools.forecast_bots.official_bots.q3_template_bot import (
Q3TemplateBot,
Q3TemplateBot2024,
)
from forecasting_tools.research_agents.research_coordinator import (
ResearchCoordinator,
)


class Q4VeritasBot(Q3TemplateBot):
class Q4VeritasBot(Q3TemplateBot2024):
FINAL_DECISION_LLM = Gpt4o(temperature=0.7)

def __init__(