2 changes: 1 addition & 1 deletion README.ipynb
@@ -375,7 +375,7 @@
"for benchmark in benchmarks[:2]:\n",
" print(\"--------------------------------\")\n",
" print(f\"Bot: {benchmark.name}\")\n",
" print(f\"Score: {benchmark.average_inverse_expected_log_score}\") # Lower is better\n",
" print(f\"Score: {benchmark.average_expected_baseline_score}\") # Lower is better\n",
" print(f\"Num reports in benchmark: {len(benchmark.forecast_reports)}\")\n",
" print(f\"Time: {benchmark.time_taken_in_minutes}min\")\n",
" print(f\"Cost: ${benchmark.total_cost}\")"
2 changes: 1 addition & 1 deletion code_tests/low_cost_or_live_api_tests/test_benchmarker.py
@@ -110,5 +110,5 @@ def assert_all_benchmark_object_fields_are_not_none(
len(benchmark.forecast_reports) == num_questions
), "Forecast reports is not set"
assert (
benchmark.average_inverse_expected_log_score > 0
benchmark.average_expected_baseline_score > 0
), "Average inverse expected log score is not set"
14 changes: 7 additions & 7 deletions code_tests/unit_tests/test_forecasting/test_forecast_bot.py
@@ -9,7 +9,7 @@
from forecasting_tools.data_models.forecast_report import ReasonedPrediction
from forecasting_tools.data_models.questions import BinaryQuestion
from forecasting_tools.forecast_bots.bot_lists import (
get_all_official_bot_classes,
get_all_important_bot_classes,
)
from forecasting_tools.forecast_bots.forecast_bot import (
ForecastBot,
@@ -41,10 +41,10 @@ async def mock_research(*args, **kwargs):
)
assert len(results) == 2
assert isinstance(results[0], ForecastReport)
assert isinstance(results[1], RuntimeError)
assert isinstance(results[1], Exception)
assert "Test error" in str(results[1])

with pytest.raises(RuntimeError, match="Test error"):
with pytest.raises(Exception):
await bot.forecast_questions(test_questions, return_exceptions=False)


@@ -58,10 +58,10 @@ async def mock_research(*args, **kwargs):
bot.run_research = mock_research

result = await bot.forecast_question(test_question, return_exceptions=True)
assert isinstance(result, RuntimeError)
assert isinstance(result, Exception)
assert "Test error" in str(result)

with pytest.raises(RuntimeError, match="Test error"):
with pytest.raises(Exception):
await bot.forecast_question(test_question, return_exceptions=False)


@@ -114,7 +114,7 @@ async def mock_forecast(*args, **kwargs):

bot._run_forecast_on_binary = mock_forecast

with pytest.raises(RuntimeError):
with pytest.raises(Exception):
await bot.forecast_question(test_question)


@@ -219,7 +219,7 @@ async def count_research(*args, **kwargs):
await bot.forecast_question(forecasted_question)


@pytest.mark.parametrize("bot", get_all_official_bot_classes())
@pytest.mark.parametrize("bot", get_all_important_bot_classes())
def test_bot_has_config(bot: type[ForecastBot]):
probable_minimum_number_of_bot_params = 3
bot_config = bot().get_config()
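The exception-handling tests above pin down the contract that forecast_questions(..., return_exceptions=True) returns per-question failures as Exception instances mixed in with the successful reports, while return_exceptions=False lets the error propagate. A minimal sketch of how calling code might split the two; the run_safely helper and the bare TemplateBot() setup are illustrative, not taken from this diff:

# Sketch only: assumes the forecast_questions(questions, return_exceptions=...)
# call exercised in the tests above; bot construction here is illustrative.
from forecasting_tools import TemplateBot

async def run_safely(questions):
    bot = TemplateBot()
    results = await bot.forecast_questions(questions, return_exceptions=True)
    reports = [r for r in results if not isinstance(r, Exception)]
    failures = [r for r in results if isinstance(r, Exception)]
    for error in failures:
        print(f"Question failed: {error}")
    return reports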
(additional file in this diff; file name not shown in the capture)
@@ -50,14 +50,16 @@ async def test_aggregate_predictions() -> None:
await BinaryReport.aggregate_predictions([1.1, 0.5], question)


def test_inversed_expected_log_score() -> None:
def test_expected_baseline_score() -> None:
# Test with valid community prediction
report = ForecastingTestManager.get_fake_forecast_report(
prediction=0.6, community_prediction=0.7
)
score = report.inversed_expected_log_score
score = report.expected_baseline_score
assert score is not None
expected_score = -1 * (0.7 * np.log2(0.6) + 0.3 * np.log2(0.4))
expected_score = 100.0 * (
0.7 * (np.log(0.6) + 1.0) + (1.0 - 0.7) * (np.log(1.0 - 0.6) + 1.0)
)
assert score == pytest.approx(expected_score)
assert score > 0

@@ -68,17 +70,17 @@ def test_inversed_expected_log_score() -> None:
worse_report = ForecastingTestManager.get_fake_forecast_report(
prediction=0.4, community_prediction=0.7
)
better_score = better_report.inversed_expected_log_score
worse_score = worse_report.inversed_expected_log_score
better_score = better_report.expected_baseline_score
worse_score = worse_report.expected_baseline_score
assert better_score is not None
assert worse_score is not None
assert better_score < worse_score
assert better_score > worse_score

# Test with None community prediction
report = ForecastingTestManager.get_fake_forecast_report(
prediction=0.6, community_prediction=None
)
assert report.inversed_expected_log_score is None
assert report.expected_baseline_score is None


def test_deviation_points() -> None:
@@ -110,7 +112,7 @@ def test_calculate_average_expected_log_score() -> None:
),
]

average_score = BinaryReport.calculate_average_inverse_expected_log_score(
average_score = BinaryReport.calculate_average_expected_baseline_score(
reports
)
assert isinstance(average_score, float)
Expand All @@ -122,7 +124,7 @@ def test_calculate_average_expected_log_score() -> None:
)
]
with pytest.raises(AssertionError):
BinaryReport.calculate_average_inverse_expected_log_score(
BinaryReport.calculate_average_expected_baseline_score(
reports_with_none
)

9 changes: 9 additions & 0 deletions forecasting_tools/__init__.py
@@ -73,6 +73,15 @@
ForecastBot as ForecastBot,
)
from forecasting_tools.forecast_bots.main_bot import MainBot as MainBot
from forecasting_tools.forecast_bots.official_bots.q1_template_bot import (
Q1TemplateBot2025 as Q1TemplateBot2025,
)
from forecasting_tools.forecast_bots.official_bots.q3_template_bot import (
Q3TemplateBot2024 as Q3TemplateBot2024,
)
from forecasting_tools.forecast_bots.official_bots.q4_template_bot import (
Q4TemplateBot2024 as Q4TemplateBot2024,
)
from forecasting_tools.forecast_bots.template_bot import (
TemplateBot as TemplateBot,
)
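The X as X re-exports added above make the renamed template bots importable from the package root. A small hedged example of what that enables, assuming only the re-exports shown here plus the no-argument construction and get_config() call used in test_bot_has_config:

# Assumes the re-exports above and that get_config() returns a dict-like
# mapping of bot parameters, as the config test in this PR suggests.
from forecasting_tools import (
    Q1TemplateBot2025,
    Q3TemplateBot2024,
    Q4TemplateBot2024,
)

for bot_class in (Q1TemplateBot2025, Q3TemplateBot2024, Q4TemplateBot2024):
    config = bot_class().get_config()
    print(bot_class.__name__, len(config))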
42 changes: 40 additions & 2 deletions forecasting_tools/ai_models/general_llm.py
@@ -55,6 +55,32 @@ class GeneralLlm(
"""

_model_trackers: dict[str, ModelTracker] = {}
_defaults: dict[str, Any] = {
"gpt-4o": {
"timeout": 40,
},
"gpt-4o-mini": {
"timeout": 40,
},
"o1-preview": {
"timeout": 80,
},
"o1": {
"timeout": 80,
},
"o3-mini": {
"timeout": 80,
},
"perplexity/": {
"timeout": 120,
},
"claude": {
"timeout": 40,
},
"deepseek/": {
"timeout": 80,
},
}

def __init__(
self,
@@ -114,10 +140,12 @@ def __init__(
if self._use_metaculus_proxy
else model
)
default_timeout = self._get_default_timeout(self._litellm_model)

self.litellm_kwargs = kwargs
self.litellm_kwargs["model"] = self._litellm_model
self.litellm_kwargs["temperature"] = temperature
self.litellm_kwargs["timeout"] = timeout
self.litellm_kwargs["timeout"] = timeout or default_timeout

if self._use_metaculus_proxy:
assert (
@@ -153,6 +181,14 @@ def __init__(

self._give_cost_tracking_warning_if_needed()

@classmethod
def _get_default_timeout(cls, model: str) -> int:
all_keys = cls._defaults.keys()
matching_keys = [key for key in all_keys if key in model]
if not matching_keys:
return 60
return cls._defaults[matching_keys[0]]["timeout"]

def _give_cost_tracking_warning_if_needed(self) -> None:
model = self._litellm_model
model_tracker = self._model_trackers.get(model)
@@ -215,7 +251,9 @@ async def _mockable_direct_call_to_model(
choices = response.choices
choices = typeguard.check_type(choices, list[Choices])
answer = choices[0].message.content
assert isinstance(answer, str)
assert isinstance(
answer, str
), f"Answer is not a string and is of type: {type(answer)}. Answer: {answer}"
usage = response.usage # type: ignore
assert isinstance(usage, Usage)
prompt_tokens = usage.prompt_tokens
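To make the new _defaults table concrete: _get_default_timeout does a substring match of each key against the litellm model string, takes the first matching key, and falls back to 60 seconds when nothing matches; a timeout passed by the caller still takes precedence via the timeout or default_timeout expression in __init__. A standalone restatement of that selection logic (the model strings below are purely illustrative):

# Standalone restatement of the lookup above, for illustration only.
DEFAULT_TIMEOUTS = {
    "gpt-4o": 40,
    "gpt-4o-mini": 40,
    "o1-preview": 80,
    "o1": 80,
    "o3-mini": 80,
    "perplexity/": 120,
    "claude": 40,
    "deepseek/": 80,
}

def default_timeout_for(model: str) -> int:
    matching_keys = [key for key in DEFAULT_TIMEOUTS if key in model]
    return DEFAULT_TIMEOUTS[matching_keys[0]] if matching_keys else 60  # first match wins

assert default_timeout_for("perplexity/sonar-pro") == 120
assert default_timeout_for("claude-3-5-sonnet") == 40
assert default_timeout_for("some-other-model") == 60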
(additional file in this diff; file name not shown in the capture)
@@ -25,14 +25,19 @@ def __init__(self, resources_used: int, time: datetime) -> None:
class RefreshingBucketRateLimiter:
"""
The refreshing bucket rate limiter is a way of limiting resource use over time.
- It sets up a capacity and a refresh rate.
- Refresh rate is per second.
- It will wait to fill up when the bucket is emptied.


For example:
- requests per minute
- tokens per second
- etc.

It sets up a capacity and a refresh rate. Refresh rate is per second.
The capacity acts as a burst limit. You can spend X resources in a short interval before being forced to slow down.
The capacity acts as a burst limit. You can spend X resources before
being forced to slow down as the bottom of the bucket is hit and you
can't use more resources.
If you use only X capacity, it will take the (refresh rate * X) to get back to full capacity.
If you reach the bottom of the bucket, the bucket will fill all the way up before you can use resources again.
This is to make sure something like a "requests per minute" limit is not exceeded even after a burst
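As a rough illustration of the behaviour the updated docstring describes (not the class's actual interface, which this diff does not show): with capacity X and a refresh rate of r units per second, a burst can drain the bucket, and once the bottom is hit the limiter waits for a full refill, roughly X / r seconds, before resources can be spent again.

# Toy model of a refreshing bucket limiter; the real class's method names and
# exact refill behaviour may differ from this sketch.
import time

class ToyRefreshingBucket:
    def __init__(self, capacity: float, refresh_rate_per_second: float) -> None:
        self.capacity = capacity
        self.refresh_rate = refresh_rate_per_second
        self.available = capacity
        self.last_update = time.monotonic()

    def _refill(self) -> None:
        now = time.monotonic()
        elapsed = now - self.last_update
        self.available = min(self.capacity, self.available + elapsed * self.refresh_rate)
        self.last_update = now

    def spend(self, amount: float) -> None:
        # Assumes amount <= capacity.
        self._refill()
        if self.available < amount:
            # Per the docstring: once the bucket is emptied, wait for a full refill.
            time.sleep((self.capacity - self.available) / self.refresh_rate)
            self._refill()
        self.available -= amount

# e.g. bursts of up to 5 requests, refilling at 10 requests per minute:
limiter = ToyRefreshingBucket(capacity=5, refresh_rate_per_second=10 / 60)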
4 changes: 2 additions & 2 deletions forecasting_tools/data_models/benchmark_for_bot.py
@@ -24,11 +24,11 @@ class BenchmarkForBot(BaseModel, Jsonable):
forecast_reports: list[BinaryReport | NumericReport | MultipleChoiceReport]

@property
def average_inverse_expected_log_score(self) -> float:
def average_expected_baseline_score(self) -> float:
reports = typeguard.check_type(
self.forecast_reports,
list[ForecastReport],
)
return ForecastReport.calculate_average_inverse_expected_log_score(
return ForecastReport.calculate_average_expected_baseline_score(
reports
)
14 changes: 11 additions & 3 deletions forecasting_tools/data_models/binary_report.py
@@ -53,15 +53,23 @@ def make_readable_prediction(cls, prediction: float) -> str:
def community_prediction(self) -> float | None:
return self.question.community_prediction_at_access_time

@property
def expected_baseline_score(self) -> float | None:
c = self.community_prediction
p = self.prediction
if c is None:
return None
return 100.0 * (
c * (np.log(p) + 1.0) + (1.0 - c) * (np.log(1.0 - p) + 1.0)
)

@property
def inversed_expected_log_score(self) -> float | None:
c = self.community_prediction
p = self.prediction
if c is None:
return None
# TODO: Run a simulation using ln rather than log2
# since Metaculus uses ln for log scores https://www.metaculus.com/help/scores-faq/#log-score
expected_log_score = c * np.log2(p) + (1 - c) * np.log2(1 - p)
expected_log_score = c * np.log(p) + (1 - c) * np.log(1 - p)
inversed_expected_log_score = -1 * expected_log_score
return inversed_expected_log_score

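To make the new expected_baseline_score property concrete, here is the same arithmetic as the updated unit test, worked through for a prediction of 0.6 against a community prediction of 0.7; the helper below simply restates the formula from the property:

import numpy as np

def expected_baseline_score(p: float, c: float) -> float:
    # Same expression as BinaryReport.expected_baseline_score above.
    return 100.0 * (c * (np.log(p) + 1.0) + (1.0 - c) * (np.log(1.0 - p) + 1.0))

print(round(expected_baseline_score(0.6, 0.7), 1))  # prints 36.8
# Higher is better: the forecast closer to the community prediction scores
# higher, matching the ordering asserted in the test above.
print(expected_baseline_score(0.8, 0.7) > expected_baseline_score(0.4, 0.7))  # True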
32 changes: 13 additions & 19 deletions forecasting_tools/data_models/forecast_report.py
@@ -61,36 +61,30 @@ def forecast_rationales(self) -> str:
return self._get_section_content(index=2, expected_word="forecast")

@property
def inversed_expected_log_score(self) -> float | None:
def expected_baseline_score(self) -> float | None:
"""
Expected log score is evaluated to correlate closest to the baseline score
when assuming the community prediction is the true probability.
(see scripts/simulate_a_tournament.ipynb).
We invert the expected log score so it behaves like a brier score
(where it is positive and lower is better).

The score for a perfect predictor for a set of binary questions with
community predictions distributed
- Uniformly between 0 and 1 is 0.723
- Closer to 0 or 1 is 0.553
- Closer to 0.5 is 0.932
- Uniformly between 0.1 and 0.9 is 0.834
- Uniformly between (0 and 0.1) union (0.9 and 1) is 0.270

Someone who predicts 0.5 always gets a value of 1
Uses the community prediction to calculate the expected value of the baseline score
by assuming the community prediction is the true probability. Can be used as
a proxy score for comparing forecasters on the same set of questions, enabling
faster feedback loops.

Higher is better.

See https://www.metaculus.com/help/scores-faq/#baseline-score
and scripts/simulate_a_tournament.ipynb for more details.
"""
raise NotImplementedError("Not implemented")
raise NotImplementedError("Not yet implemented")

@property
def community_prediction(self) -> Any | None:
raise NotImplementedError("Not implemented")

@staticmethod
def calculate_average_inverse_expected_log_score(
def calculate_average_expected_baseline_score(
reports: list[ForecastReport],
) -> float:
deviation_scores: list[float | None] = [
report.inversed_expected_log_score for report in reports
report.expected_baseline_score for report in reports
]
validated_deviation_scores: list[float] = []
for score in deviation_scores:
2 changes: 1 addition & 1 deletion forecasting_tools/data_models/multiple_choice_report.py
@@ -19,7 +19,7 @@ class MultipleChoiceReport(ForecastReport):
prediction: PredictedOptionList

@property
def inversed_expected_log_score(self) -> float | None:
def expected_baseline_score(self) -> float | None:
raise NotImplementedError("Not implemented")

@property
14 changes: 7 additions & 7 deletions forecasting_tools/forecast_bots/bot_lists.py
@@ -9,24 +9,24 @@
from forecasting_tools.forecast_bots.forecast_bot import ForecastBot
from forecasting_tools.forecast_bots.main_bot import MainBot
from forecasting_tools.forecast_bots.official_bots.q1_template_bot import (
Q1TemplateBot,
Q1TemplateBot2025,
)
from forecasting_tools.forecast_bots.official_bots.q3_template_bot import (
Q3TemplateBot,
Q3TemplateBot2024,
)
from forecasting_tools.forecast_bots.official_bots.q4_template_bot import (
Q4TemplateBot,
Q4TemplateBot2024,
)
from forecasting_tools.forecast_bots.template_bot import TemplateBot


def get_all_official_bot_classes() -> list[type[ForecastBot]]:
def get_all_important_bot_classes() -> list[type[ForecastBot]]:
return [
MainBot,
TemplateBot,
Q1TemplateBot,
Q3TemplateBot,
Q4TemplateBot,
Q1TemplateBot2025,
Q3TemplateBot2024,
Q4TemplateBot2024,
Q4VeritasBot,
Q1VeritasBot,
]
4 changes: 2 additions & 2 deletions forecasting_tools/forecast_bots/community/q4_veritas_bot.py
@@ -8,14 +8,14 @@
MetaculusQuestion,
)
from forecasting_tools.forecast_bots.official_bots.q3_template_bot import (
Q3TemplateBot,
Q3TemplateBot2024,
)
from forecasting_tools.research_agents.research_coordinator import (
ResearchCoordinator,
)


class Q4VeritasBot(Q3TemplateBot):
class Q4VeritasBot(Q3TemplateBot2024):
FINAL_DECISION_LLM = Gpt4o(temperature=0.7)

def __init__(