Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 3 additions & 9 deletions code_tests/integration_tests/test_forecast_bots_live.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
from forecasting_tools.data_models.data_organizer import DataOrganizer
from forecasting_tools.data_models.questions import (
ConditionalQuestion,
DateQuestion,
MetaculusQuestion,
)
from forecasting_tools.data_models.timestamped_predictions import (
Expand Down Expand Up @@ -89,7 +88,7 @@ async def test_predicts_ai_2027_tournament(bot: ForecastBot) -> None:
reports = await bot.forecast_on_tournament("ai-2027")
bot.log_report_summary(reports)

assert len(reports) == 15, "Expected 19 reports"
assert len(reports) == 19, "Expected 19 reports"

except Exception as e:
pytest.fail(f"Forecasting on ai-2027 tournament failed: {e}")
Expand Down Expand Up @@ -186,13 +185,8 @@ async def test_collects_reports_on_open_questions(mocker: Mock) -> None:
questions_that_should_be_being_forecast_on = (
MetaculusApi.get_all_open_questions_from_tournament(tournament_id)
)
date_questions = [
question
for question in questions_that_should_be_being_forecast_on
if isinstance(question, DateQuestion)
]
assert len(reports) == len(questions_that_should_be_being_forecast_on) - len(
date_questions
assert len(reports) == len(
questions_that_should_be_being_forecast_on
), "Not all questions were forecasted on"


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
def test_metaculus_report_is_jsonable() -> None:
temp_writing_path = "temp/temp_metaculus_report.json"
read_report_path = "code_tests/unit_tests/test_data_models/forecasting_test_data/metaculus_forecast_report_examples.json"
# TODO: Add examples for conditional and date reports (and discrete reports?)

reports = DataOrganizer.load_reports_from_file_path(read_report_path)
assert any(isinstance(report, NumericReport) for report in reports)
assert any(isinstance(report, BinaryReport) for report in reports)
Expand Down
6 changes: 6 additions & 0 deletions forecasting_tools/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,10 @@
from forecasting_tools.data_models.multiple_choice_report import (
PredictedOptionList as PredictedOptionList,
)
from forecasting_tools.data_models.numeric_report import (
DatePercentile as DatePercentile,
)
from forecasting_tools.data_models.numeric_report import DateReport as DateReport
from forecasting_tools.data_models.numeric_report import (
DiscreteReport as DiscreteReport,
)
Expand All @@ -108,6 +112,7 @@
)
from forecasting_tools.data_models.numeric_report import NumericReport as NumericReport
from forecasting_tools.data_models.questions import BinaryQuestion as BinaryQuestion
from forecasting_tools.data_models.questions import DateQuestion as DateQuestion
from forecasting_tools.data_models.questions import DiscreteQuestion as DiscreteQuestion
from forecasting_tools.data_models.questions import (
MetaculusQuestion as MetaculusQuestion,
Expand All @@ -124,6 +129,7 @@
ForecastReport.model_rebuild()
NumericReport.model_rebuild()
DiscreteReport.model_rebuild()
DateReport.model_rebuild()
from forecasting_tools.data_models.questions import QuestionState as QuestionState
from forecasting_tools.forecast_bots.forecast_bot import ForecastBot as ForecastBot
from forecasting_tools.forecast_bots.forecast_bot import Notepad as Notepad
Expand Down
7 changes: 5 additions & 2 deletions forecasting_tools/data_models/data_organizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
PredictedOptionList,
)
from forecasting_tools.data_models.numeric_report import (
DateReport,
DiscreteReport,
NumericDistribution,
NumericReport,
Expand Down Expand Up @@ -46,7 +47,9 @@ class TypeMapping(BaseModel):
| DiscreteQuestion
| ConditionalQuestion
)
ReportTypes = NumericReport | MultipleChoiceReport | BinaryReport | DiscreteReport
ReportTypes = (
NumericReport | MultipleChoiceReport | BinaryReport | DiscreteReport | DateReport
)


class DataOrganizer:
Expand All @@ -64,7 +67,7 @@ class DataOrganizer:
TypeMapping(
question_type=DateQuestion,
test_post_id=4110, # https://www.metaculus.com/questions/4110/birthdate-of-oldest-living-human-in-2200/
report_type=None, # Not Implemented Yet
report_type=DateReport,
),
TypeMapping(
question_type=MultipleChoiceQuestion,
Expand Down
62 changes: 55 additions & 7 deletions forecasting_tools/data_models/numeric_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import logging
from collections import Counter
from datetime import datetime, timezone
from typing import TYPE_CHECKING

import numpy as np
Expand All @@ -12,6 +13,7 @@

if TYPE_CHECKING:
from forecasting_tools.data_models.questions import (
DateQuestion,
DiscreteQuestion,
NumericQuestion,
)
Expand Down Expand Up @@ -61,6 +63,25 @@ def validate_percentile(self: Percentile) -> Percentile:
return self


class DatePercentile(BaseModel):
percentile: float = Field(
description="A number between 0 and 1 (e.g. '90% likelihood of AGI by 2040-01-01' translates to '0.9')",
)
value: datetime = Field(
description="The date matching the percentile (e.g. '90% likelihood of AGI by 2040-01-01' translates to '2040-01-01')",
)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  1. You can make the typehint "datetime" and it will automatically make sure its a datetime, and structure_output should format things correctly as well
  2. Make sure the description matches the datetime format (the example given is as if it were regular numeric)
  3. If you do the above you should probably rename it to DatePercentile.
  4. Potentially add validation for timezone.
  5. Potentially move this to FallTemplateBot since this is used only for prompting and nowhere else? If not though, make sure to export this from the package so that I can use it in the template bot example for participants.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think making it a datetime directly would work, because it's not an instance of BaseClass.


    @model_validator(mode="after")
    def validate_percentile(self: DatePercentile) -> DatePercentile:
        """Reject a percentile outside [0, 1] or NaN after model construction."""
        if self.percentile < 0 or self.percentile > 1:
            raise ValueError(
                f"Percentile must be between 0 and 1, but was {self.percentile}"
            )
        # NaN compares False in the range check above, so test it explicitly.
        if np.isnan(self.percentile):
            raise ValueError(f"Percentile must be a number, but was {self.percentile}")
        return self


class NumericDistribution(BaseModel):
declared_percentiles: list[Percentile]
open_upper_bound: bool
Expand All @@ -73,6 +94,7 @@ class NumericDistribution(BaseModel):
)
standardize_cdf: bool = True
strict_validation: bool = True
is_date: bool = False

@model_validator(mode="after")
def validate_percentiles(self: NumericDistribution) -> NumericDistribution:
Expand Down Expand Up @@ -231,29 +253,42 @@ def _check_distribution_too_tall(self, cdf: list[Percentile]) -> None:
def from_question(
cls,
percentiles: list[Percentile],
question: NumericQuestion,
question: NumericQuestion | DateQuestion,
standardize_cdf: bool | None = None,
) -> NumericDistribution:
from forecasting_tools.data_models.questions import DateQuestion

is_date = isinstance(question, DateQuestion)

if is_date:
upper_bound_float: float = question.upper_bound.timestamp()
lower_bound_float: float = question.lower_bound.timestamp()
else:
upper_bound_float = question.upper_bound
lower_bound_float = question.lower_bound

if standardize_cdf is None:
return NumericDistribution(
declared_percentiles=percentiles,
open_upper_bound=question.open_upper_bound,
open_lower_bound=question.open_lower_bound,
upper_bound=question.upper_bound,
lower_bound=question.lower_bound,
upper_bound=upper_bound_float,
lower_bound=lower_bound_float,
zero_point=question.zero_point,
cdf_size=question.cdf_size,
is_date=is_date,
)
else:
return NumericDistribution(
declared_percentiles=percentiles,
open_upper_bound=question.open_upper_bound,
open_lower_bound=question.open_lower_bound,
upper_bound=question.upper_bound,
lower_bound=question.lower_bound,
upper_bound=upper_bound_float,
lower_bound=lower_bound_float,
zero_point=question.zero_point,
cdf_size=question.cdf_size,
standardize_cdf=standardize_cdf,
is_date=is_date,
)

def get_representative_percentiles(
Expand Down Expand Up @@ -562,7 +597,9 @@ class NumericReport(ForecastReport):

@classmethod
async def aggregate_predictions(
cls, predictions: list[NumericDistribution], question: NumericQuestion
cls,
predictions: list[NumericDistribution],
question: NumericQuestion | DateQuestion,
) -> NumericDistribution:
assert predictions, "No predictions to aggregate"
cdfs = [prediction.get_cdf() for prediction in predictions]
Expand Down Expand Up @@ -604,7 +641,13 @@ def make_readable_prediction(cls, prediction: NumericDistribution) -> str:
)
readable = "Probability distribution:\n"
for percentile in representative_percentiles:
readable += f"- {percentile.percentile:.2%} chance of value below {round(percentile.value,6)}\n"
if prediction.is_date:
formatted_value = datetime.fromtimestamp(
percentile.value, tz=timezone.utc
).strftime("%Y-%m-%d %H:%M:%S UTC")
else:
formatted_value = str(round(percentile.value, 6))
readable += f"- {percentile.percentile:.2%} chance of value below {formatted_value}\n"
return readable

async def publish_report_to_metaculus(self) -> None:
Expand Down Expand Up @@ -639,3 +682,8 @@ async def publish_report_to_metaculus(self) -> None:
class DiscreteReport(NumericReport):
    """NumericReport specialized for discrete questions.

    Inherits all aggregation/formatting behavior from NumericReport; only the
    question type is narrowed.
    """

    question: DiscreteQuestion
    prediction: NumericDistribution


class DateReport(NumericReport):
    """NumericReport specialized for date questions.

    Inherits NumericReport behavior; distribution values are expected to be
    timestamps (see NumericDistribution.is_date handling) — question type is
    narrowed to DateQuestion.
    """

    question: DateQuestion
    prediction: NumericDistribution
2 changes: 2 additions & 0 deletions forecasting_tools/data_models/questions.py
Original file line number Diff line number Diff line change
Expand Up @@ -445,6 +445,7 @@ class DateQuestion(MetaculusQuestion, BoundedQuestionMixin):
open_upper_bound: bool
open_lower_bound: bool
zero_point: float | None = None
cdf_size: int = 201

@model_validator(mode="before")
@classmethod
Expand Down Expand Up @@ -484,6 +485,7 @@ def from_metaculus_api_json(cls, api_json: dict) -> DateQuestion:
open_upper_bound=open_upper_bound,
open_lower_bound=open_lower_bound,
zero_point=zero_point,
cdf_size=cls._get_cdf_size_from_json(api_json),
**normal_metaculus_question.model_dump(),
)

Expand Down
24 changes: 7 additions & 17 deletions forecasting_tools/forecast_bots/forecast_bot.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,22 +163,6 @@ async def forecast_on_tournament(
return_exceptions: bool = False,
) -> list[ForecastReport] | list[ForecastReport | BaseException]:
questions = MetaculusApi.get_all_open_questions_from_tournament(tournament_id)
supported_question_types = [
NumericQuestion,
MultipleChoiceQuestion,
BinaryQuestion,
ConditionalQuestion,
]
supported_questions = [
question
for question in questions
if isinstance(question, tuple(supported_question_types))
]
if len(supported_questions) != len(questions):
logger.warning(
f"Skipping {len(questions) - len(supported_questions)} questions that are not supported (probably date questions)"
)
questions = supported_questions
return await self.forecast_questions(questions, return_exceptions)

@overload
Expand Down Expand Up @@ -523,7 +507,7 @@ async def _make_prediction(
elif isinstance(question, ConditionalQuestion):
forecast_function = lambda q, r: self._run_forecast_on_conditional(q, r)
elif isinstance(question, DateQuestion):
raise NotImplementedError("Date questions not supported yet")
forecast_function = lambda q, r: self._run_forecast_on_date(q, r)
else:
raise ValueError(f"Unknown question type: {type(question)}")

Expand All @@ -542,6 +526,12 @@ async def _run_forecast_on_multiple_choice(
) -> ReasonedPrediction[PredictedOptionList]:
raise NotImplementedError("Subclass must implement this method")

    async def _run_forecast_on_date(
        self, question: DateQuestion, research: str
    ) -> ReasonedPrediction[NumericDistribution]:
        """Hook for forecasting on a date question.

        Subclasses must override this and return a ReasonedPrediction whose
        NumericDistribution encodes dates (see comment below).

        Raises:
            NotImplementedError: always, in this base implementation.
        """
        # Return a numeric distribution of timestamps
        raise NotImplementedError("Subclass must implement this method")

async def _run_forecast_on_conditional(
self, question: ConditionalQuestion, research: str
) -> ReasonedPrediction[ConditionalPrediction]:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from forecasting_tools.data_models.numeric_report import NumericDistribution
from forecasting_tools.data_models.questions import (
BinaryQuestion,
DateQuestion,
MetaculusQuestion,
MultipleChoiceQuestion,
NumericQuestion,
Expand Down Expand Up @@ -172,3 +173,55 @@ async def _run_forecast_on_numeric(
)
async with self._concurrency_limiter:
return await self._numeric_prompt_to_forecast(question, prompt)

    async def _run_forecast_on_date(
        self, question: DateQuestion, research: str
    ) -> ReasonedPrediction[NumericDistribution]:
        """Forecast a date question by prompting for dated percentiles.

        Builds a prompt asking for percentiles 10-90 as YYYY-MM-DD dates in
        chronological order, then delegates to self._date_prompt_to_forecast
        under the shared concurrency limiter.
        """
        # Bound messages describe the question's allowed date range to the model.
        upper_bound_message, lower_bound_message = (
            self._create_upper_and_lower_bound_messages(question)
        )
        prompt = clean_indents(
            f"""
            You are a professional forecaster interviewing for a job.

            Your interview question is:
            {question.question_text}

            Background:
            {question.background_info}

            {question.resolution_criteria}

            {question.fine_print}

            Your research assistant says:
            {research}

            Today is {datetime.now().strftime("%Y-%m-%d")}.

            {lower_bound_message}
            {upper_bound_message}

            Formatting Instructions:
            - This is a date question, and as such, the answer must be expressed in terms of dates.
            - The dates must be written in the format of YYYY-MM-DD. If hours matter, please append the date with the hour in UTC and military time: YYYY-MM-DDTHH:MM:SSZ. No other formatting is allowed.
            - Always start with a lower date chronologically and then increase from there.
            - Do NOT forget this. The dates must be written in chronological order starting at the earliest time at percentile 10 and increasing from there.

            {self._instructions}

            You remind yourself that good forecasters are humble and set wide 90/10 confidence intervals to account for unknown unknowns.

            The last thing you write is your final answer as:
            "
            Percentile 10: YYYY-MM-DD (oldest date)
            Percentile 20: YYYY-MM-DD
            Percentile 40: YYYY-MM-DD
            Percentile 60: YYYY-MM-DD
            Percentile 80: YYYY-MM-DD
            Percentile 90: YYYY-MM-DD (newest date)
            "
            """
        )
        # NOTE(review): parsing of the model's reply happens in
        # _date_prompt_to_forecast, which is not visible in this view.
        async with self._concurrency_limiter:
            return await self._date_prompt_to_forecast(question, prompt)
Loading