7 changes: 7 additions & 0 deletions README.ipynb
@@ -456,6 +456,13 @@
     "# Note: Make sure to set the 'FILE_WRITING_ALLOWED' environment variable to true if you want to save the benchmarks to a file\n"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Once you have benchmark files in your directory you can run `streamlit run scripts/benchmark_displayer.py` to get a UI with the benchmarks. This will allow you to see metrics, code of the bots, the actual bot responses, etc."
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -10,9 +10,9 @@
 )
 from code_tests.unit_tests.test_ai_models.models_to_test import ModelsToTest
 from code_tests.utilities_for_tests import coroutine_testing
-from forecasting_tools.ai_models.basic_model_interfaces.ai_model import AiModel
-from forecasting_tools.ai_models.gpto1preview import GptO1Preview
 from forecasting_tools.ai_models.metaculus4o import Gpt4oMetaculusProxy
+from forecasting_tools.ai_models.model_interfaces.ai_model import AiModel

 logger = logging.getLogger(__name__)

@@ -0,0 +1,21 @@
+import asyncio
+
+import pytest
+
+from code_tests.unit_tests.test_ai_models.models_to_test import (
+    GeneralLlmInstancesToTest,
+    ModelTest,
+)
+
+
+@pytest.mark.parametrize(
+    "test_name, test", GeneralLlmInstancesToTest().all_tests_with_names()
+)
+def test_general_llm_instances_run(
+    test_name: str,
+    test: ModelTest,
+) -> None:
+    model = test.llm
+    model_input = test.model_input
+    response = asyncio.run(model.invoke(model_input))
+    assert response is not None, "Response is None"
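In plain terms, each generated case builds a `GeneralLlm`, sends it one cheap input through `asyncio.run`, and checks that something came back. A standalone sketch of the same call pattern, assuming only the `GeneralLlm` import that appears in models_to_test.py later in this diff:

    import asyncio

    from forecasting_tools.ai_models.general_llm import GeneralLlm

    # same pattern the test uses: one cheap prompt, one awaited invoke
    response = asyncio.run(GeneralLlm(model="gpt-4o").invoke("Hi"))
    assert response is not None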
@@ -12,11 +12,12 @@
 from forecasting_tools.ai_models.ai_utils.response_types import (
     TextTokenCostResponse,
 )
-from forecasting_tools.ai_models.basic_model_interfaces.ai_model import AiModel
-from forecasting_tools.ai_models.basic_model_interfaces.incurs_cost import (
-    IncursCost,
+from forecasting_tools.ai_models.model_interfaces.ai_model import AiModel
+from forecasting_tools.ai_models.model_interfaces.combined_llm_archetype import (
+    CombinedLlmArchetype,
 )
-from forecasting_tools.ai_models.basic_model_interfaces.retryable_model import (
+from forecasting_tools.ai_models.model_interfaces.incurs_cost import IncursCost
+from forecasting_tools.ai_models.model_interfaces.retryable_model import (
     RetryableModel,
 )
 from forecasting_tools.ai_models.resource_managers.hard_limit_manager import (
@@ -124,6 +125,27 @@ def test_cost_manager_notices_cost_without_mocks(
     assert cost > 0, "No cost was incurred"


+@pytest.mark.parametrize("subclass", ModelsToTest.INCURS_COST_LIST)
+async def test_cost_calculated_matches_actual_cost(
+    subclass: type[AiModel],
+) -> None:
+    if not issubclass(subclass, CombinedLlmArchetype):
+        pytest.skip("Model does not have calculate_cost_from_tokens method")
+    model = subclass()
+    direct_response = await model._mockable_direct_call_to_model(
+        model._get_cheap_input_for_invoke()
+    )
+    assert isinstance(direct_response, TextTokenCostResponse)
+    actual_cost = direct_response.cost
+    calculated_cost = model.calculate_cost_from_tokens(
+        direct_response.prompt_tokens_used,
+        direct_response.completion_tokens_used,
+    )
+    assert calculated_cost == pytest.approx(
+        actual_cost
+    ), "Cost calculated does not match actual cost"
+
+
 @pytest.mark.parametrize("subclass", ModelsToTest.INCURS_COST_LIST)
 def test_cost_manager_notices_cost_with_mocks(
     mocker: Mock, subclass: type[AiModel]
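The new test above asserts that repricing the reported token counts reproduces the provider-reported cost. The diff never shows `calculate_cost_from_tokens` itself (it lives on `CombinedLlmArchetype`), but a minimal sketch of that kind of repricing, assuming litellm's public `model_cost` table and a hypothetical function name:

    from litellm import model_cost

    def reprice_tokens(model: str, prompt_tokens: int, completion_tokens: int) -> float:
        # model_cost maps model names to per-token prices, e.g.
        # model_cost["gpt-4o"]["input_cost_per_token"]
        prices = model_cost[model]
        return (
            prompt_tokens * prices["input_cost_per_token"]
            + completion_tokens * prices["output_cost_per_token"]
        )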

This file was deleted.

10 changes: 5 additions & 5 deletions code_tests/unit_tests/test_ai_models/ai_mock_manager.py
@@ -2,17 +2,17 @@
 import logging
 from unittest.mock import Mock

-from forecasting_tools.ai_models.basic_model_interfaces.ai_model import AiModel
-from forecasting_tools.ai_models.basic_model_interfaces.request_limited_model import (
+from forecasting_tools.ai_models.model_interfaces.ai_model import AiModel
+from forecasting_tools.ai_models.model_interfaces.request_limited_model import (
     RequestLimitedModel,
 )
-from forecasting_tools.ai_models.basic_model_interfaces.time_limited_model import (
+from forecasting_tools.ai_models.model_interfaces.time_limited_model import (
     TimeLimitedModel,
 )
-from forecasting_tools.ai_models.basic_model_interfaces.token_limited_model import (
+from forecasting_tools.ai_models.model_interfaces.token_limited_model import (
     TokenLimitedModel,
 )
-from forecasting_tools.ai_models.basic_model_interfaces.tokens_are_calculatable import (
+from forecasting_tools.ai_models.model_interfaces.tokens_are_calculatable import (
     TokensAreCalculatable,
 )
108 changes: 89 additions & 19 deletions code_tests/unit_tests/test_ai_models/models_to_test.py
@@ -1,34 +1,34 @@
 from litellm import model_cost

-from forecasting_tools.ai_models.basic_model_interfaces.ai_model import AiModel
-from forecasting_tools.ai_models.basic_model_interfaces.incurs_cost import (
-    IncursCost,
-)
-from forecasting_tools.ai_models.basic_model_interfaces.outputs_text import (
+from forecasting_tools.ai_models.ai_utils.openai_utils import VisionMessageData
+from forecasting_tools.ai_models.claude35sonnet import Claude35Sonnet
+from forecasting_tools.ai_models.deepseek_r1 import DeepSeekR1
+from forecasting_tools.ai_models.exa_searcher import ExaSearcher
+from forecasting_tools.ai_models.general_llm import GeneralLlm, ModelInputType
+from forecasting_tools.ai_models.gpt4o import Gpt4o
+from forecasting_tools.ai_models.gpt4ovision import Gpt4oVision
+from forecasting_tools.ai_models.gpto1 import GptO1
+from forecasting_tools.ai_models.metaculus4o import Gpt4oMetaculusProxy
+from forecasting_tools.ai_models.model_interfaces.ai_model import AiModel
+from forecasting_tools.ai_models.model_interfaces.incurs_cost import IncursCost
+from forecasting_tools.ai_models.model_interfaces.outputs_text import (
     OutputsText,
 )
-from forecasting_tools.ai_models.basic_model_interfaces.request_limited_model import (
+from forecasting_tools.ai_models.model_interfaces.request_limited_model import (
     RequestLimitedModel,
 )
-from forecasting_tools.ai_models.basic_model_interfaces.retryable_model import (
+from forecasting_tools.ai_models.model_interfaces.retryable_model import (
     RetryableModel,
 )
-from forecasting_tools.ai_models.basic_model_interfaces.time_limited_model import (
+from forecasting_tools.ai_models.model_interfaces.time_limited_model import (
     TimeLimitedModel,
 )
-from forecasting_tools.ai_models.basic_model_interfaces.token_limited_model import (
+from forecasting_tools.ai_models.model_interfaces.token_limited_model import (
     TokenLimitedModel,
 )
-from forecasting_tools.ai_models.basic_model_interfaces.tokens_incur_cost import (
+from forecasting_tools.ai_models.model_interfaces.tokens_incur_cost import (
     TokensIncurCost,
 )
-from forecasting_tools.ai_models.claude35sonnet import Claude35Sonnet
-from forecasting_tools.ai_models.deepseek_r1 import DeepSeekR1
-from forecasting_tools.ai_models.exa_searcher import ExaSearcher
-from forecasting_tools.ai_models.gpt4o import Gpt4o
-from forecasting_tools.ai_models.gpt4ovision import Gpt4oVision
-from forecasting_tools.ai_models.gpto1preview import GptO1Preview
-from forecasting_tools.ai_models.metaculus4o import Gpt4oMetaculusProxy
 from forecasting_tools.ai_models.perplexity import Perplexity

@@ -43,8 +43,8 @@ def litellm_has_model_cost(self, model: str) -> bool:
         Gpt4o,
         Gpt4oMetaculusProxy,
         Gpt4oVision,
-        GptO1Preview,
-        # GptO1, # TODO: dependencies do not yet support this
+        # GptO1Preview,
+        GptO1,
         Claude35Sonnet,
         Perplexity,
         ExaSearcher,
@@ -74,3 +74,73 @@
     TOKENS_INCUR_COST_LIST: list[type[TokensIncurCost]] = [
         model for model in ALL_MODELS if issubclass(model, TokensIncurCost)
     ]
+
+
+class ModelTest:
+    def __init__(self, llm: GeneralLlm, model_input: ModelInputType) -> None:
+        self.llm = llm
+        self.model_input = model_input
+
+
+class GeneralLlmInstancesToTest:
+    SMALL_BASE_64_IMAGE = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mNk+A8AAQUBAScY42YAAAAASUVORK5CYII="
+    CHEAP_VISION_MESSAGE_DATA = VisionMessageData(
+        prompt="Hi", b64_image=SMALL_BASE_64_IMAGE, image_resolution="low"
+    )
+
+    def _get_cheap_user_message(self) -> str:
+        return "Hi"
+
+    def _get_cheap_vision_message_data(self) -> VisionMessageData:
+        return self.CHEAP_VISION_MESSAGE_DATA
+
+    def _all_tests(self) -> list[ModelTest]:
+        return [
+            ModelTest(
+                GeneralLlm(model="gpt-4o"), self._get_cheap_user_message()
+            ),
+            ModelTest(
+                GeneralLlm(model="gpt-4o"),
+                self._get_cheap_vision_message_data(),
+            ),
+            ModelTest(
+                GeneralLlm(model="gpt-4o"),
+                [{"role": "user", "content": self._get_cheap_user_message()}],
+            ),
+            ModelTest(
+                GeneralLlm(model="o3-mini", reasoning_effort="low"),
+                self._get_cheap_user_message(),
+            ),
+            ModelTest(
+                GeneralLlm(model="metaculus/gpt-4o"),
+                self._get_cheap_user_message(),
+            ),
+            ModelTest(
+                GeneralLlm(model="metaculus/claude-3-5-sonnet-20241022"),
+                self._get_cheap_user_message(),
+            ),
+            ModelTest(
+                GeneralLlm(model="claude-3-5-sonnet-20241022"),
+                self._get_cheap_user_message(),
+            ),
+            ModelTest(
+                GeneralLlm(model="claude-3-5-sonnet-20241022"),
+                self._get_cheap_vision_message_data(),
+            ),
+            ModelTest(
+                GeneralLlm(model="perplexity/sonar-pro"),
+                self._get_cheap_user_message(),
+            ),
+            ModelTest(
+                GeneralLlm(model="deepseek/deepseek-reasoner"),
+                self._get_cheap_user_message(),
+            ),
+        ]
+
+    def all_tests_with_names(self) -> list[tuple[str, ModelTest]]:
+        tests = self._all_tests()
+        pairs = []
+        for test in tests:
+            input_type = type(test.model_input)
+            pairs.append((f"{test.llm.model}-{input_type.__name__}", test))
+        return pairs
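Together with the new test file earlier in this diff, those (name, test) pairs become the pytest parameter IDs. A quick illustrative sketch of what the pairing yields, assuming only the classes added above:

    from code_tests.unit_tests.test_ai_models.models_to_test import (
        GeneralLlmInstancesToTest,
    )

    # prints IDs like "gpt-4o-str", "gpt-4o-VisionMessageData", "gpt-4o-list"
    for name, test in GeneralLlmInstancesToTest().all_tests_with_names():
        print(name)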