Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
199 changes: 148 additions & 51 deletions src/config.py
Original file line number Diff line number Diff line change
@@ -1,84 +1,181 @@
import functools
"""Configuration logic and schema definitions.

The `get_config` function provides access to the most recently loaded configuration.
A default configuration is loaded if none is explicitly set.

Use `set_config` to use a different configuration.
To parse a configuration from a file and environment variables, use `parse_config`.
Example of loading a configuration with a custom TOML and .env file:

```
config = parse_config(
dotenv_file=Path("path/to/.env"),
configuration_file=Path("path/to/config.toml")
)
set_config(config)
```
and then subsequent calls to `get_config` will return that configuration.
"""

import os
import tomllib
import typing
from pathlib import Path
from typing import Literal, cast

from dotenv import load_dotenv
from loguru import logger
from pydantic import AnyUrl, BaseModel, Field

TomlTable = dict[str, typing.Any]

CONFIG_DIRECTORY_ENV = "OPENML_REST_API_CONFIG_DIRECTORY"
CONFIG_FILE_ENV = "OPENML_REST_API_CONFIG_FILE"
DOTENV_FILE_ENV = "OPENML_REST_API_DOTENV_FILE"

OPENML_DB_USERNAME_ENV = "OPENML_DATABASES_OPENML_USERNAME"
OPENML_DB_PASSWORD_ENV = "OPENML_DATABASES_OPENML_PASSWORD" # noqa: S105 # not a password
EXPDB_DB_USERNAME_ENV = "OPENML_DATABASES_EXPDB_USERNAME"
EXPDB_DB_PASSWORD_ENV = "OPENML_DATABASES_EXPDB_PASSWORD" # noqa: S105 # not a password

_config_directory = Path(os.getenv(CONFIG_DIRECTORY_ENV, Path(__file__).parent))
_config_directory = _config_directory.expanduser().absolute()
_config_file = Path(os.getenv(CONFIG_FILE_ENV, _config_directory / "config.toml"))
_config_file = _config_file.expanduser().absolute()
_dotenv_file = Path(os.getenv(DOTENV_FILE_ENV, _config_directory / ".env"))
_dotenv_file = _dotenv_file.expanduser().absolute()
_config: Configuration | None = None
Comment thread
sourcery-ai[bot] marked this conversation as resolved.


logger.info(
"Determined configuration sources.",
configuration_directory=_config_directory,
configuration_file=_config_file,
dotenv_file=_dotenv_file,
)
# The reason we use a module variable instead of functools.cache
# is that this method allows a custom configuration to be set
# through `set_config` and subsequently loaded through `get_config`.
def get_config() -> Configuration:
    """Return the active configuration, loading the default sources on first use.

    If no configuration was set via `set_config`, one is parsed with
    `parse_config` and installed before being returned.
    """
    if _config is None:
        set_config(parse_config())
    return cast("Configuration", _config)
Comment thread
PGijsbers marked this conversation as resolved.

load_dotenv(dotenv_path=_dotenv_file)

def set_config(configuration: Configuration) -> None:
    """Install `configuration` as the one returned by subsequent `get_config` calls."""
    global _config  # noqa: PLW0603
    _config = configuration

def _apply_defaults_to_siblings(configuration: TomlTable) -> TomlTable:
    """Merge the `defaults` subtable into each sibling subtable.

    Dict-valued siblings get `defaults` applied with their own keys taking
    precedence; non-dict siblings pass through unchanged. The `defaults`
    entry itself is omitted from the result.
    """
    defaults = configuration["defaults"]
    merged: TomlTable = {}
    for name, value in configuration.items():
        if name == "defaults":
            continue
        merged[name] = (defaults | value) if isinstance(value, dict) else value
    return merged

class Configuration(BaseModel, frozen=True):
    """Top-level application configuration aggregating all sub-configurations."""

    # Connection settings for the "openml" (user) database.
    openml_database: DatabaseConfiguration
    # Connection settings for the "expdb" (experiment) database.
    expdb_database: DatabaseConfiguration
    development: DevelopmentConfiguration
    routing: RoutingConfiguration
    # One entry per log sink to register with loguru.
    logging: list[LoggingConfiguration]

@functools.cache
def _load_configuration(file: Path) -> TomlTable:
    """Parse `file` as TOML; cached so each path is read and parsed only once."""
    return tomllib.loads(file.read_text())

class DatabaseConfiguration(BaseModel, frozen=True):
    """Settings for one database connection."""

    host: str = Field(default="database", description="Database server host name")
    # Must be a positive port number; TOML string values are coerced by pydantic.
    port: int = Field(default=3306, gt=0)
    database: str = Field(description="Database name")
    username: str = Field(default="root")
    password: str = Field(default="ok")
    echo: bool = Field(
        default=False,
        description="https://docs.sqlalchemy.org/en/20/core/engines.html#sqlalchemy.create_engine.params.echo",
    )
    drivername: str = Field(
        default="mysql+aiomysql",
        description="SQLAlchemy `dialect` and `driver`: https://docs.sqlalchemy.org/en/20/dialects/index.html",
    )


class DevelopmentConfiguration(BaseModel, frozen=True):
    """Settings for development or test specific features."""

    # NOTE(review): presumably permits well-known test API keys for auth
    # in dev/test environments — confirm against the authentication code.
    allow_test_api_keys: bool = Field(default=False)

@functools.cache
def load_database_configuration(file: Path = _config_file) -> TomlTable:
configuration = _load_configuration(file)
database_configuration = _apply_defaults_to_siblings(
configuration["databases"],

class RoutingConfiguration(BaseModel, frozen=True):
    """URLs and path prefix used when constructing links to this service and MinIO."""

    root_path: str = Field(default="", description="Path prefix under which the service is hosted.")
    minio_url: AnyUrl = Field(description="URL to the MinIO server or service")
    server_url: AnyUrl = Field(
        description="URL to this server (excluding the path prefix of `fastapi.root_path`).",
    )
database_configuration["openml"]["username"] = os.environ.get(
OPENML_DB_USERNAME_ENV,
"root",


class LoggingConfiguration(BaseModel, frozen=True):
    """Configuration for a single log sink.

    You can add any arguments that `loguru.logger.add` allows,
    the `sink` will be used as first positional argument.
    See also: https://loguru.readthedocs.io/en/stable/api/logger.html
    """

    sink: str
    level: Literal["TRACE", "DEBUG", "INFO", "SUCCESS", "WARNING", "ERROR"]
    rotation: str | None = Field(
        default=None,
        description="Set rotation policy by date or file size.",
    )
    retention: str | None = Field(
        default=None,
        description="Timespan after which automatic cleanup occurs.",
    )
    compression: str | None = Field(default="gz")
    # Logs provided variables as JSON
    serialize: bool = Field(default=True)
    # Decouples log calls from I/O and makes it multiprocessing safe.
    enqueue: bool = Field(default=True)


def _db_env_credentials(alias: str) -> dict[str, str]:
return {
"username": os.environ.get(
f"OPENML_DATABASES_{alias.upper()}_USERNAME",
"root",
),
"password": os.environ.get(
f"OPENML_DATABASES_{alias.upper()}_PASSWORD",
"ok",
),
}


def parse_config(
    dotenv_file: Path | None = None,
    configuration_file: Path | None = None,
) -> Configuration:
    """Load configuration from file and environment variables.

    Parameters
    ----------
    dotenv_file:
        .env file loaded before reading credentials; defaults to the
        `OPENML_REST_API_DOTENV_FILE` environment variable, or
        `<config directory>/.env`.
    configuration_file:
        TOML configuration file; defaults to the `OPENML_REST_API_CONFIG_FILE`
        environment variable, or `<config directory>/config.toml`.

    The parsed configuration is returned but not used by default for other
    calls in this module; pass it to `set_config` to make it active.
    """
    # Local names intentionally do not shadow module-level `_config_*` variables.
    config_directory = Path(os.getenv(CONFIG_DIRECTORY_ENV, Path(__file__).parent))
    config_directory = config_directory.expanduser().absolute()
    logger.info(
        "Determined configuration directory to be {configuration_directory}.",
        configuration_directory=config_directory,
    )

    if not dotenv_file:
        dotenv_filepath = os.getenv(DOTENV_FILE_ENV, config_directory / ".env")
        dotenv_file = Path(dotenv_filepath).expanduser().absolute()
    logger.info(
        "Determined dotenv file path to be {dotenv_file}.",
        dotenv_file=dotenv_file,
    )
    # Credentials read by `_db_env_credentials` below may come from this file.
    load_dotenv(dotenv_file)

    if not configuration_file:
        config_filepath = os.getenv(CONFIG_FILE_ENV, config_directory / "config.toml")
        configuration_file = Path(config_filepath).expanduser().absolute()
    logger.info(
        "Determined config file path to be {config_file}.",
        config_file=configuration_file,
    )

    config = tomllib.loads(configuration_file.read_text())
    db_section = config["databases"]
    # Environment-provided credentials override any username/password in the TOML.
    openml_db = DatabaseConfiguration(**db_section["openml"], **_db_env_credentials("openml"))
    expdb_db = DatabaseConfiguration(**db_section["expdb"], **_db_env_credentials("expdb"))

    return Configuration(
        routing=RoutingConfiguration(**config["routing"]),
        logging=[
            LoggingConfiguration(**sink_configuration)
            for sink_configuration in config["logging"].values()
        ],
        openml_database=openml_db,
        expdb_database=expdb_db,
        development=DevelopmentConfiguration(**config["development"]),
    )
17 changes: 1 addition & 16 deletions src/config.toml
Original file line number Diff line number Diff line change
@@ -1,39 +1,24 @@
arff_base_url="https://test.openml.org"
minio_base_url="https://openml1.win.tue.nl"

[development]
allow_test_api_keys=true

# Any number of logging.NAME configurations can be added.
# NAME is for reference only, it has no meaning otherwise.
# You can add any arguments to `loguru.logger.add`,
# the `sink` variable will be used as first positional argument.
# https://loguru.readthedocs.io/en/stable/api/logger.html
[logging.develop]
sink="develop.log"
# One of loguru levels: TRACE, DEBUG, INFO, SUCCESS, WARNING, ERROR
level="DEBUG"
# Automatically create a new file by date or file size
rotation="50 MB"
# Retention specifies the timespan after which automatic cleanup occurs.
retention="1 day"
compression="gz"

[fastapi]
root_path=""

[databases.defaults]
host="database"
port="3306"
# SQLAlchemy `dialect` and `driver`: https://docs.sqlalchemy.org/en/20/dialects/index.html
drivername="mysql+aiomysql"

[databases.expdb]
database="openml_expdb"

[databases.openml]
database="openml"

[routing]
root_path=""
minio_url="http://minio:9000/"
server_url="http://php-api:80/"
6 changes: 3 additions & 3 deletions src/core/formatting.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import html
from typing import TYPE_CHECKING

from config import load_routing_configuration
from config import get_config
from schemas.datasets.openml import DatasetFileFormat

if TYPE_CHECKING:
def _format_parquet_url(dataset: Row) -> str | None:
    """Return the MinIO URL of the dataset's parquet file, or None for non-ARFF formats."""
    if dataset.format.lower() != DatasetFileFormat.ARFF:
        return None

    minio_base_url = get_config().routing.minio_url
    # Datasets are bucketed in groups of 10 000 ids, e.g. id 61 -> .../0000/0061/.
    ten_thousands_prefix = f"{dataset.did // 10_000:04d}"
    padded_id = f"{dataset.did:04d}"
    return f"{minio_base_url}datasets/{ten_thousands_prefix}/{padded_id}/dataset_{dataset.did}.pq"


def _format_dataset_url(dataset: Row) -> str:
    """Return the download URL for the dataset's original data file."""
    base_url = get_config().routing.server_url
    filename = f"{html.escape(dataset.name)}.{dataset.format.lower()}"
    # `filename` was computed but dropped from the URL in the garbled original;
    # restore it as the final path component.
    return f"{base_url}data/v1/download/{dataset.file_id}/{filename}"

Expand Down
24 changes: 12 additions & 12 deletions src/core/logging.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,31 +4,31 @@
import time
import uuid
from collections.abc import Awaitable, Callable
from pathlib import Path
from typing import TYPE_CHECKING

from loguru import logger

from config import load_configuration
from config import LoggingConfiguration

if TYPE_CHECKING:
from starlette.requests import Request
from starlette.responses import Response


def setup_log_sinks(*configurations: LoggingConfiguration) -> None:
    """Configure loguru based on app configuration.

    Each `LoggingConfiguration` is registered as one sink via `loguru.logger.add`,
    with `sink` used as the first positional argument. The literal string
    `"sys.stderr"` is translated to the actual stderr stream.
    """
    for sink_configuration in configurations:
        conf = sink_configuration.model_dump()
        logger.info("Configuring sink", **conf)
        sink = conf.pop("sink")
        if sink == "sys.stderr":
            sink = sys.stderr
            # Defaults may be provided for rotation and retention,
            # but they are not valid options for stderr logging.
            # Scoped to this branch so file sinks keep their configured
            # rotation/retention/compression.
            conf.pop("rotation", None)
            conf.pop("retention", None)
            conf.pop("compression", None)
        logger.add(sink, **conf)


async def add_request_context_to_log(
Expand Down
23 changes: 13 additions & 10 deletions src/database/setup.py
Original file line number Diff line number Diff line change
@@ -1,36 +1,39 @@
from sqlalchemy.engine import URL
from sqlalchemy.ext.asyncio import AsyncEngine, create_async_engine

from config import load_database_configuration
from config import DatabaseConfiguration, get_config

_user_engine = None
_expdb_engine = None


def _create_engine(db_config: DatabaseConfiguration) -> AsyncEngine:
    """Create an async SQLAlchemy engine from a database configuration."""
    db_url = URL.create(
        drivername=db_config.drivername,
        username=db_config.username,
        password=db_config.password,
        host=db_config.host,
        port=db_config.port,
        database=db_config.database,
    )
    return create_async_engine(
        db_url,
        echo=db_config.echo,
        # Recycle connections hourly to avoid server-side idle disconnects.
        pool_recycle=3600,
    )
Comment thread
PGijsbers marked this conversation as resolved.


def user_database() -> AsyncEngine:
    """Return the lazily-created engine for the user ("openml") database."""
    global _user_engine  # noqa: PLW0603
    if _user_engine is None:
        # Created on first use so configuration is resolved at call time.
        _user_engine = _create_engine(get_config().openml_database)
    return _user_engine


def expdb_database() -> AsyncEngine:
    """Return the lazily-created engine for the experiment ("expdb") database."""
    global _expdb_engine  # noqa: PLW0603
    if _expdb_engine is None:
        # Created on first use so configuration is resolved at call time.
        _expdb_engine = _create_engine(get_config().expdb_database)
    return _expdb_engine


Expand Down
Loading
Loading