Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion benchmarks/automl/run_automl_benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@
sanitize_feature_names,
save_feature_cache,
)
from featcopilot.utils.models import DEFAULT_MODEL

warnings.filterwarnings("ignore")

Expand Down Expand Up @@ -362,7 +363,7 @@ def get_featcopilot_engines(task: str, with_llm: bool) -> tuple[list[str], dict[
engines.append("text")
if with_llm:
engines.append("llm")
return engines, {"model": "gpt-5.2", "max_suggestions": 20, "backend": "copilot"}
return engines, {"model": DEFAULT_MODEL, "max_suggestions": 20, "backend": "copilot"}
return engines, None


Expand Down
3 changes: 2 additions & 1 deletion benchmarks/compare_tools/run_fe_tools_comparison.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@
)
from benchmarks.splits import split_benchmark_data
from featcopilot.utils.logger import get_logger # noqa: E402
from featcopilot.utils.models import DEFAULT_MODEL # noqa: E402

logger = get_logger(__name__)

Expand Down Expand Up @@ -1465,7 +1466,7 @@ def load_cache(output_path: Path) -> pd.DataFrame | None:
tools_to_run = None # None means run all available tools

# Run benchmark
llm_config = {"model": "gpt-5.2", "max_suggestions": 20, "backend": "copilot"} if args.with_llm else None
llm_config = {"model": DEFAULT_MODEL, "max_suggestions": 20, "backend": "copilot"} if args.with_llm else None
results = run_comparison_benchmark(
dataset_names=dataset_names,
tools=tools_to_run,
Expand Down
121 changes: 112 additions & 9 deletions benchmarks/simple_models/run_simple_models_benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@

import argparse
import json
import logging
import time
import warnings
from datetime import datetime
Expand Down Expand Up @@ -79,12 +80,40 @@
sanitize_feature_frames,
sanitize_feature_names,
)
from featcopilot.utils.models import DEFAULT_MODEL

# Module logger for surfacing exceptions that were previously swallowed.
# We deliberately use the stdlib ``logging`` module here (rather than
# ``featcopilot.utils.logger.get_logger``) because the latter sets
# ``propagate=False`` on the ``featcopilot.*`` logger tree, which prevents
# benchmark output from reaching root-logger handlers configured by
# downstream consumers (CI runners, log aggregators, ``pytest --log-cli``).
# A vanilla ``logging.getLogger(__name__)`` here keeps the benchmark output
# routable through the consumer's normal logging configuration.
# ``logging.basicConfig`` is the consumer's responsibility.
logger = logging.getLogger(__name__)

# Default configuration
DEFAULT_MAX_FEATURES = 100
QUICK_DATASETS = ["titanic", "house_prices", "credit_risk", "bike_sharing", "customer_churn", "insurance_claims"]


# Exception types treated as recoverable during per-fold feature
# engineering. ``ValueError`` / ``KeyError`` / ``TypeError`` typically
# come from sklearn / pandas / featcopilot validation code paths (bad
# columns, wrong dtypes, etc.); ``RuntimeError``, ``MemoryError`` and
# ``np.linalg.LinAlgError`` are listed as well, so resource-exhaustion
# and numerical failures in a single fold are handled the same way.
# Matching exceptions are logged at WARNING level and recorded with
# ``expected=True``; anything else is surfaced via ``logger.exception``
# so genuine bugs (e.g. ``AttributeError`` from a refactor regression)
# don't get masked behind a benign-looking baseline-fallback.
# NOTE(review): ``RuntimeError`` is broad -- confirm it is intended
# here, since a real bug raising it will be recorded as "expected".
_EXPECTED_FE_FAILURES: tuple[type[BaseException], ...] = (
ValueError,
KeyError,
TypeError,
RuntimeError,
MemoryError,
np.linalg.LinAlgError,
)


# Markers that indicate a loader returned synthetic data despite the
# registry tagging the dataset as real-world (e.g., a Kaggle/OpenML/HF
# loader that fell back to a synthesized dataset because the upstream
Expand Down Expand Up @@ -148,7 +177,11 @@ def _resolve_source(result: dict) -> str:
return "synthetic"
try:
return "real_world" if is_real_world(dataset) else "synthetic"
except Exception:
except (KeyError, ValueError, TypeError):
# ``is_real_world`` raises only ``KeyError`` for unknown datasets
# and ``ValueError``/``TypeError`` for invalid input; anything else
# is a genuine bug we want to surface rather than silently bucket
# as "synthetic".
return "synthetic"


Expand Down Expand Up @@ -384,7 +417,7 @@ def get_featcopilot_engines(task: str, with_llm: bool) -> tuple[list[str], dict[
engines.append("text")
if with_llm:
engines.append("llm")
return engines, {"model": "gpt-5.2", "max_suggestions": 20, "backend": "copilot"}
return engines, {"model": DEFAULT_MODEL, "max_suggestions": 20, "backend": "copilot"}
return engines, None


Expand Down Expand Up @@ -597,6 +630,11 @@ def run_single_benchmark(
fe_times = []
n_features_generated = []
engines_used: list[str] = []
# Track per-fold FeatCopilot failures so the silent baseline
# fallback is visible to consumers (previously the same broad
# ``except Exception`` would mask the failure rate behind an
# otherwise-healthy-looking results row).
fe_failed_folds: list[dict[str, Any]] = []

seeds = [42 + i * 7 for i in range(n_seeds)]
if not seeds:
Expand Down Expand Up @@ -667,13 +705,56 @@ def run_single_benchmark(
# would just produce ``["str", "str", ...]``.
if not engines_used:
engines_used = list(fold_engines)
except Exception as e:
print(f" FeatCopilot error on fold {fold_idx}: {e}")
except _EXPECTED_FE_FAILURES as e:
# Recoverable per-fold failure (bad columns, wrong dtypes,
# etc.). Fall back to baseline score and record the failure
# so it shows up in the results dict.
logger.warning(
"FeatCopilot recoverable error on dataset=%s seed=%s fold=%s: %s: %s",
dataset_name,
seed,
fold_idx,
type(e).__name__,
e,
)
fe_failed_folds.append(
{
"seed": seed,
"fold": fold_idx,
"error_type": type(e).__name__,
"error_message": str(e),
"expected": True,
}
)
tabular_fold_scores.append(best_baseline[primary_metric])
fe_times.append(0.0)
# Fall back to the (per-fold) baseline feature width since
# FeatCopilot didn't produce engineered features this fold.
n_features_generated.append(X_train.shape[1])
except Exception as e:
# Unexpected error — surface the full traceback so genuine
# bugs (e.g. a refactor regression raising ``AttributeError``)
# don't get masked behind a silent baseline-fallback. We
# still continue to the next fold so a single bad fold
# doesn't poison the entire dataset run.
logger.exception(
"FeatCopilot UNEXPECTED error on dataset=%s seed=%s fold=%s",
dataset_name,
seed,
fold_idx,
)
fe_failed_folds.append(
{
"seed": seed,
"fold": fold_idx,
"error_type": type(e).__name__,
"error_message": str(e),
"expected": False,
}
)
tabular_fold_scores.append(best_baseline[primary_metric])
fe_times.append(0.0)
n_features_generated.append(X_train.shape[1])

baseline_scores = np.array(baseline_fold_scores)
tabular_scores = np.array(tabular_fold_scores)
Expand All @@ -689,7 +770,19 @@ def run_single_benchmark(
if len(baseline_scores) >= 5 and not np.allclose(baseline_scores, tabular_scores):
try:
_, p_value = stats.wilcoxon(tabular_scores, baseline_scores, alternative="two-sided")
except ValueError:
except ValueError as e:
# ``scipy.stats.wilcoxon`` raises ``ValueError`` when the input
# contains all-zero differences or insufficient non-zero pairs.
# Falling back to ``p_value = 1.0`` (no significance) is the
# right behaviour, but log so it doesn't look like a real
# null result. Anything other than ``ValueError`` is a bug
# we want to surface.
logger.warning(
"Wilcoxon test failed for %s (n=%d), reporting p_value=1.0: %s",
dataset_name,
len(baseline_scores),
e,
)
p_value = 1.0

# Cast to native Python ``bool`` so the in-memory results dict is
Expand Down Expand Up @@ -726,15 +819,25 @@ def run_single_benchmark(
"engines_used": engines_used,
"baseline_fold_scores": baseline_scores.tolist(),
"tabular_fold_scores": tabular_scores.tolist(),
# Per-fold FeatCopilot failure log. Empty list means every fold
# ran the engineered pipeline cleanly. Non-empty entries record
# the seed/fold, exception class, message, and whether the
# exception was one of the *expected* recoverable types listed in
# ``_EXPECTED_FE_FAILURES`` (``expected=True``) or an *unexpected*
# bug (``expected=False``). Each failed fold contributes the
# baseline score to ``tabular_fold_scores``, so reviewers / report
# consumers can see at a glance whether the ``tabular_best_score``
# is a fair comparison.
"fe_failed_folds": fe_failed_folds,
"n_fe_failed_folds": len(fe_failed_folds),
}

return results

except Exception as e:
print(f"Error: {e}")
import traceback

traceback.print_exc()
# Top-level safety net: keep the benchmark loop alive when a single
# dataset fails so the rest of the suite still produces a report.
# Surface the full traceback (``logger.exception``) so unexpected
# failures don't look like a benign skip.
logger.exception("Dataset run failed for %s: %s", dataset_name, e)
return None


Expand Down
8 changes: 5 additions & 3 deletions featcopilot/llm/code_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

from featcopilot.core.feature import Feature, FeatureOrigin, FeatureType
from featcopilot.utils.logger import get_logger
from featcopilot.utils.models import DEFAULT_MODEL

logger = get_logger(__name__)

Expand All @@ -23,8 +24,9 @@ class FeatureCodeGenerator:

Parameters
----------
model : str, default='gpt-5.2'
LLM model to use
model : str, optional
LLM model to use. Defaults to
:data:`featcopilot.utils.models.DEFAULT_MODEL`.
validate : bool, default=True
Whether to validate generated code
backend : str, default='copilot'
Expand All @@ -45,7 +47,7 @@ class FeatureCodeGenerator:

def __init__(
self,
model: str = "gpt-5.2",
model: str = DEFAULT_MODEL,
validate: bool = True,
verbose: bool = False,
backend: Literal["copilot", "litellm", "openai"] = "copilot",
Expand Down
12 changes: 7 additions & 5 deletions featcopilot/llm/copilot_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,15 @@
from pydantic import BaseModel, Field

from featcopilot.utils.logger import get_logger
from featcopilot.utils.models import DEFAULT_MODEL

logger = get_logger(__name__)


class CopilotConfig(BaseModel):
"""Configuration for Copilot client."""

model: str = Field(default="gpt-5.2", description="Model to use")
model: str = Field(default=DEFAULT_MODEL, description="Model to use")
temperature: float = Field(default=0.3, ge=0, le=1, description="Temperature for generation")
max_tokens: int = Field(default=4096, description="Maximum tokens in response")
timeout: float = Field(default=60.0, description="Timeout in seconds")
Expand All @@ -40,12 +41,13 @@ class CopilotFeatureClient:
----------
config : CopilotConfig, optional
Configuration for the client
model : str, default='gpt-5.2'
Model to use for generation
model : str, optional
Model to use for generation. Defaults to
:data:`featcopilot.utils.models.DEFAULT_MODEL`.

Examples
--------
>>> client = CopilotFeatureClient(model='gpt-5.2')
>>> client = CopilotFeatureClient() # uses DEFAULT_MODEL
>>> await client.start()
>>> suggestions = await client.suggest_features(
... column_info={'age': 'int', 'income': 'float'},
Expand All @@ -54,7 +56,7 @@ class CopilotFeatureClient:
>>> await client.stop()
"""

def __init__(self, config: CopilotConfig | None = None, model: str = "gpt-5.2", **kwargs):
def __init__(self, config: CopilotConfig | None = None, model: str = DEFAULT_MODEL, **kwargs):
self.config = config or CopilotConfig(model=model, **kwargs)
self._client = None
self._session = None
Expand Down
8 changes: 5 additions & 3 deletions featcopilot/llm/explainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

from featcopilot.core.feature import Feature, FeatureSet
from featcopilot.utils.logger import get_logger
from featcopilot.utils.models import DEFAULT_MODEL

logger = get_logger(__name__)

Expand All @@ -22,8 +23,9 @@ class FeatureExplainer:

Parameters
----------
model : str, default='gpt-5.2'
LLM model to use
model : str, optional
LLM model to use. Defaults to
:data:`featcopilot.utils.models.DEFAULT_MODEL`.
backend : str, default='copilot'
LLM backend to use: 'copilot', 'openai', or 'litellm'
api_key : str, optional
Expand All @@ -39,7 +41,7 @@ class FeatureExplainer:

def __init__(
self,
model: str = "gpt-5.2",
model: str = DEFAULT_MODEL,
verbose: bool = False,
backend: Literal["copilot", "litellm", "openai"] = "copilot",
api_key: str | None = None,
Expand Down
10 changes: 6 additions & 4 deletions featcopilot/llm/semantic_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from featcopilot.core.base import BaseEngine, EngineConfig
from featcopilot.core.feature import Feature, FeatureOrigin, FeatureSet, FeatureType
from featcopilot.utils.logger import get_logger
from featcopilot.utils.models import DEFAULT_MODEL

logger = get_logger(__name__)

Expand All @@ -20,7 +21,7 @@ class SemanticEngineConfig(EngineConfig):
"""Configuration for semantic feature engine."""

name: str = "SemanticEngine"
model: str = Field(default="gpt-5.2", description="LLM model to use")
model: str = Field(default=DEFAULT_MODEL, description="LLM model to use")
max_suggestions: int = Field(default=20, description="Max features to suggest")
validate_features: bool = Field(default=True, description="Validate generated code")
domain: str | None = Field(default=None, description="Domain context")
Expand Down Expand Up @@ -53,8 +54,9 @@ class SemanticEngine(BaseEngine):

Parameters
----------
model : str, default='gpt-5.2'
LLM model to use
model : str, optional
LLM model to use. Defaults to
:data:`featcopilot.utils.models.DEFAULT_MODEL`.
max_suggestions : int, default=20
Maximum number of features to suggest
validate_features : bool, default=True
Expand Down Expand Up @@ -102,7 +104,7 @@ class SemanticEngine(BaseEngine):

def __init__(
self,
model: str = "gpt-5.2",
model: str = DEFAULT_MODEL,
max_suggestions: int = 20,
validate_features: bool = True,
domain: str | None = None,
Expand Down
8 changes: 5 additions & 3 deletions featcopilot/llm/transform_rule_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from featcopilot.core.transform_rule import TransformRule
from featcopilot.stores.rule_store import TransformRuleStore
from featcopilot.utils.logger import get_logger
from featcopilot.utils.models import DEFAULT_MODEL

logger = get_logger(__name__)

Expand All @@ -26,8 +27,9 @@ class TransformRuleGenerator:

Parameters
----------
model : str, default='gpt-5.2'
LLM model to use
model : str, optional
LLM model to use. Defaults to
:data:`featcopilot.utils.models.DEFAULT_MODEL`.
store : TransformRuleStore, optional
Rule store for saving and retrieving rules
validate : bool, default=True
Expand All @@ -51,7 +53,7 @@ class TransformRuleGenerator:

def __init__(
self,
model: str = "gpt-5.2",
model: str = DEFAULT_MODEL,
store: TransformRuleStore | None = None,
validate: bool = True,
verbose: bool = False,
Expand Down
Loading
Loading