thinkall · jianglibigdata · May 6, 2026 · May 6, 2026 · May 6, 2026
diff --git a/benchmarks/automl/run_automl_benchmark.py b/benchmarks/automl/run_automl_benchmark.py
@@ -79,6 +79,7 @@
     sanitize_feature_names,
     save_feature_cache,
 )
+from featcopilot.utils.models import DEFAULT_MODEL
 
 warnings.filterwarnings("ignore")
 
@@ -362,7 +363,7 @@ def get_featcopilot_engines(task: str, with_llm: bool) -> tuple[list[str], dict[
         engines.append("text")
     if with_llm:
         engines.append("llm")
-        return engines, {"model": "gpt-5.2", "max_suggestions": 20, "backend": "copilot"}
+        return engines, {"model": DEFAULT_MODEL, "max_suggestions": 20, "backend": "copilot"}
     return engines, None
 
 

diff --git a/benchmarks/compare_tools/run_fe_tools_comparison.py b/benchmarks/compare_tools/run_fe_tools_comparison.py
@@ -62,6 +62,7 @@
 )
 from benchmarks.splits import split_benchmark_data
 from featcopilot.utils.logger import get_logger  # noqa: E402
+from featcopilot.utils.models import DEFAULT_MODEL  # noqa: E402
 
 logger = get_logger(__name__)
 
@@ -1465,7 +1466,7 @@ def load_cache(output_path: Path) -> pd.DataFrame | None:
         tools_to_run = None  # None means run all available tools
 
     # Run benchmark
-    llm_config = {"model": "gpt-5.2", "max_suggestions": 20, "backend": "copilot"} if args.with_llm else None
+    llm_config = {"model": DEFAULT_MODEL, "max_suggestions": 20, "backend": "copilot"} if args.with_llm else None
     results = run_comparison_benchmark(
         dataset_names=dataset_names,
         tools=tools_to_run,

diff --git a/benchmarks/simple_models/run_simple_models_benchmark.py b/benchmarks/simple_models/run_simple_models_benchmark.py
@@ -43,6 +43,7 @@
 
 import argparse
 import json
+import logging
 import time
 import warnings
 from datetime import datetime
@@ -79,12 +80,40 @@
     sanitize_feature_frames,
     sanitize_feature_names,
 )
+from featcopilot.utils.models import DEFAULT_MODEL
+
+# Module logger for surfacing exceptions that were previously swallowed.
+# We deliberately use the stdlib ``logging`` module here (rather than
+# ``featcopilot.utils.logger.get_logger``) because the latter sets
+# ``propagate=False`` on the ``featcopilot.*`` logger tree, which prevents
+# benchmark output from reaching root-logger handlers configured by
+# downstream consumers (CI runners, log aggregators, ``pytest --log-cli``).
+# A vanilla ``logging.getLogger(__name__)`` here keeps the benchmark output
+# routable through the consumer's normal logging configuration.
+# ``logging.basicConfig`` is the consumer's responsibility.
+logger = logging.getLogger(__name__)
 
 # Default configuration
 DEFAULT_MAX_FEATURES = 100
 QUICK_DATASETS = ["titanic", "house_prices", "credit_risk", "bike_sharing", "customer_churn", "insurance_claims"]
 
 
+# Exception types we expect to encounter during per-fold feature
+# engineering. Most come from sklearn / pandas / featcopilot validation
+# code paths (bad columns, wrong dtypes, etc.); ``RuntimeError``,
+# ``MemoryError`` and ``LinAlgError`` cover runtime/numerical failures.
+# Anything else is logged via ``logger.exception`` (with traceback) so
+# genuine bugs (e.g. ``AttributeError``) aren't mistaken for benign ones.
+_EXPECTED_FE_FAILURES: tuple[type[BaseException], ...] = (
+    ValueError,
+    KeyError,
+    TypeError,
+    RuntimeError,
+    MemoryError,
+    np.linalg.LinAlgError,
+)
+
+
 # Markers that indicate a loader returned synthetic data despite the
 # registry tagging the dataset as real-world (e.g., a Kaggle/OpenML/HF
 # loader that fell back to a synthesized dataset because the upstream
@@ -148,7 +177,11 @@ def _resolve_source(result: dict) -> str:
         return "synthetic"
     try:
         return "real_world" if is_real_world(dataset) else "synthetic"
-    except Exception:
+    except (KeyError, ValueError, TypeError):
+        # ``is_real_world`` raises only ``KeyError`` for unknown datasets
+        # and ``ValueError``/``TypeError`` for invalid input; anything else
+        # is a genuine bug we want to surface rather than silently bucket
+        # as "synthetic".
         return "synthetic"
 
 
@@ -384,7 +417,7 @@ def get_featcopilot_engines(task: str, with_llm: bool) -> tuple[list[str], dict[
         engines.append("text")
     if with_llm:
         engines.append("llm")
-        return engines, {"model": "gpt-5.2", "max_suggestions": 20, "backend": "copilot"}
+        return engines, {"model": DEFAULT_MODEL, "max_suggestions": 20, "backend": "copilot"}
     return engines, None
 
 
@@ -597,6 +630,11 @@ def run_single_benchmark(
         fe_times = []
         n_features_generated = []
         engines_used: list[str] = []
+        # Track per-fold FeatCopilot failures so the baseline fallback
+        # is visible to consumers (previously a broad ``except
+        # Exception`` masked the failure rate behind an
+        # otherwise-healthy-looking results row).
+        fe_failed_folds: list[dict[str, Any]] = []
 
         seeds = [42 + i * 7 for i in range(n_seeds)]
         if not seeds:
@@ -667,13 +705,56 @@ def run_single_benchmark(
                     # would just produce ``["str", "str", ...]``.
                     if not engines_used:
                         engines_used = list(fold_engines)
-                except Exception as e:
-                    print(f"   FeatCopilot error on fold {fold_idx}: {e}")
+                except _EXPECTED_FE_FAILURES as e:
+                    # Recoverable per-fold failure (bad columns, wrong dtypes,
+                    # etc.). Fall back to baseline score and record the failure
+                    # so it shows up in the results dict.
+                    logger.warning(
+                        "FeatCopilot recoverable error on dataset=%s seed=%s fold=%s: %s: %s",
+                        dataset_name,
+                        seed,
+                        fold_idx,
+                        type(e).__name__,
+                        e,
+                    )
+                    fe_failed_folds.append(
+                        {
+                            "seed": seed,
+                            "fold": fold_idx,
+                            "error_type": type(e).__name__,
+                            "error_message": str(e),
+                            "expected": True,
+                        }
+                    )
                     tabular_fold_scores.append(best_baseline[primary_metric])
                     fe_times.append(0.0)
                     # Fall back to the (per-fold) baseline feature width since
                     # FeatCopilot didn't produce engineered features this fold.
                     n_features_generated.append(X_train.shape[1])
+                except Exception as e:
+                    # Unexpected error — surface the full traceback so genuine
+                    # bugs (e.g. a refactor regression raising ``AttributeError``)
+                    # don't get masked behind a silent baseline-fallback. We
+                    # still continue to the next fold so a single bad fold
+                    # doesn't poison the entire dataset run.
+                    logger.exception(
+                        "FeatCopilot UNEXPECTED error on dataset=%s seed=%s fold=%s",
+                        dataset_name,
+                        seed,
+                        fold_idx,
+                    )
+                    fe_failed_folds.append(
+                        {
+                            "seed": seed,
+                            "fold": fold_idx,
+                            "error_type": type(e).__name__,
+                            "error_message": str(e),
+                            "expected": False,
+                        }
+                    )
+                    tabular_fold_scores.append(best_baseline[primary_metric])
+                    fe_times.append(0.0)
+                    n_features_generated.append(X_train.shape[1])
 
         baseline_scores = np.array(baseline_fold_scores)
         tabular_scores = np.array(tabular_fold_scores)
@@ -689,7 +770,19 @@ def run_single_benchmark(
         if len(baseline_scores) >= 5 and not np.allclose(baseline_scores, tabular_scores):
             try:
                 _, p_value = stats.wilcoxon(tabular_scores, baseline_scores, alternative="two-sided")
-            except ValueError:
+            except ValueError as e:
+                # ``scipy.stats.wilcoxon`` raises ``ValueError`` when the input
+                # contains all-zero differences or insufficient non-zero pairs.
+                # Falling back to ``p_value = 1.0`` (no significance) is the
+                # right behaviour, but log so it doesn't look like a real
+                # null result. Anything other than ``ValueError`` is a bug
+                # we want to surface.
+                logger.warning(
+                    "Wilcoxon test failed for %s (n=%d), reporting p_value=1.0: %s",
+                    dataset_name,
+                    len(baseline_scores),
+                    e,
+                )
                 p_value = 1.0
 
         # Cast to native Python ``bool`` so the in-memory results dict is
@@ -726,15 +819,25 @@ def run_single_benchmark(
             "engines_used": engines_used,
             "baseline_fold_scores": baseline_scores.tolist(),
             "tabular_fold_scores": tabular_scores.tolist(),
+            # Per-fold FeatCopilot failure log. Empty list means every fold
+            # ran the engineered pipeline cleanly. Non-empty entries record
+            # the seed/fold, exception class, message, and whether the
+            # exception was an *expected* validation error (``expected=True``)
+            # or an *unexpected* bug (``expected=False``) so reviewers /
+            # report consumers can see at a glance whether the
+            # ``tabular_best_score`` is a fair comparison.
+            "fe_failed_folds": fe_failed_folds,
+            "n_fe_failed_folds": len(fe_failed_folds),
         }
 
         return results
 
     except Exception as e:
-        print(f"Error: {e}")
-        import traceback
-
-        traceback.print_exc()
+        # Top-level safety net: keep the benchmark loop alive when a single
+        # dataset fails so the rest of the suite still produces a report.
+        # Surface the full traceback (``logger.exception``) so unexpected
+        # failures don't look like a benign skip.
+        logger.exception("Dataset run failed for %s: %s", dataset_name, e)
         return None
 
 

diff --git a/featcopilot/llm/code_generator.py b/featcopilot/llm/code_generator.py
@@ -10,6 +10,7 @@
 
 from featcopilot.core.feature import Feature, FeatureOrigin, FeatureType
 from featcopilot.utils.logger import get_logger
+from featcopilot.utils.models import DEFAULT_MODEL
 
 logger = get_logger(__name__)
 
@@ -23,8 +24,9 @@ class FeatureCodeGenerator:
 
     Parameters
     ----------
-    model : str, default='gpt-5.2'
-        LLM model to use
+    model : str, optional
+        LLM model to use. Defaults to
+        :data:`featcopilot.utils.models.DEFAULT_MODEL`.
     validate : bool, default=True
         Whether to validate generated code
     backend : str, default='copilot'
@@ -45,7 +47,7 @@ class FeatureCodeGenerator:
 
     def __init__(
         self,
-        model: str = "gpt-5.2",
+        model: str = DEFAULT_MODEL,
         validate: bool = True,
         verbose: bool = False,
         backend: Literal["copilot", "litellm", "openai"] = "copilot",

diff --git a/featcopilot/llm/copilot_client.py b/featcopilot/llm/copilot_client.py
@@ -12,14 +12,15 @@
 from pydantic import BaseModel, Field
 
 from featcopilot.utils.logger import get_logger
+from featcopilot.utils.models import DEFAULT_MODEL
 
 logger = get_logger(__name__)
 
 
 class CopilotConfig(BaseModel):
     """Configuration for Copilot client."""
 
-    model: str = Field(default="gpt-5.2", description="Model to use")
+    model: str = Field(default=DEFAULT_MODEL, description="Model to use")
     temperature: float = Field(default=0.3, ge=0, le=1, description="Temperature for generation")
     max_tokens: int = Field(default=4096, description="Maximum tokens in response")
     timeout: float = Field(default=60.0, description="Timeout in seconds")
@@ -40,12 +41,13 @@ class CopilotFeatureClient:
     ----------
     config : CopilotConfig, optional
         Configuration for the client
-    model : str, default='gpt-5.2'
-        Model to use for generation
+    model : str, optional
+        Model to use for generation. Defaults to
+        :data:`featcopilot.utils.models.DEFAULT_MODEL`.
 
     Examples
     --------
-    >>> client = CopilotFeatureClient(model='gpt-5.2')
+    >>> client = CopilotFeatureClient()  # uses DEFAULT_MODEL
     >>> await client.start()
     >>> suggestions = await client.suggest_features(
     ...     column_info={'age': 'int', 'income': 'float'},
@@ -54,7 +56,7 @@ class CopilotFeatureClient:
     >>> await client.stop()
     """
 
-    def __init__(self, config: CopilotConfig | None = None, model: str = "gpt-5.2", **kwargs):
+    def __init__(self, config: CopilotConfig | None = None, model: str = DEFAULT_MODEL, **kwargs):
         self.config = config or CopilotConfig(model=model, **kwargs)
         self._client = None
         self._session = None

diff --git a/featcopilot/llm/explainer.py b/featcopilot/llm/explainer.py
@@ -9,6 +9,7 @@
 
 from featcopilot.core.feature import Feature, FeatureSet
 from featcopilot.utils.logger import get_logger
+from featcopilot.utils.models import DEFAULT_MODEL
 
 logger = get_logger(__name__)
 
@@ -22,8 +23,9 @@ class FeatureExplainer:
 
     Parameters
     ----------
-    model : str, default='gpt-5.2'
-        LLM model to use
+    model : str, optional
+        LLM model to use. Defaults to
+        :data:`featcopilot.utils.models.DEFAULT_MODEL`.
     backend : str, default='copilot'
         LLM backend to use: 'copilot', 'openai', or 'litellm'
     api_key : str, optional
@@ -39,7 +41,7 @@ class FeatureExplainer:
 
     def __init__(
         self,
-        model: str = "gpt-5.2",
+        model: str = DEFAULT_MODEL,
         verbose: bool = False,
         backend: Literal["copilot", "litellm", "openai"] = "copilot",
         api_key: str | None = None,

diff --git a/featcopilot/llm/semantic_engine.py b/featcopilot/llm/semantic_engine.py
@@ -12,6 +12,7 @@
 from featcopilot.core.base import BaseEngine, EngineConfig
 from featcopilot.core.feature import Feature, FeatureOrigin, FeatureSet, FeatureType
 from featcopilot.utils.logger import get_logger
+from featcopilot.utils.models import DEFAULT_MODEL
 
 logger = get_logger(__name__)
 
@@ -20,7 +21,7 @@ class SemanticEngineConfig(EngineConfig):
     """Configuration for semantic feature engine."""
 
     name: str = "SemanticEngine"
-    model: str = Field(default="gpt-5.2", description="LLM model to use")
+    model: str = Field(default=DEFAULT_MODEL, description="LLM model to use")
     max_suggestions: int = Field(default=20, description="Max features to suggest")
     validate_features: bool = Field(default=True, description="Validate generated code")
     domain: str | None = Field(default=None, description="Domain context")
@@ -53,8 +54,9 @@ class SemanticEngine(BaseEngine):
 
     Parameters
     ----------
-    model : str, default='gpt-5.2'
-        LLM model to use
+    model : str, optional
+        LLM model to use. Defaults to
+        :data:`featcopilot.utils.models.DEFAULT_MODEL`.
     max_suggestions : int, default=20
         Maximum number of features to suggest
     validate_features : bool, default=True
@@ -102,7 +104,7 @@ class SemanticEngine(BaseEngine):
 
     def __init__(
         self,
-        model: str = "gpt-5.2",
+        model: str = DEFAULT_MODEL,
         max_suggestions: int = 20,
         validate_features: bool = True,
         domain: str | None = None,

diff --git a/featcopilot/llm/transform_rule_generator.py b/featcopilot/llm/transform_rule_generator.py
@@ -13,6 +13,7 @@
 from featcopilot.core.transform_rule import TransformRule
 from featcopilot.stores.rule_store import TransformRuleStore
 from featcopilot.utils.logger import get_logger
+from featcopilot.utils.models import DEFAULT_MODEL
 
 logger = get_logger(__name__)
 
@@ -26,8 +27,9 @@ class TransformRuleGenerator:
 
     Parameters
     ----------
-    model : str, default='gpt-5.2'
-        LLM model to use
+    model : str, optional
+        LLM model to use. Defaults to
+        :data:`featcopilot.utils.models.DEFAULT_MODEL`.
     store : TransformRuleStore, optional
         Rule store for saving and retrieving rules
     validate : bool, default=True
@@ -51,7 +53,7 @@ class TransformRuleGenerator:
 
     def __init__(
         self,
-        model: str = "gpt-5.2",
+        model: str = DEFAULT_MODEL,
         store: TransformRuleStore | None = None,
         validate: bool = True,
         verbose: bool = False,