From 35ae05f8a971591e9698ac646bb860a883cb4486 Mon Sep 17 00:00:00 2001
From: Li Jiang <lijiang1@microsoft.com>
Date: Wed, 6 May 2026 18:12:37 +0800
Subject: [PATCH 1/2] chore: centralize default LLM model + harden benchmark
 exception handling

Two follow-ups intentionally deferred from PR #6:

1. **Centralize `DEFAULT_MODEL`** as the single source of truth across
   the Copilot-backed LLM modules. Previously every module duplicated the
   literal `"gpt-5.2"` default, which made silent drift possible (one
   module could lag behind a future model bump).

   - `featcopilot/utils/__init__.py`: re-export `DEFAULT_MODEL`.
   - `featcopilot/llm/copilot_client.py`: `CopilotConfig.model` &
     `CopilotFeatureClient.__init__` now default to `DEFAULT_MODEL`.
   - `featcopilot/llm/code_generator.py`,
     `featcopilot/llm/explainer.py`,
     `featcopilot/llm/transform_rule_generator.py`,
     `featcopilot/llm/semantic_engine.py` (config + `__init__`):
     same treatment.
   - `featcopilot/transformers/sklearn_compat.py`:
     `AutoFeatureEngineer._create_engine("llm")` falls back to
     `DEFAULT_MODEL` instead of a hardcoded literal.
   - Benchmark runners (`simple_models`, `automl`, `compare_tools`)
     pull the same constant in their `llm_config` builders.

   Behaviour unchanged: `DEFAULT_MODEL == "gpt-5.2"` still.
   OpenAI / LiteLLM clients intentionally keep `"gpt-4o"` (they use a
   different backend with its own public model identifier).

2. **Harden benchmark exception handling** in
   `benchmarks/simple_models/run_simple_models_benchmark.py`:

   - Replace the bare `except Exception` around per-fold FeatCopilot
     fit/transform with a narrowed `_EXPECTED_FE_FAILURES` list
     (`ValueError, KeyError, TypeError, RuntimeError, MemoryError,
     LinAlgError`) plus a separate `except Exception` branch that
     logs the full traceback via `logger.exception`. Genuine bugs
     (`AttributeError`, `ImportError`, ...) no longer hide behind a
     silent baseline-fallback.
   - Record per-fold failures in a new `fe_failed_folds` /
     `n_fe_failed_folds` field on the results dict so consumers can
     see when the `tabular_best_score` is a fair comparison vs a
     fallback-padded one.
   - Replace `print(...)` with `logger.warning` / `logger.exception`
     so failures show up in log aggregators / CI summaries.
   - Wilcoxon `except ValueError` now logs (was silently setting
     `p_value = 1.0`).
   - Top-level dataset-loop `except Exception` switched to
     `logger.exception` (was `print + traceback.print_exc`).
   - `_resolve_source` narrowed from `except Exception` to
     `(KeyError, ValueError, TypeError)`.

Tests:
- `tests/test_utils.py`: 3 new tests pinning `DEFAULT_MODEL` is
  re-exported from `featcopilot.utils`, every Copilot-backed LLM
  module / config defaults to `DEFAULT_MODEL`, and
  `AutoFeatureEngineer.llm_config` falls back to `DEFAULT_MODEL`.
- `tests/test_benchmark_encoding.py`: 4 new tests pinning the
  narrowed exception list, propagation of unexpected errors out of
  `_resolve_source`, the centralized model in `llm_config`, and the
  module-owned `logger` attribute.

Full pytest: 924 passed, 2 skipped.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 benchmarks/automl/run_automl_benchmark.py     |   3 +-
 .../compare_tools/run_fe_tools_comparison.py  |   3 +-
 .../run_simple_models_benchmark.py            | 118 ++++++++++++++++--
 featcopilot/llm/code_generator.py             |   8 +-
 featcopilot/llm/copilot_client.py             |  12 +-
 featcopilot/llm/explainer.py                  |   8 +-
 featcopilot/llm/semantic_engine.py            |  10 +-
 featcopilot/llm/transform_rule_generator.py   |   8 +-
 featcopilot/transformers/sklearn_compat.py    |   3 +-
 featcopilot/utils/__init__.py                 |   2 +
 tests/test_benchmark_encoding.py              |  86 +++++++++++++
 tests/test_utils.py                           |  54 ++++++++
 12 files changed, 285 insertions(+), 30 deletions(-)

diff --git a/benchmarks/automl/run_automl_benchmark.py b/benchmarks/automl/run_automl_benchmark.py
index d1993ca..3f28c72 100644
--- a/benchmarks/automl/run_automl_benchmark.py
+++ b/benchmarks/automl/run_automl_benchmark.py
@@ -79,6 +79,7 @@
     sanitize_feature_names,
     save_feature_cache,
 )
+from featcopilot.utils.models import DEFAULT_MODEL
 
 warnings.filterwarnings("ignore")
 
@@ -362,7 +363,7 @@ def get_featcopilot_engines(task: str, with_llm: bool) -> tuple[list[str], dict[
         engines.append("text")
     if with_llm:
         engines.append("llm")
-        return engines, {"model": "gpt-5.2", "max_suggestions": 20, "backend": "copilot"}
+        return engines, {"model": DEFAULT_MODEL, "max_suggestions": 20, "backend": "copilot"}
     return engines, None
 
 
diff --git a/benchmarks/compare_tools/run_fe_tools_comparison.py b/benchmarks/compare_tools/run_fe_tools_comparison.py
index 7817644..118f584 100644
--- a/benchmarks/compare_tools/run_fe_tools_comparison.py
+++ b/benchmarks/compare_tools/run_fe_tools_comparison.py
@@ -62,6 +62,7 @@
 )
 from benchmarks.splits import split_benchmark_data
 from featcopilot.utils.logger import get_logger  # noqa: E402
+from featcopilot.utils.models import DEFAULT_MODEL  # noqa: E402
 
 logger = get_logger(__name__)
 
@@ -1465,7 +1466,7 @@ def load_cache(output_path: Path) -> pd.DataFrame | None:
         tools_to_run = None  # None means run all available tools
 
     # Run benchmark
-    llm_config = {"model": "gpt-5.2", "max_suggestions": 20, "backend": "copilot"} if args.with_llm else None
+    llm_config = {"model": DEFAULT_MODEL, "max_suggestions": 20, "backend": "copilot"} if args.with_llm else None
     results = run_comparison_benchmark(
         dataset_names=dataset_names,
         tools=tools_to_run,
diff --git a/benchmarks/simple_models/run_simple_models_benchmark.py b/benchmarks/simple_models/run_simple_models_benchmark.py
index 1fd8f5e..317fcea 100644
--- a/benchmarks/simple_models/run_simple_models_benchmark.py
+++ b/benchmarks/simple_models/run_simple_models_benchmark.py
@@ -43,6 +43,7 @@
 
 import argparse
 import json
+import logging
 import time
 import warnings
 from datetime import datetime
@@ -79,12 +80,37 @@
     sanitize_feature_frames,
     sanitize_feature_names,
 )
+from featcopilot.utils.models import DEFAULT_MODEL
+
+# Module logger for surfacing exceptions that were previously swallowed.
+# We use the stdlib ``logging`` module here (rather than
+# ``featcopilot.utils.logger.get_logger``) so the benchmark stays usable
+# even before featcopilot is installed (e.g. the benchmark cache loader
+# imports this module from CI smoke tests). ``logging.basicConfig`` is
+# the consumer's responsibility.
+logger = logging.getLogger(__name__)
 
 # Default configuration
 DEFAULT_MAX_FEATURES = 100
 QUICK_DATASETS = ["titanic", "house_prices", "credit_risk", "bike_sharing", "customer_churn", "insurance_claims"]
 
 
+# Exception types we expect to encounter during per-fold feature
+# engineering. These come from sklearn / pandas / featcopilot validation
+# code paths and represent recoverable user-input issues (bad columns,
+# wrong dtypes, etc.). Anything else we surface via ``logger.exception``
+# so genuine bugs (e.g. ``AttributeError`` from a refactor regression)
+# don't get masked behind a benign-looking baseline-fallback.
+_EXPECTED_FE_FAILURES: tuple[type[BaseException], ...] = (
+    ValueError,
+    KeyError,
+    TypeError,
+    RuntimeError,
+    MemoryError,
+    np.linalg.LinAlgError,
+)
+
+
 # Markers that indicate a loader returned synthetic data despite the
 # registry tagging the dataset as real-world (e.g., a Kaggle/OpenML/HF
 # loader that fell back to a synthesized dataset because the upstream
@@ -148,7 +174,11 @@ def _resolve_source(result: dict) -> str:
         return "synthetic"
     try:
         return "real_world" if is_real_world(dataset) else "synthetic"
-    except Exception:
+    except (KeyError, ValueError, TypeError):
+        # ``is_real_world`` raises only ``KeyError`` for unknown datasets
+        # and ``ValueError``/``TypeError`` for invalid input; anything else
+        # is a genuine bug we want to surface rather than silently bucket
+        # as "synthetic".
         return "synthetic"
 
 
@@ -384,7 +414,7 @@ def get_featcopilot_engines(task: str, with_llm: bool) -> tuple[list[str], dict[
         engines.append("text")
     if with_llm:
         engines.append("llm")
-        return engines, {"model": "gpt-5.2", "max_suggestions": 20, "backend": "copilot"}
+        return engines, {"model": DEFAULT_MODEL, "max_suggestions": 20, "backend": "copilot"}
     return engines, None
 
 
@@ -597,6 +627,11 @@ def run_single_benchmark(
         fe_times = []
         n_features_generated = []
         engines_used: list[str] = []
+        # Track per-fold FeatCopilot failures so the silent baseline
+        # fallback is visible to consumers (previously the same broad
+        # ``except Exception`` would mask the failure rate behind an
+        # otherwise-healthy-looking results row).
+        fe_failed_folds: list[dict[str, Any]] = []
 
         seeds = [42 + i * 7 for i in range(n_seeds)]
         if not seeds:
@@ -667,13 +702,56 @@ def run_single_benchmark(
                     # would just produce ``["str", "str", ...]``.
                     if not engines_used:
                         engines_used = list(fold_engines)
-                except Exception as e:
-                    print(f"   FeatCopilot error on fold {fold_idx}: {e}")
+                except _EXPECTED_FE_FAILURES as e:
+                    # Recoverable per-fold failure (bad columns, wrong dtypes,
+                    # etc.). Fall back to baseline score and record the failure
+                    # so it shows up in the results dict.
+                    logger.warning(
+                        "FeatCopilot recoverable error on dataset=%s seed=%s fold=%s: %s: %s",
+                        dataset_name,
+                        seed,
+                        fold_idx,
+                        type(e).__name__,
+                        e,
+                    )
+                    fe_failed_folds.append(
+                        {
+                            "seed": seed,
+                            "fold": fold_idx,
+                            "error_type": type(e).__name__,
+                            "error_message": str(e),
+                            "expected": True,
+                        }
+                    )
                     tabular_fold_scores.append(best_baseline[primary_metric])
                     fe_times.append(0.0)
                     # Fall back to the (per-fold) baseline feature width since
                     # FeatCopilot didn't produce engineered features this fold.
                     n_features_generated.append(X_train.shape[1])
+                except Exception as e:
+                    # Unexpected error — surface the full traceback so genuine
+                    # bugs (e.g. a refactor regression raising ``AttributeError``)
+                    # don't get masked behind a silent baseline-fallback. We
+                    # still continue to the next fold so a single bad fold
+                    # doesn't poison the entire dataset run.
+                    logger.exception(
+                        "FeatCopilot UNEXPECTED error on dataset=%s seed=%s fold=%s",
+                        dataset_name,
+                        seed,
+                        fold_idx,
+                    )
+                    fe_failed_folds.append(
+                        {
+                            "seed": seed,
+                            "fold": fold_idx,
+                            "error_type": type(e).__name__,
+                            "error_message": str(e),
+                            "expected": False,
+                        }
+                    )
+                    tabular_fold_scores.append(best_baseline[primary_metric])
+                    fe_times.append(0.0)
+                    n_features_generated.append(X_train.shape[1])
 
         baseline_scores = np.array(baseline_fold_scores)
         tabular_scores = np.array(tabular_fold_scores)
@@ -689,7 +767,19 @@ def run_single_benchmark(
         if len(baseline_scores) >= 5 and not np.allclose(baseline_scores, tabular_scores):
             try:
                 _, p_value = stats.wilcoxon(tabular_scores, baseline_scores, alternative="two-sided")
-            except ValueError:
+            except ValueError as e:
+                # ``scipy.stats.wilcoxon`` raises ``ValueError`` when the input
+                # contains all-zero differences or insufficient non-zero pairs.
+                # Falling back to ``p_value = 1.0`` (no significance) is the
+                # right behaviour, but log so it doesn't look like a real
+                # null result. Anything other than ``ValueError`` is a bug
+                # we want to surface.
+                logger.warning(
+                    "Wilcoxon test failed for %s (n=%d), reporting p_value=1.0: %s",
+                    dataset_name,
+                    len(baseline_scores),
+                    e,
+                )
                 p_value = 1.0
 
         # Cast to native Python ``bool`` so the in-memory results dict is
@@ -726,15 +816,25 @@ def run_single_benchmark(
             "engines_used": engines_used,
             "baseline_fold_scores": baseline_scores.tolist(),
             "tabular_fold_scores": tabular_scores.tolist(),
+            # Per-fold FeatCopilot failure log. Empty list means every fold
+            # ran the engineered pipeline cleanly. Non-empty entries record
+            # the seed/fold, exception class, message, and whether the
+            # exception was an *expected* validation error (``expected=True``)
+            # or an *unexpected* bug (``expected=False``) so reviewers /
+            # report consumers can see at a glance whether the
+            # ``tabular_best_score`` is a fair comparison.
+            "fe_failed_folds": fe_failed_folds,
+            "n_fe_failed_folds": len(fe_failed_folds),
         }
 
         return results
 
     except Exception as e:
-        print(f"Error: {e}")
-        import traceback
-
-        traceback.print_exc()
+        # Top-level safety net: keep the benchmark loop alive when a single
+        # dataset fails so the rest of the suite still produces a report.
+        # Surface the full traceback (``logger.exception``) so unexpected
+        # failures don't look like a benign skip.
+        logger.exception("Dataset run failed for %s: %s", dataset_name, e)
         return None
 
 
diff --git a/featcopilot/llm/code_generator.py b/featcopilot/llm/code_generator.py
index 6e8358a..7c1139c 100644
--- a/featcopilot/llm/code_generator.py
+++ b/featcopilot/llm/code_generator.py
@@ -10,6 +10,7 @@
 
 from featcopilot.core.feature import Feature, FeatureOrigin, FeatureType
 from featcopilot.utils.logger import get_logger
+from featcopilot.utils.models import DEFAULT_MODEL
 
 logger = get_logger(__name__)
 
@@ -23,8 +24,9 @@ class FeatureCodeGenerator:
 
     Parameters
     ----------
-    model : str, default='gpt-5.2'
-        LLM model to use
+    model : str, optional
+        LLM model to use. Defaults to
+        :data:`featcopilot.utils.models.DEFAULT_MODEL`.
     validate : bool, default=True
         Whether to validate generated code
     backend : str, default='copilot'
@@ -45,7 +47,7 @@ class FeatureCodeGenerator:
 
     def __init__(
         self,
-        model: str = "gpt-5.2",
+        model: str = DEFAULT_MODEL,
         validate: bool = True,
         verbose: bool = False,
         backend: Literal["copilot", "litellm", "openai"] = "copilot",
diff --git a/featcopilot/llm/copilot_client.py b/featcopilot/llm/copilot_client.py
index 6353b3e..e82e638 100644
--- a/featcopilot/llm/copilot_client.py
+++ b/featcopilot/llm/copilot_client.py
@@ -12,6 +12,7 @@
 from pydantic import BaseModel, Field
 
 from featcopilot.utils.logger import get_logger
+from featcopilot.utils.models import DEFAULT_MODEL
 
 logger = get_logger(__name__)
 
@@ -19,7 +20,7 @@
 class CopilotConfig(BaseModel):
     """Configuration for Copilot client."""
 
-    model: str = Field(default="gpt-5.2", description="Model to use")
+    model: str = Field(default=DEFAULT_MODEL, description="Model to use")
     temperature: float = Field(default=0.3, ge=0, le=1, description="Temperature for generation")
     max_tokens: int = Field(default=4096, description="Maximum tokens in response")
     timeout: float = Field(default=60.0, description="Timeout in seconds")
@@ -40,12 +41,13 @@ class CopilotFeatureClient:
     ----------
     config : CopilotConfig, optional
         Configuration for the client
-    model : str, default='gpt-5.2'
-        Model to use for generation
+    model : str, optional
+        Model to use for generation. Defaults to
+        :data:`featcopilot.utils.models.DEFAULT_MODEL`.
 
     Examples
     --------
-    >>> client = CopilotFeatureClient(model='gpt-5.2')
+    >>> client = CopilotFeatureClient()  # uses DEFAULT_MODEL
     >>> await client.start()
     >>> suggestions = await client.suggest_features(
     ...     column_info={'age': 'int', 'income': 'float'},
@@ -54,7 +56,7 @@ class CopilotFeatureClient:
     >>> await client.stop()
     """
 
-    def __init__(self, config: CopilotConfig | None = None, model: str = "gpt-5.2", **kwargs):
+    def __init__(self, config: CopilotConfig | None = None, model: str = DEFAULT_MODEL, **kwargs):
         self.config = config or CopilotConfig(model=model, **kwargs)
         self._client = None
         self._session = None
diff --git a/featcopilot/llm/explainer.py b/featcopilot/llm/explainer.py
index e5ec3ca..d6e638e 100644
--- a/featcopilot/llm/explainer.py
+++ b/featcopilot/llm/explainer.py
@@ -9,6 +9,7 @@
 
 from featcopilot.core.feature import Feature, FeatureSet
 from featcopilot.utils.logger import get_logger
+from featcopilot.utils.models import DEFAULT_MODEL
 
 logger = get_logger(__name__)
 
@@ -22,8 +23,9 @@ class FeatureExplainer:
 
     Parameters
     ----------
-    model : str, default='gpt-5.2'
-        LLM model to use
+    model : str, optional
+        LLM model to use. Defaults to
+        :data:`featcopilot.utils.models.DEFAULT_MODEL`.
     backend : str, default='copilot'
         LLM backend to use: 'copilot', 'openai', or 'litellm'
     api_key : str, optional
@@ -39,7 +41,7 @@ class FeatureExplainer:
 
     def __init__(
         self,
-        model: str = "gpt-5.2",
+        model: str = DEFAULT_MODEL,
         verbose: bool = False,
         backend: Literal["copilot", "litellm", "openai"] = "copilot",
         api_key: str | None = None,
diff --git a/featcopilot/llm/semantic_engine.py b/featcopilot/llm/semantic_engine.py
index a935a74..527812b 100644
--- a/featcopilot/llm/semantic_engine.py
+++ b/featcopilot/llm/semantic_engine.py
@@ -12,6 +12,7 @@
 from featcopilot.core.base import BaseEngine, EngineConfig
 from featcopilot.core.feature import Feature, FeatureOrigin, FeatureSet, FeatureType
 from featcopilot.utils.logger import get_logger
+from featcopilot.utils.models import DEFAULT_MODEL
 
 logger = get_logger(__name__)
 
@@ -20,7 +21,7 @@ class SemanticEngineConfig(EngineConfig):
     """Configuration for semantic feature engine."""
 
     name: str = "SemanticEngine"
-    model: str = Field(default="gpt-5.2", description="LLM model to use")
+    model: str = Field(default=DEFAULT_MODEL, description="LLM model to use")
     max_suggestions: int = Field(default=20, description="Max features to suggest")
     validate_features: bool = Field(default=True, description="Validate generated code")
     domain: str | None = Field(default=None, description="Domain context")
@@ -53,8 +54,9 @@ class SemanticEngine(BaseEngine):
 
     Parameters
     ----------
-    model : str, default='gpt-5.2'
-        LLM model to use
+    model : str, optional
+        LLM model to use. Defaults to
+        :data:`featcopilot.utils.models.DEFAULT_MODEL`.
     max_suggestions : int, default=20
         Maximum number of features to suggest
     validate_features : bool, default=True
@@ -102,7 +104,7 @@ class SemanticEngine(BaseEngine):
 
     def __init__(
         self,
-        model: str = "gpt-5.2",
+        model: str = DEFAULT_MODEL,
         max_suggestions: int = 20,
         validate_features: bool = True,
         domain: str | None = None,
diff --git a/featcopilot/llm/transform_rule_generator.py b/featcopilot/llm/transform_rule_generator.py
index 395a245..413083b 100644
--- a/featcopilot/llm/transform_rule_generator.py
+++ b/featcopilot/llm/transform_rule_generator.py
@@ -13,6 +13,7 @@
 from featcopilot.core.transform_rule import TransformRule
 from featcopilot.stores.rule_store import TransformRuleStore
 from featcopilot.utils.logger import get_logger
+from featcopilot.utils.models import DEFAULT_MODEL
 
 logger = get_logger(__name__)
 
@@ -26,8 +27,9 @@ class TransformRuleGenerator:
 
     Parameters
     ----------
-    model : str, default='gpt-5.2'
-        LLM model to use
+    model : str, optional
+        LLM model to use. Defaults to
+        :data:`featcopilot.utils.models.DEFAULT_MODEL`.
     store : TransformRuleStore, optional
         Rule store for saving and retrieving rules
     validate : bool, default=True
@@ -51,7 +53,7 @@ class TransformRuleGenerator:
 
     def __init__(
         self,
-        model: str = "gpt-5.2",
+        model: str = DEFAULT_MODEL,
         store: TransformRuleStore | None = None,
         validate: bool = True,
         verbose: bool = False,
diff --git a/featcopilot/transformers/sklearn_compat.py b/featcopilot/transformers/sklearn_compat.py
index d23a46b..728f9ab 100644
--- a/featcopilot/transformers/sklearn_compat.py
+++ b/featcopilot/transformers/sklearn_compat.py
@@ -17,6 +17,7 @@
 from featcopilot.engines.timeseries import TimeSeriesEngine
 from featcopilot.selection.unified import FeatureSelector
 from featcopilot.utils.logger import get_logger
+from featcopilot.utils.models import DEFAULT_MODEL
 from featcopilot.utils.validation import find_potential_leakage_columns
 
 logger = get_logger(__name__)
@@ -364,7 +365,7 @@ def _create_engine(self, engine_name: str):
             from featcopilot.llm.semantic_engine import SemanticEngine
 
             return SemanticEngine(
-                model=self.llm_config.get("model", "gpt-5.2"),
+                model=self.llm_config.get("model", DEFAULT_MODEL),
                 max_suggestions=self.llm_config.get("max_suggestions", 20),
                 domain=self.llm_config.get("domain"),
                 verbose=self.verbose,
diff --git a/featcopilot/utils/__init__.py b/featcopilot/utils/__init__.py
index 8920ef4..76d12ac 100644
--- a/featcopilot/utils/__init__.py
+++ b/featcopilot/utils/__init__.py
@@ -2,6 +2,7 @@
 
 from featcopilot.utils.cache import FeatureCache
 from featcopilot.utils.models import (
+    DEFAULT_MODEL,
     fetch_models,
     get_default_model,
     get_model_info,
@@ -15,6 +16,7 @@
 __all__ = [
     "parallel_apply",
     "FeatureCache",
+    "DEFAULT_MODEL",
     "fetch_models",
     "list_models",
     "get_model_info",
diff --git a/tests/test_benchmark_encoding.py b/tests/test_benchmark_encoding.py
index 89fa52d..de5b9d6 100644
--- a/tests/test_benchmark_encoding.py
+++ b/tests/test_benchmark_encoding.py
@@ -909,3 +909,89 @@ def test_significant_field_is_native_python_bool():
     assert type(sig) is bool, f"significant must be native ``bool``, got {type(sig).__name__}"
     # And must be JSON-serializable directly without any custom encoder.
     json_mod.dumps({"significant": sig})
+
+
+# ---------------------------------------------------------------------------
+# Exception-handling hardening regression tests
+# ---------------------------------------------------------------------------
+
+
+def test_resolve_source_only_swallows_expected_exceptions():
+    """``_resolve_source`` must let unexpected exceptions propagate.
+
+    Pre-fix this used a bare ``except Exception`` which masked genuine bugs
+    (e.g. an ``AttributeError`` raised by a refactor regression in
+    ``is_real_world``). We now narrow to ``(KeyError, ValueError, TypeError)``
+    and let everything else propagate.
+    """
+    from unittest.mock import patch
+
+    from benchmarks.simple_models import run_simple_models_benchmark as bench
+
+    # Expected: KeyError on unknown dataset name → bucket as synthetic.
+    with patch.object(bench, "is_real_world", side_effect=KeyError("unknown")):
+        assert bench._resolve_source({"dataset": "totally_unknown"}) == "synthetic"
+
+    # Unexpected: an ``AttributeError`` (a real bug) MUST propagate, not be
+    # silently bucketed as "synthetic".
+    with patch.object(bench, "is_real_world", side_effect=AttributeError("bug!")):
+        try:
+            bench._resolve_source({"dataset": "titanic"})
+        except AttributeError:
+            pass
+        else:
+            raise AssertionError("AttributeError must propagate, not be swallowed")
+
+
+def test_expected_fe_failures_includes_recoverable_types():
+    """``_EXPECTED_FE_FAILURES`` covers the recoverable per-fold error types.
+
+    The benchmark's per-fold FeatCopilot try/except must distinguish
+    *expected* validation errors (which justify a silent baseline fallback)
+    from *unexpected* bugs (which deserve a full traceback). This test pins
+    the contract for the expected-error list.
+    """
+    from benchmarks.simple_models.run_simple_models_benchmark import (
+        _EXPECTED_FE_FAILURES,
+    )
+
+    # Recoverable user-input / validation failures.
+    assert ValueError in _EXPECTED_FE_FAILURES
+    assert KeyError in _EXPECTED_FE_FAILURES
+    assert TypeError in _EXPECTED_FE_FAILURES
+    assert RuntimeError in _EXPECTED_FE_FAILURES
+    # Genuine bugs MUST NOT be in the expected list — otherwise they'd be
+    # silently swallowed instead of surfaced via ``logger.exception``.
+    assert AttributeError not in _EXPECTED_FE_FAILURES
+    assert NameError not in _EXPECTED_FE_FAILURES
+    assert ImportError not in _EXPECTED_FE_FAILURES
+
+
+def test_benchmark_module_uses_centralized_default_model():
+    """Benchmarks pull the default Copilot model from ``DEFAULT_MODEL``.
+
+    Pre-fix every benchmark runner duplicated the literal ``"gpt-5.2"``
+    string in its ``llm_config`` builder. We now import the centralized
+    constant so a future model change is a single-line edit.
+    """
+    from benchmarks.simple_models.run_simple_models_benchmark import (
+        get_featcopilot_engines,
+    )
+    from featcopilot.utils.models import DEFAULT_MODEL
+
+    _, llm_config = get_featcopilot_engines("classification", with_llm=True)
+    assert llm_config is not None
+    assert llm_config["model"] == DEFAULT_MODEL
+
+
+def test_benchmark_logger_is_configured():
+    """The benchmark module exposes a stdlib logger for surfacing failures.
+
+    Previously the per-fold FeatCopilot try/except used ``print(...)``,
+    which made failures invisible to log aggregators / CI summaries.
+    This test pins that the module owns a ``logger`` attribute.
+    """
+    from benchmarks.simple_models import run_simple_models_benchmark as bench
+
+    assert hasattr(bench, "logger")
+    assert bench.logger.name == "benchmarks.simple_models.run_simple_models_benchmark"
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 9996895..b0093ea 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -431,6 +431,60 @@ def test_default_model_constant(self):
         """Test DEFAULT_MODEL constant value."""
         assert DEFAULT_MODEL == "gpt-5.2"
 
+    def test_default_model_exposed_from_utils_package(self):
+        """``DEFAULT_MODEL`` is exported from the ``featcopilot.utils`` namespace.
+
+        Pins the public re-export so downstream code can do
+        ``from featcopilot.utils import DEFAULT_MODEL`` without dipping
+        into the private ``featcopilot.utils.models`` module.
+        """
+        from featcopilot.utils import DEFAULT_MODEL as DEFAULT_MODEL_PUBLIC
+
+        assert DEFAULT_MODEL_PUBLIC == DEFAULT_MODEL
+
+    def test_llm_modules_share_default_model(self):
+        """All Copilot-backed LLM modules use ``DEFAULT_MODEL`` as their default.
+
+        Regression guard: previously every module duplicated the literal
+        ``"gpt-5.2"`` string, which made it possible to drift the default
+        across modules silently. We now expect every Copilot-backed
+        constructor / Pydantic config to default to ``DEFAULT_MODEL``.
+
+        OpenAI / LiteLLM clients are intentionally excluded — they
+        default to ``"gpt-4o"`` because that's a real public model
+        identifier for those backends.
+        """
+        from featcopilot.llm.code_generator import FeatureCodeGenerator
+        from featcopilot.llm.copilot_client import CopilotConfig, CopilotFeatureClient
+        from featcopilot.llm.explainer import FeatureExplainer
+        from featcopilot.llm.semantic_engine import SemanticEngine, SemanticEngineConfig
+        from featcopilot.llm.transform_rule_generator import TransformRuleGenerator
+
+        assert CopilotConfig().model == DEFAULT_MODEL
+        assert CopilotFeatureClient().config.model == DEFAULT_MODEL
+        assert FeatureCodeGenerator().model == DEFAULT_MODEL
+        assert FeatureExplainer().model == DEFAULT_MODEL
+        assert SemanticEngineConfig().model == DEFAULT_MODEL
+        assert TransformRuleGenerator().model == DEFAULT_MODEL
+        # SemanticEngine accepts the model in __init__ and stores it on
+        # its config; verify the default propagates end-to-end.
+        engine = SemanticEngine(validate_features=False)
+        assert engine.config.model == DEFAULT_MODEL
+
+    def test_auto_feature_engineer_llm_default_model(self):
+        """AutoFeatureEngineer.llm_config falls back to ``DEFAULT_MODEL``.
+
+        The internal ``_create_engine("llm")`` path used to hardcode
+        ``"gpt-5.2"`` as a fallback when ``llm_config`` was missing the
+        ``model`` key. We now expect the centralized default to be used
+        so a future model change in one place propagates everywhere.
+        """
+        from featcopilot.transformers.sklearn_compat import AutoFeatureEngineer
+
+        afe = AutoFeatureEngineer(llm_config={})  # missing "model" key
+        engine = afe._create_engine("llm")
+        assert engine.config.model == DEFAULT_MODEL
+
 
 class TestFetchModels:
     """Tests for fetch_models function."""

From e14a0785a339adf9452d9ea57c367b7b08f14fb0 Mon Sep 17 00:00:00 2001
From: Li Jiang <lijiang1@microsoft.com>
Date: Wed, 6 May 2026 19:14:14 +0800
Subject: [PATCH 2/2] chore: clarify benchmark logger comment (Copilot review)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The previous comment claimed the stdlib `logging` choice was about
'staying usable before featcopilot is installed' — but this PR added a
direct `from featcopilot.utils.models import DEFAULT_MODEL` right
above the comment, which contradicts that claim.

The actual motivation is that `featcopilot.utils.logger` sets
`propagate=False` on the `featcopilot.*` logger tree (see
`featcopilot/utils/logger.py:20`), which would block benchmark
output from reaching root-logger handlers configured by downstream
consumers (CI runners, log aggregators, `pytest --log-cli`).

Updated the comment to reflect that real motivation; no behavior change.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../simple_models/run_simple_models_benchmark.py    | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/benchmarks/simple_models/run_simple_models_benchmark.py b/benchmarks/simple_models/run_simple_models_benchmark.py
index 317fcea..146a9bf 100644
--- a/benchmarks/simple_models/run_simple_models_benchmark.py
+++ b/benchmarks/simple_models/run_simple_models_benchmark.py
@@ -83,11 +83,14 @@
 from featcopilot.utils.models import DEFAULT_MODEL
 
 # Module logger for surfacing exceptions that were previously swallowed.
-# We use the stdlib ``logging`` module here (rather than
-# ``featcopilot.utils.logger.get_logger``) so the benchmark stays usable
-# even before featcopilot is installed (e.g. the benchmark cache loader
-# imports this module from CI smoke tests). ``logging.basicConfig`` is
-# the consumer's responsibility.
+# We deliberately use the stdlib ``logging`` module here (rather than
+# ``featcopilot.utils.logger.get_logger``) because the latter sets
+# ``propagate=False`` on the ``featcopilot.*`` logger tree, which prevents
+# benchmark output from reaching root-logger handlers configured by
+# downstream consumers (CI runners, log aggregators, ``pytest --log-cli``).
+# A vanilla ``logging.getLogger(__name__)`` here keeps the benchmark output
+# routable through the consumer's normal logging configuration.
+# ``logging.basicConfig`` is the consumer's responsibility.
 logger = logging.getLogger(__name__)
 
 # Default configuration