From 35ae05f8a971591e9698ac646bb860a883cb4486 Mon Sep 17 00:00:00 2001 From: Li Jiang Date: Wed, 6 May 2026 18:12:37 +0800 Subject: [PATCH 1/2] chore: centralize default LLM model + harden benchmark exception handling Two follow-ups intentionally deferred from PR #6: 1. **Centralize `DEFAULT_MODEL`** as the single source of truth across the Copilot-backed LLM modules. Previously every module duplicated the literal `"gpt-5.2"` default, which made silent drift possible (one module could lag behind a future model bump). - `featcopilot/utils/__init__.py`: re-export `DEFAULT_MODEL`. - `featcopilot/llm/copilot_client.py`: `CopilotConfig.model` & `CopilotFeatureClient.__init__` now default to `DEFAULT_MODEL`. - `featcopilot/llm/code_generator.py`, `featcopilot/llm/explainer.py`, `featcopilot/llm/transform_rule_generator.py`, `featcopilot/llm/semantic_engine.py` (config + `__init__`): same treatment. - `featcopilot/transformers/sklearn_compat.py`: `AutoFeatureEngineer._create_engine("llm")` falls back to `DEFAULT_MODEL` instead of a hardcoded literal. - Benchmark runners (`simple_models`, `automl`, `compare_tools`) pull the same constant in their `llm_config` builders. Behaviour unchanged: `DEFAULT_MODEL == "gpt-5.2"` still. OpenAI / LiteLLM clients intentionally keep `"gpt-4o"` (they use a different backend with its own public model identifier). 2. **Harden benchmark exception handling** in `benchmarks/simple_models/run_simple_models_benchmark.py`: - Replace the bare `except Exception` around per-fold FeatCopilot fit/transform with a narrowed `_EXPECTED_FE_FAILURES` list (`ValueError, KeyError, TypeError, RuntimeError, MemoryError, LinAlgError`) plus a separate `except Exception` branch that logs the full traceback via `logger.exception`. Genuine bugs (`AttributeError`, `ImportError`, ...) no longer hide behind a silent baseline-fallback. - Record per-fold failures in a new `fe_failed_folds` / `n_fe_failed_folds` field on the results dict so consumers can see when the `tabular_best_score` is a fair comparison vs a fallback-padded one. - Replace `print(...)` with `logger.warning` / `logger.exception` so failures show up in log aggregators / CI summaries. - Wilcoxon `except ValueError` now logs (was silently setting `p_value = 1.0`). - Top-level dataset-loop `except Exception` switched to `logger.exception` (was `print + traceback.print_exc`). - `_resolve_source` narrowed from `except Exception` to `(KeyError, ValueError, TypeError)`. Tests: - `tests/test_utils.py`: 3 new tests pinning `DEFAULT_MODEL` is re-exported from `featcopilot.utils`, every Copilot-backed LLM module / config defaults to `DEFAULT_MODEL`, and `AutoFeatureEngineer.llm_config` falls back to `DEFAULT_MODEL`. - `tests/test_benchmark_encoding.py`: 4 new tests pinning the narrowed exception list, propagation of unexpected errors out of `_resolve_source`, the centralized model in `llm_config`, and the module-owned `logger` attribute. Full pytest: 924 passed, 2 skipped. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- benchmarks/automl/run_automl_benchmark.py | 3 +- .../compare_tools/run_fe_tools_comparison.py | 3 +- .../run_simple_models_benchmark.py | 118 ++++++++++++++++-- featcopilot/llm/code_generator.py | 8 +- featcopilot/llm/copilot_client.py | 12 +- featcopilot/llm/explainer.py | 8 +- featcopilot/llm/semantic_engine.py | 10 +- featcopilot/llm/transform_rule_generator.py | 8 +- featcopilot/transformers/sklearn_compat.py | 3 +- featcopilot/utils/__init__.py | 2 + tests/test_benchmark_encoding.py | 86 +++++++++++++ tests/test_utils.py | 54 ++++++++ 12 files changed, 285 insertions(+), 30 deletions(-) diff --git a/benchmarks/automl/run_automl_benchmark.py b/benchmarks/automl/run_automl_benchmark.py index d1993ca..3f28c72 100644 --- a/benchmarks/automl/run_automl_benchmark.py +++ b/benchmarks/automl/run_automl_benchmark.py @@ -79,6 +79,7 @@ sanitize_feature_names, save_feature_cache, ) +from featcopilot.utils.models import DEFAULT_MODEL warnings.filterwarnings("ignore") @@ -362,7 +363,7 @@ def get_featcopilot_engines(task: str, with_llm: bool) -> tuple[list[str], dict[ engines.append("text") if with_llm: engines.append("llm") - return engines, {"model": "gpt-5.2", "max_suggestions": 20, "backend": "copilot"} + return engines, {"model": DEFAULT_MODEL, "max_suggestions": 20, "backend": "copilot"} return engines, None diff --git a/benchmarks/compare_tools/run_fe_tools_comparison.py b/benchmarks/compare_tools/run_fe_tools_comparison.py index 7817644..118f584 100644 --- a/benchmarks/compare_tools/run_fe_tools_comparison.py +++ b/benchmarks/compare_tools/run_fe_tools_comparison.py @@ -62,6 +62,7 @@ ) from benchmarks.splits import split_benchmark_data from featcopilot.utils.logger import get_logger # noqa: E402 +from featcopilot.utils.models import DEFAULT_MODEL # noqa: E402 logger = get_logger(__name__) @@ -1465,7 +1466,7 @@ def load_cache(output_path: Path) -> pd.DataFrame | None: tools_to_run = None # None means run all available tools # Run benchmark - llm_config = {"model": "gpt-5.2", "max_suggestions": 20, "backend": "copilot"} if args.with_llm else None + llm_config = {"model": DEFAULT_MODEL, "max_suggestions": 20, "backend": "copilot"} if args.with_llm else None results = run_comparison_benchmark( dataset_names=dataset_names, tools=tools_to_run, diff --git a/benchmarks/simple_models/run_simple_models_benchmark.py b/benchmarks/simple_models/run_simple_models_benchmark.py index 1fd8f5e..317fcea 100644 --- a/benchmarks/simple_models/run_simple_models_benchmark.py +++ b/benchmarks/simple_models/run_simple_models_benchmark.py @@ -43,6 +43,7 @@ import argparse import json +import logging import time import warnings from datetime import datetime @@ -79,12 +80,37 @@ sanitize_feature_frames, sanitize_feature_names, ) +from featcopilot.utils.models import DEFAULT_MODEL + +# Module logger for surfacing exceptions that were previously swallowed. +# We use the stdlib ``logging`` module here (rather than +# ``featcopilot.utils.logger.get_logger``) so the benchmark stays usable +# even before featcopilot is installed (e.g. the benchmark cache loader +# imports this module from CI smoke tests). ``logging.basicConfig`` is +# the consumer's responsibility. +logger = logging.getLogger(__name__) # Default configuration DEFAULT_MAX_FEATURES = 100 QUICK_DATASETS = ["titanic", "house_prices", "credit_risk", "bike_sharing", "customer_churn", "insurance_claims"] +# Exception types we expect to encounter during per-fold feature +# engineering. These come from sklearn / pandas / featcopilot validation +# code paths and represent recoverable user-input issues (bad columns, +# wrong dtypes, etc.). Anything else we surface via ``logger.exception`` +# so genuine bugs (e.g. ``AttributeError`` from a refactor regression) +# don't get masked behind a benign-looking baseline-fallback. +_EXPECTED_FE_FAILURES: tuple[type[BaseException], ...] = ( + ValueError, + KeyError, + TypeError, + RuntimeError, + MemoryError, + np.linalg.LinAlgError, +) + + # Markers that indicate a loader returned synthetic data despite the # registry tagging the dataset as real-world (e.g., a Kaggle/OpenML/HF # loader that fell back to a synthesized dataset because the upstream @@ -148,7 +174,11 @@ def _resolve_source(result: dict) -> str: return "synthetic" try: return "real_world" if is_real_world(dataset) else "synthetic" - except Exception: + except (KeyError, ValueError, TypeError): + # ``is_real_world`` raises only ``KeyError`` for unknown datasets + # and ``ValueError``/``TypeError`` for invalid input; anything else + # is a genuine bug we want to surface rather than silently bucket + # as "synthetic". return "synthetic" @@ -384,7 +414,7 @@ def get_featcopilot_engines(task: str, with_llm: bool) -> tuple[list[str], dict[ engines.append("text") if with_llm: engines.append("llm") - return engines, {"model": "gpt-5.2", "max_suggestions": 20, "backend": "copilot"} + return engines, {"model": DEFAULT_MODEL, "max_suggestions": 20, "backend": "copilot"} return engines, None @@ -597,6 +627,11 @@ def run_single_benchmark( fe_times = [] n_features_generated = [] engines_used: list[str] = [] + # Track per-fold FeatCopilot failures so the silent baseline + # fallback is visible to consumers (previously the same broad + # ``except Exception`` would mask the failure rate behind an + # otherwise-healthy-looking results row). + fe_failed_folds: list[dict[str, Any]] = [] seeds = [42 + i * 7 for i in range(n_seeds)] if not seeds: @@ -667,13 +702,56 @@ def run_single_benchmark( # would just produce ``["str", "str", ...]``. if not engines_used: engines_used = list(fold_engines) - except Exception as e: - print(f" FeatCopilot error on fold {fold_idx}: {e}") + except _EXPECTED_FE_FAILURES as e: + # Recoverable per-fold failure (bad columns, wrong dtypes, + # etc.). Fall back to baseline score and record the failure + # so it shows up in the results dict. + logger.warning( + "FeatCopilot recoverable error on dataset=%s seed=%s fold=%s: %s: %s", + dataset_name, + seed, + fold_idx, + type(e).__name__, + e, + ) + fe_failed_folds.append( + { + "seed": seed, + "fold": fold_idx, + "error_type": type(e).__name__, + "error_message": str(e), + "expected": True, + } + ) tabular_fold_scores.append(best_baseline[primary_metric]) fe_times.append(0.0) # Fall back to the (per-fold) baseline feature width since # FeatCopilot didn't produce engineered features this fold. n_features_generated.append(X_train.shape[1]) + except Exception as e: + # Unexpected error — surface the full traceback so genuine + # bugs (e.g. a refactor regression raising ``AttributeError``) + # don't get masked behind a silent baseline-fallback. We + # still continue to the next fold so a single bad fold + # doesn't poison the entire dataset run. + logger.exception( + "FeatCopilot UNEXPECTED error on dataset=%s seed=%s fold=%s", + dataset_name, + seed, + fold_idx, + ) + fe_failed_folds.append( + { + "seed": seed, + "fold": fold_idx, + "error_type": type(e).__name__, + "error_message": str(e), + "expected": False, + } + ) + tabular_fold_scores.append(best_baseline[primary_metric]) + fe_times.append(0.0) + n_features_generated.append(X_train.shape[1]) baseline_scores = np.array(baseline_fold_scores) tabular_scores = np.array(tabular_fold_scores) @@ -689,7 +767,19 @@ def run_single_benchmark( if len(baseline_scores) >= 5 and not np.allclose(baseline_scores, tabular_scores): try: _, p_value = stats.wilcoxon(tabular_scores, baseline_scores, alternative="two-sided") - except ValueError: + except ValueError as e: + # ``scipy.stats.wilcoxon`` raises ``ValueError`` when the input + # contains all-zero differences or insufficient non-zero pairs. + # Falling back to ``p_value = 1.0`` (no significance) is the + # right behaviour, but log so it doesn't look like a real + # null result. Anything other than ``ValueError`` is a bug + # we want to surface. + logger.warning( + "Wilcoxon test failed for %s (n=%d), reporting p_value=1.0: %s", + dataset_name, + len(baseline_scores), + e, + ) p_value = 1.0 # Cast to native Python ``bool`` so the in-memory results dict is @@ -726,15 +816,25 @@ def run_single_benchmark( "engines_used": engines_used, "baseline_fold_scores": baseline_scores.tolist(), "tabular_fold_scores": tabular_scores.tolist(), + # Per-fold FeatCopilot failure log. Empty list means every fold + # ran the engineered pipeline cleanly. Non-empty entries record + # the seed/fold, exception class, message, and whether the + # exception was an *expected* validation error (``expected=True``) + # or an *unexpected* bug (``expected=False``) so reviewers / + # report consumers can see at a glance whether the + # ``tabular_best_score`` is a fair comparison. + "fe_failed_folds": fe_failed_folds, + "n_fe_failed_folds": len(fe_failed_folds), } return results except Exception as e: - print(f"Error: {e}") - import traceback - - traceback.print_exc() + # Top-level safety net: keep the benchmark loop alive when a single + # dataset fails so the rest of the suite still produces a report. + # Surface the full traceback (``logger.exception``) so unexpected + # failures don't look like a benign skip. + logger.exception("Dataset run failed for %s: %s", dataset_name, e) return None diff --git a/featcopilot/llm/code_generator.py b/featcopilot/llm/code_generator.py index 6e8358a..7c1139c 100644 --- a/featcopilot/llm/code_generator.py +++ b/featcopilot/llm/code_generator.py @@ -10,6 +10,7 @@ from featcopilot.core.feature import Feature, FeatureOrigin, FeatureType from featcopilot.utils.logger import get_logger +from featcopilot.utils.models import DEFAULT_MODEL logger = get_logger(__name__) @@ -23,8 +24,9 @@ class FeatureCodeGenerator: Parameters ---------- - model : str, default='gpt-5.2' - LLM model to use + model : str, optional + LLM model to use. Defaults to + :data:`featcopilot.utils.models.DEFAULT_MODEL`. validate : bool, default=True Whether to validate generated code backend : str, default='copilot' @@ -45,7 +47,7 @@ class FeatureCodeGenerator: def __init__( self, - model: str = "gpt-5.2", + model: str = DEFAULT_MODEL, validate: bool = True, verbose: bool = False, backend: Literal["copilot", "litellm", "openai"] = "copilot", diff --git a/featcopilot/llm/copilot_client.py b/featcopilot/llm/copilot_client.py index 6353b3e..e82e638 100644 --- a/featcopilot/llm/copilot_client.py +++ b/featcopilot/llm/copilot_client.py @@ -12,6 +12,7 @@ from pydantic import BaseModel, Field from featcopilot.utils.logger import get_logger +from featcopilot.utils.models import DEFAULT_MODEL logger = get_logger(__name__) @@ -19,7 +20,7 @@ class CopilotConfig(BaseModel): """Configuration for Copilot client.""" - model: str = Field(default="gpt-5.2", description="Model to use") + model: str = Field(default=DEFAULT_MODEL, description="Model to use") temperature: float = Field(default=0.3, ge=0, le=1, description="Temperature for generation") max_tokens: int = Field(default=4096, description="Maximum tokens in response") timeout: float = Field(default=60.0, description="Timeout in seconds") @@ -40,12 +41,13 @@ class CopilotFeatureClient: ---------- config : CopilotConfig, optional Configuration for the client - model : str, default='gpt-5.2' - Model to use for generation + model : str, optional + Model to use for generation. Defaults to + :data:`featcopilot.utils.models.DEFAULT_MODEL`. Examples -------- - >>> client = CopilotFeatureClient(model='gpt-5.2') + >>> client = CopilotFeatureClient() # uses DEFAULT_MODEL >>> await client.start() >>> suggestions = await client.suggest_features( ... column_info={'age': 'int', 'income': 'float'}, @@ -54,7 +56,7 @@ class CopilotFeatureClient: >>> await client.stop() """ - def __init__(self, config: CopilotConfig | None = None, model: str = "gpt-5.2", **kwargs): + def __init__(self, config: CopilotConfig | None = None, model: str = DEFAULT_MODEL, **kwargs): self.config = config or CopilotConfig(model=model, **kwargs) self._client = None self._session = None diff --git a/featcopilot/llm/explainer.py b/featcopilot/llm/explainer.py index e5ec3ca..d6e638e 100644 --- a/featcopilot/llm/explainer.py +++ b/featcopilot/llm/explainer.py @@ -9,6 +9,7 @@ from featcopilot.core.feature import Feature, FeatureSet from featcopilot.utils.logger import get_logger +from featcopilot.utils.models import DEFAULT_MODEL logger = get_logger(__name__) @@ -22,8 +23,9 @@ class FeatureExplainer: Parameters ---------- - model : str, default='gpt-5.2' - LLM model to use + model : str, optional + LLM model to use. Defaults to + :data:`featcopilot.utils.models.DEFAULT_MODEL`. backend : str, default='copilot' LLM backend to use: 'copilot', 'openai', or 'litellm' api_key : str, optional @@ -39,7 +41,7 @@ class FeatureExplainer: def __init__( self, - model: str = "gpt-5.2", + model: str = DEFAULT_MODEL, verbose: bool = False, backend: Literal["copilot", "litellm", "openai"] = "copilot", api_key: str | None = None, diff --git a/featcopilot/llm/semantic_engine.py b/featcopilot/llm/semantic_engine.py index a935a74..527812b 100644 --- a/featcopilot/llm/semantic_engine.py +++ b/featcopilot/llm/semantic_engine.py @@ -12,6 +12,7 @@ from featcopilot.core.base import BaseEngine, EngineConfig from featcopilot.core.feature import Feature, FeatureOrigin, FeatureSet, FeatureType from featcopilot.utils.logger import get_logger +from featcopilot.utils.models import DEFAULT_MODEL logger = get_logger(__name__) @@ -20,7 +21,7 @@ class SemanticEngineConfig(EngineConfig): """Configuration for semantic feature engine.""" name: str = "SemanticEngine" - model: str = Field(default="gpt-5.2", description="LLM model to use") + model: str = Field(default=DEFAULT_MODEL, description="LLM model to use") max_suggestions: int = Field(default=20, description="Max features to suggest") validate_features: bool = Field(default=True, description="Validate generated code") domain: str | None = Field(default=None, description="Domain context") @@ -53,8 +54,9 @@ class SemanticEngine(BaseEngine): Parameters ---------- - model : str, default='gpt-5.2' - LLM model to use + model : str, optional + LLM model to use. Defaults to + :data:`featcopilot.utils.models.DEFAULT_MODEL`. max_suggestions : int, default=20 Maximum number of features to suggest validate_features : bool, default=True @@ -102,7 +104,7 @@ class SemanticEngine(BaseEngine): def __init__( self, - model: str = "gpt-5.2", + model: str = DEFAULT_MODEL, max_suggestions: int = 20, validate_features: bool = True, domain: str | None = None, diff --git a/featcopilot/llm/transform_rule_generator.py b/featcopilot/llm/transform_rule_generator.py index 395a245..413083b 100644 --- a/featcopilot/llm/transform_rule_generator.py +++ b/featcopilot/llm/transform_rule_generator.py @@ -13,6 +13,7 @@ from featcopilot.core.transform_rule import TransformRule from featcopilot.stores.rule_store import TransformRuleStore from featcopilot.utils.logger import get_logger +from featcopilot.utils.models import DEFAULT_MODEL logger = get_logger(__name__) @@ -26,8 +27,9 @@ class TransformRuleGenerator: Parameters ---------- - model : str, default='gpt-5.2' - LLM model to use + model : str, optional + LLM model to use. Defaults to + :data:`featcopilot.utils.models.DEFAULT_MODEL`. store : TransformRuleStore, optional Rule store for saving and retrieving rules validate : bool, default=True @@ -51,7 +53,7 @@ class TransformRuleGenerator: def __init__( self, - model: str = "gpt-5.2", + model: str = DEFAULT_MODEL, store: TransformRuleStore | None = None, validate: bool = True, verbose: bool = False, diff --git a/featcopilot/transformers/sklearn_compat.py b/featcopilot/transformers/sklearn_compat.py index d23a46b..728f9ab 100644 --- a/featcopilot/transformers/sklearn_compat.py +++ b/featcopilot/transformers/sklearn_compat.py @@ -17,6 +17,7 @@ from featcopilot.engines.timeseries import TimeSeriesEngine from featcopilot.selection.unified import FeatureSelector from featcopilot.utils.logger import get_logger +from featcopilot.utils.models import DEFAULT_MODEL from featcopilot.utils.validation import find_potential_leakage_columns logger = get_logger(__name__) @@ -364,7 +365,7 @@ def _create_engine(self, engine_name: str): from featcopilot.llm.semantic_engine import SemanticEngine return SemanticEngine( - model=self.llm_config.get("model", "gpt-5.2"), + model=self.llm_config.get("model", DEFAULT_MODEL), max_suggestions=self.llm_config.get("max_suggestions", 20), domain=self.llm_config.get("domain"), verbose=self.verbose, diff --git a/featcopilot/utils/__init__.py b/featcopilot/utils/__init__.py index 8920ef4..76d12ac 100644 --- a/featcopilot/utils/__init__.py +++ b/featcopilot/utils/__init__.py @@ -2,6 +2,7 @@ from featcopilot.utils.cache import FeatureCache from featcopilot.utils.models import ( + DEFAULT_MODEL, fetch_models, get_default_model, get_model_info, @@ -15,6 +16,7 @@ __all__ = [ "parallel_apply", "FeatureCache", + "DEFAULT_MODEL", "fetch_models", "list_models", "get_model_info", diff --git a/tests/test_benchmark_encoding.py b/tests/test_benchmark_encoding.py index 89fa52d..de5b9d6 100644 --- a/tests/test_benchmark_encoding.py +++ b/tests/test_benchmark_encoding.py @@ -909,3 +909,89 @@ def test_significant_field_is_native_python_bool(): assert type(sig) is bool, f"significant must be native ``bool``, got {type(sig).__name__}" # And must be JSON-serializable directly without any custom encoder. json_mod.dumps({"significant": sig}) + + +# --------------------------------------------------------------------------- +# Exception-handling hardening regression tests +# --------------------------------------------------------------------------- + + +def test_resolve_source_only_swallows_expected_exceptions(): + """``_resolve_source`` must let unexpected exceptions propagate. + + Pre-fix this used a bare ``except Exception`` which masked genuine bugs + (e.g. an ``AttributeError`` raised by a refactor regression in + ``is_real_world``). We now narrow to ``(KeyError, ValueError, TypeError)`` + and let everything else propagate. + """ + from unittest.mock import patch + + from benchmarks.simple_models import run_simple_models_benchmark as bench + + # Expected: KeyError on unknown dataset name → bucket as synthetic. + with patch.object(bench, "is_real_world", side_effect=KeyError("unknown")): + assert bench._resolve_source({"dataset": "totally_unknown"}) == "synthetic" + + # Unexpected: an ``AttributeError`` (a real bug) MUST propagate, not be + # silently bucketed as "synthetic". + with patch.object(bench, "is_real_world", side_effect=AttributeError("bug!")): + try: + bench._resolve_source({"dataset": "titanic"}) + except AttributeError: + pass + else: + raise AssertionError("AttributeError must propagate, not be swallowed") + + +def test_expected_fe_failures_includes_recoverable_types(): + """``_EXPECTED_FE_FAILURES`` covers the recoverable per-fold error types. + + The benchmark's per-fold FeatCopilot try/except must distinguish + *expected* validation errors (which justify a silent baseline fallback) + from *unexpected* bugs (which deserve a full traceback). This test pins + the contract for the expected-error list. + """ + from benchmarks.simple_models.run_simple_models_benchmark import ( + _EXPECTED_FE_FAILURES, + ) + + # Recoverable user-input / validation failures. + assert ValueError in _EXPECTED_FE_FAILURES + assert KeyError in _EXPECTED_FE_FAILURES + assert TypeError in _EXPECTED_FE_FAILURES + assert RuntimeError in _EXPECTED_FE_FAILURES + # Genuine bugs MUST NOT be in the expected list — otherwise they'd be + # silently swallowed instead of surfaced via ``logger.exception``. + assert AttributeError not in _EXPECTED_FE_FAILURES + assert NameError not in _EXPECTED_FE_FAILURES + assert ImportError not in _EXPECTED_FE_FAILURES + + +def test_benchmark_module_uses_centralized_default_model(): + """Benchmarks pull the default Copilot model from ``DEFAULT_MODEL``. + + Pre-fix every benchmark runner duplicated the literal ``"gpt-5.2"`` + string in its ``llm_config`` builder. We now import the centralized + constant so a future model change is a single-line edit. + """ + from benchmarks.simple_models.run_simple_models_benchmark import ( + get_featcopilot_engines, + ) + from featcopilot.utils.models import DEFAULT_MODEL + + _, llm_config = get_featcopilot_engines("classification", with_llm=True) + assert llm_config is not None + assert llm_config["model"] == DEFAULT_MODEL + + +def test_benchmark_logger_is_configured(): + """The benchmark module exposes a stdlib logger for surfacing failures. + + Previously the per-fold FeatCopilot try/except used ``print(...)``, + which made failures invisible to log aggregators / CI summaries. + This test pins that the module owns a ``logger`` attribute. + """ + from benchmarks.simple_models import run_simple_models_benchmark as bench + + assert hasattr(bench, "logger") + assert bench.logger.name == "benchmarks.simple_models.run_simple_models_benchmark" diff --git a/tests/test_utils.py b/tests/test_utils.py index 9996895..b0093ea 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -431,6 +431,60 @@ def test_default_model_constant(self): """Test DEFAULT_MODEL constant value.""" assert DEFAULT_MODEL == "gpt-5.2" + def test_default_model_exposed_from_utils_package(self): + """``DEFAULT_MODEL`` is exported from the ``featcopilot.utils`` namespace. + + Pins the public re-export so downstream code can do + ``from featcopilot.utils import DEFAULT_MODEL`` without dipping + into the private ``featcopilot.utils.models`` module. + """ + from featcopilot.utils import DEFAULT_MODEL as DEFAULT_MODEL_PUBLIC + + assert DEFAULT_MODEL_PUBLIC == DEFAULT_MODEL + + def test_llm_modules_share_default_model(self): + """All Copilot-backed LLM modules use ``DEFAULT_MODEL`` as their default. + + Regression guard: previously every module duplicated the literal + ``"gpt-5.2"`` string, which made it possible to drift the default + across modules silently. We now expect every Copilot-backed + constructor / Pydantic config to default to ``DEFAULT_MODEL``. + + OpenAI / LiteLLM clients are intentionally excluded — they + default to ``"gpt-4o"`` because that's a real public model + identifier for those backends. + """ + from featcopilot.llm.code_generator import FeatureCodeGenerator + from featcopilot.llm.copilot_client import CopilotConfig, CopilotFeatureClient + from featcopilot.llm.explainer import FeatureExplainer + from featcopilot.llm.semantic_engine import SemanticEngine, SemanticEngineConfig + from featcopilot.llm.transform_rule_generator import TransformRuleGenerator + + assert CopilotConfig().model == DEFAULT_MODEL + assert CopilotFeatureClient().config.model == DEFAULT_MODEL + assert FeatureCodeGenerator().model == DEFAULT_MODEL + assert FeatureExplainer().model == DEFAULT_MODEL + assert SemanticEngineConfig().model == DEFAULT_MODEL + assert TransformRuleGenerator().model == DEFAULT_MODEL + # SemanticEngine accepts the model in __init__ and stores it on + # its config; verify the default propagates end-to-end. + engine = SemanticEngine(validate_features=False) + assert engine.config.model == DEFAULT_MODEL + + def test_auto_feature_engineer_llm_default_model(self): + """AutoFeatureEngineer.llm_config falls back to ``DEFAULT_MODEL``. + + The internal ``_create_engine("llm")`` path used to hardcode + ``"gpt-5.2"`` as a fallback when ``llm_config`` was missing the + ``model`` key. We now expect the centralized default to be used + so a future model change in one place propagates everywhere. + """ + from featcopilot.transformers.sklearn_compat import AutoFeatureEngineer + + afe = AutoFeatureEngineer(llm_config={}) # missing "model" key + engine = afe._create_engine("llm") + assert engine.config.model == DEFAULT_MODEL + class TestFetchModels: """Tests for fetch_models function.""" From e14a0785a339adf9452d9ea57c367b7b08f14fb0 Mon Sep 17 00:00:00 2001 From: Li Jiang Date: Wed, 6 May 2026 19:14:14 +0800 Subject: [PATCH 2/2] chore: clarify benchmark logger comment (Copilot review) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous comment claimed the stdlib `logging` choice was about 'staying usable before featcopilot is installed' — but this PR added a direct `from featcopilot.utils.models import DEFAULT_MODEL` right above the comment, which contradicts that claim. The actual motivation is that `featcopilot.utils.logger` sets `propagate=False` on the `featcopilot.*` logger tree (see `featcopilot/utils/logger.py:20`), which would block benchmark output from reaching root-logger handlers configured by downstream consumers (CI runners, log aggregators, `pytest --log-cli`). Updated the comment to reflect that real motivation; no behavior change. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../simple_models/run_simple_models_benchmark.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/benchmarks/simple_models/run_simple_models_benchmark.py b/benchmarks/simple_models/run_simple_models_benchmark.py index 317fcea..146a9bf 100644 --- a/benchmarks/simple_models/run_simple_models_benchmark.py +++ b/benchmarks/simple_models/run_simple_models_benchmark.py @@ -83,11 +83,14 @@ from featcopilot.utils.models import DEFAULT_MODEL # Module logger for surfacing exceptions that were previously swallowed. -# We use the stdlib ``logging`` module here (rather than -# ``featcopilot.utils.logger.get_logger``) so the benchmark stays usable -# even before featcopilot is installed (e.g. the benchmark cache loader -# imports this module from CI smoke tests). ``logging.basicConfig`` is -# the consumer's responsibility. +# We deliberately use the stdlib ``logging`` module here (rather than +# ``featcopilot.utils.logger.get_logger``) because the latter sets +# ``propagate=False`` on the ``featcopilot.*`` logger tree, which prevents +# benchmark output from reaching root-logger handlers configured by +# downstream consumers (CI runners, log aggregators, ``pytest --log-cli``). +# A vanilla ``logging.getLogger(__name__)`` here keeps the benchmark output +# routable through the consumer's normal logging configuration. +# ``logging.basicConfig`` is the consumer's responsibility. logger = logging.getLogger(__name__) # Default configuration