diff --git a/README.md b/README.md index e4f3d4f..280db2d 100644 --- a/README.md +++ b/README.md @@ -110,6 +110,39 @@ for feature, explanation in engineer.explain_features().items(): print(f"{feature}: {explanation}") ``` +## Command-Line Interface + +FeatCopilot ships a `featcopilot` CLI for shell, scripting, and agentic +(LLM tool-use) workflows — no Python glue required. All subcommands accept +`--json` for machine-readable stdout; errors are written to stderr with a +non-zero exit code so agents can parse failures deterministically. + +```bash +# Discover capabilities (engines, selection methods, I/O formats) +featcopilot info --json + +# Run feature engineering on a CSV / JSON file +featcopilot transform \ + --input data.csv --target label --output features.csv \ + --engines tabular --max-features 50 --json + +# Inspect generated features (name, explanation, code) as JSON for an LLM +featcopilot explain --input data.csv --target label + +# Equivalent module form +python -m featcopilot info --json +``` + +Pass `--config config.json` to provide nested keys such as `llm_config`; +explicit CLI flags override values from the config file. + +> **Parquet I/O.** FeatCopilot's base install does not pin a parquet engine. +> To use `--input file.parquet` / `--output file.parquet` (or the +> `parquet` value in `--input-format` / `--output-format`), install one of +> `pyarrow` or `fastparquet`. `featcopilot info --json` reports +> `"parquet_available": true` only when an engine is importable in the +> current environment. + ## Engines ### Tabular Engine diff --git a/featcopilot/__main__.py b/featcopilot/__main__.py new file mode 100644 index 0000000..0cce4e0 --- /dev/null +++ b/featcopilot/__main__.py @@ -0,0 +1,6 @@ +"""Enable ``python -m featcopilot`` to dispatch to the CLI.""" + +from featcopilot.cli import main + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/featcopilot/cli.py b/featcopilot/cli.py new file mode 100644 index 0000000..6f019f5 --- /dev/null +++ b/featcopilot/cli.py @@ -0,0 +1,1473 @@ +""" +FeatCopilot command-line interface. + +Provides a stable, agent-friendly CLI for invoking FeatCopilot from shells, +notebooks, agentic workflows (e.g. Copilot/LLM tool-use), and CI pipelines +without writing Python glue code. + +Subcommands +----------- +info + Print version and supported engines/methods. Always machine-readable + when ``--json`` is passed. +transform + Run :class:`featcopilot.AutoFeatureEngineer` on a tabular input file + (CSV / Parquet / JSON) and write engineered features to an output file. + Emits a JSON status line on stdout when ``--json`` is passed so that + agents can parse the result deterministically. +explain + Fit the engineer and print a JSON document describing each generated + feature (name, explanation, code) for downstream LLM consumption. + +Examples +-------- +Agentic usage (machine-readable result on stdout, errors on stderr):: + + featcopilot info --json + featcopilot transform \\ + --input data.csv --target label --output features.csv \\ + --engines tabular --max-features 50 --json + featcopilot explain --input data.csv --target label --json + +Equivalent module invocation:: + + python -m featcopilot info --json + +Parquet I/O is supported only when ``pyarrow`` or ``fastparquet`` is +installed (FeatCopilot's base distribution does not pin either); ``info`` +reports the runtime availability via ``parquet_available``. +""" + +from __future__ import annotations + +import argparse +import contextlib +import json +import logging +import sys +import threading +import warnings +from pathlib import Path +from typing import Any + +from featcopilot import __version__ +from featcopilot.transformers.sklearn_compat import AutoFeatureEngineer +from featcopilot.utils.logger import get_logger + +logger = get_logger(__name__) + +SUPPORTED_INPUT_FORMATS = ("csv", "parquet", "json") +SUPPORTED_OUTPUT_FORMATS = ("csv", "parquet", "json") + + +def _parquet_engine_available() -> bool: + """Return ``True`` if a parquet engine (pyarrow or fastparquet) can be imported. + + FeatCopilot's base install pins neither ``pyarrow`` nor ``fastparquet``; + parquet I/O is therefore opportunistic. ``info`` uses this probe so the + machine-readable capability output reflects what will actually work in + the current environment, rather than always advertising parquet. + + Uses ``__import__`` (not ``importlib.util.find_spec``) so the probe is + *correct* even on environments with a broken native install: + ``find_spec`` only confirms a distribution is on ``sys.path``; it does + not prove the C extensions can actually load. A real import is the + only way to verify the engine is usable. + """ + for name in ("pyarrow", "fastparquet"): + try: + __import__(name) + return True + except Exception: # noqa: BLE001 - any import-time failure means unusable + continue + return False + + +def _detect_format(path: Path, override: str | None) -> str: + """Return one of ``SUPPORTED_INPUT_FORMATS`` for ``path``. + + Parameters + ---------- + path : pathlib.Path + File path whose suffix is inspected when ``override`` is ``None``. + override : str or None + Explicit format override (``csv`` / ``parquet`` / ``json``). + + Raises + ------ + ValueError + If the format cannot be determined or is not supported. + """ + if override is not None: + fmt = override.lower() + if fmt not in SUPPORTED_INPUT_FORMATS: + raise ValueError(f"Unsupported format {override!r}; expected one of {SUPPORTED_INPUT_FORMATS}") + return fmt + + suffix = path.suffix.lower().lstrip(".") + aliases = {"pq": "parquet", "parq": "parquet"} + fmt = aliases.get(suffix, suffix) + if fmt not in SUPPORTED_INPUT_FORMATS: + raise ValueError( + f"Cannot infer format from extension {path.suffix!r}; " + f"pass --input-format / --output-format (one of {SUPPORTED_INPUT_FORMATS})." + ) + return fmt + + +def _read_table(path: Path, fmt: str, *, nrows: int | None = None, suppress_truncation_warning: bool = False): + """Read a tabular file into a pandas DataFrame. + + All user-facing failure modes (missing parquet engine, ``--input`` + pointing at a directory, permission denied, malformed JSON/CSV, + decoding errors) are normalized into :class:`ValueError` so the CLI's + top-level handler routes them to the deterministic ``exit 2`` + user-error path. The generic ``exit 1`` backstop is reserved for + truly unexpected (i.e. CLI-internal) errors. + + Parameters + ---------- + path : pathlib.Path + File to read. + fmt : str + One of ``csv`` / ``parquet`` / ``json``. + nrows : int or None, optional + Cap the number of rows returned. For ``csv``, this is propagated + directly to :func:`pandas.read_csv` so the underlying read is + memory-bounded. For ``parquet`` and ``json``, pandas does not + expose a native row limit, so the file is fully read and then + truncated; a :class:`UserWarning` is issued in that case (unless + ``suppress_truncation_warning`` is true) so the caller knows the + bound is post-read (not memory-bounded). The ``nrows`` cap is + applied with a deterministic head slice so re-runs on the same + input produce the same metadata. + suppress_truncation_warning : bool, optional + When True, the post-read truncation notice (parquet / JSON only) + is *not* emitted from this helper. Used by callers that emit + their own consolidated, user-facing warning so users don't see + a confusing pair of messages — see ``_cmd_explain``'s + ``--explain-sample-size`` handling. + """ + import pandas as pd + + if path.is_dir(): + raise ValueError(f"--input expects a file, but {str(path)!r} is a directory.") + + if fmt == "csv": + try: + # ``nrows`` is the only memory-bound knob native to read_csv; + # passing it here is what lets ``--explain-sample-size`` actually + # cap memory on huge CSV inputs (rather than loading the entire + # file and then trimming). + df = pd.read_csv(path, nrows=nrows) + except ( + OSError, + pd.errors.ParserError, + pd.errors.EmptyDataError, + UnicodeDecodeError, + ) as exc: + # ``EmptyDataError`` fires for headerless / zero-byte CSVs; + # without it, those inputs would fall into the generic exit-1 + # "unexpected error" path instead of the documented exit-2 + # user-input error. + raise ValueError(f"Failed to read CSV from {str(path)!r}: {exc}") from exc + elif fmt == "parquet": + try: + df = pd.read_parquet(path) + except ImportError as exc: + raise ValueError( + f"Reading parquet requires a parquet engine (pyarrow or fastparquet); " + f"install one of them, or convert the input to CSV/JSON. Original error: {exc}" + ) from exc + except Exception as exc: + # Catch *any* backend failure (``OSError`` for I/O, + # ``pyarrow.lib.ArrowInvalid`` for corrupt files, + # ``ValueError`` from ``fastparquet`` for malformed metadata, + # etc.) and surface it via the deterministic exit-2 path. + # Catching ``Exception`` is appropriate here because the entire + # operation is delegated to a third-party backend; any error + # raised is by definition an I/O or data issue, not a CLI bug. + raise ValueError(f"Failed to read parquet from {str(path)!r}: {exc}") from exc + if nrows is not None and len(df) > nrows: + if not suppress_truncation_warning: + warnings.warn( + f"--explain-sample-size cap is applied post-read for parquet " + f"(loaded {len(df)} rows, truncating to {nrows}). pandas " + "does not expose a native parquet row-limit, so the full " + "file is materialized in memory before the cap. For hard " + "memory bounds on huge inputs, convert to CSV first.", + UserWarning, + stacklevel=2, + ) + df = df.iloc[:nrows] + elif fmt == "json": + # ``orient='records'`` is the agent-friendly default; fall back to + # pandas' auto-detection when the file isn't a records list. + try: + df = pd.read_json(path, orient="records") + except ValueError: + try: + df = pd.read_json(path) + except ValueError as exc: + raise ValueError(f"Failed to read JSON from {str(path)!r}: {exc}") from exc + except OSError as exc: + raise ValueError(f"Failed to read JSON from {str(path)!r}: {exc}") from exc + if nrows is not None and len(df) > nrows: + if not suppress_truncation_warning: + warnings.warn( + f"--explain-sample-size cap is applied post-read for JSON " + f"(loaded {len(df)} rows, truncating to {nrows}). pandas " + "does not expose a native JSON row-limit, so the full " + "file is materialized in memory before the cap. For hard " + "memory bounds on huge inputs, convert to CSV first.", + UserWarning, + stacklevel=2, + ) + df = df.iloc[:nrows] + else: + raise ValueError(f"Unsupported input format: {fmt}") + + # Reject "header-only" / empty inputs across every supported format. + # ``DataFrame.empty`` returns ``True`` for both zero-row AND + # zero-column frames, but those are very different user errors that + # warrant different remediation paths, so check the two cases + # explicitly. ``pd.read_csv`` returns an empty DataFrame (no + # exception) when the CSV has headers but zero data rows; the same + # goes for an empty parquet file or ``[]`` JSON body. Without this + # check, the CLI would pass an empty frame into ``TabularEngine``, + # which divides by ``len(X)`` while fitting categorical encoding and + # exits via the generic ``unexpected error`` path. Surface both + # cases as clean exit-2 user-input errors. + if len(df.columns) == 0: + raise ValueError( + f"Input file {str(path)!r} has no columns. " + "Feature engineering requires at least one input feature column " + "(e.g. a JSON array of ``{}`` objects, or a table that only " + "preserved an index, would hit this error)." + ) + if len(df) == 0: + raise ValueError( + f"Input file {str(path)!r} is empty (zero data rows). " + "Feature engineering requires at least one row of data." + ) + return df + + +def _write_table(df, path: Path, fmt: str) -> None: + """Write a pandas DataFrame to ``path`` in ``fmt``. + + All user-facing failure modes (missing parquet engine, ``--output`` + pointing at a directory, permission denied, parent-directory creation + failures) are normalized into :class:`ValueError` so the CLI surfaces a + clean stderr message via the standard ``exit 2`` path instead of the + generic ``exit 1`` "unexpected error" backstop. + """ + if path.exists() and path.is_dir(): + raise ValueError(f"--output expects a file, but {str(path)!r} is an existing directory.") + + try: + path.parent.mkdir(parents=True, exist_ok=True) + except OSError as exc: + raise ValueError(f"Cannot create parent directory for {str(path)!r}: {exc}") from exc + + if fmt == "csv": + try: + df.to_csv(path, index=False) + except OSError as exc: + raise ValueError(f"Failed to write CSV to {str(path)!r}: {exc}") from exc + elif fmt == "parquet": + try: + df.to_parquet(path, index=False) + except ImportError as exc: + raise ValueError( + f"Writing parquet requires a parquet engine (pyarrow or fastparquet); " + f"install one of them, or pick CSV/JSON via --output-format. Original error: {exc}" + ) from exc + except Exception as exc: + # Same broad-catch rationale as ``_read_table``: parquet write + # is fully delegated to a backend (``pyarrow``/``fastparquet``) + # whose errors include ``OSError`` (I/O), engine-specific type + # / conversion exceptions for unsupported column values, etc. + # All of these are user-facing data issues, not CLI bugs, so + # they should produce a clean exit-2 failure. + raise ValueError(f"Failed to write parquet to {str(path)!r}: {exc}") from exc + elif fmt == "json": + try: + df.to_json(path, orient="records", indent=2) + except OSError as exc: + raise ValueError(f"Failed to write JSON to {str(path)!r}: {exc}") from exc + else: + raise ValueError(f"Unsupported output format: {fmt}") + + +# Top-level keys recognized in a ``--config`` JSON file. The CLI rejects +# any other top-level key with a precise exit-2 error so typos like +# ``max_feature`` (no s) fail fast in automation rather than silently +# running with defaults. +_KNOWN_CONFIG_KEYS = frozenset( + { + "engines", + "selection_methods", + "max_features", + "correlation_threshold", + "leakage_guard", + "gate_n_jobs", + "llm_config", + "verbose", + "explain_sample_size", + } +) + + +def _load_config(config_path: str | None) -> dict[str, Any]: + """Load a JSON config file (or return an empty dict). + + Normalizes user-input mistakes (missing path, directory passed instead + of a file, invalid JSON, non-object root, unknown top-level keys) into + :class:`ValueError` / :class:`FileNotFoundError` so the CLI's top-level + error handler can route them all to the deterministic ``exit 2`` + user-error path (rather than e.g. ``IsADirectoryError`` falling into + the generic ``exit 1`` "unexpected error" backstop, or a typo silently + being ignored). + """ + if config_path is None: + return {} + path = Path(config_path) + if not path.exists(): + raise FileNotFoundError(f"Config file not found: {config_path}") + if path.is_dir(): + raise ValueError(f"--config expects a JSON file, but {config_path!r} is a directory.") + try: + with path.open("r", encoding="utf-8") as fh: + data = json.load(fh) + except json.JSONDecodeError as exc: + raise ValueError(f"Config file {config_path!r} is not valid JSON: {exc}") from exc + except OSError as exc: + # Catch-all for unreadable files (permission denied, broken symlink, + # etc.). Surface as a user-facing error rather than the generic + # exit-1 backstop. + raise ValueError(f"Config file {config_path!r} could not be read: {exc}") from exc + if not isinstance(data, dict): + raise ValueError(f"Config file {config_path!r} must contain a JSON object at the top level") + unknown = sorted(set(data.keys()) - _KNOWN_CONFIG_KEYS) + if unknown: + raise ValueError( + f"Config file {config_path!r} has unknown top-level key(s): {unknown}. " + f"Recognized keys: {sorted(_KNOWN_CONFIG_KEYS)}." + ) + return data + + +def _emit(payload: dict[str, Any], *, as_json: bool, stream=None) -> None: + """Emit a payload to stdout, JSON-encoded when ``as_json`` is true.""" + stream = stream if stream is not None else sys.stdout + if as_json: + stream.write(json.dumps(payload, default=str, sort_keys=True)) + stream.write("\n") + else: + for key, value in payload.items(): + stream.write(f"{key}: {value}\n") + stream.flush() + + +def _check_scalar_type( + name: str, + value: Any, + expected: tuple[type, ...], + *, + allow_none: bool = False, + allow_bool: bool = True, +) -> None: + """Validate a scalar value's type for ``--config``-supplied keys. + + Raises :class:`ValueError` (caught by ``main()`` -> exit 2) when the + value's type does not match. ``bool`` is a subclass of ``int`` in + Python; pass ``allow_bool=False`` to reject ``True``/``False`` for + numeric-only fields like ``max_features`` / ``correlation_threshold``. + """ + if value is None: + if allow_none: + return + raise ValueError(f"`{name}` must not be null in --config") + if not allow_bool and isinstance(value, bool): + raise ValueError( + f"`{name}` in --config must be a {' or '.join(t.__name__ for t in expected)}; " f"got bool={value!r}." + ) + if not isinstance(value, expected): + raise ValueError( + f"`{name}` in --config must be a {' or '.join(t.__name__ for t in expected)}; " + f"got {type(value).__name__}={value!r}." + ) + + +def _build_engineer(args: argparse.Namespace, *, include_selection_config: bool = True) -> AutoFeatureEngineer: + """Construct an :class:`AutoFeatureEngineer` from parsed CLI args. + + Precedence: explicit CLI flags override values from ``--config``; + explicit config values (including empty lists) override the defaults. + Empty / non-list values are propagated unchanged so that + :meth:`AutoFeatureEngineer._validate_configuration` produces its + canonical (and deterministic) error path — the CLI's wrapper must not + silently rewrite a misconfigured config into something that looks + different from what the user wrote. + + ``include_selection_config=False`` (used by the ``explain`` subcommand) + skips reading selection-only config keys (``selection_methods``, + ``correlation_threshold``) so a shared config file with selection + settings does not cause ``explain`` to fail config-validation for keys + that are inert at runtime (selection is disabled in ``explain``). + """ + config = _load_config(args.config) + + def pick(flag_value, config_key, default): + # Explicit CLI flag wins. Otherwise honor an explicit config entry + # — even a falsy one such as ``[]`` — so AutoFeatureEngineer can + # raise its own clear "must contain at least one" error rather than + # the CLI silently swapping in defaults. Only fall back to the + # default when the key is *absent* from the config. + if flag_value is not None: + return flag_value + if config_key in config: + return config[config_key] + return default + + engines = pick(args.engines, "engines", ["tabular"]) + # ``explain`` exposes ``--engines`` and ``--max-features`` (engine-level + # caps) but not the selection-only flags ``--selection-methods`` and + # ``--correlation-threshold``. When ``include_selection_config`` is + # False (i.e. we're called from ``explain``) we also skip reading the + # selection-only keys from the config file, so a shared transform/explain + # config with selection settings won't trip ``explain`` over keys that + # have no effect on its runtime behavior. + if include_selection_config: + selection_methods = pick( + getattr(args, "selection_methods", None), + "selection_methods", + ["mutual_info", "importance"], + ) + correlation_threshold = pick(getattr(args, "correlation_threshold", None), "correlation_threshold", 0.85) + else: + selection_methods = ["mutual_info", "importance"] + correlation_threshold = 0.85 + max_features = pick(args.max_features, "max_features", None) + leakage_guard = pick(args.leakage_guard, "leakage_guard", "warn") + gate_n_jobs = pick(args.gate_n_jobs, "gate_n_jobs", 1) + + # Type-check scalar config fields here so the CLI surfaces a clean + # exit-2 error instead of a downstream ``TypeError`` (e.g. from + # ``self.max_features <= 0`` when the JSON config supplied a string). + # ``argparse`` already enforces types for the flag side; this only + # guards against malformed ``--config`` JSON. + _check_scalar_type("max_features", max_features, (int,), allow_none=True, allow_bool=False) + _check_scalar_type("correlation_threshold", correlation_threshold, (int, float), allow_bool=False) + _check_scalar_type("gate_n_jobs", gate_n_jobs, (int,), allow_bool=False) + _check_scalar_type("leakage_guard", leakage_guard, (str,)) + + # Range-check ``correlation_threshold``: it's only meaningful in + # ``[0.0, 1.0]``. Values above 1 silently disable redundancy + # elimination (``FeatureSelector.fit`` only runs it when threshold + # < 1.0); values below 0 effectively treat every numeric pair as + # redundant. Reject out-of-range up front so the CLI doesn't quietly + # change selector behavior. + if not (0.0 <= float(correlation_threshold) <= 1.0): + raise ValueError(f"`correlation_threshold` must be in the range [0.0, 1.0]; got {correlation_threshold!r}.") + # ``max_features`` must be positive when set (matches + # AutoFeatureEngineer's own validation). Surface that here too so + # the message says ``max_features`` rather than the more cryptic + # transformer error. + if max_features is not None and max_features <= 0: + raise ValueError(f"`max_features` must be a positive integer when set; got {max_features!r}.") + + # Validate ``llm_config`` is a JSON object (i.e. a Python dict) before + # forwarding it. Without this check, a misconfigured non-dict value + # would only fail at engine-construction time inside + # ``AutoFeatureEngineer._create_engine`` via ``self.llm_config.get(...)``, + # raising an ``AttributeError`` that bypasses the structured exit-2 + # user-error path (the CLI would surface it as exit 1 "unexpected + # error", which is a poor agent contract for a documented config key). + llm_config_raw = config.get("llm_config") + if llm_config_raw is None: + llm_config: dict[str, Any] = {} + elif isinstance(llm_config_raw, dict): + llm_config = llm_config_raw + else: + raise ValueError( + "`llm_config` in the --config file must be a JSON object (mapping); " + f"got {type(llm_config_raw).__name__}={llm_config_raw!r}." + ) + + # ``verbose`` is type-checked before being forwarded so a malformed + # config like ``{"verbose": "false"}`` (truthy string) does NOT silently + # turn verbose mode on — instead it raises a clean exit-2 error + # consistent with the other scalar fields. ``args.verbose`` is already + # a bool / None thanks to ``BooleanOptionalAction``; only the config + # path can introduce a non-bool. + verbose_raw = pick(args.verbose, "verbose", False) + _check_scalar_type("verbose", verbose_raw, (bool,)) + verbose = bool(verbose_raw) + + # Pass ``engines`` / ``selection_methods`` through *unchanged* (no + # ``list(...)`` wrapping). Coercion would convert a misconfigured + # JSON string like ``"tabular"`` into ``['t','a','b','u','l','a','r']``, + # turning a clear type error into a confusing "Unknown engines" path. + # AutoFeatureEngineer.__init__ rejects non-list/tuple inputs with a + # precise message — let it. + return AutoFeatureEngineer( + engines=engines, + max_features=max_features, + selection_methods=selection_methods, + correlation_threshold=correlation_threshold, + llm_config=llm_config, + verbose=verbose, + leakage_guard=leakage_guard, + gate_n_jobs=gate_n_jobs, + ) + + +def _split_xy(df, target: str | None): + """Split a DataFrame into ``(X, y)``; ``y`` is ``None`` when no target.""" + if target is None: + return df, None + if target not in df.columns: + raise ValueError( + f"Target column {target!r} not found in input. " + f"Available columns: {list(df.columns)[:20]}{'...' if len(df.columns) > 20 else ''}" + ) + y = df[target] + X = df.drop(columns=[target]) + return X, y + + +def _cmd_info(args: argparse.Namespace) -> int: + """Print version + supported engines/methods. + + Parquet appears in ``supported_input_formats`` / ``supported_output_formats`` + only when an actual parquet engine (``pyarrow`` or ``fastparquet``) can + be imported in the current environment — otherwise the ``info`` output + would advertise a format that immediately fails on use, which is + misleading for the agentic capability-discovery the CLI is designed to + support. + """ + parquet_ok = _parquet_engine_available() + input_formats = [f for f in SUPPORTED_INPUT_FORMATS if f != "parquet" or parquet_ok] + output_formats = [f for f in SUPPORTED_OUTPUT_FORMATS if f != "parquet" or parquet_ok] + payload = { + "version": __version__, + "supported_engines": sorted(AutoFeatureEngineer.SUPPORTED_ENGINES), + "supported_selection_methods": sorted(AutoFeatureEngineer.SUPPORTED_SELECTION_METHODS), + "supported_leakage_guards": sorted(AutoFeatureEngineer.SUPPORTED_LEAKAGE_GUARDS), + "supported_input_formats": input_formats, + "supported_output_formats": output_formats, + "parquet_available": parquet_ok, + } + _emit(payload, as_json=args.json) + return 0 + + +def _fit_transform_capturing_warnings(engineer, X, y, **kwargs): + """Thin convenience wrapper: run ``engineer.fit_transform(X, y, **kwargs)`` + inside a single ``_capture_featcopilot_messages()`` block. + + This helper is *not* used by the CLI subcommands themselves anymore + — they wrap a wider region (read + fit_transform + write) in one + capture so warnings emitted during pandas / pyarrow read or write + phases are also surfaced via the JSON ``warnings`` field instead of + leaking onto stderr. It is preserved as a public-ish helper for + external test code that just wants to capture messages around a + plain ``fit_transform`` call. + + Returns + ------- + (messages, result) + ``messages`` is a list of ``str`` (warnings then logs, in + emission order). ``result`` is whatever ``fit_transform`` returned. + """ + with _capture_featcopilot_messages() as captured: + result = engineer.fit_transform(X, y, **kwargs) + return captured, result + + +class _ThreadCaptureState: + """Holds per-thread capture *stacks* with strict per-thread isolation + and a *narrow* cross-thread fallback for LLM-client log records. + + Each thread maps to a stack of capture lists. Nested + :func:`_capture_featcopilot_messages` calls on the same thread push + onto the stack; the innermost active capture is always at the top + and receives records / warnings until its block exits, at which + point the outer capture (if any) becomes active again. + + **Strict thread isolation by default.** :meth:`get` returns a target + list ONLY for the calling thread itself. This avoids the + misattribution that an unconditional single-active-capture fallback + would cause: any featcopilot log on any thread would otherwise be + silently swallowed into the active CLI run's payload, including + output from unrelated background work happening in the same process. + + **Narrow LLM-client fallback.** :meth:`get_for_llm_record` is the + one exception: when a record originates from one of the *exact* + sync LLM client modules whitelisted in + :attr:`_LLM_FALLBACK_LOGGER_NAMES` (the modules whose synchronous + entry points fall back to ``ThreadPoolExecutor`` in event-loop + environments), and exactly one capture is active in the process, + the record is routed to that capture even when emitted on a worker + thread. This addresses the common case where an LLM client's + mock-mode startup warning fires on a worker that ``submit()`` + spawned and would otherwise bleed onto stderr. The whitelist is an + explicit set of module names — *not* a prefix — so unrelated + ``featcopilot.llm.*`` loggers (e.g. ``semantic_engine``, + ``code_generator``, ``transform_rule_generator``, ``explainer``) + that never run on worker threads do not trigger the fallback. + + **Atomic per-record decision.** :meth:`resolve_for_record` caches + the lookup on the record itself the first time it is called, so + :class:`_ThreadRoutingHandler` (which routes records) and + :class:`_SuppressCapturingFilter` (which suppresses them from + stderr) cannot disagree under concurrent push/pop activity from + other threads. Without that caching, ``filter`` could see one + state and the later ``emit`` could see another, breaking the + "captured XOR on stderr" invariant. + + Shared by :class:`_ThreadRoutingHandler` (writes records), + :class:`_SuppressCapturingFilter` (suppresses stderr), and the + routing ``warnings.showwarning`` override. + """ + + # Logger names whose records are eligible for the narrow cross-thread + # fallback. Only the synchronous LLM client modules whose ``run`` / + # batch entry points fall back to ``ThreadPoolExecutor`` workers in + # event-loop environments are listed; their startup / mock-mode + # warnings legitimately fire from those worker threads. Other + # ``featcopilot.llm.*`` loggers are intentionally NOT included so + # cross-thread records from unrelated background work cannot be + # silently swallowed into an active CLI capture. + _LLM_FALLBACK_LOGGER_NAMES = frozenset( + { + "featcopilot.llm.copilot_client", + "featcopilot.llm.litellm_client", + "featcopilot.llm.openai_client", + } + ) + + # Sentinel for "no cached decision yet" on a log record. Distinct + # from ``None``, which is itself a valid resolved decision meaning + # "no capture target — let the record flow to stderr". + _UNCACHED = object() + + def __init__(self): + self._per_thread: dict[int, list[list[str]]] = {} + self._lock = threading.Lock() + + def push(self, tid: int, target: list[str]) -> None: + with self._lock: + self._per_thread.setdefault(tid, []).append(target) + + def pop(self, tid: int) -> None: + with self._lock: + stack = self._per_thread.get(tid) + if stack: + stack.pop() + if not stack: + del self._per_thread[tid] + + def get(self, tid: int) -> list[str] | None: + # Strict per-thread lookup. No cross-thread fallback: an + # unconditional fallback was too broad and could silently + # swallow unrelated background log output into a CLI run's + # payload. The narrow LLM-client fallback lives in + # :meth:`get_for_llm_record` instead, opted into by name. + with self._lock: + stack = self._per_thread.get(tid) + if stack: + return stack[-1] + return None + + def get_for_llm_record(self, tid: int, logger_name: str) -> list[str] | None: + """Per-thread lookup with a narrow cross-thread fallback for LLM + client records. + + When the calling thread has its own active capture, that's used. + Otherwise — and only when the record originates from a + whitelisted sync LLM client module + (:attr:`_LLM_FALLBACK_LOGGER_NAMES`) AND exactly one capture is + active in the process — the record is routed to that single + capture so it doesn't bleed onto stderr. The whitelist is an + explicit set of module names so unrelated ``featcopilot.llm.*`` + loggers (e.g. ``semantic_engine``, ``code_generator``) that + never hop onto worker threads do not trigger the fallback. + + This method takes the lock once and returns a snapshot + decision; callers should generally use :meth:`resolve_for_record` + to additionally cache the result on the record so that paired + filter / emit calls can never disagree under concurrent + push/pop on other threads. + """ + with self._lock: + stack = self._per_thread.get(tid) + if stack: + return stack[-1] + if logger_name not in self._LLM_FALLBACK_LOGGER_NAMES: + return None + if len(self._per_thread) != 1: + # Either no captures are active (nothing to route to) or + # multiple are active (ambiguous — keep strict isolation + # so concurrent CLI calls don't cross-contaminate). + return None + only_stack = next(iter(self._per_thread.values())) + if only_stack: + return only_stack[-1] + return None + + def resolve_for_record(self, record: logging.LogRecord) -> list[str] | None: + """Resolve the capture target for ``record`` exactly once. + + The first call computes the decision via + :meth:`get_for_llm_record` and caches it on the record itself + as ``record._featcopilot_capture_target``; subsequent calls + return the cached value. This is what makes the routing + handler's ``emit`` and the suppression filter's ``filter`` + atomic with respect to each other: they always see the same + decision for a given record even if another thread pops or + pushes a capture between the two calls. + + Logging records are produced and dispatched to handlers in the + same thread, so caching directly on the record is safe — there + is no concurrent reader of the same record's attributes. + """ + cached = getattr(record, "_featcopilot_capture_target", self._UNCACHED) + if cached is not self._UNCACHED: + return cached + target = self.get_for_llm_record(threading.get_ident(), record.name) + record._featcopilot_capture_target = target + return target + + +class _ThreadRoutingHandler(logging.Handler): + """Logging handler that routes records to the calling thread's capture list. + + Attached once to the ``featcopilot`` root logger. Records propagated + from any ``featcopilot.*`` child logger reach this handler in the same + way they reach the existing stderr handler. If the calling thread has + a registered capture list, the record is appended to it. + Otherwise, for records originating from a ``featcopilot.llm.*`` + logger AND when exactly one capture is active in the process, the + record is routed to that capture (the narrow LLM-client cross-thread + fallback — see :class:`_ThreadCaptureState`). Records from any + other thread / logger combination flow through to the existing + stderr handler. + """ + + def __init__(self, state: _ThreadCaptureState): + super().__init__(logging.DEBUG) + self._state = state + self.setFormatter(logging.Formatter("%(levelname)s: %(message)s")) + + def emit(self, record: logging.LogRecord) -> None: + # Use the cached resolver so this handler and the paired + # ``_SuppressCapturingFilter`` always see the same decision + # for a given record, even if another thread pushes or pops + # a capture between the filter and emit phases. + target = self._state.resolve_for_record(record) + if target is None: + return + try: + target.append(self.format(record)) + except Exception: # pragma: no cover - never let logging crash the CLI + target.append(record.getMessage()) + + +class _SuppressCapturingFilter(logging.Filter): + """Filter for the *existing* handlers: drops records being captured. + + Without this filter, every record routed by + :class:`_ThreadRoutingHandler` to a capture list would still hit the + featcopilot root logger's stderr ``StreamHandler`` and bleed onto + stderr — breaking the CLI's "stderr reserved for failures" contract. + The filter mirrors the routing handler's policy so the two stay in + lockstep: anything captured (current-thread record OR cross-thread + LLM-client record under the narrow fallback) is also suppressed + from the original handlers; anything else (records from + non-capturing threads / unrelated background work) flows through to + stderr unchanged. + """ + + def __init__(self, state: _ThreadCaptureState): + super().__init__() + self._state = state + + def filter(self, record: logging.LogRecord) -> bool: + # Use the cached resolver so this filter and the paired + # ``_ThreadRoutingHandler`` always see the same decision for a + # given record, even if another thread pushes or pops a capture + # between this call and the routing handler's emit. Without the + # cache, the two could disagree and the same record could end + # up both captured and on stderr (or suppressed without being + # captured) — breaking the "stderr reserved for failures" + # contract under concurrent CLI calls. + return self._state.resolve_for_record(record) is None + + +# Module-level singletons. Installed exactly once on the featcopilot root +# logger / its existing handlers; subsequent ``_capture_featcopilot_messages`` +# calls just push/pop thread state. No global lock is held during the slow +# ``fit_transform`` body — concurrent threads each capture their own records +# independently. +_capture_state = _ThreadCaptureState() +_routing_handler = _ThreadRoutingHandler(_capture_state) +_suppress_filter = _SuppressCapturingFilter(_capture_state) +_install_lock = threading.Lock() +_install_done = False +# Captures the original ``warnings.showwarning`` at first install so the +# routing override can chain to it for non-capturing threads (and so we +# never mutate it again on subsequent capture calls — the previous +# per-call save/restore raced under concurrent overlapping captures). +_original_showwarning = None + + +def _routing_showwarning(message, category, filename, lineno, file=None, line=None): + """Permanent ``warnings.showwarning`` override (installed once). + + Routes warnings to the *innermost* capturing list for the current + thread (via :class:`_ThreadCaptureState` stack lookup). If the + current thread is not capturing, chains to the original + ``warnings.showwarning`` so non-capturing threads keep their normal + behavior. + + Installed once globally — *not* swapped per-call — so concurrent + overlapping captures on different threads cannot race on the + process-global ``warnings.showwarning`` slot. + """ + target = _capture_state.get(threading.get_ident()) + if target is not None: + target.append(str(message)) + return + if _original_showwarning is not None: + _original_showwarning(message, category, filename, lineno, file, line) + + +def _install_capture_hooks_once() -> None: + """Install the routing handler + suppress filter + showwarning override. + + The logger handler and filter are installed exactly once (idempotent). + The ``warnings.showwarning`` override is re-installed every call if + something else has replaced it — this is necessary because external + code (most commonly ``warnings.catch_warnings()`` blocks) can reset + the global ``warnings.showwarning`` and undo a previous install. The + fresh re-install captures the current (caller's) ``showwarning`` as + the new "original" to chain to, so non-capturing threads still see + whatever warning behavior the caller had set up. + + All hooks themselves dispatch on :class:`_ThreadCaptureState` which + uses a per-thread stack, so they are no-ops for threads that aren't + currently capturing. + """ + global _install_done, _original_showwarning + with _install_lock: + # Logger handler/filter install (truly once — these can't be + # silently undone by external code in the way ``warnings.showwarning`` + # can). + if not _install_done: + fc_root = logging.getLogger("featcopilot") + if _routing_handler not in fc_root.handlers: + fc_root.addHandler(_routing_handler) + for handler in list(fc_root.handlers): + if handler is _routing_handler: + continue + if _suppress_filter not in handler.filters: + handler.addFilter(_suppress_filter) + _install_done = True + + # ``warnings.showwarning`` install — re-check every entry. A + # caller's ``warnings.catch_warnings()`` block restores the + # previous ``showwarning`` on exit, undoing our install. Re- + # installing on next entry is what makes overlapping captures + # robust against caller-side warning context manipulation. + if warnings.showwarning is not _routing_showwarning: + _original_showwarning = warnings.showwarning + warnings.showwarning = _routing_showwarning + + +@contextlib.contextmanager +def _capture_featcopilot_messages(): + """Capture FeatCopilot log records and ``warnings.warn`` calls emitted + on the *current thread*. + + Yields a list that the caller can read after the with-block exits. + The list contains formatted log records (in emission order) and any + Python warning messages emitted during the with-block on this thread. + + Concurrency model + ----------------- + * **Logger records** are routed *per-thread* via + :class:`_ThreadRoutingHandler` (added once to the ``featcopilot`` + root logger) and a :class:`_SuppressCapturingFilter` on the existing + handlers. Two threads can capture concurrently without blocking + each other; each sees only its own records, and other threads' + records still flow normally to stderr. + * **``warnings.warn`` records** are intercepted via a permanent + :func:`_routing_showwarning` override installed once. The override + routes by ``threading.get_ident()`` and chains to the original + ``warnings.showwarning`` for non-capturing threads. The override is + *not* swapped per-call, so concurrent overlapping captures on + different threads cannot race on the process-global + ``warnings.showwarning`` slot. + * **Nested captures** on the same thread are supported via a + per-thread stack in :class:`_ThreadCaptureState`. Records and + warnings always go to the innermost active capture; when the inner + block exits, the outer capture is automatically reactivated. + + The contextmanager does NOT hold any lock for the duration of the + with-block — only briefly during install/push/pop — so long-running + ``fit_transform`` calls in one thread do not block other threads + from running concurrently. + """ + _install_capture_hooks_once() + + captured: list[str] = [] + tid = threading.get_ident() + _capture_state.push(tid, captured) + try: + yield captured + finally: + _capture_state.pop(tid) + + +def _cmd_transform(args: argparse.Namespace) -> int: + """Read input, fit/transform, write output. + + The "successful runs keep stderr empty" CLI contract requires that + *every* phase that can legitimately emit a Python warning be wrapped + in the message capture, not just ``fit_transform``. Concretely: + + * **Read** — ``pd.read_csv`` can emit ``DtypeWarning`` on mixed-type + columns and parquet/JSON readers can emit pyarrow / pandas + future-warnings on a successful read. + * **Fit / transform** — engineers themselves emit warnings (e.g. + ``AutoFeatureEngineer.fit`` for leakage-prone column names under + ``leakage_guard='warn'``). + * **Write** — ``DataFrame.to_csv`` / ``to_parquet`` / ``to_json`` + can emit pandas / pyarrow ``FutureWarning`` / ``UserWarning`` + during a successful write. + + All three phases now live inside one capture block so any warnings + they emit are surfaced via the JSON ``warnings`` field rather than + leaking onto stderr. + """ + input_path = Path(args.input) + if not input_path.exists(): + raise FileNotFoundError(f"Input file not found: {args.input}") + output_path = Path(args.output) + + in_fmt = _detect_format(input_path, args.input_format) + out_fmt = _detect_format(output_path, args.output_format) + + # Single capture context spans read + fit_transform + write so any + # legitimate-but-noisy warnings from pandas/pyarrow during read or + # write end up in the JSON payload's ``warnings`` field instead of + # bleeding to stderr. ``_capture_featcopilot_messages`` is a no-op + # for non-warning code paths so wrapping the broader region has no + # side effects on the happy path. + with _capture_featcopilot_messages() as captured_warnings: + df = _read_table(input_path, in_fmt) + X, y = _split_xy(df, args.target) + + # Build the engineer first: ``_build_engineer`` runs all scalar / list / + # dict type validation on the merged CLI-flag + config view, so any + # malformed value (e.g. ``"max_features": "5"``, ``"verbose": "false"``) + # surfaces a precise exit-2 error here rather than down the wrong + # ``--target is required`` rabbit hole. + engineer = _build_engineer(args) + + # Selection requires a target column to fit against. ``AutoFeatureEngineer`` + # only actually fits a selector when ``y is not None`` AND ``max_features`` + # is set; without ``max_features`` the call is a raw feature-generation + # run and does not need a target. The CLI mirrors that contract: only + # require ``--target`` when both selection is enabled (the default) AND + # ``max_features`` is configured (CLI flag or config), so commands like + # ``featcopilot transform --input in.csv --output out.csv`` (no target, + # no cap) still work. Using ``engineer.max_features`` here means the + # value has already been type-validated, so we never report + # ``--target is required`` when the real problem is a malformed + # ``max_features`` config value. + if not args.no_selection and args.target is None and engineer.max_features is not None: + raise ValueError( + "--target is required when feature selection is applied " + "(i.e. when --max-features / config max_features is set). " + "Pass --target , or pass --no-selection / drop --max-features to skip selection." + ) + + transformed = engineer.fit_transform( + X, + y, + task_description=args.task_description or "prediction task", + target_name=args.target, + apply_selection=not args.no_selection, + ) + + if args.include_target and y is not None: + # Re-attach the target column so downstream training scripts can + # consume the engineered file as a single artifact. Detect column + # collisions: if an engineered feature happens to share the + # target's column name (e.g. a target named ``foo_pow2`` matching + # a tabular-engine derived feature), blindly assigning ``transformed[ + # target_name] = y.values`` would silently overwrite the engineered + # column. Surface that as a clean exit-2 error instead. Callers + # who knowingly want to overwrite can rename their target before + # invoking ``transform`` (or skip ``--include-target``). + target_name = args.target if args.target in df.columns else "target" + if target_name in transformed.columns: + raise ValueError( + f"--include-target would overwrite engineered feature {target_name!r} " + "with the target values. Rename the target column in the input file, " + "or drop --include-target." + ) + transformed = transformed.copy() + transformed[target_name] = y.values + + _write_table(transformed, output_path, out_fmt) + + payload = { + "status": "ok", + "input": str(input_path), + "output": str(output_path), + "input_format": in_fmt, + "output_format": out_fmt, + "n_rows": int(transformed.shape[0]), + "n_features": int(transformed.shape[1]), + "n_input_columns": int(X.shape[1]), + "n_generated_features": len(engineer.get_feature_names()), + "engines": list(engineer.engines), + "selection_methods": list(engineer.selection_methods), + "max_features": engineer.max_features, + "target": args.target, + "selection_applied": engineer._selector is not None, + "warnings": captured_warnings, + } + _emit(payload, as_json=args.json) + return 0 + + +# Default ``explain`` behavior is to use the full input so the metadata +# is a faithful description of what a corresponding ``transform`` run +# would do — engines like ``TabularEngine._fit_categorical_encoding`` +# use ``n_rows`` and per-category counts to decide e.g. one-hot vs. +# target-encoding, so subsampling can silently change which features +# appear. Callers who knowingly accept that trade-off can opt in via +# ``--explain-sample-size`` (set to ``None``/absent to disable, any +# positive integer to cap). +def _cmd_explain(args: argparse.Namespace) -> int: + """Fit + transform engines and print feature explanations + code as JSON. + + The built-in engines populate their internal feature-name registry during + :meth:`transform`, not :meth:`fit` (planning happens in ``fit`` but feature + objects are materialized in ``transform``). We therefore call + :meth:`AutoFeatureEngineer.fit_transform` so ``get_feature_names()``, + :meth:`explain_features` and :meth:`get_feature_code` all return the + actual generated features. Selection is intentionally skipped here so the + payload describes every candidate feature the engines produced, not just + the post-selection survivors. + + Performance vs. faithfulness + --------------------------- + By default ``explain`` runs on the *full* input so the reported + metadata is a faithful description of what a corresponding + ``transform`` would generate. Some engines (notably + :class:`TabularEngine`) consult row counts and per-category + statistics when deciding which features to plan, so blind + subsampling can silently change the result. + + For very large inputs where the metadata-only nature of ``explain`` + really should not pay full memory / compute cost, callers can pass + ``--explain-sample-size N`` (or set ``"explain_sample_size": N`` in + ``--config``) to cap the rows fed to the engineer. The cap is a + deterministic *head slice* (the first N rows): for CSV the cap is + threaded through ``pd.read_csv(nrows=N)`` so memory is bounded + natively; for parquet/JSON pandas has no native row-limit so the + file is fully read and then truncated, with a UserWarning explaining + the limitation. The cap is NOT a random sample — callers who need + randomness should sample externally before invoking ``explain``. + A "metadata may differ" UserWarning is emitted (captured into the + JSON payload's ``warnings`` field) only when the cap actually + truncated the input. The ``n_rows_used`` field reports the effective + sample size. + """ + input_path = Path(args.input) + if not input_path.exists(): + raise FileNotFoundError(f"Input file not found: {args.input}") + + in_fmt = _detect_format(input_path, args.input_format) + + # Apply opt-in sample cap from CLI flag or config (CLI flag wins). + # Resolve and validate it BEFORE reading the input so the cap can be + # threaded into ``_read_table(... nrows=sample_size)`` to bound memory + # on huge inputs (CSV uses ``pd.read_csv(nrows=...)`` natively; + # parquet/JSON fall back to post-read truncation with a UserWarning + # since pandas doesn't expose a native row-limit for those formats). + sample_size = getattr(args, "explain_sample_size", None) + if sample_size is None and args.config is not None: + sample_size = _load_config(args.config).get("explain_sample_size") + if sample_size is not None: + _check_scalar_type("explain_sample_size", sample_size, (int,), allow_bool=False) + if sample_size <= 0: + raise ValueError(f"`explain_sample_size` must be a positive integer when set; got {sample_size!r}.") + + engineer = _build_engineer(args, include_selection_config=False) + + # Run the sample-warning AND ``fit_transform`` inside a single + # capture context so the sampling notice ends up in the JSON + # payload's ``warnings`` field instead of bleeding onto stderr. + with _capture_featcopilot_messages() as captured_warnings: + # Choose a read strategy that gives us: + # 1. a memory bound where pandas allows it (CSV ``nrows``), and + # 2. enough information to detect truncation without ``_read_table`` + # emitting its own (slightly off-by-one) warning that would + # then double up with ours. + # + # For CSV we ask for ``sample_size + 1`` rows: ``pd.read_csv`` + # reads at most that many, AND ``len(df) > sample_size`` becomes + # a strict proof of truncation. We pass ``suppress_truncation_warning`` + # so ``_read_table`` doesn't emit its own message — ``_cmd_explain`` + # is the single source of truth for the sampling notice and uses + # the user-facing ``sample_size`` value. + # + # For parquet/JSON pandas exposes no native row-limit, so we + # always read fully (``nrows=None``) and let ``_cmd_explain`` + # both detect truncation and emit a single notice that includes + # the "memory bound is post-read" caveat. Asking ``_read_table`` + # for a limit there would only have caused it to truncate at + # ``sample_size + 1`` and emit a confusing duplicate warning. + if sample_size is not None and in_fmt == "csv": + read_nrows: int | None = sample_size + 1 + else: + read_nrows = None + df = _read_table(input_path, in_fmt, nrows=read_nrows, suppress_truncation_warning=True) + X, y = _split_xy(df, args.target) + n_sampled = len(X) + if sample_size is not None and n_sampled > sample_size: + # Strict proof of truncation: file had at least one more row + # than the requested cap. Trim to the exact cap and emit the + # "metadata may differ" notice. + X = X.iloc[:sample_size] + if y is not None: + y = y.iloc[:sample_size] + original_len = n_sampled + n_sampled = sample_size + msg = f"explain: capping input to {sample_size} rows (head slice). " + if in_fmt != "csv": + # For parquet/JSON we read the whole file before truncation + # (no native row-limit). Surface that fact so callers know + # memory wasn't bounded. + msg += ( + f"For {in_fmt}, pandas does not expose a native row-limit, " + f"so the full file ({original_len}+ rows) was loaded into " + "memory before truncation. For hard memory bounds on huge " + "inputs, convert to CSV first. " + ) + msg += ( + "Some engines (e.g. TabularEngine categorical encoding) decide which " + "features to plan based on row counts and per-category statistics, " + "so the reported metadata may differ from a full-input transform run." + ) + warnings.warn(msg, UserWarning, stacklevel=2) + + engineer.fit_transform( + X, + y, + task_description=args.task_description or "prediction task", + target_name=args.target, + apply_selection=False, + ) + + # ``explain_features`` / ``get_feature_code`` consult engine + # internals that may legitimately emit pandas / pyarrow warnings + # (e.g. when stringifying expression trees touching deprecated + # APIs). Keep them inside the capture so any such warning ends + # up in the JSON payload's ``warnings`` field instead of bleeding + # to stderr — same "stderr reserved for failures" contract as + # the read / fit_transform phases above. + explanations = engineer.explain_features() + code = engineer.get_feature_code() + feature_names = engineer.get_feature_names() + + payload = { + "status": "ok", + "input": str(input_path), + "n_features": len(feature_names), + "n_rows_used": n_sampled, + "engines": list(engineer.engines), + "features": [ + { + "name": name, + "explanation": explanations.get(name, ""), + "code": code.get(name, ""), + } + for name in feature_names + ], + "warnings": captured_warnings, + } + + # explain always emits JSON to stdout (it's the only sensible format), + # but we still respect ``--json`` for symmetry with other subcommands. + _emit(payload, as_json=True) + return 0 + + +class _StructuredArgumentParser(argparse.ArgumentParser): + """``argparse.ArgumentParser`` that emits the CLI's structured single-line + error format on usage failures. + + The default ``argparse`` ``error()`` method writes the multi-line + usage banner ("usage: featcopilot ...\\n featcopilot: error: ...") to + stderr before raising :class:`SystemExit`. That breaks the CLI + contract that stderr carries exactly one ``featcopilot: error: ...`` + line per failure — agents parsing stderr deterministically would see + the banner and the actual error mixed together, with no easy way to + tell which is which. + + This subclass overrides :meth:`error` to write a single line and + skip the banner, so usage failures (missing required argument, + unknown flag, missing subcommand, etc.) follow the same single-line + contract as the rest of the CLI's exit-2 paths. + """ + + def error(self, message: str) -> None: # type: ignore[override] + sys.stderr.write(f"featcopilot: error: {message}\n") + # ``ArgumentParser.error`` is documented to terminate; ``SystemExit(2)`` + # is what the parent class would do after writing the banner. + # ``main()``'s ``except SystemExit`` handler converts this to an int + # return value so callers still see the documented exit-2 contract. + raise SystemExit(2) + + +def _build_parser() -> argparse.ArgumentParser: + parser = _StructuredArgumentParser( + prog="featcopilot", + description=( + "FeatCopilot CLI — automated feature engineering from the command line. " + "Designed for scripting and agentic usage; pass --json to any subcommand " + "for machine-readable stdout." + ), + ) + parser.add_argument( + "-V", + "--version", + action="version", + version=f"featcopilot {__version__}", + ) + # Use the structured parser class for subparsers too so any + # subcommand-specific usage error (unknown flag, missing required + # arg) follows the same single-line stderr contract. + subparsers = parser.add_subparsers( + dest="command", required=True, metavar="COMMAND", parser_class=_StructuredArgumentParser + ) + + # ----- info --------------------------------------------------------- + p_info = subparsers.add_parser( + "info", + help="Print version and supported engines/methods.", + description="Print the installed FeatCopilot version and the supported engines, " + "selection methods, leakage guards, and I/O formats.", + ) + p_info.add_argument("--json", action="store_true", help="Emit JSON to stdout.") + p_info.set_defaults(func=_cmd_info) + + # ----- transform ---------------------------------------------------- + p_transform = subparsers.add_parser( + "transform", + help="Run feature engineering on a tabular file.", + description="Read INPUT, run AutoFeatureEngineer, and write engineered features to OUTPUT.", + ) + _add_io_args(p_transform) + _add_engineer_args(p_transform) + p_transform.add_argument( + "--no-selection", + action="store_true", + help="Disable feature selection (skip do-no-harm gate).", + ) + p_transform.add_argument( + "--include-target", + action="store_true", + help="Include the target column in the output file.", + ) + p_transform.add_argument("--json", action="store_true", help="Emit a JSON status line on stdout.") + p_transform.set_defaults(func=_cmd_transform) + + # ----- explain ------------------------------------------------------ + p_explain = subparsers.add_parser( + "explain", + help="Print JSON feature explanations and code for agent consumption.", + description="Fit AutoFeatureEngineer on INPUT and emit a JSON document " + "describing each generated feature (name, explanation, code). Selection is " + "intentionally disabled, so all candidate features are reported.", + ) + p_explain.add_argument("--input", "-i", required=True, help="Path to input file (CSV / Parquet / JSON).") + p_explain.add_argument("--input-format", choices=SUPPORTED_INPUT_FORMATS, help="Override input format detection.") + p_explain.add_argument( + "--target", + "-t", + help="Target column name. Used by leakage-guard checks and as task context " + "for the LLM engine. (Selection is disabled in `explain`, so this flag " + "does not gate selector behavior.)", + ) + p_explain.add_argument( + "--task-description", + help="Natural-language ML task description (used by the LLM engine).", + ) + p_explain.add_argument( + "--explain-sample-size", + type=int, + default=None, + help="Cap the input fed to the engineer at this many rows. The cap is " + "applied as a deterministic head slice (the first N rows of the input — " + "for CSV via `pd.read_csv(nrows=N)` so memory is bounded; for parquet/JSON " + "the file is fully read and then truncated, with a warning). OFF by default: " + "the full input is used so the metadata is a faithful description of what a " + "corresponding `transform` would generate. Pass a positive integer ONLY when " + "you knowingly accept that (a) the analyzed rows are the first N (not a " + "random sample), and (b) some engines (e.g. TabularEngine categorical " + "encoding) decide which features to plan based on row counts and per-category " + "statistics, so the reported metadata may differ from a full-input run.", + ) + _add_engineer_args(p_explain, include_selection_args=False) + p_explain.add_argument("--json", action="store_true", help="(Always JSON — flag accepted for symmetry.)") + p_explain.set_defaults(func=_cmd_explain) + + return parser + + +def _add_io_args(p: argparse.ArgumentParser) -> None: + p.add_argument("--input", "-i", required=True, help="Path to input file (CSV / Parquet / JSON).") + p.add_argument("--output", "-o", required=True, help="Path to output file (CSV / Parquet / JSON).") + p.add_argument("--input-format", choices=SUPPORTED_INPUT_FORMATS, help="Override input format detection.") + p.add_argument("--output-format", choices=SUPPORTED_OUTPUT_FORMATS, help="Override output format detection.") + p.add_argument( + "--target", + "-t", + help="Target column name. Required when feature selection is applied " + "(i.e. when --max-features / config max_features is set so the " + "selector actually fits). With no max_features, raw feature " + "generation runs without a target.", + ) + p.add_argument( + "--task-description", + help="Natural-language ML task description (used by the LLM engine).", + ) + + +def _add_engineer_args(p: argparse.ArgumentParser, *, include_selection_args: bool = True) -> None: + """Add ``AutoFeatureEngineer``-related flags to a subparser. + + ``include_selection_args=False`` omits selection-only flags + (``--selection-methods`` and ``--correlation-threshold``) — these are + silently ignored by the ``explain`` subcommand, which always runs with + selection disabled. ``--max-features`` is *not* selection-only: + ``AutoFeatureEngineer`` forwards it into engine construction (e.g. the + tabular engine uses it to cap the number of generated features), so it + is exposed even when ``include_selection_args=False`` to give callers + a CLI-level handle on the engine output size. + """ + p.add_argument( + "--engines", + nargs="+", + choices=sorted(AutoFeatureEngineer.SUPPORTED_ENGINES), + help="Engines to use (default: tabular).", + ) + # ``--max-features`` is exposed on every engineer-using subcommand + # because it caps engine output, not just selection — see the + # ``AutoFeatureEngineer`` constructor and ``TabularEngine``. + p.add_argument( + "--max-features", + type=int, + help="Maximum number of features to generate / keep (forwarded to engines and selector).", + ) + if include_selection_args: + p.add_argument( + "--selection-methods", + nargs="+", + choices=sorted(AutoFeatureEngineer.SUPPORTED_SELECTION_METHODS), + help="Selection methods (default: mutual_info importance).", + ) + p.add_argument( + "--correlation-threshold", + type=float, + help="Maximum pairwise correlation in redundancy elimination (default: 0.85).", + ) + p.add_argument( + "--leakage-guard", + choices=sorted(AutoFeatureEngineer.SUPPORTED_LEAKAGE_GUARDS), + help="How to handle suspicious column names (default: warn).", + ) + p.add_argument( + "--gate-n-jobs", + type=int, + help="Parallelism for the do-no-harm gate's RF (default: 1; -1 = all cores).", + ) + p.add_argument( + "--config", + help="Path to a JSON config file. CLI flags take precedence over config keys. " + "Use this to pass nested keys such as ``llm_config``.", + ) + # ``BooleanOptionalAction`` (Python 3.9+) provides both ``--verbose`` + # and ``--no-verbose`` so a config-supplied ``"verbose": true`` can be + # explicitly turned off from the command line. ``default=None`` so the + # absence of either flag means "fall through to config / default". + p.add_argument( + "--verbose", + action=argparse.BooleanOptionalAction, + default=None, + help="Enable verbose logging (or --no-verbose to override config).", + ) + + +def main(argv: list[str] | None = None) -> int: + """CLI entry point. + + Returns the process exit code; suitable for both the ``console_scripts`` + entry point (``featcopilot``) and ``python -m featcopilot``. Argparse + usage errors (missing subcommand, unknown flag) and the cooperative + ``--help`` / ``--version`` actions all normally raise :class:`SystemExit`; + we trap those here and return their exit code so that programmatic + callers (and agent harnesses) get a consistent integer-returning API. + """ + parser = _build_parser() + + try: + args = parser.parse_args(argv) + except SystemExit as exc: + # argparse uses SystemExit(0) for ``--help`` / ``--version`` and + # SystemExit(2) for usage errors (also writing to stderr). We let the + # output through but convert the exit into a return value so + # ``main(argv) -> int`` is honored even on parse-time failures. + code = exc.code + if code is None: + return 0 + if isinstance(code, int): + return code + # Non-int code (e.g. error string): print to stderr, return 2. + sys.stderr.write(f"{code}\n") + return 2 + + try: + return args.func(args) + except (FileNotFoundError, ValueError) as exc: + # User-facing input/config errors: print a clean message to stderr + # without a traceback so agents can parse the failure. + sys.stderr.write(f"featcopilot: error: {exc}\n") + return 2 + except KeyboardInterrupt: + sys.stderr.write("featcopilot: interrupted\n") + return 130 + except Exception as exc: # pragma: no cover - defensive backstop + # Single deterministic stderr line so agents can parse the failure. + # We deliberately do NOT call ``logger.exception(...)`` here: + # FeatCopilot loggers write to stderr, which would append a second + # timestamped traceback after our structured line and break the + # CLI's "stderr is exactly one error message" contract. Internal + # failure introspection is the caller's job (e.g. set + # ``PYTHONFAULTHANDLER=1`` or attach a debugger). + sys.stderr.write(f"featcopilot: unexpected error: {type(exc).__name__}: {exc}\n") + return 1 + + +if __name__ == "__main__": # pragma: no cover + raise SystemExit(main()) diff --git a/pyproject.toml b/pyproject.toml index f904b9c..583f2eb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -86,6 +86,9 @@ Homepage = "https://github.com/thinkall/featcopilot" Documentation = "https://github.com/thinkall/featcopilot#readme" Repository = "https://github.com/thinkall/featcopilot" +[project.scripts] +featcopilot = "featcopilot.cli:main" + [tool.setuptools.packages.find] where = ["."] include = ["featcopilot*"] diff --git a/tests/test_cli.py b/tests/test_cli.py new file mode 100644 index 0000000..2f8c1f3 --- /dev/null +++ b/tests/test_cli.py @@ -0,0 +1,3427 @@ +"""Tests for the featcopilot CLI.""" + +from __future__ import annotations + +import argparse +import io +import json +import logging +import sys +import threading +import warnings +from contextlib import redirect_stderr, redirect_stdout +from pathlib import Path + +import numpy as np +import pandas as pd +import pytest + +from featcopilot import __version__ +from featcopilot import cli as fc_cli + + +def _run(argv: list[str]) -> tuple[int, str, str]: + """Invoke ``cli.main(argv)`` and capture exit code, stdout, stderr. + + The featcopilot logger installs a ``StreamHandler(sys.stderr)`` at + import time, which holds a reference to the *original* ``sys.stderr`` + object. ``redirect_stderr`` only swaps the ``sys.stderr`` module + attribute, so without also redirecting the handler's ``stream`` any + log output the suppression filter doesn't catch would still go to + the real terminal — leaving every ``err == ""`` assertion in this + file vacuously satisfied even in the presence of a leak. This helper + therefore both redirects ``sys.stderr`` AND temporarily re-points + every ``StreamHandler`` on the ``featcopilot`` root logger at the + same ``err`` buffer for the duration of the call, so the captured + ``err`` value reflects what would actually have been written to the + user's terminal. + """ + out, err = io.StringIO(), io.StringIO() + fc_logger = logging.getLogger("featcopilot") + saved_streams: list[tuple[logging.StreamHandler, object]] = [] + for handler in list(fc_logger.handlers): + if isinstance(handler, logging.StreamHandler): + saved_streams.append((handler, handler.stream)) + handler.stream = err + try: + with redirect_stdout(out), redirect_stderr(err): + rc = fc_cli.main(argv) + finally: + for handler, original_stream in saved_streams: + handler.stream = original_stream + return rc, out.getvalue(), err.getvalue() + + +@pytest.fixture +def tabular_csv(tmp_path: Path) -> Path: + """A small classification dataset written to CSV.""" + rng = np.random.default_rng(42) + n = 200 + df = pd.DataFrame( + { + "x1": rng.normal(size=n), + "x2": rng.normal(size=n), + "x3": rng.integers(0, 5, size=n), + "y": rng.integers(0, 2, size=n), + } + ) + path = tmp_path / "in.csv" + df.to_csv(path, index=False) + return path + + +# --------------------------------------------------------------------- info + + +def test_info_json_emits_supported_options(): + rc, out, err = _run(["info", "--json"]) + assert rc == 0, err + payload = json.loads(out) + assert payload["version"] == __version__ + assert "tabular" in payload["supported_engines"] + assert "mutual_info" in payload["supported_selection_methods"] + assert "warn" in payload["supported_leakage_guards"] + # CSV/JSON are always supported; parquet is gated on engine availability. + assert {"csv", "json"} <= set(payload["supported_input_formats"]) + assert {"csv", "json"} <= set(payload["supported_output_formats"]) + assert isinstance(payload["parquet_available"], bool) + if payload["parquet_available"]: + assert "parquet" in payload["supported_input_formats"] + assert "parquet" in payload["supported_output_formats"] + else: + assert "parquet" not in payload["supported_input_formats"] + assert "parquet" not in payload["supported_output_formats"] + + +def test_info_excludes_parquet_when_engine_missing(monkeypatch): + """When no parquet engine can be imported, ``info`` must not advertise it.""" + monkeypatch.setattr(fc_cli, "_parquet_engine_available", lambda: False) + rc, out, _ = _run(["info", "--json"]) + assert rc == 0 + payload = json.loads(out) + assert payload["parquet_available"] is False + assert "parquet" not in payload["supported_input_formats"] + assert "parquet" not in payload["supported_output_formats"] + + +def test_info_includes_parquet_when_engine_present(monkeypatch): + monkeypatch.setattr(fc_cli, "_parquet_engine_available", lambda: True) + rc, out, _ = _run(["info", "--json"]) + assert rc == 0 + payload = json.loads(out) + assert payload["parquet_available"] is True + assert "parquet" in payload["supported_input_formats"] + assert "parquet" in payload["supported_output_formats"] + + +def test_info_text_mode_is_human_readable(): + rc, out, _ = _run(["info"]) + assert rc == 0 + # Not JSON: parsing should fail. + with pytest.raises(json.JSONDecodeError): + json.loads(out) + assert "version" in out + assert __version__ in out + + +def test_top_level_version_flag(capsys): + # ``--version`` (argparse action) prints to stdout; main() now traps the + # SystemExit and returns the code so the API contract is consistent. + rc = fc_cli.main(["--version"]) + assert rc == 0 + assert __version__ in capsys.readouterr().out + + +# ----------------------------------------------------------------- transform + + +def test_transform_csv_to_csv(tmp_path: Path, tabular_csv: Path): + out_path = tmp_path / "out.csv" + rc, out, err = _run( + [ + "transform", + "--input", + str(tabular_csv), + "--output", + str(out_path), + "--target", + "y", + "--max-features", + "10", + "--json", + ] + ) + assert rc == 0, err + payload = json.loads(out) + assert payload["status"] == "ok" + assert payload["target"] == "y" + assert payload["engines"] == ["tabular"] + assert payload["selection_applied"] is True + assert payload["n_input_columns"] == 3 # x1, x2, x3 (y is the target) + + # The output file exists and is readable as CSV. + assert out_path.exists() + written = pd.read_csv(out_path) + assert written.shape[0] == 200 + assert "y" not in written.columns # target excluded by default + + +def test_transform_include_target_round_trip(tmp_path: Path, tabular_csv: Path): + out_path = tmp_path / "out.csv" + rc, _, err = _run( + [ + "transform", + "-i", + str(tabular_csv), + "-o", + str(out_path), + "-t", + "y", + "--max-features", + "10", + "--include-target", + ] + ) + assert rc == 0, err + written = pd.read_csv(out_path) + assert "y" in written.columns + + +def test_transform_parquet_round_trip(tmp_path: Path): + pytest.importorskip("pyarrow") + rng = np.random.default_rng(0) + df = pd.DataFrame({"a": rng.normal(size=120), "b": rng.normal(size=120), "y": rng.integers(0, 2, size=120)}) + in_path = tmp_path / "in.parquet" + out_path = tmp_path / "out.parquet" + df.to_parquet(in_path, index=False) + + rc, out, err = _run( + [ + "transform", + "--input", + str(in_path), + "--output", + str(out_path), + "--target", + "y", + "--max-features", + "8", + "--json", + ] + ) + assert rc == 0, err + payload = json.loads(out) + assert payload["input_format"] == "parquet" + assert payload["output_format"] == "parquet" + pd.read_parquet(out_path) # readable + + +def test_transform_json_round_trip(tmp_path: Path): + rng = np.random.default_rng(0) + df = pd.DataFrame({"a": rng.normal(size=80), "b": rng.normal(size=80), "y": rng.integers(0, 2, size=80)}) + in_path = tmp_path / "in.json" + out_path = tmp_path / "out.json" + df.to_json(in_path, orient="records") + + rc, _, err = _run( + [ + "transform", + "--input", + str(in_path), + "--output", + str(out_path), + "--target", + "y", + ] + ) + assert rc == 0, err + written = pd.read_json(out_path, orient="records") + assert written.shape[0] == 80 + + +def test_transform_no_selection_skips_selector(tmp_path: Path, tabular_csv: Path): + out_path = tmp_path / "out.csv" + rc, out, err = _run( + [ + "transform", + "-i", + str(tabular_csv), + "-o", + str(out_path), + "-t", + "y", + "--no-selection", + "--max-features", + "5", + "--json", + ] + ) + assert rc == 0, err + payload = json.loads(out) + assert payload["selection_applied"] is False + + +def test_transform_config_file_supplies_engineer_kwargs(tmp_path: Path, tabular_csv: Path): + config_path = tmp_path / "cfg.json" + config_path.write_text( + json.dumps( + { + "engines": ["tabular"], + "selection_methods": ["mutual_info"], + "max_features": 7, + "correlation_threshold": 0.9, + "leakage_guard": "off", + } + ) + ) + out_path = tmp_path / "out.csv" + rc, out, err = _run( + [ + "transform", + "--input", + str(tabular_csv), + "--output", + str(out_path), + "--target", + "y", + "--config", + str(config_path), + "--json", + ] + ) + assert rc == 0, err + payload = json.loads(out) + assert payload["selection_methods"] == ["mutual_info"] + assert payload["max_features"] == 7 + + +def test_transform_cli_flags_override_config(tmp_path: Path, tabular_csv: Path): + config_path = tmp_path / "cfg.json" + config_path.write_text(json.dumps({"max_features": 5, "engines": ["tabular"]})) + out_path = tmp_path / "out.csv" + rc, out, _ = _run( + [ + "transform", + "--input", + str(tabular_csv), + "--output", + str(out_path), + "--target", + "y", + "--config", + str(config_path), + "--max-features", + "12", + "--json", + ] + ) + assert rc == 0 + assert json.loads(out)["max_features"] == 12 + + +# ----------------------- _build_engineer config validation + + +def test_string_engines_in_config_returns_clean_exit_2(tmp_path: Path, tabular_csv: Path): + """A misconfigured ``"engines": "tabular"`` (string instead of list) must + surface ``AutoFeatureEngineer``'s precise type-validation error via the + standard exit-2 path — *not* be silently coerced into a per-character list. + """ + config_path = tmp_path / "cfg.json" + config_path.write_text(json.dumps({"engines": "tabular"})) + rc, _, err = _run( + [ + "transform", + "--input", + str(tabular_csv), + "--output", + str(tmp_path / "out.csv"), + "--target", + "y", + "--config", + str(config_path), + ] + ) + assert rc == 2 + assert "engines must be a list or tuple" in err + + +def test_empty_engines_list_in_config_returns_clean_exit_2(tmp_path: Path, tabular_csv: Path): + """An explicit empty ``engines`` list in the config must propagate to the + transformer's validation so the user sees the documented error, instead + of being silently rewritten into the defaults. + """ + config_path = tmp_path / "cfg.json" + config_path.write_text(json.dumps({"engines": []})) + rc, _, err = _run( + [ + "transform", + "--input", + str(tabular_csv), + "--output", + str(tmp_path / "out.csv"), + "--target", + "y", + "--config", + str(config_path), + ] + ) + assert rc == 2 + assert "at least one engine" in err.lower() or "empty sequence" in err.lower() + + +def test_empty_selection_methods_list_in_config_returns_clean_exit_2(tmp_path: Path, tabular_csv: Path): + config_path = tmp_path / "cfg.json" + config_path.write_text(json.dumps({"selection_methods": []})) + rc, _, err = _run( + [ + "transform", + "--input", + str(tabular_csv), + "--output", + str(tmp_path / "out.csv"), + "--target", + "y", + "--config", + str(config_path), + ] + ) + assert rc == 2 + assert "at least one method" in err.lower() or "empty sequence" in err.lower() + + +# ----------------------- scalar config-type validation + + +@pytest.mark.parametrize( + "key,value,fragment", + [ + ("max_features", "10", "max_features"), + ("max_features", True, "max_features"), # bool rejected for numeric field + ("correlation_threshold", "0.9", "correlation_threshold"), + ("correlation_threshold", True, "correlation_threshold"), + ("gate_n_jobs", "2", "gate_n_jobs"), + ("leakage_guard", 42, "leakage_guard"), + ], +) +def test_scalar_type_mismatch_in_config_returns_exit_2(tmp_path: Path, tabular_csv: Path, key, value, fragment): + """A malformed JSON config (string in a numeric field, etc.) must hit the + deterministic exit-2 user-error path with a precise message — not bubble + up as a downstream ``TypeError`` (exit 1). + """ + config_path = tmp_path / "cfg.json" + config_path.write_text(json.dumps({key: value})) + rc, _, err = _run( + [ + "transform", + "--input", + str(tabular_csv), + "--output", + str(tmp_path / "out.csv"), + "--target", + "y", + "--config", + str(config_path), + ] + ) + assert rc == 2 + assert fragment in err + + +@pytest.mark.parametrize("threshold", [-0.1, 1.1, 5.0, -1.0]) +def test_correlation_threshold_out_of_range_returns_exit_2(tmp_path: Path, tabular_csv: Path, threshold): + """``correlation_threshold`` is only meaningful in [0.0, 1.0]. Out-of-range + values silently change selector behavior (>1 disables redundancy elim, + <0 treats every numeric pair as redundant), so the CLI rejects them up + front with a precise exit-2 error. + """ + rc, _, err = _run( + [ + "transform", + "--input", + str(tabular_csv), + "--output", + str(tmp_path / "out.csv"), + "--target", + "y", + "--correlation-threshold", + str(threshold), + "--max-features", + "5", + ] + ) + assert rc == 2 + assert "correlation_threshold" in err + assert "[0.0, 1.0]" in err or "0.0" in err + + +def test_correlation_threshold_in_config_out_of_range_returns_exit_2(tmp_path: Path, tabular_csv: Path): + """The same range check applies when ``correlation_threshold`` arrives + from ``--config`` rather than the CLI flag. + """ + cfg = tmp_path / "cfg.json" + cfg.write_text(json.dumps({"correlation_threshold": 2.5})) + rc, _, err = _run( + [ + "transform", + "--input", + str(tabular_csv), + "--output", + str(tmp_path / "out.csv"), + "--target", + "y", + "--max-features", + "5", + "--config", + str(cfg), + ] + ) + assert rc == 2 + assert "correlation_threshold" in err + + +def test_correlation_threshold_boundary_values_accepted(tmp_path: Path, tabular_csv: Path): + """The boundaries (0.0 and 1.0) must be accepted — they're the inclusive + valid range. Default 0.85 is also exercised throughout the suite. + """ + out_path = tmp_path / "out.csv" + rc, _, err = _run( + [ + "transform", + "--input", + str(tabular_csv), + "--output", + str(out_path), + "--target", + "y", + "--correlation-threshold", + "0.0", + "--max-features", + "5", + ] + ) + assert rc == 0, err + + rc, _, err = _run( + [ + "transform", + "--input", + str(tabular_csv), + "--output", + str(out_path), + "--target", + "y", + "--correlation-threshold", + "1.0", + "--max-features", + "5", + ] + ) + assert rc == 0, err + + +# ----------------------- --verbose / --no-verbose + + +def test_no_verbose_overrides_config_verbose_true(tmp_path: Path, tabular_csv: Path): + """``--no-verbose`` (BooleanOptionalAction) must override a config-level + ``"verbose": true`` to false — the documented precedence rule. + """ + config_path = tmp_path / "cfg.json" + config_path.write_text(json.dumps({"verbose": True})) + rc, _, err = _run( + [ + "transform", + "--input", + str(tabular_csv), + "--output", + str(tmp_path / "out.csv"), + "--target", + "y", + "--config", + str(config_path), + "--no-verbose", + "--max-features", + "5", + "--json", + ] + ) + assert rc == 0, err + + +def test_verbose_overrides_config_verbose_false(tmp_path: Path, tabular_csv: Path): + config_path = tmp_path / "cfg.json" + config_path.write_text(json.dumps({"verbose": False})) + rc, _, err = _run( + [ + "transform", + "--input", + str(tabular_csv), + "--output", + str(tmp_path / "out.csv"), + "--target", + "y", + "--config", + str(config_path), + "--verbose", + "--max-features", + "5", + ] + ) + assert rc == 0, err + + +@pytest.mark.parametrize( + "value", + ["true", "false", 1, 0], +) +def test_non_bool_verbose_in_config_returns_exit_2(tmp_path: Path, tabular_csv: Path, value): + """A malformed ``"verbose": `` config must hit exit 2 with a + precise message, not silently turn verbose mode on/off via Python's + truthiness rules. + """ + config_path = tmp_path / "cfg.json" + config_path.write_text(json.dumps({"verbose": value})) + rc, _, err = _run( + [ + "transform", + "--input", + str(tabular_csv), + "--output", + str(tmp_path / "out.csv"), + "--target", + "y", + "--config", + str(config_path), + "--max-features", + "5", + ] + ) + assert rc == 2 + assert "verbose" in err + + +def test_transform_missing_target_with_selection_returns_exit_2(tmp_path: Path, tabular_csv: Path): + """Without ``--target``, selection silently degrades to a no-op. The CLI + must surface that as a clean exit-2 user error so automation can react. + """ + rc, _, err = _run( + [ + "transform", + "--input", + str(tabular_csv), + "--output", + str(tmp_path / "out.csv"), + "--max-features", + "5", + ] + ) + assert rc == 2 + assert "--target" in err + assert "selection" in err.lower() + + +def test_transform_missing_target_with_no_selection_succeeds(tmp_path: Path, tabular_csv: Path): + """Once selection is opted out, the missing target is no longer an error + (selection requires a target; raw transform doesn't). + """ + # Drop the target column so we can run without --target. + in_path = tmp_path / "in_notarget.csv" + pd.read_csv(tabular_csv).drop(columns=["y"]).to_csv(in_path, index=False) + rc, _, err = _run( + [ + "transform", + "--input", + str(in_path), + "--output", + str(tmp_path / "out.csv"), + "--no-selection", + ] + ) + assert rc == 0, err + + +def test_transform_missing_target_no_max_features_succeeds(tmp_path: Path, tabular_csv: Path): + """Without ``--max-features`` (and the corresponding config key), + ``AutoFeatureEngineer`` doesn't actually fit a selector even with + ``apply_selection=True``, so requiring ``--target`` would be a false + positive. Raw feature generation without target / without cap must + therefore succeed. + """ + in_path = tmp_path / "in_notarget.csv" + pd.read_csv(tabular_csv).drop(columns=["y"]).to_csv(in_path, index=False) + rc, _, err = _run( + [ + "transform", + "--input", + str(in_path), + "--output", + str(tmp_path / "out.csv"), + ] + ) + assert rc == 0, err + + +def test_transform_missing_target_max_features_in_config_returns_exit_2(tmp_path: Path, tabular_csv: Path): + """The ``--target`` requirement also fires when ``max_features`` comes + from ``--config`` (not just the CLI flag), since the selector will + actually run in that case. + """ + config_path = tmp_path / "cfg.json" + config_path.write_text(json.dumps({"max_features": 5})) + in_path = tmp_path / "in_notarget.csv" + pd.read_csv(tabular_csv).drop(columns=["y"]).to_csv(in_path, index=False) + rc, _, err = _run( + [ + "transform", + "--input", + str(in_path), + "--output", + str(tmp_path / "out.csv"), + "--config", + str(config_path), + ] + ) + assert rc == 2 + assert "--target" in err + + +def test_explain_ignores_selection_only_config_keys(tmp_path: Path, tabular_csv: Path): + """A shared transform/explain config with selection-only keys + (``selection_methods`` / ``correlation_threshold``) must not break + ``explain``: those keys are inert at runtime (selection is disabled + in ``explain``) and ``_build_engineer(include_selection_config=False)`` + skips reading them so config-validation does not fire. + """ + config_path = tmp_path / "cfg.json" + # Use *valid* selection_methods values; the point is they''re ignored. + config_path.write_text( + json.dumps( + { + "engines": ["tabular"], + "selection_methods": ["mutual_info"], + "correlation_threshold": 0.5, + "max_features": 5, + } + ) + ) + rc, out, err = _run( + [ + "explain", + "--input", + str(tabular_csv), + "--target", + "y", + "--config", + str(config_path), + ] + ) + assert rc == 0, err + payload = json.loads(out) + assert payload["status"] == "ok" + + +# ----------------------- explain subparser doesn't expose selection-only flags + + +def test_explain_rejects_selection_methods_flag(tmp_path: Path, tabular_csv: Path): + """``explain`` always disables selection, so accepting ``--selection-methods`` + on the CLI would silently mis-configure the user. The subparser must not + advertise it. + """ + rc, _, err = _run( + [ + "explain", + "--input", + str(tabular_csv), + "--target", + "y", + "--selection-methods", + "mutual_info", + ] + ) + assert rc == 2 + assert "unrecognized" in err.lower() or "--selection-methods" in err.lower() + + +def test_explain_accepts_max_features_flag(tmp_path: Path, tabular_csv: Path): + """``--max-features`` is *not* selection-only — ``AutoFeatureEngineer`` + forwards it into engine construction (e.g. the tabular engine uses it + to cap how many features it generates). ``explain`` must therefore + expose it so callers can bound the size of the explanation payload. + """ + rc, out, err = _run( + [ + "explain", + "--input", + str(tabular_csv), + "--target", + "y", + "--max-features", + "5", + ] + ) + assert rc == 0, err + payload = json.loads(out) + assert payload["status"] == "ok" + + +def test_explain_rejects_correlation_threshold_flag(tmp_path: Path, tabular_csv: Path): + rc, _, err = _run( + [ + "explain", + "--input", + str(tabular_csv), + "--target", + "y", + "--correlation-threshold", + "0.9", + ] + ) + assert rc == 2 + + +def test_explain_target_help_no_longer_says_required_for_selection(): + """The ``--target`` help on ``explain`` must not claim it gates selection + (selection is intentionally disabled in ``explain``). + """ + parser = fc_cli._build_parser() + # argparse stores subparsers under a special action attribute + explain_parser = next( + action.choices["explain"] for action in parser._actions if isinstance(action, argparse._SubParsersAction) + ) + target_help = next(a.help for a in explain_parser._actions if "--target" in a.option_strings) + assert "required for selection" not in target_help + assert "leakage" in target_help.lower() or "task context" in target_help.lower() + + +# ----------------------- I/O OSError normalization + + +def test_input_directory_returns_exit_2(tmp_path: Path): + """Pointing ``--input`` at a directory must surface as exit 2.""" + in_dir = tmp_path / "i_am_a_dir.csv" + in_dir.mkdir() + rc, _, err = _run( + [ + "transform", + "--input", + str(in_dir), + "--output", + str(tmp_path / "out.csv"), + "--target", + "y", + ] + ) + assert rc == 2 + assert "directory" in err.lower() + + +def test_output_directory_returns_exit_2(tmp_path: Path, tabular_csv: Path): + """Pointing ``--output`` at an existing directory must surface as exit 2.""" + out_dir = tmp_path / "i_am_a_dir.csv" + out_dir.mkdir() + rc, _, err = _run( + [ + "transform", + "--input", + str(tabular_csv), + "--output", + str(out_dir), + "--target", + "y", + ] + ) + assert rc == 2 + assert "directory" in err.lower() + + +def test_unwritable_output_returns_exit_2(tmp_path: Path, tabular_csv: Path, monkeypatch): + """An ``OSError`` on write (e.g. permission denied) must surface as exit 2.""" + import pandas as pd + + def _raise_oserror(self, *args, **kwargs): + raise PermissionError("simulated write failure") + + monkeypatch.setattr(pd.DataFrame, "to_csv", _raise_oserror, raising=True) + + rc, _, err = _run( + [ + "transform", + "--input", + str(tabular_csv), + "--output", + str(tmp_path / "out.csv"), + "--target", + "y", + ] + ) + assert rc == 2 + assert "failed to write" in err.lower() + + +def test_unreadable_input_csv_returns_exit_2(tmp_path: Path, tabular_csv: Path, monkeypatch): + """An ``OSError`` while reading the input must surface as exit 2.""" + import pandas as pd + + def _raise_oserror(*args, **kwargs): + raise PermissionError("simulated read failure") + + monkeypatch.setattr(pd, "read_csv", _raise_oserror, raising=True) + + rc, _, err = _run( + [ + "transform", + "--input", + str(tabular_csv), + "--output", + str(tmp_path / "out.csv"), + "--target", + "y", + ] + ) + assert rc == 2 + assert "failed to read" in err.lower() + + +def test_empty_csv_input_returns_exit_2(tmp_path: Path): + """A zero-byte / headerless CSV triggers ``pandas.errors.EmptyDataError``, + which must be normalized to the documented exit-2 user-input error path + rather than falling through to the generic exit-1 backstop. + """ + in_path = tmp_path / "empty.csv" + in_path.write_text("") # zero bytes -> EmptyDataError on read + + rc, _, err = _run( + [ + "transform", + "--input", + str(in_path), + "--output", + str(tmp_path / "out.csv"), + "--target", + "y", + ] + ) + assert rc == 2 + assert "failed to read csv" in err.lower() + + +def test_headerless_csv_input_returns_exit_2(tmp_path: Path): + """A CSV with no header and no rows is also empty-data territory and + must surface as exit 2. + """ + in_path = tmp_path / "headerless.csv" + in_path.write_text("\n\n\n") # only newlines, no header + + rc, _, err = _run( + [ + "transform", + "--input", + str(in_path), + "--output", + str(tmp_path / "out.csv"), + "--target", + "y", + ] + ) + assert rc == 2 + assert "failed to read csv" in err.lower() + + +def test_header_only_csv_input_returns_exit_2(tmp_path: Path): + """A CSV that has a header line but ZERO data rows is read by pandas + as an *empty* DataFrame (no exception). Without the explicit empty + check, the CLI would feed it into ``TabularEngine`` which divides by + ``len(X)`` and exits via the generic exit-1 backstop. The CLI must + surface this as a clean exit-2 user-input error. + """ + in_path = tmp_path / "header_only.csv" + in_path.write_text("x1,x2,y\n") # header but no data + + rc, _, err = _run( + [ + "transform", + "--input", + str(in_path), + "--output", + str(tmp_path / "out.csv"), + "--target", + "y", + ] + ) + assert rc == 2 + assert "empty" in err.lower() + assert "zero data rows" in err.lower() + + +def test_empty_json_input_returns_exit_2(tmp_path: Path): + """An empty JSON array is parsed as an empty DataFrame and must be + rejected up front like header-only CSV. + """ + in_path = tmp_path / "empty.json" + in_path.write_text("[]") + + rc, _, err = _run( + [ + "transform", + "--input", + str(in_path), + "--output", + str(tmp_path / "out.csv"), + "--target", + "y", + ] + ) + assert rc == 2 + assert "empty" in err.lower() + + +def test_empty_parquet_input_returns_exit_2(tmp_path: Path): + """A parquet file with schema but zero rows is rejected up front.""" + pytest.importorskip("pyarrow") + in_path = tmp_path / "empty.parquet" + pd.DataFrame({"x1": [], "x2": [], "y": []}).to_parquet(in_path, index=False) + + rc, _, err = _run( + [ + "transform", + "--input", + str(in_path), + "--output", + str(tmp_path / "out.csv"), + "--target", + "y", + ] + ) + assert rc == 2 + assert "empty" in err.lower() + + +def test_explain_header_only_csv_returns_exit_2(tmp_path: Path): + """The empty-input check is applied to ``explain`` too.""" + in_path = tmp_path / "header_only.csv" + in_path.write_text("x1,x2,y\n") + + rc, _, err = _run( + [ + "explain", + "--input", + str(in_path), + "--target", + "y", + ] + ) + assert rc == 2 + assert "empty" in err.lower() + + +def test_transform_include_target_collision_returns_exit_2(tmp_path: Path): + """``--include-target`` would silently overwrite an engineered feature + if it happens to share the target column's name. The CLI must detect + that collision and fail with exit 2 instead of losing the engineered + feature. + + A target named ``x1_pow2`` (which the tabular engine generates as a + derived feature from a numeric column ``x1``) provokes the collision. + """ + rng = np.random.default_rng(0) + n = 200 + df = pd.DataFrame( + { + "x1": rng.normal(size=n), + "x2": rng.normal(size=n), + # Target column has a name that the tabular engine would also + # generate (``x1_pow2`` etc. is in the tabular engine's + # derived feature catalog). + "x1_pow2": rng.integers(0, 2, size=n), + } + ) + in_path = tmp_path / "collision.csv" + df.to_csv(in_path, index=False) + out_path = tmp_path / "out.csv" + + rc, _, err = _run( + [ + "transform", + "--input", + str(in_path), + "--output", + str(out_path), + "--target", + "x1_pow2", + "--include-target", + "--max-features", + "5", + ] + ) + # Either the engineered set actually contains the colliding name (in + # which case we MUST exit 2), or selection happened to drop it. Skip + # if the engine didn't materialize the colliding feature this run — + # the test is about the contract, not whether ``x1_pow2`` is always + # generated. + if rc == 2: + assert "include-target would overwrite" in err.lower() + assert "x1_pow2" in err + else: + # No collision actually occurred; the test is a no-op for this + # input. Future engine changes that always emit ``x1_pow2`` will + # expose the collision branch. + assert rc == 0, err + + +def test_transform_include_target_collision_deterministic(tmp_path: Path, tabular_csv: Path, monkeypatch): + """Deterministic version of the collision test: monkey-patch the + engineer so its transformed frame contains a column with the target's + name. This guarantees we exercise the exit-2 collision branch + regardless of which features the real engineer picks. + """ + from featcopilot.transformers.sklearn_compat import AutoFeatureEngineer + + real_fit_transform = AutoFeatureEngineer.fit_transform + + def _patched_fit_transform(self, X, y=None, **kwargs): + result = real_fit_transform(self, X, y, **kwargs) + # Inject a column named ``y`` into the result so it collides with + # the target column the test will pass. + result = result.copy() + result["y"] = result.iloc[:, 0] # arbitrary engineered values + return result + + monkeypatch.setattr(AutoFeatureEngineer, "fit_transform", _patched_fit_transform) + + rc, _, err = _run( + [ + "transform", + "--input", + str(tabular_csv), + "--output", + str(tmp_path / "out.csv"), + "--target", + "y", + "--include-target", + "--max-features", + "5", + ] + ) + assert rc == 2 + assert "include-target would overwrite" in err.lower() + assert "'y'" in err + + +def test_unreadable_input_json_returns_exit_2(tmp_path: Path, tabular_csv: Path, monkeypatch): + """``OSError`` from ``pd.read_json`` is surfaced as exit 2 too.""" + import pandas as pd + + in_path = tmp_path / "in.json" + in_path.write_text("[]") # contents irrelevant; we'll intercept + + def _raise_oserror(*args, **kwargs): + raise PermissionError("simulated read failure") + + monkeypatch.setattr(pd, "read_json", _raise_oserror, raising=True) + + rc, _, err = _run( + [ + "transform", + "--input", + str(in_path), + "--output", + str(tmp_path / "out.csv"), + "--target", + "y", + ] + ) + assert rc == 2 + assert "failed to read json" in err.lower() + + +def test_unreadable_input_parquet_returns_exit_2(tmp_path: Path, monkeypatch): + """``OSError`` from ``pd.read_parquet`` (e.g. corrupt file) is exit 2.""" + import pandas as pd + + in_path = tmp_path / "in.parquet" + in_path.write_bytes(b"") + + def _raise_oserror(*args, **kwargs): + raise OSError("simulated parquet read failure") + + monkeypatch.setattr(pd, "read_parquet", _raise_oserror, raising=True) + + rc, _, err = _run( + [ + "transform", + "--input", + str(in_path), + "--output", + str(tmp_path / "out.csv"), + "--target", + "y", + ] + ) + assert rc == 2 + assert "failed to read parquet" in err.lower() + + +def test_unwritable_output_json_returns_exit_2(tmp_path: Path, tabular_csv: Path, monkeypatch): + import pandas as pd + + def _raise_oserror(self, *args, **kwargs): + raise PermissionError("simulated json write failure") + + monkeypatch.setattr(pd.DataFrame, "to_json", _raise_oserror, raising=True) + + rc, _, err = _run( + [ + "transform", + "--input", + str(tabular_csv), + "--output", + str(tmp_path / "out.json"), + "--target", + "y", + ] + ) + assert rc == 2 + assert "failed to write json" in err.lower() + + +def test_unwritable_output_parquet_returns_exit_2(tmp_path: Path, tabular_csv: Path, monkeypatch): + """``OSError`` (vs ``ImportError``) from ``DataFrame.to_parquet`` -> exit 2.""" + import pandas as pd + + def _raise_oserror(self, *args, **kwargs): + raise OSError("simulated parquet write failure") + + monkeypatch.setattr(pd.DataFrame, "to_parquet", _raise_oserror, raising=True) + + rc, _, err = _run( + [ + "transform", + "--input", + str(tabular_csv), + "--output", + str(tmp_path / "out.parquet"), + "--target", + "y", + ] + ) + assert rc == 2 + assert "failed to write parquet" in err.lower() + + +def test_parquet_read_engine_error_returns_exit_2(tmp_path: Path, monkeypatch): + """A non-OSError parquet *backend* error (e.g. ``pyarrow.lib.ArrowInvalid`` + for a corrupt file) must surface as exit 2, not the generic exit 1 + "unexpected error" backstop. The CLI catches ``Exception`` for parquet + operations because they are fully delegated to a third-party backend + whose failures are by definition user-facing data issues. + """ + import pandas as pd + + in_path = tmp_path / "fake.parquet" + in_path.write_bytes(b"\x00\x01\x02\x03") # not a real parquet file + + class _FakeArrowInvalid(Exception): + """Stand-in for ``pyarrow.lib.ArrowInvalid`` (also subclasses Exception).""" + + def _raise_backend_error(*args, **kwargs): + raise _FakeArrowInvalid("simulated corrupt parquet") + + monkeypatch.setattr(pd, "read_parquet", _raise_backend_error, raising=True) + + rc, _, err = _run( + [ + "transform", + "--input", + str(in_path), + "--output", + str(tmp_path / "out.csv"), + "--target", + "y", + ] + ) + assert rc == 2 + assert "failed to read parquet" in err.lower() + + +def test_parquet_write_engine_error_returns_exit_2(tmp_path: Path, tabular_csv: Path, monkeypatch): + """Same coverage on the write side: a backend-level pyarrow exception + that is *not* an ``OSError`` (e.g. an unsupported column-type + conversion error) must produce exit 2, not exit 1. + """ + import pandas as pd + + class _FakeArrowTypeError(Exception): + pass + + def _raise_backend_error(self, *args, **kwargs): + raise _FakeArrowTypeError("simulated unsupported column dtype for parquet") + + monkeypatch.setattr(pd.DataFrame, "to_parquet", _raise_backend_error, raising=True) + + rc, _, err = _run( + [ + "transform", + "--input", + str(tabular_csv), + "--output", + str(tmp_path / "out.parquet"), + "--target", + "y", + ] + ) + assert rc == 2 + assert "failed to write parquet" in err.lower() + + +def test_uncreatable_parent_directory_returns_exit_2(tmp_path: Path, tabular_csv: Path, monkeypatch): + """If creating the output's parent directory fails, exit 2 with a clean message.""" + real_mkdir = Path.mkdir + + def _raise_oserror(self, *args, **kwargs): + # Only fail for our test's would-be output parent so other calls (e.g. + # tmp_path operations under the hood) still work. + if "deep" in self.parts: + raise PermissionError("simulated mkdir failure") + return real_mkdir(self, *args, **kwargs) + + monkeypatch.setattr(Path, "mkdir", _raise_oserror, raising=True) + + rc, _, err = _run( + [ + "transform", + "--input", + str(tabular_csv), + "--output", + str(tmp_path / "deep" / "nested" / "out.csv"), + "--target", + "y", + ] + ) + assert rc == 2 + assert "create parent directory" in err.lower() + + +# ----------------------- stderr is reserved for failures (warnings captured) + + +def test_transform_leakage_warning_does_not_pollute_stderr(tmp_path: Path): + """``leakage_guard='warn'`` (the default) must not bleed + ``warnings.warn(...)`` onto stderr on a successful run; the warnings + are captured and surfaced inside the JSON payload's ``warnings`` field + instead, so agents can keep treating non-empty stderr as failure metadata. + """ + rng = np.random.default_rng(0) + n = 200 + df = pd.DataFrame( + { + "x1": rng.normal(size=n), + "x2": rng.normal(size=n), + # ``label_encoded`` is detected as leakage-prone ("label" + "encoded" + # both appear in the stoplist). + "label_encoded": rng.integers(0, 2, size=n), + "y": rng.integers(0, 2, size=n), + } + ) + in_path = tmp_path / "in_with_leakage.csv" + df.to_csv(in_path, index=False) + out_path = tmp_path / "out.csv" + + rc, out, err = _run( + [ + "transform", + "--input", + str(in_path), + "--output", + str(out_path), + "--target", + "y", + "--max-features", + "5", + "--json", + ] + ) + assert rc == 0, err + assert err == "", f"stderr should be empty on success but got: {err!r}" + payload = json.loads(out) + assert payload["status"] == "ok" + # ``warnings`` field is always present; it MAY contain the leakage + # warning depending on the heuristic. The contract being tested is + # that stderr stays clean — not that any specific warning was emitted + # (the leakage detector heuristics evolve). + assert "warnings" in payload + assert isinstance(payload["warnings"], list) + + +def test_explain_leakage_warning_does_not_pollute_stderr(tmp_path: Path): + """``explain`` has the same stderr-cleanliness contract as ``transform``.""" + rng = np.random.default_rng(0) + n = 200 + df = pd.DataFrame( + { + "x1": rng.normal(size=n), + "label_encoded": rng.integers(0, 2, size=n), + "y": rng.integers(0, 2, size=n), + } + ) + in_path = tmp_path / "in.csv" + df.to_csv(in_path, index=False) + + rc, out, err = _run( + [ + "explain", + "--input", + str(in_path), + "--target", + "y", + ] + ) + assert rc == 0, err + assert err == "", f"stderr should be empty on success but got: {err!r}" + payload = json.loads(out) + assert payload["status"] == "ok" + # The ``warnings`` field is always present and is a list. Whether or + # not the leakage heuristic fires is not guaranteed (it evolves); the + # contract under test is that stderr stays clean. + assert isinstance(payload["warnings"], list) + + +def test_transform_logger_warning_does_not_pollute_stderr(tmp_path: Path, tabular_csv: Path): + """The CLI captures ``logger.warning(...)`` records (in addition to + ``warnings.warn``), so any successful run that exercises a code path + emitting a logger message — for example the do-no-harm gate's + fallback — keeps stderr empty. The captured records appear in the + JSON payload's ``warnings`` field. + """ + out_path = tmp_path / "out.csv" + rc, out, err = _run( + [ + "transform", + "--input", + str(tabular_csv), + "--output", + str(out_path), + "--target", + "y", + "--max-features", + "5", + "--verbose", # exercises ``logger.info(...)`` paths in engines + "--json", + ] + ) + assert rc == 0, err + assert err == "", f"stderr should be empty on success but got: {err!r}" + payload = json.loads(out) + assert payload["status"] == "ok" + assert isinstance(payload["warnings"], list) + + +def test_transform_verbose_logger_info_captured_not_on_stderr(tmp_path: Path, tabular_csv: Path): + """``--verbose`` enables ``logger.info(...)`` calls in + ``AutoFeatureEngineer`` and the engines. Those records must end up + in the JSON payload's ``warnings`` field, not on stderr. + """ + out_path = tmp_path / "out.csv" + rc, out, err = _run( + [ + "transform", + "--input", + str(tabular_csv), + "--output", + str(out_path), + "--target", + "y", + "--max-features", + "5", + "--verbose", + "--json", + ] + ) + assert rc == 0, err + assert err == "", f"stderr should be empty on success but got: {err!r}" + payload = json.loads(out) + # ``--verbose`` reliably emits "Fitted tabular engine" via logger.info, + # and selection / engineer calls also log. We don't pin the exact + # messages (they evolve) — just check at least one log record is + # present in the captured payload. + assert isinstance(payload["warnings"], list) + assert len(payload["warnings"]) >= 1 + + +def test_capture_featcopilot_messages_intercepts_logger_warning(): + """Direct unit test for the contextmanager so the docstring contract is + not just covered transitively via the CLI subcommands. + """ + fc_logger = logging.getLogger("featcopilot.test_cli") + # Reset Python's warning-deduplication state for the duration of the + # test so a previous test that fired ``warnings.warn`` at the same + # source location does not suppress this one. + with warnings.catch_warnings(): + warnings.simplefilter("always") + with fc_cli._capture_featcopilot_messages() as captured: + fc_logger.warning("captured-warning-message") + warnings.warn("captured-runtime-warning", UserWarning, stacklevel=2) + assert any("captured-warning-message" in m for m in captured) + assert any("captured-runtime-warning" in m for m in captured) + + +def test_capture_featcopilot_messages_does_not_mutate_logger_state_per_call(): + """The contextmanager installs hooks *once* (lazily) and then never + mutates the featcopilot logger again — so successive captures don't + add or remove handlers, regardless of test ordering. The earlier + "restores handlers" test (asserting equality with pre-first-call + state) was order-dependent: on the very first capture in a process, + ``_install_capture_hooks_once()`` permanently adds + ``_routing_handler`` and that's a one-way change. We instead assert + *stability* across an exception-propagating with-block, which is the + real behavioral contract. + """ + # First, force install via a no-op capture. + with fc_cli._capture_featcopilot_messages(): + pass + + fc_root = logging.getLogger("featcopilot") + handlers_before = list(fc_root.handlers) + level_before = fc_root.level + showwarning_before = warnings.showwarning + + with pytest.raises(RuntimeError): + with fc_cli._capture_featcopilot_messages(): + raise RuntimeError("boom") + + # Hooks remain installed (handler stays, level unchanged, showwarning + # override remains in place); per-call state has been popped. + assert fc_root.handlers == handlers_before + assert fc_root.level == level_before + assert warnings.showwarning is showwarning_before + + +def test_capture_featcopilot_messages_thread_safety(): + """Concurrent ``_capture_featcopilot_messages`` invocations must not + steal each other's records. Implementation uses per-thread routing + (no global lock held during the body), so threads execute concurrently. + """ + import threading + + fc_logger = logging.getLogger("featcopilot.test_concurrent") + + results: list[list[str]] = [] + barrier = threading.Barrier(2) + + def worker(tag: str): + # Force both threads to enter the with-block at roughly the same + # time so the routing dispatch is genuinely contended. + barrier.wait() + with fc_cli._capture_featcopilot_messages() as captured: + for i in range(20): + fc_logger.warning(f"{tag}-{i}") + results.append(captured) + + t1 = threading.Thread(target=worker, args=("A",)) + t2 = threading.Thread(target=worker, args=("B",)) + t1.start() + t2.start() + t1.join() + t2.join() + + assert len(results) == 2 + # Each capture list must contain exactly its own thread's records and + # nothing from the other thread. + for res in results: + # Find which tag this list belongs to. + tag = "A" if any("A-" in m for m in res) else "B" + assert all(f"{tag}-" in m for m in res), f"Thread isolation violated in capture {tag!r}: got {res!r}" + assert len(res) == 20 + + +def test_capture_does_not_block_concurrent_callers(): + """Two concurrent ``_capture_featcopilot_messages`` blocks must run in + parallel — i.e. the design does NOT serialize the body via a global + lock. Verified by timing: a worker that sleeps inside the block must + not block another worker from also entering the block at the same + time. + """ + import threading + import time + + inside = [] + inside_lock = threading.Lock() + seen_overlap = threading.Event() + barrier = threading.Barrier(2) + + def worker(): + barrier.wait() + with fc_cli._capture_featcopilot_messages(): + with inside_lock: + inside.append(1) + if len(inside) >= 2: + seen_overlap.set() + # Sleep long enough that, if the implementation serialized via + # a global lock, the second thread would never enter + # simultaneously. + time.sleep(0.2) + with inside_lock: + inside.pop() + + t1 = threading.Thread(target=worker) + t2 = threading.Thread(target=worker) + t1.start() + t2.start() + t1.join(timeout=5) + t2.join(timeout=5) + + assert seen_overlap.is_set(), ( + "Both threads should have been inside _capture_featcopilot_messages " + "simultaneously; the implementation appears to serialize the body." + ) + + +def test_capture_warnings_warn_thread_isolated(): + """``warnings.warn`` calls from one capturing thread must not leak into + another capturing thread's payload. The CLI overrides + ``warnings.showwarning`` per-thread (rather than using + ``warnings.catch_warnings(record=True)`` which is process-global). + """ + import threading + + barrier = threading.Barrier(2) + a_captured: list[str] = [] + b_captured: list[str] = [] + + def worker(tag: str, target: list[str]): + barrier.wait() + with fc_cli._capture_featcopilot_messages() as captured: + for i in range(10): + # ``stacklevel=2`` is forwarded; reset filter state so we + # don't lose the warning to Python's default dedup. + warnings.warn(f"{tag}-warn-{i}", UserWarning, stacklevel=2) + target.extend(captured) + + # Reset warning filters for this test so dedup doesn't suppress + # repeated emissions at the same source line. + with warnings.catch_warnings(): + warnings.simplefilter("always") + t1 = threading.Thread(target=worker, args=("A", a_captured)) + t2 = threading.Thread(target=worker, args=("B", b_captured)) + t1.start() + t2.start() + t1.join() + t2.join() + + assert all("A-warn-" in m for m in a_captured) + assert all("B-warn-" in m for m in b_captured) + assert not any("B-warn-" in m for m in a_captured) + assert not any("A-warn-" in m for m in b_captured) + + +def test_nested_capture_on_same_thread_preserves_outer_list(): + """A capture inside a capture on the same thread must: + + 1. Route records to the *innermost* list while the inner block is active. + 2. Restore the outer list when the inner block exits, so subsequent + records flow into the outer payload. + + The previous single-list-per-thread design clobbered the outer + registration; this test guards against that regression. + """ + fc_logger = logging.getLogger("featcopilot.test_nested") + + with warnings.catch_warnings(): + warnings.simplefilter("always") + with fc_cli._capture_featcopilot_messages() as outer: + fc_logger.warning("outer-before-nested") + with fc_cli._capture_featcopilot_messages() as inner: + fc_logger.warning("inner-only") + warnings.warn("inner-runtime", UserWarning, stacklevel=2) + fc_logger.warning("outer-after-nested") + + # Inner contains only the records emitted while it was the active + # capture. + assert any("inner-only" in m for m in inner) + assert any("inner-runtime" in m for m in inner) + assert not any("outer-before-nested" in m for m in inner) + assert not any("outer-after-nested" in m for m in inner) + + # Outer contains records emitted before AND after the inner block, + # but NOT records emitted while inner was active (those went to inner). + assert any("outer-before-nested" in m for m in outer) + assert any("outer-after-nested" in m for m in outer) + assert not any("inner-only" in m for m in outer) + assert not any("inner-runtime" in m for m in outer) + + +def test_overlapping_captures_with_out_of_order_exit(): + """Two threads enter the capture block, then thread A exits *before* + thread B. The CLI must continue to capture B's warnings even after + A has exited — i.e. A's exit must not restore a global state that + disables B's capture. + + This is the strict version of the warnings.showwarning race that + existed when the override was saved/restored per-call: A's exit + used to restore the original ``warnings.showwarning``, leaking B's + subsequent ``warnings.warn`` calls onto stderr. + """ + import threading + import time + + barrier = threading.Barrier(2) + a_done = threading.Event() + a_captured: list[str] = [] + b_captured: list[str] = [] + + fc_logger = logging.getLogger("featcopilot.test_overlap") + + def worker_a(): + barrier.wait() + with fc_cli._capture_featcopilot_messages() as captured: + fc_logger.warning("A-1") + warnings.warn("A-warn-1", UserWarning, stacklevel=2) + a_captured.extend(captured) + a_done.set() # signal: A has exited the capture block + + def worker_b(): + barrier.wait() + with fc_cli._capture_featcopilot_messages() as captured: + fc_logger.warning("B-1") + # Wait for A to fully exit before emitting B's tail records. + assert a_done.wait(timeout=5) + time.sleep(0.05) # small grace so any racy restoration would have happened + fc_logger.warning("B-2-after-A-exit") + warnings.warn("B-warn-after-A-exit", UserWarning, stacklevel=2) + b_captured.extend(captured) + + with warnings.catch_warnings(): + warnings.simplefilter("always") + t_a = threading.Thread(target=worker_a) + t_b = threading.Thread(target=worker_b) + t_b.start() # start B first so it's already in the block + time.sleep(0.05) + t_a.start() + t_a.join(timeout=5) + t_b.join(timeout=5) + + # B's records — including the ones emitted *after* A exited — must + # all be captured. None of A's records should have leaked into B. + assert any("B-1" in m for m in b_captured) + assert any("B-2-after-A-exit" in m for m in b_captured) + assert any("B-warn-after-A-exit" in m for m in b_captured) + assert not any("A-1" in m for m in b_captured) + assert not any("A-warn-1" in m for m in b_captured) + # A's payload likewise contains only A's records. + assert any("A-1" in m for m in a_captured) + assert any("A-warn-1" in m for m in a_captured) + assert not any("B-" in m for m in a_captured) + + +def test_unexpected_error_writes_single_stderr_line(monkeypatch, tmp_path: Path, tabular_csv: Path): + """An unexpected (non-ValueError) exception must produce exactly one + structured stderr line — no second timestamped traceback from + ``logger.exception(...)`` — so agents can parse failures + deterministically. + """ + import pandas as pd + + class _UnexpectedError(Exception): + """A non-ValueError, non-OSError exception that escapes the helpers.""" + + def _raise_unexpected(*args, **kwargs): + raise _UnexpectedError("simulated internal failure") + + # Monkey-patch ``pd.read_csv`` directly. Since ``_read_table``'s CSV + # branch normally catches ``OSError`` / ``ParserError`` / ``UnicodeDecodeError``, + # raising a different exception type forces us into the generic exit-1 + # backstop in ``main()``. + monkeypatch.setattr(pd, "read_csv", _raise_unexpected, raising=True) + + rc, _, err = _run( + [ + "transform", + "--input", + str(tabular_csv), + "--output", + str(tmp_path / "out.csv"), + "--target", + "y", + ] + ) + assert rc == 1, err + # Exactly one non-empty line on stderr. + err_lines = [line for line in err.splitlines() if line.strip()] + assert len(err_lines) == 1, f"Expected single-line stderr, got: {err!r}" + assert err_lines[0].startswith("featcopilot: unexpected error:") + assert "_UnexpectedError" in err_lines[0] + assert "simulated internal failure" in err_lines[0] + # No traceback signature. + assert "Traceback" not in err + assert 'File "' not in err + + +# ----------------------- --target help text accuracy + + +def test_transform_target_help_reflects_actual_contract(): + """The ``--target`` help on ``transform`` must say the flag is required + only when ``--max-features`` is set (which is when the selector + actually fits), not whenever selection is enabled by default. + """ + parser = fc_cli._build_parser() + transform_parser = next( + action.choices["transform"] for action in parser._actions if isinstance(action, argparse._SubParsersAction) + ) + target_help = next(a.help for a in transform_parser._actions if "--target" in a.option_strings) + assert "max_features" in target_help.lower() or "max-features" in target_help.lower() + # The old ("required when selection is applied (the default ...)") + # phrasing was misleading — guard against regressions. + assert "the default" not in target_help.lower() + + +# ----------------------- target check runs after type validation + + +def test_invalid_max_features_in_config_takes_precedence_over_target_check(tmp_path: Path, tabular_csv: Path): + """A malformed ``max_features`` in ``--config`` (string, negative, etc.) + must surface its real validation error rather than ``--target is + required``. The CLI now builds the engineer first (which type-validates + every scalar config field) and only checks ``--target`` after. + """ + in_path = tmp_path / "in_notarget.csv" + pd.read_csv(tabular_csv).drop(columns=["y"]).to_csv(in_path, index=False) + + cfg = tmp_path / "cfg.json" + cfg.write_text(json.dumps({"max_features": "5"})) # string, not int + rc, _, err = _run( + [ + "transform", + "--input", + str(in_path), + "--output", + str(tmp_path / "out.csv"), + "--config", + str(cfg), + ] + ) + assert rc == 2 + # The real error is the type mismatch, NOT --target missing. + assert "max_features" in err + assert "--target" not in err + + +def test_check_scalar_type_rejects_none_when_required(): + """Direct unit test for ``_check_scalar_type`` to exercise the + ``allow_none=False`` + ``value is None`` branch, which the integration + path doesn't naturally hit (every scalar with ``allow_none=False`` has + a non-None default). + """ + with pytest.raises(ValueError, match="must not be null"): + fc_cli._check_scalar_type("foo", None, (int,), allow_none=False) + + +# -------------------------------------------------------------- error paths + + +def test_transform_missing_input_returns_exit_2(tmp_path: Path): + rc, _, err = _run( + [ + "transform", + "--input", + str(tmp_path / "nope.csv"), + "--output", + str(tmp_path / "out.csv"), + "--target", + "y", + ] + ) + assert rc == 2 + assert "Input file not found" in err + + +def test_transform_unknown_target_returns_exit_2(tmp_path: Path, tabular_csv: Path): + rc, _, err = _run( + [ + "transform", + "--input", + str(tabular_csv), + "--output", + str(tmp_path / "out.csv"), + "--target", + "does_not_exist", + ] + ) + assert rc == 2 + assert "does_not_exist" in err + + +def test_transform_unknown_extension_without_override(tmp_path: Path, tabular_csv: Path): + out_path = tmp_path / "out.weird" + rc, _, err = _run( + [ + "transform", + "--input", + str(tabular_csv), + "--output", + str(out_path), + "--target", + "y", + ] + ) + assert rc == 2 + assert "infer format" in err.lower() + + +def test_transform_format_override_accepted(tmp_path: Path, tabular_csv: Path): + out_path = tmp_path / "out.weird" + rc, _, err = _run( + [ + "transform", + "--input", + str(tabular_csv), + "--output", + str(out_path), + "--target", + "y", + "--output-format", + "csv", + ] + ) + assert rc == 0, err + assert out_path.exists() + + +def test_invalid_config_file_returns_exit_2(tmp_path: Path, tabular_csv: Path): + bad = tmp_path / "bad.json" + bad.write_text("[1, 2, 3]") # JSON, but not an object + rc, _, err = _run( + [ + "transform", + "--input", + str(tabular_csv), + "--output", + str(tmp_path / "o.csv"), + "--target", + "y", + "--config", + str(bad), + ] + ) + assert rc == 2 + assert "JSON object" in err + + +def test_unknown_config_top_level_key_returns_exit_2(tmp_path: Path, tabular_csv: Path): + """A typo in a top-level config key (``max_feature`` instead of + ``max_features``, etc.) must fail fast with a precise exit-2 message + listing the recognized keys — not silently run with defaults. + """ + cfg = tmp_path / "cfg.json" + cfg.write_text(json.dumps({"max_feature": 5})) # missing 's' + rc, _, err = _run( + [ + "transform", + "--input", + str(tabular_csv), + "--output", + str(tmp_path / "o.csv"), + "--target", + "y", + "--config", + str(cfg), + ] + ) + assert rc == 2 + assert "max_feature" in err + assert "Recognized keys" in err or "recognized keys" in err.lower() + + +def test_unknown_config_top_level_key_lists_known_keys(tmp_path: Path, tabular_csv: Path): + """The error message must enumerate the recognized keys so users can + self-correct without reading the source. + """ + cfg = tmp_path / "cfg.json" + cfg.write_text(json.dumps({"selection_method": ["mutual_info"]})) # missing 's' + rc, _, err = _run( + [ + "transform", + "--input", + str(tabular_csv), + "--output", + str(tmp_path / "o.csv"), + "--target", + "y", + "--config", + str(cfg), + ] + ) + assert rc == 2 + assert "selection_method" in err + # Recognized-keys list must include the canonical names. + assert "selection_methods" in err + assert "max_features" in err + + +def test_directory_as_config_returns_exit_2(tmp_path: Path, tabular_csv: Path): + """Pointing ``--config`` at a directory must surface as exit 2, not the + generic ``exit 1`` backstop (``IsADirectoryError``). + """ + cfg_dir = tmp_path / "not_a_file" + cfg_dir.mkdir() + rc, _, err = _run( + [ + "transform", + "--input", + str(tabular_csv), + "--output", + str(tmp_path / "o.csv"), + "--target", + "y", + "--config", + str(cfg_dir), + ] + ) + assert rc == 2 + assert "directory" in err.lower() + + +def test_malformed_json_config_returns_exit_2(tmp_path: Path, tabular_csv: Path): + bad = tmp_path / "bad.json" + bad.write_text("{not valid json,}") + rc, _, err = _run( + [ + "transform", + "--input", + str(tabular_csv), + "--output", + str(tmp_path / "o.csv"), + "--target", + "y", + "--config", + str(bad), + ] + ) + assert rc == 2 + assert "valid json" in err.lower() + + +def test_non_dict_llm_config_returns_exit_2(tmp_path: Path, tabular_csv: Path): + """A non-mapping ``llm_config`` (e.g. a string) must be rejected at + config-load time with a clean exit 2, not bubble up as an + ``AttributeError`` from ``.get(...)`` deep inside engine construction. + """ + cfg = tmp_path / "cfg.json" + cfg.write_text(json.dumps({"engines": ["tabular"], "llm_config": "gpt-5"})) + rc, _, err = _run( + [ + "transform", + "--input", + str(tabular_csv), + "--output", + str(tmp_path / "o.csv"), + "--target", + "y", + "--config", + str(cfg), + ] + ) + assert rc == 2 + assert "llm_config" in err + assert "JSON object" in err or "mapping" in err.lower() + + +def test_no_subcommand_exits_nonzero(capsys): + # main() now returns the argparse-reported exit code (2 for usage error) + # rather than letting SystemExit propagate, so programmatic callers get + # an integer back even on parse-time failures. + rc = fc_cli.main([]) + assert rc == 2 + + +def test_unknown_flag_returns_exit_2(capsys): + rc = fc_cli.main(["transform", "--no-such-flag"]) + assert rc == 2 + + +def test_argparse_usage_error_emits_single_structured_line(tmp_path: Path, tabular_csv: Path): + """``argparse`` defaults to writing a multi-line usage banner before its + error message, mixing two pieces of information on stderr that agents + must then parse apart. The CLI's ``_StructuredArgumentParser`` collapses + those into the single canonical ``featcopilot: error: `` line + so usage failures match the rest of the exit-2 contract. + """ + rc, _, err = _run( + [ + "transform", + "--input", + str(tabular_csv), + "--output", + str(tmp_path / "out.csv"), + "--target", + "y", + "--no-such-flag", # genuine unknown flag (not a missing-required) + ] + ) + assert rc == 2 + err_lines = [line for line in err.splitlines() if line.strip()] + # Exactly one non-empty stderr line. + assert len(err_lines) == 1, f"Expected single-line stderr, got {err_lines!r}" + assert err_lines[0].startswith("featcopilot: error: ") + # No multi-line ``argparse`` usage banner. + assert "usage:" not in err.lower() + # Still mentions the offending flag. + assert "--no-such-flag" in err + + +def test_argparse_missing_subcommand_emits_single_structured_line(): + rc, _, err = _run([]) + assert rc == 2 + err_lines = [line for line in err.splitlines() if line.strip()] + assert len(err_lines) == 1, f"Expected single-line stderr, got {err_lines!r}" + assert err_lines[0].startswith("featcopilot: error: ") + assert "usage:" not in err.lower() + + +def test_help_flag_returns_zero(capsys): + rc = fc_cli.main(["--help"]) + assert rc == 0 + captured = capsys.readouterr() + assert "featcopilot" in captured.out + + +# ------------------------------------------------------------------ explain + + +def test_explain_emits_json_payload(tmp_path: Path, tabular_csv: Path): + rc, out, err = _run( + [ + "explain", + "--input", + str(tabular_csv), + "--target", + "y", + ] + ) + assert rc == 0, err + payload = json.loads(out) + assert payload["status"] == "ok" + assert payload["engines"] == ["tabular"] + assert isinstance(payload["features"], list) + # The tabular engine actually generates derived features, and the explain + # subcommand must materialize them by running the full fit_transform + # pipeline (engines populate _feature_names during transform()). + assert payload["n_features"] > 0 + assert len(payload["features"]) == payload["n_features"] + # Each feature entry is a dict with the expected keys. + entry = payload["features"][0] + assert {"name", "explanation", "code"} <= set(entry.keys()) + assert entry["name"] + + +def test_explain_uses_full_input_by_default(tmp_path: Path): + """``explain`` defaults to using the FULL input — no implicit + sub-sampling. Some engines (e.g. ``TabularEngine`` categorical + encoding) decide which features to plan based on row counts and + per-category statistics, so silent sampling would change the + advertised metadata. Sampling is opt-in via ``--explain-sample-size``. + """ + rng = np.random.default_rng(0) + n = 1500 # arbitrary + df = pd.DataFrame( + { + "x1": rng.normal(size=n), + "x2": rng.normal(size=n), + "y": rng.integers(0, 2, size=n), + } + ) + in_path = tmp_path / "big.csv" + df.to_csv(in_path, index=False) + + rc, out, err = _run( + [ + "explain", + "--input", + str(in_path), + "--target", + "y", + ] + ) + assert rc == 0, err + payload = json.loads(out) + assert payload["status"] == "ok" + # Default: no sampling — full input is used. + assert payload["n_rows_used"] == n + + +def test_explain_caps_input_size_when_sample_size_set(tmp_path: Path): + """When ``--explain-sample-size N`` is passed, the input is capped at + ``N`` rows (with a captured warning) so callers can opt into bounded + cost on huge inputs. The default remains full-input. + """ + rng = np.random.default_rng(0) + n = 5000 + df = pd.DataFrame( + { + "x1": rng.normal(size=n), + "x2": rng.normal(size=n), + "y": rng.integers(0, 2, size=n), + } + ) + in_path = tmp_path / "big.csv" + df.to_csv(in_path, index=False) + + rc, out, err = _run( + [ + "explain", + "--input", + str(in_path), + "--target", + "y", + "--explain-sample-size", + "1000", + ] + ) + assert rc == 0, err + payload = json.loads(out) + # Sampling cap was enforced. + assert payload["n_rows_used"] == 1000 + assert payload["n_features"] > 0 + # The CLI emits a warning when sampling so callers can detect that + # metadata may not match a full-input transform run. + assert any("capping input" in w.lower() or "sampling" in w.lower() for w in payload["warnings"]) + + +def test_explain_sample_size_smaller_than_input_no_op(tmp_path: Path): + """When ``--explain-sample-size`` exceeds the actual input, no sampling + happens (and no warning is emitted). + """ + rng = np.random.default_rng(0) + n = 50 + df = pd.DataFrame( + { + "x1": rng.normal(size=n), + "x2": rng.normal(size=n), + "y": rng.integers(0, 2, size=n), + } + ) + in_path = tmp_path / "small.csv" + df.to_csv(in_path, index=False) + + rc, out, err = _run( + [ + "explain", + "--input", + str(in_path), + "--target", + "y", + "--explain-sample-size", + "1000", + ] + ) + assert rc == 0, err + payload = json.loads(out) + assert payload["n_rows_used"] == n + assert not any("capping input" in w.lower() or "sampling" in w.lower() for w in payload["warnings"]) + + +def test_explain_sample_size_via_config(tmp_path: Path): + """``explain_sample_size`` is also recognized in ``--config`` JSON.""" + rng = np.random.default_rng(0) + n = 5000 + df = pd.DataFrame( + { + "x1": rng.normal(size=n), + "x2": rng.normal(size=n), + "y": rng.integers(0, 2, size=n), + } + ) + in_path = tmp_path / "big.csv" + df.to_csv(in_path, index=False) + + cfg = tmp_path / "cfg.json" + cfg.write_text(json.dumps({"explain_sample_size": 500})) + + rc, out, err = _run( + [ + "explain", + "--input", + str(in_path), + "--target", + "y", + "--config", + str(cfg), + ] + ) + assert rc == 0, err + payload = json.loads(out) + assert payload["n_rows_used"] == 500 + + +@pytest.mark.parametrize("bad_value", [0, -1, -100]) +def test_explain_sample_size_rejects_non_positive(tmp_path: Path, bad_value): + """``--explain-sample-size`` must be a positive integer.""" + rc, _, err = _run( + [ + "explain", + "--input", + str(tmp_path / "in.csv"), # missing — but flag check happens first + "--target", + "y", + "--explain-sample-size", + str(bad_value), + ] + ) + # We accept either argparse-level rejection or our own ValueError; + # both surface as exit 2. + assert rc == 2 + + +def test_explain_sample_size_rejects_string_in_config(tmp_path: Path, tabular_csv: Path): + """Type-validation: ``"explain_sample_size": "100"`` (string) is rejected.""" + cfg = tmp_path / "cfg.json" + cfg.write_text(json.dumps({"explain_sample_size": "100"})) + rc, _, err = _run( + [ + "explain", + "--input", + str(tabular_csv), + "--target", + "y", + "--config", + str(cfg), + ] + ) + assert rc == 2 + assert "explain_sample_size" in err + + +def test_explain_sample_size_rejects_zero_in_config(tmp_path: Path, tabular_csv: Path): + cfg = tmp_path / "cfg.json" + cfg.write_text(json.dumps({"explain_sample_size": 0})) + rc, _, err = _run( + [ + "explain", + "--input", + str(tabular_csv), + "--target", + "y", + "--config", + str(cfg), + ] + ) + assert rc == 2 + assert "explain_sample_size" in err + + +# --------------------------------------------------------------- parquet path + + +def test_transform_parquet_missing_engine_returns_exit_2(tmp_path, tabular_csv, monkeypatch): + """When pyarrow/fastparquet is missing, the CLI should surface a clean + user-facing dependency error (exit 2) rather than the generic exit 1 + backstop. + """ + import pandas as pd + + def _raise_import_error(self, *args, **kwargs): # noqa: ANN001 + raise ImportError("Missing optional dependency 'pyarrow' (simulated)") + + monkeypatch.setattr(pd.DataFrame, "to_parquet", _raise_import_error, raising=True) + + out_path = tmp_path / "out.parquet" + rc, _, err = _run( + [ + "transform", + "--input", + str(tabular_csv), + "--output", + str(out_path), + "--target", + "y", + "--max-features", + "5", + ] + ) + assert rc == 2 + assert "parquet engine" in err.lower() + + +def test_transform_read_parquet_missing_engine_returns_exit_2(tmp_path, tabular_csv, monkeypatch): + """Symmetric coverage for reading a .parquet input when no engine is installed. + + The CLI must convert the ``ImportError`` from ``pd.read_parquet`` into + the deterministic exit-2 path (with a user-facing install hint), + just like the write path. + """ + import pandas as pd + + # Make sure the input path has a .parquet suffix so format detection picks parquet. + fake_pq = tmp_path / "fake.parquet" + fake_pq.write_bytes(b"") # contents don't matter; we'll intercept read_parquet + + def _raise_import_error(*args, **kwargs): + raise ImportError("Missing optional dependency 'pyarrow' (simulated)") + + monkeypatch.setattr(pd, "read_parquet", _raise_import_error, raising=True) + + rc, _, err = _run( + [ + "transform", + "--input", + str(fake_pq), + "--output", + str(tmp_path / "out.csv"), + "--target", + "y", + ] + ) + assert rc == 2 + assert "parquet engine" in err.lower() + + +def test_parquet_engine_available_returns_false_when_neither_installed(monkeypatch): + """When ``__import__`` raises ``ImportError`` for both engines, the + function reports parquet as unavailable. + """ + import builtins + + real_import = builtins.__import__ + + def fake_import(name, *args, **kwargs): + if name in ("pyarrow", "fastparquet"): + raise ImportError(f"No module named '{name}' (simulated)") + return real_import(name, *args, **kwargs) + + monkeypatch.setattr(builtins, "__import__", fake_import) + assert fc_cli._parquet_engine_available() is False + + +def test_parquet_engine_available_returns_true_for_fastparquet_only(monkeypatch): + """Even without pyarrow, importing fastparquet must report parquet as available.""" + import builtins + + real_import = builtins.__import__ + + def fake_import(name, *args, **kwargs): + if name == "pyarrow": + raise ImportError("No module named 'pyarrow' (simulated)") + if name == "fastparquet": + # Simulate a successful import by short-circuiting; we don't + # actually need a real module object, just a non-raising return. + class _FakeModule: + pass + + return _FakeModule() + return real_import(name, *args, **kwargs) + + monkeypatch.setattr(builtins, "__import__", fake_import) + assert fc_cli._parquet_engine_available() is True + + +def test_parquet_engine_available_returns_false_for_broken_native_install(monkeypatch): + """A distribution that's on sys.path but raises a non-ImportError at + import time (e.g. broken native bindings) is reported as unavailable. + Using ``__import__`` (rather than ``importlib.util.find_spec``) is what + makes this honest: ``find_spec`` would have returned a spec and lied. + """ + import builtins + + real_import = builtins.__import__ + + def fake_import(name, *args, **kwargs): + if name in ("pyarrow", "fastparquet"): + # Simulate a broken native install (loader-level failure). + raise OSError("broken native install: undefined symbol (simulated)") + return real_import(name, *args, **kwargs) + + monkeypatch.setattr(builtins, "__import__", fake_import) + assert fc_cli._parquet_engine_available() is False + + +def test_unreadable_config_returns_exit_2(tmp_path, tabular_csv, monkeypatch): + """An ``OSError`` while opening the config (permission denied, broken + symlink, etc.) is converted into the deterministic exit-2 path. + """ + cfg = tmp_path / "cfg.json" + cfg.write_text("{}") + + real_open = Path.open + + def _raise_oserror(self, *args, **kwargs): + if self == cfg: + raise PermissionError("simulated read failure") + return real_open(self, *args, **kwargs) + + monkeypatch.setattr(Path, "open", _raise_oserror, raising=True) + + rc, _, err = _run( + [ + "transform", + "--input", + str(tabular_csv), + "--output", + str(tmp_path / "out.csv"), + "--target", + "y", + "--config", + str(cfg), + ] + ) + assert rc == 2 + assert "could not be read" in err.lower() + + +def test_explain_sample_size_handles_non_unique_index(tmp_path: Path): + """Sampling must keep X and y aligned even when the input frame has a + non-unique index — e.g. a parquet read that preserves a saved index + where labels can repeat. Positional sampling (``.iloc``) avoids the + label-based ``.loc`` expansion / reordering bug. + """ + pytest.importorskip("pyarrow") # parquet write needs an engine + + rng = np.random.default_rng(0) + n = 4000 + df = pd.DataFrame( + { + "x1": rng.normal(size=n), + "x2": rng.normal(size=n), + "y": rng.integers(0, 2, size=n), + } + ) + # Force a non-unique index — labels repeat (each label appears twice). + df.index = pd.Index([i // 2 for i in range(n)], name="duplicated_index") + in_path = tmp_path / "non_unique.parquet" + df.to_parquet(in_path, index=True) + + rc, out, err = _run( + [ + "explain", + "--input", + str(in_path), + "--target", + "y", + "--explain-sample-size", + "100", + ] + ) + assert rc == 0, err + payload = json.loads(out) + assert payload["status"] == "ok" + # Sample size must be honored exactly, not expanded by ``.loc``-with- + # duplicate-labels behavior. + assert payload["n_rows_used"] == 100 + + +def test_include_target_collision_error_text_lists_only_actionable_options( + tmp_path: Path, tabular_csv: Path, monkeypatch +): + """The error text emitted when ``--include-target`` would overwrite an + engineered feature must only suggest actions that are actually + possible from this command. The CLI does not offer auto-rename, so + the message must NOT mention "rename and retry" or any other phantom + option. + """ + from featcopilot.transformers.sklearn_compat import AutoFeatureEngineer + + real_fit_transform = AutoFeatureEngineer.fit_transform + + def _patched_fit_transform(self, X, y=None, **kwargs): + result = real_fit_transform(self, X, y, **kwargs) + result = result.copy() + result["y"] = result.iloc[:, 0] + return result + + monkeypatch.setattr(AutoFeatureEngineer, "fit_transform", _patched_fit_transform) + + rc, _, err = _run( + [ + "transform", + "--input", + str(tabular_csv), + "--output", + str(tmp_path / "out.csv"), + "--target", + "y", + "--include-target", + "--max-features", + "5", + ] + ) + assert rc == 2 + # Must mention the real options. + assert "rename the target column" in err.lower() + assert "drop --include-target" in err + # Must NOT mention non-existent CLI options. + assert "accept the rename" not in err.lower() + assert "retry" not in err.lower() + + +# ----------------------- explain --explain-sample-size memory bound + + +def test_explain_sample_size_bounds_csv_read_with_nrows(tmp_path: Path, monkeypatch): + """``--explain-sample-size N`` must propagate to ``pd.read_csv`` as + ``nrows=N`` so the underlying read is memory-bounded for huge CSV + inputs (rather than fully loading the file and then trimming). + """ + import pandas as pd + + rng = np.random.default_rng(0) + n = 5000 + df = pd.DataFrame( + { + "x1": rng.normal(size=n), + "x2": rng.normal(size=n), + "y": rng.integers(0, 2, size=n), + } + ) + in_path = tmp_path / "big.csv" + df.to_csv(in_path, index=False) + + real_read_csv = pd.read_csv + captured_kwargs: list[dict] = [] + + def _spy_read_csv(*args, **kwargs): + captured_kwargs.append(kwargs.copy()) + return real_read_csv(*args, **kwargs) + + monkeypatch.setattr(pd, "read_csv", _spy_read_csv, raising=True) + + rc, out, err = _run( + [ + "explain", + "--input", + str(in_path), + "--target", + "y", + "--explain-sample-size", + "200", + ] + ) + assert rc == 0, err + payload = json.loads(out) + assert payload["n_rows_used"] == 200 + # Must have called pd.read_csv with nrows=201 (sample_size + 1, the + # CLI requests one extra row so it can detect whether the input was + # actually larger than the cap and only emit the metadata-may-differ + # warning when truncation really happened). The full 5000-row file + # is never loaded. + explain_reads = [k for k in captured_kwargs if k.get("nrows") == 201] + assert explain_reads, f"expected pd.read_csv to be called with nrows=201; got {captured_kwargs!r}" + + +def test_explain_sample_size_warns_post_read_for_parquet(tmp_path: Path): + """For parquet inputs, pandas has no native row-limit, so the bound + is applied post-read. The CLI must surface a warning describing the + limitation so callers know memory isn't strictly bounded. The + warning is emitted by ``_cmd_explain`` itself (not duplicated by + ``_read_table``) so the user sees one accurate message. + """ + pytest.importorskip("pyarrow") + rng = np.random.default_rng(0) + n = 4000 + df = pd.DataFrame( + { + "x1": rng.normal(size=n), + "x2": rng.normal(size=n), + "y": rng.integers(0, 2, size=n), + } + ) + in_path = tmp_path / "big.parquet" + df.to_parquet(in_path, index=False) + + rc, out, err = _run( + [ + "explain", + "--input", + str(in_path), + "--target", + "y", + "--explain-sample-size", + "100", + ] + ) + assert rc == 0, err + payload = json.loads(out) + assert payload["n_rows_used"] == 100 + # The post-read truncation notice must appear in the captured warnings. + # The unified message says: "For parquet, pandas does not expose a + # native row-limit, so the full file ... was loaded into memory before + # truncation." + captured = " ".join(payload["warnings"]).lower() + assert "native row-limit" in captured or "post-read" in captured or "memory before truncation" in captured + # The user-facing message uses the actual sample_size (100), NOT the + # internal +1 read size, AND there is exactly one truncation notice. + assert "100 rows" in " ".join(payload["warnings"]) + truncation_msgs = [w for w in payload["warnings"] if "truncat" in w.lower() or "capping" in w.lower()] + assert len(truncation_msgs) == 1, f"expected exactly one truncation notice, got {truncation_msgs!r}" + + +# ----------------------- strict per-thread capture isolation + + +def test_capture_does_not_route_unrelated_thread_records(): + """The capture layer must use STRICT per-thread routing for non-LLM + records: records emitted on threads other than the one that opened + a capture flow through the normal handler chain (and reach stderr) + — they are NOT silently rolled into the single in-flight CLI run's + payload. + + A previous "single-active-capture fallback" was too broad: when a + single CLI run was active, *any* featcopilot log on any thread + would have been swallowed into that command's payload, including + unrelated background work, causing misattribution. This test + guards against that regression for the non-LLM case (the narrow + LLM-only fallback is covered separately). + """ + import threading + + fc_logger = logging.getLogger("featcopilot.test_unrelated") + + with fc_cli._capture_featcopilot_messages() as captured: + # Caller emits on its own thread (must be captured). + fc_logger.warning("from-caller") + + # Spawn a separate, unrelated thread that ALSO emits via a + # NON-LLM featcopilot logger. With strict per-thread isolation + # for non-LLM records, that record must NOT appear in this + # capture's payload. + def _emit_elsewhere(): + fc_logger.warning("from-other-thread") + + t = threading.Thread(target=_emit_elsewhere) + t.start() + t.join() + + assert any("from-caller" in m for m in captured) + # Strict per-thread isolation for non-LLM records: unrelated thread's + # record is NOT in this capture's payload. + assert not any("from-other-thread" in m for m in captured) + + +def test_capture_routes_llm_client_worker_records_to_single_active_capture(): + """The narrow LLM-client fallback: when a record originates from one + of the *whitelisted* sync LLM client modules + (``featcopilot.llm.copilot_client`` / ``litellm_client`` / + ``openai_client``) and exactly one capture is active, the record + is routed to that capture even when emitted from a worker thread. + + This addresses the common case where an LLM sync client wrapping + ``ThreadPoolExecutor`` (the fallback used in event-loop + environments) emits a mock-mode startup warning on a worker thread + that ``submit()`` spawned. Without the narrow fallback, that + warning would bleed onto stderr on a successful run. + """ + import threading + from concurrent.futures import ThreadPoolExecutor + + # Use an actual whitelisted sync-client module name; an arbitrary + # ``featcopilot.llm.*`` name (e.g. ``test_client``) is intentionally + # NOT eligible — see ``test_capture_does_not_apply_llm_fallback_for_non_whitelisted_llm_loggers``. + llm_logger = logging.getLogger("featcopilot.llm.openai_client") + + def _emit_llm_in_worker(): + llm_logger.warning("llm-mock-mode-startup") + return "ok" + + with fc_cli._capture_featcopilot_messages() as captured: + # Caller emits its own LLM record (current-thread path). + llm_logger.warning("llm-from-caller") + # ThreadPoolExecutor worker emits an LLM record (cross-thread, + # but the narrow LLM-only fallback should route it). + with ThreadPoolExecutor(max_workers=1) as pool: + assert pool.submit(_emit_llm_in_worker).result(timeout=5) == "ok" + # A raw threading.Thread emits an LLM record too. + t = threading.Thread(target=_emit_llm_in_worker) + t.start() + t.join() + + # Caller's record + 2 worker records (one from pool, one from thread) + # are all in the capture. + assert any("llm-from-caller" in m for m in captured) + assert sum(1 for m in captured if "llm-mock-mode-startup" in m) >= 2 + + +def test_capture_does_not_apply_llm_fallback_for_non_whitelisted_llm_loggers(): + """The narrow LLM-client fallback whitelist is an *exact* set of + sync-client module names — NOT a ``featcopilot.llm.*`` prefix. + Other ``featcopilot.llm.*`` loggers (e.g. ``semantic_engine``, + ``code_generator``, ``transform_rule_generator``, ``explainer``) + must keep strict per-thread isolation, so cross-thread records + from unrelated background work cannot be silently swallowed into + an active CLI capture. + """ + import threading + + non_whitelisted = [ + "featcopilot.llm.semantic_engine", + "featcopilot.llm.code_generator", + "featcopilot.llm.transform_rule_generator", + "featcopilot.llm.explainer", + "featcopilot.llm.test_dummy", # arbitrary subname — must NOT match + ] + captured_lists: list[list[str]] = [] + + for name in non_whitelisted: + other_logger = logging.getLogger(name) + + def _emit_in_other_thread(logger=other_logger, tag=name): + logger.warning(f"{tag}-from-other-thread") + + with fc_cli._capture_featcopilot_messages() as captured: + t = threading.Thread(target=_emit_in_other_thread) + t.start() + t.join() + captured_lists.append(list(captured)) + + for name, captured in zip(non_whitelisted, captured_lists, strict=True): + assert not any( + f"{name}-from-other-thread" in m for m in captured + ), f"Non-whitelisted LLM logger {name} unexpectedly tripped the cross-thread fallback" + + +def test_capture_does_not_apply_llm_fallback_with_multiple_captures(): + """When two captures are concurrently active, the narrow LLM + fallback stays disabled — strict per-thread isolation is preserved + so concurrent CLI calls don't cross-contaminate, even for LLM + records. + """ + import threading + from concurrent.futures import ThreadPoolExecutor + + llm_logger = logging.getLogger("featcopilot.llm.openai_client") + a_captured: list[str] = [] + b_captured: list[str] = [] + enter_barrier = threading.Barrier(2) + inside_barrier = threading.Barrier(2) + done_barrier = threading.Barrier(2) + + def worker(tag: str, target: list[str]): + # Phase 0: both threads start at roughly the same time. + enter_barrier.wait() + with fc_cli._capture_featcopilot_messages() as captured: + llm_logger.warning(f"{tag}-direct") + # Phase 1: BOTH threads have entered their captures, so + # ``_state._per_thread`` has TWO entries when either thread's + # worker fires below — that's the multi-capture scenario the + # narrow LLM fallback must skip. Without this barrier the + # threads race: one thread's worker can fire before the other + # has pushed its capture, making ``len == 1`` and (incorrectly + # for this test's intent) tripping the fallback. + inside_barrier.wait() + with ThreadPoolExecutor(max_workers=1) as pool: + pool.submit(lambda t=tag: llm_logger.warning(f"{t}-worker")).result(timeout=5) + # Phase 2: BOTH threads' workers have completed before either + # exits its capture. This pins ``len == 2`` for the entire + # worker-emit window. + done_barrier.wait() + target.extend(captured) + + t1 = threading.Thread(target=worker, args=("A", a_captured)) + t2 = threading.Thread(target=worker, args=("B", b_captured)) + t1.start() + t2.start() + t1.join() + t2.join() + + # Each capture sees its own direct record (current-thread path). + assert any("A-direct" in m for m in a_captured) + assert any("B-direct" in m for m in b_captured) + # The worker record is NOT in either capture (fallback disabled + # because two captures were active during the worker emit). + assert not any("worker" in m for m in a_captured) + assert not any("worker" in m for m in b_captured) + + +def test_capture_keeps_thread_isolation_with_multiple_active_captures(): + """The single-active-capture fallback must NOT activate when two + threads are concurrently capturing — each must see only its own + thread's records, not records emitted on the other thread's + workers. + """ + import threading + + fc_logger = logging.getLogger("featcopilot.test_dual") + a_captured: list[str] = [] + b_captured: list[str] = [] + barrier = threading.Barrier(2) + inside = threading.Event() + + def worker(tag: str, target: list[str]): + barrier.wait() + with fc_cli._capture_featcopilot_messages() as captured: + inside.set() + for i in range(10): + fc_logger.warning(f"{tag}-{i}") + target.extend(captured) + + t1 = threading.Thread(target=worker, args=("A", a_captured)) + t2 = threading.Thread(target=worker, args=("B", b_captured)) + t1.start() + t2.start() + t1.join() + t2.join() + + # Each capture must contain ONLY its own thread's records (no fallback + # cross-talk because two captures are active). + assert all("A-" in m for m in a_captured) + assert all("B-" in m for m in b_captured) + assert len(a_captured) == 10 + assert len(b_captured) == 10 + + +def test_capture_decision_is_cached_per_record_for_atomic_filter_emit(): + """The capture state must resolve each record's routing decision + *exactly once*, then cache the outcome on the record itself, so the + suppression filter and the routing handler always see the same + answer for that record. Otherwise a concurrent push/pop on another + thread could land between the filter (computed at handler-1 phase) + and the emit (computed at handler-2 phase), making the same record + both captured and emitted to stderr (or suppressed without being + captured) — breaking the CLI contract. + """ + state = fc_cli._ThreadCaptureState() + + class _CountingState: + """Wrap state so we can count ``get_for_llm_record`` calls.""" + + def __init__(self, inner): + self._inner = inner + self.calls: list[tuple[int, str]] = [] + + # Forward the attributes ``resolve_for_record`` reads. + @property + def _UNCACHED(self): + return self._inner._UNCACHED + + def get_for_llm_record(self, tid, name): + self.calls.append((tid, name)) + return self._inner.get_for_llm_record(tid, name) + + # Re-bind ``resolve_for_record`` so calls are counted via the + # wrapped ``get_for_llm_record``. + def resolve_for_record(self, record): + return fc_cli._ThreadCaptureState.resolve_for_record(self, record) + + counted = _CountingState(state) + + record = logging.LogRecord( + name="featcopilot.llm.openai_client", + level=logging.WARNING, + pathname=__file__, + lineno=1, + msg="hello", + args=(), + exc_info=None, + ) + + # First call computes and caches; subsequent calls must not hit + # ``get_for_llm_record`` again. + first = counted.resolve_for_record(record) + second = counted.resolve_for_record(record) + third = counted.resolve_for_record(record) + + assert first is second is third + assert len(counted.calls) == 1, ( + "resolve_for_record must compute the decision exactly once per record; " + f"saw {len(counted.calls)} get_for_llm_record calls" + ) + + # The cached attribute is set on the record itself. + assert hasattr(record, "_featcopilot_capture_target") + + +def test_capture_decision_stable_under_concurrent_pop_between_filter_and_emit(): + """Regression test for the atomic filter/emit invariant: even if a + concurrent thread pops its capture between the moment a record is + filtered and the moment it is emitted, both phases see the SAME + decision because it was resolved and cached on the record once. + """ + state = fc_cli._ThreadCaptureState() + cap_a: list[str] = [] + state.push(threading.get_ident() ^ 1, cap_a) # foreign-thread capture + try: + record = logging.LogRecord( + name="featcopilot.llm.copilot_client", + level=logging.WARNING, + pathname=__file__, + lineno=1, + msg="hi", + args=(), + exc_info=None, + ) + # Phase 1: "filter" computes and caches ("len(_per_thread)==1" + # so the LLM fallback returns ``cap_a``). + first = state.resolve_for_record(record) + assert first is cap_a + + # Concurrent pop: another thread tears its capture down. State + # would now produce a *different* answer for a fresh lookup. + state.pop(threading.get_ident() ^ 1) + fresh_lookup = state.get_for_llm_record(threading.get_ident(), record.name) + assert fresh_lookup is None # state has indeed changed + + # Phase 2: "emit" must still see the same decision via the cache. + second = state.resolve_for_record(record) + assert second is cap_a, ( + "After a concurrent pop, resolve_for_record must still return the " + "originally cached decision so filter and emit cannot disagree" + ) + finally: + # Clean up any stragglers. + state.pop(threading.get_ident() ^ 1) + + +def test_run_helper_redirects_featcopilot_stream_handlers(monkeypatch): + """Regression test for the test helper itself: ``_run`` must + redirect every ``logging.StreamHandler`` on the ``featcopilot`` + root logger so that any handler write that escapes the suppression + filter (the contract-violation scenario) lands in the captured + ``err`` buffer, NOT on the real terminal. + + Without this redirect, every ``err == ""`` assertion in this file + would be vacuously satisfied because the ``StreamHandler`` installed + at import time holds a reference to the *original* ``sys.stderr`` + object and ``redirect_stderr`` only swaps the module attribute. + """ + fc_logger = logging.getLogger("featcopilot") + stream_handlers = [h for h in fc_logger.handlers if isinstance(h, logging.StreamHandler)] + assert stream_handlers, "featcopilot logger must have at least one StreamHandler" + + # Stub ``cli.main`` to write directly through the StreamHandler's + # current ``stream`` attribute (which ``_run`` should have re-pointed + # at the captured ``err`` buffer). + def fake_main(argv): + for h in fc_logger.handlers: + if isinstance(h, logging.StreamHandler): + h.stream.write("HANDLER_LEAK_LINE\n") + h.stream.flush() + return 0 + + monkeypatch.setattr(fc_cli, "main", fake_main) + rc, _out, err = _run(["info"]) + assert rc == 0 + assert "HANDLER_LEAK_LINE" in err, ( + "_run must redirect featcopilot StreamHandler streams; otherwise stderr-cleanliness " "assertions are vacuous" + ) + # And the original stream is restored after the call. + for h in stream_handlers: + assert h.stream is sys.stderr or h.stream is sys.__stderr__ + + +# ----------------------- empty-input column-vs-row distinction + + +def test_transform_zero_columns_input_distinguishes_from_zero_rows(tmp_path: Path): + """``DataFrame.empty`` is ``True`` for both zero-row AND zero-column + frames. The CLI must distinguish: a JSON array of empty objects + ``[{}, {}, ...]`` is a zero-COLUMN input (the user has no feature + columns), not a zero-ROW input. The error message must point at + the actual problem so callers can take the right remediation. + """ + # JSON array of empty objects: pandas reads this as a frame with + # rows but no columns. + p = tmp_path / "empty_columns.json" + p.write_text("[{}, {}, {}]") + rc, _out, err = _run(["transform", "--input", str(p), "--output", str(tmp_path / "out.csv"), "--target", "y"]) + assert rc == 2 + assert "no columns" in err.lower(), err + assert "feature column" in err.lower(), err + assert "zero data rows" not in err.lower(), err + + +def test_transform_zero_rows_input_still_uses_zero_rows_message(tmp_path: Path): + """The zero-row case (header but no data) still surfaces the + distinct "zero data rows" wording so the two failure modes are + distinguishable in CLI output. + """ + p = tmp_path / "header_only.csv" + p.write_text("x1,x2,y\n") + rc, _out, err = _run(["transform", "--input", str(p), "--output", str(tmp_path / "out.csv"), "--target", "y"]) + assert rc == 2 + assert "zero data rows" in err.lower(), err + assert "no columns" not in err.lower(), err + + +# ----------------------- transform read/write warnings captured (not stderr) + + +def test_transform_read_warning_captured_not_on_stderr(tmp_path: Path, monkeypatch): + """``pd.read_csv`` can legitimately emit ``DtypeWarning`` on a + successful read with mixed-type columns. That warning must end up + in the JSON ``warnings`` field, NOT on stderr — the contract is + that successful runs keep stderr empty for agent callers. + """ + import pandas as _pd + + # Build a valid CSV input and a real fit_transform-able payload. + rng = np.random.default_rng(0) + df = pd.DataFrame( + { + "x1": rng.normal(size=50), + "x2": rng.integers(0, 5, size=50), + "y": rng.integers(0, 2, size=50), + } + ) + in_path = tmp_path / "in.csv" + df.to_csv(in_path, index=False) + out_path = tmp_path / "out.csv" + + # Patch ``pd.read_csv`` so that calling it emits a real Python + # ``warnings.warn`` (mirroring DtypeWarning on a successful read) + # while still returning the same DataFrame. + real_read_csv = _pd.read_csv + + def warning_emitting_read_csv(*a, **kw): + warnings.warn("pandas-mock-read-csv: DtypeWarning equivalent", UserWarning, stacklevel=2) + return real_read_csv(*a, **kw) + + monkeypatch.setattr(_pd, "read_csv", warning_emitting_read_csv) + + rc, out, err = _run( + [ + "transform", + "--input", + str(in_path), + "--output", + str(out_path), + "--target", + "y", + "--no-selection", + "--json", + ] + ) + assert rc == 0, err + assert err == "", f"read-time warning leaked to stderr: {err!r}" + payload = json.loads(out) + assert any("pandas-mock-read-csv" in w for w in payload["warnings"]), payload["warnings"] + + +def test_transform_write_warning_captured_not_on_stderr(tmp_path: Path, monkeypatch, tabular_csv: Path): + """Pandas/pyarrow can legitimately emit ``FutureWarning`` / + ``UserWarning`` during ``DataFrame.to_csv`` / ``to_parquet`` / + ``to_json`` on a successful write. Those warnings must end up in + the JSON ``warnings`` field, NOT on stderr. + """ + out_path = tmp_path / "out.csv" + + # Patch ``DataFrame.to_csv`` so calling it emits a warning while + # still actually writing the file. + real_to_csv = pd.DataFrame.to_csv + + def warning_emitting_to_csv(self, *a, **kw): + warnings.warn("pandas-mock-to-csv: FutureWarning equivalent", FutureWarning, stacklevel=2) + return real_to_csv(self, *a, **kw) + + monkeypatch.setattr(pd.DataFrame, "to_csv", warning_emitting_to_csv) + + rc, out, err = _run( + [ + "transform", + "--input", + str(tabular_csv), + "--output", + str(out_path), + "--target", + "y", + "--no-selection", + "--json", + ] + ) + assert rc == 0, err + assert err == "", f"write-time warning leaked to stderr: {err!r}" + payload = json.loads(out) + assert any("pandas-mock-to-csv" in w for w in payload["warnings"]), payload["warnings"] + + +# ----------------------- explain captures explain_features warnings + + +def test_explain_features_warnings_captured_not_on_stderr(tmp_path: Path, monkeypatch, tabular_csv: Path): + """``explain_features`` / ``get_feature_code`` are now inside the + same capture as the read + ``fit_transform``, so any warning they + emit goes to the JSON ``warnings`` field, not stderr. + """ + from featcopilot.transformers import sklearn_compat as _sc + + real_explain = _sc.AutoFeatureEngineer.explain_features + + def warning_emitting_explain(self): + warnings.warn("explain-features-mock-warning", UserWarning, stacklevel=2) + return real_explain(self) + + monkeypatch.setattr(_sc.AutoFeatureEngineer, "explain_features", warning_emitting_explain) + + rc, out, err = _run(["explain", "--input", str(tabular_csv), "--target", "y"]) + assert rc == 0, err + assert err == "", f"explain_features warning leaked to stderr: {err!r}" + payload = json.loads(out) + assert any("explain-features-mock-warning" in w for w in payload["warnings"]), payload["warnings"] + + +# ----------------------- explain --explain-sample-size warning hygiene + + +def test_explain_no_sampling_warning_when_input_fits_exactly(tmp_path: Path): + """When the input has exactly ``--explain-sample-size`` rows, no + truncation actually happens, so the "metadata may differ" warning + must NOT fire. The success payload was previously inaccurate when + the warning fired on the boundary case. + """ + rng = np.random.default_rng(0) + n = 200 # exactly the sample-size we'll request + df = pd.DataFrame( + { + "x1": rng.normal(size=n), + "x2": rng.normal(size=n), + "y": rng.integers(0, 2, size=n), + } + ) + in_path = tmp_path / "exact.csv" + df.to_csv(in_path, index=False) + + rc, out, err = _run( + [ + "explain", + "--input", + str(in_path), + "--target", + "y", + "--explain-sample-size", + "200", + ] + ) + assert rc == 0, err + payload = json.loads(out) + assert payload["n_rows_used"] == 200 + # No "metadata may differ" warning — input fit naturally. + assert not any("capping input" in w.lower() or "metadata may differ" in w.lower() for w in payload["warnings"]) + + +def test_explain_no_sampling_warning_when_input_smaller_than_sample(tmp_path: Path): + """When the input has fewer rows than ``--explain-sample-size``, + obviously no truncation happens. Belt-and-suspenders coverage of + the "<= cap, no warning" branch. + """ + rng = np.random.default_rng(0) + n = 50 + df = pd.DataFrame( + { + "x1": rng.normal(size=n), + "x2": rng.normal(size=n), + "y": rng.integers(0, 2, size=n), + } + ) + in_path = tmp_path / "small.csv" + df.to_csv(in_path, index=False) + + rc, out, err = _run( + [ + "explain", + "--input", + str(in_path), + "--target", + "y", + "--explain-sample-size", + "200", + ] + ) + assert rc == 0, err + payload = json.loads(out) + assert payload["n_rows_used"] == n + assert not any("capping input" in w.lower() or "metadata may differ" in w.lower() for w in payload["warnings"]) + + +def test_explain_sampling_warning_fires_when_input_strictly_larger(tmp_path: Path): + """Strict proof of truncation: input has at least one MORE row than + the cap. The warning must fire, and the payload must report + ``n_rows_used == sample_size``. + """ + rng = np.random.default_rng(0) + n = 201 # exactly one more than the cap + df = pd.DataFrame( + { + "x1": rng.normal(size=n), + "x2": rng.normal(size=n), + "y": rng.integers(0, 2, size=n), + } + ) + in_path = tmp_path / "barely_over.csv" + df.to_csv(in_path, index=False) + + rc, out, err = _run( + [ + "explain", + "--input", + str(in_path), + "--target", + "y", + "--explain-sample-size", + "200", + ] + ) + assert rc == 0, err + payload = json.loads(out) + assert payload["n_rows_used"] == 200 + assert any("capping input" in w.lower() for w in payload["warnings"]) + + +def test_explain_sample_size_help_text_describes_head_slice_not_random_seed(): + """The ``--explain-sample-size`` help text must accurately describe + the actual semantics (deterministic head slice, NOT a seeded random + sample). Guards against misleading users / agents who would expect + an unbiased sample. + """ + parser = fc_cli._build_parser() + explain_parser = next( + action.choices["explain"] for action in parser._actions if isinstance(action, argparse._SubParsersAction) + ) + sample_help = next(a.help for a in explain_parser._actions if "--explain-sample-size" in a.option_strings) + # Must accurately describe the implementation. + assert "head slice" in sample_help.lower() or "first n" in sample_help.lower() + # Must NOT use the misleading old phrasing. + assert "deterministic seed" not in sample_help.lower() + assert "random sample" not in sample_help.lower() or "not a random sample" in sample_help.lower() + + +# ----------------------- python -m + + +def test_dunder_main_module_runs(monkeypatch, capsys): + """``cli.main`` is invoked via the same code path as ``python -m featcopilot``.""" + monkeypatch.setattr(sys, "argv", ["featcopilot", "info", "--json"]) + rc = fc_cli.main(["info", "--json"]) + assert rc == 0 + + +def test_dunder_main_subprocess_invocation(): + """``python -m featcopilot info --json`` must succeed in a real subprocess. + + Exercises ``featcopilot/__main__.py`` end-to-end so a regression in + module-form invocation (e.g. a broken import path) actually breaks the + test, not just the unit-level call to ``cli.main``. + """ + import subprocess + + result = subprocess.run( + [sys.executable, "-m", "featcopilot", "info", "--json"], + capture_output=True, + text=True, + timeout=60, + check=False, + ) + assert result.returncode == 0, result.stderr + payload = json.loads(result.stdout) + assert payload["version"] == __version__ + assert "tabular" in payload["supported_engines"] + + +def test_dunder_main_subprocess_version_flag(): + """``python -m featcopilot --version`` must print and exit 0.""" + import subprocess + + result = subprocess.run( + [sys.executable, "-m", "featcopilot", "--version"], + capture_output=True, + text=True, + timeout=30, + check=False, + ) + assert result.returncode == 0, result.stderr + assert __version__ in result.stdout + + +# ------------------------------------------------------- console script + + +def _featcopilot_package_is_installed() -> bool: + """Return True iff the ``featcopilot`` distribution is installed in the + current environment (i.e. the entry-point machinery should have placed + the console script on ``PATH``). + + Used by the console-script tests to distinguish two cases: + + * Running tests directly against the source tree (``python -m pytest`` + from a clean checkout, no ``pip install -e .``): the package is + *not* installed; the script is legitimately missing and the test + should ``skip`` rather than report a packaging bug. + * Running tests after ``pip install`` (the CI flow): the package IS + installed, so the script MUST be on ``PATH``. If it isn't, that's a + real ``[project.scripts]`` regression and the test should ``fail``, + not silently pass via skip. + """ + try: + from importlib.metadata import PackageNotFoundError, distribution + except ImportError: # pragma: no cover - py3.10+ always has this + return False + try: + distribution("featcopilot") + except PackageNotFoundError: + return False + return True + + +def test_console_script_subprocess_invocation(): + """The installed ``featcopilot`` console script must be on PATH and runnable. + + Exercises the ``[project.scripts] featcopilot = "featcopilot.cli:main"`` + entry point end-to-end so a typo or packaging regression in + ``pyproject.toml`` would actually break the suite. When the + ``featcopilot`` distribution is installed, the script must be on + ``PATH``: a missing script in that case is a real packaging + regression, not a test environment quirk, so we ``fail`` (not + ``skip``). The skip is reserved for the rare case of running tests + against an un-installed source tree. + """ + import shutil + import subprocess + + script = shutil.which("featcopilot") + if script is None: + if _featcopilot_package_is_installed(): + pytest.fail( + "featcopilot package is installed but the `featcopilot` console " + "script is missing from PATH. This is a `[project.scripts]` " + "regression in pyproject.toml." + ) + pytest.skip( + "featcopilot package is not installed in this environment; install " + "it with `pip install -e .` to exercise the console-script entry point." + ) + + result = subprocess.run( + [script, "info", "--json"], + capture_output=True, + text=True, + timeout=60, + check=False, + ) + assert result.returncode == 0, result.stderr + payload = json.loads(result.stdout) + assert payload["version"] == __version__ + assert "tabular" in payload["supported_engines"] + + +def test_console_script_version_flag(): + """Same install-aware skip/fail policy as + :func:`test_console_script_subprocess_invocation`. + """ + import shutil + import subprocess + + script = shutil.which("featcopilot") + if script is None: + if _featcopilot_package_is_installed(): + pytest.fail( + "featcopilot package is installed but the `featcopilot` console " + "script is missing from PATH. This is a `[project.scripts]` " + "regression in pyproject.toml." + ) + pytest.skip("featcopilot package is not installed in this environment.") + + result = subprocess.run( + [script, "--version"], + capture_output=True, + text=True, + timeout=30, + check=False, + ) + assert result.returncode == 0, result.stderr + assert __version__ in result.stdout