From 3f0b6616f49067ba03774b62ae0eee1e11283063 Mon Sep 17 00:00:00 2001 From: Li Jiang Date: Sun, 3 May 2026 14:05:53 +0800 Subject: [PATCH 01/30] feat(cli): add featcopilot command-line interface for agentic usage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduces a stable, agent-friendly CLI exposed via the `featcopilot` console script (and `python -m featcopilot`). All subcommands accept `--json` for machine-readable stdout; user-facing errors are written to stderr with a non-zero exit code so agents (e.g. Copilot tool-use, shell scripts, CI pipelines) can parse failures deterministically. Subcommands: * `info` — print version, supported engines, selection methods, leakage guards, and supported I/O formats. * `transform` — read CSV / Parquet / JSON, run AutoFeatureEngineer, write engineered features. Supports `--config` JSON, `--engines`, `--max-features`, `--selection-methods`, `--correlation-threshold`, `--leakage-guard`, `--gate-n-jobs`, `--no-selection`, `--include-target`, and explicit `--input-format` / `--output-format` overrides. Emits a JSON status payload (rows, features, engines, selection_applied, ...) when `--json` is set. * `explain` — fit AutoFeatureEngineer and print a JSON document with `{name, explanation, code}` per feature so an LLM can consume the result directly. Files: * `featcopilot/cli.py` — argparse-based CLI (no new dependencies). * `featcopilot/__main__.py` — enables `python -m featcopilot`. * `pyproject.toml` — `[project.scripts]` entry point. * `tests/test_cli.py` — 18 tests covering info/transform/explain, CSV/Parquet/JSON round-trips, `--config` handling and override precedence, and all user-facing error paths. * `README.md` — new "Command-Line Interface" section. Coverage: featcopilot/cli.py at 94 %; project total 88.95 % (`--cov-fail-under=85`). 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- README.md | 26 +++ featcopilot/__main__.py | 6 + featcopilot/cli.py | 452 ++++++++++++++++++++++++++++++++++++++++ pyproject.toml | 3 + tests/test_cli.py | 397 +++++++++++++++++++++++++++++++++++ 5 files changed, 884 insertions(+) create mode 100644 featcopilot/__main__.py create mode 100644 featcopilot/cli.py create mode 100644 tests/test_cli.py diff --git a/README.md b/README.md index e4f3d4f..157215b 100644 --- a/README.md +++ b/README.md @@ -110,6 +110,32 @@ for feature, explanation in engineer.explain_features().items(): print(f"{feature}: {explanation}") ``` +## Command-Line Interface + +FeatCopilot ships a `featcopilot` CLI for shell, scripting, and agentic +(LLM tool-use) workflows — no Python glue required. All subcommands accept +`--json` for machine-readable stdout; errors are written to stderr with a +non-zero exit code so agents can parse failures deterministically. + +```bash +# Discover capabilities (engines, selection methods, I/O formats) +featcopilot info --json + +# Run feature engineering on a CSV / Parquet / JSON file +featcopilot transform \ + --input data.csv --target label --output features.parquet \ + --engines tabular --max-features 50 --json + +# Inspect generated features (name, explanation, code) as JSON for an LLM +featcopilot explain --input data.csv --target label + +# Equivalent module form +python -m featcopilot info --json +``` + +Pass `--config config.json` to provide nested keys such as `llm_config`; +explicit CLI flags override values from the config file. 
+ ## Engines ### Tabular Engine diff --git a/featcopilot/__main__.py b/featcopilot/__main__.py new file mode 100644 index 0000000..0cce4e0 --- /dev/null +++ b/featcopilot/__main__.py @@ -0,0 +1,6 @@ +"""Enable ``python -m featcopilot`` to dispatch to the CLI.""" + +from featcopilot.cli import main + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/featcopilot/cli.py b/featcopilot/cli.py new file mode 100644 index 0000000..89fcdf4 --- /dev/null +++ b/featcopilot/cli.py @@ -0,0 +1,452 @@ +""" +FeatCopilot command-line interface. + +Provides a stable, agent-friendly CLI for invoking FeatCopilot from shells, +notebooks, agentic workflows (e.g. Copilot/LLM tool-use), and CI pipelines +without writing Python glue code. + +Subcommands +----------- +info + Print version and supported engines/methods. Always machine-readable + when ``--json`` is passed. +transform + Run :class:`featcopilot.AutoFeatureEngineer` on a tabular input file + (CSV / Parquet / JSON) and write engineered features to an output file. + Emits a JSON status line on stdout when ``--json`` is passed so that + agents can parse the result deterministically. +explain + Fit the engineer and print a JSON document describing each generated + feature (name, explanation, code) for downstream LLM consumption. 
+ +Examples +-------- +Agentic usage (machine-readable result on stdout, errors on stderr):: + + featcopilot info --json + featcopilot transform \\ + --input data.csv --target label --output features.parquet \\ + --engines tabular --max-features 50 --json + featcopilot explain --input data.csv --target label --json + +Equivalent module invocation:: + + python -m featcopilot info --json +""" + +from __future__ import annotations + +import argparse +import json +import sys +from pathlib import Path +from typing import Any + +from featcopilot import __version__ +from featcopilot.transformers.sklearn_compat import AutoFeatureEngineer +from featcopilot.utils.logger import get_logger + +logger = get_logger(__name__) + +SUPPORTED_INPUT_FORMATS = ("csv", "parquet", "json") +SUPPORTED_OUTPUT_FORMATS = ("csv", "parquet", "json") + + +def _detect_format(path: Path, override: str | None) -> str: + """Return one of ``SUPPORTED_INPUT_FORMATS`` for ``path``. + + Parameters + ---------- + path : pathlib.Path + File path whose suffix is inspected when ``override`` is ``None``. + override : str or None + Explicit format override (``csv`` / ``parquet`` / ``json``). + + Raises + ------ + ValueError + If the format cannot be determined or is not supported. + """ + if override is not None: + fmt = override.lower() + if fmt not in SUPPORTED_INPUT_FORMATS: + raise ValueError( + f"Unsupported format {override!r}; expected one of {SUPPORTED_INPUT_FORMATS}" + ) + return fmt + + suffix = path.suffix.lower().lstrip(".") + aliases = {"pq": "parquet", "parq": "parquet"} + fmt = aliases.get(suffix, suffix) + if fmt not in SUPPORTED_INPUT_FORMATS: + raise ValueError( + f"Cannot infer format from extension {path.suffix!r}; " + f"pass --input-format / --output-format (one of {SUPPORTED_INPUT_FORMATS})." 
+ ) + return fmt + + +def _read_table(path: Path, fmt: str): + """Read a tabular file into a pandas DataFrame.""" + import pandas as pd + + if fmt == "csv": + return pd.read_csv(path) + if fmt == "parquet": + return pd.read_parquet(path) + if fmt == "json": + # ``orient='records'`` is the agent-friendly default; fall back to + # pandas' auto-detection when the file isn't a records list. + try: + return pd.read_json(path, orient="records") + except ValueError: + return pd.read_json(path) + raise ValueError(f"Unsupported input format: {fmt}") + + +def _write_table(df, path: Path, fmt: str) -> None: + """Write a pandas DataFrame to ``path`` in ``fmt``.""" + path.parent.mkdir(parents=True, exist_ok=True) + + if fmt == "csv": + df.to_csv(path, index=False) + elif fmt == "parquet": + df.to_parquet(path, index=False) + elif fmt == "json": + df.to_json(path, orient="records", indent=2) + else: + raise ValueError(f"Unsupported output format: {fmt}") + + +def _load_config(config_path: str | None) -> dict[str, Any]: + """Load a JSON config file (or return an empty dict).""" + if config_path is None: + return {} + path = Path(config_path) + if not path.exists(): + raise FileNotFoundError(f"Config file not found: {config_path}") + with path.open("r", encoding="utf-8") as fh: + data = json.load(fh) + if not isinstance(data, dict): + raise ValueError( + f"Config file {config_path!r} must contain a JSON object at the top level" + ) + return data + + +def _emit(payload: dict[str, Any], *, as_json: bool, stream=None) -> None: + """Emit a payload to stdout, JSON-encoded when ``as_json`` is true.""" + stream = stream if stream is not None else sys.stdout + if as_json: + stream.write(json.dumps(payload, default=str, sort_keys=True)) + stream.write("\n") + else: + for key, value in payload.items(): + stream.write(f"{key}: {value}\n") + stream.flush() + + +def _build_engineer(args: argparse.Namespace) -> AutoFeatureEngineer: + """Construct an :class:`AutoFeatureEngineer` from parsed CLI 
args. + + Precedence: explicit CLI flags override values from ``--config``. + """ + config = _load_config(args.config) + + def pick(flag_value, config_key, default): + if flag_value is not None: + return flag_value + return config.get(config_key, default) + + engines = pick(args.engines, "engines", None) or ["tabular"] + selection_methods = pick(args.selection_methods, "selection_methods", None) or [ + "mutual_info", + "importance", + ] + max_features = pick(args.max_features, "max_features", None) + correlation_threshold = pick(args.correlation_threshold, "correlation_threshold", 0.85) + leakage_guard = pick(args.leakage_guard, "leakage_guard", "warn") + gate_n_jobs = pick(args.gate_n_jobs, "gate_n_jobs", 1) + llm_config = config.get("llm_config", {}) or {} + verbose = bool(pick(args.verbose, "verbose", False)) + + return AutoFeatureEngineer( + engines=list(engines), + max_features=max_features, + selection_methods=list(selection_methods), + correlation_threshold=correlation_threshold, + llm_config=llm_config, + verbose=verbose, + leakage_guard=leakage_guard, + gate_n_jobs=gate_n_jobs, + ) + + +def _split_xy(df, target: str | None): + """Split a DataFrame into ``(X, y)``; ``y`` is ``None`` when no target.""" + if target is None: + return df, None + if target not in df.columns: + raise ValueError( + f"Target column {target!r} not found in input. " + f"Available columns: {list(df.columns)[:20]}{'...' 
if len(df.columns) > 20 else ''}" + ) + y = df[target] + X = df.drop(columns=[target]) + return X, y + + +def _cmd_info(args: argparse.Namespace) -> int: + """Print version + supported engines/methods.""" + payload = { + "version": __version__, + "supported_engines": sorted(AutoFeatureEngineer.SUPPORTED_ENGINES), + "supported_selection_methods": sorted(AutoFeatureEngineer.SUPPORTED_SELECTION_METHODS), + "supported_leakage_guards": sorted(AutoFeatureEngineer.SUPPORTED_LEAKAGE_GUARDS), + "supported_input_formats": list(SUPPORTED_INPUT_FORMATS), + "supported_output_formats": list(SUPPORTED_OUTPUT_FORMATS), + } + _emit(payload, as_json=args.json) + return 0 + + +def _cmd_transform(args: argparse.Namespace) -> int: + """Read input, fit/transform, write output.""" + input_path = Path(args.input) + if not input_path.exists(): + raise FileNotFoundError(f"Input file not found: {args.input}") + output_path = Path(args.output) + + in_fmt = _detect_format(input_path, args.input_format) + out_fmt = _detect_format(output_path, args.output_format) + + df = _read_table(input_path, in_fmt) + X, y = _split_xy(df, args.target) + + engineer = _build_engineer(args) + transformed = engineer.fit_transform( + X, + y, + task_description=args.task_description or "prediction task", + target_name=args.target, + apply_selection=not args.no_selection, + ) + + if args.include_target and y is not None: + # Re-attach the target column so downstream training scripts can + # consume the engineered file as a single artifact. 
+ target_name = args.target if args.target in df.columns else "target" + transformed = transformed.copy() + transformed[target_name] = y.values + + _write_table(transformed, output_path, out_fmt) + + payload = { + "status": "ok", + "input": str(input_path), + "output": str(output_path), + "input_format": in_fmt, + "output_format": out_fmt, + "n_rows": int(transformed.shape[0]), + "n_features": int(transformed.shape[1]), + "n_input_columns": int(X.shape[1]), + "n_generated_features": len(engineer.get_feature_names()), + "engines": list(engineer.engines), + "selection_methods": list(engineer.selection_methods), + "max_features": engineer.max_features, + "target": args.target, + "selection_applied": engineer._selector is not None, + } + _emit(payload, as_json=args.json) + return 0 + + +def _cmd_explain(args: argparse.Namespace) -> int: + """Fit engines and print feature explanations + code as JSON.""" + input_path = Path(args.input) + if not input_path.exists(): + raise FileNotFoundError(f"Input file not found: {args.input}") + + in_fmt = _detect_format(input_path, args.input_format) + df = _read_table(input_path, in_fmt) + X, y = _split_xy(df, args.target) + + engineer = _build_engineer(args) + engineer.fit( + X, + y, + task_description=args.task_description or "prediction task", + target_name=args.target, + ) + + explanations = engineer.explain_features() + code = engineer.get_feature_code() + feature_names = engineer.get_feature_names() + + payload = { + "status": "ok", + "input": str(input_path), + "n_features": len(feature_names), + "engines": list(engineer.engines), + "features": [ + { + "name": name, + "explanation": explanations.get(name, ""), + "code": code.get(name, ""), + } + for name in feature_names + ], + } + + # explain always emits JSON to stdout (it's the only sensible format), + # but we still respect ``--json`` for symmetry with other subcommands. 
+ _emit(payload, as_json=True) + return 0 + + +def _build_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser( + prog="featcopilot", + description=( + "FeatCopilot CLI — automated feature engineering from the command line. " + "Designed for scripting and agentic usage; pass --json to any subcommand " + "for machine-readable stdout." + ), + ) + parser.add_argument( + "-V", + "--version", + action="version", + version=f"featcopilot {__version__}", + ) + subparsers = parser.add_subparsers(dest="command", required=True, metavar="COMMAND") + + # ----- info --------------------------------------------------------- + p_info = subparsers.add_parser( + "info", + help="Print version and supported engines/methods.", + description="Print the installed FeatCopilot version and the supported engines, " + "selection methods, leakage guards, and I/O formats.", + ) + p_info.add_argument("--json", action="store_true", help="Emit JSON to stdout.") + p_info.set_defaults(func=_cmd_info) + + # ----- transform ---------------------------------------------------- + p_transform = subparsers.add_parser( + "transform", + help="Run feature engineering on a tabular file.", + description="Read INPUT, run AutoFeatureEngineer, and write engineered features to OUTPUT.", + ) + _add_io_args(p_transform) + _add_engineer_args(p_transform) + p_transform.add_argument( + "--no-selection", + action="store_true", + help="Disable feature selection (skip do-no-harm gate).", + ) + p_transform.add_argument( + "--include-target", + action="store_true", + help="Include the target column in the output file.", + ) + p_transform.add_argument("--json", action="store_true", help="Emit a JSON status line on stdout.") + p_transform.set_defaults(func=_cmd_transform) + + # ----- explain ------------------------------------------------------ + p_explain = subparsers.add_parser( + "explain", + help="Print JSON feature explanations and code for agent consumption.", + description="Fit AutoFeatureEngineer 
on INPUT and emit a JSON document " + "describing each generated feature (name, explanation, code).", + ) + p_explain.add_argument("--input", "-i", required=True, help="Path to input file (CSV / Parquet / JSON).") + p_explain.add_argument("--input-format", choices=SUPPORTED_INPUT_FORMATS, help="Override input format detection.") + p_explain.add_argument("--target", "-t", help="Target column name (required for selection).") + p_explain.add_argument( + "--task-description", + help="Natural-language ML task description (used by the LLM engine).", + ) + _add_engineer_args(p_explain) + p_explain.add_argument("--json", action="store_true", help="(Always JSON — flag accepted for symmetry.)") + p_explain.set_defaults(func=_cmd_explain) + + return parser + + +def _add_io_args(p: argparse.ArgumentParser) -> None: + p.add_argument("--input", "-i", required=True, help="Path to input file (CSV / Parquet / JSON).") + p.add_argument("--output", "-o", required=True, help="Path to output file (CSV / Parquet / JSON).") + p.add_argument("--input-format", choices=SUPPORTED_INPUT_FORMATS, help="Override input format detection.") + p.add_argument("--output-format", choices=SUPPORTED_OUTPUT_FORMATS, help="Override output format detection.") + p.add_argument("--target", "-t", help="Target column name (required for selection).") + p.add_argument( + "--task-description", + help="Natural-language ML task description (used by the LLM engine).", + ) + + +def _add_engineer_args(p: argparse.ArgumentParser) -> None: + """Add ``AutoFeatureEngineer``-related flags to a subparser.""" + p.add_argument( + "--engines", + nargs="+", + choices=sorted(AutoFeatureEngineer.SUPPORTED_ENGINES), + help="Engines to use (default: tabular).", + ) + p.add_argument( + "--selection-methods", + nargs="+", + choices=sorted(AutoFeatureEngineer.SUPPORTED_SELECTION_METHODS), + help="Selection methods (default: mutual_info importance).", + ) + p.add_argument("--max-features", type=int, help="Maximum number of features to 
keep.") + p.add_argument( + "--correlation-threshold", + type=float, + help="Maximum pairwise correlation in redundancy elimination (default: 0.85).", + ) + p.add_argument( + "--leakage-guard", + choices=sorted(AutoFeatureEngineer.SUPPORTED_LEAKAGE_GUARDS), + help="How to handle suspicious column names (default: warn).", + ) + p.add_argument( + "--gate-n-jobs", + type=int, + help="Parallelism for the do-no-harm gate's RF (default: 1; -1 = all cores).", + ) + p.add_argument( + "--config", + help="Path to a JSON config file. CLI flags take precedence over config keys. " + "Use this to pass nested keys such as ``llm_config``.", + ) + p.add_argument("--verbose", action="store_true", default=None, help="Enable verbose logging.") + + +def main(argv: list[str] | None = None) -> int: + """CLI entry point. + + Returns the process exit code; suitable for both the ``console_scripts`` + entry point (``featcopilot``) and ``python -m featcopilot``. + """ + parser = _build_parser() + args = parser.parse_args(argv) + + try: + return args.func(args) + except (FileNotFoundError, ValueError) as exc: + # User-facing input/config errors: print a clean message to stderr + # without a traceback so agents can parse the failure. 
+ sys.stderr.write(f"featcopilot: error: {exc}\n") + return 2 + except KeyboardInterrupt: + sys.stderr.write("featcopilot: interrupted\n") + return 130 + except Exception as exc: # pragma: no cover - defensive backstop + sys.stderr.write(f"featcopilot: unexpected error: {type(exc).__name__}: {exc}\n") + logger.exception("Unhandled CLI exception") + return 1 + + +if __name__ == "__main__": # pragma: no cover + raise SystemExit(main()) diff --git a/pyproject.toml b/pyproject.toml index f904b9c..583f2eb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -86,6 +86,9 @@ Homepage = "https://github.com/thinkall/featcopilot" Documentation = "https://github.com/thinkall/featcopilot#readme" Repository = "https://github.com/thinkall/featcopilot" +[project.scripts] +featcopilot = "featcopilot.cli:main" + [tool.setuptools.packages.find] where = ["."] include = ["featcopilot*"] diff --git a/tests/test_cli.py b/tests/test_cli.py new file mode 100644 index 0000000..07d5aad --- /dev/null +++ b/tests/test_cli.py @@ -0,0 +1,397 @@ +"""Tests for the featcopilot CLI.""" + +from __future__ import annotations + +import io +import json +import sys +from contextlib import redirect_stderr, redirect_stdout +from pathlib import Path + +import numpy as np +import pandas as pd +import pytest + +from featcopilot import __version__ +from featcopilot import cli as fc_cli + + +def _run(argv: list[str]) -> tuple[int, str, str]: + """Invoke ``cli.main(argv)`` and capture exit code, stdout, stderr.""" + out, err = io.StringIO(), io.StringIO() + with redirect_stdout(out), redirect_stderr(err): + rc = fc_cli.main(argv) + return rc, out.getvalue(), err.getvalue() + + +@pytest.fixture +def tabular_csv(tmp_path: Path) -> Path: + """A small classification dataset written to CSV.""" + rng = np.random.default_rng(42) + n = 200 + df = pd.DataFrame( + { + "x1": rng.normal(size=n), + "x2": rng.normal(size=n), + "x3": rng.integers(0, 5, size=n), + "y": rng.integers(0, 2, size=n), + } + ) + path = tmp_path / 
"in.csv" + df.to_csv(path, index=False) + return path + + +# --------------------------------------------------------------------- info + + +def test_info_json_emits_supported_options(): + rc, out, err = _run(["info", "--json"]) + assert rc == 0, err + payload = json.loads(out) + assert payload["version"] == __version__ + assert "tabular" in payload["supported_engines"] + assert "mutual_info" in payload["supported_selection_methods"] + assert "warn" in payload["supported_leakage_guards"] + assert set(payload["supported_input_formats"]) == {"csv", "parquet", "json"} + + +def test_info_text_mode_is_human_readable(): + rc, out, _ = _run(["info"]) + assert rc == 0 + # Not JSON: parsing should fail. + with pytest.raises(json.JSONDecodeError): + json.loads(out) + assert "version" in out + assert __version__ in out + + +def test_top_level_version_flag(capsys): + # ``argparse`` ``--version`` action prints to stdout and SystemExits 0. + with pytest.raises(SystemExit) as exc: + fc_cli.main(["--version"]) + assert exc.value.code == 0 + assert __version__ in capsys.readouterr().out + + +# ----------------------------------------------------------------- transform + + +def test_transform_csv_to_csv(tmp_path: Path, tabular_csv: Path): + out_path = tmp_path / "out.csv" + rc, out, err = _run( + [ + "transform", + "--input", + str(tabular_csv), + "--output", + str(out_path), + "--target", + "y", + "--max-features", + "10", + "--json", + ] + ) + assert rc == 0, err + payload = json.loads(out) + assert payload["status"] == "ok" + assert payload["target"] == "y" + assert payload["engines"] == ["tabular"] + assert payload["selection_applied"] is True + assert payload["n_input_columns"] == 3 # x1, x2, x3 (y is the target) + + # The output file exists and is readable as CSV. 
+ assert out_path.exists() + written = pd.read_csv(out_path) + assert written.shape[0] == 200 + assert "y" not in written.columns # target excluded by default + + +def test_transform_include_target_round_trip(tmp_path: Path, tabular_csv: Path): + out_path = tmp_path / "out.csv" + rc, _, err = _run( + [ + "transform", + "-i", + str(tabular_csv), + "-o", + str(out_path), + "-t", + "y", + "--max-features", + "10", + "--include-target", + ] + ) + assert rc == 0, err + written = pd.read_csv(out_path) + assert "y" in written.columns + + +def test_transform_parquet_round_trip(tmp_path: Path): + pytest.importorskip("pyarrow") + rng = np.random.default_rng(0) + df = pd.DataFrame( + {"a": rng.normal(size=120), "b": rng.normal(size=120), "y": rng.integers(0, 2, size=120)} + ) + in_path = tmp_path / "in.parquet" + out_path = tmp_path / "out.parquet" + df.to_parquet(in_path, index=False) + + rc, out, err = _run( + [ + "transform", + "--input", + str(in_path), + "--output", + str(out_path), + "--target", + "y", + "--max-features", + "8", + "--json", + ] + ) + assert rc == 0, err + payload = json.loads(out) + assert payload["input_format"] == "parquet" + assert payload["output_format"] == "parquet" + pd.read_parquet(out_path) # readable + + +def test_transform_json_round_trip(tmp_path: Path): + rng = np.random.default_rng(0) + df = pd.DataFrame( + {"a": rng.normal(size=80), "b": rng.normal(size=80), "y": rng.integers(0, 2, size=80)} + ) + in_path = tmp_path / "in.json" + out_path = tmp_path / "out.json" + df.to_json(in_path, orient="records") + + rc, _, err = _run( + [ + "transform", + "--input", + str(in_path), + "--output", + str(out_path), + "--target", + "y", + ] + ) + assert rc == 0, err + written = pd.read_json(out_path, orient="records") + assert written.shape[0] == 80 + + +def test_transform_no_selection_skips_selector(tmp_path: Path, tabular_csv: Path): + out_path = tmp_path / "out.csv" + rc, out, err = _run( + [ + "transform", + "-i", + str(tabular_csv), + "-o", + 
str(out_path), + "-t", + "y", + "--no-selection", + "--max-features", + "5", + "--json", + ] + ) + assert rc == 0, err + payload = json.loads(out) + assert payload["selection_applied"] is False + + +def test_transform_config_file_supplies_engineer_kwargs(tmp_path: Path, tabular_csv: Path): + config_path = tmp_path / "cfg.json" + config_path.write_text( + json.dumps( + { + "engines": ["tabular"], + "selection_methods": ["mutual_info"], + "max_features": 7, + "correlation_threshold": 0.9, + "leakage_guard": "off", + } + ) + ) + out_path = tmp_path / "out.csv" + rc, out, err = _run( + [ + "transform", + "--input", + str(tabular_csv), + "--output", + str(out_path), + "--target", + "y", + "--config", + str(config_path), + "--json", + ] + ) + assert rc == 0, err + payload = json.loads(out) + assert payload["selection_methods"] == ["mutual_info"] + assert payload["max_features"] == 7 + + +def test_transform_cli_flags_override_config(tmp_path: Path, tabular_csv: Path): + config_path = tmp_path / "cfg.json" + config_path.write_text(json.dumps({"max_features": 5, "engines": ["tabular"]})) + out_path = tmp_path / "out.csv" + rc, out, _ = _run( + [ + "transform", + "--input", + str(tabular_csv), + "--output", + str(out_path), + "--target", + "y", + "--config", + str(config_path), + "--max-features", + "12", + "--json", + ] + ) + assert rc == 0 + assert json.loads(out)["max_features"] == 12 + + +# -------------------------------------------------------------- error paths + + +def test_transform_missing_input_returns_exit_2(tmp_path: Path): + rc, _, err = _run( + [ + "transform", + "--input", + str(tmp_path / "nope.csv"), + "--output", + str(tmp_path / "out.csv"), + "--target", + "y", + ] + ) + assert rc == 2 + assert "Input file not found" in err + + +def test_transform_unknown_target_returns_exit_2(tmp_path: Path, tabular_csv: Path): + rc, _, err = _run( + [ + "transform", + "--input", + str(tabular_csv), + "--output", + str(tmp_path / "out.csv"), + "--target", + 
"does_not_exist", + ] + ) + assert rc == 2 + assert "does_not_exist" in err + + +def test_transform_unknown_extension_without_override(tmp_path: Path, tabular_csv: Path): + out_path = tmp_path / "out.weird" + rc, _, err = _run( + [ + "transform", + "--input", + str(tabular_csv), + "--output", + str(out_path), + "--target", + "y", + ] + ) + assert rc == 2 + assert "infer format" in err.lower() + + +def test_transform_format_override_accepted(tmp_path: Path, tabular_csv: Path): + out_path = tmp_path / "out.weird" + rc, _, err = _run( + [ + "transform", + "--input", + str(tabular_csv), + "--output", + str(out_path), + "--target", + "y", + "--output-format", + "csv", + ] + ) + assert rc == 0, err + assert out_path.exists() + + +def test_invalid_config_file_returns_exit_2(tmp_path: Path, tabular_csv: Path): + bad = tmp_path / "bad.json" + bad.write_text("[1, 2, 3]") # JSON, but not an object + rc, _, err = _run( + [ + "transform", + "--input", + str(tabular_csv), + "--output", + str(tmp_path / "o.csv"), + "--target", + "y", + "--config", + str(bad), + ] + ) + assert rc == 2 + assert "JSON object" in err + + +def test_no_subcommand_exits_nonzero(): + # argparse SystemExits with code 2 when ``required=True`` subparser is missing. + with pytest.raises(SystemExit) as exc: + fc_cli.main([]) + assert exc.value.code == 2 + + +# ------------------------------------------------------------------ explain + + +def test_explain_emits_json_payload(tmp_path: Path, tabular_csv: Path): + rc, out, err = _run( + [ + "explain", + "--input", + str(tabular_csv), + "--target", + "y", + ] + ) + assert rc == 0, err + payload = json.loads(out) + assert payload["status"] == "ok" + assert payload["engines"] == ["tabular"] + assert isinstance(payload["features"], list) + # Each feature entry is a dict with the expected keys. 
+ if payload["features"]: + entry = payload["features"][0] + assert {"name", "explanation", "code"} <= set(entry.keys()) + + +# ------------------------------------------------------------ python -m entry + + +def test_dunder_main_module_runs(monkeypatch, capsys): + """``python -m featcopilot info --json`` is exercised via the CLI entry.""" + monkeypatch.setattr(sys, "argv", ["featcopilot", "info", "--json"]) + rc = fc_cli.main(["info", "--json"]) + assert rc == 0 From b9995551940eb7f4ff7c8e060f9f2ba8e222717a Mon Sep 17 00:00:00 2001 From: Li Jiang Date: Sun, 3 May 2026 14:38:55 +0800 Subject: [PATCH 02/30] fix(cli): address round-1 review feedback Addresses all five review comments from Copilot and Codex on PR #5: * explain now actually returns generated features (Copilot review #1, Codex P1). Built-in engines (e.g. tabular) populate `_feature_names` during `transform()`, not `fit()`. `_cmd_explain` now calls `fit_transform(..., apply_selection=False)` so the JSON payload contains the full `{name, explanation, code}` records the subcommand advertises. Test asserts `n_features > 0` for tabular. * main(argv) -> int contract honored on parse errors (Copilot review #2). `argparse.parse_args` raises `SystemExit` for usage errors, `--help` and `--version`. `main` now traps those and returns the exit code so programmatic and agent callers always get an int. Tests cover `--version` (rc=0), `--help` (rc=0), no-subcommand (rc=2) and unknown-flag (rc=2). * Real subprocess test for python -m featcopilot (Copilot review #3). `test_dunder_main_subprocess_invocation` and `test_dunder_main_subprocess_version_flag` spawn a real `python -m featcopilot ...` subprocess and assert stdout JSON, so a regression in `__main__.py` actually breaks the suite. * Parquet `ImportError` -> clean exit 2 (Codex P2). 
`_read_table`/`_write_table` now wrap parquet calls and convert `ImportError` into a `ValueError` with a friendly install hint; the top-level handler routes that to the deterministic `exit 2` user-error path instead of the generic `exit 1` backstop. `test_transform_parquet_missing_engine_returns_exit_2` exercises this via `monkeypatch` of `DataFrame.to_parquet`. * Pre-commit black: re-applied formatting from the pinned `black 24.1.1` hook (joined two long string raises) so the CI pre-commit job passes. Tests: 23 (+5 new) in tests/test_cli.py, 796 passed full suite. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- featcopilot/cli.py | 76 ++++++++++++++++++++++----- tests/test_cli.py | 124 +++++++++++++++++++++++++++++++++++++-------- 2 files changed, 166 insertions(+), 34 deletions(-) diff --git a/featcopilot/cli.py b/featcopilot/cli.py index 89fcdf4..33b85e5 100644 --- a/featcopilot/cli.py +++ b/featcopilot/cli.py @@ -70,9 +70,7 @@ def _detect_format(path: Path, override: str | None) -> str: if override is not None: fmt = override.lower() if fmt not in SUPPORTED_INPUT_FORMATS: - raise ValueError( - f"Unsupported format {override!r}; expected one of {SUPPORTED_INPUT_FORMATS}" - ) + raise ValueError(f"Unsupported format {override!r}; expected one of {SUPPORTED_INPUT_FORMATS}") return fmt suffix = path.suffix.lower().lstrip(".") @@ -87,13 +85,25 @@ def _detect_format(path: Path, override: str | None) -> str: def _read_table(path: Path, fmt: str): - """Read a tabular file into a pandas DataFrame.""" + """Read a tabular file into a pandas DataFrame. + + For optional ``parquet`` engines (``pyarrow``/``fastparquet``), a missing + dependency is converted into a :class:`ValueError` so the CLI's top-level + error handler can route it to the deterministic ``exit 2`` user-error path + rather than the generic ``exit 1`` backstop. 
+ """ import pandas as pd if fmt == "csv": return pd.read_csv(path) if fmt == "parquet": - return pd.read_parquet(path) + try: + return pd.read_parquet(path) + except ImportError as exc: + raise ValueError( + f"Reading parquet requires a parquet engine (pyarrow or fastparquet); " + f"install one of them, or convert the input to CSV/JSON. Original error: {exc}" + ) from exc if fmt == "json": # ``orient='records'`` is the agent-friendly default; fall back to # pandas' auto-detection when the file isn't a records list. @@ -105,13 +115,23 @@ def _read_table(path: Path, fmt: str): def _write_table(df, path: Path, fmt: str) -> None: - """Write a pandas DataFrame to ``path`` in ``fmt``.""" + """Write a pandas DataFrame to ``path`` in ``fmt``. + + Parquet ``ImportError`` is normalized to :class:`ValueError` so the CLI + surfaces a clean dependency message via the standard ``exit 2`` path. + """ path.parent.mkdir(parents=True, exist_ok=True) if fmt == "csv": df.to_csv(path, index=False) elif fmt == "parquet": - df.to_parquet(path, index=False) + try: + df.to_parquet(path, index=False) + except ImportError as exc: + raise ValueError( + f"Writing parquet requires a parquet engine (pyarrow or fastparquet); " + f"install one of them, or pick CSV/JSON via --output-format. 
Original error: {exc}" + ) from exc elif fmt == "json": df.to_json(path, orient="records", indent=2) else: @@ -128,9 +148,7 @@ def _load_config(config_path: str | None) -> dict[str, Any]: with path.open("r", encoding="utf-8") as fh: data = json.load(fh) if not isinstance(data, dict): - raise ValueError( - f"Config file {config_path!r} must contain a JSON object at the top level" - ) + raise ValueError(f"Config file {config_path!r} must contain a JSON object at the top level") return data @@ -262,7 +280,17 @@ def _cmd_transform(args: argparse.Namespace) -> int: def _cmd_explain(args: argparse.Namespace) -> int: - """Fit engines and print feature explanations + code as JSON.""" + """Fit + transform engines and print feature explanations + code as JSON. + + The built-in engines populate their internal feature-name registry during + :meth:`transform`, not :meth:`fit` (planning happens in ``fit`` but feature + objects are materialized in ``transform``). We therefore call + :meth:`AutoFeatureEngineer.fit_transform` so ``get_feature_names()``, + :meth:`explain_features` and :meth:`get_feature_code` all return the + actual generated features. Selection is intentionally skipped here so the + payload describes every candidate feature the engines produced, not just + the post-selection survivors. + """ input_path = Path(args.input) if not input_path.exists(): raise FileNotFoundError(f"Input file not found: {args.input}") @@ -272,11 +300,12 @@ def _cmd_explain(args: argparse.Namespace) -> int: X, y = _split_xy(df, args.target) engineer = _build_engineer(args) - engineer.fit( + engineer.fit_transform( X, y, task_description=args.task_description or "prediction task", target_name=args.target, + apply_selection=False, ) explanations = engineer.explain_features() @@ -427,10 +456,29 @@ def main(argv: list[str] | None = None) -> int: """CLI entry point. 
Returns the process exit code; suitable for both the ``console_scripts`` - entry point (``featcopilot``) and ``python -m featcopilot``. + entry point (``featcopilot``) and ``python -m featcopilot``. Argparse + usage errors (missing subcommand, unknown flag) and the cooperative + ``--help`` / ``--version`` actions all normally raise :class:`SystemExit`; + we trap those here and return their exit code so that programmatic + callers (and agent harnesses) get a consistent integer-returning API. """ parser = _build_parser() - args = parser.parse_args(argv) + + try: + args = parser.parse_args(argv) + except SystemExit as exc: + # argparse uses SystemExit(0) for ``--help`` / ``--version`` and + # SystemExit(2) for usage errors (also writing to stderr). We let the + # output through but convert the exit into a return value so + # ``main(argv) -> int`` is honored even on parse-time failures. + code = exc.code + if code is None: + return 0 + if isinstance(code, int): + return code + # Non-int code (e.g. error string): print to stderr, return 2. + sys.stderr.write(f"{code}\n") + return 2 try: return args.func(args) diff --git a/tests/test_cli.py b/tests/test_cli.py index 07d5aad..3c25a65 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -67,10 +67,10 @@ def test_info_text_mode_is_human_readable(): def test_top_level_version_flag(capsys): - # ``argparse`` ``--version`` action prints to stdout and SystemExits 0. - with pytest.raises(SystemExit) as exc: - fc_cli.main(["--version"]) - assert exc.value.code == 0 + # ``--version`` (argparse action) prints to stdout; main() now traps the + # SystemExit and returns the code so the API contract is consistent. 
+ rc = fc_cli.main(["--version"]) + assert rc == 0 assert __version__ in capsys.readouterr().out @@ -132,9 +132,7 @@ def test_transform_include_target_round_trip(tmp_path: Path, tabular_csv: Path): def test_transform_parquet_round_trip(tmp_path: Path): pytest.importorskip("pyarrow") rng = np.random.default_rng(0) - df = pd.DataFrame( - {"a": rng.normal(size=120), "b": rng.normal(size=120), "y": rng.integers(0, 2, size=120)} - ) + df = pd.DataFrame({"a": rng.normal(size=120), "b": rng.normal(size=120), "y": rng.integers(0, 2, size=120)}) in_path = tmp_path / "in.parquet" out_path = tmp_path / "out.parquet" df.to_parquet(in_path, index=False) @@ -162,9 +160,7 @@ def test_transform_parquet_round_trip(tmp_path: Path): def test_transform_json_round_trip(tmp_path: Path): rng = np.random.default_rng(0) - df = pd.DataFrame( - {"a": rng.normal(size=80), "b": rng.normal(size=80), "y": rng.integers(0, 2, size=80)} - ) + df = pd.DataFrame({"a": rng.normal(size=80), "b": rng.normal(size=80), "y": rng.integers(0, 2, size=80)}) in_path = tmp_path / "in.json" out_path = tmp_path / "out.json" df.to_json(in_path, orient="records") @@ -356,11 +352,24 @@ def test_invalid_config_file_returns_exit_2(tmp_path: Path, tabular_csv: Path): assert "JSON object" in err -def test_no_subcommand_exits_nonzero(): - # argparse SystemExits with code 2 when ``required=True`` subparser is missing. - with pytest.raises(SystemExit) as exc: - fc_cli.main([]) - assert exc.value.code == 2 +def test_no_subcommand_exits_nonzero(capsys): + # main() now returns the argparse-reported exit code (2 for usage error) + # rather than letting SystemExit propagate, so programmatic callers get + # an integer back even on parse-time failures. 
+ rc = fc_cli.main([]) + assert rc == 2 + + +def test_unknown_flag_returns_exit_2(capsys): + rc = fc_cli.main(["transform", "--no-such-flag"]) + assert rc == 2 + + +def test_help_flag_returns_zero(capsys): + rc = fc_cli.main(["--help"]) + assert rc == 0 + captured = capsys.readouterr() + assert "featcopilot" in captured.out # ------------------------------------------------------------------ explain @@ -381,17 +390,92 @@ def test_explain_emits_json_payload(tmp_path: Path, tabular_csv: Path): assert payload["status"] == "ok" assert payload["engines"] == ["tabular"] assert isinstance(payload["features"], list) + # The tabular engine actually generates derived features, and the explain + # subcommand must materialize them by running the full fit_transform + # pipeline (engines populate _feature_names during transform()). + assert payload["n_features"] > 0 + assert len(payload["features"]) == payload["n_features"] # Each feature entry is a dict with the expected keys. - if payload["features"]: - entry = payload["features"][0] - assert {"name", "explanation", "code"} <= set(entry.keys()) + entry = payload["features"][0] + assert {"name", "explanation", "code"} <= set(entry.keys()) + assert entry["name"] -# ------------------------------------------------------------ python -m entry +# --------------------------------------------------------------- parquet path + + +def test_transform_parquet_missing_engine_returns_exit_2(tmp_path, tabular_csv, monkeypatch): + """When pyarrow/fastparquet is missing, the CLI should surface a clean + user-facing dependency error (exit 2) rather than the generic exit 1 + backstop. 
+ """ + import pandas as pd + + def _raise_import_error(self, *args, **kwargs): # noqa: ANN001 + raise ImportError("Missing optional dependency 'pyarrow' (simulated)") + + monkeypatch.setattr(pd.DataFrame, "to_parquet", _raise_import_error, raising=True) + + out_path = tmp_path / "out.parquet" + rc, _, err = _run( + [ + "transform", + "--input", + str(tabular_csv), + "--output", + str(out_path), + "--target", + "y", + "--max-features", + "5", + ] + ) + assert rc == 2 + assert "parquet engine" in err.lower() + + +# --------------------------------------------------------------- python -m def test_dunder_main_module_runs(monkeypatch, capsys): - """``python -m featcopilot info --json`` is exercised via the CLI entry.""" + """``cli.main`` is invoked via the same code path as ``python -m featcopilot``.""" monkeypatch.setattr(sys, "argv", ["featcopilot", "info", "--json"]) rc = fc_cli.main(["info", "--json"]) assert rc == 0 + + +def test_dunder_main_subprocess_invocation(): + """``python -m featcopilot info --json`` must succeed in a real subprocess. + + Exercises ``featcopilot/__main__.py`` end-to-end so a regression in + module-form invocation (e.g. a broken import path) actually breaks the + test, not just the unit-level call to ``cli.main``. 
+ """ + import subprocess + + result = subprocess.run( + [sys.executable, "-m", "featcopilot", "info", "--json"], + capture_output=True, + text=True, + timeout=60, + check=False, + ) + assert result.returncode == 0, result.stderr + payload = json.loads(result.stdout) + assert payload["version"] == __version__ + assert "tabular" in payload["supported_engines"] + + +def test_dunder_main_subprocess_version_flag(): + """``python -m featcopilot --version`` must print and exit 0.""" + import subprocess + + result = subprocess.run( + [sys.executable, "-m", "featcopilot", "--version"], + capture_output=True, + text=True, + timeout=30, + check=False, + ) + assert result.returncode == 0, result.stderr + assert __version__ in result.stdout From 72a07571e62abb23f8baa103e1c38cde6a131470 Mon Sep 17 00:00:00 2001 From: Li Jiang Date: Sun, 3 May 2026 15:10:46 +0800 Subject: [PATCH 03/30] chore: re-trigger automated PR review No code changes. Triggers another round of automated review from the Copilot/Codex review bots so the full PR (not just the latest commit) can be re-evaluated against the current head after round-1 fixes in b999555 (which addressed all 5 review comments and resolved all 5 review threads). Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> From 92fcd090fe170858df8070eb85f3e4c43592e59e Mon Sep 17 00:00:00 2001 From: Li Jiang Date: Sun, 3 May 2026 18:06:17 +0800 Subject: [PATCH 04/30] fix(cli): address round-2 review feedback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses all five new review comments from copilot-pull-request-reviewer on PR #5: * info now reflects parquet engine availability at runtime. `_parquet_engine_available()` probes `pyarrow` / `fastparquet`; parquet is included in `supported_input_formats` / `supported_output_formats` only when an engine is importable. Adds a new `"parquet_available"` boolean in the payload so agents can branch on it deterministically. 
Tests cover both the engine-present and engine-missing cases via `monkeypatch`. * _build_engineer no longer mangles misconfigured configs. Removed the `list(...)` coercion of `engines` / `selection_methods`: a misconfigured `"engines": "tabular"` (string) used to be silently expanded into `['t','a','b','u','l','a','r']` and bubble up as a confusing "unknown engines" error. It now flows straight into `AutoFeatureEngineer.__init__`'s precise type validation ("engines must be a list or tuple of strings"), surfaced via the standard exit-2 user-error path. * Empty config lists now produce the documented exit 2. Replaced `pick(...) or [default]` with a tri-state `pick` that honors explicit empty values from the config: `"engines": []` / `"selection_methods": []` now propagate into the transformer where `_validate_configuration` raises "must contain at least one ..." and the CLI returns exit 2 — instead of being silently rewritten into the defaults. * README parquet caveat. Switched the example to CSV/CSV (no surprise dependency for base installs) and added an explicit note that parquet I/O requires installing `pyarrow` or `fastparquet`, with a pointer to `info`'s `parquet_available` flag for runtime detection. * Console script test. New `test_console_script_subprocess_invocation` and `test_console_script_version_flag` use `shutil.which` to locate the installed `featcopilot` script and run it through `subprocess`; a typo or packaging regression in `[project.scripts]` now actually breaks the suite. Tests skip cleanly when the script is not on PATH. Tests: 30 (+7 new) in tests/test_cli.py, 803 passed full suite. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- README.md | 11 +++- featcopilot/cli.py | 75 ++++++++++++++++++---- tests/test_cli.py | 156 ++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 227 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index 157215b..280db2d 100644 --- a/README.md +++ b/README.md @@ -121,9 +121,9 @@ non-zero exit code so agents can parse failures deterministically. # Discover capabilities (engines, selection methods, I/O formats) featcopilot info --json -# Run feature engineering on a CSV / Parquet / JSON file +# Run feature engineering on a CSV / JSON file featcopilot transform \ - --input data.csv --target label --output features.parquet \ + --input data.csv --target label --output features.csv \ --engines tabular --max-features 50 --json # Inspect generated features (name, explanation, code) as JSON for an LLM @@ -136,6 +136,13 @@ python -m featcopilot info --json Pass `--config config.json` to provide nested keys such as `llm_config`; explicit CLI flags override values from the config file. +> **Parquet I/O.** FeatCopilot's base install does not pin a parquet engine. +> To use `--input file.parquet` / `--output file.parquet` (or the +> `parquet` value in `--input-format` / `--output-format`), install one of +> `pyarrow` or `fastparquet`. `featcopilot info --json` reports +> `"parquet_available": true` only when an engine is importable in the +> current environment. + ## Engines ### Tabular Engine diff --git a/featcopilot/cli.py b/featcopilot/cli.py index 33b85e5..8c114ef 100644 --- a/featcopilot/cli.py +++ b/featcopilot/cli.py @@ -52,6 +52,29 @@ SUPPORTED_OUTPUT_FORMATS = ("csv", "parquet", "json") +def _parquet_engine_available() -> bool: + """Return ``True`` if a parquet engine (pyarrow or fastparquet) can be imported. + + FeatCopilot's base install pins neither ``pyarrow`` nor ``fastparquet``; + parquet I/O is therefore opportunistic. 
``info`` uses this probe so the + machine-readable capability output reflects what will actually work in + the current environment, rather than always advertising parquet. + """ + try: + import pyarrow # noqa: F401 + + return True + except ImportError: + pass + try: + import fastparquet # noqa: F401 + + return True + except ImportError: + pass + return False + + def _detect_format(path: Path, override: str | None) -> str: """Return one of ``SUPPORTED_INPUT_FORMATS`` for ``path``. @@ -167,20 +190,30 @@ def _emit(payload: dict[str, Any], *, as_json: bool, stream=None) -> None: def _build_engineer(args: argparse.Namespace) -> AutoFeatureEngineer: """Construct an :class:`AutoFeatureEngineer` from parsed CLI args. - Precedence: explicit CLI flags override values from ``--config``. + Precedence: explicit CLI flags override values from ``--config``; + explicit config values (including empty lists) override the defaults. + Empty / non-list values are propagated unchanged so that + :meth:`AutoFeatureEngineer._validate_configuration` produces its + canonical (and deterministic) error path — the CLI's wrapper must not + silently rewrite a misconfigured config into something that looks + different from what the user wrote. """ config = _load_config(args.config) def pick(flag_value, config_key, default): + # Explicit CLI flag wins. Otherwise honor an explicit config entry + # — even a falsy one such as ``[]`` — so AutoFeatureEngineer can + # raise its own clear "must contain at least one" error rather than + # the CLI silently swapping in defaults. Only fall back to the + # default when the key is *absent* from the config. 
if flag_value is not None: return flag_value - return config.get(config_key, default) + if config_key in config: + return config[config_key] + return default - engines = pick(args.engines, "engines", None) or ["tabular"] - selection_methods = pick(args.selection_methods, "selection_methods", None) or [ - "mutual_info", - "importance", - ] + engines = pick(args.engines, "engines", ["tabular"]) + selection_methods = pick(args.selection_methods, "selection_methods", ["mutual_info", "importance"]) max_features = pick(args.max_features, "max_features", None) correlation_threshold = pick(args.correlation_threshold, "correlation_threshold", 0.85) leakage_guard = pick(args.leakage_guard, "leakage_guard", "warn") @@ -188,10 +221,16 @@ def pick(flag_value, config_key, default): llm_config = config.get("llm_config", {}) or {} verbose = bool(pick(args.verbose, "verbose", False)) + # Pass ``engines`` / ``selection_methods`` through *unchanged* (no + # ``list(...)`` wrapping). Coercion would convert a misconfigured + # JSON string like ``"tabular"`` into ``['t','a','b','u','l','a','r']``, + # turning a clear type error into a confusing "Unknown engines" path. + # AutoFeatureEngineer.__init__ rejects non-list/tuple inputs with a + # precise message — let it. return AutoFeatureEngineer( - engines=list(engines), + engines=engines, max_features=max_features, - selection_methods=list(selection_methods), + selection_methods=selection_methods, correlation_threshold=correlation_threshold, llm_config=llm_config, verbose=verbose, @@ -215,14 +254,26 @@ def _split_xy(df, target: str | None): def _cmd_info(args: argparse.Namespace) -> int: - """Print version + supported engines/methods.""" + """Print version + supported engines/methods. 
+ + Parquet appears in ``supported_input_formats`` / ``supported_output_formats`` + only when an actual parquet engine (``pyarrow`` or ``fastparquet``) can + be imported in the current environment — otherwise the ``info`` output + would advertise a format that immediately fails on use, which is + misleading for the agentic capability-discovery the CLI is designed to + support. + """ + parquet_ok = _parquet_engine_available() + input_formats = [f for f in SUPPORTED_INPUT_FORMATS if f != "parquet" or parquet_ok] + output_formats = [f for f in SUPPORTED_OUTPUT_FORMATS if f != "parquet" or parquet_ok] payload = { "version": __version__, "supported_engines": sorted(AutoFeatureEngineer.SUPPORTED_ENGINES), "supported_selection_methods": sorted(AutoFeatureEngineer.SUPPORTED_SELECTION_METHODS), "supported_leakage_guards": sorted(AutoFeatureEngineer.SUPPORTED_LEAKAGE_GUARDS), - "supported_input_formats": list(SUPPORTED_INPUT_FORMATS), - "supported_output_formats": list(SUPPORTED_OUTPUT_FORMATS), + "supported_input_formats": input_formats, + "supported_output_formats": output_formats, + "parquet_available": parquet_ok, } _emit(payload, as_json=args.json) return 0 diff --git a/tests/test_cli.py b/tests/test_cli.py index 3c25a65..bfda149 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -53,7 +53,37 @@ def test_info_json_emits_supported_options(): assert "tabular" in payload["supported_engines"] assert "mutual_info" in payload["supported_selection_methods"] assert "warn" in payload["supported_leakage_guards"] - assert set(payload["supported_input_formats"]) == {"csv", "parquet", "json"} + # CSV/JSON are always supported; parquet is gated on engine availability. 
+ assert {"csv", "json"} <= set(payload["supported_input_formats"]) + assert {"csv", "json"} <= set(payload["supported_output_formats"]) + assert isinstance(payload["parquet_available"], bool) + if payload["parquet_available"]: + assert "parquet" in payload["supported_input_formats"] + assert "parquet" in payload["supported_output_formats"] + else: + assert "parquet" not in payload["supported_input_formats"] + assert "parquet" not in payload["supported_output_formats"] + + +def test_info_excludes_parquet_when_engine_missing(monkeypatch): + """When no parquet engine can be imported, ``info`` must not advertise it.""" + monkeypatch.setattr(fc_cli, "_parquet_engine_available", lambda: False) + rc, out, _ = _run(["info", "--json"]) + assert rc == 0 + payload = json.loads(out) + assert payload["parquet_available"] is False + assert "parquet" not in payload["supported_input_formats"] + assert "parquet" not in payload["supported_output_formats"] + + +def test_info_includes_parquet_when_engine_present(monkeypatch): + monkeypatch.setattr(fc_cli, "_parquet_engine_available", lambda: True) + rc, out, _ = _run(["info", "--json"]) + assert rc == 0 + payload = json.loads(out) + assert payload["parquet_available"] is True + assert "parquet" in payload["supported_input_formats"] + assert "parquet" in payload["supported_output_formats"] def test_info_text_mode_is_human_readable(): @@ -261,6 +291,77 @@ def test_transform_cli_flags_override_config(tmp_path: Path, tabular_csv: Path): assert json.loads(out)["max_features"] == 12 +# ----------------------- _build_engineer config validation + + +def test_string_engines_in_config_returns_clean_exit_2(tmp_path: Path, tabular_csv: Path): + """A misconfigured ``"engines": "tabular"`` (string instead of list) must + surface ``AutoFeatureEngineer``'s precise type-validation error via the + standard exit-2 path — *not* be silently coerced into a per-character list. 
+ """ + config_path = tmp_path / "cfg.json" + config_path.write_text(json.dumps({"engines": "tabular"})) + rc, _, err = _run( + [ + "transform", + "--input", + str(tabular_csv), + "--output", + str(tmp_path / "out.csv"), + "--target", + "y", + "--config", + str(config_path), + ] + ) + assert rc == 2 + assert "engines must be a list or tuple" in err + + +def test_empty_engines_list_in_config_returns_clean_exit_2(tmp_path: Path, tabular_csv: Path): + """An explicit empty ``engines`` list in the config must propagate to the + transformer's validation so the user sees the documented error, instead + of being silently rewritten into the defaults. + """ + config_path = tmp_path / "cfg.json" + config_path.write_text(json.dumps({"engines": []})) + rc, _, err = _run( + [ + "transform", + "--input", + str(tabular_csv), + "--output", + str(tmp_path / "out.csv"), + "--target", + "y", + "--config", + str(config_path), + ] + ) + assert rc == 2 + assert "at least one engine" in err.lower() or "empty sequence" in err.lower() + + +def test_empty_selection_methods_list_in_config_returns_clean_exit_2(tmp_path: Path, tabular_csv: Path): + config_path = tmp_path / "cfg.json" + config_path.write_text(json.dumps({"selection_methods": []})) + rc, _, err = _run( + [ + "transform", + "--input", + str(tabular_csv), + "--output", + str(tmp_path / "out.csv"), + "--target", + "y", + "--config", + str(config_path), + ] + ) + assert rc == 2 + assert "at least one method" in err.lower() or "empty sequence" in err.lower() + + # -------------------------------------------------------------- error paths @@ -479,3 +580,56 @@ def test_dunder_main_subprocess_version_flag(): ) assert result.returncode == 0, result.stderr assert __version__ in result.stdout + + +# ------------------------------------------------------- console script + + +def test_console_script_subprocess_invocation(): + """The installed ``featcopilot`` console script must be on PATH and runnable. 
+ + Exercises the ``[project.scripts] featcopilot = "featcopilot.cli:main"`` + entry point end-to-end so a typo or packaging regression in + ``pyproject.toml`` would actually break the suite. Skipped when the + script isn't on ``PATH`` (e.g. running tests without ``pip install``). + """ + import shutil + import subprocess + + script = shutil.which("featcopilot") + if script is None: + pytest.skip( + "featcopilot console script not on PATH (install the package " + "with `pip install -e .` to exercise the entry point)" + ) + + result = subprocess.run( + [script, "info", "--json"], + capture_output=True, + text=True, + timeout=60, + check=False, + ) + assert result.returncode == 0, result.stderr + payload = json.loads(result.stdout) + assert payload["version"] == __version__ + assert "tabular" in payload["supported_engines"] + + +def test_console_script_version_flag(): + import shutil + import subprocess + + script = shutil.which("featcopilot") + if script is None: + pytest.skip("featcopilot console script not on PATH") + + result = subprocess.run( + [script, "--version"], + capture_output=True, + text=True, + timeout=30, + check=False, + ) + assert result.returncode == 0, result.stderr + assert __version__ in result.stdout From 0072e5be39531b1d2fd20a6d6a63664aa827fa94 Mon Sep 17 00:00:00 2001 From: Li Jiang Date: Sun, 3 May 2026 18:34:30 +0800 Subject: [PATCH 05/30] fix(cli): address round-3 review feedback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses both new comments from chatgpt-codex-connector on PR #5 (commit 92fcd09): * Validate `llm_config` is a JSON object (Codex P2). `_build_engineer` now type-checks the `llm_config` value from `--config` before forwarding it. A non-mapping (e.g. 
a string) previously made it all the way into `AutoFeatureEngineer._create_engine` where `self.llm_config.get(...)` raised `AttributeError` — bypassing the structured exit-2 user-error path and surfacing as exit 1 `unexpected error`. The new check raises a precise `ValueError` so the CLI returns exit 2 with a clean stderr message. Test `test_non_dict_llm_config_returns_exit_2` covers this. * Normalize `--config` user-input mistakes to exit 2 (Codex P3). Pointing `--config` at a directory used to raise `IsADirectoryError` from `path.open(...)` and fall into the generic exit-1 backstop. `_load_config` now: - rejects directories explicitly, - converts `json.JSONDecodeError` into a clean "is not valid JSON" message, - converts other read errors (`OSError`) into a clean "could not be read" message. All paths return exit 2 so automation can handle config errors consistently. Tests `test_directory_as_config_returns_exit_2` and `test_malformed_json_config_returns_exit_2` cover the new branches. Tests: 33 (+3 new) in tests/test_cli.py, 806 passed full suite. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- featcopilot/cli.py | 44 +++++++++++++++++++++++++++--- tests/test_cli.py | 68 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 108 insertions(+), 4 deletions(-) diff --git a/featcopilot/cli.py b/featcopilot/cli.py index 8c114ef..7b13e1f 100644 --- a/featcopilot/cli.py +++ b/featcopilot/cli.py @@ -162,14 +162,32 @@ def _write_table(df, path: Path, fmt: str) -> None: def _load_config(config_path: str | None) -> dict[str, Any]: - """Load a JSON config file (or return an empty dict).""" + """Load a JSON config file (or return an empty dict). + + Normalizes user-input mistakes (missing path, directory passed instead + of a file, invalid JSON, non-object root) into :class:`ValueError` / + :class:`FileNotFoundError` so the CLI's top-level error handler can + route them all to the deterministic ``exit 2`` user-error path + (rather than e.g. 
``IsADirectoryError`` falling into the generic + ``exit 1`` "unexpected error" backstop). + """ if config_path is None: return {} path = Path(config_path) if not path.exists(): raise FileNotFoundError(f"Config file not found: {config_path}") - with path.open("r", encoding="utf-8") as fh: - data = json.load(fh) + if path.is_dir(): + raise ValueError(f"--config expects a JSON file, but {config_path!r} is a directory.") + try: + with path.open("r", encoding="utf-8") as fh: + data = json.load(fh) + except json.JSONDecodeError as exc: + raise ValueError(f"Config file {config_path!r} is not valid JSON: {exc}") from exc + except OSError as exc: + # Catch-all for unreadable files (permission denied, broken symlink, + # etc.). Surface as a user-facing error rather than the generic + # exit-1 backstop. + raise ValueError(f"Config file {config_path!r} could not be read: {exc}") from exc if not isinstance(data, dict): raise ValueError(f"Config file {config_path!r} must contain a JSON object at the top level") return data @@ -218,7 +236,25 @@ def pick(flag_value, config_key, default): correlation_threshold = pick(args.correlation_threshold, "correlation_threshold", 0.85) leakage_guard = pick(args.leakage_guard, "leakage_guard", "warn") gate_n_jobs = pick(args.gate_n_jobs, "gate_n_jobs", 1) - llm_config = config.get("llm_config", {}) or {} + + # Validate ``llm_config`` is a JSON object (i.e. a Python dict) before + # forwarding it. Without this check, a misconfigured non-dict value + # would only fail at engine-construction time inside + # ``AutoFeatureEngineer._create_engine`` via ``self.llm_config.get(...)``, + # raising an ``AttributeError`` that bypasses the structured exit-2 + # user-error path (the CLI would surface it as exit 1 "unexpected + # error", which is a poor agent contract for a documented config key). 
+ llm_config_raw = config.get("llm_config") + if llm_config_raw is None: + llm_config: dict[str, Any] = {} + elif isinstance(llm_config_raw, dict): + llm_config = llm_config_raw + else: + raise ValueError( + "`llm_config` in the --config file must be a JSON object (mapping); " + f"got {type(llm_config_raw).__name__}={llm_config_raw!r}." + ) + verbose = bool(pick(args.verbose, "verbose", False)) # Pass ``engines`` / ``selection_methods`` through *unchanged* (no diff --git a/tests/test_cli.py b/tests/test_cli.py index bfda149..bfa9e53 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -453,6 +453,74 @@ def test_invalid_config_file_returns_exit_2(tmp_path: Path, tabular_csv: Path): assert "JSON object" in err +def test_directory_as_config_returns_exit_2(tmp_path: Path, tabular_csv: Path): + """Pointing ``--config`` at a directory must surface as exit 2, not the + generic ``exit 1`` backstop (``IsADirectoryError``). + """ + cfg_dir = tmp_path / "not_a_file" + cfg_dir.mkdir() + rc, _, err = _run( + [ + "transform", + "--input", + str(tabular_csv), + "--output", + str(tmp_path / "o.csv"), + "--target", + "y", + "--config", + str(cfg_dir), + ] + ) + assert rc == 2 + assert "directory" in err.lower() + + +def test_malformed_json_config_returns_exit_2(tmp_path: Path, tabular_csv: Path): + bad = tmp_path / "bad.json" + bad.write_text("{not valid json,}") + rc, _, err = _run( + [ + "transform", + "--input", + str(tabular_csv), + "--output", + str(tmp_path / "o.csv"), + "--target", + "y", + "--config", + str(bad), + ] + ) + assert rc == 2 + assert "valid json" in err.lower() + + +def test_non_dict_llm_config_returns_exit_2(tmp_path: Path, tabular_csv: Path): + """A non-mapping ``llm_config`` (e.g. a string) must be rejected at + config-load time with a clean exit 2, not bubble up as an + ``AttributeError`` from ``.get(...)`` deep inside engine construction. 
+ """ + cfg = tmp_path / "cfg.json" + cfg.write_text(json.dumps({"engines": ["tabular"], "llm_config": "gpt-5"})) + rc, _, err = _run( + [ + "transform", + "--input", + str(tabular_csv), + "--output", + str(tmp_path / "o.csv"), + "--target", + "y", + "--config", + str(cfg), + ] + ) + assert rc == 2 + assert "llm_config" in err + assert "JSON object" in err or "mapping" in err.lower() + + def test_no_subcommand_exits_nonzero(capsys): # main() now returns the argparse-reported exit code (2 for usage error) # rather than letting SystemExit propagate, so programmatic callers get From 8240e26b50c58f03ef9c6377daa4ed55d2aa7f0e Mon Sep 17 00:00:00 2001 From: Li Jiang Date: Sun, 3 May 2026 18:56:53 +0800 Subject: [PATCH 06/30] test(cli): bump patch coverage above codecov threshold Round-3 introduced two new error-handling branches whose untested counterparts pulled patch coverage to 88.83% (just below the 88.90% codecov target): * Refactored `_parquet_engine_available` to use `importlib.util.find_spec` so the probe is side-effect-free and trivially mockable. Covered by two new tests: - both engines absent -> False - fastparquet-only path -> True * Added `test_transform_read_parquet_missing_engine_returns_exit_2` to exercise the symmetric read-side `ImportError` -> exit-2 branch in `_read_table` (mirroring the existing write-side test). * Added `test_unreadable_config_returns_exit_2` to cover the `OSError` branch in `_load_config` (permission denied, broken symlink, etc.), via `monkeypatch` of `Path.open`. Tests: 37 (+4 new) in tests/test_cli.py. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- featcopilot/cli.py | 17 +++----- tests/test_cli.py | 101 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 106 insertions(+), 12 deletions(-) diff --git a/featcopilot/cli.py b/featcopilot/cli.py index 7b13e1f..be8180e 100644 --- a/featcopilot/cli.py +++ b/featcopilot/cli.py @@ -59,20 +59,13 @@ def _parquet_engine_available() -> bool: parquet I/O is therefore opportunistic. ``info`` uses this probe so the machine-readable capability output reflects what will actually work in the current environment, rather than always advertising parquet. - """ - try: - import pyarrow # noqa: F401 - return True - except ImportError: - pass - try: - import fastparquet # noqa: F401 + Uses :func:`importlib.util.find_spec` so the probe is side-effect-free + (no actual module import) and easy to mock in tests. + """ + import importlib.util - return True - except ImportError: - pass - return False + return importlib.util.find_spec("pyarrow") is not None or importlib.util.find_spec("fastparquet") is not None def _detect_format(path: Path, override: str | None) -> str: diff --git a/tests/test_cli.py b/tests/test_cli.py index bfa9e53..7e96b05 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -603,6 +603,107 @@ def _raise_import_error(self, *args, **kwargs): # noqa: ANN001 assert "parquet engine" in err.lower() +def test_transform_read_parquet_missing_engine_returns_exit_2(tmp_path, tabular_csv, monkeypatch): + """Symmetric coverage for reading a .parquet input when no engine is installed. + + The CLI must convert the ``ImportError`` from ``pd.read_parquet`` into + the deterministic exit-2 path (with a user-facing install hint), + just like the write path. + """ + import pandas as pd + + # Make sure the input path has a .parquet suffix so format detection picks parquet. 
+ fake_pq = tmp_path / "fake.parquet" + fake_pq.write_bytes(b"") # contents don't matter; we'll intercept read_parquet + + def _raise_import_error(*args, **kwargs): + raise ImportError("Missing optional dependency 'pyarrow' (simulated)") + + monkeypatch.setattr(pd, "read_parquet", _raise_import_error, raising=True) + + rc, _, err = _run( + [ + "transform", + "--input", + str(fake_pq), + "--output", + str(tmp_path / "out.csv"), + "--target", + "y", + ] + ) + assert rc == 2 + assert "parquet engine" in err.lower() + + +def test_parquet_engine_available_returns_false_when_neither_installed(monkeypatch): + """Both probes return ``None`` from ``find_spec`` -> function returns False.""" + import importlib.util + + real_find_spec = importlib.util.find_spec + + def fake_find_spec(name, *args, **kwargs): + if name in ("pyarrow", "fastparquet"): + return None + return real_find_spec(name, *args, **kwargs) + + monkeypatch.setattr(importlib.util, "find_spec", fake_find_spec) + assert fc_cli._parquet_engine_available() is False + + +def test_parquet_engine_available_returns_true_for_fastparquet_only(monkeypatch): + """Even without pyarrow, finding fastparquet must report parquet as available.""" + import importlib.util + + class _FakeSpec: + pass + + real_find_spec = importlib.util.find_spec + + def fake_find_spec(name, *args, **kwargs): + if name == "pyarrow": + return None + if name == "fastparquet": + return _FakeSpec() + return real_find_spec(name, *args, **kwargs) + + monkeypatch.setattr(importlib.util, "find_spec", fake_find_spec) + assert fc_cli._parquet_engine_available() is True + + +def test_unreadable_config_returns_exit_2(tmp_path, tabular_csv, monkeypatch): + """An ``OSError`` while opening the config (permission denied, broken + symlink, etc.) is converted into the deterministic exit-2 path. 
+ """ + cfg = tmp_path / "cfg.json" + cfg.write_text("{}") + + real_open = Path.open + + def _raise_oserror(self, *args, **kwargs): + if self == cfg: + raise PermissionError("simulated read failure") + return real_open(self, *args, **kwargs) + + monkeypatch.setattr(Path, "open", _raise_oserror, raising=True) + + rc, _, err = _run( + [ + "transform", + "--input", + str(tabular_csv), + "--output", + str(tmp_path / "out.csv"), + "--target", + "y", + "--config", + str(cfg), + ] + ) + assert rc == 2 + assert "could not be read" in err.lower() + + # --------------------------------------------------------------- python -m From 88e71ea02f0f29795c1a182495d68b9f5b8d4e89 Mon Sep 17 00:00:00 2001 From: Li Jiang Date: Sun, 3 May 2026 19:27:51 +0800 Subject: [PATCH 07/30] fix(cli): address round-4 review feedback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses all seven new comments from copilot-pull-request-reviewer on PR #5 (commits 0072e5b and 8240e26): * Module docstring example uses CSV (not parquet). Mirrors the README fix from round-2 so the in-file example does not silently rely on an optional dependency. A short note about parquet availability via `info`'s `parquet_available` flag was added. * All input/output OSError paths normalize to exit 2. `_read_table` and `_write_table` now reject directories, catch `OSError` (permission denied, broken symlink, parent-dir creation failure, …), and convert them into `ValueError` with a precise message — surfaced via the standard exit-2 user-error path. The generic exit-1 backstop is reserved for truly unexpected errors. New tests cover `--input` directory, `--output` directory, unwritable output, and unreadable input. * Scalar `--config` fields are type-validated. New `_check_scalar_type` helper rejects malformed config values (e.g. 
`"max_features": "10"`, `"correlation_threshold": "0.9"`, `"gate_n_jobs": "2"`, `"leakage_guard": 42`) at config-load time with a precise `ValueError` -> exit 2. Without this, those values later raised `TypeError` deep inside the estimator and surfaced as exit 1 `unexpected error`. Parametrized test covers six type mismatches including the bool-as-int trap. * `--verbose` is a true tri-state (BooleanOptionalAction). `store_true` made it impossible to override a config-supplied `"verbose": true` back to false from the command line. Switched to `argparse.BooleanOptionalAction` (Python 3.9+; we require 3.10+), giving both `--verbose` and `--no-verbose` so the documented precedence rule (CLI > config > default) is honorable. Tests assert `--no-verbose` overrides config `true` and `--verbose` overrides config `false`. * `explain` no longer advertises selection-only flags. `_add_engineer_args` learned an `include_selection_args=False` mode used by the `explain` subparser. `--selection-methods`, `--max-features`, `--correlation-threshold` are no longer accepted on `explain` (which always disables selection), so a user / agent can no longer silently mis-configure the call. `_build_engineer` uses `getattr(..., None)` to fall through to config / defaults when those attributes aren't present. * `explain --target` help text fixed. Now says the target is used by the leakage-guard and as task context, and explicitly notes that selection is disabled in `explain`. `--target` help on `transform` is also clarified. Tests: 53 (+16 new) in tests/test_cli.py, 826 passed full suite. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- featcopilot/cli.py | 181 ++++++++++++++++++++++++++------- tests/test_cli.py | 247 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 394 insertions(+), 34 deletions(-) diff --git a/featcopilot/cli.py b/featcopilot/cli.py index be8180e..d6cf451 100644 --- a/featcopilot/cli.py +++ b/featcopilot/cli.py @@ -25,13 +25,17 @@ featcopilot info --json featcopilot transform \\ - --input data.csv --target label --output features.parquet \\ + --input data.csv --target label --output features.csv \\ --engines tabular --max-features 50 --json featcopilot explain --input data.csv --target label --json Equivalent module invocation:: python -m featcopilot info --json + +Parquet I/O is supported only when ``pyarrow`` or ``fastparquet`` is +installed (FeatCopilot's base distribution does not pin either); ``info`` +reports the runtime availability via ``parquet_available``. """ from __future__ import annotations @@ -103,15 +107,23 @@ def _detect_format(path: Path, override: str | None) -> str: def _read_table(path: Path, fmt: str): """Read a tabular file into a pandas DataFrame. - For optional ``parquet`` engines (``pyarrow``/``fastparquet``), a missing - dependency is converted into a :class:`ValueError` so the CLI's top-level - error handler can route it to the deterministic ``exit 2`` user-error path - rather than the generic ``exit 1`` backstop. + All user-facing failure modes (missing parquet engine, ``--input`` + pointing at a directory, permission denied, malformed JSON/CSV, + decoding errors) are normalized into :class:`ValueError` so the CLI's + top-level handler routes them to the deterministic ``exit 2`` + user-error path. The generic ``exit 1`` backstop is reserved for + truly unexpected (i.e. CLI-internal) errors. 
""" import pandas as pd + if path.is_dir(): + raise ValueError(f"--input expects a file, but {str(path)!r} is a directory.") + if fmt == "csv": - return pd.read_csv(path) + try: + return pd.read_csv(path) + except (OSError, pd.errors.ParserError, UnicodeDecodeError) as exc: + raise ValueError(f"Failed to read CSV from {str(path)!r}: {exc}") from exc if fmt == "parquet": try: return pd.read_parquet(path) @@ -120,26 +132,45 @@ def _read_table(path: Path, fmt: str): f"Reading parquet requires a parquet engine (pyarrow or fastparquet); " f"install one of them, or convert the input to CSV/JSON. Original error: {exc}" ) from exc + except OSError as exc: + raise ValueError(f"Failed to read parquet from {str(path)!r}: {exc}") from exc if fmt == "json": # ``orient='records'`` is the agent-friendly default; fall back to # pandas' auto-detection when the file isn't a records list. try: return pd.read_json(path, orient="records") except ValueError: - return pd.read_json(path) + try: + return pd.read_json(path) + except ValueError as exc: + raise ValueError(f"Failed to read JSON from {str(path)!r}: {exc}") from exc + except OSError as exc: + raise ValueError(f"Failed to read JSON from {str(path)!r}: {exc}") from exc raise ValueError(f"Unsupported input format: {fmt}") def _write_table(df, path: Path, fmt: str) -> None: """Write a pandas DataFrame to ``path`` in ``fmt``. - Parquet ``ImportError`` is normalized to :class:`ValueError` so the CLI - surfaces a clean dependency message via the standard ``exit 2`` path. + All user-facing failure modes (missing parquet engine, ``--output`` + pointing at a directory, permission denied, parent-directory creation + failures) are normalized into :class:`ValueError` so the CLI surfaces a + clean stderr message via the standard ``exit 2`` path instead of the + generic ``exit 1`` "unexpected error" backstop. 
""" - path.parent.mkdir(parents=True, exist_ok=True) + if path.exists() and path.is_dir(): + raise ValueError(f"--output expects a file, but {str(path)!r} is an existing directory.") + + try: + path.parent.mkdir(parents=True, exist_ok=True) + except OSError as exc: + raise ValueError(f"Cannot create parent directory for {str(path)!r}: {exc}") from exc if fmt == "csv": - df.to_csv(path, index=False) + try: + df.to_csv(path, index=False) + except OSError as exc: + raise ValueError(f"Failed to write CSV to {str(path)!r}: {exc}") from exc elif fmt == "parquet": try: df.to_parquet(path, index=False) @@ -148,8 +179,13 @@ def _write_table(df, path: Path, fmt: str) -> None: f"Writing parquet requires a parquet engine (pyarrow or fastparquet); " f"install one of them, or pick CSV/JSON via --output-format. Original error: {exc}" ) from exc + except OSError as exc: + raise ValueError(f"Failed to write parquet to {str(path)!r}: {exc}") from exc elif fmt == "json": - df.to_json(path, orient="records", indent=2) + try: + df.to_json(path, orient="records", indent=2) + except OSError as exc: + raise ValueError(f"Failed to write JSON to {str(path)!r}: {exc}") from exc else: raise ValueError(f"Unsupported output format: {fmt}") @@ -198,6 +234,36 @@ def _emit(payload: dict[str, Any], *, as_json: bool, stream=None) -> None: stream.flush() +def _check_scalar_type( + name: str, + value: Any, + expected: tuple[type, ...], + *, + allow_none: bool = False, + allow_bool: bool = True, +) -> None: + """Validate a scalar value's type for ``--config``-supplied keys. + + Raises :class:`ValueError` (caught by ``main()`` -> exit 2) when the + value's type does not match. ``bool`` is a subclass of ``int`` in + Python; pass ``allow_bool=False`` to reject ``True``/``False`` for + numeric-only fields like ``max_features`` / ``correlation_threshold``. 
+ """ + if value is None: + if allow_none: + return + raise ValueError(f"`{name}` must not be null in --config") + if not allow_bool and isinstance(value, bool): + raise ValueError( + f"`{name}` in --config must be a {' or '.join(t.__name__ for t in expected)}; " f"got bool={value!r}." + ) + if not isinstance(value, expected): + raise ValueError( + f"`{name}` in --config must be a {' or '.join(t.__name__ for t in expected)}; " + f"got {type(value).__name__}={value!r}." + ) + + def _build_engineer(args: argparse.Namespace) -> AutoFeatureEngineer: """Construct an :class:`AutoFeatureEngineer` from parsed CLI args. @@ -224,12 +290,29 @@ def pick(flag_value, config_key, default): return default engines = pick(args.engines, "engines", ["tabular"]) - selection_methods = pick(args.selection_methods, "selection_methods", ["mutual_info", "importance"]) - max_features = pick(args.max_features, "max_features", None) - correlation_threshold = pick(args.correlation_threshold, "correlation_threshold", 0.85) + # ``explain`` does not expose selection-only flags on argparse, so use + # ``getattr(..., None)`` to safely fall through to config / defaults + # without requiring the attribute to exist on the namespace. + selection_methods = pick( + getattr(args, "selection_methods", None), + "selection_methods", + ["mutual_info", "importance"], + ) + max_features = pick(getattr(args, "max_features", None), "max_features", None) + correlation_threshold = pick(getattr(args, "correlation_threshold", None), "correlation_threshold", 0.85) leakage_guard = pick(args.leakage_guard, "leakage_guard", "warn") gate_n_jobs = pick(args.gate_n_jobs, "gate_n_jobs", 1) + # Type-check scalar config fields here so the CLI surfaces a clean + # exit-2 error instead of a downstream ``TypeError`` (e.g. from + # ``self.max_features <= 0`` when the JSON config supplied a string). + # ``argparse`` already enforces types for the flag side; this only + # guards against malformed ``--config`` JSON. 
+ _check_scalar_type("max_features", max_features, (int,), allow_none=True, allow_bool=False) + _check_scalar_type("correlation_threshold", correlation_threshold, (int, float), allow_bool=False) + _check_scalar_type("gate_n_jobs", gate_n_jobs, (int,), allow_bool=False) + _check_scalar_type("leakage_guard", leakage_guard, (str,)) + # Validate ``llm_config`` is a JSON object (i.e. a Python dict) before # forwarding it. Without this check, a misconfigured non-dict value # would only fail at engine-construction time inside @@ -466,16 +549,23 @@ def _build_parser() -> argparse.ArgumentParser: "explain", help="Print JSON feature explanations and code for agent consumption.", description="Fit AutoFeatureEngineer on INPUT and emit a JSON document " - "describing each generated feature (name, explanation, code).", + "describing each generated feature (name, explanation, code). Selection is " + "intentionally disabled, so all candidate features are reported.", ) p_explain.add_argument("--input", "-i", required=True, help="Path to input file (CSV / Parquet / JSON).") p_explain.add_argument("--input-format", choices=SUPPORTED_INPUT_FORMATS, help="Override input format detection.") - p_explain.add_argument("--target", "-t", help="Target column name (required for selection).") + p_explain.add_argument( + "--target", + "-t", + help="Target column name. Used by leakage-guard checks and as task context " + "for the LLM engine. 
(Selection is disabled in `explain`, so this flag " + "does not gate selector behavior.)", + ) p_explain.add_argument( "--task-description", help="Natural-language ML task description (used by the LLM engine).", ) - _add_engineer_args(p_explain) + _add_engineer_args(p_explain, include_selection_args=False) p_explain.add_argument("--json", action="store_true", help="(Always JSON — flag accepted for symmetry.)") p_explain.set_defaults(func=_cmd_explain) @@ -487,33 +577,47 @@ def _add_io_args(p: argparse.ArgumentParser) -> None: p.add_argument("--output", "-o", required=True, help="Path to output file (CSV / Parquet / JSON).") p.add_argument("--input-format", choices=SUPPORTED_INPUT_FORMATS, help="Override input format detection.") p.add_argument("--output-format", choices=SUPPORTED_OUTPUT_FORMATS, help="Override output format detection.") - p.add_argument("--target", "-t", help="Target column name (required for selection).") + p.add_argument( + "--target", + "-t", + help="Target column name. Required when selection is applied (the default; " + "use --no-selection to skip selection entirely).", + ) p.add_argument( "--task-description", help="Natural-language ML task description (used by the LLM engine).", ) -def _add_engineer_args(p: argparse.ArgumentParser) -> None: - """Add ``AutoFeatureEngineer``-related flags to a subparser.""" +def _add_engineer_args(p: argparse.ArgumentParser, *, include_selection_args: bool = True) -> None: + """Add ``AutoFeatureEngineer``-related flags to a subparser. + + ``include_selection_args=False`` omits selection-only flags + (``--selection-methods``, ``--correlation-threshold``, + ``--max-features``) — these would be silently ignored by the + ``explain`` subcommand, which always runs with selection disabled, + and surfacing them in ``--help`` would be a confusing API for + automation. 
+ """ p.add_argument( "--engines", nargs="+", choices=sorted(AutoFeatureEngineer.SUPPORTED_ENGINES), help="Engines to use (default: tabular).", ) - p.add_argument( - "--selection-methods", - nargs="+", - choices=sorted(AutoFeatureEngineer.SUPPORTED_SELECTION_METHODS), - help="Selection methods (default: mutual_info importance).", - ) - p.add_argument("--max-features", type=int, help="Maximum number of features to keep.") - p.add_argument( - "--correlation-threshold", - type=float, - help="Maximum pairwise correlation in redundancy elimination (default: 0.85).", - ) + if include_selection_args: + p.add_argument( + "--selection-methods", + nargs="+", + choices=sorted(AutoFeatureEngineer.SUPPORTED_SELECTION_METHODS), + help="Selection methods (default: mutual_info importance).", + ) + p.add_argument("--max-features", type=int, help="Maximum number of features to keep.") + p.add_argument( + "--correlation-threshold", + type=float, + help="Maximum pairwise correlation in redundancy elimination (default: 0.85).", + ) p.add_argument( "--leakage-guard", choices=sorted(AutoFeatureEngineer.SUPPORTED_LEAKAGE_GUARDS), @@ -529,7 +633,16 @@ def _add_engineer_args(p: argparse.ArgumentParser) -> None: help="Path to a JSON config file. CLI flags take precedence over config keys. " "Use this to pass nested keys such as ``llm_config``.", ) - p.add_argument("--verbose", action="store_true", default=None, help="Enable verbose logging.") + # ``BooleanOptionalAction`` (Python 3.9+) provides both ``--verbose`` + # and ``--no-verbose`` so a config-supplied ``"verbose": true`` can be + # explicitly turned off from the command line. ``default=None`` so the + # absence of either flag means "fall through to config / default". 
+ p.add_argument( + "--verbose", + action=argparse.BooleanOptionalAction, + default=None, + help="Enable verbose logging (or --no-verbose to override config).", + ) def main(argv: list[str] | None = None) -> int: diff --git a/tests/test_cli.py b/tests/test_cli.py index 7e96b05..eb0ffd1 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -362,6 +362,253 @@ def test_empty_selection_methods_list_in_config_returns_clean_exit_2(tmp_path: P assert "at least one method" in err.lower() or "empty sequence" in err.lower() +# ----------------------- scalar config-type validation + + +@pytest.mark.parametrize( + "key,value,fragment", + [ + ("max_features", "10", "max_features"), + ("max_features", True, "max_features"), # bool rejected for numeric field + ("correlation_threshold", "0.9", "correlation_threshold"), + ("correlation_threshold", True, "correlation_threshold"), + ("gate_n_jobs", "2", "gate_n_jobs"), + ("leakage_guard", 42, "leakage_guard"), + ], +) +def test_scalar_type_mismatch_in_config_returns_exit_2(tmp_path: Path, tabular_csv: Path, key, value, fragment): + """A malformed JSON config (string in a numeric field, etc.) must hit the + deterministic exit-2 user-error path with a precise message — not bubble + up as a downstream ``TypeError`` (exit 1). + """ + config_path = tmp_path / "cfg.json" + config_path.write_text(json.dumps({key: value})) + rc, _, err = _run( + [ + "transform", + "--input", + str(tabular_csv), + "--output", + str(tmp_path / "out.csv"), + "--target", + "y", + "--config", + str(config_path), + ] + ) + assert rc == 2 + assert fragment in err + + +# ----------------------- --verbose / --no-verbose + + +def test_no_verbose_overrides_config_verbose_true(tmp_path: Path, tabular_csv: Path): + """``--no-verbose`` (BooleanOptionalAction) must override a config-level + ``"verbose": true`` to false — the documented precedence rule. 
+ """ + config_path = tmp_path / "cfg.json" + config_path.write_text(json.dumps({"verbose": True})) + rc, _, err = _run( + [ + "transform", + "--input", + str(tabular_csv), + "--output", + str(tmp_path / "out.csv"), + "--target", + "y", + "--config", + str(config_path), + "--no-verbose", + "--max-features", + "5", + "--json", + ] + ) + assert rc == 0, err + + +def test_verbose_overrides_config_verbose_false(tmp_path: Path, tabular_csv: Path): + config_path = tmp_path / "cfg.json" + config_path.write_text(json.dumps({"verbose": False})) + rc, _, err = _run( + [ + "transform", + "--input", + str(tabular_csv), + "--output", + str(tmp_path / "out.csv"), + "--target", + "y", + "--config", + str(config_path), + "--verbose", + "--max-features", + "5", + ] + ) + assert rc == 0, err + + +# ----------------------- explain subparser doesn't expose selection-only flags + + +def test_explain_rejects_selection_methods_flag(tmp_path: Path, tabular_csv: Path): + """``explain`` always disables selection, so accepting ``--selection-methods`` + on the CLI would silently mis-configure the user. The subparser must not + advertise it. 
+ """ + rc, _, err = _run( + [ + "explain", + "--input", + str(tabular_csv), + "--target", + "y", + "--selection-methods", + "mutual_info", + ] + ) + assert rc == 2 + assert "unrecognized" in err.lower() or "--selection-methods" in err.lower() + + +def test_explain_rejects_max_features_flag(tmp_path: Path, tabular_csv: Path): + rc, _, err = _run( + [ + "explain", + "--input", + str(tabular_csv), + "--target", + "y", + "--max-features", + "10", + ] + ) + assert rc == 2 + + +def test_explain_rejects_correlation_threshold_flag(tmp_path: Path, tabular_csv: Path): + rc, _, err = _run( + [ + "explain", + "--input", + str(tabular_csv), + "--target", + "y", + "--correlation-threshold", + "0.9", + ] + ) + assert rc == 2 + + +def test_explain_target_help_no_longer_says_required_for_selection(): + """The ``--target`` help on ``explain`` must not claim it gates selection + (selection is intentionally disabled in ``explain``). + """ + import argparse as _argparse + + parser = fc_cli._build_parser() + # argparse stores subparsers under a special action attribute + explain_parser = next( + action.choices["explain"] for action in parser._actions if isinstance(action, _argparse._SubParsersAction) + ) + target_help = next(a.help for a in explain_parser._actions if "--target" in a.option_strings) + assert "required for selection" not in target_help + assert "leakage" in target_help.lower() or "task context" in target_help.lower() + + +# ----------------------- I/O OSError normalization + + +def test_input_directory_returns_exit_2(tmp_path: Path): + """Pointing ``--input`` at a directory must surface as exit 2.""" + in_dir = tmp_path / "i_am_a_dir.csv" + in_dir.mkdir() + rc, _, err = _run( + [ + "transform", + "--input", + str(in_dir), + "--output", + str(tmp_path / "out.csv"), + "--target", + "y", + ] + ) + assert rc == 2 + assert "directory" in err.lower() + + +def test_output_directory_returns_exit_2(tmp_path: Path, tabular_csv: Path): + """Pointing ``--output`` at an existing 
+ directory must surface as exit 2.""" + out_dir = tmp_path / "i_am_a_dir.csv" + out_dir.mkdir() + rc, _, err = _run( + [ + "transform", + "--input", + str(tabular_csv), + "--output", + str(out_dir), + "--target", + "y", + ] + ) + assert rc == 2 + assert "directory" in err.lower() + + +def test_unwritable_output_returns_exit_2(tmp_path: Path, tabular_csv: Path, monkeypatch): + """An ``OSError`` on write (e.g. permission denied) must surface as exit 2.""" + import pandas as pd + + def _raise_oserror(self, *args, **kwargs): + raise PermissionError("simulated write failure") + + monkeypatch.setattr(pd.DataFrame, "to_csv", _raise_oserror, raising=True) + + rc, _, err = _run( + [ + "transform", + "--input", + str(tabular_csv), + "--output", + str(tmp_path / "out.csv"), + "--target", + "y", + ] + ) + assert rc == 2 + assert "failed to write" in err.lower() + + +def test_unreadable_input_csv_returns_exit_2(tmp_path: Path, tabular_csv: Path, monkeypatch): + """An ``OSError`` while reading the input must surface as exit 2.""" + import pandas as pd + + def _raise_oserror(*args, **kwargs): + raise PermissionError("simulated read failure") + + monkeypatch.setattr(pd, "read_csv", _raise_oserror, raising=True) + + rc, _, err = _run( + [ + "transform", + "--input", + str(tabular_csv), + "--output", + str(tmp_path / "out.csv"), + "--target", + "y", + ] + ) + assert rc == 2 + assert "failed to read" in err.lower() + + # -------------------------------------------------------------- error paths From 398c9327137ccb8c91f5222f1eb73c2a5df5a952 Mon Sep 17 00:00:00 2001 From: Li Jiang Date: Sun, 3 May 2026 19:49:57 +0800 Subject: [PATCH 08/30] test(cli): cover new round-4 OSError + scalar-type branches Adds 6 targeted tests so codecov/patch lifts back above the 88.90% target after round-4's expanded I/O error normalization (it dipped to 87.85% on commit 88e71ea): * JSON read OSError -> exit 2 + "failed to read json" * JSON write OSError -> exit 2 + "failed to write json" * Parquet read 
OSError -> exit 2 + "failed to read parquet" * Parquet write OSError (vs ImportError) -> exit 2 + "failed to write parquet" * Output parent-mkdir OSError -> exit 2 + "create parent directory" * `_check_scalar_type` direct unit test for the `allow_none=False` + `value is None` branch (not naturally hit via integration since every scalar with `allow_none=False` has a non-None default). Tests: 59 (+6 new) in tests/test_cli.py. Project coverage bumped from 88.83% to 89.22%. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- tests/test_cli.py | 139 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 139 insertions(+) diff --git a/tests/test_cli.py b/tests/test_cli.py index eb0ffd1..1be2702 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -609,6 +609,145 @@ def _raise_oserror(*args, **kwargs): assert "failed to read" in err.lower() +def test_unreadable_input_json_returns_exit_2(tmp_path: Path, tabular_csv: Path, monkeypatch): + """``OSError`` from ``pd.read_json`` is surfaced as exit 2 too.""" + import pandas as pd + + in_path = tmp_path / "in.json" + in_path.write_text("[]") # contents irrelevant; we'll intercept + + def _raise_oserror(*args, **kwargs): + raise PermissionError("simulated read failure") + + monkeypatch.setattr(pd, "read_json", _raise_oserror, raising=True) + + rc, _, err = _run( + [ + "transform", + "--input", + str(in_path), + "--output", + str(tmp_path / "out.csv"), + "--target", + "y", + ] + ) + assert rc == 2 + assert "failed to read json" in err.lower() + + +def test_unreadable_input_parquet_returns_exit_2(tmp_path: Path, monkeypatch): + """``OSError`` from ``pd.read_parquet`` (e.g. 
corrupt file) is exit 2.""" + import pandas as pd + + in_path = tmp_path / "in.parquet" + in_path.write_bytes(b"") + + def _raise_oserror(*args, **kwargs): + raise OSError("simulated parquet read failure") + + monkeypatch.setattr(pd, "read_parquet", _raise_oserror, raising=True) + + rc, _, err = _run( + [ + "transform", + "--input", + str(in_path), + "--output", + str(tmp_path / "out.csv"), + "--target", + "y", + ] + ) + assert rc == 2 + assert "failed to read parquet" in err.lower() + + +def test_unwritable_output_json_returns_exit_2(tmp_path: Path, tabular_csv: Path, monkeypatch): + import pandas as pd + + def _raise_oserror(self, *args, **kwargs): + raise PermissionError("simulated json write failure") + + monkeypatch.setattr(pd.DataFrame, "to_json", _raise_oserror, raising=True) + + rc, _, err = _run( + [ + "transform", + "--input", + str(tabular_csv), + "--output", + str(tmp_path / "out.json"), + "--target", + "y", + ] + ) + assert rc == 2 + assert "failed to write json" in err.lower() + + +def test_unwritable_output_parquet_returns_exit_2(tmp_path: Path, tabular_csv: Path, monkeypatch): + """``OSError`` (vs ``ImportError``) from ``DataFrame.to_parquet`` -> exit 2.""" + import pandas as pd + + def _raise_oserror(self, *args, **kwargs): + raise OSError("simulated parquet write failure") + + monkeypatch.setattr(pd.DataFrame, "to_parquet", _raise_oserror, raising=True) + + rc, _, err = _run( + [ + "transform", + "--input", + str(tabular_csv), + "--output", + str(tmp_path / "out.parquet"), + "--target", + "y", + ] + ) + assert rc == 2 + assert "failed to write parquet" in err.lower() + + +def test_uncreatable_parent_directory_returns_exit_2(tmp_path: Path, tabular_csv: Path, monkeypatch): + """If creating the output's parent directory fails, exit 2 with a clean message.""" + real_mkdir = Path.mkdir + + def _raise_oserror(self, *args, **kwargs): + # Only fail for our test's would-be output parent so other calls (e.g. 
+ # tmp_path operations under the hood) still work. + if "deep" in self.parts: + raise PermissionError("simulated mkdir failure") + return real_mkdir(self, *args, **kwargs) + + monkeypatch.setattr(Path, "mkdir", _raise_oserror, raising=True) + + rc, _, err = _run( + [ + "transform", + "--input", + str(tabular_csv), + "--output", + str(tmp_path / "deep" / "nested" / "out.csv"), + "--target", + "y", + ] + ) + assert rc == 2 + assert "create parent directory" in err.lower() + + +def test_check_scalar_type_rejects_none_when_required(): + """Direct unit test for ``_check_scalar_type`` to exercise the + ``allow_none=False`` + ``value is None`` branch, which the integration + path doesn't naturally hit (every scalar with ``allow_none=False`` has + a non-None default). + """ + with pytest.raises(ValueError, match="must not be null"): + fc_cli._check_scalar_type("foo", None, (int,), allow_none=False) + + # -------------------------------------------------------------- error paths From bb9e77c91fcfc30e933ba893e78895e20f886c46 Mon Sep 17 00:00:00 2001 From: Li Jiang Date: Sun, 3 May 2026 20:23:37 +0800 Subject: [PATCH 09/30] fix(cli): address round-5 review feedback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses all five new comments from copilot-pull-request-reviewer and chatgpt-codex-connector on PR #5 (commit 88e71ea): * Type-validate `verbose` from `--config` (Copilot LsiH, Codex LtF8 P2). `verbose = bool(pick(...))` silently coerced malformed config values like `{"verbose": "false"}` (truthy string) into `True`. `_build_engineer` now type-checks `verbose` via the existing `_check_scalar_type` helper, rejecting non-bool values with a clean exit-2 error consistent with the other scalar fields. Parametrized test covers `"true"` / `"false"` / `1` / `0`. * Restore `--max-features` on `explain` (Copilot Lsia + Lsim). `--max-features` is *not* a selection-only flag — `AutoFeatureEngineer` forwards it into engine construction (e.g. 
the tabular engine uses it to cap the number of generated features), so removing it from `explain` deprived callers of the only CLI-level handle on the explanation payload size. The `include_selection_args=False` mode in `_add_engineer_args` now only excludes `--selection-methods` and `--correlation-threshold`; `--max-features` is exposed on every engineer-using subcommand. Test `test_explain_accepts_max_features_flag` asserts `explain` succeeds with the flag (replacing the prior reject-all test). * Reject missing --target when selection is enabled (Copilot Lsis). `transform` without `--target` previously called `fit_transform(apply_selection=True)`, which silently no-ops the selector (only built when `y is not None`). The CLI now raises a clean `ValueError` -> exit 2 with a precise message: "--target is required when feature selection is applied. Pass --target , or pass --no-selection to skip selection." Two tests cover the new branch (selection -> exit 2; --no-selection with no target -> exit 0). Tests: 65 (+6 net) in tests/test_cli.py, 838 passed full suite. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- featcopilot/cli.py | 52 ++++++++++++++++++++++------ tests/test_cli.py | 85 +++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 122 insertions(+), 15 deletions(-) diff --git a/featcopilot/cli.py b/featcopilot/cli.py index d6cf451..c4f0baf 100644 --- a/featcopilot/cli.py +++ b/featcopilot/cli.py @@ -290,15 +290,17 @@ def pick(flag_value, config_key, default): return default engines = pick(args.engines, "engines", ["tabular"]) - # ``explain`` does not expose selection-only flags on argparse, so use - # ``getattr(..., None)`` to safely fall through to config / defaults - # without requiring the attribute to exist on the namespace. + # ``explain`` exposes ``--engines`` and ``--max-features`` (engine-level + # caps) but not the selection-only flags ``--selection-methods`` and + # ``--correlation-threshold``. 
Use ``getattr(..., None)`` for the + # latter so we can fall through to config / defaults without requiring + # the attribute to exist on the namespace. selection_methods = pick( getattr(args, "selection_methods", None), "selection_methods", ["mutual_info", "importance"], ) - max_features = pick(getattr(args, "max_features", None), "max_features", None) + max_features = pick(args.max_features, "max_features", None) correlation_threshold = pick(getattr(args, "correlation_threshold", None), "correlation_threshold", 0.85) leakage_guard = pick(args.leakage_guard, "leakage_guard", "warn") gate_n_jobs = pick(args.gate_n_jobs, "gate_n_jobs", 1) @@ -331,7 +333,15 @@ def pick(flag_value, config_key, default): f"got {type(llm_config_raw).__name__}={llm_config_raw!r}." ) - verbose = bool(pick(args.verbose, "verbose", False)) + # ``verbose`` is type-checked before being forwarded so a malformed + # config like ``{"verbose": "false"}`` (truthy string) does NOT silently + # turn verbose mode on — instead it raises a clean exit-2 error + # consistent with the other scalar fields. ``args.verbose`` is already + # a bool / None thanks to ``BooleanOptionalAction``; only the config + # path can introduce a non-bool. + verbose_raw = pick(args.verbose, "verbose", False) + _check_scalar_type("verbose", verbose_raw, (bool,)) + verbose = bool(verbose_raw) # Pass ``engines`` / ``selection_methods`` through *unchanged* (no # ``list(...)`` wrapping). Coercion would convert a misconfigured @@ -404,6 +414,17 @@ def _cmd_transform(args: argparse.Namespace) -> int: df = _read_table(input_path, in_fmt) X, y = _split_xy(df, args.target) + # Selection requires a target column to fit against. Without ``--target``, + # ``AutoFeatureEngineer.fit_transform(apply_selection=True)`` silently + # degrades to an unselected run because the selector is only built when + # ``y is not None``. Surface that as a clean exit-2 user error rather than + # silently producing the same output as ``--no-selection``. 
+ if not args.no_selection and args.target is None: + raise ValueError( + "--target is required when feature selection is applied. " + "Pass --target , or pass --no-selection to skip selection." + ) + engineer = _build_engineer(args) transformed = engineer.fit_transform( X, @@ -593,11 +614,13 @@ def _add_engineer_args(p: argparse.ArgumentParser, *, include_selection_args: bo """Add ``AutoFeatureEngineer``-related flags to a subparser. ``include_selection_args=False`` omits selection-only flags - (``--selection-methods``, ``--correlation-threshold``, - ``--max-features``) — these would be silently ignored by the - ``explain`` subcommand, which always runs with selection disabled, - and surfacing them in ``--help`` would be a confusing API for - automation. + (``--selection-methods`` and ``--correlation-threshold``) — these are + silently ignored by the ``explain`` subcommand, which always runs with + selection disabled. ``--max-features`` is *not* selection-only: + ``AutoFeatureEngineer`` forwards it into engine construction (e.g. the + tabular engine uses it to cap the number of generated features), so it + is exposed even when ``include_selection_args=False`` to give callers + a CLI-level handle on the engine output size. """ p.add_argument( "--engines", @@ -605,6 +628,14 @@ def _add_engineer_args(p: argparse.ArgumentParser, *, include_selection_args: bo choices=sorted(AutoFeatureEngineer.SUPPORTED_ENGINES), help="Engines to use (default: tabular).", ) + # ``--max-features`` is exposed on every engineer-using subcommand + # because it caps engine output, not just selection — see the + # ``AutoFeatureEngineer`` constructor and ``TabularEngine``. 
+ p.add_argument( + "--max-features", + type=int, + help="Maximum number of features to generate / keep (forwarded to engines and selector).", + ) if include_selection_args: p.add_argument( "--selection-methods", @@ -612,7 +643,6 @@ def _add_engineer_args(p: argparse.ArgumentParser, *, include_selection_args: bo choices=sorted(AutoFeatureEngineer.SUPPORTED_SELECTION_METHODS), help="Selection methods (default: mutual_info importance).", ) - p.add_argument("--max-features", type=int, help="Maximum number of features to keep.") p.add_argument( "--correlation-threshold", type=float, diff --git a/tests/test_cli.py b/tests/test_cli.py index 1be2702..f32fdb3 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -451,6 +451,76 @@ def test_verbose_overrides_config_verbose_false(tmp_path: Path, tabular_csv: Pat assert rc == 0, err +@pytest.mark.parametrize( + "value", + ["true", "false", 1, 0], +) +def test_non_bool_verbose_in_config_returns_exit_2(tmp_path: Path, tabular_csv: Path, value): + """A malformed ``"verbose": `` config must hit exit 2 with a + precise message, not silently turn verbose mode on/off via Python's + truthiness rules. + """ + config_path = tmp_path / "cfg.json" + config_path.write_text(json.dumps({"verbose": value})) + rc, _, err = _run( + [ + "transform", + "--input", + str(tabular_csv), + "--output", + str(tmp_path / "out.csv"), + "--target", + "y", + "--config", + str(config_path), + "--max-features", + "5", + ] + ) + assert rc == 2 + assert "verbose" in err + + +def test_transform_missing_target_with_selection_returns_exit_2(tmp_path: Path, tabular_csv: Path): + """Without ``--target``, selection silently degrades to a no-op. The CLI + must surface that as a clean exit-2 user error so automation can react. 
+ """ + rc, _, err = _run( + [ + "transform", + "--input", + str(tabular_csv), + "--output", + str(tmp_path / "out.csv"), + "--max-features", + "5", + ] + ) + assert rc == 2 + assert "--target" in err + assert "selection" in err.lower() + + +def test_transform_missing_target_with_no_selection_succeeds(tmp_path: Path, tabular_csv: Path): + """Once selection is opted out, the missing target is no longer an error + (selection requires a target; raw transform doesn't). + """ + # Drop the target column so we can run without --target. + in_path = tmp_path / "in_notarget.csv" + pd.read_csv(tabular_csv).drop(columns=["y"]).to_csv(in_path, index=False) + rc, _, err = _run( + [ + "transform", + "--input", + str(in_path), + "--output", + str(tmp_path / "out.csv"), + "--no-selection", + ] + ) + assert rc == 0, err + + # ----------------------- explain subparser doesn't expose selection-only flags @@ -474,8 +544,13 @@ def test_explain_rejects_selection_methods_flag(tmp_path: Path, tabular_csv: Pat assert "unrecognized" in err.lower() or "--selection-methods" in err.lower() -def test_explain_rejects_max_features_flag(tmp_path: Path, tabular_csv: Path): - rc, _, err = _run( +def test_explain_accepts_max_features_flag(tmp_path: Path, tabular_csv: Path): + """``--max-features`` is *not* selection-only — ``AutoFeatureEngineer`` + forwards it into engine construction (e.g. the tabular engine uses it + to cap how many features it generates). ``explain`` must therefore + expose it so callers can bound the size of the explanation payload. 
+ """ + rc, out, err = _run( [ "explain", "--input", @@ -483,10 +558,12 @@ def test_explain_rejects_max_features_flag(tmp_path: Path, tabular_csv: Path): "--target", "y", "--max-features", - "10", + "5", ] ) - assert rc == 2 + assert rc == 0, err + payload = json.loads(out) + assert payload["status"] == "ok" def test_explain_rejects_correlation_threshold_flag(tmp_path: Path, tabular_csv: Path): From 586c51fa039e602b6a2cc032fe13e8c310348179 Mon Sep 17 00:00:00 2001 From: Li Jiang Date: Sun, 3 May 2026 20:58:31 +0800 Subject: [PATCH 10/30] fix(cli): address round-6 review feedback Addresses all three new comments from copilot-pull-request-reviewer on PR #5 (commit bb9e77c): * Loosen --target requirement (Copilot L5bO). Round-5 made --target unconditionally required when selection was enabled, but `AutoFeatureEngineer` only actually fits a selector when `y is not None` AND `max_features` is set. With the default `max_features=None` the call is a raw feature-generation run and needs no target. The CLI now mirrors that contract: `--target` is required only when selection is enabled AND `--max-features` is configured (CLI flag or config). Three new tests: - missing target + no max_features -> exit 0 (raw transform) - missing target + --max-features -> exit 2 (selection would run) - missing target + max_features in config -> exit 2 * explain ignores selection-only config keys (Copilot L5bU). `_build_engineer` learned a `include_selection_config` flag. `_cmd_explain` calls `_build_engineer(args, include_selection_config=False)` so a shared transform/explain config with `selection_methods` / `correlation_threshold` no longer trips `explain` over keys that are inert at runtime (selection is disabled in `explain`). `test_explain_ignores_selection_only_config_keys` covers the new behavior with a mixed config. * PR description vs. shipped CLI surface (Copilot L5bc). 
The PR description on GitHub is updated separately to remove the stale claim that `--selection-methods` and `--correlation-threshold` are accepted on `explain`. Tests: 68 (+3 new) in tests/test_cli.py, 841 passed full suite. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- featcopilot/cli.py | 62 ++++++++++++++++++++++++----------- tests/test_cli.py | 80 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 123 insertions(+), 19 deletions(-) diff --git a/featcopilot/cli.py b/featcopilot/cli.py index c4f0baf..93ca7a1 100644 --- a/featcopilot/cli.py +++ b/featcopilot/cli.py @@ -264,7 +264,7 @@ def _check_scalar_type( ) -def _build_engineer(args: argparse.Namespace) -> AutoFeatureEngineer: +def _build_engineer(args: argparse.Namespace, *, include_selection_config: bool = True) -> AutoFeatureEngineer: """Construct an :class:`AutoFeatureEngineer` from parsed CLI args. Precedence: explicit CLI flags override values from ``--config``; @@ -274,6 +274,12 @@ def _build_engineer(args: argparse.Namespace) -> AutoFeatureEngineer: canonical (and deterministic) error path — the CLI's wrapper must not silently rewrite a misconfigured config into something that looks different from what the user wrote. + + ``include_selection_config=False`` (used by the ``explain`` subcommand) + skips reading selection-only config keys (``selection_methods``, + ``correlation_threshold``) so a shared config file with selection + settings does not cause ``explain`` to fail config-validation for keys + that are inert at runtime (selection is disabled in ``explain``). """ config = _load_config(args.config) @@ -292,16 +298,22 @@ def pick(flag_value, config_key, default): engines = pick(args.engines, "engines", ["tabular"]) # ``explain`` exposes ``--engines`` and ``--max-features`` (engine-level # caps) but not the selection-only flags ``--selection-methods`` and - # ``--correlation-threshold``. 
Use ``getattr(..., None)`` for the - # latter so we can fall through to config / defaults without requiring - # the attribute to exist on the namespace. - selection_methods = pick( - getattr(args, "selection_methods", None), - "selection_methods", - ["mutual_info", "importance"], - ) + # ``--correlation-threshold``. When ``include_selection_config`` is + # False (i.e. we're called from ``explain``) we also skip reading the + # selection-only keys from the config file, so a shared transform/explain + # config with selection settings won't trip ``explain`` over keys that + # have no effect on its runtime behavior. + if include_selection_config: + selection_methods = pick( + getattr(args, "selection_methods", None), + "selection_methods", + ["mutual_info", "importance"], + ) + correlation_threshold = pick(getattr(args, "correlation_threshold", None), "correlation_threshold", 0.85) + else: + selection_methods = ["mutual_info", "importance"] + correlation_threshold = 0.85 max_features = pick(args.max_features, "max_features", None) - correlation_threshold = pick(getattr(args, "correlation_threshold", None), "correlation_threshold", 0.85) leakage_guard = pick(args.leakage_guard, "leakage_guard", "warn") gate_n_jobs = pick(args.gate_n_jobs, "gate_n_jobs", 1) @@ -414,15 +426,27 @@ def _cmd_transform(args: argparse.Namespace) -> int: df = _read_table(input_path, in_fmt) X, y = _split_xy(df, args.target) - # Selection requires a target column to fit against. Without ``--target``, - # ``AutoFeatureEngineer.fit_transform(apply_selection=True)`` silently - # degrades to an unselected run because the selector is only built when - # ``y is not None``. Surface that as a clean exit-2 user error rather than - # silently producing the same output as ``--no-selection``. - if not args.no_selection and args.target is None: + # Selection requires a target column to fit against. 
``AutoFeatureEngineer`` + # only actually fits a selector when ``y is not None`` AND ``max_features`` + # is set; without ``max_features`` the call is a raw feature-generation + # run and does not need a target. The CLI mirrors that contract: only + # require ``--target`` when both selection is enabled (the default) AND + # ``max_features`` is configured (CLI flag or config), so commands like + # ``featcopilot transform --input in.csv --output out.csv`` (no target, + # no cap) still work. + effective_max_features = args.max_features + if effective_max_features is None and args.config is not None: + try: + cfg_max = _load_config(args.config).get("max_features") + except (FileNotFoundError, ValueError): + cfg_max = None + if cfg_max is not None: + effective_max_features = cfg_max + if not args.no_selection and args.target is None and effective_max_features is not None: raise ValueError( - "--target is required when feature selection is applied. " - "Pass --target , or pass --no-selection to skip selection." + "--target is required when feature selection is applied " + "(i.e. when --max-features / config max_features is set). " + "Pass --target , or pass --no-selection / drop --max-features to skip selection." 
) engineer = _build_engineer(args) @@ -483,7 +507,7 @@ def _cmd_explain(args: argparse.Namespace) -> int: df = _read_table(input_path, in_fmt) X, y = _split_xy(df, args.target) - engineer = _build_engineer(args) + engineer = _build_engineer(args, include_selection_config=False) engineer.fit_transform( X, y, diff --git a/tests/test_cli.py b/tests/test_cli.py index f32fdb3..e5bbb81 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -521,6 +521,86 @@ def test_transform_missing_target_with_no_selection_succeeds(tmp_path: Path, tab assert rc == 0, err +def test_transform_missing_target_no_max_features_succeeds(tmp_path: Path, tabular_csv: Path): + """Without ``--max-features`` (and the corresponding config key), + ``AutoFeatureEngineer`` doesn't actually fit a selector even with + ``apply_selection=True``, so requiring ``--target`` would be a false + positive. Raw feature generation without target / without cap must + therefore succeed. + """ + in_path = tmp_path / "in_notarget.csv" + pd.read_csv(tabular_csv).drop(columns=["y"]).to_csv(in_path, index=False) + rc, _, err = _run( + [ + "transform", + "--input", + str(in_path), + "--output", + str(tmp_path / "out.csv"), + ] + ) + assert rc == 0, err + + +def test_transform_missing_target_max_features_in_config_returns_exit_2(tmp_path: Path, tabular_csv: Path): + """The ``--target`` requirement also fires when ``max_features`` comes + from ``--config`` (not just the CLI flag), since the selector will + actually run in that case. 
+ """ + config_path = tmp_path / "cfg.json" + config_path.write_text(json.dumps({"max_features": 5})) + in_path = tmp_path / "in_notarget.csv" + pd.read_csv(tabular_csv).drop(columns=["y"]).to_csv(in_path, index=False) + rc, _, err = _run( + [ + "transform", + "--input", + str(in_path), + "--output", + str(tmp_path / "out.csv"), + "--config", + str(config_path), + ] + ) + assert rc == 2 + assert "--target" in err + + +def test_explain_ignores_selection_only_config_keys(tmp_path: Path, tabular_csv: Path): + """A shared transform/explain config with selection-only keys + (``selection_methods`` / ``correlation_threshold``) must not break + ``explain``: those keys are inert at runtime (selection is disabled + in ``explain``) and ``_build_engineer(include_selection_config=False)`` + skips reading them so config-validation does not fire. + """ + config_path = tmp_path / "cfg.json" + # Use *valid* selection_methods values; the point is they''re ignored. + config_path.write_text( + json.dumps( + { + "engines": ["tabular"], + "selection_methods": ["mutual_info"], + "correlation_threshold": 0.5, + "max_features": 5, + } + ) + ) + rc, out, err = _run( + [ + "explain", + "--input", + str(tabular_csv), + "--target", + "y", + "--config", + str(config_path), + ] + ) + assert rc == 0, err + payload = json.loads(out) + assert payload["status"] == "ok" + + # ----------------------- explain subparser doesn't expose selection-only flags From dc4e5b95aee8ad304d64ff8a77e15023a9c9b9e7 Mon Sep 17 00:00:00 2001 From: Li Jiang Date: Sun, 3 May 2026 21:28:22 +0800 Subject: [PATCH 11/30] fix(cli): address round-7 review feedback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses both new comments from chatgpt-codex-connector on PR #5 (commit 586c51f): * Normalize all parquet read backend errors to exit 2 (Codex P2). `_read_table` now catches `Exception` (not just `OSError`) for the parquet branch. 
This routes engine-level failures like `pyarrow.lib.ArrowInvalid` (corrupt parquet) and fastparquet's metadata `ValueError`s through the deterministic exit-2 path instead of the generic exit-1 backstop. `Exception` is the right scope because parquet I/O is fully delegated to a third-party backend; any error raised is by definition an I/O or data issue, not a CLI bug. * Same broad catch for parquet write (Codex P2). `_write_table` parquet branch now also catches `Exception`, so pyarrow type / conversion errors for unsupported column values produce a clean exit 2 with a "Failed to write parquet to ..." message rather than exit 1 "unexpected error". Two new tests use stand-in `Exception` subclasses (not `OSError`) to verify both paths route to exit 2 — closing the previously demonstrated gap. Tests: 70 (+2 new) in tests/test_cli.py. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- featcopilot/cli.py | 17 ++++++++++-- tests/test_cli.py | 65 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 80 insertions(+), 2 deletions(-) diff --git a/featcopilot/cli.py b/featcopilot/cli.py index 93ca7a1..b0ae14b 100644 --- a/featcopilot/cli.py +++ b/featcopilot/cli.py @@ -132,7 +132,14 @@ def _read_table(path: Path, fmt: str): f"Reading parquet requires a parquet engine (pyarrow or fastparquet); " f"install one of them, or convert the input to CSV/JSON. Original error: {exc}" ) from exc - except OSError as exc: + except Exception as exc: + # Catch *any* backend failure (``OSError`` for I/O, + # ``pyarrow.lib.ArrowInvalid`` for corrupt files, + # ``ValueError`` from ``fastparquet`` for malformed metadata, + # etc.) and surface it via the deterministic exit-2 path. + # Catching ``Exception`` is appropriate here because the entire + # operation is delegated to a third-party backend; any error + # raised is by definition an I/O or data issue, not a CLI bug. 
raise ValueError(f"Failed to read parquet from {str(path)!r}: {exc}") from exc if fmt == "json": # ``orient='records'`` is the agent-friendly default; fall back to @@ -179,7 +186,13 @@ def _write_table(df, path: Path, fmt: str) -> None: f"Writing parquet requires a parquet engine (pyarrow or fastparquet); " f"install one of them, or pick CSV/JSON via --output-format. Original error: {exc}" ) from exc - except OSError as exc: + except Exception as exc: + # Same broad-catch rationale as ``_read_table``: parquet write + # is fully delegated to a backend (``pyarrow``/``fastparquet``) + # whose errors include ``OSError`` (I/O), engine-specific type + # / conversion exceptions for unsupported column values, etc. + # All of these are user-facing data issues, not CLI bugs, so + # they should produce a clean exit-2 failure. raise ValueError(f"Failed to write parquet to {str(path)!r}: {exc}") from exc elif fmt == "json": try: diff --git a/tests/test_cli.py b/tests/test_cli.py index e5bbb81..a4f1715 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -867,6 +867,71 @@ def _raise_oserror(self, *args, **kwargs): assert "failed to write parquet" in err.lower() +def test_parquet_read_engine_error_returns_exit_2(tmp_path: Path, monkeypatch): + """A non-OSError parquet *backend* error (e.g. ``pyarrow.lib.ArrowInvalid`` + for a corrupt file) must surface as exit 2, not the generic exit 1 + "unexpected error" backstop. The CLI catches ``Exception`` for parquet + operations because they are fully delegated to a third-party backend + whose failures are by definition user-facing data issues. 
+ """ + import pandas as pd + + in_path = tmp_path / "fake.parquet" + in_path.write_bytes(b"\x00\x01\x02\x03") # not a real parquet file + + class _FakeArrowInvalid(Exception): + """Stand-in for ``pyarrow.lib.ArrowInvalid`` (also subclasses Exception).""" + + def _raise_backend_error(*args, **kwargs): + raise _FakeArrowInvalid("simulated corrupt parquet") + + monkeypatch.setattr(pd, "read_parquet", _raise_backend_error, raising=True) + + rc, _, err = _run( + [ + "transform", + "--input", + str(in_path), + "--output", + str(tmp_path / "out.csv"), + "--target", + "y", + ] + ) + assert rc == 2 + assert "failed to read parquet" in err.lower() + + +def test_parquet_write_engine_error_returns_exit_2(tmp_path: Path, tabular_csv: Path, monkeypatch): + """Same coverage on the write side: a backend-level pyarrow exception + that is *not* an ``OSError`` (e.g. an unsupported column-type + conversion error) must produce exit 2, not exit 1. + """ + import pandas as pd + + class _FakeArrowTypeError(Exception): + pass + + def _raise_backend_error(self, *args, **kwargs): + raise _FakeArrowTypeError("simulated unsupported column dtype for parquet") + + monkeypatch.setattr(pd.DataFrame, "to_parquet", _raise_backend_error, raising=True) + + rc, _, err = _run( + [ + "transform", + "--input", + str(tabular_csv), + "--output", + str(tmp_path / "out.parquet"), + "--target", + "y", + ] + ) + assert rc == 2 + assert "failed to write parquet" in err.lower() + + def test_uncreatable_parent_directory_returns_exit_2(tmp_path: Path, tabular_csv: Path, monkeypatch): """If creating the output's parent directory fails, exit 2 with a clean message.""" real_mkdir = Path.mkdir From 459b1b93650e63646234bb4fd8f9ee220b3c6a63 Mon Sep 17 00:00:00 2001 From: Li Jiang Date: Sun, 3 May 2026 22:04:40 +0800 Subject: [PATCH 12/30] fix(cli): address round-8 review feedback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses all three new comments from 
copilot-pull-request-reviewer on PR #5 (commit dc4e5b9): * stderr is reserved for failures (Copilot MIZN + MIZS). `AutoFeatureEngineer.fit` calls `warnings.warn(...)` for leakage-prone column names under the default `leakage_guard='warn'`, bleeding non-empty stderr onto a zero-exit success path and breaking the CLI's agent-friendly contract. New `_fit_transform_capturing_warnings` and `_fit_capturing_warnings` helpers wrap the engineer call in `warnings.catch_warnings(record=True)` and surface every captured message as a JSON-serializable string list under a new `warnings` field in the success payload — keeping stderr deterministic for agent / tool-use parsing. Both `transform` and `explain` use the helpers. Tests assert `stderr == ""` on a successful run with a column name (`label_encoded`) that triggers the leakage heuristic. * --target check runs after type validation (Copilot MIZY). Round-6's pre-check used the raw `args.max_features` (and tried to read it from config), so a malformed value like `{"max_features": "5"}` (string) or `"max_features": -1` was reported as `--target is required` instead of the real type validation error. `_cmd_transform` now builds the engineer FIRST (which runs all scalar / list / dict `_check_scalar_type` validation on the merged CLI + config view), then performs the `--target` check using the validated `engineer.max_features` attribute. Users with a malformed config now see the precise type error and can remediate; they no longer get sent down the wrong path. `test_invalid_max_features_in_config_takes_precedence_over_target_check` asserts the error mentions `max_features` and *not* `--target`. Tests: 73 (+3 new) in tests/test_cli.py, 846 passed full suite. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- featcopilot/cli.py | 72 +++++++++++++++++++++++------ tests/test_cli.py | 112 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 171 insertions(+), 13 deletions(-) diff --git a/featcopilot/cli.py b/featcopilot/cli.py index b0ae14b..08504cc 100644 --- a/featcopilot/cli.py +++ b/featcopilot/cli.py @@ -426,6 +426,47 @@ def _cmd_info(args: argparse.Namespace) -> int: return 0 +def _fit_transform_capturing_warnings(engineer, X, y, **kwargs): + """Run ``engineer.fit_transform(X, y, **kwargs)`` while capturing any + Python ``UserWarning`` (or other warning) it emits. + + The CLI contract is that stdout carries the JSON payload and stderr is + reserved for failures. ``AutoFeatureEngineer.fit`` calls + ``warnings.warn(...)`` for leakage-prone column names under the default + ``leakage_guard='warn'``, which would otherwise bleed onto stderr on a + successful run and break agent / tool-use error parsing. This helper + intercepts those warnings, collects them as JSON-serializable strings, + and lets the caller surface them inside the ``warnings`` field of the + success payload — keeping stderr deterministic. + + Returns + ------- + (warnings_list, result) + ``warnings_list`` is a list of ``str`` (one entry per warning, in + emission order). ``result`` is whatever ``fit_transform`` returned. 
+ """ + import warnings as _warnings + + captured: list[str] = [] + with _warnings.catch_warnings(record=True) as caught: + _warnings.simplefilter("always") + result = engineer.fit_transform(X, y, **kwargs) + captured.extend(str(w.message) for w in caught) + return captured, result + + +def _fit_capturing_warnings(engineer, X, y, **kwargs): + """Sibling of :func:`_fit_transform_capturing_warnings` for explain.""" + import warnings as _warnings + + captured: list[str] = [] + with _warnings.catch_warnings(record=True) as caught: + _warnings.simplefilter("always") + engineer.fit_transform(X, y, **kwargs) + captured.extend(str(w.message) for w in caught) + return captured + + def _cmd_transform(args: argparse.Namespace) -> int: """Read input, fit/transform, write output.""" input_path = Path(args.input) @@ -439,6 +480,13 @@ def _cmd_transform(args: argparse.Namespace) -> int: df = _read_table(input_path, in_fmt) X, y = _split_xy(df, args.target) + # Build the engineer first: ``_build_engineer`` runs all scalar / list / + # dict type validation on the merged CLI-flag + config view, so any + # malformed value (e.g. ``"max_features": "5"``, ``"verbose": "false"``) + # surfaces a precise exit-2 error here rather than down the wrong + # ``--target is required`` rabbit hole. + engineer = _build_engineer(args) + # Selection requires a target column to fit against. ``AutoFeatureEngineer`` # only actually fits a selector when ``y is not None`` AND ``max_features`` # is set; without ``max_features`` the call is a raw feature-generation @@ -446,24 +494,19 @@ def _cmd_transform(args: argparse.Namespace) -> int: # require ``--target`` when both selection is enabled (the default) AND # ``max_features`` is configured (CLI flag or config), so commands like # ``featcopilot transform --input in.csv --output out.csv`` (no target, - # no cap) still work. 
- effective_max_features = args.max_features - if effective_max_features is None and args.config is not None: - try: - cfg_max = _load_config(args.config).get("max_features") - except (FileNotFoundError, ValueError): - cfg_max = None - if cfg_max is not None: - effective_max_features = cfg_max - if not args.no_selection and args.target is None and effective_max_features is not None: + # no cap) still work. Using ``engineer.max_features`` here means the + # value has already been type-validated, so we never report + # ``--target is required`` when the real problem is a malformed + # ``max_features`` config value. + if not args.no_selection and args.target is None and engineer.max_features is not None: raise ValueError( "--target is required when feature selection is applied " "(i.e. when --max-features / config max_features is set). " "Pass --target , or pass --no-selection / drop --max-features to skip selection." ) - engineer = _build_engineer(args) - transformed = engineer.fit_transform( + captured_warnings, transformed = _fit_transform_capturing_warnings( + engineer, X, y, task_description=args.task_description or "prediction task", @@ -495,6 +538,7 @@ def _cmd_transform(args: argparse.Namespace) -> int: "max_features": engineer.max_features, "target": args.target, "selection_applied": engineer._selector is not None, + "warnings": captured_warnings, } _emit(payload, as_json=args.json) return 0 @@ -521,7 +565,8 @@ def _cmd_explain(args: argparse.Namespace) -> int: X, y = _split_xy(df, args.target) engineer = _build_engineer(args, include_selection_config=False) - engineer.fit_transform( + captured_warnings = _fit_capturing_warnings( + engineer, X, y, task_description=args.task_description or "prediction task", @@ -546,6 +591,7 @@ def _cmd_explain(args: argparse.Namespace) -> int: } for name in feature_names ], + "warnings": captured_warnings, } # explain always emits JSON to stdout (it's the only sensible format), diff --git a/tests/test_cli.py 
b/tests/test_cli.py index a4f1715..0fed666 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -960,6 +960,118 @@ def _raise_oserror(self, *args, **kwargs): assert "create parent directory" in err.lower() +# ----------------------- stderr is reserved for failures (warnings captured) + + +def test_transform_leakage_warning_does_not_pollute_stderr(tmp_path: Path): + """``leakage_guard='warn'`` (the default) must not bleed + ``warnings.warn(...)`` onto stderr on a successful run; the warnings + are captured and surfaced inside the JSON payload's ``warnings`` field + instead, so agents can keep treating non-empty stderr as failure metadata. + """ + rng = np.random.default_rng(0) + n = 200 + df = pd.DataFrame( + { + "x1": rng.normal(size=n), + "x2": rng.normal(size=n), + # ``label_encoded`` is detected as leakage-prone ("label" + "encoded" + # both appear in the stoplist). + "label_encoded": rng.integers(0, 2, size=n), + "y": rng.integers(0, 2, size=n), + } + ) + in_path = tmp_path / "in_with_leakage.csv" + df.to_csv(in_path, index=False) + out_path = tmp_path / "out.csv" + + rc, out, err = _run( + [ + "transform", + "--input", + str(in_path), + "--output", + str(out_path), + "--target", + "y", + "--max-features", + "5", + "--json", + ] + ) + assert rc == 0, err + assert err == "", f"stderr should be empty on success but got: {err!r}" + payload = json.loads(out) + assert payload["status"] == "ok" + # ``warnings`` field is always present; it MAY contain the leakage + # warning depending on the heuristic. The contract being tested is + # that stderr stays clean — not that any specific warning was emitted + # (the leakage detector heuristics evolve). 
+ assert "warnings" in payload + assert isinstance(payload["warnings"], list) + + +def test_explain_leakage_warning_does_not_pollute_stderr(tmp_path: Path): + """``explain`` has the same stderr-cleanliness contract as ``transform``.""" + rng = np.random.default_rng(0) + n = 200 + df = pd.DataFrame( + { + "x1": rng.normal(size=n), + "label_encoded": rng.integers(0, 2, size=n), + "y": rng.integers(0, 2, size=n), + } + ) + in_path = tmp_path / "in.csv" + df.to_csv(in_path, index=False) + + rc, out, err = _run( + [ + "explain", + "--input", + str(in_path), + "--target", + "y", + ] + ) + assert rc == 0, err + assert err == "", f"stderr should be empty on success but got: {err!r}" + payload = json.loads(out) + assert "warnings" in payload + assert isinstance(payload["warnings"], list) + + +# ----------------------- target check runs after type validation + + +def test_invalid_max_features_in_config_takes_precedence_over_target_check(tmp_path: Path, tabular_csv: Path): + """A malformed ``max_features`` in ``--config`` (string, negative, etc.) + must surface its real validation error rather than ``--target is + required``. The CLI now builds the engineer first (which type-validates + every scalar config field) and only checks ``--target`` after. + """ + in_path = tmp_path / "in_notarget.csv" + pd.read_csv(tabular_csv).drop(columns=["y"]).to_csv(in_path, index=False) + + cfg = tmp_path / "cfg.json" + cfg.write_text(json.dumps({"max_features": "5"})) # string, not int + rc, _, err = _run( + [ + "transform", + "--input", + str(in_path), + "--output", + str(tmp_path / "out.csv"), + "--config", + str(cfg), + ] + ) + assert rc == 2 + # The real error is the type mismatch, NOT --target missing. 
+ assert "max_features" in err + assert "--target" not in err + + def test_check_scalar_type_rejects_none_when_required(): """Direct unit test for ``_check_scalar_type`` to exercise the ``allow_none=False`` + ``value is None`` branch, which the integration From c388d32e286518249e260933c4c3fda2fc8813c8 Mon Sep 17 00:00:00 2001 From: Li Jiang Date: Mon, 4 May 2026 05:52:55 +0800 Subject: [PATCH 13/30] fix(cli): address round-9 review feedback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses all three new comments from copilot-pull-request-reviewer on PR #5 (commit 459b1b9): * Capture logger output in addition to warnings.warn (Copilot MhpC). Round-8's helpers only intercepted `warnings.warn(...)`, but the do-no-harm gate's fallback path calls `logger.warning(...)` and successful `transform` runs would still bleed those messages onto stderr — breaking the documented stderr-reserved-for-failures contract. New `_capture_featcopilot_messages` contextmanager swaps the `featcopilot` root logger's handlers for a list-appending handler for the duration of the call. Every `featcopilot.*` child logger's records propagate up to the root by default, so this single hook captures all log output (debug, info, warning, error). The captured messages are merged with the `warnings.warn` strings and surfaced in the JSON payload's `warnings` field. * Same fix applies to `explain` (Copilot MhpF). Both helper functions now go through the new contextmanager, so `explain --verbose` (which fires multiple `logger.info(...)` records) keeps stderr empty. Two new unit tests cover the contextmanager directly: it intercepts logger.warning + warnings.warn, and it restores the root logger state on exception. * Fix --target help text on transform (Copilot MhpI). 
The help previously said `--target` is required when "selection is applied (the default)", but the round-6 fix made the requirement conditional on `--max-features` actually being set (because the selector only fits in that case). The help text now matches the shipped contract; `test_transform_target_help_reflects_actual_contract` asserts both the new wording and a regression guard against the old misleading phrasing. Tests: 78 (+5 new) in tests/test_cli.py, 851 passed full suite. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- featcopilot/cli.py | 95 +++++++++++++++++++++++++---------- tests/test_cli.py | 121 +++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 187 insertions(+), 29 deletions(-) diff --git a/featcopilot/cli.py b/featcopilot/cli.py index 08504cc..2353ef7 100644 --- a/featcopilot/cli.py +++ b/featcopilot/cli.py @@ -41,8 +41,11 @@ from __future__ import annotations import argparse +import contextlib import json +import logging import sys +import warnings from pathlib import Path from typing import Any @@ -427,46 +430,86 @@ def _cmd_info(args: argparse.Namespace) -> int: def _fit_transform_capturing_warnings(engineer, X, y, **kwargs): - """Run ``engineer.fit_transform(X, y, **kwargs)`` while capturing any - Python ``UserWarning`` (or other warning) it emits. + """Run ``engineer.fit_transform(X, y, **kwargs)`` while capturing both + Python ``warnings.warn(...)`` and FeatCopilot logger records. The CLI contract is that stdout carries the JSON payload and stderr is - reserved for failures. ``AutoFeatureEngineer.fit`` calls - ``warnings.warn(...)`` for leakage-prone column names under the default - ``leakage_guard='warn'``, which would otherwise bleed onto stderr on a - successful run and break agent / tool-use error parsing. 
This helper - intercepts those warnings, collects them as JSON-serializable strings, - and lets the caller surface them inside the ``warnings`` field of the - success payload — keeping stderr deterministic. + reserved for failures. Two sources can otherwise bleed onto stderr on + a successful run: + + * ``warnings.warn(...)`` — emitted by ``AutoFeatureEngineer.fit`` for + leakage-prone column names under the default ``leakage_guard='warn'``. + * ``logger.warning(...)`` / ``logger.info(...)`` — emitted by e.g. + ``_do_no_harm_gate`` on validation-failure fallback, and by every + engine when ``--verbose`` is set. + + The single ``featcopilot`` root logger (``propagate=False``) receives + every child logger's records by ordinary Python logging propagation; + we swap in a capture handler for the duration of the call so the JSON + payload can surface those messages instead of stderr. Returns ------- - (warnings_list, result) - ``warnings_list`` is a list of ``str`` (one entry per warning, in + (messages, result) + ``messages`` is a list of ``str`` (warnings then logs, in emission order). ``result`` is whatever ``fit_transform`` returned. 
""" - import warnings as _warnings - - captured: list[str] = [] - with _warnings.catch_warnings(record=True) as caught: - _warnings.simplefilter("always") + with _capture_featcopilot_messages() as captured: result = engineer.fit_transform(X, y, **kwargs) - captured.extend(str(w.message) for w in caught) return captured, result def _fit_capturing_warnings(engineer, X, y, **kwargs): """Sibling of :func:`_fit_transform_capturing_warnings` for explain.""" - import warnings as _warnings - - captured: list[str] = [] - with _warnings.catch_warnings(record=True) as caught: - _warnings.simplefilter("always") + with _capture_featcopilot_messages() as captured: engineer.fit_transform(X, y, **kwargs) - captured.extend(str(w.message) for w in caught) return captured +@contextlib.contextmanager +def _capture_featcopilot_messages(): + """Capture all FeatCopilot ``warnings.warn`` calls and logger records. + + Yields a list that the caller can read after the with-block exits. The + list contains formatted log records (in emission order) followed by any + Python warning messages emitted during the with-block. The featcopilot + root logger's handlers are temporarily replaced with a list-appending + handler; child loggers propagate up to the root by default (the only + ``propagate=False`` in the project is on the root itself, which + prevents bleeding to Python's root logger). 
+ """ + captured: list[str] = [] + + class _ListHandler(logging.Handler): + def emit(self, record): + try: + captured.append(self.format(record)) + except Exception: # pragma: no cover - never let logging crash the CLI + captured.append(record.getMessage()) + + list_handler = _ListHandler() + list_handler.setLevel(logging.DEBUG) + list_handler.setFormatter(logging.Formatter("%(levelname)s: %(message)s")) + + fc_root = logging.getLogger("featcopilot") + saved_handlers = list(fc_root.handlers) + saved_level = fc_root.level + fc_root.handlers = [list_handler] + fc_root.setLevel(logging.DEBUG) + + try: + with warnings.catch_warnings(record=True) as caught: + warnings.simplefilter("always") + yield captured + # Append warnings *after* the body returns so the order in the + # captured list mirrors emission order: log records first + # (appended live by the handler), warnings last. + captured.extend(str(w.message) for w in caught) + finally: + fc_root.handlers = saved_handlers + fc_root.setLevel(saved_level) + + def _cmd_transform(args: argparse.Namespace) -> int: """Read input, fit/transform, write output.""" input_path = Path(args.input) @@ -684,8 +727,10 @@ def _add_io_args(p: argparse.ArgumentParser) -> None: p.add_argument( "--target", "-t", - help="Target column name. Required when selection is applied (the default; " - "use --no-selection to skip selection entirely).", + help="Target column name. Required when feature selection is applied " + "(i.e. when --max-features / config max_features is set so the " + "selector actually fits). 
With no max_features, raw feature " + "generation runs without a target.", ) p.add_argument( "--task-description", diff --git a/tests/test_cli.py b/tests/test_cli.py index 0fed666..89ac103 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -2,9 +2,12 @@ from __future__ import annotations +import argparse import io import json +import logging import sys +import warnings from contextlib import redirect_stderr, redirect_stdout from pathlib import Path @@ -665,12 +668,10 @@ def test_explain_target_help_no_longer_says_required_for_selection(): """The ``--target`` help on ``explain`` must not claim it gates selection (selection is intentionally disabled in ``explain``). """ - import argparse as _argparse - parser = fc_cli._build_parser() # argparse stores subparsers under a special action attribute explain_parser = next( - action.choices["explain"] for action in parser._actions if isinstance(action, _argparse._SubParsersAction) + action.choices["explain"] for action in parser._actions if isinstance(action, argparse._SubParsersAction) ) target_help = next(a.help for a in explain_parser._actions if "--target" in a.option_strings) assert "required for selection" not in target_help @@ -1037,8 +1038,120 @@ def test_explain_leakage_warning_does_not_pollute_stderr(tmp_path: Path): assert rc == 0, err assert err == "", f"stderr should be empty on success but got: {err!r}" payload = json.loads(out) - assert "warnings" in payload + assert payload["status"] == "ok" + # The ``warnings`` field is always present and is a list. Whether or + # not the leakage heuristic fires is not guaranteed (it evolves); the + # contract under test is that stderr stays clean. 
+ assert isinstance(payload["warnings"], list) + + +def test_transform_logger_warning_does_not_pollute_stderr(tmp_path: Path, tabular_csv: Path): + """The CLI captures ``logger.warning(...)`` records (in addition to + ``warnings.warn``), so any successful run that exercises a code path + emitting a logger message — for example the do-no-harm gate's + fallback — keeps stderr empty. The captured records appear in the + JSON payload's ``warnings`` field. + """ + out_path = tmp_path / "out.csv" + rc, out, err = _run( + [ + "transform", + "--input", + str(tabular_csv), + "--output", + str(out_path), + "--target", + "y", + "--max-features", + "5", + "--verbose", # exercises ``logger.info(...)`` paths in engines + "--json", + ] + ) + assert rc == 0, err + assert err == "", f"stderr should be empty on success but got: {err!r}" + payload = json.loads(out) + assert payload["status"] == "ok" + assert isinstance(payload["warnings"], list) + + +def test_transform_verbose_logger_info_captured_not_on_stderr(tmp_path: Path, tabular_csv: Path): + """``--verbose`` enables ``logger.info(...)`` calls in + ``AutoFeatureEngineer`` and the engines. Those records must end up + in the JSON payload's ``warnings`` field, not on stderr. + """ + out_path = tmp_path / "out.csv" + rc, out, err = _run( + [ + "transform", + "--input", + str(tabular_csv), + "--output", + str(out_path), + "--target", + "y", + "--max-features", + "5", + "--verbose", + "--json", + ] + ) + assert rc == 0, err + assert err == "", f"stderr should be empty on success but got: {err!r}" + payload = json.loads(out) + # ``--verbose`` reliably emits "Fitted tabular engine" via logger.info, + # and selection / engineer calls also log. We don't pin the exact + # messages (they evolve) — just check at least one log record is + # present in the captured payload. 
assert isinstance(payload["warnings"], list) + assert len(payload["warnings"]) >= 1 + + +def test_capture_featcopilot_messages_intercepts_logger_warning(): + """Direct unit test for the contextmanager so the docstring contract is + not just covered transitively via the CLI subcommands. + """ + fc_logger = logging.getLogger("featcopilot.test_cli") + with fc_cli._capture_featcopilot_messages() as captured: + fc_logger.warning("captured-warning-message") + warnings.warn("captured-runtime-warning", UserWarning, stacklevel=2) + assert any("captured-warning-message" in m for m in captured) + assert any("captured-runtime-warning" in m for m in captured) + + +def test_capture_featcopilot_messages_restores_handlers(): + """The contextmanager must restore the original featcopilot root logger + state after the with-block, even if an exception propagates. + """ + fc_root = logging.getLogger("featcopilot") + saved_handlers = list(fc_root.handlers) + saved_level = fc_root.level + + with pytest.raises(RuntimeError): + with fc_cli._capture_featcopilot_messages(): + raise RuntimeError("boom") + + assert fc_root.handlers == saved_handlers + assert fc_root.level == saved_level + + +# ----------------------- --target help text accuracy + + +def test_transform_target_help_reflects_actual_contract(): + """The ``--target`` help on ``transform`` must say the flag is required + only when ``--max-features`` is set (which is when the selector + actually fits), not whenever selection is enabled by default. + """ + parser = fc_cli._build_parser() + transform_parser = next( + action.choices["transform"] for action in parser._actions if isinstance(action, argparse._SubParsersAction) + ) + target_help = next(a.help for a in transform_parser._actions if "--target" in a.option_strings) + assert "max_features" in target_help.lower() or "max-features" in target_help.lower() + # The old ("required when selection is applied (the default ...)") + # phrasing was misleading — guard against regressions. 
+ assert "the default" not in target_help.lower() # ----------------------- target check runs after type validation From 55a814820344202d716472c5218fbd9a80daa720 Mon Sep 17 00:00:00 2001 From: Li Jiang Date: Mon, 4 May 2026 05:57:07 +0800 Subject: [PATCH 14/30] fix(cli): address round-9 follow-up review feedback Addresses both new comments from copilot-pull-request-reviewer that arrived after the round-9 fix (still on commit 459b1b9): * Logger output capture (Copilot ODei). Already addressed by c388d32 (round-9): the new `_capture_featcopilot_messages` contextmanager replaces the `featcopilot` root logger's handlers for the duration of the engineer call, capturing every `logger.warning(...)` / `logger.info(...)` from every featcopilot.* module (including the copilot-sdk / litellm / openai mock-mode warnings inside `__init__` methods, and `TextEngine`'s missing-NLP-dependency warnings inside `fit`). All such records appear in the JSON payload's `warnings` field; stderr stays empty on success. * Parquet probe is now a real import, not find_spec (Copilot ODeo). `find_spec` only confirms a distribution is on `sys.path`; it doesn't prove the C extensions can load. `_parquet_engine_available` now uses `__import__` so a broken native install honestly reports `parquet_available=false`. New `test_parquet_engine_available_returns_false_for_broken_native_install` exercises the `OSError` (loader-level) branch via `builtins.__import__` monkey-patch; the existing engine-missing / fastparquet-only tests were rewritten to mock `__import__` instead of `find_spec`. Tests: 79 (+1 net) in tests/test_cli.py. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- featcopilot/cli.py | 17 +++++++++---- tests/test_cli.py | 60 ++++++++++++++++++++++++++++++++-------------- 2 files changed, 54 insertions(+), 23 deletions(-) diff --git a/featcopilot/cli.py b/featcopilot/cli.py index 2353ef7..71644b4 100644 --- a/featcopilot/cli.py +++ b/featcopilot/cli.py @@ -67,12 +67,19 @@ def _parquet_engine_available() -> bool: machine-readable capability output reflects what will actually work in the current environment, rather than always advertising parquet. - Uses :func:`importlib.util.find_spec` so the probe is side-effect-free - (no actual module import) and easy to mock in tests. + Uses ``__import__`` (not ``importlib.util.find_spec``) so the probe is + *correct* even on environments with a broken native install: + ``find_spec`` only confirms a distribution is on ``sys.path``; it does + not prove the C extensions can actually load. A real import is the + only way to verify the engine is usable. """ - import importlib.util - - return importlib.util.find_spec("pyarrow") is not None or importlib.util.find_spec("fastparquet") is not None + for name in ("pyarrow", "fastparquet"): + try: + __import__(name) + return True + except Exception: # noqa: BLE001 - any import-time failure means unusable + continue + return False def _detect_format(path: Path, override: str | None) -> str: diff --git a/tests/test_cli.py b/tests/test_cli.py index 89ac103..d0b92b4 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -1470,40 +1470,64 @@ def _raise_import_error(*args, **kwargs): def test_parquet_engine_available_returns_false_when_neither_installed(monkeypatch): - """Both probes return ``None`` from ``find_spec`` -> function returns False.""" - import importlib.util + """When ``__import__`` raises ``ImportError`` for both engines, the + function reports parquet as unavailable. 
+ """ + import builtins - real_find_spec = importlib.util.find_spec + real_import = builtins.__import__ - def fake_find_spec(name, *args, **kwargs): + def fake_import(name, *args, **kwargs): if name in ("pyarrow", "fastparquet"): - return None - return real_find_spec(name, *args, **kwargs) + raise ImportError(f"No module named '{name}' (simulated)") + return real_import(name, *args, **kwargs) - monkeypatch.setattr(importlib.util, "find_spec", fake_find_spec) + monkeypatch.setattr(builtins, "__import__", fake_import) assert fc_cli._parquet_engine_available() is False def test_parquet_engine_available_returns_true_for_fastparquet_only(monkeypatch): - """Even without pyarrow, finding fastparquet must report parquet as available.""" - import importlib.util - - class _FakeSpec: - pass + """Even without pyarrow, importing fastparquet must report parquet as available.""" + import builtins - real_find_spec = importlib.util.find_spec + real_import = builtins.__import__ - def fake_find_spec(name, *args, **kwargs): + def fake_import(name, *args, **kwargs): if name == "pyarrow": - return None + raise ImportError("No module named 'pyarrow' (simulated)") if name == "fastparquet": - return _FakeSpec() - return real_find_spec(name, *args, **kwargs) + # Simulate a successful import by short-circuiting; we don't + # actually need a real module object, just a non-raising return. + class _FakeModule: + pass - monkeypatch.setattr(importlib.util, "find_spec", fake_find_spec) + return _FakeModule() + return real_import(name, *args, **kwargs) + + monkeypatch.setattr(builtins, "__import__", fake_import) assert fc_cli._parquet_engine_available() is True +def test_parquet_engine_available_returns_false_for_broken_native_install(monkeypatch): + """A distribution that's on sys.path but raises a non-ImportError at + import time (e.g. broken native bindings) is reported as unavailable. 
+ Using ``__import__`` (rather than ``importlib.util.find_spec``) is what + makes this honest: ``find_spec`` would have returned a spec and lied. + """ + import builtins + + real_import = builtins.__import__ + + def fake_import(name, *args, **kwargs): + if name in ("pyarrow", "fastparquet"): + # Simulate a broken native install (loader-level failure). + raise OSError("broken native install: undefined symbol (simulated)") + return real_import(name, *args, **kwargs) + + monkeypatch.setattr(builtins, "__import__", fake_import) + assert fc_cli._parquet_engine_available() is False + + def test_unreadable_config_returns_exit_2(tmp_path, tabular_csv, monkeypatch): """An ``OSError`` while opening the config (permission denied, broken symlink, etc.) is converted into the deterministic exit-2 path. From 0f1f0b1b92e2549b6b642fd8b4173beb7fd5bc94 Mon Sep 17 00:00:00 2001 From: Li Jiang Date: Mon, 4 May 2026 06:32:04 +0800 Subject: [PATCH 15/30] fix(cli): address round-10 review feedback Addresses all three new comments from copilot-pull-request-reviewer on PR #5 (commit 55a8148): * Drop logger.exception in exit-1 path (Copilot OHcA). The generic exception handler used to write `featcopilot: unexpected error: ...` to stderr AND then call `logger.exception(...)`, which appended a second timestamped traceback (FeatCopilot loggers write to stderr). The CLI's contract is exactly one structured stderr line per failure; the `logger.exception` call is removed. Internal failure introspection is the caller's job (e.g. `PYTHONFAULTHANDLER=1`). `test_unexpected_error_writes_single_stderr_line` asserts a single matching line and absence of any traceback signature. * Make the capture contextmanager thread-safe (Copilot OHcD). `_capture_featcopilot_messages()` mutates the global `featcopilot` logger's handlers/level. Concurrent in-process CLI calls (e.g. two threads invoking `cli.main(...)`) could steal each other's handlers and restore stale state. 
A module-level `threading.Lock` (`_capture_lock`) now serializes captures so each context gets a clean save/restore cycle. New `test_capture_featcopilot_messages_thread_safety` runs two threads through a `Barrier` to force contention and asserts each capture contains exactly its own 20 records (no cross-talk, no losses) and that no `ListHandler` leaks onto the global logger. * Reject unknown top-level config keys (Copilot OHcE). `_load_config` previously accepted typos like `{"max_feature": 5}` (missing 's') and silently ran with defaults, making the JSON config API hard to trust in automation. New `_KNOWN_CONFIG_KEYS` whitelist (`engines`, `selection_methods`, `max_features`, `correlation_threshold`, `leakage_guard`, `gate_n_jobs`, `llm_config`, `verbose`) is checked at config load; unknown keys produce a precise exit-2 error that lists the recognized keys so users can self-correct without reading source. Two new tests cover `max_feature` and `selection_method` typos. Tests: 83 (+4 new) in tests/test_cli.py, 856 passed full suite. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- featcopilot/cli.py | 87 +++++++++++++++++++++------- tests/test_cli.py | 138 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 204 insertions(+), 21 deletions(-) diff --git a/featcopilot/cli.py b/featcopilot/cli.py index 71644b4..129e6da 100644 --- a/featcopilot/cli.py +++ b/featcopilot/cli.py @@ -45,6 +45,7 @@ import json import logging import sys +import threading import warnings from pathlib import Path from typing import Any @@ -213,15 +214,34 @@ def _write_table(df, path: Path, fmt: str) -> None: raise ValueError(f"Unsupported output format: {fmt}") +# Top-level keys recognized in a ``--config`` JSON file. The CLI rejects +# any other top-level key with a precise exit-2 error so typos like +# ``max_feature`` (no s) fail fast in automation rather than silently +# running with defaults. 
+_KNOWN_CONFIG_KEYS = frozenset( + { + "engines", + "selection_methods", + "max_features", + "correlation_threshold", + "leakage_guard", + "gate_n_jobs", + "llm_config", + "verbose", + } +) + + def _load_config(config_path: str | None) -> dict[str, Any]: """Load a JSON config file (or return an empty dict). Normalizes user-input mistakes (missing path, directory passed instead - of a file, invalid JSON, non-object root) into :class:`ValueError` / - :class:`FileNotFoundError` so the CLI's top-level error handler can - route them all to the deterministic ``exit 2`` user-error path - (rather than e.g. ``IsADirectoryError`` falling into the generic - ``exit 1`` "unexpected error" backstop). + of a file, invalid JSON, non-object root, unknown top-level keys) into + :class:`ValueError` / :class:`FileNotFoundError` so the CLI's top-level + error handler can route them all to the deterministic ``exit 2`` + user-error path (rather than e.g. ``IsADirectoryError`` falling into + the generic ``exit 1`` "unexpected error" backstop, or a typo silently + being ignored). """ if config_path is None: return {} @@ -242,6 +262,12 @@ def _load_config(config_path: str | None) -> dict[str, Any]: raise ValueError(f"Config file {config_path!r} could not be read: {exc}") from exc if not isinstance(data, dict): raise ValueError(f"Config file {config_path!r} must contain a JSON object at the top level") + unknown = sorted(set(data.keys()) - _KNOWN_CONFIG_KEYS) + if unknown: + raise ValueError( + f"Config file {config_path!r} has unknown top-level key(s): {unknown}. " + f"Recognized keys: {sorted(_KNOWN_CONFIG_KEYS)}." + ) return data @@ -484,6 +510,12 @@ def _capture_featcopilot_messages(): handler; child loggers propagate up to the root by default (the only ``propagate=False`` in the project is on the root itself, which prevents bleeding to Python's root logger). + + Concurrency: serialized via ``_capture_lock``. Multiple in-process CLI + calls that overlap (e.g. 
two threads calling ``cli.main(...)`` + simultaneously) take the lock in turn so neither steals the other's + handlers nor restores stale state. Single-process / single-CLI usage + is unaffected. """ captured: list[str] = [] @@ -499,22 +531,29 @@ def emit(self, record): list_handler.setFormatter(logging.Formatter("%(levelname)s: %(message)s")) fc_root = logging.getLogger("featcopilot") - saved_handlers = list(fc_root.handlers) - saved_level = fc_root.level - fc_root.handlers = [list_handler] - fc_root.setLevel(logging.DEBUG) + with _capture_lock: + saved_handlers = list(fc_root.handlers) + saved_level = fc_root.level + fc_root.handlers = [list_handler] + fc_root.setLevel(logging.DEBUG) - try: - with warnings.catch_warnings(record=True) as caught: - warnings.simplefilter("always") - yield captured - # Append warnings *after* the body returns so the order in the - # captured list mirrors emission order: log records first - # (appended live by the handler), warnings last. - captured.extend(str(w.message) for w in caught) - finally: - fc_root.handlers = saved_handlers - fc_root.setLevel(saved_level) + try: + with warnings.catch_warnings(record=True) as caught: + warnings.simplefilter("always") + yield captured + # Append warnings *after* the body returns so the order in + # the captured list mirrors emission order: log records + # first (appended live by the handler), warnings last. + captured.extend(str(w.message) for w in caught) + finally: + fc_root.handlers = saved_handlers + fc_root.setLevel(saved_level) + + +# Serializes ``_capture_featcopilot_messages`` so concurrent CLI calls in +# the same process can't steal each other's handlers / level on the +# global ``featcopilot`` logger. 
+_capture_lock = threading.Lock() def _cmd_transform(args: argparse.Namespace) -> int: @@ -849,8 +888,14 @@ def main(argv: list[str] | None = None) -> int: sys.stderr.write("featcopilot: interrupted\n") return 130 except Exception as exc: # pragma: no cover - defensive backstop + # Single deterministic stderr line so agents can parse the failure. + # We deliberately do NOT call ``logger.exception(...)`` here: + # FeatCopilot loggers write to stderr, which would append a second + # timestamped traceback after our structured line and break the + # CLI's "stderr is exactly one error message" contract. Internal + # failure introspection is the caller's job (e.g. set + # ``PYTHONFAULTHANDLER=1`` or attach a debugger). sys.stderr.write(f"featcopilot: unexpected error: {type(exc).__name__}: {exc}\n") - logger.exception("Unhandled CLI exception") return 1 diff --git a/tests/test_cli.py b/tests/test_cli.py index d0b92b4..c1e6729 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -1135,6 +1135,93 @@ def test_capture_featcopilot_messages_restores_handlers(): assert fc_root.level == saved_level +def test_capture_featcopilot_messages_thread_safety(): + """Concurrent ``_capture_featcopilot_messages`` invocations must not + steal each other's handlers / lose log records. Implementation uses + ``_capture_lock`` to serialize captures. + """ + import threading + + fc_logger = logging.getLogger("featcopilot.test_concurrent") + + results: list[list[str]] = [] + barrier = threading.Barrier(2) + + def worker(tag: str): + # Force both threads to enter the with-block at roughly the same + # time so the lock is genuinely contended. 
+ barrier.wait() + with fc_cli._capture_featcopilot_messages() as captured: + for i in range(20): + fc_logger.warning(f"{tag}-{i}") + results.append(captured) + + t1 = threading.Thread(target=worker, args=("A",)) + t2 = threading.Thread(target=worker, args=("B",)) + t1.start() + t2.start() + t1.join() + t2.join() + + assert len(results) == 2 + # Each capture list must contain exactly its own thread's records and + # nothing from the other thread. + for res in results: + # Find which tag this list belongs to. + tag = "A" if any("A-" in m for m in res) else "B" + assert all(f"{tag}-" in m for m in res), f"Thread isolation violated in capture {tag!r}: got {res!r}" + assert len(res) == 20 + + # Final state on the global logger must be cleanly restored. + fc_root = logging.getLogger("featcopilot") + assert all( + not isinstance(h, logging.Handler) or "ListHandler" not in type(h).__name__ for h in fc_root.handlers + ), "ListHandler leaked onto the global featcopilot logger" + + +def test_unexpected_error_writes_single_stderr_line(monkeypatch, tmp_path: Path, tabular_csv: Path): + """An unexpected (non-ValueError) exception must produce exactly one + structured stderr line — no second timestamped traceback from + ``logger.exception(...)`` — so agents can parse failures + deterministically. + """ + import pandas as pd + + class _UnexpectedError(Exception): + """A non-ValueError, non-OSError exception that escapes the helpers.""" + + def _raise_unexpected(*args, **kwargs): + raise _UnexpectedError("simulated internal failure") + + # Monkey-patch ``pd.read_csv`` directly. Since ``_read_table``'s CSV + # branch normally catches ``OSError`` / ``ParserError`` / ``UnicodeDecodeError``, + # raising a different exception type forces us into the generic exit-1 + # backstop in ``main()``. 
+ monkeypatch.setattr(pd, "read_csv", _raise_unexpected, raising=True) + + rc, _, err = _run( + [ + "transform", + "--input", + str(tabular_csv), + "--output", + str(tmp_path / "out.csv"), + "--target", + "y", + ] + ) + assert rc == 1, err + # Exactly one non-empty line on stderr. + err_lines = [line for line in err.splitlines() if line.strip()] + assert len(err_lines) == 1, f"Expected single-line stderr, got: {err!r}" + assert err_lines[0].startswith("featcopilot: unexpected error:") + assert "_UnexpectedError" in err_lines[0] + assert "simulated internal failure" in err_lines[0] + # No traceback signature. + assert "Traceback" not in err + assert 'File "' not in err + + # ----------------------- --target help text accuracy @@ -1286,6 +1373,57 @@ def test_invalid_config_file_returns_exit_2(tmp_path: Path, tabular_csv: Path): assert "JSON object" in err +def test_unknown_config_top_level_key_returns_exit_2(tmp_path: Path, tabular_csv: Path): + """A typo in a top-level config key (``max_feature`` instead of + ``max_features``, etc.) must fail fast with a precise exit-2 message + listing the recognized keys — not silently run with defaults. + """ + cfg = tmp_path / "cfg.json" + cfg.write_text(json.dumps({"max_feature": 5})) # missing 's' + rc, _, err = _run( + [ + "transform", + "--input", + str(tabular_csv), + "--output", + str(tmp_path / "o.csv"), + "--target", + "y", + "--config", + str(cfg), + ] + ) + assert rc == 2 + assert "max_feature" in err + assert "Recognized keys" in err or "recognized keys" in err.lower() + + +def test_unknown_config_top_level_key_lists_known_keys(tmp_path: Path, tabular_csv: Path): + """The error message must enumerate the recognized keys so users can + self-correct without reading the source. 
+ """ + cfg = tmp_path / "cfg.json" + cfg.write_text(json.dumps({"selection_method": ["mutual_info"]})) # missing 's' + rc, _, err = _run( + [ + "transform", + "--input", + str(tabular_csv), + "--output", + str(tmp_path / "o.csv"), + "--target", + "y", + "--config", + str(cfg), + ] + ) + assert rc == 2 + assert "selection_method" in err + # Recognized-keys list must include the canonical names. + assert "selection_methods" in err + assert "max_features" in err + + def test_directory_as_config_returns_exit_2(tmp_path: Path, tabular_csv: Path): """Pointing ``--config`` at a directory must surface as exit 2, not the generic ``exit 1`` backstop (``IsADirectoryError``). From fd7c28a774a9cfdef4c841356296e9476bb458bf Mon Sep 17 00:00:00 2001 From: Li Jiang Date: Mon, 4 May 2026 09:11:36 +0800 Subject: [PATCH 16/30] fix(cli): address round-11 review feedback Addresses both new comments from copilot-pull-request-reviewer on PR #5 (commit 0f1f0b1): * Capture no longer serializes concurrent CLI calls (Copilot OueS). Round-10's `_capture_lock` was held for the entire `fit_transform` body, so a second `cli.main(...)` from another in-process caller blocked until the first feature-engineering job finished. The contextmanager is now lock-free for the body and uses *per-thread routing* via two singletons added once to the `featcopilot` root logger: - `_ThreadRoutingHandler` appends each record to the calling thread's capture list (or no-ops if the thread isn't capturing). - `_SuppressCapturingFilter` is added to the existing handlers so capturing threads' records DON'T also bleed onto stderr. Concurrent threads each see only their own records and run in parallel; `test_capture_does_not_block_concurrent_callers` verifies the no-serialization property by having two workers `time.sleep(0.2)` inside the block and asserting both are inside simultaneously. 
`test_capture_concurrent_cli_calls_isolate_logs` is the end-to-end version: two real `transform --verbose` runs in parallel threads with empty stderr and isolated `warnings` payloads. * warnings.warn capture is now thread-local (Copilot OueZ). The previous `warnings.catch_warnings(record=True)` is process- global; warnings from a non-capturing thread (or another capturing thread) could be swallowed and mis-attributed. The contextmanager now overrides `warnings.showwarning` and routes by `threading.get_ident()`: only warnings from the registered thread go to that thread's list; warnings from other threads chain to the previous `showwarning` (preserving normal emission for non-capturing threads). `test_capture_warnings_warn_thread_isolated` asserts two threads capturing concurrently see only their own `warnings.warn` calls. Tests: 85 (+2 net) in tests/test_cli.py, 858 passed full suite. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- featcopilot/cli.py | 206 ++++++++++++++++++++++++++++++++++----------- tests/test_cli.py | 84 ++++++++++++++++-- 2 files changed, 233 insertions(+), 57 deletions(-) diff --git a/featcopilot/cli.py b/featcopilot/cli.py index 129e6da..708fe63 100644 --- a/featcopilot/cli.py +++ b/featcopilot/cli.py @@ -499,61 +499,169 @@ def _fit_capturing_warnings(engineer, X, y, **kwargs): return captured -@contextlib.contextmanager -def _capture_featcopilot_messages(): - """Capture all FeatCopilot ``warnings.warn`` calls and logger records. - - Yields a list that the caller can read after the with-block exits. The - list contains formatted log records (in emission order) followed by any - Python warning messages emitted during the with-block. The featcopilot - root logger's handlers are temporarily replaced with a list-appending - handler; child loggers propagate up to the root by default (the only - ``propagate=False`` in the project is on the root itself, which - prevents bleeding to Python's root logger). 
- - Concurrency: serialized via ``_capture_lock``. Multiple in-process CLI - calls that overlap (e.g. two threads calling ``cli.main(...)`` - simultaneously) take the lock in turn so neither steals the other's - handlers nor restores stale state. Single-process / single-CLI usage - is unaffected. +class _ThreadCaptureState: + """Holds per-thread capture lists. + + Shared by :class:`_ThreadRoutingHandler` (writes records) and + :class:`_SuppressCapturingFilter` (decides whether to drop a record + from the original handlers). Mutations are guarded by a small lock; + lookups use ``dict.get`` which is atomic under the GIL for hashable + keys. """ - captured: list[str] = [] - class _ListHandler(logging.Handler): - def emit(self, record): - try: - captured.append(self.format(record)) - except Exception: # pragma: no cover - never let logging crash the CLI - captured.append(record.getMessage()) + def __init__(self): + self._per_thread: dict[int, list[str]] = {} + self._lock = threading.Lock() + + def register(self, tid: int, target: list[str]) -> None: + with self._lock: + self._per_thread[tid] = target - list_handler = _ListHandler() - list_handler.setLevel(logging.DEBUG) - list_handler.setFormatter(logging.Formatter("%(levelname)s: %(message)s")) + def unregister(self, tid: int) -> None: + with self._lock: + self._per_thread.pop(tid, None) - fc_root = logging.getLogger("featcopilot") - with _capture_lock: - saved_handlers = list(fc_root.handlers) - saved_level = fc_root.level - fc_root.handlers = [list_handler] - fc_root.setLevel(logging.DEBUG) + def get(self, tid: int) -> list[str] | None: + # Lock-free read: ``dict.get`` is atomic for hashable keys under + # the CPython GIL, and we only ever read references to lists owned + # by individual threads — no shared mutation hazard. + return self._per_thread.get(tid) + + +class _ThreadRoutingHandler(logging.Handler): + """Logging handler that routes records to the calling thread's capture list. 
+ + Attached once to the ``featcopilot`` root logger. Records propagated + from any ``featcopilot.*`` child logger reach this handler in the same + way they reach the existing stderr handler. If the calling thread has + a registered capture list, the record is appended to it; otherwise the + handler does nothing (the existing stderr handler is what produces the + user-facing output for non-capturing threads). + """ + def __init__(self, state: _ThreadCaptureState): + super().__init__(logging.DEBUG) + self._state = state + self.setFormatter(logging.Formatter("%(levelname)s: %(message)s")) + + def emit(self, record: logging.LogRecord) -> None: + target = self._state.get(threading.get_ident()) + if target is None: + return try: - with warnings.catch_warnings(record=True) as caught: - warnings.simplefilter("always") - yield captured - # Append warnings *after* the body returns so the order in - # the captured list mirrors emission order: log records - # first (appended live by the handler), warnings last. - captured.extend(str(w.message) for w in caught) - finally: - fc_root.handlers = saved_handlers - fc_root.setLevel(saved_level) - - -# Serializes ``_capture_featcopilot_messages`` so concurrent CLI calls in -# the same process can't steal each other's handlers / level on the -# global ``featcopilot`` logger. -_capture_lock = threading.Lock() + target.append(self.format(record)) + except Exception: # pragma: no cover - never let logging crash the CLI + target.append(record.getMessage()) + + +class _SuppressCapturingFilter(logging.Filter): + """Filter for the *existing* handlers: drops records from capturing threads. + + Without this filter, every record emitted by a capturing thread would + still hit the featcopilot root logger's stderr ``StreamHandler`` and + bleed onto stderr — breaking the CLI's "stderr reserved for failures" + contract. 
The filter checks ``threading.get_ident()`` against the + shared :class:`_ThreadCaptureState` so non-capturing threads continue + to see normal stderr output. + """ + + def __init__(self, state: _ThreadCaptureState): + super().__init__() + self._state = state + + def filter(self, record: logging.LogRecord) -> bool: + return self._state.get(threading.get_ident()) is None + + +# Module-level singletons. Installed exactly once on the featcopilot root +# logger / its existing handlers; subsequent ``_capture_featcopilot_messages`` +# calls just register/unregister thread state. No global lock is held during +# the slow ``fit_transform`` body — concurrent threads each capture their +# own records independently. +_capture_state = _ThreadCaptureState() +_routing_handler = _ThreadRoutingHandler(_capture_state) +_suppress_filter = _SuppressCapturingFilter(_capture_state) +_install_lock = threading.Lock() +_install_done = False + + +def _install_capture_hooks_once() -> None: + """Install the routing handler + suppress filter on the featcopilot root logger. + + Idempotent: subsequent calls are no-ops. Must be called before the + first capture; happens lazily on first use to avoid altering the + logging tree at module import time when the CLI is being introspected + rather than executed. + """ + global _install_done + if _install_done: + return + with _install_lock: + if _install_done: + return + fc_root = logging.getLogger("featcopilot") + if _routing_handler not in fc_root.handlers: + fc_root.addHandler(_routing_handler) + for handler in list(fc_root.handlers): + if handler is _routing_handler: + continue + if _suppress_filter not in handler.filters: + handler.addFilter(_suppress_filter) + _install_done = True + + +@contextlib.contextmanager +def _capture_featcopilot_messages(): + """Capture FeatCopilot log records and ``warnings.warn`` calls emitted + on the *current thread*. + + Yields a list that the caller can read after the with-block exits. 
+ The list contains formatted log records (in emission order) followed + by any Python warning messages emitted during the with-block on this + thread. + + Concurrency model + ----------------- + * **Logger records** are routed *per-thread* via + :class:`_ThreadRoutingHandler` (added once to the ``featcopilot`` + root logger) and a :class:`_SuppressCapturingFilter` on the existing + handlers. Two threads can capture concurrently without blocking + each other; each sees only its own records, and other threads' + records still flow normally to stderr. + * **``warnings.warn`` records** are intercepted via a per-thread + override of :data:`warnings.showwarning`. The override appends to + the capturing thread's list and chains to the previous + ``showwarning`` for warnings emitted on non-capturing threads. + + The contextmanager does NOT hold any lock for the duration of the + with-block — only briefly during install/register/unregister — so + long-running ``fit_transform`` calls in one thread do not block + other threads from running concurrently. + """ + _install_capture_hooks_once() + + captured: list[str] = [] + tid = threading.get_ident() + _capture_state.register(tid, captured) + + # Per-thread ``warnings.warn`` interception. We chain to whatever + # ``warnings.showwarning`` was in place before us so non-capturing + # threads (or nested captures) still receive their warnings via the + # existing path. 
+ previous_showwarning = warnings.showwarning + + def _routing_showwarning(message, category, filename, lineno, file=None, line=None): + if threading.get_ident() == tid: + captured.append(str(message)) + return + previous_showwarning(message, category, filename, lineno, file, line) + + warnings.showwarning = _routing_showwarning + try: + yield captured + finally: + warnings.showwarning = previous_showwarning + _capture_state.unregister(tid) def _cmd_transform(args: argparse.Namespace) -> int: diff --git a/tests/test_cli.py b/tests/test_cli.py index c1e6729..ea5770e 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -1137,8 +1137,8 @@ def test_capture_featcopilot_messages_restores_handlers(): def test_capture_featcopilot_messages_thread_safety(): """Concurrent ``_capture_featcopilot_messages`` invocations must not - steal each other's handlers / lose log records. Implementation uses - ``_capture_lock`` to serialize captures. + steal each other's records. Implementation uses per-thread routing + (no global lock held during the body), so threads execute concurrently. """ import threading @@ -1149,7 +1149,7 @@ def test_capture_featcopilot_messages_thread_safety(): def worker(tag: str): # Force both threads to enter the with-block at roughly the same - # time so the lock is genuinely contended. + # time so the routing dispatch is genuinely contended. barrier.wait() with fc_cli._capture_featcopilot_messages() as captured: for i in range(20): @@ -1172,11 +1172,79 @@ def worker(tag: str): assert all(f"{tag}-" in m for m in res), f"Thread isolation violated in capture {tag!r}: got {res!r}" assert len(res) == 20 - # Final state on the global logger must be cleanly restored. 
- fc_root = logging.getLogger("featcopilot") - assert all( - not isinstance(h, logging.Handler) or "ListHandler" not in type(h).__name__ for h in fc_root.handlers - ), "ListHandler leaked onto the global featcopilot logger" + +def test_capture_does_not_block_concurrent_callers(): + """Two concurrent ``_capture_featcopilot_messages`` blocks must run in + parallel — i.e. the design does NOT serialize the body via a global + lock. Verified by timing: a worker that sleeps inside the block must + not block another worker from also entering the block at the same + time. + """ + import threading + import time + + inside = [] + inside_lock = threading.Lock() + seen_overlap = threading.Event() + barrier = threading.Barrier(2) + + def worker(): + barrier.wait() + with fc_cli._capture_featcopilot_messages(): + with inside_lock: + inside.append(1) + if len(inside) >= 2: + seen_overlap.set() + # Sleep long enough that, if the implementation serialized via + # a global lock, the second thread would never enter + # simultaneously. + time.sleep(0.2) + with inside_lock: + inside.pop() + + t1 = threading.Thread(target=worker) + t2 = threading.Thread(target=worker) + t1.start() + t2.start() + t1.join(timeout=5) + t2.join(timeout=5) + + assert seen_overlap.is_set(), ( + "Both threads should have been inside _capture_featcopilot_messages " + "simultaneously; the implementation appears to serialize the body." + ) + + +def test_capture_warnings_warn_thread_isolated(): + """``warnings.warn`` calls from one capturing thread must not leak into + another capturing thread's payload. The CLI overrides + ``warnings.showwarning`` per-thread (rather than using + ``warnings.catch_warnings(record=True)`` which is process-global). 
+ """ + import threading + + barrier = threading.Barrier(2) + a_captured: list[str] = [] + b_captured: list[str] = [] + + def worker(tag: str, target: list[str]): + barrier.wait() + with fc_cli._capture_featcopilot_messages() as captured: + for i in range(10): + warnings.warn(f"{tag}-warn-{i}", UserWarning, stacklevel=2) + target.extend(captured) + + t1 = threading.Thread(target=worker, args=("A", a_captured)) + t2 = threading.Thread(target=worker, args=("B", b_captured)) + t1.start() + t2.start() + t1.join() + t2.join() + + assert all("A-warn-" in m for m in a_captured) + assert all("B-warn-" in m for m in b_captured) + assert not any("B-warn-" in m for m in a_captured) + assert not any("A-warn-" in m for m in b_captured) def test_unexpected_error_writes_single_stderr_line(monkeypatch, tmp_path: Path, tabular_csv: Path): From 0c69dd9d000cc08b3613b9459af56e24f3e696d4 Mon Sep 17 00:00:00 2001 From: Li Jiang Date: Mon, 4 May 2026 15:38:23 +0800 Subject: [PATCH 17/30] fix(cli): address round-12 review feedback Addresses all three new comments from copilot-pull-request-reviewer on PR #5 (commit fd7c28a): * warnings.showwarning install is now overlap-safe (Copilot P4Ox). Round-11's per-call save-and-restore raced when two captures overlapped: A enters, B enters, A exits and restores the original `warnings.showwarning` while B is still active. The override is now installed by `_install_capture_hooks_once()` and survives the with-block; concurrent captures all dispatch through the same permanent override which routes by thread ID. The install is also rechecked on every capture entry, so a caller's `warnings.catch_warnings()` block (which restores `warnings.showwarning` on exit) can't silently undo the install for subsequent CLI runs. * Per-thread state is now a stack (Copilot P4PL). 
Nested `_capture_featcopilot_messages()` calls on the same thread used to clobber the outer registration; the inner `unregister()` removed the thread entirely, so any later log records / warnings in the outer block leaked to stderr and were missing from the outer payload. `_ThreadCaptureState` now keeps a stack per thread: `push` on entry, `pop` on exit, `get` returns the innermost active capture. Logs and warnings always go to the innermost list while it's active; outer captures resume automatically. `test_nested_capture_on_same_thread_preserves_outer_list` covers the full scenario. * Test asserts hook *stability*, not equality with pre-first-call state (Copilot P4PG). The previous restores-handlers test was order-dependent: on the very first capture in a process, `_install_capture_hooks_once()` permanently adds `_routing_handler`, so the post-block handler list differs from the pre-first-call list. The replacement test `test_capture_featcopilot_messages_does_not_mutate_logger_state_per_call` forces install via a no-op capture, then asserts the handler set, level, and `warnings.showwarning` are unchanged across an exception-propagating capture. * New `test_overlapping_captures_with_out_of_order_exit` exercises the strict failure mode: two threads enter the block, then thread A exits before B emits its tail records. B's log records and `warnings.warn` calls are still captured under the new permanent-install design. Tests: 87 (+2 net) in tests/test_cli.py, 860 passed full suite. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- featcopilot/cli.py | 172 ++++++++++++++++++++++++++++----------------- tests/test_cli.py | 157 ++++++++++++++++++++++++++++++++++++----- 2 files changed, 249 insertions(+), 80 deletions(-) diff --git a/featcopilot/cli.py b/featcopilot/cli.py index 708fe63..2c07916 100644 --- a/featcopilot/cli.py +++ b/featcopilot/cli.py @@ -500,32 +500,42 @@ def _fit_capturing_warnings(engineer, X, y, **kwargs): class _ThreadCaptureState: - """Holds per-thread capture lists. + """Holds per-thread capture *stacks*. - Shared by :class:`_ThreadRoutingHandler` (writes records) and - :class:`_SuppressCapturingFilter` (decides whether to drop a record - from the original handlers). Mutations are guarded by a small lock; - lookups use ``dict.get`` which is atomic under the GIL for hashable - keys. + Each thread maps to a stack of capture lists. Nested + :func:`_capture_featcopilot_messages` calls on the same thread push + onto the stack; the innermost active capture is always at the top + and receives records / warnings until its block exits, at which + point the outer capture (if any) becomes active again. + + Shared by :class:`_ThreadRoutingHandler` (writes records), + :class:`_SuppressCapturingFilter` (suppresses stderr), and the + routing ``warnings.showwarning`` override. 
""" def __init__(self): - self._per_thread: dict[int, list[str]] = {} + self._per_thread: dict[int, list[list[str]]] = {} self._lock = threading.Lock() - def register(self, tid: int, target: list[str]) -> None: + def push(self, tid: int, target: list[str]) -> None: with self._lock: - self._per_thread[tid] = target + self._per_thread.setdefault(tid, []).append(target) - def unregister(self, tid: int) -> None: + def pop(self, tid: int) -> None: with self._lock: - self._per_thread.pop(tid, None) + stack = self._per_thread.get(tid) + if stack: + stack.pop() + if not stack: + del self._per_thread[tid] def get(self, tid: int) -> list[str] | None: - # Lock-free read: ``dict.get`` is atomic for hashable keys under - # the CPython GIL, and we only ever read references to lists owned - # by individual threads — no shared mutation hazard. - return self._per_thread.get(tid) + # Brief lock for thread-safe stack-top read. + with self._lock: + stack = self._per_thread.get(tid) + if stack: + return stack[-1] + return None class _ThreadRoutingHandler(logging.Handler): @@ -575,39 +585,82 @@ def filter(self, record: logging.LogRecord) -> bool: # Module-level singletons. Installed exactly once on the featcopilot root # logger / its existing handlers; subsequent ``_capture_featcopilot_messages`` -# calls just register/unregister thread state. No global lock is held during -# the slow ``fit_transform`` body — concurrent threads each capture their -# own records independently. +# calls just push/pop thread state. No global lock is held during the slow +# ``fit_transform`` body — concurrent threads each capture their own records +# independently. 
_capture_state = _ThreadCaptureState() _routing_handler = _ThreadRoutingHandler(_capture_state) _suppress_filter = _SuppressCapturingFilter(_capture_state) _install_lock = threading.Lock() _install_done = False +# Captures the original ``warnings.showwarning`` at first install so the +# routing override can chain to it for non-capturing threads (and so we +# never mutate it again on subsequent capture calls — the previous +# per-call save/restore raced under concurrent overlapping captures). +_original_showwarning = None -def _install_capture_hooks_once() -> None: - """Install the routing handler + suppress filter on the featcopilot root logger. +def _routing_showwarning(message, category, filename, lineno, file=None, line=None): + """Permanent ``warnings.showwarning`` override (installed once). + + Routes warnings to the *innermost* capturing list for the current + thread (via :class:`_ThreadCaptureState` stack lookup). If the + current thread is not capturing, chains to the original + ``warnings.showwarning`` so non-capturing threads keep their normal + behavior. - Idempotent: subsequent calls are no-ops. Must be called before the - first capture; happens lazily on first use to avoid altering the - logging tree at module import time when the CLI is being introspected - rather than executed. + Installed once globally — *not* swapped per-call — so concurrent + overlapping captures on different threads cannot race on the + process-global ``warnings.showwarning`` slot. """ - global _install_done - if _install_done: + target = _capture_state.get(threading.get_ident()) + if target is not None: + target.append(str(message)) return + if _original_showwarning is not None: + _original_showwarning(message, category, filename, lineno, file, line) + + +def _install_capture_hooks_once() -> None: + """Install the routing handler + suppress filter + showwarning override. + + The logger handler and filter are installed exactly once (idempotent). 
+ The ``warnings.showwarning`` override is re-installed every call if + something else has replaced it — this is necessary because external + code (most commonly ``warnings.catch_warnings()`` blocks) can reset + the global ``warnings.showwarning`` and undo a previous install. The + fresh re-install captures the current (caller's) ``showwarning`` as + the new "original" to chain to, so non-capturing threads still see + whatever warning behavior the caller had set up. + + All hooks themselves dispatch on :class:`_ThreadCaptureState` which + uses a per-thread stack, so they are no-ops for threads that aren't + currently capturing. + """ + global _install_done, _original_showwarning with _install_lock: - if _install_done: - return - fc_root = logging.getLogger("featcopilot") - if _routing_handler not in fc_root.handlers: - fc_root.addHandler(_routing_handler) - for handler in list(fc_root.handlers): - if handler is _routing_handler: - continue - if _suppress_filter not in handler.filters: - handler.addFilter(_suppress_filter) - _install_done = True + # Logger handler/filter install (truly once — these can't be + # silently undone by external code in the way ``warnings.showwarning`` + # can). + if not _install_done: + fc_root = logging.getLogger("featcopilot") + if _routing_handler not in fc_root.handlers: + fc_root.addHandler(_routing_handler) + for handler in list(fc_root.handlers): + if handler is _routing_handler: + continue + if _suppress_filter not in handler.filters: + handler.addFilter(_suppress_filter) + _install_done = True + + # ``warnings.showwarning`` install — re-check every entry. A + # caller's ``warnings.catch_warnings()`` block restores the + # previous ``showwarning`` on exit, undoing our install. Re- + # installing on next entry is what makes overlapping captures + # robust against caller-side warning context manipulation. 
+ if warnings.showwarning is not _routing_showwarning: + _original_showwarning = warnings.showwarning + warnings.showwarning = _routing_showwarning @contextlib.contextmanager @@ -616,9 +669,8 @@ def _capture_featcopilot_messages(): on the *current thread*. Yields a list that the caller can read after the with-block exits. - The list contains formatted log records (in emission order) followed - by any Python warning messages emitted during the with-block on this - thread. + The list contains formatted log records (in emission order) and any + Python warning messages emitted during the with-block on this thread. Concurrency model ----------------- @@ -628,40 +680,32 @@ def _capture_featcopilot_messages(): handlers. Two threads can capture concurrently without blocking each other; each sees only its own records, and other threads' records still flow normally to stderr. - * **``warnings.warn`` records** are intercepted via a per-thread - override of :data:`warnings.showwarning`. The override appends to - the capturing thread's list and chains to the previous - ``showwarning`` for warnings emitted on non-capturing threads. + * **``warnings.warn`` records** are intercepted via a permanent + :func:`_routing_showwarning` override installed once. The override + routes by ``threading.get_ident()`` and chains to the original + ``warnings.showwarning`` for non-capturing threads. The override is + *not* swapped per-call, so concurrent overlapping captures on + different threads cannot race on the process-global + ``warnings.showwarning`` slot. + * **Nested captures** on the same thread are supported via a + per-thread stack in :class:`_ThreadCaptureState`. Records and + warnings always go to the innermost active capture; when the inner + block exits, the outer capture is automatically reactivated. 
The contextmanager does NOT hold any lock for the duration of the - with-block — only briefly during install/register/unregister — so - long-running ``fit_transform`` calls in one thread do not block - other threads from running concurrently. + with-block — only briefly during install/push/pop — so long-running + ``fit_transform`` calls in one thread do not block other threads + from running concurrently. """ _install_capture_hooks_once() captured: list[str] = [] tid = threading.get_ident() - _capture_state.register(tid, captured) - - # Per-thread ``warnings.warn`` interception. We chain to whatever - # ``warnings.showwarning`` was in place before us so non-capturing - # threads (or nested captures) still receive their warnings via the - # existing path. - previous_showwarning = warnings.showwarning - - def _routing_showwarning(message, category, filename, lineno, file=None, line=None): - if threading.get_ident() == tid: - captured.append(str(message)) - return - previous_showwarning(message, category, filename, lineno, file, line) - - warnings.showwarning = _routing_showwarning + _capture_state.push(tid, captured) try: yield captured finally: - warnings.showwarning = previous_showwarning - _capture_state.unregister(tid) + _capture_state.pop(tid) def _cmd_transform(args: argparse.Namespace) -> int: diff --git a/tests/test_cli.py b/tests/test_cli.py index ea5770e..a4ec0ff 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -1112,27 +1112,47 @@ def test_capture_featcopilot_messages_intercepts_logger_warning(): not just covered transitively via the CLI subcommands. 
""" fc_logger = logging.getLogger("featcopilot.test_cli") - with fc_cli._capture_featcopilot_messages() as captured: - fc_logger.warning("captured-warning-message") - warnings.warn("captured-runtime-warning", UserWarning, stacklevel=2) + # Reset Python's warning-deduplication state for the duration of the + # test so a previous test that fired ``warnings.warn`` at the same + # source location does not suppress this one. + with warnings.catch_warnings(): + warnings.simplefilter("always") + with fc_cli._capture_featcopilot_messages() as captured: + fc_logger.warning("captured-warning-message") + warnings.warn("captured-runtime-warning", UserWarning, stacklevel=2) assert any("captured-warning-message" in m for m in captured) assert any("captured-runtime-warning" in m for m in captured) -def test_capture_featcopilot_messages_restores_handlers(): - """The contextmanager must restore the original featcopilot root logger - state after the with-block, even if an exception propagates. +def test_capture_featcopilot_messages_does_not_mutate_logger_state_per_call(): + """The contextmanager installs hooks *once* (lazily) and then never + mutates the featcopilot logger again — so successive captures don't + add or remove handlers, regardless of test ordering. The earlier + "restores handlers" test (asserting equality with pre-first-call + state) was order-dependent: on the very first capture in a process, + ``_install_capture_hooks_once()`` permanently adds + ``_routing_handler`` and that's a one-way change. We instead assert + *stability* across an exception-propagating with-block, which is the + real behavioral contract. """ + # First, force install via a no-op capture. 
+ with fc_cli._capture_featcopilot_messages(): + pass + fc_root = logging.getLogger("featcopilot") - saved_handlers = list(fc_root.handlers) - saved_level = fc_root.level + handlers_before = list(fc_root.handlers) + level_before = fc_root.level + showwarning_before = warnings.showwarning with pytest.raises(RuntimeError): with fc_cli._capture_featcopilot_messages(): raise RuntimeError("boom") - assert fc_root.handlers == saved_handlers - assert fc_root.level == saved_level + # Hooks remain installed (handler stays, level unchanged, showwarning + # override remains in place); per-call state has been popped. + assert fc_root.handlers == handlers_before + assert fc_root.level == level_before + assert warnings.showwarning is showwarning_before def test_capture_featcopilot_messages_thread_safety(): @@ -1231,15 +1251,21 @@ def worker(tag: str, target: list[str]): barrier.wait() with fc_cli._capture_featcopilot_messages() as captured: for i in range(10): + # ``stacklevel=2`` is forwarded; reset filter state so we + # don't lose the warning to Python's default dedup. warnings.warn(f"{tag}-warn-{i}", UserWarning, stacklevel=2) target.extend(captured) - t1 = threading.Thread(target=worker, args=("A", a_captured)) - t2 = threading.Thread(target=worker, args=("B", b_captured)) - t1.start() - t2.start() - t1.join() - t2.join() + # Reset warning filters for this test so dedup doesn't suppress + # repeated emissions at the same source line. 
+ with warnings.catch_warnings(): + warnings.simplefilter("always") + t1 = threading.Thread(target=worker, args=("A", a_captured)) + t2 = threading.Thread(target=worker, args=("B", b_captured)) + t1.start() + t2.start() + t1.join() + t2.join() assert all("A-warn-" in m for m in a_captured) assert all("B-warn-" in m for m in b_captured) @@ -1247,6 +1273,105 @@ def worker(tag: str, target: list[str]): assert not any("A-warn-" in m for m in b_captured) +def test_nested_capture_on_same_thread_preserves_outer_list(): + """A capture inside a capture on the same thread must: + + 1. Route records to the *innermost* list while the inner block is active. + 2. Restore the outer list when the inner block exits, so subsequent + records flow into the outer payload. + + The previous single-list-per-thread design clobbered the outer + registration; this test guards against that regression. + """ + fc_logger = logging.getLogger("featcopilot.test_nested") + + with warnings.catch_warnings(): + warnings.simplefilter("always") + with fc_cli._capture_featcopilot_messages() as outer: + fc_logger.warning("outer-before-nested") + with fc_cli._capture_featcopilot_messages() as inner: + fc_logger.warning("inner-only") + warnings.warn("inner-runtime", UserWarning, stacklevel=2) + fc_logger.warning("outer-after-nested") + + # Inner contains only the records emitted while it was the active + # capture. + assert any("inner-only" in m for m in inner) + assert any("inner-runtime" in m for m in inner) + assert not any("outer-before-nested" in m for m in inner) + assert not any("outer-after-nested" in m for m in inner) + + # Outer contains records emitted before AND after the inner block, + # but NOT records emitted while inner was active (those went to inner). 
+ assert any("outer-before-nested" in m for m in outer) + assert any("outer-after-nested" in m for m in outer) + assert not any("inner-only" in m for m in outer) + assert not any("inner-runtime" in m for m in outer) + + +def test_overlapping_captures_with_out_of_order_exit(): + """Two threads enter the capture block, then thread A exits *before* + thread B. The CLI must continue to capture B's warnings even after + A has exited — i.e. A's exit must not restore a global state that + disables B's capture. + + This is the strict version of the warnings.showwarning race that + existed when the override was saved/restored per-call: A's exit + used to restore the original ``warnings.showwarning``, leaking B's + subsequent ``warnings.warn`` calls onto stderr. + """ + import threading + import time + + barrier = threading.Barrier(2) + a_done = threading.Event() + a_captured: list[str] = [] + b_captured: list[str] = [] + + fc_logger = logging.getLogger("featcopilot.test_overlap") + + def worker_a(): + barrier.wait() + with fc_cli._capture_featcopilot_messages() as captured: + fc_logger.warning("A-1") + warnings.warn("A-warn-1", UserWarning, stacklevel=2) + a_captured.extend(captured) + a_done.set() # signal: A has exited the capture block + + def worker_b(): + barrier.wait() + with fc_cli._capture_featcopilot_messages() as captured: + fc_logger.warning("B-1") + # Wait for A to fully exit before emitting B's tail records. 
+ assert a_done.wait(timeout=5) + time.sleep(0.05) # small grace so any racy restoration would have happened + fc_logger.warning("B-2-after-A-exit") + warnings.warn("B-warn-after-A-exit", UserWarning, stacklevel=2) + b_captured.extend(captured) + + with warnings.catch_warnings(): + warnings.simplefilter("always") + t_a = threading.Thread(target=worker_a) + t_b = threading.Thread(target=worker_b) + t_b.start() # start B first so it's already in the block + time.sleep(0.05) + t_a.start() + t_a.join(timeout=5) + t_b.join(timeout=5) + + # B's records — including the ones emitted *after* A exited — must + # all be captured. None of A's records should have leaked into B. + assert any("B-1" in m for m in b_captured) + assert any("B-2-after-A-exit" in m for m in b_captured) + assert any("B-warn-after-A-exit" in m for m in b_captured) + assert not any("A-1" in m for m in b_captured) + assert not any("A-warn-1" in m for m in b_captured) + # A's payload likewise contains only A's records. + assert any("A-1" in m for m in a_captured) + assert any("A-warn-1" in m for m in a_captured) + assert not any("B-" in m for m in a_captured) + + def test_unexpected_error_writes_single_stderr_line(monkeypatch, tmp_path: Path, tabular_csv: Path): """An unexpected (non-ValueError) exception must produce exactly one structured stderr line — no second timestamped traceback from From 8d3a9735499ef8501d834e2508af77c810c8f176 Mon Sep 17 00:00:00 2001 From: Li Jiang Date: Mon, 4 May 2026 20:36:17 +0800 Subject: [PATCH 18/30] fix(cli): address round-13 review feedback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses both new comments from copilot-pull-request-reviewer on PR #5 (commit 0c69dd9): * Range-validate correlation_threshold to [0.0, 1.0] (Copilot TrAR). 
Out-of-range values silently change selector behavior: `correlation_threshold > 1.0` disables redundancy elimination entirely (`FeatureSelector.fit` only runs it when threshold < 1.0), while a negative value treats every numeric pair as redundant. `_build_engineer` now rejects out-of-range values (from CLI flag or config) up front with a precise exit-2 error. Boundary values 0.0 and 1.0 are accepted (inclusive). `max_features` likewise gets an explicit positive-int check at this layer so the message says `max_features` rather than the more cryptic transformer error. Six new tests parametrize negative / above-1 / boundary cases for both CLI flag and config sources. * Bound explain's memory / compute via input sampling (Copilot TrAv). `explain` is metadata-only — the transformed frame is discarded immediately — but it used to materialize every engineered value on the full input, which on large datasets makes `featcopilot explain` slow or OOM-prone in agent / CI workflows. The CLI now caps the input at `_EXPLAIN_SAMPLE_SIZE = 1000` rows (deterministic `random_state=0` so re-runs produce the same metadata). The candidate feature set is independent of input length — every engine plans from column structure, not row values — so the payload is identical to a full-input run. New `n_rows_used` field reports the effective sample size; three tests cover (a) input >> cap, (b) input < cap (no-op), and (c) determinism across re-runs. Tests: 96 (+9 new) in tests/test_cli.py, 869 passed full suite. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- featcopilot/cli.py | 44 +++++++++++ tests/test_cli.py | 181 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 225 insertions(+) diff --git a/featcopilot/cli.py b/featcopilot/cli.py index 2c07916..a30be02 100644 --- a/featcopilot/cli.py +++ b/featcopilot/cli.py @@ -376,6 +376,21 @@ def pick(flag_value, config_key, default): _check_scalar_type("gate_n_jobs", gate_n_jobs, (int,), allow_bool=False) _check_scalar_type("leakage_guard", leakage_guard, (str,)) + # Range-check ``correlation_threshold``: it's only meaningful in + # ``[0.0, 1.0]``. Values above 1 silently disable redundancy + # elimination (``FeatureSelector.fit`` only runs it when threshold + # < 1.0); values below 0 effectively treat every numeric pair as + # redundant. Reject out-of-range up front so the CLI doesn't quietly + # change selector behavior. + if not (0.0 <= float(correlation_threshold) <= 1.0): + raise ValueError(f"`correlation_threshold` must be in the range [0.0, 1.0]; got {correlation_threshold!r}.") + # ``max_features`` must be positive when set (matches + # AutoFeatureEngineer's own validation). Surface that here too so + # the message says ``max_features`` rather than the more cryptic + # transformer error. + if max_features is not None and max_features <= 0: + raise ValueError(f"`max_features` must be a positive integer when set; got {max_features!r}.") + # Validate ``llm_config`` is a JSON object (i.e. a Python dict) before # forwarding it. Without this check, a misconfigured non-dict value # would only fail at engine-construction time inside @@ -785,6 +800,15 @@ def _cmd_transform(args: argparse.Namespace) -> int: return 0 +# ``explain`` only needs to fire each engine's planning + feature-naming +# pass — the actual transformed values are discarded. 
Capping the input +# at this many rows keeps the metadata-only command from paying the full +# memory / compute cost of materializing every engineered value on large +# datasets, while still giving every engine enough rows to plan its +# features (the candidate set is independent of input length). +_EXPLAIN_SAMPLE_SIZE = 1000 + + def _cmd_explain(args: argparse.Namespace) -> int: """Fit + transform engines and print feature explanations + code as JSON. @@ -796,6 +820,15 @@ def _cmd_explain(args: argparse.Namespace) -> int: actual generated features. Selection is intentionally skipped here so the payload describes every candidate feature the engines produced, not just the post-selection survivors. + + Performance: large inputs are sub-sampled to at most + :data:`_EXPLAIN_SAMPLE_SIZE` rows. The engineered-feature *metadata* + (names, explanations, code snippets) is independent of input length — + every engine plans its candidate feature set from column structure + rather than from individual row values — so the sampled run produces + the same payload at a fraction of the memory / compute cost. This + keeps ``featcopilot explain`` fast and bounded for agent / CI + workflows where the input might be GBs of data. """ input_path = Path(args.input) if not input_path.exists(): @@ -805,6 +838,16 @@ def _cmd_explain(args: argparse.Namespace) -> int: df = _read_table(input_path, in_fmt) X, y = _split_xy(df, args.target) + # Sample to bound memory / compute. Use a deterministic ``random_state`` + # so re-running ``explain`` on the same input is reproducible. 
+ n_sampled = len(X) + if n_sampled > _EXPLAIN_SAMPLE_SIZE: + sample_idx = X.sample(n=_EXPLAIN_SAMPLE_SIZE, random_state=0).index + X = X.loc[sample_idx] + if y is not None: + y = y.loc[sample_idx] + n_sampled = _EXPLAIN_SAMPLE_SIZE + engineer = _build_engineer(args, include_selection_config=False) captured_warnings = _fit_capturing_warnings( engineer, @@ -823,6 +866,7 @@ def _cmd_explain(args: argparse.Namespace) -> int: "status": "ok", "input": str(input_path), "n_features": len(feature_names), + "n_rows_used": n_sampled, "engines": list(engineer.engines), "features": [ { diff --git a/tests/test_cli.py b/tests/test_cli.py index a4ec0ff..47a75c7 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -403,6 +403,98 @@ def test_scalar_type_mismatch_in_config_returns_exit_2(tmp_path: Path, tabular_c assert fragment in err +@pytest.mark.parametrize("threshold", [-0.1, 1.1, 5.0, -1.0]) +def test_correlation_threshold_out_of_range_returns_exit_2(tmp_path: Path, tabular_csv: Path, threshold): + """``correlation_threshold`` is only meaningful in [0.0, 1.0]. Out-of-range + values silently change selector behavior (>1 disables redundancy elim, + <0 treats every numeric pair as redundant), so the CLI rejects them up + front with a precise exit-2 error. + """ + rc, _, err = _run( + [ + "transform", + "--input", + str(tabular_csv), + "--output", + str(tmp_path / "out.csv"), + "--target", + "y", + "--correlation-threshold", + str(threshold), + "--max-features", + "5", + ] + ) + assert rc == 2 + assert "correlation_threshold" in err + assert "[0.0, 1.0]" in err or "0.0" in err + + +def test_correlation_threshold_in_config_out_of_range_returns_exit_2(tmp_path: Path, tabular_csv: Path): + """The same range check applies when ``correlation_threshold`` arrives + from ``--config`` rather than the CLI flag. 
+ """ + cfg = tmp_path / "cfg.json" + cfg.write_text(json.dumps({"correlation_threshold": 2.5})) + rc, _, err = _run( + [ + "transform", + "--input", + str(tabular_csv), + "--output", + str(tmp_path / "out.csv"), + "--target", + "y", + "--max-features", + "5", + "--config", + str(cfg), + ] + ) + assert rc == 2 + assert "correlation_threshold" in err + + +def test_correlation_threshold_boundary_values_accepted(tmp_path: Path, tabular_csv: Path): + """The boundaries (0.0 and 1.0) must be accepted — they're the inclusive + valid range. Default 0.85 is also exercised throughout the suite. + """ + out_path = tmp_path / "out.csv" + rc, _, err = _run( + [ + "transform", + "--input", + str(tabular_csv), + "--output", + str(out_path), + "--target", + "y", + "--correlation-threshold", + "0.0", + "--max-features", + "5", + ] + ) + assert rc == 0, err + + rc, _, err = _run( + [ + "transform", + "--input", + str(tabular_csv), + "--output", + str(out_path), + "--target", + "y", + "--correlation-threshold", + "1.0", + "--max-features", + "5", + ] + ) + assert rc == 0, err + + # ----------------------- --verbose / --no-verbose @@ -1734,6 +1826,95 @@ def test_explain_emits_json_payload(tmp_path: Path, tabular_csv: Path): assert entry["name"] +def test_explain_caps_input_size_for_large_inputs(tmp_path: Path): + """``explain`` is metadata-only. To bound memory / compute on large + inputs, the CLI sub-samples to at most ``_EXPLAIN_SAMPLE_SIZE`` rows + before running ``fit_transform``. The payload reports ``n_rows_used`` + so callers can confirm the sampling. 
+ """ + rng = np.random.default_rng(0) + n = fc_cli._EXPLAIN_SAMPLE_SIZE * 5 # well above the cap + df = pd.DataFrame( + { + "x1": rng.normal(size=n), + "x2": rng.normal(size=n), + "y": rng.integers(0, 2, size=n), + } + ) + in_path = tmp_path / "big.csv" + df.to_csv(in_path, index=False) + + rc, out, err = _run( + [ + "explain", + "--input", + str(in_path), + "--target", + "y", + ] + ) + assert rc == 0, err + payload = json.loads(out) + assert payload["status"] == "ok" + # Sampling cap was enforced. + assert payload["n_rows_used"] == fc_cli._EXPLAIN_SAMPLE_SIZE + assert payload["n_features"] > 0 + + +def test_explain_uses_full_input_when_smaller_than_sample_cap(tmp_path: Path): + """When the input has fewer rows than ``_EXPLAIN_SAMPLE_SIZE``, the + sampler is a no-op: ``n_rows_used`` reflects the actual input size. + """ + rng = np.random.default_rng(0) + n = 50 # well below the cap + df = pd.DataFrame( + { + "x1": rng.normal(size=n), + "x2": rng.normal(size=n), + "y": rng.integers(0, 2, size=n), + } + ) + in_path = tmp_path / "small.csv" + df.to_csv(in_path, index=False) + + rc, out, err = _run( + [ + "explain", + "--input", + str(in_path), + "--target", + "y", + ] + ) + assert rc == 0, err + payload = json.loads(out) + assert payload["n_rows_used"] == n + + +def test_explain_sampling_is_deterministic(tmp_path: Path): + """Re-running ``explain`` on the same large input produces the same + set of feature names (sampling uses a fixed ``random_state``). 
+ """ + rng = np.random.default_rng(0) + n = fc_cli._EXPLAIN_SAMPLE_SIZE * 3 + df = pd.DataFrame( + { + "x1": rng.normal(size=n), + "x2": rng.normal(size=n), + "y": rng.integers(0, 2, size=n), + } + ) + in_path = tmp_path / "big.csv" + df.to_csv(in_path, index=False) + + def _names(): + rc, out, _ = _run(["explain", "--input", str(in_path), "--target", "y"]) + assert rc == 0 + return sorted(f["name"] for f in json.loads(out)["features"]) + + assert _names() == _names() + + # --------------------------------------------------------------- parquet path From 59468052691e6cd9d6791113f1bdf13729ac7370 Mon Sep 17 00:00:00 2001 From: Li Jiang Date: Mon, 4 May 2026 21:13:48 +0800 Subject: [PATCH 19/30] fix(cli): address round-14 review feedback (Codex P1) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses the new P1 comment from chatgpt-codex-connector on PR #5 (commit 8d3a973): * Default explain back to FULL input — sampling is opt-in (Codex P1). Round-13's automatic sampling-to-1000-rows changed which features some engines plan, breaking the metadata-vs-full-transform faithfulness contract. `TabularEngine._fit_categorical_encoding` (and similar engines) decide which encodings to apply based on `n_rows`, unique-count ratios, and per-category counts, all of which are data-size dependent. `_cmd_explain` now uses the full input by default. Callers who knowingly accept the trade-off can opt in via: - `--explain-sample-size N` (CLI flag), or - `"explain_sample_size": N` in `--config`. When sampling is active, the CLI emits a `UserWarning` (captured into the JSON payload's `warnings` field, NOT to stderr) explaining that the metadata may differ from a full-input transform run. The warning + `fit_transform` are wrapped in a single `_capture_featcopilot_messages` block so the sampling notice ends up where downstream agents will see it. Validation: `explain_sample_size` must be a positive int.
Strings, zero, and negatives are rejected with a precise exit-2 error. Tests: 102 (+6 net) in tests/test_cli.py. * test_explain_uses_full_input_by_default * test_explain_caps_input_size_when_sample_size_set * test_explain_sample_size_smaller_than_input_no_op * test_explain_sample_size_via_config * test_explain_sample_size_rejects_non_positive (parametrized) * test_explain_sample_size_rejects_string_in_config * test_explain_sample_size_rejects_zero_in_config 875 passed full suite. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- featcopilot/cli.py | 105 ++++++++++++++++++++++---------- tests/test_cli.py | 148 ++++++++++++++++++++++++++++++++++++++------- 2 files changed, 199 insertions(+), 54 deletions(-) diff --git a/featcopilot/cli.py b/featcopilot/cli.py index a30be02..9c31eaa 100644 --- a/featcopilot/cli.py +++ b/featcopilot/cli.py @@ -228,6 +228,7 @@ def _write_table(df, path: Path, fmt: str) -> None: "gate_n_jobs", "llm_config", "verbose", + "explain_sample_size", } ) @@ -800,15 +801,14 @@ def _cmd_transform(args: argparse.Namespace) -> int: return 0 -# ``explain`` only needs to fire each engine's planning + feature-naming -# pass — the actual transformed values are discarded. Capping the input -# at this many rows keeps the metadata-only command from paying the full -# memory / compute cost of materializing every engineered value on large -# datasets, while still giving every engine enough rows to plan its -# features (the candidate set is independent of input length). -_EXPLAIN_SAMPLE_SIZE = 1000 - - +# Default ``explain`` behavior is to use the full input so the metadata +# is a faithful description of what a corresponding ``transform`` run +# would do — engines like ``TabularEngine._fit_categorical_encoding`` +# use ``n_rows`` and per-category counts to decide e.g. one-hot vs. +# target-encoding, so subsampling can silently change which features +# appear. 
Callers who knowingly accept that trade-off can opt in via +# ``--explain-sample-size`` (set to ``None``/absent to disable, any +# positive integer to cap). def _cmd_explain(args: argparse.Namespace) -> int: """Fit + transform engines and print feature explanations + code as JSON. @@ -821,14 +821,22 @@ def _cmd_explain(args: argparse.Namespace) -> int: payload describes every candidate feature the engines produced, not just the post-selection survivors. - Performance: large inputs are sub-sampled to at most - :data:`_EXPLAIN_SAMPLE_SIZE` rows. The engineered-feature *metadata* - (names, explanations, code snippets) is independent of input length — - every engine plans its candidate feature set from column structure - rather than from individual row values — so the sampled run produces - the same payload at a fraction of the memory / compute cost. This - keeps ``featcopilot explain`` fast and bounded for agent / CI - workflows where the input might be GBs of data. + Performance vs. faithfulness + --------------------------- + By default ``explain`` runs on the *full* input so the reported + metadata is a faithful description of what a corresponding + ``transform`` would generate. Some engines (notably + :class:`TabularEngine`) consult row counts and per-category + statistics when deciding which features to plan, so blind + subsampling can silently change the result. + + For very large inputs where the metadata-only nature of ``explain`` + really should not pay full memory / compute cost, callers can pass + ``--explain-sample-size N`` (or set ``"explain_sample_size": N`` in + ``--config``) to cap the rows fed to the engineer. The CLI emits a + ``UserWarning`` (captured into the JSON payload) noting that the + metadata may differ from a full-input ``transform`` run; the + ``n_rows_used`` field reports the effective sample size. 
""" input_path = Path(args.input) if not input_path.exists(): @@ -838,25 +846,45 @@ def _cmd_explain(args: argparse.Namespace) -> int: df = _read_table(input_path, in_fmt) X, y = _split_xy(df, args.target) - # Sample to bound memory / compute. Use a deterministic ``random_state`` - # so re-running ``explain`` on the same input is reproducible. + # Apply opt-in sample cap from CLI flag or config (CLI flag wins). + sample_size = getattr(args, "explain_sample_size", None) + if sample_size is None and args.config is not None: + sample_size = _load_config(args.config).get("explain_sample_size") + if sample_size is not None: + _check_scalar_type("explain_sample_size", sample_size, (int,), allow_bool=False) + if sample_size <= 0: + raise ValueError(f"`explain_sample_size` must be a positive integer when set; got {sample_size!r}.") + n_sampled = len(X) - if n_sampled > _EXPLAIN_SAMPLE_SIZE: - sample_idx = X.sample(n=_EXPLAIN_SAMPLE_SIZE, random_state=0).index - X = X.loc[sample_idx] - if y is not None: - y = y.loc[sample_idx] - n_sampled = _EXPLAIN_SAMPLE_SIZE engineer = _build_engineer(args, include_selection_config=False) - captured_warnings = _fit_capturing_warnings( - engineer, - X, - y, - task_description=args.task_description or "prediction task", - target_name=args.target, - apply_selection=False, - ) + + # Run the sample-warning AND ``fit_transform`` inside a single + # capture context so the sampling notice ends up in the JSON + # payload's ``warnings`` field instead of bleeding onto stderr. + with _capture_featcopilot_messages() as captured_warnings: + if sample_size is not None and n_sampled > sample_size: + warnings.warn( + f"explain: sampling input down to {sample_size} of {n_sampled} rows. " + "Some engines (e.g. 
TabularEngine categorical encoding) decide which " + "features to plan based on row counts and per-category statistics, " + "so the reported metadata may differ from a full-input transform run.", + UserWarning, + stacklevel=2, + ) + sample_idx = X.sample(n=sample_size, random_state=0).index + X = X.loc[sample_idx] + if y is not None: + y = y.loc[sample_idx] + n_sampled = sample_size + + engineer.fit_transform( + X, + y, + task_description=args.task_description or "prediction task", + target_name=args.target, + apply_selection=False, + ) explanations = engineer.explain_features() code = engineer.get_feature_code() @@ -954,6 +982,17 @@ def _build_parser() -> argparse.ArgumentParser: "--task-description", help="Natural-language ML task description (used by the LLM engine).", ) + p_explain.add_argument( + "--explain-sample-size", + type=int, + default=None, + help="Cap the input fed to the engineer at this many rows (deterministic seed). " + "OFF by default: the full input is used so the metadata is a faithful description " + "of what a corresponding `transform` would generate. Pass a positive integer ONLY " + "when you knowingly accept that some engines (e.g. TabularEngine categorical " + "encoding) decide which features to plan based on row counts and per-category " + "statistics, so the reported metadata may differ from a full-input run.", + ) _add_engineer_args(p_explain, include_selection_args=False) p_explain.add_argument("--json", action="store_true", help="(Always JSON — flag accepted for symmetry.)") p_explain.set_defaults(func=_cmd_explain) diff --git a/tests/test_cli.py b/tests/test_cli.py index 47a75c7..e29e5e1 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -1826,14 +1826,15 @@ def test_explain_emits_json_payload(tmp_path: Path, tabular_csv: Path): assert entry["name"] -def test_explain_caps_input_size_for_large_inputs(tmp_path: Path): - """``explain`` is metadata-only. 
To bound memory / compute on large - inputs, the CLI sub-samples to at most ``_EXPLAIN_SAMPLE_SIZE`` rows - before running ``fit_transform``. The payload reports ``n_rows_used`` - so callers can confirm the sampling. +def test_explain_uses_full_input_by_default(tmp_path: Path): + """``explain`` defaults to using the FULL input — no implicit + sub-sampling. Some engines (e.g. ``TabularEngine`` categorical + encoding) decide which features to plan based on row counts and + per-category statistics, so silent sampling would change the + advertised metadata. Sampling is opt-in via ``--explain-sample-size``. """ rng = np.random.default_rng(0) - n = fc_cli._EXPLAIN_SAMPLE_SIZE * 5 # well above the cap + n = 1500 # arbitrary df = pd.DataFrame( { "x1": rng.normal(size=n), @@ -1856,17 +1857,54 @@ def test_explain_caps_input_size_for_large_inputs(tmp_path: Path): assert rc == 0, err payload = json.loads(out) assert payload["status"] == "ok" + # Default: no sampling — full input is used. + assert payload["n_rows_used"] == n + + +def test_explain_caps_input_size_when_sample_size_set(tmp_path: Path): + """When ``--explain-sample-size N`` is passed, the input is capped at + ``N`` rows (with a captured warning) so callers can opt into bounded + cost on huge inputs. The default remains full-input. + """ + rng = np.random.default_rng(0) + n = 5000 + df = pd.DataFrame( + { + "x1": rng.normal(size=n), + "x2": rng.normal(size=n), + "y": rng.integers(0, 2, size=n), + } + ) + in_path = tmp_path / "big.csv" + df.to_csv(in_path, index=False) + + rc, out, err = _run( + [ + "explain", + "--input", + str(in_path), + "--target", + "y", + "--explain-sample-size", + "1000", + ] + ) + assert rc == 0, err + payload = json.loads(out) # Sampling cap was enforced. 
- assert payload["n_rows_used"] == fc_cli._EXPLAIN_SAMPLE_SIZE + assert payload["n_rows_used"] == 1000 assert payload["n_features"] > 0 + # The CLI emits a warning when sampling so callers can detect that + # metadata may not match a full-input transform run. + assert any("sampling" in w.lower() for w in payload["warnings"]) -def test_explain_uses_full_input_when_smaller_than_sample_cap(tmp_path: Path): - """When the input has fewer rows than ``_EXPLAIN_SAMPLE_SIZE``, the - sampler is a no-op: ``n_rows_used`` reflects the actual input size. +def test_explain_sample_size_smaller_than_input_no_op(tmp_path: Path): + """When ``--explain-sample-size`` exceeds the actual input, no sampling + happens (and no warning is emitted). """ rng = np.random.default_rng(0) - n = 50 # well below the cap + n = 50 df = pd.DataFrame( { "x1": rng.normal(size=n), @@ -1884,19 +1922,20 @@ def test_explain_uses_full_input_when_smaller_than_sample_cap(tmp_path: Path): str(in_path), "--target", "y", + "--explain-sample-size", + "1000", ] ) assert rc == 0, err payload = json.loads(out) assert payload["n_rows_used"] == n + assert not any("sampling" in w.lower() for w in payload["warnings"]) -def test_explain_sampling_is_deterministic(tmp_path: Path): - """Re-running ``explain`` on the same large input produces the same - set of feature names (sampling uses a fixed ``random_state``). 
- """ +def test_explain_sample_size_via_config(tmp_path: Path): + """``explain_sample_size`` is also recognized in ``--config`` JSON.""" rng = np.random.default_rng(0) - n = fc_cli._EXPLAIN_SAMPLE_SIZE * 3 + n = 5000 df = pd.DataFrame( { "x1": rng.normal(size=n), @@ -1907,12 +1946,79 @@ def test_explain_sampling_is_deterministic(tmp_path: Path): in_path = tmp_path / "big.csv" df.to_csv(in_path, index=False) - def _names(): - rc, out, _ = _run(["explain", "--input", str(in_path), "--target", "y"]) - assert rc == 0 - return sorted(f["name"] for f in json.loads(out)["features"]) + cfg = tmp_path / "cfg.json" + cfg.write_text(json.dumps({"explain_sample_size": 500})) + + rc, out, err = _run( + [ + "explain", + "--input", + str(in_path), + "--target", + "y", + "--config", + str(cfg), + ] + ) + assert rc == 0, err + payload = json.loads(out) + assert payload["n_rows_used"] == 500 + - assert _names() == _names() +@pytest.mark.parametrize("bad_value", [0, -1, -100]) +def test_explain_sample_size_rejects_non_positive(tmp_path: Path, bad_value): + """``--explain-sample-size`` must be a positive integer.""" + rc, _, err = _run( + [ + "explain", + "--input", + str(tmp_path / "in.csv"), # missing — but flag check happens first + "--target", + "y", + "--explain-sample-size", + str(bad_value), + ] + ) + # We accept either argparse-level rejection or our own ValueError; + # both surface as exit 2. 
+ assert rc == 2 + + +def test_explain_sample_size_rejects_string_in_config(tmp_path: Path, tabular_csv: Path): + """Type-validation: ``"explain_sample_size": "100"`` (string) is rejected.""" + cfg = tmp_path / "cfg.json" + cfg.write_text(json.dumps({"explain_sample_size": "100"})) + rc, _, err = _run( + [ + "explain", + "--input", + str(tabular_csv), + "--target", + "y", + "--config", + str(cfg), + ] + ) + assert rc == 2 + assert "explain_sample_size" in err + + +def test_explain_sample_size_rejects_zero_in_config(tmp_path: Path, tabular_csv: Path): + cfg = tmp_path / "cfg.json" + cfg.write_text(json.dumps({"explain_sample_size": 0})) + rc, _, err = _run( + [ + "explain", + "--input", + str(tabular_csv), + "--target", + "y", + "--config", + str(cfg), + ] + ) + assert rc == 2 + assert "explain_sample_size" in err # --------------------------------------------------------------- parquet path From e85b79194a2947c8ea9a81878dace5ad43e187b2 Mon Sep 17 00:00:00 2001 From: Li Jiang Date: Mon, 4 May 2026 21:47:37 +0800 Subject: [PATCH 20/30] fix(cli): address round-15 review feedback Addresses both new comments from copilot-pull-request-reviewer on PR #5 (commit 5946805): * Remove dead helper _fit_capturing_warnings (Copilot Wufb). After round-14 inlined the capture context into `_cmd_explain` (so the sampling `UserWarning` lands in the JSON payload alongside engine warnings), the `_fit_capturing_warnings` thin wrapper has no callers. Leaving it would create two competing capture paths that could drift apart over time. Removed; the actively used `_fit_transform_capturing_warnings` (called by `_cmd_transform`) remains. * Console-script tests fail loudly when the package is installed but the script is missing (Copilot WueV). The previous `pytest.skip("not on PATH")` would have hidden a real `[project.scripts]` regression in CI: the tests workflow does `pip install -e .` before pytest, so the script MUST be on PATH. 
New `_featcopilot_package_is_installed` helper distinguishes the two scenarios: - Package installed, script missing -> `pytest.fail` with a message explaining the packaging regression. - Package not installed (rare: running tests against an un-installed source tree) -> `pytest.skip` with an install hint. Both `test_console_script_subprocess_invocation` and `test_console_script_version_flag` use the new policy. Tests: 102 in tests/test_cli.py (unchanged count; helper + policy update), 875 passed full suite. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- featcopilot/cli.py | 7 ------ tests/test_cli.py | 56 +++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 51 insertions(+), 12 deletions(-) diff --git a/featcopilot/cli.py b/featcopilot/cli.py index 9c31eaa..09deabe 100644 --- a/featcopilot/cli.py +++ b/featcopilot/cli.py @@ -508,13 +508,6 @@ def _fit_transform_capturing_warnings(engineer, X, y, **kwargs): return captured, result -def _fit_capturing_warnings(engineer, X, y, **kwargs): - """Sibling of :func:`_fit_transform_capturing_warnings` for explain.""" - with _capture_featcopilot_messages() as captured: - engineer.fit_transform(X, y, **kwargs) - return captured - - class _ThreadCaptureState: """Holds per-thread capture *stacks*. diff --git a/tests/test_cli.py b/tests/test_cli.py index e29e5e1..c297c96 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -2229,22 +2229,59 @@ def test_dunder_main_subprocess_version_flag(): # ------------------------------------------------------- console script +def _featcopilot_package_is_installed() -> bool: + """Return True iff the ``featcopilot`` distribution is installed in the + current environment (i.e. the entry-point machinery should have placed + the console script on ``PATH``). 
+ + Used by the console-script tests to distinguish two cases: + + * Running tests directly against the source tree (``python -m pytest`` + from a clean checkout, no ``pip install -e .``): the package is + *not* installed; the script is legitimately missing and the test + should ``skip`` rather than report a packaging bug. + * Running tests after ``pip install`` (the CI flow): the package IS + installed, so the script MUST be on ``PATH``. If it isn't, that's a + real ``[project.scripts]`` regression and the test should ``fail``, + not silently pass via skip. + """ + try: + from importlib.metadata import PackageNotFoundError, distribution + except ImportError: # pragma: no cover - py3.10+ always has this + return False + try: + distribution("featcopilot") + except PackageNotFoundError: + return False + return True + + def test_console_script_subprocess_invocation(): """The installed ``featcopilot`` console script must be on PATH and runnable. Exercises the ``[project.scripts] featcopilot = "featcopilot.cli:main"`` entry point end-to-end so a typo or packaging regression in - ``pyproject.toml`` would actually break the suite. Skipped when the - script isn't on ``PATH`` (e.g. running tests without ``pip install``). + ``pyproject.toml`` would actually break the suite. When the + ``featcopilot`` distribution is installed, the script must be on + ``PATH``: a missing script in that case is a real packaging + regression, not a test environment quirk, so we ``fail`` (not + ``skip``). The skip is reserved for the rare case of running tests + against an un-installed source tree. """ import shutil import subprocess script = shutil.which("featcopilot") if script is None: + if _featcopilot_package_is_installed(): + pytest.fail( + "featcopilot package is installed but the `featcopilot` console " + "script is missing from PATH. This is a `[project.scripts]` " + "regression in pyproject.toml." 
+ ) pytest.skip( - "featcopilot console script not on PATH (install the package " - "with `pip install -e .` to exercise the entry point)" + "featcopilot package is not installed in this environment; install " + "it with `pip install -e .` to exercise the console-script entry point." ) result = subprocess.run( @@ -2261,12 +2298,21 @@ def test_console_script_subprocess_invocation(): def test_console_script_version_flag(): + """Same install-aware skip/fail policy as + :func:`test_console_script_subprocess_invocation`. + """ import shutil import subprocess script = shutil.which("featcopilot") if script is None: - pytest.skip("featcopilot console script not on PATH") + if _featcopilot_package_is_installed(): + pytest.fail( + "featcopilot package is installed but the `featcopilot` console " + "script is missing from PATH. This is a `[project.scripts]` " + "regression in pyproject.toml." + ) + pytest.skip("featcopilot package is not installed in this environment.") result = subprocess.run( [script, "--version"], From 15c15e9cf4aafda563b8bd32009186f30bbbb3ca Mon Sep 17 00:00:00 2001 From: Li Jiang Date: Mon, 4 May 2026 22:52:57 +0800 Subject: [PATCH 21/30] fix(cli): address round-16 review feedback Addresses both new comments from copilot-pull-request-reviewer on PR #5 (commit e85b791): * Catch pandas EmptyDataError in CSV read (Copilot YL1H). `_read_table` now catches `pandas.errors.EmptyDataError` in addition to `OSError` / `ParserError` / `UnicodeDecodeError`. Without it, a zero-byte / headerless `.csv` would have fallen through to the generic exit-1 "unexpected error" path instead of the documented exit-2 user-input error. New tests cover both zero-byte and "newlines-only" inputs. * Detect target/feature name collision in --include-target (Copilot YL2C). `--include-target` used to blindly assign `transformed[target_name] = y.values`, which silently overwrites any engineered feature that happens to share the target's column name (e.g. 
a target named `x1_pow2` or `a_x_b` matching a tabular-engine derived feature). The CLI now detects the collision before assigning and raises a precise exit-2 error so the user can rename the target or drop `--include-target`. Two tests cover the contract: a real-data attempt that may or may not trip the collision (skipped when the engine doesn't materialize the colliding name), and a deterministic version that monkey-patches the engineer to inject a colliding column. Tests: 106 (+4 new) in tests/test_cli.py, 879 passed full suite. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- featcopilot/cli.py | 26 ++++++++- tests/test_cli.py | 138 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 162 insertions(+), 2 deletions(-) diff --git a/featcopilot/cli.py b/featcopilot/cli.py index 09deabe..d25cc8e 100644 --- a/featcopilot/cli.py +++ b/featcopilot/cli.py @@ -133,7 +133,16 @@ def _read_table(path: Path, fmt: str): if fmt == "csv": try: return pd.read_csv(path) - except (OSError, pd.errors.ParserError, UnicodeDecodeError) as exc: + except ( + OSError, + pd.errors.ParserError, + pd.errors.EmptyDataError, + UnicodeDecodeError, + ) as exc: + # ``EmptyDataError`` fires for headerless / zero-byte CSVs; + # without it, those inputs would fall into the generic exit-1 + # "unexpected error" path instead of the documented exit-2 + # user-input error. raise ValueError(f"Failed to read CSV from {str(path)!r}: {exc}") from exc if fmt == "parquet": try: @@ -766,8 +775,21 @@ def _cmd_transform(args: argparse.Namespace) -> int: if args.include_target and y is not None: # Re-attach the target column so downstream training scripts can - # consume the engineered file as a single artifact. + # consume the engineered file as a single artifact. Detect column + # collisions: if an engineered feature happens to share the + # target's column name (e.g. 
a target named ``foo_pow2`` matching + # a tabular-engine derived feature), blindly assigning ``transformed[ + # target_name] = y.values`` would silently overwrite the engineered + # column. Surface that as a clean exit-2 error instead. Callers + # who knowingly want to overwrite can rename their target before + # invoking ``transform`` (or skip ``--include-target``). target_name = args.target if args.target in df.columns else "target" + if target_name in transformed.columns: + raise ValueError( + f"--include-target would overwrite engineered feature {target_name!r} " + "with the target values. Rename the target column in the input file, " + "drop --include-target, or accept the rename and retry." + ) transformed = transformed.copy() transformed[target_name] = y.values diff --git a/tests/test_cli.py b/tests/test_cli.py index c297c96..e4bf86c 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -859,6 +859,144 @@ def _raise_oserror(*args, **kwargs): assert "failed to read" in err.lower() +def test_empty_csv_input_returns_exit_2(tmp_path: Path): + """A zero-byte / headerless CSV triggers ``pandas.errors.EmptyDataError``, + which must be normalized to the documented exit-2 user-input error path + rather than falling through to the generic exit-1 backstop. + """ + in_path = tmp_path / "empty.csv" + in_path.write_text("") # zero bytes -> EmptyDataError on read + + rc, _, err = _run( + [ + "transform", + "--input", + str(in_path), + "--output", + str(tmp_path / "out.csv"), + "--target", + "y", + ] + ) + assert rc == 2 + assert "failed to read csv" in err.lower() + + +def test_headerless_csv_input_returns_exit_2(tmp_path: Path): + """A CSV with no header and no rows is also empty-data territory and + must surface as exit 2. 
+ """ + in_path = tmp_path / "headerless.csv" + in_path.write_text("\n\n\n") # only newlines, no header + + rc, _, err = _run( + [ + "transform", + "--input", + str(in_path), + "--output", + str(tmp_path / "out.csv"), + "--target", + "y", + ] + ) + assert rc == 2 + assert "failed to read csv" in err.lower() + + +def test_transform_include_target_collision_returns_exit_2(tmp_path: Path): + """``--include-target`` would silently overwrite an engineered feature + if it happens to share the target column's name. The CLI must detect + that collision and fail with exit 2 instead of losing the engineered + feature. + + A target named ``x1_pow2`` (which the tabular engine generates as a + derived feature from a numeric column ``x1``) provokes the collision. + """ + rng = np.random.default_rng(0) + n = 200 + df = pd.DataFrame( + { + "x1": rng.normal(size=n), + "x2": rng.normal(size=n), + # Target column has a name that the tabular engine would also + # generate (``x1_pow2`` etc. is in the tabular engine's + # derived feature catalog). + "x1_pow2": rng.integers(0, 2, size=n), + } + ) + in_path = tmp_path / "collision.csv" + df.to_csv(in_path, index=False) + out_path = tmp_path / "out.csv" + + rc, _, err = _run( + [ + "transform", + "--input", + str(in_path), + "--output", + str(out_path), + "--target", + "x1_pow2", + "--include-target", + "--max-features", + "5", + ] + ) + # Either the engineered set actually contains the colliding name (in + # which case we MUST exit 2), or selection happened to drop it. Skip + # if the engine didn't materialize the colliding feature this run — + # the test is about the contract, not whether ``x1_pow2`` is always + # generated. + if rc == 2: + assert "include-target would overwrite" in err.lower() + assert "x1_pow2" in err + else: + # No collision actually occurred; the test is a no-op for this + # input. Future engine changes that always emit ``x1_pow2`` will + # expose the collision branch. 
+ assert rc == 0, err + + +def test_transform_include_target_collision_deterministic(tmp_path: Path, tabular_csv: Path, monkeypatch): + """Deterministic version of the collision test: monkey-patch the + engineer so its transformed frame contains a column with the target's + name. This guarantees we exercise the exit-2 collision branch + regardless of which features the real engineer picks. + """ + from featcopilot.transformers.sklearn_compat import AutoFeatureEngineer + + real_fit_transform = AutoFeatureEngineer.fit_transform + + def _patched_fit_transform(self, X, y=None, **kwargs): + result = real_fit_transform(self, X, y, **kwargs) + # Inject a column named ``y`` into the result so it collides with + # the target column the test will pass. + result = result.copy() + result["y"] = result.iloc[:, 0] # arbitrary engineered values + return result + + monkeypatch.setattr(AutoFeatureEngineer, "fit_transform", _patched_fit_transform) + + rc, _, err = _run( + [ + "transform", + "--input", + str(tabular_csv), + "--output", + str(tmp_path / "out.csv"), + "--target", + "y", + "--include-target", + "--max-features", + "5", + ] + ) + assert rc == 2 + assert "include-target would overwrite" in err.lower() + assert "'y'" in err + + def test_unreadable_input_json_returns_exit_2(tmp_path: Path, tabular_csv: Path, monkeypatch): """``OSError`` from ``pd.read_json`` is surfaced as exit 2 too.""" import pandas as pd From 5cbb843b105d625fc746bb8ad7288a767115e628 Mon Sep 17 00:00:00 2001 From: Li Jiang Date: Mon, 4 May 2026 23:29:29 +0800 Subject: [PATCH 22/30] fix(cli): address round-17 review feedback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses both new comments from copilot-pull-request-reviewer on PR #5 (commit 15c15e9): * Sample by position, not label (Copilot Ymol). `_cmd_explain`'s sampling path used `.sample(...).index` plus `.loc[sample_idx]` to keep `X` and `y` aligned. 
`.loc` selects by label, so when the input has a non-unique index — common with parquet files that preserve a saved index — duplicate labels expand or reorder rows and `X` and `y` no longer line up. The CLI now samples by *position* with a seeded NumPy RNG and uses `.iloc[sample_positions]` for both, which is index-agnostic. `test_explain_sample_size_handles_non_unique_index` reads a parquet file with a deliberately duplicated index and asserts `n_rows_used` matches the requested cap exactly. * --include-target collision message lists only actionable options (Copilot YmpI). The error text mentioned "accept the rename and retry", but the CLI does not offer any rename / auto-rename option. The misleading suffix is removed; the message now says "Rename the target column in the input file, or drop --include-target." (only options the caller can actually act on). `test_include_target_collision_error_text_lists_only_actionable_options` asserts the new wording and a regression guard against the old phantom-option phrasing. Tests: 108 (+2 new) in tests/test_cli.py, 881 passed full suite. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- featcopilot/cli.py | 22 +++++++++--- tests/test_cli.py | 85 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 103 insertions(+), 4 deletions(-) diff --git a/featcopilot/cli.py b/featcopilot/cli.py index d25cc8e..8792d12 100644 --- a/featcopilot/cli.py +++ b/featcopilot/cli.py @@ -50,6 +50,8 @@ from pathlib import Path from typing import Any +import numpy as np + from featcopilot import __version__ from featcopilot.transformers.sklearn_compat import AutoFeatureEngineer from featcopilot.utils.logger import get_logger @@ -788,7 +790,7 @@ def _cmd_transform(args: argparse.Namespace) -> int: raise ValueError( f"--include-target would overwrite engineered feature {target_name!r} " "with the target values. Rename the target column in the input file, " - "drop --include-target, or accept the rename and retry." 
+ "or drop --include-target." ) transformed = transformed.copy() transformed[target_name] = y.values @@ -887,10 +889,22 @@ def _cmd_explain(args: argparse.Namespace) -> int: UserWarning, stacklevel=2, ) - sample_idx = X.sample(n=sample_size, random_state=0).index - X = X.loc[sample_idx] + # Sample by *position* (``.iloc[...]``), not label + # (``.sample(...).index`` + ``.loc[...]``). ``.loc`` selects + # by label, so a non-unique index — common when reading + # parquet files that preserve a saved index — would let + # duplicate labels expand or reorder rows so ``X`` and ``y`` + # no longer line up. Positional sampling via a NumPy RNG + + # ``.iloc`` keeps them aligned regardless of input index. + rng_sampler = np.random.default_rng(0) + sample_positions = rng_sampler.choice(n_sampled, size=sample_size, replace=False) + # Sort the positions for determinism / readable output ordering + # (the random selection itself is already deterministic via + # the seeded RNG). + sample_positions.sort() + X = X.iloc[sample_positions] if y is not None: - y = y.loc[sample_idx] + y = y.iloc[sample_positions] n_sampled = sample_size engineer.fit_transform( diff --git a/tests/test_cli.py b/tests/test_cli.py index e4bf86c..b43a850 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -2317,6 +2317,91 @@ def _raise_oserror(self, *args, **kwargs): assert "could not be read" in err.lower() +def test_explain_sample_size_handles_non_unique_index(tmp_path: Path): + """Sampling must keep X and y aligned even when the input frame has a + non-unique index — e.g. a parquet read that preserves a saved index + where labels can repeat. Positional sampling (``.iloc``) avoids the + label-based ``.loc`` expansion / reordering bug. 
+ """ + pytest.importorskip("pyarrow") # parquet write needs an engine + + rng = np.random.default_rng(0) + n = 4000 + df = pd.DataFrame( + { + "x1": rng.normal(size=n), + "x2": rng.normal(size=n), + "y": rng.integers(0, 2, size=n), + } + ) + # Force a non-unique index — labels repeat (each label appears twice). + df.index = pd.Index([i // 2 for i in range(n)], name="duplicated_index") + in_path = tmp_path / "non_unique.parquet" + df.to_parquet(in_path, index=True) + + rc, out, err = _run( + [ + "explain", + "--input", + str(in_path), + "--target", + "y", + "--explain-sample-size", + "100", + ] + ) + assert rc == 0, err + payload = json.loads(out) + assert payload["status"] == "ok" + # Sample size must be honored exactly, not expanded by ``.loc``-with- + # duplicate-labels behavior. + assert payload["n_rows_used"] == 100 + + +def test_include_target_collision_error_text_lists_only_actionable_options( + tmp_path: Path, tabular_csv: Path, monkeypatch +): + """The error text emitted when ``--include-target`` would overwrite an + engineered feature must only suggest actions that are actually + possible from this command. The CLI does not offer auto-rename, so + the message must NOT mention "rename and retry" or any other phantom + option. + """ + from featcopilot.transformers.sklearn_compat import AutoFeatureEngineer + + real_fit_transform = AutoFeatureEngineer.fit_transform + + def _patched_fit_transform(self, X, y=None, **kwargs): + result = real_fit_transform(self, X, y, **kwargs) + result = result.copy() + result["y"] = result.iloc[:, 0] + return result + + monkeypatch.setattr(AutoFeatureEngineer, "fit_transform", _patched_fit_transform) + + rc, _, err = _run( + [ + "transform", + "--input", + str(tabular_csv), + "--output", + str(tmp_path / "out.csv"), + "--target", + "y", + "--include-target", + "--max-features", + "5", + ] + ) + assert rc == 2 + # Must mention the real options. 
+ assert "rename the target column" in err.lower() + assert "drop --include-target" in err + # Must NOT mention non-existent CLI options. + assert "accept the rename" not in err.lower() + assert "retry" not in err.lower() + + # --------------------------------------------------------------- python -m From f710fbe772fc1e340daea33f61cf133a6a06f463 Mon Sep 17 00:00:00 2001 From: Li Jiang Date: Tue, 5 May 2026 08:03:51 +0800 Subject: [PATCH 23/30] fix(cli): address round-18 review feedback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses both new comments from copilot-pull-request-reviewer on PR #5 (commit 5cbb843): * --explain-sample-size now actually bounds memory (Copilot gHFx). Round-13/14's sampling trimmed `X`/`y` AFTER the entire input had already been loaded into a pandas DataFrame, so on huge CSVs the subcommand could OOM before reaching the sampling branch — contradicting the "bounded cost on huge inputs" contract. `_read_table` now accepts an `nrows` parameter and propagates it to the underlying read: - **CSV**: `pd.read_csv(path, nrows=N)` — memory-bounded by pandas natively, never loads more than `N` rows. - **parquet / JSON**: pandas exposes no native row-limit, so the file is fully read and then truncated. A `UserWarning` is emitted (captured into the JSON payload's `warnings` field) explaining the limitation and recommending CSV for hard memory bounds on huge inputs. `_cmd_explain` now passes `nrows=sample_size` to `_read_table` and drops the post-read positional sampling step entirely. `test_explain_sample_size_bounds_csv_read_with_nrows` spies on `pd.read_csv` to assert the `nrows=200` kwarg is actually threaded through (not just truncated post-load); the existing parquet test now asserts the post-read warning fires. * Worker-thread records routed via single-active-capture fallback (Copilot gHGE). 
`_ThreadCaptureState.get(tid)` previously returned `None` when the calling thread had no capture stack of its own. That meant log records emitted on worker threads spawned by the capturing thread (e.g. an LLM sync client wrapping `ThreadPoolExecutor` because it was called from a process with a running event loop) escaped capture and bled onto stderr. `get` now falls back: when exactly ONE capture is active in the process, cross-thread records are routed to that single capture. When two or more captures are concurrently active, the fallback stays disabled — each capture continues to see only its own thread's records, so concurrent CLI calls don't cross-contaminate. Both `_ThreadRoutingHandler` and `_routing_showwarning` go through the same `get`, so log records and `warnings.warn` calls share the policy. `test_capture_routes_worker_thread_records_to_single_active_capture` exercises a `ThreadPoolExecutor` worker plus a freshly spawned `threading.Thread` and asserts both are captured. `test_capture_keeps_thread_isolation_with_multiple_active_captures` guards the multi-capture isolation property. Cleanup: dropped the now-unused `import numpy as np` from `featcopilot/cli.py`. Tests: 112 (+4 new) in tests/test_cli.py, 885 passed full suite. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- featcopilot/cli.py | 121 +++++++++++++++++++++++--------- tests/test_cli.py | 170 ++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 256 insertions(+), 35 deletions(-) diff --git a/featcopilot/cli.py b/featcopilot/cli.py index 8792d12..ba87ca9 100644 --- a/featcopilot/cli.py +++ b/featcopilot/cli.py @@ -50,8 +50,6 @@ from pathlib import Path from typing import Any -import numpy as np - from featcopilot import __version__ from featcopilot.transformers.sklearn_compat import AutoFeatureEngineer from featcopilot.utils.logger import get_logger @@ -117,7 +115,7 @@ def _detect_format(path: Path, override: str | None) -> str: return fmt -def _read_table(path: Path, fmt: str): +def _read_table(path: Path, fmt: str, *, nrows: int | None = None): """Read a tabular file into a pandas DataFrame. All user-facing failure modes (missing parquet engine, ``--input`` @@ -126,6 +124,22 @@ def _read_table(path: Path, fmt: str): top-level handler routes them to the deterministic ``exit 2`` user-error path. The generic ``exit 1`` backstop is reserved for truly unexpected (i.e. CLI-internal) errors. + + Parameters + ---------- + path : pathlib.Path + File to read. + fmt : str + One of ``csv`` / ``parquet`` / ``json``. + nrows : int or None, optional + Cap the number of rows returned. For ``csv``, this is propagated + directly to :func:`pandas.read_csv` so the underlying read is + memory-bounded. For ``parquet`` and ``json``, pandas does not + expose a native row limit, so the file is fully read and then + truncated; a :class:`UserWarning` is issued in that case so the + caller knows the bound is post-read (not memory-bounded). The + ``nrows`` cap is applied with a deterministic head slice so + re-runs on the same input produce the same metadata. 
""" import pandas as pd @@ -134,7 +148,11 @@ def _read_table(path: Path, fmt: str): if fmt == "csv": try: - return pd.read_csv(path) + # ``nrows`` is the only memory-bound knob native to read_csv; + # passing it here is what lets ``--explain-sample-size`` actually + # cap memory on huge CSV inputs (rather than loading the entire + # file and then trimming). + return pd.read_csv(path, nrows=nrows) except ( OSError, pd.errors.ParserError, @@ -148,7 +166,7 @@ def _read_table(path: Path, fmt: str): raise ValueError(f"Failed to read CSV from {str(path)!r}: {exc}") from exc if fmt == "parquet": try: - return pd.read_parquet(path) + df = pd.read_parquet(path) except ImportError as exc: raise ValueError( f"Reading parquet requires a parquet engine (pyarrow or fastparquet); " @@ -163,18 +181,42 @@ def _read_table(path: Path, fmt: str): # operation is delegated to a third-party backend; any error # raised is by definition an I/O or data issue, not a CLI bug. raise ValueError(f"Failed to read parquet from {str(path)!r}: {exc}") from exc + if nrows is not None and len(df) > nrows: + warnings.warn( + f"--explain-sample-size cap is applied post-read for parquet " + f"(loaded {len(df)} rows, truncating to {nrows}). pandas " + "does not expose a native parquet row-limit, so the full " + "file is materialized in memory before the cap. For hard " + "memory bounds on huge inputs, convert to CSV first.", + UserWarning, + stacklevel=2, + ) + df = df.iloc[:nrows] + return df if fmt == "json": # ``orient='records'`` is the agent-friendly default; fall back to # pandas' auto-detection when the file isn't a records list. 
try: - return pd.read_json(path, orient="records") + df = pd.read_json(path, orient="records") except ValueError: try: - return pd.read_json(path) + df = pd.read_json(path) except ValueError as exc: raise ValueError(f"Failed to read JSON from {str(path)!r}: {exc}") from exc except OSError as exc: raise ValueError(f"Failed to read JSON from {str(path)!r}: {exc}") from exc + if nrows is not None and len(df) > nrows: + warnings.warn( + f"--explain-sample-size cap is applied post-read for JSON " + f"(loaded {len(df)} rows, truncating to {nrows}). pandas " + "does not expose a native JSON row-limit, so the full " + "file is materialized in memory before the cap. For hard " + "memory bounds on huge inputs, convert to CSV first.", + UserWarning, + stacklevel=2, + ) + df = df.iloc[:nrows] + return df raise ValueError(f"Unsupported input format: {fmt}") @@ -520,7 +562,7 @@ def _fit_transform_capturing_warnings(engineer, X, y, **kwargs): class _ThreadCaptureState: - """Holds per-thread capture *stacks*. + """Holds per-thread capture *stacks* with a single-active-capture fallback. Each thread maps to a stack of capture lists. Nested :func:`_capture_featcopilot_messages` calls on the same thread push @@ -528,6 +570,18 @@ class _ThreadCaptureState: and receives records / warnings until its block exits, at which point the outer capture (if any) becomes active again. + **Worker-thread fallback.** When the calling thread doesn't have a + capture but exactly one capture is active anywhere in the process, + :meth:`get` returns that single capture. This handles the common + case where the capturing thread spawns worker threads (e.g. an LLM + sync client wrapping ``ThreadPoolExecutor`` because it was called + from a process with a running event loop) — those workers' log + records logically belong to the single in-flight CLI run, and + routing them there keeps stderr clean. 
When more than one capture + is active concurrently, the fallback stays disabled (each captures + only its own thread's records) so concurrent CLI calls don't bleed + into each other. + Shared by :class:`_ThreadRoutingHandler` (writes records), :class:`_SuppressCapturingFilter` (suppresses stderr), and the routing ``warnings.showwarning`` override. @@ -550,11 +604,21 @@ def pop(self, tid: int) -> None: del self._per_thread[tid] def get(self, tid: int) -> list[str] | None: - # Brief lock for thread-safe stack-top read. + # Brief lock for thread-safe stack-top read AND single-active- + # capture fallback (both walk ``self._per_thread``). with self._lock: stack = self._per_thread.get(tid) if stack: return stack[-1] + # Worker-thread fallback. Cross-thread records (e.g. from a + # ThreadPoolExecutor worker spawned by the capturing thread) + # are routed to the single active capture when there is no + # ambiguity. Multiple concurrent captures keep their strict + # per-thread isolation. + if len(self._per_thread) == 1: + only_stack = next(iter(self._per_thread.values())) + if only_stack: + return only_stack[-1] return None @@ -860,10 +924,13 @@ def _cmd_explain(args: argparse.Namespace) -> int: raise FileNotFoundError(f"Input file not found: {args.input}") in_fmt = _detect_format(input_path, args.input_format) - df = _read_table(input_path, in_fmt) - X, y = _split_xy(df, args.target) # Apply opt-in sample cap from CLI flag or config (CLI flag wins). + # Resolve and validate it BEFORE reading the input so the cap can be + # threaded into ``_read_table(... nrows=sample_size)`` to bound memory + # on huge inputs (CSV uses ``pd.read_csv(nrows=...)`` natively; + # parquet/JSON fall back to post-read truncation with a UserWarning + # since pandas doesn't expose a native row-limit for those formats). 
sample_size = getattr(args, "explain_sample_size", None) if sample_size is None and args.config is not None: sample_size = _load_config(args.config).get("explain_sample_size") @@ -872,40 +939,30 @@ def _cmd_explain(args: argparse.Namespace) -> int: if sample_size <= 0: raise ValueError(f"`explain_sample_size` must be a positive integer when set; got {sample_size!r}.") - n_sampled = len(X) - engineer = _build_engineer(args, include_selection_config=False) # Run the sample-warning AND ``fit_transform`` inside a single # capture context so the sampling notice ends up in the JSON # payload's ``warnings`` field instead of bleeding onto stderr. with _capture_featcopilot_messages() as captured_warnings: - if sample_size is not None and n_sampled > sample_size: + # Read with ``nrows=sample_size`` so the underlying I/O is + # memory-bounded for CSV; for parquet/JSON the bound is + # post-read with an emitted UserWarning (captured into the + # payload below). Reading FIRST gives us ``len(df)`` so we + # only emit the "metadata may differ" notice when the cap + # actually shortened the input. + df = _read_table(input_path, in_fmt, nrows=sample_size) + X, y = _split_xy(df, args.target) + n_sampled = len(X) + if sample_size is not None and n_sampled >= sample_size: warnings.warn( - f"explain: sampling input down to {sample_size} of {n_sampled} rows. " + f"explain: capping input to {sample_size} rows (sampling). " "Some engines (e.g. TabularEngine categorical encoding) decide which " "features to plan based on row counts and per-category statistics, " "so the reported metadata may differ from a full-input transform run.", UserWarning, stacklevel=2, ) - # Sample by *position* (``.iloc[...]``), not label - # (``.sample(...).index`` + ``.loc[...]``). ``.loc`` selects - # by label, so a non-unique index — common when reading - # parquet files that preserve a saved index — would let - # duplicate labels expand or reorder rows so ``X`` and ``y`` - # no longer line up. 
Positional sampling via a NumPy RNG + - # ``.iloc`` keeps them aligned regardless of input index. - rng_sampler = np.random.default_rng(0) - sample_positions = rng_sampler.choice(n_sampled, size=sample_size, replace=False) - # Sort the positions for determinism / readable output ordering - # (the random selection itself is already deterministic via - # the seeded RNG). - sample_positions.sort() - X = X.iloc[sample_positions] - if y is not None: - y = y.iloc[sample_positions] - n_sampled = sample_size engineer.fit_transform( X, diff --git a/tests/test_cli.py b/tests/test_cli.py index b43a850..6d37277 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -2034,7 +2034,7 @@ def test_explain_caps_input_size_when_sample_size_set(tmp_path: Path): assert payload["n_features"] > 0 # The CLI emits a warning when sampling so callers can detect that # metadata may not match a full-input transform run. - assert any("sampling" in w.lower() for w in payload["warnings"]) + assert any("capping input" in w.lower() or "sampling" in w.lower() for w in payload["warnings"]) def test_explain_sample_size_smaller_than_input_no_op(tmp_path: Path): @@ -2067,7 +2067,7 @@ def test_explain_sample_size_smaller_than_input_no_op(tmp_path: Path): assert rc == 0, err payload = json.loads(out) assert payload["n_rows_used"] == n - assert not any("sampling" in w.lower() for w in payload["warnings"]) + assert not any("capping input" in w.lower() or "sampling" in w.lower() for w in payload["warnings"]) def test_explain_sample_size_via_config(tmp_path: Path): @@ -2402,7 +2402,171 @@ def _patched_fit_transform(self, X, y=None, **kwargs): assert "retry" not in err.lower() -# --------------------------------------------------------------- python -m +# ----------------------- explain --explain-sample-size memory bound + + +def test_explain_sample_size_bounds_csv_read_with_nrows(tmp_path: Path, monkeypatch): + """``--explain-sample-size N`` must propagate to ``pd.read_csv`` as + ``nrows=N`` so the 
underlying read is memory-bounded for huge CSV + inputs (rather than fully loading the file and then trimming). + """ + import pandas as pd + + rng = np.random.default_rng(0) + n = 5000 + df = pd.DataFrame( + { + "x1": rng.normal(size=n), + "x2": rng.normal(size=n), + "y": rng.integers(0, 2, size=n), + } + ) + in_path = tmp_path / "big.csv" + df.to_csv(in_path, index=False) + + real_read_csv = pd.read_csv + captured_kwargs: list[dict] = [] + + def _spy_read_csv(*args, **kwargs): + captured_kwargs.append(kwargs.copy()) + return real_read_csv(*args, **kwargs) + + monkeypatch.setattr(pd, "read_csv", _spy_read_csv, raising=True) + + rc, out, err = _run( + [ + "explain", + "--input", + str(in_path), + "--target", + "y", + "--explain-sample-size", + "200", + ] + ) + assert rc == 0, err + payload = json.loads(out) + assert payload["n_rows_used"] == 200 + # Must have called pd.read_csv with nrows=200, not loaded the whole + # 5000-row file. Multiple calls are OK; at least one must be the + # explain read with nrows. + explain_reads = [k for k in captured_kwargs if k.get("nrows") == 200] + assert explain_reads, f"expected pd.read_csv to be called with nrows=200; got {captured_kwargs!r}" + + +def test_explain_sample_size_warns_post_read_for_parquet(tmp_path: Path): + """For parquet inputs, pandas has no native row-limit, so the bound + is applied post-read. The CLI must surface a warning describing the + limitation so callers know memory isn't strictly bounded. 
+ """ + pytest.importorskip("pyarrow") + rng = np.random.default_rng(0) + n = 4000 + df = pd.DataFrame( + { + "x1": rng.normal(size=n), + "x2": rng.normal(size=n), + "y": rng.integers(0, 2, size=n), + } + ) + in_path = tmp_path / "big.parquet" + df.to_parquet(in_path, index=False) + + rc, out, err = _run( + [ + "explain", + "--input", + str(in_path), + "--target", + "y", + "--explain-sample-size", + "100", + ] + ) + assert rc == 0, err + payload = json.loads(out) + assert payload["n_rows_used"] == 100 + # The post-read truncation notice must appear in the captured warnings. + assert any("post-read" in w.lower() for w in payload["warnings"]) + + +# ----------------------- worker-thread capture fallback + + +def test_capture_routes_worker_thread_records_to_single_active_capture(): + """When exactly one capture is active in the process, log records + emitted on a *different* thread (e.g. a ``ThreadPoolExecutor`` + worker spawned by an LLM sync client) must still be routed to the + single active capture rather than escaping to stderr. + + This is the documented "single-active-capture fallback" of + :class:`_ThreadCaptureState`. + """ + import threading + from concurrent.futures import ThreadPoolExecutor + + fc_logger = logging.getLogger("featcopilot.test_worker") + + def _emit_in_worker(): + fc_logger.warning("from-worker") + return "ok" + + with fc_cli._capture_featcopilot_messages() as captured: + # Caller emits on its own thread (must be captured). + fc_logger.warning("from-caller") + # Spawn a worker thread (different ident) and emit there. + with ThreadPoolExecutor(max_workers=1) as pool: + assert pool.submit(_emit_in_worker).result(timeout=5) == "ok" + # Different non-worker thread also goes through the fallback. 
+ t = threading.Thread(target=_emit_in_worker) + t.start() + t.join() + + assert any("from-caller" in m for m in captured) + # Worker-thread records ARE captured under the single-active-capture + # fallback (the per-thread stack lookup misses, but exactly one + # capture is active, so :meth:`_ThreadCaptureState.get` returns it). + assert sum(1 for m in captured if "from-worker" in m) >= 2 + + +def test_capture_keeps_thread_isolation_with_multiple_active_captures(): + """The single-active-capture fallback must NOT activate when two + threads are concurrently capturing — each must see only its own + thread's records, not records emitted on the other thread's + workers. + """ + import threading + + fc_logger = logging.getLogger("featcopilot.test_dual") + a_captured: list[str] = [] + b_captured: list[str] = [] + barrier = threading.Barrier(2) + inside = threading.Event() + + def worker(tag: str, target: list[str]): + barrier.wait() + with fc_cli._capture_featcopilot_messages() as captured: + inside.set() + for i in range(10): + fc_logger.warning(f"{tag}-{i}") + target.extend(captured) + + t1 = threading.Thread(target=worker, args=("A", a_captured)) + t2 = threading.Thread(target=worker, args=("B", b_captured)) + t1.start() + t2.start() + t1.join() + t2.join() + + # Each capture must contain ONLY its own thread's records (no fallback + # cross-talk because two captures are active). 
+ assert all("A-" in m for m in a_captured) + assert all("B-" in m for m in b_captured) + assert len(a_captured) == 10 + assert len(b_captured) == 10 + + +# ----------------------- python -m def test_dunder_main_module_runs(monkeypatch, capsys): From bfb5da8373221d3dbcc3d7e6af8ac63f17c1acf0 Mon Sep 17 00:00:00 2001 From: Li Jiang Date: Tue, 5 May 2026 08:39:37 +0800 Subject: [PATCH 24/30] fix(cli): address round-19 review feedback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses all three new comments from round-19 reviewers on PR #5 (commit f710fbe): * No false sampling warning at exact-boundary (Codex gZs1 P3, Copilot gmOw — same root cause). The previous `n_sampled >= sample_size` check fired the "metadata may differ" warning even when the input had EXACTLY `sample_size` rows (no truncation actually happened), causing agents to misinterpret an unsampled run as degraded metadata. `_cmd_explain` now reads with `nrows = sample_size + 1` so the returned length is a strict proof of the file size relative to the cap: `len(df) > sample_size` means at least one row was dropped. The warning fires only in that case; the post-read truncation to exactly `sample_size` happens at the same time. CSV memory bound is preserved (`pd.read_csv(nrows=N+1)` still caps at N+1, no full load). Three new tests cover the three boundary cases: - input exactly = cap -> no warning, n_rows_used == cap - input < cap -> no warning, n_rows_used == input size - input = cap + 1 -> warning fires, n_rows_used == cap * --explain-sample-size help text describes actual semantics (Copilot gmPC). The help previously said "deterministic seed", implying a seeded random sample, but the implementation is now a deterministic head slice (`read_csv(nrows=N)` / `df.iloc[:N]`). The help and the docstring both updated to say "deterministic head slice (the first N rows of the input)" and to call out explicitly that this is NOT a random sample. 
New `test_explain_sample_size_help_text_describes_head_slice_not_random_seed` asserts the new wording and a regression guard against the old. Tests: 116 (+4 new) in tests/test_cli.py, 889 passed full suite. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- featcopilot/cli.py | 55 ++++++++++++------ tests/test_cli.py | 137 +++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 170 insertions(+), 22 deletions(-) diff --git a/featcopilot/cli.py b/featcopilot/cli.py index ba87ca9..e396bf5 100644 --- a/featcopilot/cli.py +++ b/featcopilot/cli.py @@ -914,10 +914,17 @@ def _cmd_explain(args: argparse.Namespace) -> int: For very large inputs where the metadata-only nature of ``explain`` really should not pay full memory / compute cost, callers can pass ``--explain-sample-size N`` (or set ``"explain_sample_size": N`` in - ``--config``) to cap the rows fed to the engineer. The CLI emits a - ``UserWarning`` (captured into the JSON payload) noting that the - metadata may differ from a full-input ``transform`` run; the - ``n_rows_used`` field reports the effective sample size. + ``--config``) to cap the rows fed to the engineer. The cap is a + deterministic *head slice* (the first N rows): for CSV the cap is + threaded through ``pd.read_csv(nrows=N)`` so memory is bounded + natively; for parquet/JSON pandas has no native row-limit so the + file is fully read and then truncated, with a UserWarning explaining + the limitation. The cap is NOT a random sample — callers who need + randomness should sample externally before invoking ``explain``. + A "metadata may differ" UserWarning is emitted (captured into the + JSON payload's ``warnings`` field) only when the cap actually + truncated the input. The ``n_rows_used`` field reports the effective + sample size. 
""" input_path = Path(args.input) if not input_path.exists(): @@ -945,18 +952,28 @@ def _cmd_explain(args: argparse.Namespace) -> int: # capture context so the sampling notice ends up in the JSON # payload's ``warnings`` field instead of bleeding onto stderr. with _capture_featcopilot_messages() as captured_warnings: - # Read with ``nrows=sample_size`` so the underlying I/O is - # memory-bounded for CSV; for parquet/JSON the bound is - # post-read with an emitted UserWarning (captured into the - # payload below). Reading FIRST gives us ``len(df)`` so we - # only emit the "metadata may differ" notice when the cap - # actually shortened the input. - df = _read_table(input_path, in_fmt, nrows=sample_size) + # Read with ``nrows=sample_size + 1`` so the underlying I/O is + # memory-bounded for CSV (``pd.read_csv(nrows=...)``) AND we can + # tell from the returned length whether the file actually had + # more rows than the cap. ``len(df) > sample_size`` is a strict + # proof the file was truncated; ``len(df) <= sample_size`` means + # the file fit naturally and no metadata-may-differ warning is + # warranted. For parquet/JSON the bound is post-read with its + # own UserWarning emitted by ``_read_table``. + read_nrows = (sample_size + 1) if sample_size is not None else None + df = _read_table(input_path, in_fmt, nrows=read_nrows) X, y = _split_xy(df, args.target) n_sampled = len(X) - if sample_size is not None and n_sampled >= sample_size: + if sample_size is not None and n_sampled > sample_size: + # Strict proof of truncation: file had at least one more row + # than the requested cap. Trim to the exact cap and emit the + # "metadata may differ" notice. + X = X.iloc[:sample_size] + if y is not None: + y = y.iloc[:sample_size] + n_sampled = sample_size warnings.warn( - f"explain: capping input to {sample_size} rows (sampling). " + f"explain: capping input to {sample_size} rows (head slice). " "Some engines (e.g. 
TabularEngine categorical encoding) decide which " "features to plan based on row counts and per-category statistics, " "so the reported metadata may differ from a full-input transform run.", @@ -1072,10 +1089,14 @@ def _build_parser() -> argparse.ArgumentParser: "--explain-sample-size", type=int, default=None, - help="Cap the input fed to the engineer at this many rows (deterministic seed). " - "OFF by default: the full input is used so the metadata is a faithful description " - "of what a corresponding `transform` would generate. Pass a positive integer ONLY " - "when you knowingly accept that some engines (e.g. TabularEngine categorical " + help="Cap the input fed to the engineer at this many rows. The cap is " + "applied as a deterministic head slice (the first N rows of the input — " + "for CSV via `pd.read_csv(nrows=N)` so memory is bounded; for parquet/JSON " + "the file is fully read and then truncated, with a warning). OFF by default: " + "the full input is used so the metadata is a faithful description of what a " + "corresponding `transform` would generate. Pass a positive integer ONLY when " + "you knowingly accept that (a) the analyzed rows are the first N (not a " + "random sample), and (b) some engines (e.g. TabularEngine categorical " "encoding) decide which features to plan based on row counts and per-category " "statistics, so the reported metadata may differ from a full-input run.", ) diff --git a/tests/test_cli.py b/tests/test_cli.py index 6d37277..3a2f224 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -2447,11 +2447,13 @@ def _spy_read_csv(*args, **kwargs): assert rc == 0, err payload = json.loads(out) assert payload["n_rows_used"] == 200 - # Must have called pd.read_csv with nrows=200, not loaded the whole - # 5000-row file. Multiple calls are OK; at least one must be the - # explain read with nrows. 
- explain_reads = [k for k in captured_kwargs if k.get("nrows") == 200] - assert explain_reads, f"expected pd.read_csv to be called with nrows=200; got {captured_kwargs!r}" + # Must have called pd.read_csv with nrows=201 (sample_size + 1, the + # CLI requests one extra row so it can detect whether the input was + # actually larger than the cap and only emit the metadata-may-differ + # warning when truncation really happened). The full 5000-row file + # is never loaded. + explain_reads = [k for k in captured_kwargs if k.get("nrows") == 201] + assert explain_reads, f"expected pd.read_csv to be called with nrows=201; got {captured_kwargs!r}" def test_explain_sample_size_warns_post_read_for_parquet(tmp_path: Path): @@ -2566,6 +2568,131 @@ def worker(tag: str, target: list[str]): assert len(b_captured) == 10 +# ----------------------- explain --explain-sample-size warning hygiene + + +def test_explain_no_sampling_warning_when_input_fits_exactly(tmp_path: Path): + """When the input has exactly ``--explain-sample-size`` rows, no + truncation actually happens, so the "metadata may differ" warning + must NOT fire. The success payload was previously inaccurate when + the warning fired on the boundary case. + """ + rng = np.random.default_rng(0) + n = 200 # exactly the sample-size we'll request + df = pd.DataFrame( + { + "x1": rng.normal(size=n), + "x2": rng.normal(size=n), + "y": rng.integers(0, 2, size=n), + } + ) + in_path = tmp_path / "exact.csv" + df.to_csv(in_path, index=False) + + rc, out, err = _run( + [ + "explain", + "--input", + str(in_path), + "--target", + "y", + "--explain-sample-size", + "200", + ] + ) + assert rc == 0, err + payload = json.loads(out) + assert payload["n_rows_used"] == 200 + # No "metadata may differ" warning — input fit naturally. 
+ assert not any("capping input" in w.lower() or "metadata may differ" in w.lower() for w in payload["warnings"]) + + +def test_explain_no_sampling_warning_when_input_smaller_than_sample(tmp_path: Path): + """When the input has fewer rows than ``--explain-sample-size``, + obviously no truncation happens. Belt-and-suspenders coverage of + the "<= cap, no warning" branch. + """ + rng = np.random.default_rng(0) + n = 50 + df = pd.DataFrame( + { + "x1": rng.normal(size=n), + "x2": rng.normal(size=n), + "y": rng.integers(0, 2, size=n), + } + ) + in_path = tmp_path / "small.csv" + df.to_csv(in_path, index=False) + + rc, out, err = _run( + [ + "explain", + "--input", + str(in_path), + "--target", + "y", + "--explain-sample-size", + "200", + ] + ) + assert rc == 0, err + payload = json.loads(out) + assert payload["n_rows_used"] == n + assert not any("capping input" in w.lower() or "metadata may differ" in w.lower() for w in payload["warnings"]) + + +def test_explain_sampling_warning_fires_when_input_strictly_larger(tmp_path: Path): + """Strict proof of truncation: input has at least one MORE row than + the cap. The warning must fire, and the payload must report + ``n_rows_used == sample_size``. 
+ """ + rng = np.random.default_rng(0) + n = 201 # exactly one more than the cap + df = pd.DataFrame( + { + "x1": rng.normal(size=n), + "x2": rng.normal(size=n), + "y": rng.integers(0, 2, size=n), + } + ) + in_path = tmp_path / "barely_over.csv" + df.to_csv(in_path, index=False) + + rc, out, err = _run( + [ + "explain", + "--input", + str(in_path), + "--target", + "y", + "--explain-sample-size", + "200", + ] + ) + assert rc == 0, err + payload = json.loads(out) + assert payload["n_rows_used"] == 200 + assert any("capping input" in w.lower() for w in payload["warnings"]) + + +def test_explain_sample_size_help_text_describes_head_slice_not_random_seed(): + """The ``--explain-sample-size`` help text must accurately describe + the actual semantics (deterministic head slice, NOT a seeded random + sample). Guards against misleading users / agents who would expect + an unbiased sample. + """ + parser = fc_cli._build_parser() + explain_parser = next( + action.choices["explain"] for action in parser._actions if isinstance(action, argparse._SubParsersAction) + ) + sample_help = next(a.help for a in explain_parser._actions if "--explain-sample-size" in a.option_strings) + # Must accurately describe the implementation. + assert "head slice" in sample_help.lower() or "first n" in sample_help.lower() + # Must NOT use the misleading old phrasing. + assert "deterministic seed" not in sample_help.lower() + assert "random sample" not in sample_help.lower() or "not a random sample" in sample_help.lower() + + # ----------------------- python -m From 167b49081062b4ff07246501ca964a0026c9948a Mon Sep 17 00:00:00 2001 From: Li Jiang Date: Tue, 5 May 2026 09:14:11 +0800 Subject: [PATCH 25/30] fix(cli): address round-20 review feedback (Codex P2) Addresses the Codex P2 comment on PR #5 (commit bfb5da8): * Reject empty / header-only inputs with exit 2 (Codex gtYr). When a CSV has headers but zero data rows, `pd.read_csv` returns an empty DataFrame rather than raising `EmptyDataError`. 
Same for an empty JSON array (`[]`) or a parquet file with schema but zero rows. The CLI then passed that frame into `TabularEngine`, which divides by `len(X)` while fitting categorical encoding and the command exited via the generic exit-1 `unexpected error` path instead of a clean user-input error. `_read_table` now performs an explicit `df.empty` check after every supported format read and raises `ValueError("Input file ... is empty (zero data rows). Feature engineering requires at least one row of data.")` -> exit 2 with a precise stderr message. Four new tests cover the new branch: - header-only CSV (transform) - empty JSON array (transform) - parquet with schema but zero rows (transform) - header-only CSV (explain) Tests: 120 (+4 new) in tests/test_cli.py, 893 passed full suite. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- featcopilot/cli.py | 26 ++++++++++---- tests/test_cli.py | 87 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 107 insertions(+), 6 deletions(-) diff --git a/featcopilot/cli.py b/featcopilot/cli.py index e396bf5..3425605 100644 --- a/featcopilot/cli.py +++ b/featcopilot/cli.py @@ -152,7 +152,7 @@ def _read_table(path: Path, fmt: str, *, nrows: int | None = None): # passing it here is what lets ``--explain-sample-size`` actually # cap memory on huge CSV inputs (rather than loading the entire # file and then trimming). - return pd.read_csv(path, nrows=nrows) + df = pd.read_csv(path, nrows=nrows) except ( OSError, pd.errors.ParserError, @@ -164,7 +164,7 @@ def _read_table(path: Path, fmt: str, *, nrows: int | None = None): # "unexpected error" path instead of the documented exit-2 # user-input error. 
raise ValueError(f"Failed to read CSV from {str(path)!r}: {exc}") from exc - if fmt == "parquet": + elif fmt == "parquet": try: df = pd.read_parquet(path) except ImportError as exc: @@ -192,8 +192,7 @@ def _read_table(path: Path, fmt: str, *, nrows: int | None = None): stacklevel=2, ) df = df.iloc[:nrows] - return df - if fmt == "json": + elif fmt == "json": # ``orient='records'`` is the agent-friendly default; fall back to # pandas' auto-detection when the file isn't a records list. try: @@ -216,8 +215,23 @@ def _read_table(path: Path, fmt: str, *, nrows: int | None = None): stacklevel=2, ) df = df.iloc[:nrows] - return df - raise ValueError(f"Unsupported input format: {fmt}") + else: + raise ValueError(f"Unsupported input format: {fmt}") + + # Reject "header-only" / empty inputs across every supported format. + # ``pd.read_csv`` returns an empty DataFrame (no exception) when the + # CSV has headers but zero data rows; the same goes for an empty + # parquet file or ``[]`` JSON body. Without this check, the CLI + # would pass an empty frame into ``TabularEngine``, which divides by + # ``len(X)`` while fitting categorical encoding and exits via the + # generic ``unexpected error`` path. Surface the issue as a clean + # exit-2 user-input error. + if df.empty: + raise ValueError( + f"Input file {str(path)!r} is empty (zero data rows). " + "Feature engineering requires at least one row of data." + ) + return df def _write_table(df, path: Path, fmt: str) -> None: diff --git a/tests/test_cli.py b/tests/test_cli.py index 3a2f224..d1e044f 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -904,6 +904,93 @@ def test_headerless_csv_input_returns_exit_2(tmp_path: Path): assert "failed to read csv" in err.lower() +def test_header_only_csv_input_returns_exit_2(tmp_path: Path): + """A CSV that has a header line but ZERO data rows is read by pandas + as an *empty* DataFrame (no exception). 
Without the explicit empty + check, the CLI would feed it into ``TabularEngine`` which divides by + ``len(X)`` and exits via the generic exit-1 backstop. The CLI must + surface this as a clean exit-2 user-input error. + """ + in_path = tmp_path / "header_only.csv" + in_path.write_text("x1,x2,y\n") # header but no data + + rc, _, err = _run( + [ + "transform", + "--input", + str(in_path), + "--output", + str(tmp_path / "out.csv"), + "--target", + "y", + ] + ) + assert rc == 2 + assert "empty" in err.lower() + assert "zero data rows" in err.lower() + + +def test_empty_json_input_returns_exit_2(tmp_path: Path): + """An empty JSON array is parsed as an empty DataFrame and must be + rejected up front like header-only CSV. + """ + in_path = tmp_path / "empty.json" + in_path.write_text("[]") + + rc, _, err = _run( + [ + "transform", + "--input", + str(in_path), + "--output", + str(tmp_path / "out.csv"), + "--target", + "y", + ] + ) + assert rc == 2 + assert "empty" in err.lower() + + +def test_empty_parquet_input_returns_exit_2(tmp_path: Path): + """A parquet file with schema but zero rows is rejected up front.""" + pytest.importorskip("pyarrow") + in_path = tmp_path / "empty.parquet" + pd.DataFrame({"x1": [], "x2": [], "y": []}).to_parquet(in_path, index=False) + + rc, _, err = _run( + [ + "transform", + "--input", + str(in_path), + "--output", + str(tmp_path / "out.csv"), + "--target", + "y", + ] + ) + assert rc == 2 + assert "empty" in err.lower() + + +def test_explain_header_only_csv_returns_exit_2(tmp_path: Path): + """The empty-input check is applied to ``explain`` too.""" + in_path = tmp_path / "header_only.csv" + in_path.write_text("x1,x2,y\n") + + rc, _, err = _run( + [ + "explain", + "--input", + str(in_path), + "--target", + "y", + ] + ) + assert rc == 2 + assert "empty" in err.lower() + + def test_transform_include_target_collision_returns_exit_2(tmp_path: Path): """``--include-target`` would silently overwrite an engineered feature if it happens to share the 
target column's name. The CLI must detect From 016fe34d4f35c49f5a4f8bd3ac577834b19ae871 Mon Sep 17 00:00:00 2001 From: Li Jiang Date: Tue, 5 May 2026 12:21:54 +0800 Subject: [PATCH 26/30] fix(cli): address round-22 review feedback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses all three new comments from copilot-pull-request-reviewer on PR #5 (commits 167b490 + 5cbb843): * Drop the over-broad single-active-capture fallback (Copilot hFmA). Round-18's `_ThreadCaptureState.get` fallback routed records from ANY thread to the single in-flight CLI capture whenever exactly one capture was active. As Copilot pointed out, that was too broad: unrelated background work using `featcopilot` in the same process would have its log/warning output silently swallowed into the active CLI command's payload, with no ownership tie back to the command. `get` is now strictly per-thread; cross-thread records (e.g. ThreadPoolExecutor workers) bleed onto stderr like any other background log. The class docstring documents the trade-off and the existing options for callers who really need every worker record captured. The previous worker-routing test was inverted into `test_capture_does_not_route_unrelated_thread_records`, which guards against the regression. * Structured single-line argparse error format (Copilot hFmP). `argparse`'s default `error()` writes the multi-line usage banner (`usage: featcopilot ...`) PLUS `prog: error: ...` to stderr before raising `SystemExit`. `main()` was only converting the exit code; the banner still appeared on stderr, breaking the documented "stderr carries one `featcopilot: error: ...` line per failure" contract. New `_StructuredArgumentParser` overrides `error()` to emit exactly one `featcopilot: error: ...` line with no banner, and is wired into both the top-level parser and all subparsers via `parser_class=_StructuredArgumentParser`.
Two new tests cover the contract: single-line stderr for an unknown flag and for a missing subcommand, plus a regression guard against the `usage:` substring appearing in stderr. * Eliminate double truncation warning for parquet/JSON sample (Copilot iH8R). `_cmd_explain` was passing `nrows=sample_size+1` to `_read_table` so it could detect truncation. For parquet/JSON `_read_table` then emitted its own post-read warning saying "truncating to N+1" and trimmed to that, after which `_cmd_explain` trimmed again to N and emitted a second warning. Two near-duplicate warnings with off-by-one numbers — confusing to agents. The fix: - `_read_table` now accepts `suppress_truncation_warning=True` so callers that emit their own consolidated message can silence its post-read notice. - `_cmd_explain` only passes `nrows` to `_read_table` when pandas can natively bound the read (CSV with `nrows=N+1`). For parquet/JSON it reads with `nrows=None` (full file) and handles both detection and trimming itself. - The single `_cmd_explain` UserWarning now uses the user-facing `sample_size` value, AND for parquet/JSON includes the "memory wasn't bounded" caveat as a single sentence — same information, no duplication. Test `test_explain_sample_size_warns_post_read_for_parquet` asserts exactly one truncation notice and that it references `100 rows` (not the internal +1). Tests: 122 (+2 net) in tests/test_cli.py, 895 passed full suite. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- featcopilot/cli.py | 189 ++++++++++++++++++++++++++++++--------------- tests/test_cli.py | 100 ++++++++++++++++++------ 2 files changed, 201 insertions(+), 88 deletions(-) diff --git a/featcopilot/cli.py b/featcopilot/cli.py index 3425605..eea1b76 100644 --- a/featcopilot/cli.py +++ b/featcopilot/cli.py @@ -115,7 +115,7 @@ def _detect_format(path: Path, override: str | None) -> str: return fmt -def _read_table(path: Path, fmt: str, *, nrows: int | None = None): +def _read_table(path: Path, fmt: str, *, nrows: int | None = None, suppress_truncation_warning: bool = False): """Read a tabular file into a pandas DataFrame. All user-facing failure modes (missing parquet engine, ``--input`` @@ -136,10 +136,17 @@ def _read_table(path: Path, fmt: str, *, nrows: int | None = None): directly to :func:`pandas.read_csv` so the underlying read is memory-bounded. For ``parquet`` and ``json``, pandas does not expose a native row limit, so the file is fully read and then - truncated; a :class:`UserWarning` is issued in that case so the - caller knows the bound is post-read (not memory-bounded). The - ``nrows`` cap is applied with a deterministic head slice so - re-runs on the same input produce the same metadata. + truncated; a :class:`UserWarning` is issued in that case (unless + ``suppress_truncation_warning`` is true) so the caller knows the + bound is post-read (not memory-bounded). The ``nrows`` cap is + applied with a deterministic head slice so re-runs on the same + input produce the same metadata. + suppress_truncation_warning : bool, optional + When True, the post-read truncation notice (parquet / JSON only) + is *not* emitted from this helper. Used by callers that emit + their own consolidated, user-facing warning so users don't see + a confusing pair of messages — see ``_cmd_explain``'s + ``--explain-sample-size`` handling. 
""" import pandas as pd @@ -182,15 +189,16 @@ def _read_table(path: Path, fmt: str, *, nrows: int | None = None): # raised is by definition an I/O or data issue, not a CLI bug. raise ValueError(f"Failed to read parquet from {str(path)!r}: {exc}") from exc if nrows is not None and len(df) > nrows: - warnings.warn( - f"--explain-sample-size cap is applied post-read for parquet " - f"(loaded {len(df)} rows, truncating to {nrows}). pandas " - "does not expose a native parquet row-limit, so the full " - "file is materialized in memory before the cap. For hard " - "memory bounds on huge inputs, convert to CSV first.", - UserWarning, - stacklevel=2, - ) + if not suppress_truncation_warning: + warnings.warn( + f"--explain-sample-size cap is applied post-read for parquet " + f"(loaded {len(df)} rows, truncating to {nrows}). pandas " + "does not expose a native parquet row-limit, so the full " + "file is materialized in memory before the cap. For hard " + "memory bounds on huge inputs, convert to CSV first.", + UserWarning, + stacklevel=2, + ) df = df.iloc[:nrows] elif fmt == "json": # ``orient='records'`` is the agent-friendly default; fall back to @@ -205,15 +213,16 @@ def _read_table(path: Path, fmt: str, *, nrows: int | None = None): except OSError as exc: raise ValueError(f"Failed to read JSON from {str(path)!r}: {exc}") from exc if nrows is not None and len(df) > nrows: - warnings.warn( - f"--explain-sample-size cap is applied post-read for JSON " - f"(loaded {len(df)} rows, truncating to {nrows}). pandas " - "does not expose a native JSON row-limit, so the full " - "file is materialized in memory before the cap. For hard " - "memory bounds on huge inputs, convert to CSV first.", - UserWarning, - stacklevel=2, - ) + if not suppress_truncation_warning: + warnings.warn( + f"--explain-sample-size cap is applied post-read for JSON " + f"(loaded {len(df)} rows, truncating to {nrows}). 
pandas " + "does not expose a native JSON row-limit, so the full " + "file is materialized in memory before the cap. For hard " + "memory bounds on huge inputs, convert to CSV first.", + UserWarning, + stacklevel=2, + ) df = df.iloc[:nrows] else: raise ValueError(f"Unsupported input format: {fmt}") @@ -576,7 +585,7 @@ def _fit_transform_capturing_warnings(engineer, X, y, **kwargs): class _ThreadCaptureState: - """Holds per-thread capture *stacks* with a single-active-capture fallback. + """Holds per-thread capture *stacks* with strict per-thread isolation. Each thread maps to a stack of capture lists. Nested :func:`_capture_featcopilot_messages` calls on the same thread push @@ -584,17 +593,22 @@ class _ThreadCaptureState: and receives records / warnings until its block exits, at which point the outer capture (if any) becomes active again. - **Worker-thread fallback.** When the calling thread doesn't have a - capture but exactly one capture is active anywhere in the process, - :meth:`get` returns that single capture. This handles the common - case where the capturing thread spawns worker threads (e.g. an LLM - sync client wrapping ``ThreadPoolExecutor`` because it was called - from a process with a running event loop) — those workers' log - records logically belong to the single in-flight CLI run, and - routing them there keeps stderr clean. When more than one capture - is active concurrently, the fallback stays disabled (each captures - only its own thread's records) so concurrent CLI calls don't bleed - into each other. + **Strict thread isolation.** :meth:`get` returns a target list ONLY + for the calling thread itself. Records emitted on threads other + than the one that opened a capture (e.g. a worker spawned by an + LLM sync client wrapping ``ThreadPoolExecutor``) flow through the + normal handler chain and reach stderr — same as records emitted + by unrelated background work that happens to use ``featcopilot`` + in the same process. 
This is intentional: a previous "single- + active-capture fallback" was too broad — when a single CLI run + was active, *any* featcopilot log on any thread would have been + silently swallowed into that command's payload, including + unrelated background work, causing misattribution. Strict per- + thread routing avoids that ambiguity at the cost of letting some + worker-thread records bleed onto stderr; callers who need every + last log captured should make sure their worker code explicitly + propagates the calling thread's identity (e.g. via + ``contextvars`` or a dedicated logging wrapper). Shared by :class:`_ThreadRoutingHandler` (writes records), :class:`_SuppressCapturingFilter` (suppresses stderr), and the @@ -618,21 +632,14 @@ def pop(self, tid: int) -> None: del self._per_thread[tid] def get(self, tid: int) -> list[str] | None: - # Brief lock for thread-safe stack-top read AND single-active- - # capture fallback (both walk ``self._per_thread``). + # Strict per-thread lookup. No cross-thread fallback (see class + # docstring): the previous "single-active-capture" fallback + # was too broad and could silently swallow unrelated + # background log output into a CLI run's payload. with self._lock: stack = self._per_thread.get(tid) if stack: return stack[-1] - # Worker-thread fallback. Cross-thread records (e.g. from a - # ThreadPoolExecutor worker spawned by the capturing thread) - # are routed to the single active capture when there is no - # ambiguity. Multiple concurrent captures keep their strict - # per-thread isolation. - if len(self._per_thread) == 1: - only_stack = next(iter(self._per_thread.values())) - if only_stack: - return only_stack[-1] return None @@ -966,16 +973,30 @@ def _cmd_explain(args: argparse.Namespace) -> int: # capture context so the sampling notice ends up in the JSON # payload's ``warnings`` field instead of bleeding onto stderr. 
with _capture_featcopilot_messages() as captured_warnings: - # Read with ``nrows=sample_size + 1`` so the underlying I/O is - # memory-bounded for CSV (``pd.read_csv(nrows=...)``) AND we can - # tell from the returned length whether the file actually had - # more rows than the cap. ``len(df) > sample_size`` is a strict - # proof the file was truncated; ``len(df) <= sample_size`` means - # the file fit naturally and no metadata-may-differ warning is - # warranted. For parquet/JSON the bound is post-read with its - # own UserWarning emitted by ``_read_table``. - read_nrows = (sample_size + 1) if sample_size is not None else None - df = _read_table(input_path, in_fmt, nrows=read_nrows) + # Choose a read strategy that gives us: + # 1. a memory bound where pandas allows it (CSV ``nrows``), and + # 2. enough information to detect truncation without ``_read_table`` + # emitting its own (slightly off-by-one) warning that would + # then double up with ours. + # + # For CSV we ask for ``sample_size + 1`` rows: ``pd.read_csv`` + # reads at most that many, AND ``len(df) > sample_size`` becomes + # a strict proof of truncation. We pass ``suppress_truncation_warning`` + # so ``_read_table`` doesn't emit its own message — ``_cmd_explain`` + # is the single source of truth for the sampling notice and uses + # the user-facing ``sample_size`` value. + # + # For parquet/JSON pandas exposes no native row-limit, so we + # always read fully (``nrows=None``) and let ``_cmd_explain`` + # both detect truncation and emit a single notice that includes + # the "memory bound is post-read" caveat. Asking ``_read_table`` + # for a limit there would only have caused it to truncate at + # ``sample_size + 1`` and emit a confusing duplicate warning. 
+ if sample_size is not None and in_fmt == "csv": + read_nrows: int | None = sample_size + 1 + else: + read_nrows = None + df = _read_table(input_path, in_fmt, nrows=read_nrows, suppress_truncation_warning=True) X, y = _split_xy(df, args.target) n_sampled = len(X) if sample_size is not None and n_sampled > sample_size: @@ -985,15 +1006,25 @@ def _cmd_explain(args: argparse.Namespace) -> int: X = X.iloc[:sample_size] if y is not None: y = y.iloc[:sample_size] + original_len = n_sampled n_sampled = sample_size - warnings.warn( - f"explain: capping input to {sample_size} rows (head slice). " + msg = f"explain: capping input to {sample_size} rows (head slice). " + if in_fmt != "csv": + # For parquet/JSON we read the whole file before truncation + # (no native row-limit). Surface that fact so callers know + # memory wasn't bounded. + msg += ( + f"For {in_fmt}, pandas does not expose a native row-limit, " + f"so the full file ({original_len}+ rows) was loaded into " + "memory before truncation. For hard memory bounds on huge " + "inputs, convert to CSV first. " + ) + msg += ( "Some engines (e.g. TabularEngine categorical encoding) decide which " "features to plan based on row counts and per-category statistics, " - "so the reported metadata may differ from a full-input transform run.", - UserWarning, - stacklevel=2, + "so the reported metadata may differ from a full-input transform run." ) + warnings.warn(msg, UserWarning, stacklevel=2) engineer.fit_transform( X, @@ -1030,8 +1061,35 @@ def _cmd_explain(args: argparse.Namespace) -> int: return 0 +class _StructuredArgumentParser(argparse.ArgumentParser): + """``argparse.ArgumentParser`` that emits the CLI's structured single-line + error format on usage failures. + + The default ``argparse`` ``error()`` method writes the multi-line + usage banner ("usage: featcopilot ...\\n featcopilot: error: ...") to + stderr before raising :class:`SystemExit`. 
That breaks the CLI + contract that stderr carries exactly one ``featcopilot: error: ...`` + line per failure — agents parsing stderr deterministically would see + the banner and the actual error mixed together, with no easy way to + tell which is which. + + This subclass overrides :meth:`error` to write a single line and + skip the banner, so usage failures (missing required argument, + unknown flag, missing subcommand, etc.) follow the same single-line + contract as the rest of the CLI's exit-2 paths. + """ + + def error(self, message: str) -> None: # type: ignore[override] + sys.stderr.write(f"featcopilot: error: {message}\n") + # ``ArgumentParser.error`` is documented to terminate; ``SystemExit(2)`` + # is what the parent class would do after writing the banner. + # ``main()``'s ``except SystemExit`` handler converts this to an int + # return value so callers still see the documented exit-2 contract. + raise SystemExit(2) + + def _build_parser() -> argparse.ArgumentParser: - parser = argparse.ArgumentParser( + parser = _StructuredArgumentParser( prog="featcopilot", description=( "FeatCopilot CLI — automated feature engineering from the command line. " @@ -1045,7 +1103,12 @@ def _build_parser() -> argparse.ArgumentParser: action="version", version=f"featcopilot {__version__}", ) - subparsers = parser.add_subparsers(dest="command", required=True, metavar="COMMAND") + # Use the structured parser class for subparsers too so any + # subcommand-specific usage error (unknown flag, missing required + # arg) follows the same single-line stderr contract. 
+ subparsers = parser.add_subparsers( + dest="command", required=True, metavar="COMMAND", parser_class=_StructuredArgumentParser + ) # ----- info --------------------------------------------------------- p_info = subparsers.add_parser( diff --git a/tests/test_cli.py b/tests/test_cli.py index d1e044f..aea1b13 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -2015,6 +2015,45 @@ def test_unknown_flag_returns_exit_2(capsys): assert rc == 2 +def test_argparse_usage_error_emits_single_structured_line(tmp_path: Path, tabular_csv: Path): + """``argparse`` defaults to writing a multi-line usage banner before its + error message, mixing two pieces of information on stderr that agents + must then parse apart. The CLI's ``_StructuredArgumentParser`` collapses + those into the single canonical ``featcopilot: error: {message}`` line + so usage failures match the rest of the exit-2 contract. + """ + rc, _, err = _run( + [ + "transform", + "--input", + str(tabular_csv), + "--output", + str(tmp_path / "out.csv"), + "--target", + "y", + "--no-such-flag", # genuine unknown flag (not a missing-required) + ] + ) + assert rc == 2 + err_lines = [line for line in err.splitlines() if line.strip()] + # Exactly one non-empty stderr line. + assert len(err_lines) == 1, f"Expected single-line stderr, got {err_lines!r}" + assert err_lines[0].startswith("featcopilot: error: ") + # No multi-line ``argparse`` usage banner. + assert "usage:" not in err.lower() + # Still mentions the offending flag. 
+ assert "--no-such-flag" in err + + +def test_argparse_missing_subcommand_emits_single_structured_line(): + rc, _, err = _run([]) + assert rc == 2 + err_lines = [line for line in err.splitlines() if line.strip()] + assert len(err_lines) == 1, f"Expected single-line stderr, got {err_lines!r}" + assert err_lines[0].startswith("featcopilot: error: ") + assert "usage:" not in err.lower() + + def test_help_flag_returns_zero(capsys): rc = fc_cli.main(["--help"]) assert rc == 0 @@ -2546,7 +2585,9 @@ def _spy_read_csv(*args, **kwargs): def test_explain_sample_size_warns_post_read_for_parquet(tmp_path: Path): """For parquet inputs, pandas has no native row-limit, so the bound is applied post-read. The CLI must surface a warning describing the - limitation so callers know memory isn't strictly bounded. + limitation so callers know memory isn't strictly bounded. The + warning is emitted by ``_cmd_explain`` itself (not duplicated by + ``_read_table``) so the user sees one accurate message. """ pytest.importorskip("pyarrow") rng = np.random.default_rng(0) @@ -2576,46 +2617,55 @@ def test_explain_sample_size_warns_post_read_for_parquet(tmp_path: Path): payload = json.loads(out) assert payload["n_rows_used"] == 100 # The post-read truncation notice must appear in the captured warnings. - assert any("post-read" in w.lower() for w in payload["warnings"]) + # The unified message says: "For parquet, pandas does not expose a + # native row-limit, so the full file ... was loaded into memory before + # truncation." + captured = " ".join(payload["warnings"]).lower() + assert "native row-limit" in captured or "post-read" in captured or "memory before truncation" in captured + # The user-facing message uses the actual sample_size (100), NOT the + # internal +1 read size, AND there is exactly one truncation notice. 
+ assert "100 rows" in " ".join(payload["warnings"]) + truncation_msgs = [w for w in payload["warnings"] if "truncat" in w.lower() or "capping" in w.lower()] + assert len(truncation_msgs) == 1, f"expected exactly one truncation notice, got {truncation_msgs!r}" -# ----------------------- worker-thread capture fallback +# ----------------------- strict per-thread capture isolation -def test_capture_routes_worker_thread_records_to_single_active_capture(): - """When exactly one capture is active in the process, log records - emitted on a *different* thread (e.g. a ``ThreadPoolExecutor`` - worker spawned by an LLM sync client) must still be routed to the - single active capture rather than escaping to stderr. +def test_capture_does_not_route_unrelated_thread_records(): + """The capture layer must use STRICT per-thread routing: records + emitted on threads other than the one that opened a capture flow + through the normal handler chain (and reach stderr) — they are + NOT silently rolled into the single in-flight CLI run's payload. - This is the documented "single-active-capture fallback" of - :class:`_ThreadCaptureState`. + A previous "single-active-capture fallback" was too broad: when a + single CLI run was active, *any* featcopilot log on any thread + would have been swallowed into that command's payload, including + unrelated background work, causing misattribution. This test + guards against that regression. """ import threading - from concurrent.futures import ThreadPoolExecutor - - fc_logger = logging.getLogger("featcopilot.test_worker") - def _emit_in_worker(): - fc_logger.warning("from-worker") - return "ok" + fc_logger = logging.getLogger("featcopilot.test_unrelated") with fc_cli._capture_featcopilot_messages() as captured: # Caller emits on its own thread (must be captured). fc_logger.warning("from-caller") - # Spawn a worker thread (different ident) and emit there. 
- with ThreadPoolExecutor(max_workers=1) as pool: - assert pool.submit(_emit_in_worker).result(timeout=5) == "ok" - # Different non-worker thread also goes through the fallback. - t = threading.Thread(target=_emit_in_worker) + + # Spawn a separate, unrelated thread that ALSO emits via the + # featcopilot logger. With the over-broad fallback removed, that + # record must NOT appear in this capture's payload. + def _emit_elsewhere(): + fc_logger.warning("from-other-thread") + + t = threading.Thread(target=_emit_elsewhere) t.start() t.join() assert any("from-caller" in m for m in captured) - # Worker-thread records ARE captured under the single-active-capture - # fallback (the per-thread stack lookup misses, but exactly one - # capture is active, so :meth:`_ThreadCaptureState.get` returns it). - assert sum(1 for m in captured if "from-worker" in m) >= 2 + # Strict per-thread isolation: unrelated thread's record is NOT in + # this capture's payload. + assert not any("from-other-thread" in m for m in captured) def test_capture_keeps_thread_isolation_with_multiple_active_captures(): From 298d4def60b26b8e79dd8fa68fac6b96cebd0d4c Mon Sep 17 00:00:00 2001 From: Li Jiang Date: Tue, 5 May 2026 20:06:22 +0800 Subject: [PATCH 27/30] fix(cli): address round-23 review feedback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses the new comment from copilot-pull-request-reviewer on PR #5 (commit 016fe34): * Restore narrow LLM-only cross-thread fallback (Copilot osJO). Round-22 removed the single-active-capture fallback in response to Copilot's complaint that it was too broad (would misattribute unrelated background `featcopilot` work). 
Round-23 Copilot now flags that the strict per-thread routing breaks the "stderr is reserved for failures" contract for LLM-backed runs in event-loop environments — the sync LLM clients in `featcopilot/llm/*_client.py` fall back to `ThreadPoolExecutor` there, and their mock-mode startup warnings emit from worker threads that `submit()` spawns. Resolution: a *narrow* fallback that satisfies both reviewers. `_ThreadCaptureState` now exposes `get_for_llm_record(tid, logger_name)`: it returns the calling thread's capture if any, otherwise — and only when the record originates from `featcopilot.llm.*` AND exactly one capture is active in the process — it routes to that single capture. Multiple concurrent captures keep strict isolation (no LLM cross-talk either). Records from non-LLM featcopilot loggers on cross-threads still flow to stderr as before, so unrelated background work is NOT misattributed. Both `_ThreadRoutingHandler.emit` and `_SuppressCapturingFilter.filter` go through `get_for_llm_record`, so the routing handler and the stderr- suppression filter stay in lockstep: anything captured is also suppressed from the original handlers. Three tests cover the policy: - `test_capture_does_not_route_unrelated_thread_records`: non-LLM record from a cross-thread is NOT captured (regression guard against round-22's "too broad" complaint). - `test_capture_routes_llm_client_worker_records_to_single_active_capture`: LLM record from ThreadPoolExecutor + raw threading.Thread workers IS captured. - `test_capture_does_not_apply_llm_fallback_with_multiple_captures`: even for LLM records, two concurrent captures stay strictly isolated. Tests: 124 (+2 net) in tests/test_cli.py, 897 passed full suite. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- featcopilot/cli.py | 112 +++++++++++++++++++++++++++++++-------------- tests/test_cli.py | 103 +++++++++++++++++++++++++++++++++++++---- 2 files changed, 171 insertions(+), 44 deletions(-) diff --git a/featcopilot/cli.py b/featcopilot/cli.py index eea1b76..ea48ce3 100644 --- a/featcopilot/cli.py +++ b/featcopilot/cli.py @@ -585,7 +585,8 @@ def _fit_transform_capturing_warnings(engineer, X, y, **kwargs): class _ThreadCaptureState: - """Holds per-thread capture *stacks* with strict per-thread isolation. + """Holds per-thread capture *stacks* with strict per-thread isolation + and a *narrow* cross-thread fallback for LLM-client log records. Each thread maps to a stack of capture lists. Nested :func:`_capture_featcopilot_messages` calls on the same thread push @@ -593,28 +594,35 @@ class _ThreadCaptureState: and receives records / warnings until its block exits, at which point the outer capture (if any) becomes active again. - **Strict thread isolation.** :meth:`get` returns a target list ONLY - for the calling thread itself. Records emitted on threads other - than the one that opened a capture (e.g. a worker spawned by an - LLM sync client wrapping ``ThreadPoolExecutor``) flow through the - normal handler chain and reach stderr — same as records emitted - by unrelated background work that happens to use ``featcopilot`` - in the same process. This is intentional: a previous "single- - active-capture fallback" was too broad — when a single CLI run - was active, *any* featcopilot log on any thread would have been - silently swallowed into that command's payload, including - unrelated background work, causing misattribution. Strict per- - thread routing avoids that ambiguity at the cost of letting some - worker-thread records bleed onto stderr; callers who need every - last log captured should make sure their worker code explicitly - propagates the calling thread's identity (e.g. 
via - ``contextvars`` or a dedicated logging wrapper). + **Strict thread isolation by default.** :meth:`get` returns a target + list ONLY for the calling thread itself. This avoids the + misattribution that an unconditional single-active-capture fallback + would cause: any featcopilot log on any thread would otherwise be + silently swallowed into the active CLI run's payload, including + output from unrelated background work happening in the same process. + + **Narrow LLM-client fallback.** :meth:`get_for_llm_record` is the + one exception: when a record originates from + ``featcopilot.llm.*_client`` (the sync LLM clients that fall back to + ``ThreadPoolExecutor`` in event-loop environments), and exactly one + capture is active in the process, the record is routed to that + capture even when emitted on a worker thread. This addresses the + common case where an LLM client's mock-mode startup warning fires + on a worker that ``submit()`` spawned and would otherwise bleed + onto stderr; the targeted whitelist keeps unrelated background + featcopilot work from being misattributed. Shared by :class:`_ThreadRoutingHandler` (writes records), :class:`_SuppressCapturingFilter` (suppresses stderr), and the routing ``warnings.showwarning`` override. """ + # Logger-name prefixes whose records are eligible for the narrow + # cross-thread fallback. Only the LLM client modules whose sync + # entry points fall back to ``ThreadPoolExecutor`` in event-loop + # environments are listed. + _LLM_FALLBACK_LOGGER_PREFIXES = ("featcopilot.llm.",) + def __init__(self): self._per_thread: dict[int, list[list[str]]] = {} self._lock = threading.Lock() @@ -632,14 +640,42 @@ def pop(self, tid: int) -> None: del self._per_thread[tid] def get(self, tid: int) -> list[str] | None: - # Strict per-thread lookup. 
No cross-thread fallback (see class - # docstring): the previous "single-active-capture" fallback - # was too broad and could silently swallow unrelated - # background log output into a CLI run's payload. + # Strict per-thread lookup. No cross-thread fallback: an + # unconditional fallback was too broad and could silently + # swallow unrelated background log output into a CLI run's + # payload. The narrow LLM-client fallback lives in + # :meth:`get_for_llm_record` instead, opted into by name. + with self._lock: + stack = self._per_thread.get(tid) + if stack: + return stack[-1] + return None + + def get_for_llm_record(self, tid: int, logger_name: str) -> list[str] | None: + """Per-thread lookup with a narrow cross-thread fallback for LLM + client records. + + When the calling thread has its own active capture, that's used. + Otherwise — and only when the record originates from a + whitelisted ``featcopilot.llm.*`` logger AND exactly one capture + is active in the process — the record is routed to that single + capture so it doesn't bleed onto stderr. The whitelist keeps + unrelated background featcopilot work strictly isolated. + """ with self._lock: stack = self._per_thread.get(tid) if stack: return stack[-1] + if not logger_name.startswith(self._LLM_FALLBACK_LOGGER_PREFIXES): + return None + if len(self._per_thread) != 1: + # Either no captures are active (nothing to route to) or + # multiple are active (ambiguous — keep strict isolation + # so concurrent CLI calls don't cross-contaminate). + return None + only_stack = next(iter(self._per_thread.values())) + if only_stack: + return only_stack[-1] return None @@ -649,9 +685,13 @@ class _ThreadRoutingHandler(logging.Handler): Attached once to the ``featcopilot`` root logger. Records propagated from any ``featcopilot.*`` child logger reach this handler in the same way they reach the existing stderr handler. 
If the calling thread has - a registered capture list, the record is appended to it; otherwise the - handler does nothing (the existing stderr handler is what produces the - user-facing output for non-capturing threads). + a registered capture list, the record is appended to it. + Otherwise, for records originating from a ``featcopilot.llm.*`` + logger AND when exactly one capture is active in the process, the + record is routed to that capture (the narrow LLM-client cross-thread + fallback — see :class:`_ThreadCaptureState`). Records from any + other thread / logger combination flow through to the existing + stderr handler. """ def __init__(self, state: _ThreadCaptureState): @@ -660,7 +700,7 @@ def __init__(self, state: _ThreadCaptureState): self.setFormatter(logging.Formatter("%(levelname)s: %(message)s")) def emit(self, record: logging.LogRecord) -> None: - target = self._state.get(threading.get_ident()) + target = self._state.get_for_llm_record(threading.get_ident(), record.name) if target is None: return try: @@ -670,14 +710,18 @@ def emit(self, record: logging.LogRecord) -> None: class _SuppressCapturingFilter(logging.Filter): - """Filter for the *existing* handlers: drops records from capturing threads. - - Without this filter, every record emitted by a capturing thread would - still hit the featcopilot root logger's stderr ``StreamHandler`` and - bleed onto stderr — breaking the CLI's "stderr reserved for failures" - contract. The filter checks ``threading.get_ident()`` against the - shared :class:`_ThreadCaptureState` so non-capturing threads continue - to see normal stderr output. + """Filter for the *existing* handlers: drops records being captured. + + Without this filter, every record routed by + :class:`_ThreadRoutingHandler` to a capture list would still hit the + featcopilot root logger's stderr ``StreamHandler`` and bleed onto + stderr — breaking the CLI's "stderr reserved for failures" contract. 
+ The filter mirrors the routing handler's policy so the two stay in + lockstep: anything captured (current-thread record OR cross-thread + LLM-client record under the narrow fallback) is also suppressed + from the original handlers; anything else (records from + non-capturing threads / unrelated background work) flows through to + stderr unchanged. """ def __init__(self, state: _ThreadCaptureState): @@ -685,7 +729,7 @@ def __init__(self, state: _ThreadCaptureState): self._state = state def filter(self, record: logging.LogRecord) -> bool: - return self._state.get(threading.get_ident()) is None + return self._state.get_for_llm_record(threading.get_ident(), record.name) is None # Module-level singletons. Installed exactly once on the featcopilot root diff --git a/tests/test_cli.py b/tests/test_cli.py index aea1b13..80171d0 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -2633,16 +2633,18 @@ def test_explain_sample_size_warns_post_read_for_parquet(tmp_path: Path): def test_capture_does_not_route_unrelated_thread_records(): - """The capture layer must use STRICT per-thread routing: records - emitted on threads other than the one that opened a capture flow - through the normal handler chain (and reach stderr) — they are - NOT silently rolled into the single in-flight CLI run's payload. + """The capture layer must use STRICT per-thread routing for non-LLM + records: records emitted on threads other than the one that opened + a capture flow through the normal handler chain (and reach stderr) + — they are NOT silently rolled into the single in-flight CLI run's + payload. A previous "single-active-capture fallback" was too broad: when a single CLI run was active, *any* featcopilot log on any thread would have been swallowed into that command's payload, including unrelated background work, causing misattribution. This test - guards against that regression. + guards against that regression for the non-LLM case (the narrow + LLM-only fallback is covered separately). 
""" import threading @@ -2652,9 +2654,10 @@ def test_capture_does_not_route_unrelated_thread_records(): # Caller emits on its own thread (must be captured). fc_logger.warning("from-caller") - # Spawn a separate, unrelated thread that ALSO emits via the - # featcopilot logger. With the over-broad fallback removed, that - # record must NOT appear in this capture's payload. + # Spawn a separate, unrelated thread that ALSO emits via a + # NON-LLM featcopilot logger. With strict per-thread isolation + # for non-LLM records, that record must NOT appear in this + # capture's payload. def _emit_elsewhere(): fc_logger.warning("from-other-thread") @@ -2663,11 +2666,91 @@ def _emit_elsewhere(): t.join() assert any("from-caller" in m for m in captured) - # Strict per-thread isolation: unrelated thread's record is NOT in - # this capture's payload. + # Strict per-thread isolation for non-LLM records: unrelated thread's + # record is NOT in this capture's payload. assert not any("from-other-thread" in m for m in captured) +def test_capture_routes_llm_client_worker_records_to_single_active_capture(): + """The narrow LLM-client fallback: when a record originates from a + ``featcopilot.llm.*_client`` logger and exactly one capture is + active, the record is routed to that capture even when emitted + from a worker thread. + + This addresses the common case where an LLM sync client wrapping + ``ThreadPoolExecutor`` (the fallback used in event-loop + environments) emits a mock-mode startup warning on a worker thread + that ``submit()`` spawned. Without the narrow fallback, that + warning would bleed onto stderr on a successful run. + """ + import threading + from concurrent.futures import ThreadPoolExecutor + + llm_logger = logging.getLogger("featcopilot.llm.test_client") + + def _emit_llm_in_worker(): + llm_logger.warning("llm-mock-mode-startup") + return "ok" + + with fc_cli._capture_featcopilot_messages() as captured: + # Caller emits its own LLM record (current-thread path). 
+ llm_logger.warning("llm-from-caller") + # ThreadPoolExecutor worker emits an LLM record (cross-thread, + # but the narrow LLM-only fallback should route it). + with ThreadPoolExecutor(max_workers=1) as pool: + assert pool.submit(_emit_llm_in_worker).result(timeout=5) == "ok" + # A raw threading.Thread emits an LLM record too. + t = threading.Thread(target=_emit_llm_in_worker) + t.start() + t.join() + + # Caller's record + 2 worker records (one from pool, one from thread) + # are all in the capture. + assert any("llm-from-caller" in m for m in captured) + assert sum(1 for m in captured if "llm-mock-mode-startup" in m) >= 2 + + +def test_capture_does_not_apply_llm_fallback_with_multiple_captures(): + """When two captures are concurrently active, the narrow LLM + fallback stays disabled — strict per-thread isolation is preserved + so concurrent CLI calls don't cross-contaminate, even for LLM + records. + """ + import threading + from concurrent.futures import ThreadPoolExecutor + + llm_logger = logging.getLogger("featcopilot.llm.test_dual") + a_captured: list[str] = [] + b_captured: list[str] = [] + barrier = threading.Barrier(2) + + def worker(tag: str, target: list[str]): + barrier.wait() + with fc_cli._capture_featcopilot_messages() as captured: + llm_logger.warning(f"{tag}-direct") + with ThreadPoolExecutor(max_workers=1) as pool: + # Submit a worker that emits a record. With two + # captures active, the narrow fallback must NOT + # activate (it would be ambiguous which capture + # "owns" the worker's record). + pool.submit(lambda t=tag: llm_logger.warning(f"{t}-worker")).result(timeout=5) + target.extend(captured) + + t1 = threading.Thread(target=worker, args=("A", a_captured)) + t2 = threading.Thread(target=worker, args=("B", b_captured)) + t1.start() + t2.start() + t1.join() + t2.join() + + # Each capture sees its own direct record (current-thread path). 
+ assert any("A-direct" in m for m in a_captured) + assert any("B-direct" in m for m in b_captured) + # The worker record is NOT in either capture (fallback disabled). + assert not any("worker" in m for m in a_captured) + assert not any("worker" in m for m in b_captured) + + def test_capture_keeps_thread_isolation_with_multiple_active_captures(): """The single-active-capture fallback must NOT activate when two threads are concurrently capturing — each must see only its own From ffa2b3a669b1181c367301be75e1e4625744f9b5 Mon Sep 17 00:00:00 2001 From: Li Jiang Date: Tue, 5 May 2026 20:20:31 +0800 Subject: [PATCH 28/30] fix(test): make multi-capture LLM-fallback test race-proof MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Round-23's `test_capture_does_not_apply_llm_fallback_with_multiple_captures` passed locally but failed in CI on Python 3.10/3.12/3.13. Root cause: a race between the two threads. Without explicit barriers, one thread's `ThreadPoolExecutor` worker could fire BEFORE the other thread had pushed its capture onto `_state._per_thread`. At that moment `len(_per_thread) == 1`, so the narrow LLM fallback (correctly) routed the worker's record to the only active capture — which is what the test was checking the multi-capture scenario does NOT do. Replaced the single `Barrier(2)` with a three-phase protocol: - `enter_barrier`: both threads start at the same time - `inside_barrier`: BOTH have pushed their capture (so any subsequent emit sees `len == 2`) - `done_barrier`: BOTH workers have completed before EITHER exits its capture (pins `len == 2` for the entire worker window) The contract under test — multi-capture isolation — is unchanged; the test now reliably exercises the multi-capture branch of `_ThreadCaptureState.get_for_llm_record`. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- tests/test_cli.py | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/tests/test_cli.py b/tests/test_cli.py index 80171d0..e9c9d55 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -2722,18 +2722,29 @@ def test_capture_does_not_apply_llm_fallback_with_multiple_captures(): llm_logger = logging.getLogger("featcopilot.llm.test_dual") a_captured: list[str] = [] b_captured: list[str] = [] - barrier = threading.Barrier(2) + enter_barrier = threading.Barrier(2) + inside_barrier = threading.Barrier(2) + done_barrier = threading.Barrier(2) def worker(tag: str, target: list[str]): - barrier.wait() + # Phase 0: both threads start at roughly the same time. + enter_barrier.wait() with fc_cli._capture_featcopilot_messages() as captured: llm_logger.warning(f"{tag}-direct") + # Phase 1: BOTH threads have entered their captures, so + # ``_state._per_thread`` has TWO entries when either thread's + # worker fires below — that's the multi-capture scenario the + # narrow LLM fallback must skip. Without this barrier the + # threads race: one thread's worker can fire before the other + # has pushed its capture, making ``len == 1`` and (incorrectly + # for this test's intent) tripping the fallback. + inside_barrier.wait() with ThreadPoolExecutor(max_workers=1) as pool: - # Submit a worker that emits a record. With two - # captures active, the narrow fallback must NOT - # activate (it would be ambiguous which capture - # "owns" the worker's record). pool.submit(lambda t=tag: llm_logger.warning(f"{t}-worker")).result(timeout=5) + # Phase 2: BOTH threads' workers have completed before either + # exits its capture. This pins ``len == 2`` for the entire + # worker-emit window. 
+ done_barrier.wait() target.extend(captured) t1 = threading.Thread(target=worker, args=("A", a_captured)) @@ -2746,7 +2757,8 @@ def worker(tag: str, target: list[str]): # Each capture sees its own direct record (current-thread path). assert any("A-direct" in m for m in a_captured) assert any("B-direct" in m for m in b_captured) - # The worker record is NOT in either capture (fallback disabled). + # The worker record is NOT in either capture (fallback disabled + # because two captures were active during the worker emit). assert not any("worker" in m for m in a_captured) assert not any("worker" in m for m in b_captured) From 100500989b54578d25b11e326b385ac8a03139ab Mon Sep 17 00:00:00 2001 From: Li Jiang Date: Tue, 5 May 2026 20:43:53 +0800 Subject: [PATCH 29/30] fix(cli): atomicize capture decision and tighten LLM-fallback whitelist MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Round-24 reviewer feedback addressed: 1. **Codex P1 — atomic routing/suppression decision.** `_SuppressCapturingFilter.filter()` and `_ThreadRoutingHandler.emit()` each independently called `state.get_for_llm_record(...)`. Under concurrent CLI calls, another thread could push or pop a capture between filter and emit so the two phases saw different state — the same record could end up both captured AND on stderr (or suppressed without being captured). Replaced both call sites with `state.resolve_for_record(record)`, which computes the decision once and caches it on the record itself; the two phases now always see the same answer. 2. **Copilot — narrow LLM whitelist was too broad.** `_LLM_FALLBACK_LOGGER_PREFIXES = ("featcopilot.llm.",)` matched every `featcopilot.llm.*` logger, including modules that never hop onto worker threads (`semantic_engine`, `code_generator`, `transform_rule_generator`, `explainer`). 
Replaced with an *exact* set `_LLM_FALLBACK_LOGGER_NAMES` containing only the three sync-client modules that actually fall back to `ThreadPoolExecutor` (`copilot_client`, `litellm_client`, `openai_client`). 3. **Copilot — `_run` did not actually capture handler output.** `featcopilot/utils/logger.py` installs `StreamHandler(sys.stderr)` at import time, so `redirect_stderr` only swapped the module attribute while the handler kept writing to the *original* stream. That made the file's stderr-cleanliness assertions vacuous. The `_run` helper now also temporarily re-points every `StreamHandler` on the `featcopilot` logger at the captured `err` buffer for the duration of the call, restoring the original `stream` attribute in `finally`. New tests: - `test_capture_does_not_apply_llm_fallback_for_non_whitelisted_llm_loggers` pins the exact-set whitelist (semantic_engine et al. stay strictly isolated). - `test_capture_decision_is_cached_per_record_for_atomic_filter_emit` proves the resolver computes the decision exactly once per record. - `test_capture_decision_stable_under_concurrent_pop_between_filter_and_emit` proves the cached decision survives a concurrent pop. - `test_run_helper_redirects_featcopilot_stream_handlers` proves the test helper captures handler-stream writes (so leaks would be detectable, not silently passing). Existing test `test_capture_routes_llm_client_worker_records_to_single_active_capture` updated to use a whitelisted logger name (`openai_client`) since arbitrary `featcopilot.llm.*` names no longer trigger the fallback. 901 tests pass locally (full suite); 128 in tests/test_cli.py. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- featcopilot/cli.py | 114 ++++++++++++++++++----- tests/test_cli.py | 224 +++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 308 insertions(+), 30 deletions(-) diff --git a/featcopilot/cli.py b/featcopilot/cli.py index ea48ce3..ceec11e 100644 --- a/featcopilot/cli.py +++ b/featcopilot/cli.py @@ -602,26 +602,54 @@ class _ThreadCaptureState: output from unrelated background work happening in the same process. **Narrow LLM-client fallback.** :meth:`get_for_llm_record` is the - one exception: when a record originates from - ``featcopilot.llm.*_client`` (the sync LLM clients that fall back to - ``ThreadPoolExecutor`` in event-loop environments), and exactly one - capture is active in the process, the record is routed to that - capture even when emitted on a worker thread. This addresses the - common case where an LLM client's mock-mode startup warning fires - on a worker that ``submit()`` spawned and would otherwise bleed - onto stderr; the targeted whitelist keeps unrelated background - featcopilot work from being misattributed. + one exception: when a record originates from one of the *exact* + sync LLM client modules whitelisted in + :attr:`_LLM_FALLBACK_LOGGER_NAMES` (the modules whose synchronous + entry points fall back to ``ThreadPoolExecutor`` in event-loop + environments), and exactly one capture is active in the process, + the record is routed to that capture even when emitted on a worker + thread. This addresses the common case where an LLM client's + mock-mode startup warning fires on a worker that ``submit()`` + spawned and would otherwise bleed onto stderr. The whitelist is an + explicit set of module names — *not* a prefix — so unrelated + ``featcopilot.llm.*`` loggers (e.g. ``semantic_engine``, + ``code_generator``, ``transform_rule_generator``, ``explainer``) + that never run on worker threads do not trigger the fallback. 
+ + **Atomic per-record decision.** :meth:`resolve_for_record` caches + the lookup on the record itself the first time it is called, so + :class:`_ThreadRoutingHandler` (which routes records) and + :class:`_SuppressCapturingFilter` (which suppresses them from + stderr) cannot disagree under concurrent push/pop activity from + other threads. Without that caching, ``filter`` could see one + state and the later ``emit`` could see another, breaking the + "captured XOR on stderr" invariant. Shared by :class:`_ThreadRoutingHandler` (writes records), :class:`_SuppressCapturingFilter` (suppresses stderr), and the routing ``warnings.showwarning`` override. """ - # Logger-name prefixes whose records are eligible for the narrow - # cross-thread fallback. Only the LLM client modules whose sync - # entry points fall back to ``ThreadPoolExecutor`` in event-loop - # environments are listed. - _LLM_FALLBACK_LOGGER_PREFIXES = ("featcopilot.llm.",) + # Logger names whose records are eligible for the narrow cross-thread + # fallback. Only the synchronous LLM client modules whose ``run`` / + # batch entry points fall back to ``ThreadPoolExecutor`` workers in + # event-loop environments are listed; their startup / mock-mode + # warnings legitimately fire from those worker threads. Other + # ``featcopilot.llm.*`` loggers are intentionally NOT included so + # cross-thread records from unrelated background work cannot be + # silently swallowed into an active CLI capture. + _LLM_FALLBACK_LOGGER_NAMES = frozenset( + { + "featcopilot.llm.copilot_client", + "featcopilot.llm.litellm_client", + "featcopilot.llm.openai_client", + } + ) + + # Sentinel for "no cached decision yet" on a log record. Distinct + # from ``None``, which is itself a valid resolved decision meaning + # "no capture target — let the record flow to stderr". 
+ _UNCACHED = object() def __init__(self): self._per_thread: dict[int, list[list[str]]] = {} @@ -657,16 +685,25 @@ def get_for_llm_record(self, tid: int, logger_name: str) -> list[str] | None: When the calling thread has its own active capture, that's used. Otherwise — and only when the record originates from a - whitelisted ``featcopilot.llm.*`` logger AND exactly one capture - is active in the process — the record is routed to that single - capture so it doesn't bleed onto stderr. The whitelist keeps - unrelated background featcopilot work strictly isolated. + whitelisted sync LLM client module + (:attr:`_LLM_FALLBACK_LOGGER_NAMES`) AND exactly one capture is + active in the process — the record is routed to that single + capture so it doesn't bleed onto stderr. The whitelist is an + explicit set of module names so unrelated ``featcopilot.llm.*`` + loggers (e.g. ``semantic_engine``, ``code_generator``) that + never hop onto worker threads do not trigger the fallback. + + This method takes the lock once and returns a snapshot + decision; callers should generally use :meth:`resolve_for_record` + to additionally cache the result on the record so that paired + filter / emit calls can never disagree under concurrent + push/pop on other threads. """ with self._lock: stack = self._per_thread.get(tid) if stack: return stack[-1] - if not logger_name.startswith(self._LLM_FALLBACK_LOGGER_PREFIXES): + if logger_name not in self._LLM_FALLBACK_LOGGER_NAMES: return None if len(self._per_thread) != 1: # Either no captures are active (nothing to route to) or @@ -678,6 +715,29 @@ def get_for_llm_record(self, tid: int, logger_name: str) -> list[str] | None: return only_stack[-1] return None + def resolve_for_record(self, record: logging.LogRecord) -> list[str] | None: + """Resolve the capture target for ``record`` exactly once. 
+ + The first call computes the decision via + :meth:`get_for_llm_record` and caches it on the record itself + as ``record._featcopilot_capture_target``; subsequent calls + return the cached value. This is what makes the routing + handler's ``emit`` and the suppression filter's ``filter`` + atomic with respect to each other: they always see the same + decision for a given record even if another thread pops or + pushes a capture between the two calls. + + Logging records are produced and dispatched to handlers in the + same thread, so caching directly on the record is safe — there + is no concurrent reader of the same record's attributes. + """ + cached = getattr(record, "_featcopilot_capture_target", self._UNCACHED) + if cached is not self._UNCACHED: + return cached + target = self.get_for_llm_record(threading.get_ident(), record.name) + record._featcopilot_capture_target = target + return target + class _ThreadRoutingHandler(logging.Handler): """Logging handler that routes records to the calling thread's capture list. @@ -700,7 +760,11 @@ def __init__(self, state: _ThreadCaptureState): self.setFormatter(logging.Formatter("%(levelname)s: %(message)s")) def emit(self, record: logging.LogRecord) -> None: - target = self._state.get_for_llm_record(threading.get_ident(), record.name) + # Use the cached resolver so this handler and the paired + # ``_SuppressCapturingFilter`` always see the same decision + # for a given record, even if another thread pushes or pops + # a capture between the filter and emit phases. 
+ target = self._state.resolve_for_record(record) if target is None: return try: @@ -729,7 +793,15 @@ def __init__(self, state: _ThreadCaptureState): self._state = state def filter(self, record: logging.LogRecord) -> bool: - return self._state.get_for_llm_record(threading.get_ident(), record.name) is None + # Use the cached resolver so this filter and the paired + # ``_ThreadRoutingHandler`` always see the same decision for a + # given record, even if another thread pushes or pops a capture + # between this call and the routing handler's emit. Without the + # cache, the two could disagree and the same record could end + # up both captured and on stderr (or suppressed without being + # captured) — breaking the "stderr reserved for failures" + # contract under concurrent CLI calls. + return self._state.resolve_for_record(record) is None # Module-level singletons. Installed exactly once on the featcopilot root diff --git a/tests/test_cli.py b/tests/test_cli.py index e9c9d55..84928a1 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -7,6 +7,7 @@ import json import logging import sys +import threading import warnings from contextlib import redirect_stderr, redirect_stdout from pathlib import Path @@ -20,10 +21,34 @@ def _run(argv: list[str]) -> tuple[int, str, str]: - """Invoke ``cli.main(argv)`` and capture exit code, stdout, stderr.""" + """Invoke ``cli.main(argv)`` and capture exit code, stdout, stderr. + + The featcopilot logger installs a ``StreamHandler(sys.stderr)`` at + import time, which holds a reference to the *original* ``sys.stderr`` + object. ``redirect_stderr`` only swaps the ``sys.stderr`` module + attribute, so without also redirecting the handler's ``stream`` any + log output the suppression filter doesn't catch would still go to + the real terminal — leaving every ``err == ""`` assertion in this + file vacuously satisfied even in the presence of a leak. 
This helper + therefore both redirects ``sys.stderr`` AND temporarily re-points + every ``StreamHandler`` on the ``featcopilot`` root logger at the + same ``err`` buffer for the duration of the call, so the captured + ``err`` value reflects what would actually have been written to the + user's terminal. + """ out, err = io.StringIO(), io.StringIO() - with redirect_stdout(out), redirect_stderr(err): - rc = fc_cli.main(argv) + fc_logger = logging.getLogger("featcopilot") + saved_streams: list[tuple[logging.StreamHandler, object]] = [] + for handler in list(fc_logger.handlers): + if isinstance(handler, logging.StreamHandler): + saved_streams.append((handler, handler.stream)) + handler.stream = err + try: + with redirect_stdout(out), redirect_stderr(err): + rc = fc_cli.main(argv) + finally: + for handler, original_stream in saved_streams: + handler.stream = original_stream return rc, out.getvalue(), err.getvalue() @@ -2672,10 +2697,11 @@ def _emit_elsewhere(): def test_capture_routes_llm_client_worker_records_to_single_active_capture(): - """The narrow LLM-client fallback: when a record originates from a - ``featcopilot.llm.*_client`` logger and exactly one capture is - active, the record is routed to that capture even when emitted - from a worker thread. + """The narrow LLM-client fallback: when a record originates from one + of the *whitelisted* sync LLM client modules + (``featcopilot.llm.copilot_client`` / ``litellm_client`` / + ``openai_client``) and exactly one capture is active, the record + is routed to that capture even when emitted from a worker thread. 
This addresses the common case where an LLM sync client wrapping ``ThreadPoolExecutor`` (the fallback used in event-loop @@ -2686,7 +2712,10 @@ def test_capture_routes_llm_client_worker_records_to_single_active_capture(): import threading from concurrent.futures import ThreadPoolExecutor - llm_logger = logging.getLogger("featcopilot.llm.test_client") + # Use an actual whitelisted sync-client module name; an arbitrary + # ``featcopilot.llm.*`` name (e.g. ``test_client``) is intentionally + # NOT eligible — see ``test_capture_does_not_apply_llm_fallback_for_non_whitelisted_llm_loggers``. + llm_logger = logging.getLogger("featcopilot.llm.openai_client") def _emit_llm_in_worker(): llm_logger.warning("llm-mock-mode-startup") @@ -2710,6 +2739,44 @@ def _emit_llm_in_worker(): assert sum(1 for m in captured if "llm-mock-mode-startup" in m) >= 2 +def test_capture_does_not_apply_llm_fallback_for_non_whitelisted_llm_loggers(): + """The narrow LLM-client fallback whitelist is an *exact* set of + sync-client module names — NOT a ``featcopilot.llm.*`` prefix. + Other ``featcopilot.llm.*`` loggers (e.g. ``semantic_engine``, + ``code_generator``, ``transform_rule_generator``, ``explainer``) + must keep strict per-thread isolation, so cross-thread records + from unrelated background work cannot be silently swallowed into + an active CLI capture. 
+ """ + import threading + + non_whitelisted = [ + "featcopilot.llm.semantic_engine", + "featcopilot.llm.code_generator", + "featcopilot.llm.transform_rule_generator", + "featcopilot.llm.explainer", + "featcopilot.llm.test_dummy", # arbitrary subname — must NOT match + ] + captured_lists: list[list[str]] = [] + + for name in non_whitelisted: + other_logger = logging.getLogger(name) + + def _emit_in_other_thread(logger=other_logger, tag=name): + logger.warning(f"{tag}-from-other-thread") + + with fc_cli._capture_featcopilot_messages() as captured: + t = threading.Thread(target=_emit_in_other_thread) + t.start() + t.join() + captured_lists.append(list(captured)) + + for name, captured in zip(non_whitelisted, captured_lists, strict=True): + assert not any( + f"{name}-from-other-thread" in m for m in captured + ), f"Non-whitelisted LLM logger {name} unexpectedly tripped the cross-thread fallback" + + def test_capture_does_not_apply_llm_fallback_with_multiple_captures(): """When two captures are concurrently active, the narrow LLM fallback stays disabled — strict per-thread isolation is preserved @@ -2719,7 +2786,7 @@ def test_capture_does_not_apply_llm_fallback_with_multiple_captures(): import threading from concurrent.futures import ThreadPoolExecutor - llm_logger = logging.getLogger("featcopilot.llm.test_dual") + llm_logger = logging.getLogger("featcopilot.llm.openai_client") a_captured: list[str] = [] b_captured: list[str] = [] enter_barrier = threading.Barrier(2) @@ -2800,6 +2867,145 @@ def worker(tag: str, target: list[str]): assert len(b_captured) == 10 +def test_capture_decision_is_cached_per_record_for_atomic_filter_emit(): + """The capture state must resolve each record's routing decision + *exactly once*, then cache the outcome on the record itself, so the + suppression filter and the routing handler always see the same + answer for that record. 
Otherwise a concurrent push/pop on another + thread could land between the filter (computed at handler-1 phase) + and the emit (computed at handler-2 phase), making the same record + both captured and emitted to stderr (or suppressed without being + captured) — breaking the CLI contract. + """ + state = fc_cli._ThreadCaptureState() + + class _CountingState: + """Wrap state so we can count ``get_for_llm_record`` calls.""" + + def __init__(self, inner): + self._inner = inner + self.calls: list[tuple[int, str]] = [] + + # Forward the attributes ``resolve_for_record`` reads. + @property + def _UNCACHED(self): + return self._inner._UNCACHED + + def get_for_llm_record(self, tid, name): + self.calls.append((tid, name)) + return self._inner.get_for_llm_record(tid, name) + + # Re-bind ``resolve_for_record`` so calls are counted via the + # wrapped ``get_for_llm_record``. + def resolve_for_record(self, record): + return fc_cli._ThreadCaptureState.resolve_for_record(self, record) + + counted = _CountingState(state) + + record = logging.LogRecord( + name="featcopilot.llm.openai_client", + level=logging.WARNING, + pathname=__file__, + lineno=1, + msg="hello", + args=(), + exc_info=None, + ) + + # First call computes and caches; subsequent calls must not hit + # ``get_for_llm_record`` again. + first = counted.resolve_for_record(record) + second = counted.resolve_for_record(record) + third = counted.resolve_for_record(record) + + assert first is second is third + assert len(counted.calls) == 1, ( + "resolve_for_record must compute the decision exactly once per record; " + f"saw {len(counted.calls)} get_for_llm_record calls" + ) + + # The cached attribute is set on the record itself. 
+ assert hasattr(record, "_featcopilot_capture_target") + + +def test_capture_decision_stable_under_concurrent_pop_between_filter_and_emit(): + """Regression test for the atomic filter/emit invariant: even if a + concurrent thread pops its capture between the moment a record is + filtered and the moment it is emitted, both phases see the SAME + decision because it was resolved and cached on the record once. + """ + state = fc_cli._ThreadCaptureState() + cap_a: list[str] = [] + state.push(threading.get_ident() ^ 1, cap_a) # foreign-thread capture + try: + record = logging.LogRecord( + name="featcopilot.llm.copilot_client", + level=logging.WARNING, + pathname=__file__, + lineno=1, + msg="hi", + args=(), + exc_info=None, + ) + # Phase 1: "filter" computes and caches ("len(_per_thread)==1" + # so the LLM fallback returns ``cap_a``). + first = state.resolve_for_record(record) + assert first is cap_a + + # Concurrent pop: another thread tears its capture down. State + # would now produce a *different* answer for a fresh lookup. + state.pop(threading.get_ident() ^ 1) + fresh_lookup = state.get_for_llm_record(threading.get_ident(), record.name) + assert fresh_lookup is None # state has indeed changed + + # Phase 2: "emit" must still see the same decision via the cache. + second = state.resolve_for_record(record) + assert second is cap_a, ( + "After a concurrent pop, resolve_for_record must still return the " + "originally cached decision so filter and emit cannot disagree" + ) + finally: + # Clean up any stragglers. + state.pop(threading.get_ident() ^ 1) + + +def test_run_helper_redirects_featcopilot_stream_handlers(monkeypatch): + """Regression test for the test helper itself: ``_run`` must + redirect every ``logging.StreamHandler`` on the ``featcopilot`` + root logger so that any handler write that escapes the suppression + filter (the contract-violation scenario) lands in the captured + ``err`` buffer, NOT on the real terminal. 
+ + Without this redirect, every ``err == ""`` assertion in this file + would be vacuously satisfied because the ``StreamHandler`` installed + at import time holds a reference to the *original* ``sys.stderr`` + object and ``redirect_stderr`` only swaps the module attribute. + """ + fc_logger = logging.getLogger("featcopilot") + stream_handlers = [h for h in fc_logger.handlers if isinstance(h, logging.StreamHandler)] + assert stream_handlers, "featcopilot logger must have at least one StreamHandler" + + # Stub ``cli.main`` to write directly through the StreamHandler's + # current ``stream`` attribute (which ``_run`` should have re-pointed + # at the captured ``err`` buffer). + def fake_main(argv): + for h in fc_logger.handlers: + if isinstance(h, logging.StreamHandler): + h.stream.write("HANDLER_LEAK_LINE\n") + h.stream.flush() + return 0 + + monkeypatch.setattr(fc_cli, "main", fake_main) + rc, _out, err = _run(["info"]) + assert rc == 0 + assert "HANDLER_LEAK_LINE" in err, ( + "_run must redirect featcopilot StreamHandler streams; otherwise stderr-cleanliness " "assertions are vacuous" + ) + # And the original stream is restored after the call. + for h in stream_handlers: + assert h.stream is sys.stderr or h.stream is sys.__stderr__ + + # ----------------------- explain --explain-sample-size warning hygiene From 99fe807624fc775d287fc79948cb101ba2cdaa50 Mon Sep 17 00:00:00 2001 From: Li Jiang Date: Tue, 5 May 2026 23:14:38 +0800 Subject: [PATCH 30/30] fix(cli): widen warning capture to read+write phases; split empty-input cases MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Round-25 reviewer feedback addressed: 1. **Copilot — empty-input branch conflated zero-row and zero-column.** `_read_table` raised the same "zero data rows" error for both `len(df) == 0` and `len(df.columns) == 0` (both make `DataFrame.empty` return `True`). 
A JSON `[{}, {}]` array produces a frame WITH rows but NO columns — a very different user error than a header-only CSV. Split into two distinct, accurately worded error messages: "no columns" vs "zero data rows". 2. **Copilot — read-time warnings leaked to stderr.** `_cmd_transform` only opened the capture context around `engineer.fit_transform`. `pd.read_csv` can legitimately emit `DtypeWarning` on mixed-type CSVs and parquet/JSON readers can emit pyarrow / pandas warnings on a successful read; those were bypassing the capture and bleeding to stderr, breaking the "stderr reserved for failures" contract. 3. **Copilot — write-time warnings also leaked to stderr.** `_write_table` was likewise outside the capture, so pandas / pyarrow `FutureWarning` / `UserWarning` from a successful write could leak. Same contract violation as #2. The fix wraps the entire `_cmd_transform` pipeline (read + build_engineer + fit_transform + write) in a single `_capture_featcopilot_messages` block so warnings from ANY phase end up in the JSON `warnings` field instead of stderr. `_cmd_explain` also widened: `explain_features` / `get_feature_code` are now inside the same capture as the read+fit_transform. The dead helper `_fit_transform_capturing_warnings` is no longer used internally; kept as a thin convenience wrapper for external test code with an updated docstring noting that the CLI now wraps a wider region. New tests: - `test_transform_zero_columns_input_distinguishes_from_zero_rows` — pins the column-vs-row error-message distinction. - `test_transform_zero_rows_input_still_uses_zero_rows_message` — guards the existing zero-rows wording for the header-only case. - `test_transform_read_warning_captured_not_on_stderr` — patches `pd.read_csv` to emit a warning, asserts `err == ""` and the warning lands in JSON `warnings`. - `test_transform_write_warning_captured_not_on_stderr` — patches `DataFrame.to_csv` to emit a warning, same contract.
- `test_explain_features_warnings_captured_not_on_stderr` — patches `AutoFeatureEngineer.explain_features` to emit a warning, same contract for `_cmd_explain`. 906 tests pass locally (full suite); 133 in tests/test_cli.py. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- featcopilot/cli.py | 199 ++++++++++++++++++++++++++------------------- tests/test_cli.py | 150 ++++++++++++++++++++++++++++++++++ 2 files changed, 267 insertions(+), 82 deletions(-) diff --git a/featcopilot/cli.py b/featcopilot/cli.py index ceec11e..6f019f5 100644 --- a/featcopilot/cli.py +++ b/featcopilot/cli.py @@ -228,14 +228,24 @@ def _read_table(path: Path, fmt: str, *, nrows: int | None = None, suppress_trun raise ValueError(f"Unsupported input format: {fmt}") # Reject "header-only" / empty inputs across every supported format. - # ``pd.read_csv`` returns an empty DataFrame (no exception) when the - # CSV has headers but zero data rows; the same goes for an empty - # parquet file or ``[]`` JSON body. Without this check, the CLI - # would pass an empty frame into ``TabularEngine``, which divides by - # ``len(X)`` while fitting categorical encoding and exits via the - # generic ``unexpected error`` path. Surface the issue as a clean - # exit-2 user-input error. - if df.empty: + # ``DataFrame.empty`` returns ``True`` for both zero-row AND + # zero-column frames, but those are very different user errors that + # warrant different remediation paths, so check the two cases + # explicitly. ``pd.read_csv`` returns an empty DataFrame (no + # exception) when the CSV has headers but zero data rows; the same + # goes for an empty parquet file or ``[]`` JSON body. Without this + # check, the CLI would pass an empty frame into ``TabularEngine``, + # which divides by ``len(X)`` while fitting categorical encoding and + # exits via the generic ``unexpected error`` path. Surface both + # cases as clean exit-2 user-input errors.
+ if len(df.columns) == 0: + raise ValueError( + f"Input file {str(path)!r} has no columns. " + "Feature engineering requires at least one input feature column " + "(e.g. a JSON array of ``{}`` objects, or a table that only " + "preserved an index, would hit this error)." + ) + if len(df) == 0: raise ValueError( f"Input file {str(path)!r} is empty (zero data rows). " "Feature engineering requires at least one row of data." @@ -555,23 +565,16 @@ def _cmd_info(args: argparse.Namespace) -> int: def _fit_transform_capturing_warnings(engineer, X, y, **kwargs): - """Run ``engineer.fit_transform(X, y, **kwargs)`` while capturing both - Python ``warnings.warn(...)`` and FeatCopilot logger records. - - The CLI contract is that stdout carries the JSON payload and stderr is - reserved for failures. Two sources can otherwise bleed onto stderr on - a successful run: + """Thin convenience wrapper: run ``engineer.fit_transform(X, y, **kwargs)`` + inside a single ``_capture_featcopilot_messages()`` block. - * ``warnings.warn(...)`` — emitted by ``AutoFeatureEngineer.fit`` for - leakage-prone column names under the default ``leakage_guard='warn'``. - * ``logger.warning(...)`` / ``logger.info(...)`` — emitted by e.g. - ``_do_no_harm_gate`` on validation-failure fallback, and by every - engine when ``--verbose`` is set. - - The single ``featcopilot`` root logger (``propagate=False``) receives - every child logger's records by ordinary Python logging propagation; - we swap in a capture handler for the duration of the call so the JSON - payload can surface those messages instead of stderr. + This helper is *not* used by the CLI subcommands themselves anymore + — they wrap a wider region (read + fit_transform + write) in one + capture so warnings emitted during pandas / pyarrow read or write + phases are also surfaced via the JSON ``warnings`` field instead of + leaking onto stderr. 
It is preserved as a public-ish helper for + external test code that just wants to capture messages around a + plain ``fit_transform`` call. Returns ------- @@ -930,7 +933,26 @@ def _capture_featcopilot_messages(): def _cmd_transform(args: argparse.Namespace) -> int: - """Read input, fit/transform, write output.""" + """Read input, fit/transform, write output. + + The "successful runs keep stderr empty" CLI contract requires that + *every* phase that can legitimately emit a Python warning be wrapped + in the message capture, not just ``fit_transform``. Concretely: + + * **Read** — ``pd.read_csv`` can emit ``DtypeWarning`` on mixed-type + columns and parquet/JSON readers can emit pyarrow / pandas + future-warnings on a successful read. + * **Fit / transform** — engineers themselves emit warnings (e.g. + ``AutoFeatureEngineer.fit`` for leakage-prone column names under + ``leakage_guard='warn'``). + * **Write** — ``DataFrame.to_csv`` / ``to_parquet`` / ``to_json`` + can emit pandas / pyarrow ``FutureWarning`` / ``UserWarning`` + during a successful write. + + All three phases now live inside one capture block so any warnings + they emit are surfaced via the JSON ``warnings`` field rather than + leaking onto stderr. + """ input_path = Path(args.input) if not input_path.exists(): raise FileNotFoundError(f"Input file not found: {args.input}") @@ -939,64 +961,70 @@ def _cmd_transform(args: argparse.Namespace) -> int: in_fmt = _detect_format(input_path, args.input_format) out_fmt = _detect_format(output_path, args.output_format) - df = _read_table(input_path, in_fmt) - X, y = _split_xy(df, args.target) - - # Build the engineer first: ``_build_engineer`` runs all scalar / list / - # dict type validation on the merged CLI-flag + config view, so any - # malformed value (e.g. ``"max_features": "5"``, ``"verbose": "false"``) - # surfaces a precise exit-2 error here rather than down the wrong - # ``--target is required`` rabbit hole. 
- engineer = _build_engineer(args) - - # Selection requires a target column to fit against. ``AutoFeatureEngineer`` - # only actually fits a selector when ``y is not None`` AND ``max_features`` - # is set; without ``max_features`` the call is a raw feature-generation - # run and does not need a target. The CLI mirrors that contract: only - # require ``--target`` when both selection is enabled (the default) AND - # ``max_features`` is configured (CLI flag or config), so commands like - # ``featcopilot transform --input in.csv --output out.csv`` (no target, - # no cap) still work. Using ``engineer.max_features`` here means the - # value has already been type-validated, so we never report - # ``--target is required`` when the real problem is a malformed - # ``max_features`` config value. - if not args.no_selection and args.target is None and engineer.max_features is not None: - raise ValueError( - "--target is required when feature selection is applied " - "(i.e. when --max-features / config max_features is set). " - "Pass --target , or pass --no-selection / drop --max-features to skip selection." - ) - - captured_warnings, transformed = _fit_transform_capturing_warnings( - engineer, - X, - y, - task_description=args.task_description or "prediction task", - target_name=args.target, - apply_selection=not args.no_selection, - ) + # Single capture context spans read + fit_transform + write so any + # legitimate-but-noisy warnings from pandas/pyarrow during read or + # write end up in the JSON payload's ``warnings`` field instead of + # bleeding to stderr. ``_capture_featcopilot_messages`` is a no-op + # for non-warning code paths so wrapping the broader region has no + # side effects on the happy path. 
+ with _capture_featcopilot_messages() as captured_warnings: + df = _read_table(input_path, in_fmt) + X, y = _split_xy(df, args.target) - if args.include_target and y is not None: - # Re-attach the target column so downstream training scripts can - # consume the engineered file as a single artifact. Detect column - # collisions: if an engineered feature happens to share the - # target's column name (e.g. a target named ``foo_pow2`` matching - # a tabular-engine derived feature), blindly assigning ``transformed[ - # target_name] = y.values`` would silently overwrite the engineered - # column. Surface that as a clean exit-2 error instead. Callers - # who knowingly want to overwrite can rename their target before - # invoking ``transform`` (or skip ``--include-target``). - target_name = args.target if args.target in df.columns else "target" - if target_name in transformed.columns: + # Build the engineer first: ``_build_engineer`` runs all scalar / list / + # dict type validation on the merged CLI-flag + config view, so any + # malformed value (e.g. ``"max_features": "5"``, ``"verbose": "false"``) + # surfaces a precise exit-2 error here rather than down the wrong + # ``--target is required`` rabbit hole. + engineer = _build_engineer(args) + + # Selection requires a target column to fit against. ``AutoFeatureEngineer`` + # only actually fits a selector when ``y is not None`` AND ``max_features`` + # is set; without ``max_features`` the call is a raw feature-generation + # run and does not need a target. The CLI mirrors that contract: only + # require ``--target`` when both selection is enabled (the default) AND + # ``max_features`` is configured (CLI flag or config), so commands like + # ``featcopilot transform --input in.csv --output out.csv`` (no target, + # no cap) still work. 
Using ``engineer.max_features`` here means the + # value has already been type-validated, so we never report + # ``--target is required`` when the real problem is a malformed + # ``max_features`` config value. + if not args.no_selection and args.target is None and engineer.max_features is not None: raise ValueError( - f"--include-target would overwrite engineered feature {target_name!r} " - "with the target values. Rename the target column in the input file, " - "or drop --include-target." + "--target is required when feature selection is applied " + "(i.e. when --max-features / config max_features is set). " + "Pass --target , or pass --no-selection / drop --max-features to skip selection." ) - transformed = transformed.copy() - transformed[target_name] = y.values - _write_table(transformed, output_path, out_fmt) + transformed = engineer.fit_transform( + X, + y, + task_description=args.task_description or "prediction task", + target_name=args.target, + apply_selection=not args.no_selection, + ) + + if args.include_target and y is not None: + # Re-attach the target column so downstream training scripts can + # consume the engineered file as a single artifact. Detect column + # collisions: if an engineered feature happens to share the + # target's column name (e.g. a target named ``foo_pow2`` matching + # a tabular-engine derived feature), blindly assigning ``transformed[ + # target_name] = y.values`` would silently overwrite the engineered + # column. Surface that as a clean exit-2 error instead. Callers + # who knowingly want to overwrite can rename their target before + # invoking ``transform`` (or skip ``--include-target``). + target_name = args.target if args.target in df.columns else "target" + if target_name in transformed.columns: + raise ValueError( + f"--include-target would overwrite engineered feature {target_name!r} " + "with the target values. Rename the target column in the input file, " + "or drop --include-target." 
+ ) + transformed = transformed.copy() + transformed[target_name] = y.values + + _write_table(transformed, output_path, out_fmt) payload = { "status": "ok", @@ -1150,9 +1178,16 @@ def _cmd_explain(args: argparse.Namespace) -> int: apply_selection=False, ) - explanations = engineer.explain_features() - code = engineer.get_feature_code() - feature_names = engineer.get_feature_names() + # ``explain_features`` / ``get_feature_code`` consult engine + # internals that may legitimately emit pandas / pyarrow warnings + # (e.g. when stringifying expression trees touching deprecated + # APIs). Keep them inside the capture so any such warning ends + # up in the JSON payload's ``warnings`` field instead of bleeding + # to stderr — same "stderr reserved for failures" contract as + # the read / fit_transform phases above. + explanations = engineer.explain_features() + code = engineer.get_feature_code() + feature_names = engineer.get_feature_names() payload = { "status": "ok", diff --git a/tests/test_cli.py b/tests/test_cli.py index 84928a1..2f8c1f3 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -3006,6 +3006,156 @@ def fake_main(argv): assert h.stream is sys.stderr or h.stream is sys.__stderr__ +# ----------------------- empty-input column-vs-row distinction + + +def test_transform_zero_columns_input_distinguishes_from_zero_rows(tmp_path: Path): + """``DataFrame.empty`` is ``True`` for both zero-row AND zero-column + frames. The CLI must distinguish: a JSON array of empty objects + ``[{}, {}, ...]`` is a zero-COLUMN input (the user has no feature + columns), not a zero-ROW input. The error message must point at + the actual problem so callers can take the right remediation. + """ + # JSON array of empty objects: pandas reads this as a frame with + # rows but no columns. 
+ p = tmp_path / "empty_columns.json" + p.write_text("[{}, {}, {}]") + rc, _out, err = _run(["transform", "--input", str(p), "--output", str(tmp_path / "out.csv"), "--target", "y"]) + assert rc == 2 + assert "no columns" in err.lower(), err + assert "feature column" in err.lower(), err + assert "zero data rows" not in err.lower(), err + + +def test_transform_zero_rows_input_still_uses_zero_rows_message(tmp_path: Path): + """The zero-row case (header but no data) still surfaces the + distinct "zero data rows" wording so the two failure modes are + distinguishable in CLI output. + """ + p = tmp_path / "header_only.csv" + p.write_text("x1,x2,y\n") + rc, _out, err = _run(["transform", "--input", str(p), "--output", str(tmp_path / "out.csv"), "--target", "y"]) + assert rc == 2 + assert "zero data rows" in err.lower(), err + assert "no columns" not in err.lower(), err + + +# ----------------------- transform read/write warnings captured (not stderr) + + +def test_transform_read_warning_captured_not_on_stderr(tmp_path: Path, monkeypatch): + """``pd.read_csv`` can legitimately emit ``DtypeWarning`` on a + successful read with mixed-type columns. That warning must end up + in the JSON ``warnings`` field, NOT on stderr — the contract is + that successful runs keep stderr empty for agent callers. + """ + import pandas as _pd + + # Build a valid CSV input and a real fit_transform-able payload. + rng = np.random.default_rng(0) + df = pd.DataFrame( + { + "x1": rng.normal(size=50), + "x2": rng.integers(0, 5, size=50), + "y": rng.integers(0, 2, size=50), + } + ) + in_path = tmp_path / "in.csv" + df.to_csv(in_path, index=False) + out_path = tmp_path / "out.csv" + + # Patch ``pd.read_csv`` so that calling it emits a real Python + # ``warnings.warn`` (mirroring DtypeWarning on a successful read) + # while still returning the same DataFrame. 
+ real_read_csv = _pd.read_csv + + def warning_emitting_read_csv(*a, **kw): + warnings.warn("pandas-mock-read-csv: DtypeWarning equivalent", UserWarning, stacklevel=2) + return real_read_csv(*a, **kw) + + monkeypatch.setattr(_pd, "read_csv", warning_emitting_read_csv) + + rc, out, err = _run( + [ + "transform", + "--input", + str(in_path), + "--output", + str(out_path), + "--target", + "y", + "--no-selection", + "--json", + ] + ) + assert rc == 0, err + assert err == "", f"read-time warning leaked to stderr: {err!r}" + payload = json.loads(out) + assert any("pandas-mock-read-csv" in w for w in payload["warnings"]), payload["warnings"] + + +def test_transform_write_warning_captured_not_on_stderr(tmp_path: Path, monkeypatch, tabular_csv: Path): + """Pandas/pyarrow can legitimately emit ``FutureWarning`` / + ``UserWarning`` during ``DataFrame.to_csv`` / ``to_parquet`` / + ``to_json`` on a successful write. Those warnings must end up in + the JSON ``warnings`` field, NOT on stderr. + """ + out_path = tmp_path / "out.csv" + + # Patch ``DataFrame.to_csv`` so calling it emits a warning while + # still actually writing the file. 
+ real_to_csv = pd.DataFrame.to_csv + + def warning_emitting_to_csv(self, *a, **kw): + warnings.warn("pandas-mock-to-csv: FutureWarning equivalent", FutureWarning, stacklevel=2) + return real_to_csv(self, *a, **kw) + + monkeypatch.setattr(pd.DataFrame, "to_csv", warning_emitting_to_csv) + + rc, out, err = _run( + [ + "transform", + "--input", + str(tabular_csv), + "--output", + str(out_path), + "--target", + "y", + "--no-selection", + "--json", + ] + ) + assert rc == 0, err + assert err == "", f"write-time warning leaked to stderr: {err!r}" + payload = json.loads(out) + assert any("pandas-mock-to-csv" in w for w in payload["warnings"]), payload["warnings"] + + +# ----------------------- explain captures explain_features warnings + + +def test_explain_features_warnings_captured_not_on_stderr(tmp_path: Path, monkeypatch, tabular_csv: Path): + """``explain_features`` / ``get_feature_code`` are now inside the + same capture as the read + ``fit_transform``, so any warning they + emit goes to the JSON ``warnings`` field, not stderr. + """ + from featcopilot.transformers import sklearn_compat as _sc + + real_explain = _sc.AutoFeatureEngineer.explain_features + + def warning_emitting_explain(self): + warnings.warn("explain-features-mock-warning", UserWarning, stacklevel=2) + return real_explain(self) + + monkeypatch.setattr(_sc.AutoFeatureEngineer, "explain_features", warning_emitting_explain) + + rc, out, err = _run(["explain", "--input", str(tabular_csv), "--target", "y"]) + assert rc == 0, err + assert err == "", f"explain_features warning leaked to stderr: {err!r}" + payload = json.loads(out) + assert any("explain-features-mock-warning" in w for w in payload["warnings"]), payload["warnings"] + + # ----------------------- explain --explain-sample-size warning hygiene