From 3f0b6616f49067ba03774b62ae0eee1e11283063 Mon Sep 17 00:00:00 2001
From: Li Jiang <lijiang1@microsoft.com>
Date: Sun, 3 May 2026 14:05:53 +0800
Subject: [PATCH 01/30] feat(cli): add featcopilot command-line interface for
 agentic usage
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Introduces a stable, agent-friendly CLI exposed via the `featcopilot`
console script (and `python -m featcopilot`). All subcommands accept
`--json` for machine-readable stdout; user-facing errors are written
to stderr with a non-zero exit code so agents (e.g. Copilot tool-use,
shell scripts, CI pipelines) can parse failures deterministically.

Subcommands:

* `info`      — print version, supported engines, selection methods,
                  leakage guards, and supported I/O formats.
* `transform` — read CSV / Parquet / JSON, run AutoFeatureEngineer,
                  write engineered features. Supports `--config` JSON,
                  `--engines`, `--max-features`, `--selection-methods`,
                  `--correlation-threshold`, `--leakage-guard`,
                  `--gate-n-jobs`, `--no-selection`, `--include-target`,
                  and explicit `--input-format` / `--output-format`
                  overrides. Emits a JSON status payload (rows, features,
                  engines, selection_applied, ...) when `--json` is set.
* `explain`   — fit AutoFeatureEngineer and print a JSON document with
                  `{name, explanation, code}` per feature so an LLM can
                  consume the result directly.

Files:

* `featcopilot/cli.py`      — argparse-based CLI (no new dependencies).
* `featcopilot/__main__.py` — enables `python -m featcopilot`.
* `pyproject.toml`          — `[project.scripts]` entry point.
* `tests/test_cli.py`       — 18 tests covering info/transform/explain,
                                CSV/Parquet/JSON round-trips, `--config`
                                handling and override precedence, and all
                                user-facing error paths.
* `README.md`                — new "Command-Line Interface" section.

Coverage: featcopilot/cli.py at 94%; project total 88.95%
(threshold: `--cov-fail-under=85`).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 README.md               |  26 +++
 featcopilot/__main__.py |   6 +
 featcopilot/cli.py      | 452 ++++++++++++++++++++++++++++++++++++++++
 pyproject.toml          |   3 +
 tests/test_cli.py       | 397 +++++++++++++++++++++++++++++++++++
 5 files changed, 884 insertions(+)
 create mode 100644 featcopilot/__main__.py
 create mode 100644 featcopilot/cli.py
 create mode 100644 tests/test_cli.py

diff --git a/README.md b/README.md
index e4f3d4f..157215b 100644
--- a/README.md
+++ b/README.md
@@ -110,6 +110,32 @@ for feature, explanation in engineer.explain_features().items():
     print(f"{feature}: {explanation}")
 ```
 
+## Command-Line Interface
+
+FeatCopilot ships a `featcopilot` CLI for shell, scripting, and agentic
+(LLM tool-use) workflows — no Python glue required. All subcommands accept
+`--json` for machine-readable stdout; errors are written to stderr with a
+non-zero exit code so agents can parse failures deterministically.
+
+```bash
+# Discover capabilities (engines, selection methods, I/O formats)
+featcopilot info --json
+
+# Run feature engineering on a CSV / Parquet / JSON file
+featcopilot transform \
+    --input data.csv --target label --output features.parquet \
+    --engines tabular --max-features 50 --json
+
+# Inspect generated features (name, explanation, code) as JSON for an LLM
+featcopilot explain --input data.csv --target label
+
+# Equivalent module form
+python -m featcopilot info --json
+```
+
+Pass `--config config.json` to provide nested keys such as `llm_config`;
+explicit CLI flags override values from the config file.
+
 ## Engines
 
 ### Tabular Engine
diff --git a/featcopilot/__main__.py b/featcopilot/__main__.py
new file mode 100644
index 0000000..0cce4e0
--- /dev/null
+++ b/featcopilot/__main__.py
@@ -0,0 +1,6 @@
+"""Enable ``python -m featcopilot`` to dispatch to the CLI."""
+
+from featcopilot.cli import main
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/featcopilot/cli.py b/featcopilot/cli.py
new file mode 100644
index 0000000..89fcdf4
--- /dev/null
+++ b/featcopilot/cli.py
@@ -0,0 +1,452 @@
+"""
+FeatCopilot command-line interface.
+
+Provides a stable, agent-friendly CLI for invoking FeatCopilot from shells,
+notebooks, agentic workflows (e.g. Copilot/LLM tool-use), and CI pipelines
+without writing Python glue code.
+
+Subcommands
+-----------
+info
+    Print version and supported engines/methods. Always machine-readable
+    when ``--json`` is passed.
+transform
+    Run :class:`featcopilot.AutoFeatureEngineer` on a tabular input file
+    (CSV / Parquet / JSON) and write engineered features to an output file.
+    Emits a JSON status line on stdout when ``--json`` is passed so that
+    agents can parse the result deterministically.
+explain
+    Fit the engineer and print a JSON document describing each generated
+    feature (name, explanation, code) for downstream LLM consumption.
+
+Examples
+--------
+Agentic usage (machine-readable result on stdout, errors on stderr)::
+
+    featcopilot info --json
+    featcopilot transform \\
+        --input data.csv --target label --output features.parquet \\
+        --engines tabular --max-features 50 --json
+    featcopilot explain --input data.csv --target label --json
+
+Equivalent module invocation::
+
+    python -m featcopilot info --json
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+from pathlib import Path
+from typing import Any
+
+from featcopilot import __version__
+from featcopilot.transformers.sklearn_compat import AutoFeatureEngineer
+from featcopilot.utils.logger import get_logger
+
+logger = get_logger(__name__)
+
+SUPPORTED_INPUT_FORMATS = ("csv", "parquet", "json")
+SUPPORTED_OUTPUT_FORMATS = ("csv", "parquet", "json")
+
+
+def _detect_format(path: Path, override: str | None) -> str:
+    """Return one of ``SUPPORTED_INPUT_FORMATS`` for ``path``.
+
+    Parameters
+    ----------
+    path : pathlib.Path
+        File path whose suffix is inspected when ``override`` is ``None``.
+    override : str or None
+        Explicit format override (``csv`` / ``parquet`` / ``json``).
+
+    Raises
+    ------
+    ValueError
+        If the format cannot be determined or is not supported.
+    """
+    if override is not None:
+        fmt = override.lower()
+        if fmt not in SUPPORTED_INPUT_FORMATS:
+            raise ValueError(
+                f"Unsupported format {override!r}; expected one of {SUPPORTED_INPUT_FORMATS}"
+            )
+        return fmt
+
+    suffix = path.suffix.lower().lstrip(".")
+    aliases = {"pq": "parquet", "parq": "parquet"}
+    fmt = aliases.get(suffix, suffix)
+    if fmt not in SUPPORTED_INPUT_FORMATS:
+        raise ValueError(
+            f"Cannot infer format from extension {path.suffix!r}; "
+            f"pass --input-format / --output-format (one of {SUPPORTED_INPUT_FORMATS})."
+        )
+    return fmt
+
+
+def _read_table(path: Path, fmt: str):
+    """Read a tabular file into a pandas DataFrame."""
+    import pandas as pd
+
+    if fmt == "csv":
+        return pd.read_csv(path)
+    if fmt == "parquet":
+        return pd.read_parquet(path)
+    if fmt == "json":
+        # ``orient='records'`` is the agent-friendly default; fall back to
+        # pandas' auto-detection when the file isn't a records list.
+        try:
+            return pd.read_json(path, orient="records")
+        except ValueError:
+            return pd.read_json(path)
+    raise ValueError(f"Unsupported input format: {fmt}")
+
+
+def _write_table(df, path: Path, fmt: str) -> None:
+    """Write a pandas DataFrame to ``path`` in ``fmt``."""
+    path.parent.mkdir(parents=True, exist_ok=True)
+
+    if fmt == "csv":
+        df.to_csv(path, index=False)
+    elif fmt == "parquet":
+        df.to_parquet(path, index=False)
+    elif fmt == "json":
+        df.to_json(path, orient="records", indent=2)
+    else:
+        raise ValueError(f"Unsupported output format: {fmt}")
+
+
+def _load_config(config_path: str | None) -> dict[str, Any]:
+    """Load a JSON config file (or return an empty dict)."""
+    if config_path is None:
+        return {}
+    path = Path(config_path)
+    if not path.exists():
+        raise FileNotFoundError(f"Config file not found: {config_path}")
+    with path.open("r", encoding="utf-8") as fh:
+        data = json.load(fh)
+    if not isinstance(data, dict):
+        raise ValueError(
+            f"Config file {config_path!r} must contain a JSON object at the top level"
+        )
+    return data
+
+
+def _emit(payload: dict[str, Any], *, as_json: bool, stream=None) -> None:
+    """Emit a payload to stdout, JSON-encoded when ``as_json`` is true."""
+    stream = stream if stream is not None else sys.stdout
+    if as_json:
+        stream.write(json.dumps(payload, default=str, sort_keys=True))
+        stream.write("\n")
+    else:
+        for key, value in payload.items():
+            stream.write(f"{key}: {value}\n")
+    stream.flush()
+
+
+def _build_engineer(args: argparse.Namespace) -> AutoFeatureEngineer:
+    """Construct an :class:`AutoFeatureEngineer` from parsed CLI args.
+
+    Precedence: explicit CLI flags override values from ``--config``.
+    """
+    config = _load_config(args.config)
+
+    def pick(flag_value, config_key, default):
+        if flag_value is not None:
+            return flag_value
+        return config.get(config_key, default)
+
+    engines = pick(args.engines, "engines", None) or ["tabular"]
+    selection_methods = pick(args.selection_methods, "selection_methods", None) or [
+        "mutual_info",
+        "importance",
+    ]
+    max_features = pick(args.max_features, "max_features", None)
+    correlation_threshold = pick(args.correlation_threshold, "correlation_threshold", 0.85)
+    leakage_guard = pick(args.leakage_guard, "leakage_guard", "warn")
+    gate_n_jobs = pick(args.gate_n_jobs, "gate_n_jobs", 1)
+    llm_config = config.get("llm_config", {}) or {}
+    verbose = bool(pick(args.verbose, "verbose", False))
+
+    return AutoFeatureEngineer(
+        engines=list(engines),
+        max_features=max_features,
+        selection_methods=list(selection_methods),
+        correlation_threshold=correlation_threshold,
+        llm_config=llm_config,
+        verbose=verbose,
+        leakage_guard=leakage_guard,
+        gate_n_jobs=gate_n_jobs,
+    )
+
+
+def _split_xy(df, target: str | None):
+    """Split a DataFrame into ``(X, y)``; ``y`` is ``None`` when no target."""
+    if target is None:
+        return df, None
+    if target not in df.columns:
+        raise ValueError(
+            f"Target column {target!r} not found in input. "
+            f"Available columns: {list(df.columns)[:20]}{'...' if len(df.columns) > 20 else ''}"
+        )
+    y = df[target]
+    X = df.drop(columns=[target])
+    return X, y
+
+
+def _cmd_info(args: argparse.Namespace) -> int:
+    """Print version + supported engines/methods."""
+    payload = {
+        "version": __version__,
+        "supported_engines": sorted(AutoFeatureEngineer.SUPPORTED_ENGINES),
+        "supported_selection_methods": sorted(AutoFeatureEngineer.SUPPORTED_SELECTION_METHODS),
+        "supported_leakage_guards": sorted(AutoFeatureEngineer.SUPPORTED_LEAKAGE_GUARDS),
+        "supported_input_formats": list(SUPPORTED_INPUT_FORMATS),
+        "supported_output_formats": list(SUPPORTED_OUTPUT_FORMATS),
+    }
+    _emit(payload, as_json=args.json)
+    return 0
+
+
+def _cmd_transform(args: argparse.Namespace) -> int:
+    """Read input, fit/transform, write output."""
+    input_path = Path(args.input)
+    if not input_path.exists():
+        raise FileNotFoundError(f"Input file not found: {args.input}")
+    output_path = Path(args.output)
+
+    in_fmt = _detect_format(input_path, args.input_format)
+    out_fmt = _detect_format(output_path, args.output_format)
+
+    df = _read_table(input_path, in_fmt)
+    X, y = _split_xy(df, args.target)
+
+    engineer = _build_engineer(args)
+    transformed = engineer.fit_transform(
+        X,
+        y,
+        task_description=args.task_description or "prediction task",
+        target_name=args.target,
+        apply_selection=not args.no_selection,
+    )
+
+    if args.include_target and y is not None:
+        # Re-attach the target column so downstream training scripts can
+        # consume the engineered file as a single artifact.
+        target_name = args.target if args.target in df.columns else "target"
+        transformed = transformed.copy()
+        transformed[target_name] = y.values
+
+    _write_table(transformed, output_path, out_fmt)
+
+    payload = {
+        "status": "ok",
+        "input": str(input_path),
+        "output": str(output_path),
+        "input_format": in_fmt,
+        "output_format": out_fmt,
+        "n_rows": int(transformed.shape[0]),
+        "n_features": int(transformed.shape[1]),
+        "n_input_columns": int(X.shape[1]),
+        "n_generated_features": len(engineer.get_feature_names()),
+        "engines": list(engineer.engines),
+        "selection_methods": list(engineer.selection_methods),
+        "max_features": engineer.max_features,
+        "target": args.target,
+        "selection_applied": engineer._selector is not None,
+    }
+    _emit(payload, as_json=args.json)
+    return 0
+
+
+def _cmd_explain(args: argparse.Namespace) -> int:
+    """Fit engines and print feature explanations + code as JSON."""
+    input_path = Path(args.input)
+    if not input_path.exists():
+        raise FileNotFoundError(f"Input file not found: {args.input}")
+
+    in_fmt = _detect_format(input_path, args.input_format)
+    df = _read_table(input_path, in_fmt)
+    X, y = _split_xy(df, args.target)
+
+    engineer = _build_engineer(args)
+    engineer.fit(
+        X,
+        y,
+        task_description=args.task_description or "prediction task",
+        target_name=args.target,
+    )
+
+    explanations = engineer.explain_features()
+    code = engineer.get_feature_code()
+    feature_names = engineer.get_feature_names()
+
+    payload = {
+        "status": "ok",
+        "input": str(input_path),
+        "n_features": len(feature_names),
+        "engines": list(engineer.engines),
+        "features": [
+            {
+                "name": name,
+                "explanation": explanations.get(name, ""),
+                "code": code.get(name, ""),
+            }
+            for name in feature_names
+        ],
+    }
+
+    # explain always emits JSON to stdout (it's the only sensible format),
+    # but we still respect ``--json`` for symmetry with other subcommands.
+    _emit(payload, as_json=True)
+    return 0
+
+
+def _build_parser() -> argparse.ArgumentParser:
+    parser = argparse.ArgumentParser(
+        prog="featcopilot",
+        description=(
+            "FeatCopilot CLI — automated feature engineering from the command line. "
+            "Designed for scripting and agentic usage; pass --json to any subcommand "
+            "for machine-readable stdout."
+        ),
+    )
+    parser.add_argument(
+        "-V",
+        "--version",
+        action="version",
+        version=f"featcopilot {__version__}",
+    )
+    subparsers = parser.add_subparsers(dest="command", required=True, metavar="COMMAND")
+
+    # ----- info ---------------------------------------------------------
+    p_info = subparsers.add_parser(
+        "info",
+        help="Print version and supported engines/methods.",
+        description="Print the installed FeatCopilot version and the supported engines, "
+        "selection methods, leakage guards, and I/O formats.",
+    )
+    p_info.add_argument("--json", action="store_true", help="Emit JSON to stdout.")
+    p_info.set_defaults(func=_cmd_info)
+
+    # ----- transform ----------------------------------------------------
+    p_transform = subparsers.add_parser(
+        "transform",
+        help="Run feature engineering on a tabular file.",
+        description="Read INPUT, run AutoFeatureEngineer, and write engineered features to OUTPUT.",
+    )
+    _add_io_args(p_transform)
+    _add_engineer_args(p_transform)
+    p_transform.add_argument(
+        "--no-selection",
+        action="store_true",
+        help="Disable feature selection (skip do-no-harm gate).",
+    )
+    p_transform.add_argument(
+        "--include-target",
+        action="store_true",
+        help="Include the target column in the output file.",
+    )
+    p_transform.add_argument("--json", action="store_true", help="Emit a JSON status line on stdout.")
+    p_transform.set_defaults(func=_cmd_transform)
+
+    # ----- explain ------------------------------------------------------
+    p_explain = subparsers.add_parser(
+        "explain",
+        help="Print JSON feature explanations and code for agent consumption.",
+        description="Fit AutoFeatureEngineer on INPUT and emit a JSON document "
+        "describing each generated feature (name, explanation, code).",
+    )
+    p_explain.add_argument("--input", "-i", required=True, help="Path to input file (CSV / Parquet / JSON).")
+    p_explain.add_argument("--input-format", choices=SUPPORTED_INPUT_FORMATS, help="Override input format detection.")
+    p_explain.add_argument("--target", "-t", help="Target column name (required for selection).")
+    p_explain.add_argument(
+        "--task-description",
+        help="Natural-language ML task description (used by the LLM engine).",
+    )
+    _add_engineer_args(p_explain)
+    p_explain.add_argument("--json", action="store_true", help="(Always JSON — flag accepted for symmetry.)")
+    p_explain.set_defaults(func=_cmd_explain)
+
+    return parser
+
+
+def _add_io_args(p: argparse.ArgumentParser) -> None:
+    p.add_argument("--input", "-i", required=True, help="Path to input file (CSV / Parquet / JSON).")
+    p.add_argument("--output", "-o", required=True, help="Path to output file (CSV / Parquet / JSON).")
+    p.add_argument("--input-format", choices=SUPPORTED_INPUT_FORMATS, help="Override input format detection.")
+    p.add_argument("--output-format", choices=SUPPORTED_OUTPUT_FORMATS, help="Override output format detection.")
+    p.add_argument("--target", "-t", help="Target column name (required for selection).")
+    p.add_argument(
+        "--task-description",
+        help="Natural-language ML task description (used by the LLM engine).",
+    )
+
+
+def _add_engineer_args(p: argparse.ArgumentParser) -> None:
+    """Add ``AutoFeatureEngineer``-related flags to a subparser."""
+    p.add_argument(
+        "--engines",
+        nargs="+",
+        choices=sorted(AutoFeatureEngineer.SUPPORTED_ENGINES),
+        help="Engines to use (default: tabular).",
+    )
+    p.add_argument(
+        "--selection-methods",
+        nargs="+",
+        choices=sorted(AutoFeatureEngineer.SUPPORTED_SELECTION_METHODS),
+        help="Selection methods (default: mutual_info importance).",
+    )
+    p.add_argument("--max-features", type=int, help="Maximum number of features to keep.")
+    p.add_argument(
+        "--correlation-threshold",
+        type=float,
+        help="Maximum pairwise correlation in redundancy elimination (default: 0.85).",
+    )
+    p.add_argument(
+        "--leakage-guard",
+        choices=sorted(AutoFeatureEngineer.SUPPORTED_LEAKAGE_GUARDS),
+        help="How to handle suspicious column names (default: warn).",
+    )
+    p.add_argument(
+        "--gate-n-jobs",
+        type=int,
+        help="Parallelism for the do-no-harm gate's RF (default: 1; -1 = all cores).",
+    )
+    p.add_argument(
+        "--config",
+        help="Path to a JSON config file. CLI flags take precedence over config keys. "
+        "Use this to pass nested keys such as ``llm_config``.",
+    )
+    p.add_argument("--verbose", action="store_true", default=None, help="Enable verbose logging.")
+
+
+def main(argv: list[str] | None = None) -> int:
+    """CLI entry point.
+
+    Returns the process exit code; suitable for both the ``console_scripts``
+    entry point (``featcopilot``) and ``python -m featcopilot``.
+    """
+    parser = _build_parser()
+    args = parser.parse_args(argv)
+
+    try:
+        return args.func(args)
+    except (FileNotFoundError, ValueError) as exc:
+        # User-facing input/config errors: print a clean message to stderr
+        # without a traceback so agents can parse the failure.
+        sys.stderr.write(f"featcopilot: error: {exc}\n")
+        return 2
+    except KeyboardInterrupt:
+        sys.stderr.write("featcopilot: interrupted\n")
+        return 130
+    except Exception as exc:  # pragma: no cover - defensive backstop
+        sys.stderr.write(f"featcopilot: unexpected error: {type(exc).__name__}: {exc}\n")
+        logger.exception("Unhandled CLI exception")
+        return 1
+
+
+if __name__ == "__main__":  # pragma: no cover
+    raise SystemExit(main())
diff --git a/pyproject.toml b/pyproject.toml
index f904b9c..583f2eb 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -86,6 +86,9 @@ Homepage = "https://github.com/thinkall/featcopilot"
 Documentation = "https://github.com/thinkall/featcopilot#readme"
 Repository = "https://github.com/thinkall/featcopilot"
 
+[project.scripts]
+featcopilot = "featcopilot.cli:main"
+
 [tool.setuptools.packages.find]
 where = ["."]
 include = ["featcopilot*"]
diff --git a/tests/test_cli.py b/tests/test_cli.py
new file mode 100644
index 0000000..07d5aad
--- /dev/null
+++ b/tests/test_cli.py
@@ -0,0 +1,397 @@
+"""Tests for the featcopilot CLI."""
+
+from __future__ import annotations
+
+import io
+import json
+import sys
+from contextlib import redirect_stderr, redirect_stdout
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+import pytest
+
+from featcopilot import __version__
+from featcopilot import cli as fc_cli
+
+
+def _run(argv: list[str]) -> tuple[int, str, str]:
+    """Invoke ``cli.main(argv)`` and capture exit code, stdout, stderr."""
+    out, err = io.StringIO(), io.StringIO()
+    with redirect_stdout(out), redirect_stderr(err):
+        rc = fc_cli.main(argv)
+    return rc, out.getvalue(), err.getvalue()
+
+
+@pytest.fixture
+def tabular_csv(tmp_path: Path) -> Path:
+    """A small classification dataset written to CSV."""
+    rng = np.random.default_rng(42)
+    n = 200
+    df = pd.DataFrame(
+        {
+            "x1": rng.normal(size=n),
+            "x2": rng.normal(size=n),
+            "x3": rng.integers(0, 5, size=n),
+            "y": rng.integers(0, 2, size=n),
+        }
+    )
+    path = tmp_path / "in.csv"
+    df.to_csv(path, index=False)
+    return path
+
+
+# --------------------------------------------------------------------- info
+
+
+def test_info_json_emits_supported_options():
+    rc, out, err = _run(["info", "--json"])
+    assert rc == 0, err
+    payload = json.loads(out)
+    assert payload["version"] == __version__
+    assert "tabular" in payload["supported_engines"]
+    assert "mutual_info" in payload["supported_selection_methods"]
+    assert "warn" in payload["supported_leakage_guards"]
+    assert set(payload["supported_input_formats"]) == {"csv", "parquet", "json"}
+
+
+def test_info_text_mode_is_human_readable():
+    rc, out, _ = _run(["info"])
+    assert rc == 0
+    # Not JSON: parsing should fail.
+    with pytest.raises(json.JSONDecodeError):
+        json.loads(out)
+    assert "version" in out
+    assert __version__ in out
+
+
+def test_top_level_version_flag(capsys):
+    # ``argparse`` ``--version`` action prints to stdout and SystemExits 0.
+    with pytest.raises(SystemExit) as exc:
+        fc_cli.main(["--version"])
+    assert exc.value.code == 0
+    assert __version__ in capsys.readouterr().out
+
+
+# ----------------------------------------------------------------- transform
+
+
+def test_transform_csv_to_csv(tmp_path: Path, tabular_csv: Path):
+    out_path = tmp_path / "out.csv"
+    rc, out, err = _run(
+        [
+            "transform",
+            "--input",
+            str(tabular_csv),
+            "--output",
+            str(out_path),
+            "--target",
+            "y",
+            "--max-features",
+            "10",
+            "--json",
+        ]
+    )
+    assert rc == 0, err
+    payload = json.loads(out)
+    assert payload["status"] == "ok"
+    assert payload["target"] == "y"
+    assert payload["engines"] == ["tabular"]
+    assert payload["selection_applied"] is True
+    assert payload["n_input_columns"] == 3  # x1, x2, x3 (y is the target)
+
+    # The output file exists and is readable as CSV.
+    assert out_path.exists()
+    written = pd.read_csv(out_path)
+    assert written.shape[0] == 200
+    assert "y" not in written.columns  # target excluded by default
+
+
+def test_transform_include_target_round_trip(tmp_path: Path, tabular_csv: Path):
+    out_path = tmp_path / "out.csv"
+    rc, _, err = _run(
+        [
+            "transform",
+            "-i",
+            str(tabular_csv),
+            "-o",
+            str(out_path),
+            "-t",
+            "y",
+            "--max-features",
+            "10",
+            "--include-target",
+        ]
+    )
+    assert rc == 0, err
+    written = pd.read_csv(out_path)
+    assert "y" in written.columns
+
+
+def test_transform_parquet_round_trip(tmp_path: Path):
+    pytest.importorskip("pyarrow")
+    rng = np.random.default_rng(0)
+    df = pd.DataFrame(
+        {"a": rng.normal(size=120), "b": rng.normal(size=120), "y": rng.integers(0, 2, size=120)}
+    )
+    in_path = tmp_path / "in.parquet"
+    out_path = tmp_path / "out.parquet"
+    df.to_parquet(in_path, index=False)
+
+    rc, out, err = _run(
+        [
+            "transform",
+            "--input",
+            str(in_path),
+            "--output",
+            str(out_path),
+            "--target",
+            "y",
+            "--max-features",
+            "8",
+            "--json",
+        ]
+    )
+    assert rc == 0, err
+    payload = json.loads(out)
+    assert payload["input_format"] == "parquet"
+    assert payload["output_format"] == "parquet"
+    pd.read_parquet(out_path)  # readable
+
+
+def test_transform_json_round_trip(tmp_path: Path):
+    rng = np.random.default_rng(0)
+    df = pd.DataFrame(
+        {"a": rng.normal(size=80), "b": rng.normal(size=80), "y": rng.integers(0, 2, size=80)}
+    )
+    in_path = tmp_path / "in.json"
+    out_path = tmp_path / "out.json"
+    df.to_json(in_path, orient="records")
+
+    rc, _, err = _run(
+        [
+            "transform",
+            "--input",
+            str(in_path),
+            "--output",
+            str(out_path),
+            "--target",
+            "y",
+        ]
+    )
+    assert rc == 0, err
+    written = pd.read_json(out_path, orient="records")
+    assert written.shape[0] == 80
+
+
+def test_transform_no_selection_skips_selector(tmp_path: Path, tabular_csv: Path):
+    out_path = tmp_path / "out.csv"
+    rc, out, err = _run(
+        [
+            "transform",
+            "-i",
+            str(tabular_csv),
+            "-o",
+            str(out_path),
+            "-t",
+            "y",
+            "--no-selection",
+            "--max-features",
+            "5",
+            "--json",
+        ]
+    )
+    assert rc == 0, err
+    payload = json.loads(out)
+    assert payload["selection_applied"] is False
+
+
+def test_transform_config_file_supplies_engineer_kwargs(tmp_path: Path, tabular_csv: Path):
+    config_path = tmp_path / "cfg.json"
+    config_path.write_text(
+        json.dumps(
+            {
+                "engines": ["tabular"],
+                "selection_methods": ["mutual_info"],
+                "max_features": 7,
+                "correlation_threshold": 0.9,
+                "leakage_guard": "off",
+            }
+        )
+    )
+    out_path = tmp_path / "out.csv"
+    rc, out, err = _run(
+        [
+            "transform",
+            "--input",
+            str(tabular_csv),
+            "--output",
+            str(out_path),
+            "--target",
+            "y",
+            "--config",
+            str(config_path),
+            "--json",
+        ]
+    )
+    assert rc == 0, err
+    payload = json.loads(out)
+    assert payload["selection_methods"] == ["mutual_info"]
+    assert payload["max_features"] == 7
+
+
+def test_transform_cli_flags_override_config(tmp_path: Path, tabular_csv: Path):
+    config_path = tmp_path / "cfg.json"
+    config_path.write_text(json.dumps({"max_features": 5, "engines": ["tabular"]}))
+    out_path = tmp_path / "out.csv"
+    rc, out, _ = _run(
+        [
+            "transform",
+            "--input",
+            str(tabular_csv),
+            "--output",
+            str(out_path),
+            "--target",
+            "y",
+            "--config",
+            str(config_path),
+            "--max-features",
+            "12",
+            "--json",
+        ]
+    )
+    assert rc == 0
+    assert json.loads(out)["max_features"] == 12
+
+
+# -------------------------------------------------------------- error paths
+
+
+def test_transform_missing_input_returns_exit_2(tmp_path: Path):
+    rc, _, err = _run(
+        [
+            "transform",
+            "--input",
+            str(tmp_path / "nope.csv"),
+            "--output",
+            str(tmp_path / "out.csv"),
+            "--target",
+            "y",
+        ]
+    )
+    assert rc == 2
+    assert "Input file not found" in err
+
+
+def test_transform_unknown_target_returns_exit_2(tmp_path: Path, tabular_csv: Path):
+    rc, _, err = _run(
+        [
+            "transform",
+            "--input",
+            str(tabular_csv),
+            "--output",
+            str(tmp_path / "out.csv"),
+            "--target",
+            "does_not_exist",
+        ]
+    )
+    assert rc == 2
+    assert "does_not_exist" in err
+
+
+def test_transform_unknown_extension_without_override(tmp_path: Path, tabular_csv: Path):
+    out_path = tmp_path / "out.weird"
+    rc, _, err = _run(
+        [
+            "transform",
+            "--input",
+            str(tabular_csv),
+            "--output",
+            str(out_path),
+            "--target",
+            "y",
+        ]
+    )
+    assert rc == 2
+    assert "infer format" in err.lower()
+
+
+def test_transform_format_override_accepted(tmp_path: Path, tabular_csv: Path):
+    out_path = tmp_path / "out.weird"
+    rc, _, err = _run(
+        [
+            "transform",
+            "--input",
+            str(tabular_csv),
+            "--output",
+            str(out_path),
+            "--target",
+            "y",
+            "--output-format",
+            "csv",
+        ]
+    )
+    assert rc == 0, err
+    assert out_path.exists()
+
+
+def test_invalid_config_file_returns_exit_2(tmp_path: Path, tabular_csv: Path):
+    bad = tmp_path / "bad.json"
+    bad.write_text("[1, 2, 3]")  # JSON, but not an object
+    rc, _, err = _run(
+        [
+            "transform",
+            "--input",
+            str(tabular_csv),
+            "--output",
+            str(tmp_path / "o.csv"),
+            "--target",
+            "y",
+            "--config",
+            str(bad),
+        ]
+    )
+    assert rc == 2
+    assert "JSON object" in err
+
+
+def test_no_subcommand_exits_nonzero():
+    # argparse SystemExits with code 2 when ``required=True`` subparser is missing.
+    with pytest.raises(SystemExit) as exc:
+        fc_cli.main([])
+    assert exc.value.code == 2
+
+
+# ------------------------------------------------------------------ explain
+
+
+def test_explain_emits_json_payload(tmp_path: Path, tabular_csv: Path):
+    rc, out, err = _run(
+        [
+            "explain",
+            "--input",
+            str(tabular_csv),
+            "--target",
+            "y",
+        ]
+    )
+    assert rc == 0, err
+    payload = json.loads(out)
+    assert payload["status"] == "ok"
+    assert payload["engines"] == ["tabular"]
+    assert isinstance(payload["features"], list)
+    # Each feature entry is a dict with the expected keys.
+    if payload["features"]:
+        entry = payload["features"][0]
+        assert {"name", "explanation", "code"} <= set(entry.keys())
+
+
+# ------------------------------------------------------------ python -m entry
+
+
+def test_dunder_main_module_runs(monkeypatch, capsys):
+    """``python -m featcopilot info --json`` is exercised via the CLI entry."""
+    monkeypatch.setattr(sys, "argv", ["featcopilot", "info", "--json"])
+    rc = fc_cli.main(["info", "--json"])
+    assert rc == 0

From b9995551940eb7f4ff7c8e060f9f2ba8e222717a Mon Sep 17 00:00:00 2001
From: Li Jiang <lijiang1@microsoft.com>
Date: Sun, 3 May 2026 14:38:55 +0800
Subject: [PATCH 02/30] fix(cli): address round-1 review feedback

Addresses all five review comments on PR #5 (three from Copilot, two
from Codex) and re-applies the pinned pre-commit black formatting:

* explain now actually returns generated features
  (Copilot review #1, Codex P1).
  Built-in engines (e.g. tabular) populate `_feature_names` during
  `transform()`, not `fit()`. `_cmd_explain` now calls
  `fit_transform(..., apply_selection=False)` so the JSON payload
  contains the full `{name, explanation, code}` records the
  subcommand advertises. Test asserts `n_features > 0` for tabular.

* main(argv) -> int contract honored on parse errors
  (Copilot review #2).
  `ArgumentParser.parse_args` raises `SystemExit` for usage errors,
  `--help` and `--version`. `main` now traps those and returns
  the exit code so programmatic and agent callers always get an int.
  Tests cover `--version` (rc=0), `--help` (rc=0), no-subcommand
  (rc=2) and unknown-flag (rc=2).

* Real subprocess test for python -m featcopilot
  (Copilot review #3).
  `test_dunder_main_subprocess_invocation` and
  `test_dunder_main_subprocess_version_flag` spawn a real
  `python -m featcopilot ...` subprocess and assert stdout JSON,
  so a regression in `__main__.py` actually breaks the suite.

* Parquet `ImportError` -> clean exit 2 (Codex P2).
  `_read_table`/`_write_table` now wrap parquet calls and convert
  `ImportError` into a `ValueError` with a friendly install hint;
  the top-level handler routes that to the deterministic `exit 2`
  user-error path instead of the generic `exit 1` backstop.
  `test_transform_parquet_missing_engine_returns_exit_2` exercises
  this via `monkeypatch` of `DataFrame.to_parquet`.

* Pre-commit black: re-applied formatting from the pinned
  `black 24.1.1` hook (two multi-line `raise` statements are now
  collapsed onto single lines) so the CI pre-commit job passes.

Tests: 23 (+5 new) in tests/test_cli.py, 796 passed full suite.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 featcopilot/cli.py |  76 ++++++++++++++++++++++-----
 tests/test_cli.py  | 124 +++++++++++++++++++++++++++++++++++++--------
 2 files changed, 166 insertions(+), 34 deletions(-)

diff --git a/featcopilot/cli.py b/featcopilot/cli.py
index 89fcdf4..33b85e5 100644
--- a/featcopilot/cli.py
+++ b/featcopilot/cli.py
@@ -70,9 +70,7 @@ def _detect_format(path: Path, override: str | None) -> str:
     if override is not None:
         fmt = override.lower()
         if fmt not in SUPPORTED_INPUT_FORMATS:
-            raise ValueError(
-                f"Unsupported format {override!r}; expected one of {SUPPORTED_INPUT_FORMATS}"
-            )
+            raise ValueError(f"Unsupported format {override!r}; expected one of {SUPPORTED_INPUT_FORMATS}")
         return fmt
 
     suffix = path.suffix.lower().lstrip(".")
@@ -87,13 +85,25 @@ def _detect_format(path: Path, override: str | None) -> str:
 
 
 def _read_table(path: Path, fmt: str):
-    """Read a tabular file into a pandas DataFrame."""
+    """Read a tabular file into a pandas DataFrame.
+
+    For optional ``parquet`` engines (``pyarrow``/``fastparquet``), a missing
+    dependency is converted into a :class:`ValueError` so the CLI's top-level
+    error handler can route it to the deterministic ``exit 2`` user-error path
+    rather than the generic ``exit 1`` backstop.
+    """
     import pandas as pd
 
     if fmt == "csv":
         return pd.read_csv(path)
     if fmt == "parquet":
-        return pd.read_parquet(path)
+        try:
+            return pd.read_parquet(path)
+        except ImportError as exc:
+            raise ValueError(
+                f"Reading parquet requires a parquet engine (pyarrow or fastparquet); "
+                f"install one of them, or convert the input to CSV/JSON. Original error: {exc}"
+            ) from exc
     if fmt == "json":
         # ``orient='records'`` is the agent-friendly default; fall back to
         # pandas' auto-detection when the file isn't a records list.
@@ -105,13 +115,23 @@ def _read_table(path: Path, fmt: str):
 
 
 def _write_table(df, path: Path, fmt: str) -> None:
-    """Write a pandas DataFrame to ``path`` in ``fmt``."""
+    """Write a pandas DataFrame to ``path`` in ``fmt``.
+
+    Parquet ``ImportError`` is normalized to :class:`ValueError` so the CLI
+    surfaces a clean dependency message via the standard ``exit 2`` path.
+    """
     path.parent.mkdir(parents=True, exist_ok=True)
 
     if fmt == "csv":
         df.to_csv(path, index=False)
     elif fmt == "parquet":
-        df.to_parquet(path, index=False)
+        try:
+            df.to_parquet(path, index=False)
+        except ImportError as exc:
+            raise ValueError(
+                f"Writing parquet requires a parquet engine (pyarrow or fastparquet); "
+                f"install one of them, or pick CSV/JSON via --output-format. Original error: {exc}"
+            ) from exc
     elif fmt == "json":
         df.to_json(path, orient="records", indent=2)
     else:
@@ -128,9 +148,7 @@ def _load_config(config_path: str | None) -> dict[str, Any]:
     with path.open("r", encoding="utf-8") as fh:
         data = json.load(fh)
     if not isinstance(data, dict):
-        raise ValueError(
-            f"Config file {config_path!r} must contain a JSON object at the top level"
-        )
+        raise ValueError(f"Config file {config_path!r} must contain a JSON object at the top level")
     return data
 
 
@@ -262,7 +280,17 @@ def _cmd_transform(args: argparse.Namespace) -> int:
 
 
 def _cmd_explain(args: argparse.Namespace) -> int:
-    """Fit engines and print feature explanations + code as JSON."""
+    """Fit + transform engines and print feature explanations + code as JSON.
+
+    The built-in engines populate their internal feature-name registry during
+    :meth:`transform`, not :meth:`fit` (planning happens in ``fit`` but feature
+    objects are materialized in ``transform``). We therefore call
+    :meth:`AutoFeatureEngineer.fit_transform` so ``get_feature_names()``,
+    :meth:`explain_features` and :meth:`get_feature_code` all return the
+    actual generated features. Selection is intentionally skipped here so the
+    payload describes every candidate feature the engines produced, not just
+    the post-selection survivors.
+    """
     input_path = Path(args.input)
     if not input_path.exists():
         raise FileNotFoundError(f"Input file not found: {args.input}")
@@ -272,11 +300,12 @@ def _cmd_explain(args: argparse.Namespace) -> int:
     X, y = _split_xy(df, args.target)
 
     engineer = _build_engineer(args)
-    engineer.fit(
+    engineer.fit_transform(
         X,
         y,
         task_description=args.task_description or "prediction task",
         target_name=args.target,
+        apply_selection=False,
     )
 
     explanations = engineer.explain_features()
@@ -427,10 +456,29 @@ def main(argv: list[str] | None = None) -> int:
     """CLI entry point.
 
     Returns the process exit code; suitable for both the ``console_scripts``
-    entry point (``featcopilot``) and ``python -m featcopilot``.
+    entry point (``featcopilot``) and ``python -m featcopilot``. Argparse
+    usage errors (missing subcommand, unknown flag) and the cooperative
+    ``--help`` / ``--version`` actions all normally raise :class:`SystemExit`;
+    we trap those here and return their exit code so that programmatic
+    callers (and agent harnesses) get a consistent integer-returning API.
     """
     parser = _build_parser()
-    args = parser.parse_args(argv)
+
+    try:
+        args = parser.parse_args(argv)
+    except SystemExit as exc:
+        # argparse uses SystemExit(0) for ``--help`` / ``--version`` and
+        # SystemExit(2) for usage errors (also writing to stderr). We let the
+        # output through but convert the exit into a return value so
+        # ``main(argv) -> int`` is honored even on parse-time failures.
+        code = exc.code
+        if code is None:
+            return 0
+        if isinstance(code, int):
+            return code
+        # Non-int code (e.g. error string): print to stderr, return 2.
+        sys.stderr.write(f"{code}\n")
+        return 2
 
     try:
         return args.func(args)
diff --git a/tests/test_cli.py b/tests/test_cli.py
index 07d5aad..3c25a65 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -67,10 +67,10 @@ def test_info_text_mode_is_human_readable():
 
 
 def test_top_level_version_flag(capsys):
-    # ``argparse`` ``--version`` action prints to stdout and SystemExits 0.
-    with pytest.raises(SystemExit) as exc:
-        fc_cli.main(["--version"])
-    assert exc.value.code == 0
+    # ``--version`` (argparse action) prints to stdout; main() now traps the
+    # SystemExit and returns the code so the API contract is consistent.
+    rc = fc_cli.main(["--version"])
+    assert rc == 0
     assert __version__ in capsys.readouterr().out
 
 
@@ -132,9 +132,7 @@ def test_transform_include_target_round_trip(tmp_path: Path, tabular_csv: Path):
 def test_transform_parquet_round_trip(tmp_path: Path):
     pytest.importorskip("pyarrow")
     rng = np.random.default_rng(0)
-    df = pd.DataFrame(
-        {"a": rng.normal(size=120), "b": rng.normal(size=120), "y": rng.integers(0, 2, size=120)}
-    )
+    df = pd.DataFrame({"a": rng.normal(size=120), "b": rng.normal(size=120), "y": rng.integers(0, 2, size=120)})
     in_path = tmp_path / "in.parquet"
     out_path = tmp_path / "out.parquet"
     df.to_parquet(in_path, index=False)
@@ -162,9 +160,7 @@ def test_transform_parquet_round_trip(tmp_path: Path):
 
 def test_transform_json_round_trip(tmp_path: Path):
     rng = np.random.default_rng(0)
-    df = pd.DataFrame(
-        {"a": rng.normal(size=80), "b": rng.normal(size=80), "y": rng.integers(0, 2, size=80)}
-    )
+    df = pd.DataFrame({"a": rng.normal(size=80), "b": rng.normal(size=80), "y": rng.integers(0, 2, size=80)})
     in_path = tmp_path / "in.json"
     out_path = tmp_path / "out.json"
     df.to_json(in_path, orient="records")
@@ -356,11 +352,24 @@ def test_invalid_config_file_returns_exit_2(tmp_path: Path, tabular_csv: Path):
     assert "JSON object" in err
 
 
-def test_no_subcommand_exits_nonzero():
-    # argparse SystemExits with code 2 when ``required=True`` subparser is missing.
-    with pytest.raises(SystemExit) as exc:
-        fc_cli.main([])
-    assert exc.value.code == 2
+def test_no_subcommand_exits_nonzero(capsys):
+    # main() now returns the argparse-reported exit code (2 for usage error)
+    # rather than letting SystemExit propagate, so programmatic callers get
+    # an integer back even on parse-time failures.
+    rc = fc_cli.main([])
+    assert rc == 2
+
+
+def test_unknown_flag_returns_exit_2(capsys):
+    rc = fc_cli.main(["transform", "--no-such-flag"])
+    assert rc == 2
+
+
+def test_help_flag_returns_zero(capsys):
+    rc = fc_cli.main(["--help"])
+    assert rc == 0
+    captured = capsys.readouterr()
+    assert "featcopilot" in captured.out
 
 
 # ------------------------------------------------------------------ explain
@@ -381,17 +390,92 @@ def test_explain_emits_json_payload(tmp_path: Path, tabular_csv: Path):
     assert payload["status"] == "ok"
     assert payload["engines"] == ["tabular"]
     assert isinstance(payload["features"], list)
+    # The tabular engine actually generates derived features, and the explain
+    # subcommand must materialize them by running the full fit_transform
+    # pipeline (engines populate _feature_names during transform()).
+    assert payload["n_features"] > 0
+    assert len(payload["features"]) == payload["n_features"]
     # Each feature entry is a dict with the expected keys.
-    if payload["features"]:
-        entry = payload["features"][0]
-        assert {"name", "explanation", "code"} <= set(entry.keys())
+    entry = payload["features"][0]
+    assert {"name", "explanation", "code"} <= set(entry.keys())
+    assert entry["name"]
 
 
-# ------------------------------------------------------------ python -m entry
+# --------------------------------------------------------------- parquet path
+
+
+def test_transform_parquet_missing_engine_returns_exit_2(tmp_path, tabular_csv, monkeypatch):
+    """When pyarrow/fastparquet is missing, the CLI should surface a clean
+    user-facing dependency error (exit 2) rather than the generic exit 1
+    backstop.
+    """
+    import pandas as pd
+
+    def _raise_import_error(self, *args, **kwargs):  # noqa: ANN001
+        raise ImportError("Missing optional dependency 'pyarrow' (simulated)")
+
+    monkeypatch.setattr(pd.DataFrame, "to_parquet", _raise_import_error, raising=True)
+
+    out_path = tmp_path / "out.parquet"
+    rc, _, err = _run(
+        [
+            "transform",
+            "--input",
+            str(tabular_csv),
+            "--output",
+            str(out_path),
+            "--target",
+            "y",
+            "--max-features",
+            "5",
+        ]
+    )
+    assert rc == 2
+    assert "parquet engine" in err.lower()
+
+
+# --------------------------------------------------------------- python -m
 
 
 def test_dunder_main_module_runs(monkeypatch, capsys):
-    """``python -m featcopilot info --json`` is exercised via the CLI entry."""
+    """``cli.main`` is invoked via the same code path as ``python -m featcopilot``."""
     monkeypatch.setattr(sys, "argv", ["featcopilot", "info", "--json"])
     rc = fc_cli.main(["info", "--json"])
     assert rc == 0
+
+
+def test_dunder_main_subprocess_invocation():
+    """``python -m featcopilot info --json`` must succeed in a real subprocess.
+
+    Exercises ``featcopilot/__main__.py`` end-to-end so a regression in
+    module-form invocation (e.g. a broken import path) actually breaks the
+    test, not just the unit-level call to ``cli.main``.
+    """
+    import subprocess
+
+    result = subprocess.run(
+        [sys.executable, "-m", "featcopilot", "info", "--json"],
+        capture_output=True,
+        text=True,
+        timeout=60,
+        check=False,
+    )
+    assert result.returncode == 0, result.stderr
+    payload = json.loads(result.stdout)
+    assert payload["version"] == __version__
+    assert "tabular" in payload["supported_engines"]
+
+
+def test_dunder_main_subprocess_version_flag():
+    """``python -m featcopilot --version`` must print and exit 0."""
+    import subprocess
+
+    result = subprocess.run(
+        [sys.executable, "-m", "featcopilot", "--version"],
+        capture_output=True,
+        text=True,
+        timeout=30,
+        check=False,
+    )
+    assert result.returncode == 0, result.stderr
+    assert __version__ in result.stdout

From 72a07571e62abb23f8baa103e1c38cde6a131470 Mon Sep 17 00:00:00 2001
From: Li Jiang <lijiang1@microsoft.com>
Date: Sun, 3 May 2026 15:10:46 +0800
Subject: [PATCH 03/30] chore: re-trigger automated PR review

No code changes. Triggers another round of automated review from the
Copilot/Codex review bots so the full PR (not just the latest commit)
can be re-evaluated against the current head after round-1 fixes in
b999555 (which addressed all 5 review comments and resolved all 5
review threads).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>

From 92fcd090fe170858df8070eb85f3e4c43592e59e Mon Sep 17 00:00:00 2001
From: Li Jiang <lijiang1@microsoft.com>
Date: Sun, 3 May 2026 18:06:17 +0800
Subject: [PATCH 04/30] fix(cli): address round-2 review feedback
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Addresses all five new review comments from copilot-pull-request-reviewer
on PR #5:

* info now reflects parquet engine availability at runtime.
  `_parquet_engine_available()` probes `pyarrow` / `fastparquet`;
  parquet is included in `supported_input_formats` /
  `supported_output_formats` only when an engine is importable. Adds a
  new `"parquet_available"` boolean in the payload so agents can branch
  on it deterministically. Tests cover both the engine-present and
  engine-missing cases via `monkeypatch`.

* _build_engineer no longer mangles misconfigured configs.
  Removed the `list(...)` coercion of `engines` /
  `selection_methods`: a misconfigured `"engines": "tabular"` (string)
  used to be silently expanded into `['t','a','b','u','l','a','r']` and
  bubble up as a confusing "unknown engines" error. It now flows straight
  into `AutoFeatureEngineer.__init__`'s precise type validation
  ("engines must be a list or tuple of strings"), surfaced via the
  standard exit-2 user-error path.

* Empty config lists now produce the documented exit 2.
  Replaced `pick(...) or [default]` with a tri-state `pick` that
  honors explicit empty values from the config: `"engines": []` /
  `"selection_methods": []` now propagate into the transformer where
  `_validate_configuration` raises "must contain at least one ..." and
  the CLI returns exit 2 — instead of being silently rewritten into the
  defaults.

* README parquet caveat. Switched the example to CSV/CSV (no surprise
  dependency for base installs) and added an explicit note that parquet
  I/O requires installing `pyarrow` or `fastparquet`, with a pointer
  to `info`'s `parquet_available` flag for runtime detection.

* Console script test. New `test_console_script_subprocess_invocation`
  and `test_console_script_version_flag` use `shutil.which` to locate
  the installed `featcopilot` script and run it through `subprocess`;
  a typo or packaging regression in `[project.scripts]` now actually
  breaks the suite. Tests skip cleanly when the script is not on PATH.

Tests: 30 (+7 new) in tests/test_cli.py; 803 passed in the full suite.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 README.md          |  11 +++-
 featcopilot/cli.py |  75 ++++++++++++++++++----
 tests/test_cli.py  | 156 ++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 227 insertions(+), 15 deletions(-)

diff --git a/README.md b/README.md
index 157215b..280db2d 100644
--- a/README.md
+++ b/README.md
@@ -121,9 +121,9 @@ non-zero exit code so agents can parse failures deterministically.
 # Discover capabilities (engines, selection methods, I/O formats)
 featcopilot info --json
 
-# Run feature engineering on a CSV / Parquet / JSON file
+# Run feature engineering on a CSV / JSON file
 featcopilot transform \
-    --input data.csv --target label --output features.parquet \
+    --input data.csv --target label --output features.csv \
     --engines tabular --max-features 50 --json
 
 # Inspect generated features (name, explanation, code) as JSON for an LLM
@@ -136,6 +136,13 @@ python -m featcopilot info --json
 Pass `--config config.json` to provide nested keys such as `llm_config`;
 explicit CLI flags override values from the config file.
 
+> **Parquet I/O.** FeatCopilot's base install does not pin a parquet engine.
+> To use `--input file.parquet` / `--output file.parquet` (or the
+> `parquet` value in `--input-format` / `--output-format`), install one of
+> `pyarrow` or `fastparquet`. `featcopilot info --json` reports
+> `"parquet_available": true` only when an engine is importable in the
+> current environment.
+
 ## Engines
 
 ### Tabular Engine
diff --git a/featcopilot/cli.py b/featcopilot/cli.py
index 33b85e5..8c114ef 100644
--- a/featcopilot/cli.py
+++ b/featcopilot/cli.py
@@ -52,6 +52,29 @@
 SUPPORTED_OUTPUT_FORMATS = ("csv", "parquet", "json")
 
 
+def _parquet_engine_available() -> bool:
+    """Return ``True`` if a parquet engine (pyarrow or fastparquet) can be imported.
+
+    FeatCopilot's base install pins neither ``pyarrow`` nor ``fastparquet``;
+    parquet I/O is therefore opportunistic. ``info`` uses this probe so the
+    machine-readable capability output reflects what will actually work in
+    the current environment, rather than always advertising parquet.
+    """
+    try:
+        import pyarrow  # noqa: F401
+
+        return True
+    except ImportError:
+        pass
+    try:
+        import fastparquet  # noqa: F401
+
+        return True
+    except ImportError:
+        pass
+    return False
+
+
 def _detect_format(path: Path, override: str | None) -> str:
     """Return one of ``SUPPORTED_INPUT_FORMATS`` for ``path``.
 
@@ -167,20 +190,30 @@ def _emit(payload: dict[str, Any], *, as_json: bool, stream=None) -> None:
 def _build_engineer(args: argparse.Namespace) -> AutoFeatureEngineer:
     """Construct an :class:`AutoFeatureEngineer` from parsed CLI args.
 
-    Precedence: explicit CLI flags override values from ``--config``.
+    Precedence: explicit CLI flags override values from ``--config``;
+    explicit config values (including empty lists) override the defaults.
+    Empty / non-list values are propagated unchanged so that
+    :meth:`AutoFeatureEngineer._validate_configuration` produces its
+    canonical (and deterministic) error path — the CLI's wrapper must not
+    silently rewrite a misconfigured config into something that looks
+    different from what the user wrote.
     """
     config = _load_config(args.config)
 
     def pick(flag_value, config_key, default):
+        # Explicit CLI flag wins. Otherwise honor an explicit config entry
+        # — even a falsy one such as ``[]`` — so AutoFeatureEngineer can
+        # raise its own clear "must contain at least one" error rather than
+        # the CLI silently swapping in defaults. Only fall back to the
+        # default when the key is *absent* from the config.
         if flag_value is not None:
             return flag_value
-        return config.get(config_key, default)
+        if config_key in config:
+            return config[config_key]
+        return default
 
-    engines = pick(args.engines, "engines", None) or ["tabular"]
-    selection_methods = pick(args.selection_methods, "selection_methods", None) or [
-        "mutual_info",
-        "importance",
-    ]
+    engines = pick(args.engines, "engines", ["tabular"])
+    selection_methods = pick(args.selection_methods, "selection_methods", ["mutual_info", "importance"])
     max_features = pick(args.max_features, "max_features", None)
     correlation_threshold = pick(args.correlation_threshold, "correlation_threshold", 0.85)
     leakage_guard = pick(args.leakage_guard, "leakage_guard", "warn")
@@ -188,10 +221,16 @@ def pick(flag_value, config_key, default):
     llm_config = config.get("llm_config", {}) or {}
     verbose = bool(pick(args.verbose, "verbose", False))
 
+    # Pass ``engines`` / ``selection_methods`` through *unchanged* (no
+    # ``list(...)`` wrapping). Coercion would convert a misconfigured
+    # JSON string like ``"tabular"`` into ``['t','a','b','u','l','a','r']``,
+    # turning a clear type error into a confusing "Unknown engines" path.
+    # AutoFeatureEngineer.__init__ rejects non-list/tuple inputs with a
+    # precise message — let it.
     return AutoFeatureEngineer(
-        engines=list(engines),
+        engines=engines,
         max_features=max_features,
-        selection_methods=list(selection_methods),
+        selection_methods=selection_methods,
         correlation_threshold=correlation_threshold,
         llm_config=llm_config,
         verbose=verbose,
@@ -215,14 +254,26 @@ def _split_xy(df, target: str | None):
 
 
 def _cmd_info(args: argparse.Namespace) -> int:
-    """Print version + supported engines/methods."""
+    """Print version + supported engines/methods.
+
+    Parquet appears in ``supported_input_formats`` / ``supported_output_formats``
+    only when an actual parquet engine (``pyarrow`` or ``fastparquet``) can
+    be imported in the current environment — otherwise the ``info`` output
+    would advertise a format that immediately fails on use, which is
+    misleading for the agentic capability-discovery the CLI is designed to
+    support.
+    """
+    parquet_ok = _parquet_engine_available()
+    input_formats = [f for f in SUPPORTED_INPUT_FORMATS if f != "parquet" or parquet_ok]
+    output_formats = [f for f in SUPPORTED_OUTPUT_FORMATS if f != "parquet" or parquet_ok]
     payload = {
         "version": __version__,
         "supported_engines": sorted(AutoFeatureEngineer.SUPPORTED_ENGINES),
         "supported_selection_methods": sorted(AutoFeatureEngineer.SUPPORTED_SELECTION_METHODS),
         "supported_leakage_guards": sorted(AutoFeatureEngineer.SUPPORTED_LEAKAGE_GUARDS),
-        "supported_input_formats": list(SUPPORTED_INPUT_FORMATS),
-        "supported_output_formats": list(SUPPORTED_OUTPUT_FORMATS),
+        "supported_input_formats": input_formats,
+        "supported_output_formats": output_formats,
+        "parquet_available": parquet_ok,
     }
     _emit(payload, as_json=args.json)
     return 0
diff --git a/tests/test_cli.py b/tests/test_cli.py
index 3c25a65..bfda149 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -53,7 +53,37 @@ def test_info_json_emits_supported_options():
     assert "tabular" in payload["supported_engines"]
     assert "mutual_info" in payload["supported_selection_methods"]
     assert "warn" in payload["supported_leakage_guards"]
-    assert set(payload["supported_input_formats"]) == {"csv", "parquet", "json"}
+    # CSV/JSON are always supported; parquet is gated on engine availability.
+    assert {"csv", "json"} <= set(payload["supported_input_formats"])
+    assert {"csv", "json"} <= set(payload["supported_output_formats"])
+    assert isinstance(payload["parquet_available"], bool)
+    if payload["parquet_available"]:
+        assert "parquet" in payload["supported_input_formats"]
+        assert "parquet" in payload["supported_output_formats"]
+    else:
+        assert "parquet" not in payload["supported_input_formats"]
+        assert "parquet" not in payload["supported_output_formats"]
+
+
+def test_info_excludes_parquet_when_engine_missing(monkeypatch):
+    """When no parquet engine can be imported, ``info`` must not advertise it."""
+    monkeypatch.setattr(fc_cli, "_parquet_engine_available", lambda: False)
+    rc, out, _ = _run(["info", "--json"])
+    assert rc == 0
+    payload = json.loads(out)
+    assert payload["parquet_available"] is False
+    assert "parquet" not in payload["supported_input_formats"]
+    assert "parquet" not in payload["supported_output_formats"]
+
+
+def test_info_includes_parquet_when_engine_present(monkeypatch):
+    monkeypatch.setattr(fc_cli, "_parquet_engine_available", lambda: True)
+    rc, out, _ = _run(["info", "--json"])
+    assert rc == 0
+    payload = json.loads(out)
+    assert payload["parquet_available"] is True
+    assert "parquet" in payload["supported_input_formats"]
+    assert "parquet" in payload["supported_output_formats"]
 
 
 def test_info_text_mode_is_human_readable():
@@ -261,6 +291,77 @@ def test_transform_cli_flags_override_config(tmp_path: Path, tabular_csv: Path):
     assert json.loads(out)["max_features"] == 12
 
 
+# ----------------------- _build_engineer config validation
+
+
+def test_string_engines_in_config_returns_clean_exit_2(tmp_path: Path, tabular_csv: Path):
+    """A misconfigured ``"engines": "tabular"`` (string instead of list) must
+    surface ``AutoFeatureEngineer``'s precise type-validation error via the
+    standard exit-2 path — *not* be silently coerced into a per-character list.
+    """
+    config_path = tmp_path / "cfg.json"
+    config_path.write_text(json.dumps({"engines": "tabular"}))
+    rc, _, err = _run(
+        [
+            "transform",
+            "--input",
+            str(tabular_csv),
+            "--output",
+            str(tmp_path / "out.csv"),
+            "--target",
+            "y",
+            "--config",
+            str(config_path),
+        ]
+    )
+    assert rc == 2
+    assert "engines must be a list or tuple" in err
+
+
+def test_empty_engines_list_in_config_returns_clean_exit_2(tmp_path: Path, tabular_csv: Path):
+    """An explicit empty ``engines`` list in the config must propagate to the
+    transformer's validation so the user sees the documented error, instead
+    of being silently rewritten into the defaults.
+    """
+    config_path = tmp_path / "cfg.json"
+    config_path.write_text(json.dumps({"engines": []}))
+    rc, _, err = _run(
+        [
+            "transform",
+            "--input",
+            str(tabular_csv),
+            "--output",
+            str(tmp_path / "out.csv"),
+            "--target",
+            "y",
+            "--config",
+            str(config_path),
+        ]
+    )
+    assert rc == 2
+    assert "at least one engine" in err.lower() or "empty sequence" in err.lower()
+
+
+def test_empty_selection_methods_list_in_config_returns_clean_exit_2(tmp_path: Path, tabular_csv: Path):
+    config_path = tmp_path / "cfg.json"
+    config_path.write_text(json.dumps({"selection_methods": []}))
+    rc, _, err = _run(
+        [
+            "transform",
+            "--input",
+            str(tabular_csv),
+            "--output",
+            str(tmp_path / "out.csv"),
+            "--target",
+            "y",
+            "--config",
+            str(config_path),
+        ]
+    )
+    assert rc == 2
+    assert "at least one method" in err.lower() or "empty sequence" in err.lower()
+
+
 # -------------------------------------------------------------- error paths
 
 
@@ -479,3 +580,56 @@ def test_dunder_main_subprocess_version_flag():
     )
     assert result.returncode == 0, result.stderr
     assert __version__ in result.stdout
+
+
+# ------------------------------------------------------- console script
+
+
+def test_console_script_subprocess_invocation():
+    """The installed ``featcopilot`` console script must be on PATH and runnable.
+
+    Exercises the ``[project.scripts] featcopilot = "featcopilot.cli:main"``
+    entry point end-to-end so a typo or packaging regression in
+    ``pyproject.toml`` would actually break the suite. Skipped when the
+    script isn't on ``PATH`` (e.g. running tests without ``pip install``).
+    """
+    import shutil
+    import subprocess
+
+    script = shutil.which("featcopilot")
+    if script is None:
+        pytest.skip(
+            "featcopilot console script not on PATH (install the package "
+            "with `pip install -e .` to exercise the entry point)"
+        )
+
+    result = subprocess.run(
+        [script, "info", "--json"],
+        capture_output=True,
+        text=True,
+        timeout=60,
+        check=False,
+    )
+    assert result.returncode == 0, result.stderr
+    payload = json.loads(result.stdout)
+    assert payload["version"] == __version__
+    assert "tabular" in payload["supported_engines"]
+
+
+def test_console_script_version_flag():
+    import shutil
+    import subprocess
+
+    script = shutil.which("featcopilot")
+    if script is None:
+        pytest.skip("featcopilot console script not on PATH")
+
+    result = subprocess.run(
+        [script, "--version"],
+        capture_output=True,
+        text=True,
+        timeout=30,
+        check=False,
+    )
+    assert result.returncode == 0, result.stderr
+    assert __version__ in result.stdout

From 0072e5be39531b1d2fd20a6d6a63664aa827fa94 Mon Sep 17 00:00:00 2001
From: Li Jiang <lijiang1@microsoft.com>
Date: Sun, 3 May 2026 18:34:30 +0800
Subject: [PATCH 05/30] fix(cli): address round-3 review feedback
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Addresses both new comments from chatgpt-codex-connector on PR #5
(commit 92fcd09):

* Validate `llm_config` is a JSON object (Codex P2).
  `_build_engineer` now type-checks the `llm_config` value from
  `--config` before forwarding it. A non-mapping (e.g. a string)
  previously made it all the way into `AutoFeatureEngineer._create_engine`
  where `self.llm_config.get(...)` raised `AttributeError` — bypassing
  the structured exit-2 user-error path and surfacing as exit 1
  `unexpected error`. The new check raises a precise `ValueError` so
  the CLI returns exit 2 with a clean stderr message.
  Test `test_non_dict_llm_config_returns_exit_2` covers this.

* Normalize `--config` user-input mistakes to exit 2 (Codex P3).
  Pointing `--config` at a directory used to raise
  `IsADirectoryError` from `path.open(...)` and fall into the generic
  exit-1 backstop. `_load_config` now:
  - rejects directories explicitly,
  - converts `json.JSONDecodeError` into a clean
    "is not valid JSON" message,
  - converts other read errors (`OSError`) into a clean
    "could not be read" message.
  All paths return exit 2 so automation can handle config errors
  consistently. Tests `test_directory_as_config_returns_exit_2` and
  `test_malformed_json_config_returns_exit_2` cover the new branches.

Tests: 33 (+3 new) in tests/test_cli.py; 806 passed in the full suite.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 featcopilot/cli.py | 44 +++++++++++++++++++++++++++---
 tests/test_cli.py  | 68 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 108 insertions(+), 4 deletions(-)

diff --git a/featcopilot/cli.py b/featcopilot/cli.py
index 8c114ef..7b13e1f 100644
--- a/featcopilot/cli.py
+++ b/featcopilot/cli.py
@@ -162,14 +162,32 @@ def _write_table(df, path: Path, fmt: str) -> None:
 
 
 def _load_config(config_path: str | None) -> dict[str, Any]:
-    """Load a JSON config file (or return an empty dict)."""
+    """Load a JSON config file (or return an empty dict).
+
+    Normalizes user-input mistakes (missing path, directory passed instead
+    of a file, invalid JSON, non-object root) into :class:`ValueError` /
+    :class:`FileNotFoundError` so the CLI's top-level error handler can
+    route them all to the deterministic ``exit 2`` user-error path
+    (rather than e.g. ``IsADirectoryError`` falling into the generic
+    ``exit 1`` "unexpected error" backstop).
+    """
     if config_path is None:
         return {}
     path = Path(config_path)
     if not path.exists():
         raise FileNotFoundError(f"Config file not found: {config_path}")
-    with path.open("r", encoding="utf-8") as fh:
-        data = json.load(fh)
+    if path.is_dir():
+        raise ValueError(f"--config expects a JSON file, but {config_path!r} is a directory.")
+    try:
+        with path.open("r", encoding="utf-8") as fh:
+            data = json.load(fh)
+    except json.JSONDecodeError as exc:
+        raise ValueError(f"Config file {config_path!r} is not valid JSON: {exc}") from exc
+    except OSError as exc:
+        # Catch-all for unreadable files (permission denied, broken symlink,
+        # etc.). Surface as a user-facing error rather than the generic
+        # exit-1 backstop.
+        raise ValueError(f"Config file {config_path!r} could not be read: {exc}") from exc
     if not isinstance(data, dict):
         raise ValueError(f"Config file {config_path!r} must contain a JSON object at the top level")
     return data
@@ -218,7 +236,25 @@ def pick(flag_value, config_key, default):
     correlation_threshold = pick(args.correlation_threshold, "correlation_threshold", 0.85)
     leakage_guard = pick(args.leakage_guard, "leakage_guard", "warn")
     gate_n_jobs = pick(args.gate_n_jobs, "gate_n_jobs", 1)
-    llm_config = config.get("llm_config", {}) or {}
+
+    # Validate ``llm_config`` is a JSON object (i.e. a Python dict) before
+    # forwarding it. Without this check, a misconfigured non-dict value
+    # would only fail at engine-construction time inside
+    # ``AutoFeatureEngineer._create_engine`` via ``self.llm_config.get(...)``,
+    # raising an ``AttributeError`` that bypasses the structured exit-2
+    # user-error path (the CLI would surface it as exit 1 "unexpected
+    # error", which is a poor agent contract for a documented config key).
+    llm_config_raw = config.get("llm_config")
+    if llm_config_raw is None:
+        llm_config: dict[str, Any] = {}
+    elif isinstance(llm_config_raw, dict):
+        llm_config = llm_config_raw
+    else:
+        raise ValueError(
+            "`llm_config` in the --config file must be a JSON object (mapping); "
+            f"got {type(llm_config_raw).__name__}={llm_config_raw!r}."
+        )
+
     verbose = bool(pick(args.verbose, "verbose", False))
 
     # Pass ``engines`` / ``selection_methods`` through *unchanged* (no
diff --git a/tests/test_cli.py b/tests/test_cli.py
index bfda149..bfa9e53 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -453,6 +453,74 @@ def test_invalid_config_file_returns_exit_2(tmp_path: Path, tabular_csv: Path):
     assert "JSON object" in err
 
 
+def test_directory_as_config_returns_exit_2(tmp_path: Path, tabular_csv: Path):
+    """Pointing ``--config`` at a directory must surface as exit 2, not the
+    generic ``exit 1`` backstop (``IsADirectoryError``).
+    """
+    cfg_dir = tmp_path / "not_a_file"
+    cfg_dir.mkdir()
+    rc, _, err = _run(
+        [
+            "transform",
+            "--input",
+            str(tabular_csv),
+            "--output",
+            str(tmp_path / "o.csv"),
+            "--target",
+            "y",
+            "--config",
+            str(cfg_dir),
+        ]
+    )
+    assert rc == 2
+    assert "directory" in err.lower()
+
+
+def test_malformed_json_config_returns_exit_2(tmp_path: Path, tabular_csv: Path):
+    bad = tmp_path / "bad.json"
+    bad.write_text("{not valid json,}")
+    rc, _, err = _run(
+        [
+            "transform",
+            "--input",
+            str(tabular_csv),
+            "--output",
+            str(tmp_path / "o.csv"),
+            "--target",
+            "y",
+            "--config",
+            str(bad),
+        ]
+    )
+    assert rc == 2
+    assert "valid json" in err.lower()
+
+
+def test_non_dict_llm_config_returns_exit_2(tmp_path: Path, tabular_csv: Path):
+    """A non-mapping ``llm_config`` (e.g. a string) must be rejected at
+    config-load time with a clean exit 2, not bubble up as an
+    ``AttributeError`` from ``.get(...)`` deep inside engine construction.
+    """
+    cfg = tmp_path / "cfg.json"
+    cfg.write_text(json.dumps({"engines": ["tabular"], "llm_config": "gpt-5"}))
+    rc, _, err = _run(
+        [
+            "transform",
+            "--input",
+            str(tabular_csv),
+            "--output",
+            str(tmp_path / "o.csv"),
+            "--target",
+            "y",
+            "--config",
+            str(cfg),
+        ]
+    )
+    assert rc == 2
+    assert "llm_config" in err
+    assert "JSON object" in err or "mapping" in err.lower()
+
+
 def test_no_subcommand_exits_nonzero(capsys):
     # main() now returns the argparse-reported exit code (2 for usage error)
     # rather than letting SystemExit propagate, so programmatic callers get

From 8240e26b50c58f03ef9c6377daa4ed55d2aa7f0e Mon Sep 17 00:00:00 2001
From: Li Jiang <lijiang1@microsoft.com>
Date: Sun, 3 May 2026 18:56:53 +0800
Subject: [PATCH 06/30] test(cli): bump patch coverage above codecov threshold

Round-3 introduced two new error-handling branches that were left
untested, which pulled patch coverage to 88.83% (just below the 88.90%
codecov target). This commit closes the gap:

* Refactored `_parquet_engine_available` to use
  `importlib.util.find_spec` so the probe is side-effect-free and
  trivially mockable. Covered by two new tests:
  - both engines absent -> False
  - fastparquet-only path -> True

* Added `test_transform_read_parquet_missing_engine_returns_exit_2`
  to exercise the symmetric read-side `ImportError` -> exit-2
  branch in `_read_table` (mirroring the existing write-side test).

* Added `test_unreadable_config_returns_exit_2` to cover the
  `OSError` branch in `_load_config` (permission denied, broken
  symlink, etc.), via `monkeypatch` of `Path.open`.

Tests: 37 (+4 new) in tests/test_cli.py.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 featcopilot/cli.py |  17 +++-----
 tests/test_cli.py  | 101 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 106 insertions(+), 12 deletions(-)

diff --git a/featcopilot/cli.py b/featcopilot/cli.py
index 7b13e1f..be8180e 100644
--- a/featcopilot/cli.py
+++ b/featcopilot/cli.py
@@ -59,20 +59,13 @@ def _parquet_engine_available() -> bool:
     parquet I/O is therefore opportunistic. ``info`` uses this probe so the
     machine-readable capability output reflects what will actually work in
     the current environment, rather than always advertising parquet.
-    """
-    try:
-        import pyarrow  # noqa: F401
 
-        return True
-    except ImportError:
-        pass
-    try:
-        import fastparquet  # noqa: F401
+    Uses :func:`importlib.util.find_spec` so the probe is side-effect-free
+    (no actual module import) and easy to mock in tests.
+    """
+    import importlib.util
 
-        return True
-    except ImportError:
-        pass
-    return False
+    return importlib.util.find_spec("pyarrow") is not None or importlib.util.find_spec("fastparquet") is not None
 
 
 def _detect_format(path: Path, override: str | None) -> str:
diff --git a/tests/test_cli.py b/tests/test_cli.py
index bfa9e53..7e96b05 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -603,6 +603,107 @@ def _raise_import_error(self, *args, **kwargs):  # noqa: ANN001
     assert "parquet engine" in err.lower()
 
 
+def test_transform_read_parquet_missing_engine_returns_exit_2(tmp_path, tabular_csv, monkeypatch):
+    """Symmetric coverage for reading a .parquet input when no engine is installed.
+
+    The CLI must convert the ``ImportError`` from ``pd.read_parquet`` into
+    the deterministic exit-2 path (with a user-facing install hint),
+    just like the write path.
+    """
+    import pandas as pd
+
+    # Make sure the input path has a .parquet suffix so format detection picks parquet.
+    fake_pq = tmp_path / "fake.parquet"
+    fake_pq.write_bytes(b"")  # contents don't matter; we'll intercept read_parquet
+
+    def _raise_import_error(*args, **kwargs):
+        raise ImportError("Missing optional dependency 'pyarrow' (simulated)")
+
+    monkeypatch.setattr(pd, "read_parquet", _raise_import_error, raising=True)
+
+    rc, _, err = _run(
+        [
+            "transform",
+            "--input",
+            str(fake_pq),
+            "--output",
+            str(tmp_path / "out.csv"),
+            "--target",
+            "y",
+        ]
+    )
+    assert rc == 2
+    assert "parquet engine" in err.lower()
+
+
+def test_parquet_engine_available_returns_false_when_neither_installed(monkeypatch):
+    """Both probes return ``None`` from ``find_spec`` -> function returns False."""
+    import importlib.util
+
+    real_find_spec = importlib.util.find_spec
+
+    def fake_find_spec(name, *args, **kwargs):
+        if name in ("pyarrow", "fastparquet"):
+            return None
+        return real_find_spec(name, *args, **kwargs)
+
+    monkeypatch.setattr(importlib.util, "find_spec", fake_find_spec)
+    assert fc_cli._parquet_engine_available() is False
+
+
+def test_parquet_engine_available_returns_true_for_fastparquet_only(monkeypatch):
+    """Even without pyarrow, finding fastparquet must report parquet as available."""
+    import importlib.util
+
+    class _FakeSpec:
+        pass
+
+    real_find_spec = importlib.util.find_spec
+
+    def fake_find_spec(name, *args, **kwargs):
+        if name == "pyarrow":
+            return None
+        if name == "fastparquet":
+            return _FakeSpec()
+        return real_find_spec(name, *args, **kwargs)
+
+    monkeypatch.setattr(importlib.util, "find_spec", fake_find_spec)
+    assert fc_cli._parquet_engine_available() is True
+
+
+def test_unreadable_config_returns_exit_2(tmp_path, tabular_csv, monkeypatch):
+    """An ``OSError`` while opening the config (permission denied, broken
+    symlink, etc.) is converted into the deterministic exit-2 path.
+    """
+    cfg = tmp_path / "cfg.json"
+    cfg.write_text("{}")
+
+    real_open = Path.open
+
+    def _raise_oserror(self, *args, **kwargs):
+        if self == cfg:
+            raise PermissionError("simulated read failure")
+        return real_open(self, *args, **kwargs)
+
+    monkeypatch.setattr(Path, "open", _raise_oserror, raising=True)
+
+    rc, _, err = _run(
+        [
+            "transform",
+            "--input",
+            str(tabular_csv),
+            "--output",
+            str(tmp_path / "out.csv"),
+            "--target",
+            "y",
+            "--config",
+            str(cfg),
+        ]
+    )
+    assert rc == 2
+    assert "could not be read" in err.lower()
+
+
 # --------------------------------------------------------------- python -m
 
 

From 88e71ea02f0f29795c1a182495d68b9f5b8d4e89 Mon Sep 17 00:00:00 2001
From: Li Jiang <lijiang1@microsoft.com>
Date: Sun, 3 May 2026 19:27:51 +0800
Subject: [PATCH 07/30] fix(cli): address round-4 review feedback
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Addresses all seven new comments from copilot-pull-request-reviewer
on PR #5 (commits 0072e5b and 8240e26):

* Module docstring example uses CSV (not parquet).
  Mirrors the README fix from round-2 so the in-file example does not
  silently rely on an optional dependency. A short note about parquet
  availability via `info`'s `parquet_available` flag was added.

* All input/output OSError paths normalize to exit 2.
  `_read_table` and `_write_table` now reject directories, catch
  `OSError` (permission denied, broken symlink, parent-dir creation
  failure, …), and convert them into `ValueError` with a precise
  message — surfaced via the standard exit-2 user-error path. The
  generic exit-1 backstop is reserved for truly unexpected errors.
  New tests cover `--input` directory, `--output` directory,
  unwritable output, and unreadable input.

* Scalar `--config` fields are type-validated.
  New `_check_scalar_type` helper rejects malformed config values
  (e.g. `"max_features": "10"`, `"correlation_threshold": "0.9"`,
  `"gate_n_jobs": "2"`, `"leakage_guard": 42`) at config-load time
  with a precise `ValueError` -> exit 2. Without this, those values
  later raised `TypeError` deep inside the estimator and surfaced as
  exit 1 `unexpected error`. Parametrized test covers six type
  mismatches including the bool-as-int trap.

* `--verbose` is a true tri-state (BooleanOptionalAction).
  `store_true` made it impossible to override a config-supplied
  `"verbose": true` back to false from the command line. Switched to
  `argparse.BooleanOptionalAction` (Python 3.9+; we require 3.10+),
  giving both `--verbose` and `--no-verbose` so the documented
  precedence rule (CLI > config > default) can be honored. Tests assert
  `--no-verbose` overrides config `true` and `--verbose`
  overrides config `false`.

* `explain` no longer advertises selection-only flags.
  `_add_engineer_args` learned an `include_selection_args=False`
  mode used by the `explain` subparser. `--selection-methods`,
  `--max-features`, `--correlation-threshold` are no longer
  accepted on `explain` (which always disables selection), so a
  user / agent can no longer silently mis-configure the call.
  `_build_engineer` uses `getattr(..., None)` to fall through to
  config / defaults when those attributes aren't present.

* `explain --target` help text fixed.
  Now says the target is used by the leakage-guard and as task
  context, and explicitly notes that selection is disabled in
  `explain`. `--target` help on `transform` is also clarified.

Tests: 53 (+16 new) in tests/test_cli.py, 826 passed full suite.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 featcopilot/cli.py | 181 ++++++++++++++++++++++++++-------
 tests/test_cli.py  | 247 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 394 insertions(+), 34 deletions(-)

diff --git a/featcopilot/cli.py b/featcopilot/cli.py
index be8180e..d6cf451 100644
--- a/featcopilot/cli.py
+++ b/featcopilot/cli.py
@@ -25,13 +25,17 @@
 
     featcopilot info --json
     featcopilot transform \\
-        --input data.csv --target label --output features.parquet \\
+        --input data.csv --target label --output features.csv \\
         --engines tabular --max-features 50 --json
     featcopilot explain --input data.csv --target label --json
 
 Equivalent module invocation::
 
     python -m featcopilot info --json
+
+Parquet I/O is supported only when ``pyarrow`` or ``fastparquet`` is
+installed (FeatCopilot's base distribution does not pin either); ``info``
+reports the runtime availability via ``parquet_available``.
 """
 
 from __future__ import annotations
@@ -103,15 +107,23 @@ def _detect_format(path: Path, override: str | None) -> str:
 def _read_table(path: Path, fmt: str):
     """Read a tabular file into a pandas DataFrame.
 
-    For optional ``parquet`` engines (``pyarrow``/``fastparquet``), a missing
-    dependency is converted into a :class:`ValueError` so the CLI's top-level
-    error handler can route it to the deterministic ``exit 2`` user-error path
-    rather than the generic ``exit 1`` backstop.
+    All user-facing failure modes (missing parquet engine, ``--input``
+    pointing at a directory, permission denied, malformed JSON/CSV,
+    decoding errors) are normalized into :class:`ValueError` so the CLI's
+    top-level handler routes them to the deterministic ``exit 2``
+    user-error path. The generic ``exit 1`` backstop is reserved for
+    truly unexpected (i.e. CLI-internal) errors.
     """
     import pandas as pd
 
+    if path.is_dir():
+        raise ValueError(f"--input expects a file, but {str(path)!r} is a directory.")
+
     if fmt == "csv":
-        return pd.read_csv(path)
+        try:
+            return pd.read_csv(path)
+        except (OSError, pd.errors.ParserError, UnicodeDecodeError) as exc:
+            raise ValueError(f"Failed to read CSV from {str(path)!r}: {exc}") from exc
     if fmt == "parquet":
         try:
             return pd.read_parquet(path)
@@ -120,26 +132,45 @@ def _read_table(path: Path, fmt: str):
                 f"Reading parquet requires a parquet engine (pyarrow or fastparquet); "
                 f"install one of them, or convert the input to CSV/JSON. Original error: {exc}"
             ) from exc
+        except OSError as exc:
+            raise ValueError(f"Failed to read parquet from {str(path)!r}: {exc}") from exc
     if fmt == "json":
         # ``orient='records'`` is the agent-friendly default; fall back to
         # pandas' auto-detection when the file isn't a records list.
         try:
             return pd.read_json(path, orient="records")
         except ValueError:
-            return pd.read_json(path)
+            try:
+                return pd.read_json(path)
+            except ValueError as exc:
+                raise ValueError(f"Failed to read JSON from {str(path)!r}: {exc}") from exc
+        except OSError as exc:
+            raise ValueError(f"Failed to read JSON from {str(path)!r}: {exc}") from exc
     raise ValueError(f"Unsupported input format: {fmt}")
 
 
 def _write_table(df, path: Path, fmt: str) -> None:
     """Write a pandas DataFrame to ``path`` in ``fmt``.
 
-    Parquet ``ImportError`` is normalized to :class:`ValueError` so the CLI
-    surfaces a clean dependency message via the standard ``exit 2`` path.
+    All user-facing failure modes (missing parquet engine, ``--output``
+    pointing at a directory, permission denied, parent-directory creation
+    failures) are normalized into :class:`ValueError` so the CLI surfaces a
+    clean stderr message via the standard ``exit 2`` path instead of the
+    generic ``exit 1`` "unexpected error" backstop.
     """
-    path.parent.mkdir(parents=True, exist_ok=True)
+    if path.exists() and path.is_dir():
+        raise ValueError(f"--output expects a file, but {str(path)!r} is an existing directory.")
+
+    try:
+        path.parent.mkdir(parents=True, exist_ok=True)
+    except OSError as exc:
+        raise ValueError(f"Cannot create parent directory for {str(path)!r}: {exc}") from exc
 
     if fmt == "csv":
-        df.to_csv(path, index=False)
+        try:
+            df.to_csv(path, index=False)
+        except OSError as exc:
+            raise ValueError(f"Failed to write CSV to {str(path)!r}: {exc}") from exc
     elif fmt == "parquet":
         try:
             df.to_parquet(path, index=False)
@@ -148,8 +179,13 @@ def _write_table(df, path: Path, fmt: str) -> None:
                 f"Writing parquet requires a parquet engine (pyarrow or fastparquet); "
                 f"install one of them, or pick CSV/JSON via --output-format. Original error: {exc}"
             ) from exc
+        except OSError as exc:
+            raise ValueError(f"Failed to write parquet to {str(path)!r}: {exc}") from exc
     elif fmt == "json":
-        df.to_json(path, orient="records", indent=2)
+        try:
+            df.to_json(path, orient="records", indent=2)
+        except OSError as exc:
+            raise ValueError(f"Failed to write JSON to {str(path)!r}: {exc}") from exc
     else:
         raise ValueError(f"Unsupported output format: {fmt}")
 
@@ -198,6 +234,36 @@ def _emit(payload: dict[str, Any], *, as_json: bool, stream=None) -> None:
     stream.flush()
 
 
+def _check_scalar_type(
+    name: str,
+    value: Any,
+    expected: tuple[type, ...],
+    *,
+    allow_none: bool = False,
+    allow_bool: bool = True,
+) -> None:
+    """Validate a scalar value's type for ``--config``-supplied keys.
+
+    Raises :class:`ValueError` (caught by ``main()`` -> exit 2) when the
+    value's type does not match. ``bool`` is a subclass of ``int`` in
+    Python; pass ``allow_bool=False`` to reject ``True``/``False`` for
+    numeric-only fields like ``max_features`` / ``correlation_threshold``.
+    """
+    if value is None:
+        if allow_none:
+            return
+        raise ValueError(f"`{name}` must not be null in --config")
+    if not allow_bool and isinstance(value, bool):
+        raise ValueError(
+            f"`{name}` in --config must be a {' or '.join(t.__name__ for t in expected)}; " f"got bool={value!r}."
+        )
+    if not isinstance(value, expected):
+        raise ValueError(
+            f"`{name}` in --config must be a {' or '.join(t.__name__ for t in expected)}; "
+            f"got {type(value).__name__}={value!r}."
+        )
+
+
 def _build_engineer(args: argparse.Namespace) -> AutoFeatureEngineer:
     """Construct an :class:`AutoFeatureEngineer` from parsed CLI args.
 
@@ -224,12 +290,29 @@ def pick(flag_value, config_key, default):
         return default
 
     engines = pick(args.engines, "engines", ["tabular"])
-    selection_methods = pick(args.selection_methods, "selection_methods", ["mutual_info", "importance"])
-    max_features = pick(args.max_features, "max_features", None)
-    correlation_threshold = pick(args.correlation_threshold, "correlation_threshold", 0.85)
+    # ``explain`` does not expose selection-only flags on argparse, so use
+    # ``getattr(..., None)`` to safely fall through to config / defaults
+    # without requiring the attribute to exist on the namespace.
+    selection_methods = pick(
+        getattr(args, "selection_methods", None),
+        "selection_methods",
+        ["mutual_info", "importance"],
+    )
+    max_features = pick(getattr(args, "max_features", None), "max_features", None)
+    correlation_threshold = pick(getattr(args, "correlation_threshold", None), "correlation_threshold", 0.85)
     leakage_guard = pick(args.leakage_guard, "leakage_guard", "warn")
     gate_n_jobs = pick(args.gate_n_jobs, "gate_n_jobs", 1)
 
+    # Type-check scalar config fields here so the CLI surfaces a clean
+    # exit-2 error instead of a downstream ``TypeError`` (e.g. from
+    # ``self.max_features <= 0`` when the JSON config supplied a string).
+    # ``argparse`` already enforces types for the flag side; this only
+    # guards against malformed ``--config`` JSON.
+    _check_scalar_type("max_features", max_features, (int,), allow_none=True, allow_bool=False)
+    _check_scalar_type("correlation_threshold", correlation_threshold, (int, float), allow_bool=False)
+    _check_scalar_type("gate_n_jobs", gate_n_jobs, (int,), allow_bool=False)
+    _check_scalar_type("leakage_guard", leakage_guard, (str,))
+
     # Validate ``llm_config`` is a JSON object (i.e. a Python dict) before
     # forwarding it. Without this check, a misconfigured non-dict value
     # would only fail at engine-construction time inside
@@ -466,16 +549,23 @@ def _build_parser() -> argparse.ArgumentParser:
         "explain",
         help="Print JSON feature explanations and code for agent consumption.",
         description="Fit AutoFeatureEngineer on INPUT and emit a JSON document "
-        "describing each generated feature (name, explanation, code).",
+        "describing each generated feature (name, explanation, code). Selection is "
+        "intentionally disabled, so all candidate features are reported.",
     )
     p_explain.add_argument("--input", "-i", required=True, help="Path to input file (CSV / Parquet / JSON).")
     p_explain.add_argument("--input-format", choices=SUPPORTED_INPUT_FORMATS, help="Override input format detection.")
-    p_explain.add_argument("--target", "-t", help="Target column name (required for selection).")
+    p_explain.add_argument(
+        "--target",
+        "-t",
+        help="Target column name. Used by leakage-guard checks and as task context "
+        "for the LLM engine. (Selection is disabled in `explain`, so this flag "
+        "does not gate selector behavior.)",
+    )
     p_explain.add_argument(
         "--task-description",
         help="Natural-language ML task description (used by the LLM engine).",
     )
-    _add_engineer_args(p_explain)
+    _add_engineer_args(p_explain, include_selection_args=False)
     p_explain.add_argument("--json", action="store_true", help="(Always JSON — flag accepted for symmetry.)")
     p_explain.set_defaults(func=_cmd_explain)
 
@@ -487,33 +577,47 @@ def _add_io_args(p: argparse.ArgumentParser) -> None:
     p.add_argument("--output", "-o", required=True, help="Path to output file (CSV / Parquet / JSON).")
     p.add_argument("--input-format", choices=SUPPORTED_INPUT_FORMATS, help="Override input format detection.")
     p.add_argument("--output-format", choices=SUPPORTED_OUTPUT_FORMATS, help="Override output format detection.")
-    p.add_argument("--target", "-t", help="Target column name (required for selection).")
+    p.add_argument(
+        "--target",
+        "-t",
+        help="Target column name. Required when selection is applied (the default; "
+        "use --no-selection to skip selection entirely).",
+    )
     p.add_argument(
         "--task-description",
         help="Natural-language ML task description (used by the LLM engine).",
     )
 
 
-def _add_engineer_args(p: argparse.ArgumentParser) -> None:
-    """Add ``AutoFeatureEngineer``-related flags to a subparser."""
+def _add_engineer_args(p: argparse.ArgumentParser, *, include_selection_args: bool = True) -> None:
+    """Add ``AutoFeatureEngineer``-related flags to a subparser.
+
+    ``include_selection_args=False`` omits selection-only flags
+    (``--selection-methods``, ``--correlation-threshold``,
+    ``--max-features``) — these would be silently ignored by the
+    ``explain`` subcommand, which always runs with selection disabled,
+    and surfacing them in ``--help`` would be a confusing API for
+    automation.
+    """
     p.add_argument(
         "--engines",
         nargs="+",
         choices=sorted(AutoFeatureEngineer.SUPPORTED_ENGINES),
         help="Engines to use (default: tabular).",
     )
-    p.add_argument(
-        "--selection-methods",
-        nargs="+",
-        choices=sorted(AutoFeatureEngineer.SUPPORTED_SELECTION_METHODS),
-        help="Selection methods (default: mutual_info importance).",
-    )
-    p.add_argument("--max-features", type=int, help="Maximum number of features to keep.")
-    p.add_argument(
-        "--correlation-threshold",
-        type=float,
-        help="Maximum pairwise correlation in redundancy elimination (default: 0.85).",
-    )
+    if include_selection_args:
+        p.add_argument(
+            "--selection-methods",
+            nargs="+",
+            choices=sorted(AutoFeatureEngineer.SUPPORTED_SELECTION_METHODS),
+            help="Selection methods (default: mutual_info importance).",
+        )
+        p.add_argument("--max-features", type=int, help="Maximum number of features to keep.")
+        p.add_argument(
+            "--correlation-threshold",
+            type=float,
+            help="Maximum pairwise correlation in redundancy elimination (default: 0.85).",
+        )
     p.add_argument(
         "--leakage-guard",
         choices=sorted(AutoFeatureEngineer.SUPPORTED_LEAKAGE_GUARDS),
@@ -529,7 +633,16 @@ def _add_engineer_args(p: argparse.ArgumentParser) -> None:
         help="Path to a JSON config file. CLI flags take precedence over config keys. "
         "Use this to pass nested keys such as ``llm_config``.",
     )
-    p.add_argument("--verbose", action="store_true", default=None, help="Enable verbose logging.")
+    # ``BooleanOptionalAction`` (Python 3.9+) provides both ``--verbose``
+    # and ``--no-verbose`` so a config-supplied ``"verbose": true`` can be
+    # explicitly turned off from the command line. ``default=None`` so the
+    # absence of either flag means "fall through to config / default".
+    p.add_argument(
+        "--verbose",
+        action=argparse.BooleanOptionalAction,
+        default=None,
+        help="Enable verbose logging (or --no-verbose to override config).",
+    )
 
 
 def main(argv: list[str] | None = None) -> int:
diff --git a/tests/test_cli.py b/tests/test_cli.py
index 7e96b05..eb0ffd1 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -362,6 +362,253 @@ def test_empty_selection_methods_list_in_config_returns_clean_exit_2(tmp_path: P
     assert "at least one method" in err.lower() or "empty sequence" in err.lower()
 
 
+# ----------------------- scalar config-type validation
+
+
+@pytest.mark.parametrize(
+    "key,value,fragment",
+    [
+        ("max_features", "10", "max_features"),
+        ("max_features", True, "max_features"),  # bool rejected for numeric field
+        ("correlation_threshold", "0.9", "correlation_threshold"),
+        ("correlation_threshold", True, "correlation_threshold"),
+        ("gate_n_jobs", "2", "gate_n_jobs"),
+        ("leakage_guard", 42, "leakage_guard"),
+    ],
+)
+def test_scalar_type_mismatch_in_config_returns_exit_2(tmp_path: Path, tabular_csv: Path, key, value, fragment):
+    """A malformed JSON config (string in a numeric field, etc.) must hit the
+    deterministic exit-2 user-error path with a precise message — not bubble
+    up as a downstream ``TypeError`` (exit 1).
+    """
+    config_path = tmp_path / "cfg.json"
+    config_path.write_text(json.dumps({key: value}))
+    rc, _, err = _run(
+        [
+            "transform",
+            "--input",
+            str(tabular_csv),
+            "--output",
+            str(tmp_path / "out.csv"),
+            "--target",
+            "y",
+            "--config",
+            str(config_path),
+        ]
+    )
+    assert rc == 2
+    assert fragment in err
+
+
+# ----------------------- --verbose / --no-verbose
+
+
+def test_no_verbose_overrides_config_verbose_true(tmp_path: Path, tabular_csv: Path):
+    """``--no-verbose`` (BooleanOptionalAction) must override a config-level
+    ``"verbose": true`` to false — the documented precedence rule.
+    """
+    config_path = tmp_path / "cfg.json"
+    config_path.write_text(json.dumps({"verbose": True}))
+    rc, _, err = _run(
+        [
+            "transform",
+            "--input",
+            str(tabular_csv),
+            "--output",
+            str(tmp_path / "out.csv"),
+            "--target",
+            "y",
+            "--config",
+            str(config_path),
+            "--no-verbose",
+            "--max-features",
+            "5",
+            "--json",
+        ]
+    )
+    assert rc == 0, err
+
+
+def test_verbose_overrides_config_verbose_false(tmp_path: Path, tabular_csv: Path):
+    config_path = tmp_path / "cfg.json"
+    config_path.write_text(json.dumps({"verbose": False}))
+    rc, _, err = _run(
+        [
+            "transform",
+            "--input",
+            str(tabular_csv),
+            "--output",
+            str(tmp_path / "out.csv"),
+            "--target",
+            "y",
+            "--config",
+            str(config_path),
+            "--verbose",
+            "--max-features",
+            "5",
+        ]
+    )
+    assert rc == 0, err
+
+
+# ----------------------- explain subparser doesn't expose selection-only flags
+
+
+def test_explain_rejects_selection_methods_flag(tmp_path: Path, tabular_csv: Path):
+    """``explain`` always disables selection, so accepting ``--selection-methods``
+    on the CLI would silently mis-configure the user. The subparser must not
+    advertise it.
+    """
+    rc, _, err = _run(
+        [
+            "explain",
+            "--input",
+            str(tabular_csv),
+            "--target",
+            "y",
+            "--selection-methods",
+            "mutual_info",
+        ]
+    )
+    assert rc == 2
+    assert "unrecognized" in err.lower() or "--selection-methods" in err.lower()
+
+
+def test_explain_rejects_max_features_flag(tmp_path: Path, tabular_csv: Path):
+    rc, _, err = _run(
+        [
+            "explain",
+            "--input",
+            str(tabular_csv),
+            "--target",
+            "y",
+            "--max-features",
+            "10",
+        ]
+    )
+    assert rc == 2
+
+
+def test_explain_rejects_correlation_threshold_flag(tmp_path: Path, tabular_csv: Path):
+    rc, _, err = _run(
+        [
+            "explain",
+            "--input",
+            str(tabular_csv),
+            "--target",
+            "y",
+            "--correlation-threshold",
+            "0.9",
+        ]
+    )
+    assert rc == 2
+
+
+def test_explain_target_help_no_longer_says_required_for_selection():
+    """The ``--target`` help on ``explain`` must not claim it gates selection
+    (selection is intentionally disabled in ``explain``).
+    """
+    import argparse as _argparse
+
+    parser = fc_cli._build_parser()
+    # argparse stores subparsers under a special action attribute
+    explain_parser = next(
+        action.choices["explain"] for action in parser._actions if isinstance(action, _argparse._SubParsersAction)
+    )
+    target_help = next(a.help for a in explain_parser._actions if "--target" in a.option_strings)
+    assert "required for selection" not in target_help
+    assert "leakage" in target_help.lower() or "task context" in target_help.lower()
+
+
+# ----------------------- I/O OSError normalization
+
+
+def test_input_directory_returns_exit_2(tmp_path: Path):
+    """Pointing ``--input`` at a directory must surface as exit 2."""
+    in_dir = tmp_path / "i_am_a_dir.csv"
+    in_dir.mkdir()
+    rc, _, err = _run(
+        [
+            "transform",
+            "--input",
+            str(in_dir),
+            "--output",
+            str(tmp_path / "out.csv"),
+            "--target",
+            "y",
+        ]
+    )
+    assert rc == 2
+    assert "directory" in err.lower()
+
+
+def test_output_directory_returns_exit_2(tmp_path: Path, tabular_csv: Path):
+    """Pointing ``--output`` at an existing directory must surface as exit 2."""
+    out_dir = tmp_path / "i_am_a_dir.csv"
+    out_dir.mkdir()
+    rc, _, err = _run(
+        [
+            "transform",
+            "--input",
+            str(tabular_csv),
+            "--output",
+            str(out_dir),
+            "--target",
+            "y",
+        ]
+    )
+    assert rc == 2
+    assert "directory" in err.lower()
+
+
+def test_unwritable_output_returns_exit_2(tmp_path: Path, tabular_csv: Path, monkeypatch):
+    """An ``OSError`` on write (e.g. permission denied) must surface as exit 2."""
+    import pandas as pd
+
+    def _raise_oserror(self, *args, **kwargs):
+        raise PermissionError("simulated write failure")
+
+    monkeypatch.setattr(pd.DataFrame, "to_csv", _raise_oserror, raising=True)
+
+    rc, _, err = _run(
+        [
+            "transform",
+            "--input",
+            str(tabular_csv),
+            "--output",
+            str(tmp_path / "out.csv"),
+            "--target",
+            "y",
+        ]
+    )
+    assert rc == 2
+    assert "failed to write" in err.lower()
+
+
+def test_unreadable_input_csv_returns_exit_2(tmp_path: Path, tabular_csv: Path, monkeypatch):
+    """An ``OSError`` while reading the input must surface as exit 2."""
+    import pandas as pd
+
+    def _raise_oserror(*args, **kwargs):
+        raise PermissionError("simulated read failure")
+
+    monkeypatch.setattr(pd, "read_csv", _raise_oserror, raising=True)
+
+    rc, _, err = _run(
+        [
+            "transform",
+            "--input",
+            str(tabular_csv),
+            "--output",
+            str(tmp_path / "out.csv"),
+            "--target",
+            "y",
+        ]
+    )
+    assert rc == 2
+    assert "failed to read" in err.lower()
+
+
 # -------------------------------------------------------------- error paths
 
 

From 398c9327137ccb8c91f5222f1eb73c2a5df5a952 Mon Sep 17 00:00:00 2001
From: Li Jiang <lijiang1@microsoft.com>
Date: Sun, 3 May 2026 19:49:57 +0800
Subject: [PATCH 08/30] test(cli): cover new round-4 OSError + scalar-type
 branches

Adds 6 targeted tests so codecov/patch lifts back above the 88.90%
target after round-4's expanded I/O error normalization (it dipped
to 87.85% on commit 88e71ea):

* JSON read OSError    -> exit 2 + "failed to read json"
* JSON write OSError   -> exit 2 + "failed to write json"
* Parquet read OSError -> exit 2 + "failed to read parquet"
* Parquet write OSError (vs ImportError) -> exit 2 + "failed to write parquet"
* Output parent-mkdir OSError -> exit 2 + "create parent directory"
* `_check_scalar_type` direct unit test for the `allow_none=False`
  + `value is None` branch (not naturally hit via integration since
  every scalar with `allow_none=False` has a non-None default).

Tests: 59 (+6 new) in tests/test_cli.py. Project coverage bumped from
88.83% to 89.22%.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 tests/test_cli.py | 139 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 139 insertions(+)

diff --git a/tests/test_cli.py b/tests/test_cli.py
index eb0ffd1..1be2702 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -609,6 +609,145 @@ def _raise_oserror(*args, **kwargs):
     assert "failed to read" in err.lower()
 
 
+def test_unreadable_input_json_returns_exit_2(tmp_path: Path, tabular_csv: Path, monkeypatch):
+    """``OSError`` from ``pd.read_json`` is surfaced as exit 2 too."""
+    import pandas as pd
+
+    in_path = tmp_path / "in.json"
+    in_path.write_text("[]")  # contents irrelevant; we'll intercept
+
+    def _raise_oserror(*args, **kwargs):
+        raise PermissionError("simulated read failure")
+
+    monkeypatch.setattr(pd, "read_json", _raise_oserror, raising=True)
+
+    rc, _, err = _run(
+        [
+            "transform",
+            "--input",
+            str(in_path),
+            "--output",
+            str(tmp_path / "out.csv"),
+            "--target",
+            "y",
+        ]
+    )
+    assert rc == 2
+    assert "failed to read json" in err.lower()
+
+
+def test_unreadable_input_parquet_returns_exit_2(tmp_path: Path, monkeypatch):
+    """``OSError`` from ``pd.read_parquet`` (e.g. corrupt file) is exit 2."""
+    import pandas as pd
+
+    in_path = tmp_path / "in.parquet"
+    in_path.write_bytes(b"")
+
+    def _raise_oserror(*args, **kwargs):
+        raise OSError("simulated parquet read failure")
+
+    monkeypatch.setattr(pd, "read_parquet", _raise_oserror, raising=True)
+
+    rc, _, err = _run(
+        [
+            "transform",
+            "--input",
+            str(in_path),
+            "--output",
+            str(tmp_path / "out.csv"),
+            "--target",
+            "y",
+        ]
+    )
+    assert rc == 2
+    assert "failed to read parquet" in err.lower()
+
+
+def test_unwritable_output_json_returns_exit_2(tmp_path: Path, tabular_csv: Path, monkeypatch):
+    import pandas as pd
+
+    def _raise_oserror(self, *args, **kwargs):
+        raise PermissionError("simulated json write failure")
+
+    monkeypatch.setattr(pd.DataFrame, "to_json", _raise_oserror, raising=True)
+
+    rc, _, err = _run(
+        [
+            "transform",
+            "--input",
+            str(tabular_csv),
+            "--output",
+            str(tmp_path / "out.json"),
+            "--target",
+            "y",
+        ]
+    )
+    assert rc == 2
+    assert "failed to write json" in err.lower()
+
+
+def test_unwritable_output_parquet_returns_exit_2(tmp_path: Path, tabular_csv: Path, monkeypatch):
+    """``OSError`` (vs ``ImportError``) from ``DataFrame.to_parquet`` -> exit 2."""
+    import pandas as pd
+
+    def _raise_oserror(self, *args, **kwargs):
+        raise OSError("simulated parquet write failure")
+
+    monkeypatch.setattr(pd.DataFrame, "to_parquet", _raise_oserror, raising=True)
+
+    rc, _, err = _run(
+        [
+            "transform",
+            "--input",
+            str(tabular_csv),
+            "--output",
+            str(tmp_path / "out.parquet"),
+            "--target",
+            "y",
+        ]
+    )
+    assert rc == 2
+    assert "failed to write parquet" in err.lower()
+
+
+def test_uncreatable_parent_directory_returns_exit_2(tmp_path: Path, tabular_csv: Path, monkeypatch):
+    """If creating the output's parent directory fails, exit 2 with a clean message."""
+    real_mkdir = Path.mkdir
+
+    def _raise_oserror(self, *args, **kwargs):
+        # Only fail for our test's would-be output parent so other calls (e.g.
+        # tmp_path operations under the hood) still work.
+        if "deep" in self.parts:
+            raise PermissionError("simulated mkdir failure")
+        return real_mkdir(self, *args, **kwargs)
+
+    monkeypatch.setattr(Path, "mkdir", _raise_oserror, raising=True)
+
+    rc, _, err = _run(
+        [
+            "transform",
+            "--input",
+            str(tabular_csv),
+            "--output",
+            str(tmp_path / "deep" / "nested" / "out.csv"),
+            "--target",
+            "y",
+        ]
+    )
+    assert rc == 2
+    assert "create parent directory" in err.lower()
+
+
+def test_check_scalar_type_rejects_none_when_required():
+    """Direct unit test for ``_check_scalar_type`` to exercise the
+    ``allow_none=False`` + ``value is None`` branch, which the integration
+    path doesn't naturally hit (every scalar with ``allow_none=False`` has
+    a non-None default).
+    """
+    with pytest.raises(ValueError, match="must not be null"):
+        fc_cli._check_scalar_type("foo", None, (int,), allow_none=False)
+
+
 # -------------------------------------------------------------- error paths
 
 

From bb9e77c91fcfc30e933ba893e78895e20f886c46 Mon Sep 17 00:00:00 2001
From: Li Jiang <lijiang1@microsoft.com>
Date: Sun, 3 May 2026 20:23:37 +0800
Subject: [PATCH 09/30] fix(cli): address round-5 review feedback
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Addresses all five new comments from copilot-pull-request-reviewer
and chatgpt-codex-connector on PR #5 (commit 88e71ea):

* Type-validate `verbose` from --config (Copilot LsiH, Codex LtF8 P2).
  `verbose = bool(pick(...))` silently coerced malformed config
  values like `{"verbose": "false"}` (truthy string) into `True`.
  `_build_engineer` now type-checks `verbose` via the existing
  `_check_scalar_type` helper, rejecting non-bool values with a
  clean exit-2 error consistent with the other scalar fields.
  Parametrized test covers `"true"` / `"false"` / `1` / `0`.

* Restore --max-features on `explain` (Copilot Lsia + Lsim).
  `--max-features` is *not* a selection-only flag —
  `AutoFeatureEngineer` forwards it into engine construction (e.g.
  the tabular engine uses it to cap the number of generated features),
  so removing it from `explain` deprived callers of the only
  CLI-level handle on the explanation payload size. The
  `include_selection_args=False` mode in `_add_engineer_args` now
  only excludes `--selection-methods` and `--correlation-threshold`;
  `--max-features` is exposed on every engineer-using subcommand.
  Test `test_explain_accepts_max_features_flag` asserts `explain`
  succeeds with the flag (replacing the prior reject-all test).

* Reject missing --target when selection is enabled (Copilot Lsis).
  `transform` without `--target` previously called
  `fit_transform(apply_selection=True)`, which silently no-ops the
  selector (only built when `y is not None`). The CLI now raises a
  clean `ValueError` -> exit 2 with a precise message:
  "--target is required when feature selection is applied. Pass
  --target <column>, or pass --no-selection to skip selection."
  Two tests cover the new branch (selection -> exit 2; --no-selection
  with no target -> exit 0).

Tests: 65 (+6 net) in tests/test_cli.py, 838 passed full suite.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 featcopilot/cli.py | 52 ++++++++++++++++++++++------
 tests/test_cli.py  | 85 +++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 122 insertions(+), 15 deletions(-)

diff --git a/featcopilot/cli.py b/featcopilot/cli.py
index d6cf451..c4f0baf 100644
--- a/featcopilot/cli.py
+++ b/featcopilot/cli.py
@@ -290,15 +290,17 @@ def pick(flag_value, config_key, default):
         return default
 
     engines = pick(args.engines, "engines", ["tabular"])
-    # ``explain`` does not expose selection-only flags on argparse, so use
-    # ``getattr(..., None)`` to safely fall through to config / defaults
-    # without requiring the attribute to exist on the namespace.
+    # ``explain`` exposes ``--engines`` and ``--max-features`` (engine-level
+    # caps) but not the selection-only flags ``--selection-methods`` and
+    # ``--correlation-threshold``. Use ``getattr(..., None)`` for the
+    # latter so we can fall through to config / defaults without requiring
+    # the attribute to exist on the namespace.
     selection_methods = pick(
         getattr(args, "selection_methods", None),
         "selection_methods",
         ["mutual_info", "importance"],
     )
-    max_features = pick(getattr(args, "max_features", None), "max_features", None)
+    max_features = pick(args.max_features, "max_features", None)
     correlation_threshold = pick(getattr(args, "correlation_threshold", None), "correlation_threshold", 0.85)
     leakage_guard = pick(args.leakage_guard, "leakage_guard", "warn")
     gate_n_jobs = pick(args.gate_n_jobs, "gate_n_jobs", 1)
@@ -331,7 +333,15 @@ def pick(flag_value, config_key, default):
             f"got {type(llm_config_raw).__name__}={llm_config_raw!r}."
         )
 
-    verbose = bool(pick(args.verbose, "verbose", False))
+    # ``verbose`` is type-checked before being forwarded so a malformed
+    # config like ``{"verbose": "false"}`` (truthy string) does NOT silently
+    # turn verbose mode on — instead it raises a clean exit-2 error
+    # consistent with the other scalar fields. ``args.verbose`` is already
+    # a bool / None thanks to ``BooleanOptionalAction``; only the config
+    # path can introduce a non-bool.
+    verbose_raw = pick(args.verbose, "verbose", False)
+    _check_scalar_type("verbose", verbose_raw, (bool,))
+    verbose = bool(verbose_raw)
 
     # Pass ``engines`` / ``selection_methods`` through *unchanged* (no
     # ``list(...)`` wrapping). Coercion would convert a misconfigured
@@ -404,6 +414,17 @@ def _cmd_transform(args: argparse.Namespace) -> int:
     df = _read_table(input_path, in_fmt)
     X, y = _split_xy(df, args.target)
 
+    # Selection requires a target column to fit against. Without ``--target``,
+    # ``AutoFeatureEngineer.fit_transform(apply_selection=True)`` silently
+    # degrades to an unselected run because the selector is only built when
+    # ``y is not None``. Surface that as a clean exit-2 user error rather than
+    # silently producing the same output as ``--no-selection``.
+    if not args.no_selection and args.target is None:
+        raise ValueError(
+            "--target is required when feature selection is applied. "
+            "Pass --target <column>, or pass --no-selection to skip selection."
+        )
+
     engineer = _build_engineer(args)
     transformed = engineer.fit_transform(
         X,
@@ -593,11 +614,13 @@ def _add_engineer_args(p: argparse.ArgumentParser, *, include_selection_args: bo
     """Add ``AutoFeatureEngineer``-related flags to a subparser.
 
     ``include_selection_args=False`` omits selection-only flags
-    (``--selection-methods``, ``--correlation-threshold``,
-    ``--max-features``) — these would be silently ignored by the
-    ``explain`` subcommand, which always runs with selection disabled,
-    and surfacing them in ``--help`` would be a confusing API for
-    automation.
+    (``--selection-methods`` and ``--correlation-threshold``) — these are
+    silently ignored by the ``explain`` subcommand, which always runs with
+    selection disabled. ``--max-features`` is *not* selection-only:
+    ``AutoFeatureEngineer`` forwards it into engine construction (e.g. the
+    tabular engine uses it to cap the number of generated features), so it
+    is exposed even when ``include_selection_args=False`` to give callers
+    a CLI-level handle on the engine output size.
     """
     p.add_argument(
         "--engines",
@@ -605,6 +628,14 @@ def _add_engineer_args(p: argparse.ArgumentParser, *, include_selection_args: bo
         choices=sorted(AutoFeatureEngineer.SUPPORTED_ENGINES),
         help="Engines to use (default: tabular).",
     )
+    # ``--max-features`` is exposed on every engineer-using subcommand
+    # because it caps engine output, not just selection — see the
+    # ``AutoFeatureEngineer`` constructor and ``TabularEngine``.
+    p.add_argument(
+        "--max-features",
+        type=int,
+        help="Maximum number of features to generate / keep (forwarded to engines and selector).",
+    )
     if include_selection_args:
         p.add_argument(
             "--selection-methods",
@@ -612,7 +643,6 @@ def _add_engineer_args(p: argparse.ArgumentParser, *, include_selection_args: bo
             choices=sorted(AutoFeatureEngineer.SUPPORTED_SELECTION_METHODS),
             help="Selection methods (default: mutual_info importance).",
         )
-        p.add_argument("--max-features", type=int, help="Maximum number of features to keep.")
         p.add_argument(
             "--correlation-threshold",
             type=float,
diff --git a/tests/test_cli.py b/tests/test_cli.py
index 1be2702..f32fdb3 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -451,6 +451,76 @@ def test_verbose_overrides_config_verbose_false(tmp_path: Path, tabular_csv: Pat
     assert rc == 0, err
 
 
+@pytest.mark.parametrize(
+    "value",
+    ["true", "false", 1, 0],
+)
+def test_non_bool_verbose_in_config_returns_exit_2(tmp_path: Path, tabular_csv: Path, value):
+    """A malformed ``"verbose": <non-bool>`` config must hit exit 2 with a
+    precise message, not silently turn verbose mode on/off via Python's
+    truthiness rules.
+    """
+    config_path = tmp_path / "cfg.json"
+    config_path.write_text(json.dumps({"verbose": value}))
+    rc, _, err = _run(
+        [
+            "transform",
+            "--input",
+            str(tabular_csv),
+            "--output",
+            str(tmp_path / "out.csv"),
+            "--target",
+            "y",
+            "--config",
+            str(config_path),
+            "--max-features",
+            "5",
+        ]
+    )
+    assert rc == 2
+    assert "verbose" in err
+
+
+def test_transform_missing_target_with_selection_returns_exit_2(tmp_path: Path, tabular_csv: Path):
+    """Without ``--target``, selection silently degrades to a no-op. The CLI
+    must surface that as a clean exit-2 user error so automation can react.
+    """
+    rc, _, err = _run(
+        [
+            "transform",
+            "--input",
+            str(tabular_csv),
+            "--output",
+            str(tmp_path / "out.csv"),
+            "--max-features",
+            "5",
+        ]
+    )
+    assert rc == 2
+    assert "--target" in err
+    assert "selection" in err.lower()
+
+
+def test_transform_missing_target_with_no_selection_succeeds(tmp_path: Path, tabular_csv: Path):
+    """Once selection is opted out, the missing target is no longer an error
+    (selection requires a target; raw transform doesn't).
+    """
+    # Drop the target column so we can run without --target.
+    in_path = tmp_path / "in_notarget.csv"
+    pd.read_csv(tabular_csv).drop(columns=["y"]).to_csv(in_path, index=False)
+    rc, _, err = _run(
+        [
+            "transform",
+            "--input",
+            str(in_path),
+            "--output",
+            str(tmp_path / "out.csv"),
+            "--no-selection",
+        ]
+    )
+    assert rc == 0, err
+
+
 # ----------------------- explain subparser doesn't expose selection-only flags
 
 
@@ -474,8 +544,13 @@ def test_explain_rejects_selection_methods_flag(tmp_path: Path, tabular_csv: Pat
     assert "unrecognized" in err.lower() or "--selection-methods" in err.lower()
 
 
-def test_explain_rejects_max_features_flag(tmp_path: Path, tabular_csv: Path):
-    rc, _, err = _run(
+def test_explain_accepts_max_features_flag(tmp_path: Path, tabular_csv: Path):
+    """``--max-features`` is *not* selection-only — ``AutoFeatureEngineer``
+    forwards it into engine construction (e.g. the tabular engine uses it
+    to cap how many features it generates). ``explain`` must therefore
+    expose it so callers can bound the size of the explanation payload.
+    """
+    rc, out, err = _run(
         [
             "explain",
             "--input",
@@ -483,10 +558,12 @@ def test_explain_rejects_max_features_flag(tmp_path: Path, tabular_csv: Path):
             "--target",
             "y",
             "--max-features",
-            "10",
+            "5",
         ]
     )
-    assert rc == 2
+    assert rc == 0, err
+    payload = json.loads(out)
+    assert payload["status"] == "ok"
 
 
 def test_explain_rejects_correlation_threshold_flag(tmp_path: Path, tabular_csv: Path):

From 586c51fa039e602b6a2cc032fe13e8c310348179 Mon Sep 17 00:00:00 2001
From: Li Jiang <lijiang1@microsoft.com>
Date: Sun, 3 May 2026 20:58:31 +0800
Subject: [PATCH 10/30] fix(cli): address round-6 review feedback

Addresses all three new comments from copilot-pull-request-reviewer
on PR #5 (commit bb9e77c):

* Loosen --target requirement (Copilot L5bO).
  Round-5 made --target unconditionally required when selection was
  enabled, but `AutoFeatureEngineer` only actually fits a selector
  when `y is not None` AND `max_features` is set. With the default
  `max_features=None` the call is a raw feature-generation run and
  needs no target. The CLI now mirrors that contract: `--target` is
  required only when selection is enabled AND `--max-features` is
  configured (CLI flag or config). Three new tests:
  - missing target + no max_features -> exit 0 (raw transform)
  - missing target + --max-features -> exit 2 (selection would run)
  - missing target + max_features in config -> exit 2

* explain ignores selection-only config keys (Copilot L5bU).
  `_build_engineer` learned a `include_selection_config` flag.
  `_cmd_explain` calls `_build_engineer(args, include_selection_config=False)`
  so a shared transform/explain config with `selection_methods` /
  `correlation_threshold` no longer trips `explain` over keys that
  are inert at runtime (selection is disabled in `explain`).
  `test_explain_ignores_selection_only_config_keys` covers the new
  behavior with a mixed config.

* PR description vs. shipped CLI surface (Copilot L5bc).
  The PR description on GitHub is updated separately to remove the
  stale claim that `--selection-methods` and
  `--correlation-threshold` are accepted on `explain`.

Tests: 68 (+3 new) in tests/test_cli.py, 841 passed full suite.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 featcopilot/cli.py | 62 ++++++++++++++++++++++++-----------
 tests/test_cli.py  | 80 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 123 insertions(+), 19 deletions(-)

diff --git a/featcopilot/cli.py b/featcopilot/cli.py
index c4f0baf..93ca7a1 100644
--- a/featcopilot/cli.py
+++ b/featcopilot/cli.py
@@ -264,7 +264,7 @@ def _check_scalar_type(
         )
 
 
-def _build_engineer(args: argparse.Namespace) -> AutoFeatureEngineer:
+def _build_engineer(args: argparse.Namespace, *, include_selection_config: bool = True) -> AutoFeatureEngineer:
     """Construct an :class:`AutoFeatureEngineer` from parsed CLI args.
 
     Precedence: explicit CLI flags override values from ``--config``;
@@ -274,6 +274,12 @@ def _build_engineer(args: argparse.Namespace) -> AutoFeatureEngineer:
     canonical (and deterministic) error path — the CLI's wrapper must not
     silently rewrite a misconfigured config into something that looks
     different from what the user wrote.
+
+    ``include_selection_config=False`` (used by the ``explain`` subcommand)
+    skips reading selection-only config keys (``selection_methods``,
+    ``correlation_threshold``) so a shared config file with selection
+    settings does not cause ``explain`` to fail config-validation for keys
+    that are inert at runtime (selection is disabled in ``explain``).
     """
     config = _load_config(args.config)
 
@@ -292,16 +298,22 @@ def pick(flag_value, config_key, default):
     engines = pick(args.engines, "engines", ["tabular"])
     # ``explain`` exposes ``--engines`` and ``--max-features`` (engine-level
     # caps) but not the selection-only flags ``--selection-methods`` and
-    # ``--correlation-threshold``. Use ``getattr(..., None)`` for the
-    # latter so we can fall through to config / defaults without requiring
-    # the attribute to exist on the namespace.
-    selection_methods = pick(
-        getattr(args, "selection_methods", None),
-        "selection_methods",
-        ["mutual_info", "importance"],
-    )
+    # ``--correlation-threshold``. When ``include_selection_config`` is
+    # False (i.e. we're called from ``explain``) we also skip reading the
+    # selection-only keys from the config file, so a shared transform/explain
+    # config with selection settings won't trip ``explain`` over keys that
+    # have no effect on its runtime behavior.
+    if include_selection_config:
+        selection_methods = pick(
+            getattr(args, "selection_methods", None),
+            "selection_methods",
+            ["mutual_info", "importance"],
+        )
+        correlation_threshold = pick(getattr(args, "correlation_threshold", None), "correlation_threshold", 0.85)
+    else:
+        selection_methods = ["mutual_info", "importance"]
+        correlation_threshold = 0.85
     max_features = pick(args.max_features, "max_features", None)
-    correlation_threshold = pick(getattr(args, "correlation_threshold", None), "correlation_threshold", 0.85)
     leakage_guard = pick(args.leakage_guard, "leakage_guard", "warn")
     gate_n_jobs = pick(args.gate_n_jobs, "gate_n_jobs", 1)
 
@@ -414,15 +426,27 @@ def _cmd_transform(args: argparse.Namespace) -> int:
     df = _read_table(input_path, in_fmt)
     X, y = _split_xy(df, args.target)
 
-    # Selection requires a target column to fit against. Without ``--target``,
-    # ``AutoFeatureEngineer.fit_transform(apply_selection=True)`` silently
-    # degrades to an unselected run because the selector is only built when
-    # ``y is not None``. Surface that as a clean exit-2 user error rather than
-    # silently producing the same output as ``--no-selection``.
-    if not args.no_selection and args.target is None:
+    # Selection requires a target column to fit against. ``AutoFeatureEngineer``
+    # only actually fits a selector when ``y is not None`` AND ``max_features``
+    # is set; without ``max_features`` the call is a raw feature-generation
+    # run and does not need a target. The CLI mirrors that contract: only
+    # require ``--target`` when both selection is enabled (the default) AND
+    # ``max_features`` is configured (CLI flag or config), so commands like
+    # ``featcopilot transform --input in.csv --output out.csv`` (no target,
+    # no cap) still work.
+    effective_max_features = args.max_features
+    if effective_max_features is None and args.config is not None:
+        try:
+            cfg_max = _load_config(args.config).get("max_features")
+        except (FileNotFoundError, ValueError):
+            cfg_max = None
+        if cfg_max is not None:
+            effective_max_features = cfg_max
+    if not args.no_selection and args.target is None and effective_max_features is not None:
         raise ValueError(
-            "--target is required when feature selection is applied. "
-            "Pass --target <column>, or pass --no-selection to skip selection."
+            "--target is required when feature selection is applied "
+            "(i.e. when --max-features / config max_features is set). "
+            "Pass --target <column>, or pass --no-selection / drop --max-features to skip selection."
         )
 
     engineer = _build_engineer(args)
@@ -483,7 +507,7 @@ def _cmd_explain(args: argparse.Namespace) -> int:
     df = _read_table(input_path, in_fmt)
     X, y = _split_xy(df, args.target)
 
-    engineer = _build_engineer(args)
+    engineer = _build_engineer(args, include_selection_config=False)
     engineer.fit_transform(
         X,
         y,
diff --git a/tests/test_cli.py b/tests/test_cli.py
index f32fdb3..e5bbb81 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -521,6 +521,86 @@ def test_transform_missing_target_with_no_selection_succeeds(tmp_path: Path, tab
     assert rc == 0, err
 
 
+def test_transform_missing_target_no_max_features_succeeds(tmp_path: Path, tabular_csv: Path):
+    """Without ``--max-features`` (and the corresponding config key),
+    ``AutoFeatureEngineer`` doesn't actually fit a selector even with
+    ``apply_selection=True``, so requiring ``--target`` would be a false
+    positive. Raw feature generation without target / without cap must
+    therefore succeed.
+    """
+    in_path = tmp_path / "in_notarget.csv"
+    pd.read_csv(tabular_csv).drop(columns=["y"]).to_csv(in_path, index=False)
+    rc, _, err = _run(
+        [
+            "transform",
+            "--input",
+            str(in_path),
+            "--output",
+            str(tmp_path / "out.csv"),
+        ]
+    )
+    assert rc == 0, err
+
+
+def test_transform_missing_target_max_features_in_config_returns_exit_2(tmp_path: Path, tabular_csv: Path):
+    """The ``--target`` requirement also fires when ``max_features`` comes
+    from ``--config`` (not just the CLI flag), since the selector will
+    actually run in that case.
+    """
+    config_path = tmp_path / "cfg.json"
+    config_path.write_text(json.dumps({"max_features": 5}))
+    in_path = tmp_path / "in_notarget.csv"
+    pd.read_csv(tabular_csv).drop(columns=["y"]).to_csv(in_path, index=False)
+    rc, _, err = _run(
+        [
+            "transform",
+            "--input",
+            str(in_path),
+            "--output",
+            str(tmp_path / "out.csv"),
+            "--config",
+            str(config_path),
+        ]
+    )
+    assert rc == 2
+    assert "--target" in err
+
+
+def test_explain_ignores_selection_only_config_keys(tmp_path: Path, tabular_csv: Path):
+    """A shared transform/explain config with selection-only keys
+    (``selection_methods`` / ``correlation_threshold``) must not break
+    ``explain``: those keys are inert at runtime (selection is disabled
+    in ``explain``) and ``_build_engineer(include_selection_config=False)``
+    skips reading them so config-validation does not fire.
+    """
+    config_path = tmp_path / "cfg.json"
+    # Use *valid* selection_methods values; the point is they''re ignored.
+    config_path.write_text(
+        json.dumps(
+            {
+                "engines": ["tabular"],
+                "selection_methods": ["mutual_info"],
+                "correlation_threshold": 0.5,
+                "max_features": 5,
+            }
+        )
+    )
+    rc, out, err = _run(
+        [
+            "explain",
+            "--input",
+            str(tabular_csv),
+            "--target",
+            "y",
+            "--config",
+            str(config_path),
+        ]
+    )
+    assert rc == 0, err
+    payload = json.loads(out)
+    assert payload["status"] == "ok"
+
+
 # ----------------------- explain subparser doesn't expose selection-only flags
 
 

From dc4e5b95aee8ad304d64ff8a77e15023a9c9b9e7 Mon Sep 17 00:00:00 2001
From: Li Jiang <lijiang1@microsoft.com>
Date: Sun, 3 May 2026 21:28:22 +0800
Subject: [PATCH 11/30] fix(cli): address round-7 review feedback
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Addresses both new comments from chatgpt-codex-connector on PR #5
(commit 586c51f):

* Normalize all parquet read backend errors to exit 2 (Codex P2).
  `_read_table` now catches `Exception` (not just `OSError`) for
  the parquet branch. This routes engine-level failures like
  `pyarrow.lib.ArrowInvalid` (corrupt parquet) and fastparquet's
  metadata `ValueError`s through the deterministic exit-2 path
  instead of the generic exit-1 backstop. `Exception` is the right
  scope because parquet I/O is fully delegated to a third-party
  backend; any error raised is by definition an I/O or data issue,
  not a CLI bug.

* Same broad catch for parquet write (Codex P2).
  `_write_table` parquet branch now also catches `Exception`, so
  pyarrow type / conversion errors for unsupported column values
  produce a clean exit 2 with a "Failed to write parquet to ..."
  message rather than exit 1 "unexpected error".

Two new tests use stand-in `Exception` subclasses (not `OSError`)
to verify both paths route to exit 2 — closing the previously
demonstrated gap.

Tests: 70 (+2 new) in tests/test_cli.py.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 featcopilot/cli.py | 17 ++++++++++--
 tests/test_cli.py  | 65 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 80 insertions(+), 2 deletions(-)

diff --git a/featcopilot/cli.py b/featcopilot/cli.py
index 93ca7a1..b0ae14b 100644
--- a/featcopilot/cli.py
+++ b/featcopilot/cli.py
@@ -132,7 +132,14 @@ def _read_table(path: Path, fmt: str):
                 f"Reading parquet requires a parquet engine (pyarrow or fastparquet); "
                 f"install one of them, or convert the input to CSV/JSON. Original error: {exc}"
             ) from exc
-        except OSError as exc:
+        except Exception as exc:
+            # Catch *any* backend failure (``OSError`` for I/O,
+            # ``pyarrow.lib.ArrowInvalid`` for corrupt files,
+            # ``ValueError`` from ``fastparquet`` for malformed metadata,
+            # etc.) and surface it via the deterministic exit-2 path.
+            # Catching ``Exception`` is appropriate here because the entire
+            # operation is delegated to a third-party backend; any error
+            # raised is by definition an I/O or data issue, not a CLI bug.
             raise ValueError(f"Failed to read parquet from {str(path)!r}: {exc}") from exc
     if fmt == "json":
         # ``orient='records'`` is the agent-friendly default; fall back to
@@ -179,7 +186,13 @@ def _write_table(df, path: Path, fmt: str) -> None:
                 f"Writing parquet requires a parquet engine (pyarrow or fastparquet); "
                 f"install one of them, or pick CSV/JSON via --output-format. Original error: {exc}"
             ) from exc
-        except OSError as exc:
+        except Exception as exc:
+            # Same broad-catch rationale as ``_read_table``: parquet write
+            # is fully delegated to a backend (``pyarrow``/``fastparquet``)
+            # whose errors include ``OSError`` (I/O), engine-specific type
+            # / conversion exceptions for unsupported column values, etc.
+            # All of these are user-facing data issues, not CLI bugs, so
+            # they should produce a clean exit-2 failure.
             raise ValueError(f"Failed to write parquet to {str(path)!r}: {exc}") from exc
     elif fmt == "json":
         try:
diff --git a/tests/test_cli.py b/tests/test_cli.py
index e5bbb81..a4f1715 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -867,6 +867,71 @@ def _raise_oserror(self, *args, **kwargs):
     assert "failed to write parquet" in err.lower()
 
 
+def test_parquet_read_engine_error_returns_exit_2(tmp_path: Path, monkeypatch):
+    """A non-OSError parquet *backend* error (e.g. ``pyarrow.lib.ArrowInvalid``
+    for a corrupt file) must surface as exit 2, not the generic exit 1
+    "unexpected error" backstop. The CLI catches ``Exception`` for parquet
+    operations because they are fully delegated to a third-party backend
+    whose failures are by definition user-facing data issues.
+    """
+    import pandas as pd
+
+    in_path = tmp_path / "fake.parquet"
+    in_path.write_bytes(b"\x00\x01\x02\x03")  # not a real parquet file
+
+    class _FakeArrowInvalid(Exception):
+        """Stand-in for ``pyarrow.lib.ArrowInvalid`` (also subclasses Exception)."""
+
+    def _raise_backend_error(*args, **kwargs):
+        raise _FakeArrowInvalid("simulated corrupt parquet")
+
+    monkeypatch.setattr(pd, "read_parquet", _raise_backend_error, raising=True)
+
+    rc, _, err = _run(
+        [
+            "transform",
+            "--input",
+            str(in_path),
+            "--output",
+            str(tmp_path / "out.csv"),
+            "--target",
+            "y",
+        ]
+    )
+    assert rc == 2
+    assert "failed to read parquet" in err.lower()
+
+
+def test_parquet_write_engine_error_returns_exit_2(tmp_path: Path, tabular_csv: Path, monkeypatch):
+    """Same coverage on the write side: a backend-level pyarrow exception
+    that is *not* an ``OSError`` (e.g. an unsupported column-type
+    conversion error) must produce exit 2, not exit 1.
+    """
+    import pandas as pd
+
+    class _FakeArrowTypeError(Exception):
+        pass
+
+    def _raise_backend_error(self, *args, **kwargs):
+        raise _FakeArrowTypeError("simulated unsupported column dtype for parquet")
+
+    monkeypatch.setattr(pd.DataFrame, "to_parquet", _raise_backend_error, raising=True)
+
+    rc, _, err = _run(
+        [
+            "transform",
+            "--input",
+            str(tabular_csv),
+            "--output",
+            str(tmp_path / "out.parquet"),
+            "--target",
+            "y",
+        ]
+    )
+    assert rc == 2
+    assert "failed to write parquet" in err.lower()
+
+
 def test_uncreatable_parent_directory_returns_exit_2(tmp_path: Path, tabular_csv: Path, monkeypatch):
     """If creating the output's parent directory fails, exit 2 with a clean message."""
     real_mkdir = Path.mkdir

From 459b1b93650e63646234bb4fd8f9ee220b3c6a63 Mon Sep 17 00:00:00 2001
From: Li Jiang <lijiang1@microsoft.com>
Date: Sun, 3 May 2026 22:04:40 +0800
Subject: [PATCH 12/30] fix(cli): address round-8 review feedback
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Addresses all three new comments from copilot-pull-request-reviewer
on PR #5 (commit dc4e5b9):

* stderr is reserved for failures (Copilot MIZN + MIZS).
  `AutoFeatureEngineer.fit` calls `warnings.warn(...)` for
  leakage-prone column names under the default `leakage_guard='warn'`,
  bleeding non-empty stderr onto a zero-exit success path and breaking
  the CLI's agent-friendly contract.

  New `_fit_transform_capturing_warnings` and
  `_fit_capturing_warnings` helpers wrap the engineer call in
  `warnings.catch_warnings(record=True)` and surface every captured
  message as a JSON-serializable string list under a new `warnings`
  field in the success payload — keeping stderr deterministic for
  agent / tool-use parsing. Both `transform` and `explain` use
  the helpers.

  Tests assert `stderr == ""` on a successful run with a column
  name (`label_encoded`) that triggers the leakage heuristic.

* --target check runs after type validation (Copilot MIZY).
  Round-6's pre-check used the raw `args.max_features` (and tried
  to read it from config), so a malformed value like
  `{"max_features": "5"}` (string) or `{"max_features": -1}`
  (negative) was reported as `--target is required` instead of the
  real validation error.

  `_cmd_transform` now builds the engineer FIRST (which runs all
  scalar / list / dict `_check_scalar_type` validation on the
  merged CLI + config view), then performs the `--target` check
  using the validated `engineer.max_features` attribute. Users
  with a malformed config now see the precise type error and can
  remediate; they no longer get sent down the wrong path.

  `test_invalid_max_features_in_config_takes_precedence_over_target_check`
  asserts the error mentions `max_features` and *not* `--target`.

Tests: 73 (+3 new) in tests/test_cli.py, 846 passed full suite.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 featcopilot/cli.py |  72 +++++++++++++++++++++++------
 tests/test_cli.py  | 112 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 171 insertions(+), 13 deletions(-)

diff --git a/featcopilot/cli.py b/featcopilot/cli.py
index b0ae14b..08504cc 100644
--- a/featcopilot/cli.py
+++ b/featcopilot/cli.py
@@ -426,6 +426,47 @@ def _cmd_info(args: argparse.Namespace) -> int:
     return 0
 
 
+def _fit_transform_capturing_warnings(engineer, X, y, **kwargs):
+    """Run ``engineer.fit_transform(X, y, **kwargs)`` while capturing any
+    Python ``UserWarning`` (or other warning) it emits.
+
+    The CLI contract is that stdout carries the JSON payload and stderr is
+    reserved for failures. ``AutoFeatureEngineer.fit`` calls
+    ``warnings.warn(...)`` for leakage-prone column names under the default
+    ``leakage_guard='warn'``, which would otherwise bleed onto stderr on a
+    successful run and break agent / tool-use error parsing. This helper
+    intercepts those warnings, collects them as JSON-serializable strings,
+    and lets the caller surface them inside the ``warnings`` field of the
+    success payload — keeping stderr deterministic.
+
+    Returns
+    -------
+    (warnings_list, result)
+        ``warnings_list`` is a list of ``str`` (one entry per warning, in
+        emission order). ``result`` is whatever ``fit_transform`` returned.
+    """
+    import warnings as _warnings
+
+    captured: list[str] = []
+    with _warnings.catch_warnings(record=True) as caught:
+        _warnings.simplefilter("always")
+        result = engineer.fit_transform(X, y, **kwargs)
+        captured.extend(str(w.message) for w in caught)
+    return captured, result
+
+
+def _fit_capturing_warnings(engineer, X, y, **kwargs):
+    """Sibling of :func:`_fit_transform_capturing_warnings` for explain."""
+    import warnings as _warnings
+
+    captured: list[str] = []
+    with _warnings.catch_warnings(record=True) as caught:
+        _warnings.simplefilter("always")
+        engineer.fit_transform(X, y, **kwargs)
+        captured.extend(str(w.message) for w in caught)
+    return captured
+
+
 def _cmd_transform(args: argparse.Namespace) -> int:
     """Read input, fit/transform, write output."""
     input_path = Path(args.input)
@@ -439,6 +480,13 @@ def _cmd_transform(args: argparse.Namespace) -> int:
     df = _read_table(input_path, in_fmt)
     X, y = _split_xy(df, args.target)
 
+    # Build the engineer first: ``_build_engineer`` runs all scalar / list /
+    # dict type validation on the merged CLI-flag + config view, so any
+    # malformed value (e.g. ``"max_features": "5"``, ``"verbose": "false"``)
+    # surfaces a precise exit-2 error here rather than down the wrong
+    # ``--target is required`` rabbit hole.
+    engineer = _build_engineer(args)
+
     # Selection requires a target column to fit against. ``AutoFeatureEngineer``
     # only actually fits a selector when ``y is not None`` AND ``max_features``
     # is set; without ``max_features`` the call is a raw feature-generation
@@ -446,24 +494,19 @@ def _cmd_transform(args: argparse.Namespace) -> int:
     # require ``--target`` when both selection is enabled (the default) AND
     # ``max_features`` is configured (CLI flag or config), so commands like
     # ``featcopilot transform --input in.csv --output out.csv`` (no target,
-    # no cap) still work.
-    effective_max_features = args.max_features
-    if effective_max_features is None and args.config is not None:
-        try:
-            cfg_max = _load_config(args.config).get("max_features")
-        except (FileNotFoundError, ValueError):
-            cfg_max = None
-        if cfg_max is not None:
-            effective_max_features = cfg_max
-    if not args.no_selection and args.target is None and effective_max_features is not None:
+    # no cap) still work. Using ``engineer.max_features`` here means the
+    # value has already been type-validated, so we never report
+    # ``--target is required`` when the real problem is a malformed
+    # ``max_features`` config value.
+    if not args.no_selection and args.target is None and engineer.max_features is not None:
         raise ValueError(
             "--target is required when feature selection is applied "
             "(i.e. when --max-features / config max_features is set). "
             "Pass --target <column>, or pass --no-selection / drop --max-features to skip selection."
         )
 
-    engineer = _build_engineer(args)
-    transformed = engineer.fit_transform(
+    captured_warnings, transformed = _fit_transform_capturing_warnings(
+        engineer,
         X,
         y,
         task_description=args.task_description or "prediction task",
@@ -495,6 +538,7 @@ def _cmd_transform(args: argparse.Namespace) -> int:
         "max_features": engineer.max_features,
         "target": args.target,
         "selection_applied": engineer._selector is not None,
+        "warnings": captured_warnings,
     }
     _emit(payload, as_json=args.json)
     return 0
@@ -521,7 +565,8 @@ def _cmd_explain(args: argparse.Namespace) -> int:
     X, y = _split_xy(df, args.target)
 
     engineer = _build_engineer(args, include_selection_config=False)
-    engineer.fit_transform(
+    captured_warnings = _fit_capturing_warnings(
+        engineer,
         X,
         y,
         task_description=args.task_description or "prediction task",
@@ -546,6 +591,7 @@ def _cmd_explain(args: argparse.Namespace) -> int:
             }
             for name in feature_names
         ],
+        "warnings": captured_warnings,
     }
 
     # explain always emits JSON to stdout (it's the only sensible format),
diff --git a/tests/test_cli.py b/tests/test_cli.py
index a4f1715..0fed666 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -960,6 +960,118 @@ def _raise_oserror(self, *args, **kwargs):
     assert "create parent directory" in err.lower()
 
 
+# ----------------------- stderr is reserved for failures (warnings captured)
+
+
+def test_transform_leakage_warning_does_not_pollute_stderr(tmp_path: Path):
+    """``leakage_guard='warn'`` (the default) must not bleed
+    ``warnings.warn(...)`` onto stderr on a successful run; the warnings
+    are captured and surfaced inside the JSON payload's ``warnings`` field
+    instead, so agents can keep treating non-empty stderr as failure metadata.
+    """
+    rng = np.random.default_rng(0)
+    n = 200
+    df = pd.DataFrame(
+        {
+            "x1": rng.normal(size=n),
+            "x2": rng.normal(size=n),
+            # ``label_encoded`` is detected as leakage-prone ("label" + "encoded"
+            # both appear in the stoplist).
+            "label_encoded": rng.integers(0, 2, size=n),
+            "y": rng.integers(0, 2, size=n),
+        }
+    )
+    in_path = tmp_path / "in_with_leakage.csv"
+    df.to_csv(in_path, index=False)
+    out_path = tmp_path / "out.csv"
+
+    rc, out, err = _run(
+        [
+            "transform",
+            "--input",
+            str(in_path),
+            "--output",
+            str(out_path),
+            "--target",
+            "y",
+            "--max-features",
+            "5",
+            "--json",
+        ]
+    )
+    assert rc == 0, err
+    assert err == "", f"stderr should be empty on success but got: {err!r}"
+    payload = json.loads(out)
+    assert payload["status"] == "ok"
+    # ``warnings`` field is always present; it MAY contain the leakage
+    # warning depending on the heuristic. The contract being tested is
+    # that stderr stays clean — not that any specific warning was emitted
+    # (the leakage detector heuristics evolve).
+    assert "warnings" in payload
+    assert isinstance(payload["warnings"], list)
+
+
+def test_explain_leakage_warning_does_not_pollute_stderr(tmp_path: Path):
+    """``explain`` has the same stderr-cleanliness contract as ``transform``."""
+    rng = np.random.default_rng(0)
+    n = 200
+    df = pd.DataFrame(
+        {
+            "x1": rng.normal(size=n),
+            "label_encoded": rng.integers(0, 2, size=n),
+            "y": rng.integers(0, 2, size=n),
+        }
+    )
+    in_path = tmp_path / "in.csv"
+    df.to_csv(in_path, index=False)
+
+    rc, out, err = _run(
+        [
+            "explain",
+            "--input",
+            str(in_path),
+            "--target",
+            "y",
+        ]
+    )
+    assert rc == 0, err
+    assert err == "", f"stderr should be empty on success but got: {err!r}"
+    payload = json.loads(out)
+    assert "warnings" in payload
+    assert isinstance(payload["warnings"], list)
+
+
+# ----------------------- target check runs after type validation
+
+
+def test_invalid_max_features_in_config_takes_precedence_over_target_check(tmp_path: Path, tabular_csv: Path):
+    """A malformed ``max_features`` in ``--config`` (string, negative, etc.)
+    must surface its real validation error rather than ``--target is
+    required``. The CLI now builds the engineer first (which type-validates
+    every scalar config field) and only checks ``--target`` after.
+    """
+    in_path = tmp_path / "in_notarget.csv"
+    pd.read_csv(tabular_csv).drop(columns=["y"]).to_csv(in_path, index=False)
+
+    cfg = tmp_path / "cfg.json"
+    cfg.write_text(json.dumps({"max_features": "5"}))  # string, not int
+    rc, _, err = _run(
+        [
+            "transform",
+            "--input",
+            str(in_path),
+            "--output",
+            str(tmp_path / "out.csv"),
+            "--config",
+            str(cfg),
+        ]
+    )
+    assert rc == 2
+    # The real error is the type mismatch, NOT --target missing.
+    assert "max_features" in err
+    assert "--target" not in err
+
+
 def test_check_scalar_type_rejects_none_when_required():
     """Direct unit test for ``_check_scalar_type`` to exercise the
     ``allow_none=False`` + ``value is None`` branch, which the integration

From c388d32e286518249e260933c4c3fda2fc8813c8 Mon Sep 17 00:00:00 2001
From: Li Jiang <lijiang1@microsoft.com>
Date: Mon, 4 May 2026 05:52:55 +0800
Subject: [PATCH 13/30] fix(cli): address round-9 review feedback
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Addresses all three new comments from copilot-pull-request-reviewer
on PR #5 (commit 459b1b9):

* Capture logger output in addition to warnings.warn (Copilot MhpC).
  Round-8's helpers only intercepted `warnings.warn(...)`, but the
  do-no-harm gate's fallback path calls `logger.warning(...)` and
  successful `transform` runs would still bleed those messages onto
  stderr — breaking the documented stderr-reserved-for-failures
  contract.

  New `_capture_featcopilot_messages` context manager swaps the
  `featcopilot` logger's handlers for a list-appending handler for
  the duration of the call. Every `featcopilot.*` child logger's
  records propagate up to the `featcopilot` logger by default, so this
  single hook captures all log output (debug, info, warning, error).
  The captured log records, followed by the `warnings.warn` strings,
  are surfaced in the JSON payload's `warnings` field.

* Same fix applies to `explain` (Copilot MhpF).
  Both helper functions now go through the new context manager, so
  `explain --verbose` (which fires multiple `logger.info(...)`
  records) keeps stderr empty. Two new unit tests cover the context
  manager directly: it intercepts `logger.warning` + `warnings.warn`,
  and it restores the `featcopilot` logger state on exception.

* Fix --target help text on transform (Copilot MhpI).
  The help previously said `--target` is required when "selection is
  applied (the default)", but the round-6 fix made the requirement
  conditional on `--max-features` actually being set (because the
  selector only fits in that case). The help text now matches the
  shipped contract; `test_transform_target_help_reflects_actual_contract`
  asserts both the new wording and a regression guard against the old
  misleading phrasing.

Tests: 78 (+5 new) in tests/test_cli.py, 851 passed full suite.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 featcopilot/cli.py |  95 +++++++++++++++++++++++++----------
 tests/test_cli.py  | 121 +++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 187 insertions(+), 29 deletions(-)

diff --git a/featcopilot/cli.py b/featcopilot/cli.py
index 08504cc..2353ef7 100644
--- a/featcopilot/cli.py
+++ b/featcopilot/cli.py
@@ -41,8 +41,11 @@
 from __future__ import annotations
 
 import argparse
+import contextlib
 import json
+import logging
 import sys
+import warnings
 from pathlib import Path
 from typing import Any
 
@@ -427,46 +430,86 @@ def _cmd_info(args: argparse.Namespace) -> int:
 
 
 def _fit_transform_capturing_warnings(engineer, X, y, **kwargs):
-    """Run ``engineer.fit_transform(X, y, **kwargs)`` while capturing any
-    Python ``UserWarning`` (or other warning) it emits.
+    """Run ``engineer.fit_transform(X, y, **kwargs)`` while capturing both
+    Python ``warnings.warn(...)`` and FeatCopilot logger records.
 
     The CLI contract is that stdout carries the JSON payload and stderr is
-    reserved for failures. ``AutoFeatureEngineer.fit`` calls
-    ``warnings.warn(...)`` for leakage-prone column names under the default
-    ``leakage_guard='warn'``, which would otherwise bleed onto stderr on a
-    successful run and break agent / tool-use error parsing. This helper
-    intercepts those warnings, collects them as JSON-serializable strings,
-    and lets the caller surface them inside the ``warnings`` field of the
-    success payload — keeping stderr deterministic.
+    reserved for failures. Two sources can otherwise bleed onto stderr on
+    a successful run:
+
+    * ``warnings.warn(...)`` — emitted by ``AutoFeatureEngineer.fit`` for
+      leakage-prone column names under the default ``leakage_guard='warn'``.
+    * ``logger.warning(...)`` / ``logger.info(...)`` — emitted by e.g.
+      ``_do_no_harm_gate`` on validation-failure fallback, and by every
+      engine when ``--verbose`` is set.
+
+    The single ``featcopilot`` root logger (``propagate=False``) receives
+    every child logger's records by ordinary Python logging propagation;
+    we swap in a capture handler for the duration of the call so the JSON
+    payload can surface those messages instead of stderr.
 
     Returns
     -------
-    (warnings_list, result)
-        ``warnings_list`` is a list of ``str`` (one entry per warning, in
+    (messages, result)
+        ``messages`` is a list of ``str`` (log records, then warnings; each in
         emission order). ``result`` is whatever ``fit_transform`` returned.
     """
-    import warnings as _warnings
-
-    captured: list[str] = []
-    with _warnings.catch_warnings(record=True) as caught:
-        _warnings.simplefilter("always")
+    with _capture_featcopilot_messages() as captured:
         result = engineer.fit_transform(X, y, **kwargs)
-        captured.extend(str(w.message) for w in caught)
     return captured, result
 
 
 def _fit_capturing_warnings(engineer, X, y, **kwargs):
     """Sibling of :func:`_fit_transform_capturing_warnings` for explain."""
-    import warnings as _warnings
-
-    captured: list[str] = []
-    with _warnings.catch_warnings(record=True) as caught:
-        _warnings.simplefilter("always")
+    with _capture_featcopilot_messages() as captured:
         engineer.fit_transform(X, y, **kwargs)
-        captured.extend(str(w.message) for w in caught)
     return captured
 
 
+@contextlib.contextmanager
+def _capture_featcopilot_messages():
+    """Capture all FeatCopilot ``warnings.warn`` calls and logger records.
+
+    Yields a list that the caller can read after the with-block exits. The
+    list contains formatted log records (in emission order) followed by any
+    Python warning messages emitted during the with-block. The featcopilot
+    root logger's handlers are temporarily replaced with a list-appending
+    handler; child loggers propagate up to the root by default (the only
+    ``propagate=False`` in the project is on the root itself, which
+    prevents bleeding to Python's root logger).
+    """
+    captured: list[str] = []
+
+    class _ListHandler(logging.Handler):
+        def emit(self, record):
+            try:
+                captured.append(self.format(record))
+            except Exception:  # pragma: no cover - never let logging crash the CLI
+                captured.append(record.getMessage())
+
+    list_handler = _ListHandler()
+    list_handler.setLevel(logging.DEBUG)
+    list_handler.setFormatter(logging.Formatter("%(levelname)s: %(message)s"))
+
+    fc_root = logging.getLogger("featcopilot")
+    saved_handlers = list(fc_root.handlers)
+    saved_level = fc_root.level
+    fc_root.handlers = [list_handler]
+    fc_root.setLevel(logging.DEBUG)
+
+    try:
+        with warnings.catch_warnings(record=True) as caught:
+            warnings.simplefilter("always")
+            yield captured
+            # Append warnings *after* the body returns: log records are
+            # appended live by the handler, so the captured list holds
+            # all log records first and all warnings last.
+            captured.extend(str(w.message) for w in caught)
+    finally:
+        fc_root.handlers = saved_handlers
+        fc_root.setLevel(saved_level)
+
+
 def _cmd_transform(args: argparse.Namespace) -> int:
     """Read input, fit/transform, write output."""
     input_path = Path(args.input)
@@ -684,8 +727,10 @@ def _add_io_args(p: argparse.ArgumentParser) -> None:
     p.add_argument(
         "--target",
         "-t",
-        help="Target column name. Required when selection is applied (the default; "
-        "use --no-selection to skip selection entirely).",
+        help="Target column name. Required when feature selection is applied "
+        "(i.e. when --max-features / config max_features is set so the "
+        "selector actually fits). With no max_features, raw feature "
+        "generation runs without a target.",
     )
     p.add_argument(
         "--task-description",
diff --git a/tests/test_cli.py b/tests/test_cli.py
index 0fed666..89ac103 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -2,9 +2,12 @@
 
 from __future__ import annotations
 
+import argparse
 import io
 import json
+import logging
 import sys
+import warnings
 from contextlib import redirect_stderr, redirect_stdout
 from pathlib import Path
 
@@ -665,12 +668,10 @@ def test_explain_target_help_no_longer_says_required_for_selection():
     """The ``--target`` help on ``explain`` must not claim it gates selection
     (selection is intentionally disabled in ``explain``).
     """
-    import argparse as _argparse
-
     parser = fc_cli._build_parser()
     # argparse stores subparsers under a special action attribute
     explain_parser = next(
-        action.choices["explain"] for action in parser._actions if isinstance(action, _argparse._SubParsersAction)
+        action.choices["explain"] for action in parser._actions if isinstance(action, argparse._SubParsersAction)
     )
     target_help = next(a.help for a in explain_parser._actions if "--target" in a.option_strings)
     assert "required for selection" not in target_help
@@ -1037,8 +1038,120 @@ def test_explain_leakage_warning_does_not_pollute_stderr(tmp_path: Path):
     assert rc == 0, err
     assert err == "", f"stderr should be empty on success but got: {err!r}"
     payload = json.loads(out)
-    assert "warnings" in payload
+    assert payload["status"] == "ok"
+    # The ``warnings`` field is always present and is a list. Whether or
+    # not the leakage heuristic fires is not guaranteed (it evolves); the
+    # contract under test is that stderr stays clean.
+    assert isinstance(payload["warnings"], list)
+
+
+def test_transform_logger_warning_does_not_pollute_stderr(tmp_path: Path, tabular_csv: Path):
+    """The CLI captures ``logger.warning(...)`` records (in addition to
+    ``warnings.warn``), so any successful run that exercises a code path
+    emitting a logger message — for example the do-no-harm gate's
+    fallback — keeps stderr empty. The captured records appear in the
+    JSON payload's ``warnings`` field.
+    """
+    out_path = tmp_path / "out.csv"
+    rc, out, err = _run(
+        [
+            "transform",
+            "--input",
+            str(tabular_csv),
+            "--output",
+            str(out_path),
+            "--target",
+            "y",
+            "--max-features",
+            "5",
+            "--verbose",  # exercises ``logger.info(...)`` paths in engines
+            "--json",
+        ]
+    )
+    assert rc == 0, err
+    assert err == "", f"stderr should be empty on success but got: {err!r}"
+    payload = json.loads(out)
+    assert payload["status"] == "ok"
+    assert isinstance(payload["warnings"], list)
+
+
+def test_transform_verbose_logger_info_captured_not_on_stderr(tmp_path: Path, tabular_csv: Path):
+    """``--verbose`` enables ``logger.info(...)`` calls in
+    ``AutoFeatureEngineer`` and the engines. Those records must end up
+    in the JSON payload's ``warnings`` field, not on stderr.
+    """
+    out_path = tmp_path / "out.csv"
+    rc, out, err = _run(
+        [
+            "transform",
+            "--input",
+            str(tabular_csv),
+            "--output",
+            str(out_path),
+            "--target",
+            "y",
+            "--max-features",
+            "5",
+            "--verbose",
+            "--json",
+        ]
+    )
+    assert rc == 0, err
+    assert err == "", f"stderr should be empty on success but got: {err!r}"
+    payload = json.loads(out)
+    # ``--verbose`` reliably emits "Fitted tabular engine" via logger.info,
+    # and selection / engineer calls also log. We don't pin the exact
+    # messages (they evolve) — just check at least one log record is
+    # present in the captured payload.
     assert isinstance(payload["warnings"], list)
+    assert len(payload["warnings"]) >= 1
+
+
+def test_capture_featcopilot_messages_intercepts_logger_warning():
+    """Direct unit test for the contextmanager so the docstring contract is
+    not just covered transitively via the CLI subcommands.
+    """
+    fc_logger = logging.getLogger("featcopilot.test_cli")
+    with fc_cli._capture_featcopilot_messages() as captured:
+        fc_logger.warning("captured-warning-message")
+        warnings.warn("captured-runtime-warning", UserWarning, stacklevel=2)
+    assert any("captured-warning-message" in m for m in captured)
+    assert any("captured-runtime-warning" in m for m in captured)
+
+
+def test_capture_featcopilot_messages_restores_handlers():
+    """The contextmanager must restore the original featcopilot root logger
+    state after the with-block, even if an exception propagates.
+    """
+    fc_root = logging.getLogger("featcopilot")
+    saved_handlers = list(fc_root.handlers)
+    saved_level = fc_root.level
+
+    with pytest.raises(RuntimeError):
+        with fc_cli._capture_featcopilot_messages():
+            raise RuntimeError("boom")
+
+    assert fc_root.handlers == saved_handlers
+    assert fc_root.level == saved_level
+
+
+# ----------------------- --target help text accuracy
+
+
+def test_transform_target_help_reflects_actual_contract():
+    """The ``--target`` help on ``transform`` must say the flag is required
+    only when ``--max-features`` is set (which is when the selector
+    actually fits), not whenever selection is enabled by default.
+    """
+    parser = fc_cli._build_parser()
+    transform_parser = next(
+        action.choices["transform"] for action in parser._actions if isinstance(action, argparse._SubParsersAction)
+    )
+    target_help = next(a.help for a in transform_parser._actions if "--target" in a.option_strings)
+    assert "max_features" in target_help.lower() or "max-features" in target_help.lower()
+    # The old ("required when selection is applied (the default ...)")
+    # phrasing was misleading — guard against regressions.
+    assert "the default" not in target_help.lower()
 
 
 # ----------------------- target check runs after type validation

From 55a814820344202d716472c5218fbd9a80daa720 Mon Sep 17 00:00:00 2001
From: Li Jiang <lijiang1@microsoft.com>
Date: Mon, 4 May 2026 05:57:07 +0800
Subject: [PATCH 14/30] fix(cli): address round-9 follow-up review feedback

Addresses both new comments from copilot-pull-request-reviewer that
arrived after the round-9 fix (still on commit 459b1b9):

* Logger output capture (Copilot ODei).
  Already addressed by c388d32 (round-9): the new
  `_capture_featcopilot_messages` contextmanager replaces the
  `featcopilot` root logger's handlers for the duration of the
  engineer call, capturing every `logger.warning(...)` /
  `logger.info(...)` from every featcopilot.* module (including the
  copilot-sdk / litellm / openai mock-mode warnings inside
  `__init__` methods, and `TextEngine`'s missing-NLP-dependency
  warnings inside `fit`). All such records appear in the JSON
  payload's `warnings` field; stderr stays empty on success.

* Parquet probe is now a real import, not find_spec (Copilot ODeo).
  `find_spec` only confirms a distribution is on `sys.path`; it
  doesn't prove the C extensions can load. `_parquet_engine_available`
  now uses `__import__` so a broken native install honestly reports
  `parquet_available=false`. New
  `test_parquet_engine_available_returns_false_for_broken_native_install`
  exercises the `OSError` (loader-level) branch via `builtins.__import__`
  monkey-patch; the existing engine-missing / fastparquet-only tests
  were rewritten to mock `__import__` instead of `find_spec`.

Tests: 79 (+1 net) in tests/test_cli.py.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 featcopilot/cli.py | 17 +++++++++----
 tests/test_cli.py  | 60 ++++++++++++++++++++++++++++++++--------------
 2 files changed, 54 insertions(+), 23 deletions(-)

diff --git a/featcopilot/cli.py b/featcopilot/cli.py
index 2353ef7..71644b4 100644
--- a/featcopilot/cli.py
+++ b/featcopilot/cli.py
@@ -67,12 +67,19 @@ def _parquet_engine_available() -> bool:
     machine-readable capability output reflects what will actually work in
     the current environment, rather than always advertising parquet.
 
-    Uses :func:`importlib.util.find_spec` so the probe is side-effect-free
-    (no actual module import) and easy to mock in tests.
+    Uses ``__import__`` (not ``importlib.util.find_spec``) so the probe is
+    *correct* even on environments with a broken native install:
+    ``find_spec`` only confirms a distribution is on ``sys.path``; it does
+    not prove the C extensions can actually load. A real import is the
+    only way to verify the engine is usable.
     """
-    import importlib.util
-
-    return importlib.util.find_spec("pyarrow") is not None or importlib.util.find_spec("fastparquet") is not None
+    for name in ("pyarrow", "fastparquet"):
+        try:
+            __import__(name)
+            return True
+        except Exception:  # noqa: BLE001  - any import-time failure means unusable
+            continue
+    return False
 
 
 def _detect_format(path: Path, override: str | None) -> str:
diff --git a/tests/test_cli.py b/tests/test_cli.py
index 89ac103..d0b92b4 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -1470,40 +1470,64 @@ def _raise_import_error(*args, **kwargs):
 
 
 def test_parquet_engine_available_returns_false_when_neither_installed(monkeypatch):
-    """Both probes return ``None`` from ``find_spec`` -> function returns False."""
-    import importlib.util
+    """When ``__import__`` raises ``ImportError`` for both engines, the
+    function reports parquet as unavailable.
+    """
+    import builtins
 
-    real_find_spec = importlib.util.find_spec
+    real_import = builtins.__import__
 
-    def fake_find_spec(name, *args, **kwargs):
+    def fake_import(name, *args, **kwargs):
         if name in ("pyarrow", "fastparquet"):
-            return None
-        return real_find_spec(name, *args, **kwargs)
+            raise ImportError(f"No module named '{name}' (simulated)")
+        return real_import(name, *args, **kwargs)
 
-    monkeypatch.setattr(importlib.util, "find_spec", fake_find_spec)
+    monkeypatch.setattr(builtins, "__import__", fake_import)
     assert fc_cli._parquet_engine_available() is False
 
 
 def test_parquet_engine_available_returns_true_for_fastparquet_only(monkeypatch):
-    """Even without pyarrow, finding fastparquet must report parquet as available."""
-    import importlib.util
-
-    class _FakeSpec:
-        pass
+    """Even without pyarrow, importing fastparquet must report parquet as available."""
+    import builtins
 
-    real_find_spec = importlib.util.find_spec
+    real_import = builtins.__import__
 
-    def fake_find_spec(name, *args, **kwargs):
+    def fake_import(name, *args, **kwargs):
         if name == "pyarrow":
-            return None
+            raise ImportError("No module named 'pyarrow' (simulated)")
         if name == "fastparquet":
-            return _FakeSpec()
-        return real_find_spec(name, *args, **kwargs)
+            # Simulate a successful import by short-circuiting; we don't
+            # actually need a real module object, just a non-raising return.
+            class _FakeModule:
+                pass
 
-    monkeypatch.setattr(importlib.util, "find_spec", fake_find_spec)
+            return _FakeModule()
+        return real_import(name, *args, **kwargs)
+
+    monkeypatch.setattr(builtins, "__import__", fake_import)
     assert fc_cli._parquet_engine_available() is True
 
 
+def test_parquet_engine_available_returns_false_for_broken_native_install(monkeypatch):
+    """A distribution that's on sys.path but raises a non-ImportError at
+    import time (e.g. broken native bindings) is reported as unavailable.
+    Using ``__import__`` (rather than ``importlib.util.find_spec``) is what
+    makes this honest: ``find_spec`` would have returned a spec and lied.
+    """
+    import builtins
+
+    real_import = builtins.__import__
+
+    def fake_import(name, *args, **kwargs):
+        if name in ("pyarrow", "fastparquet"):
+            # Simulate a broken native install (loader-level failure).
+            raise OSError("broken native install: undefined symbol (simulated)")
+        return real_import(name, *args, **kwargs)
+
+    monkeypatch.setattr(builtins, "__import__", fake_import)
+    assert fc_cli._parquet_engine_available() is False
+
+
 def test_unreadable_config_returns_exit_2(tmp_path, tabular_csv, monkeypatch):
     """An ``OSError`` while opening the config (permission denied, broken
     symlink, etc.) is converted into the deterministic exit-2 path.

From 0f1f0b1b92e2549b6b642fd8b4173beb7fd5bc94 Mon Sep 17 00:00:00 2001
From: Li Jiang <lijiang1@microsoft.com>
Date: Mon, 4 May 2026 06:32:04 +0800
Subject: [PATCH 15/30] fix(cli): address round-10 review feedback

Addresses all three new comments from copilot-pull-request-reviewer
on PR #5 (commit 55a8148):

* Drop logger.exception in exit-1 path (Copilot OHcA).
  The generic exception handler used to write
  `featcopilot: unexpected error: ...` to stderr AND then call
  `logger.exception(...)`, which appended a second timestamped
  traceback (FeatCopilot loggers write to stderr). The CLI's contract
  is exactly one structured stderr line per failure; the
  `logger.exception` call is removed. Internal failure introspection
  is the caller's job (e.g. `PYTHONFAULTHANDLER=1`).
  `test_unexpected_error_writes_single_stderr_line` asserts a single
  matching line and absence of any traceback signature.

* Make the capture contextmanager thread-safe (Copilot OHcD).
  `_capture_featcopilot_messages()` mutates the global
  `featcopilot` logger's handlers/level. Concurrent in-process CLI
  calls (e.g. two threads invoking `cli.main(...)`) could steal each
  other's handlers and restore stale state. A module-level
  `threading.Lock` (`_capture_lock`) now serializes captures so
  each context gets a clean save/restore cycle. New
  `test_capture_featcopilot_messages_thread_safety` runs two threads
  through a `Barrier` to force contention and asserts each capture
  contains exactly its own 20 records (no cross-talk, no losses) and
  that no `ListHandler` leaks onto the global logger.

* Reject unknown top-level config keys (Copilot OHcE).
  `_load_config` previously accepted typos like
  `{"max_feature": 5}` (missing 's') and silently ran with defaults,
  making the JSON config API hard to trust in automation. New
  `_KNOWN_CONFIG_KEYS` whitelist (`engines`, `selection_methods`,
  `max_features`, `correlation_threshold`, `leakage_guard`,
  `gate_n_jobs`, `llm_config`, `verbose`) is checked at config
  load; unknown keys produce a precise exit-2 error that lists the
  recognized keys so users can self-correct without reading source.
  Two new tests cover `max_feature` and `selection_method` typos.

Tests: 83 (+4 new) in tests/test_cli.py, 856 passed full suite.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 featcopilot/cli.py |  87 +++++++++++++++++++++-------
 tests/test_cli.py  | 138 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 204 insertions(+), 21 deletions(-)

diff --git a/featcopilot/cli.py b/featcopilot/cli.py
index 71644b4..129e6da 100644
--- a/featcopilot/cli.py
+++ b/featcopilot/cli.py
@@ -45,6 +45,7 @@
 import json
 import logging
 import sys
+import threading
 import warnings
 from pathlib import Path
 from typing import Any
@@ -213,15 +214,34 @@ def _write_table(df, path: Path, fmt: str) -> None:
         raise ValueError(f"Unsupported output format: {fmt}")
 
 
+# Top-level keys recognized in a ``--config`` JSON file. The CLI rejects
+# any other top-level key with a precise exit-2 error so typos like
+# ``max_feature`` (no s) fail fast in automation rather than silently
+# running with defaults.
+_KNOWN_CONFIG_KEYS = frozenset(
+    {
+        "engines",
+        "selection_methods",
+        "max_features",
+        "correlation_threshold",
+        "leakage_guard",
+        "gate_n_jobs",
+        "llm_config",
+        "verbose",
+    }
+)
+
+
 def _load_config(config_path: str | None) -> dict[str, Any]:
     """Load a JSON config file (or return an empty dict).
 
     Normalizes user-input mistakes (missing path, directory passed instead
-    of a file, invalid JSON, non-object root) into :class:`ValueError` /
-    :class:`FileNotFoundError` so the CLI's top-level error handler can
-    route them all to the deterministic ``exit 2`` user-error path
-    (rather than e.g. ``IsADirectoryError`` falling into the generic
-    ``exit 1`` "unexpected error" backstop).
+    of a file, invalid JSON, non-object root, unknown top-level keys) into
+    :class:`ValueError` / :class:`FileNotFoundError` so the CLI's top-level
+    error handler can route them all to the deterministic ``exit 2``
+    user-error path (rather than e.g. ``IsADirectoryError`` falling into
+    the generic ``exit 1`` "unexpected error" backstop, or a typo silently
+    being ignored).
     """
     if config_path is None:
         return {}
@@ -242,6 +262,12 @@ def _load_config(config_path: str | None) -> dict[str, Any]:
         raise ValueError(f"Config file {config_path!r} could not be read: {exc}") from exc
     if not isinstance(data, dict):
         raise ValueError(f"Config file {config_path!r} must contain a JSON object at the top level")
+    unknown = sorted(set(data.keys()) - _KNOWN_CONFIG_KEYS)
+    if unknown:
+        raise ValueError(
+            f"Config file {config_path!r} has unknown top-level key(s): {unknown}. "
+            f"Recognized keys: {sorted(_KNOWN_CONFIG_KEYS)}."
+        )
     return data
 
 
@@ -484,6 +510,12 @@ def _capture_featcopilot_messages():
     handler; child loggers propagate up to the root by default (the only
     ``propagate=False`` in the project is on the root itself, which
     prevents bleeding to Python's root logger).
+
+    Concurrency: serialized via ``_capture_lock``. Multiple in-process CLI
+    calls that overlap (e.g. two threads calling ``cli.main(...)``
+    simultaneously) take the lock in turn so neither steals the other's
+    handlers nor restores stale state. Single-process / single-CLI usage
+    is unaffected.
     """
     captured: list[str] = []
 
@@ -499,22 +531,29 @@ def emit(self, record):
     list_handler.setFormatter(logging.Formatter("%(levelname)s: %(message)s"))
 
     fc_root = logging.getLogger("featcopilot")
-    saved_handlers = list(fc_root.handlers)
-    saved_level = fc_root.level
-    fc_root.handlers = [list_handler]
-    fc_root.setLevel(logging.DEBUG)
+    with _capture_lock:
+        saved_handlers = list(fc_root.handlers)
+        saved_level = fc_root.level
+        fc_root.handlers = [list_handler]
+        fc_root.setLevel(logging.DEBUG)
 
-    try:
-        with warnings.catch_warnings(record=True) as caught:
-            warnings.simplefilter("always")
-            yield captured
-            # Append warnings *after* the body returns so the order in the
-            # captured list mirrors emission order: log records first
-            # (appended live by the handler), warnings last.
-            captured.extend(str(w.message) for w in caught)
-    finally:
-        fc_root.handlers = saved_handlers
-        fc_root.setLevel(saved_level)
+        try:
+            with warnings.catch_warnings(record=True) as caught:
+                warnings.simplefilter("always")
+                yield captured
+                # Append warnings *after* the body returns so the order in
+                # the captured list mirrors emission order: log records
+                # first (appended live by the handler), warnings last.
+                captured.extend(str(w.message) for w in caught)
+        finally:
+            fc_root.handlers = saved_handlers
+            fc_root.setLevel(saved_level)
+
+
+# Serializes ``_capture_featcopilot_messages`` so concurrent CLI calls in
+# the same process can't steal each other's handlers / level on the
+# global ``featcopilot`` logger.
+_capture_lock = threading.Lock()
 
 
 def _cmd_transform(args: argparse.Namespace) -> int:
@@ -849,8 +888,14 @@ def main(argv: list[str] | None = None) -> int:
         sys.stderr.write("featcopilot: interrupted\n")
         return 130
     except Exception as exc:  # pragma: no cover - defensive backstop
+        # Single deterministic stderr line so agents can parse the failure.
+        # We deliberately do NOT call ``logger.exception(...)`` here:
+        # FeatCopilot loggers write to stderr, which would append a second
+        # timestamped traceback after our structured line and break the
+        # CLI's "stderr is exactly one error message" contract. Internal
+        # failure introspection is the caller's job (e.g. set
+        # ``PYTHONFAULTHANDLER=1`` or attach a debugger).
         sys.stderr.write(f"featcopilot: unexpected error: {type(exc).__name__}: {exc}\n")
-        logger.exception("Unhandled CLI exception")
         return 1
 
 
diff --git a/tests/test_cli.py b/tests/test_cli.py
index d0b92b4..c1e6729 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -1135,6 +1135,93 @@ def test_capture_featcopilot_messages_restores_handlers():
     assert fc_root.level == saved_level
 
 
+def test_capture_featcopilot_messages_thread_safety():
+    """Concurrent ``_capture_featcopilot_messages`` invocations must not
+    steal each other's handlers / lose log records. Implementation uses
+    ``_capture_lock`` to serialize captures.
+    """
+    import threading
+
+    fc_logger = logging.getLogger("featcopilot.test_concurrent")
+
+    results: list[list[str]] = []
+    barrier = threading.Barrier(2)
+
+    def worker(tag: str):
+        # Force both threads to enter the with-block at roughly the same
+        # time so the lock is genuinely contended.
+        barrier.wait()
+        with fc_cli._capture_featcopilot_messages() as captured:
+            for i in range(20):
+                fc_logger.warning(f"{tag}-{i}")
+        results.append(captured)
+
+    t1 = threading.Thread(target=worker, args=("A",))
+    t2 = threading.Thread(target=worker, args=("B",))
+    t1.start()
+    t2.start()
+    t1.join()
+    t2.join()
+
+    assert len(results) == 2
+    # Each capture list must contain exactly its own thread's records and
+    # nothing from the other thread.
+    for res in results:
+        # Find which tag this list belongs to.
+        tag = "A" if any("A-" in m for m in res) else "B"
+        assert all(f"{tag}-" in m for m in res), f"Thread isolation violated in capture {tag!r}: got {res!r}"
+        assert len(res) == 20
+
+    # Final state on the global logger must be cleanly restored.
+    fc_root = logging.getLogger("featcopilot")
+    assert all(
+        not isinstance(h, logging.Handler) or "ListHandler" not in type(h).__name__ for h in fc_root.handlers
+    ), "ListHandler leaked onto the global featcopilot logger"
+
+
+def test_unexpected_error_writes_single_stderr_line(monkeypatch, tmp_path: Path, tabular_csv: Path):
+    """An unexpected (non-ValueError) exception must produce exactly one
+    structured stderr line — no second timestamped traceback from
+    ``logger.exception(...)`` — so agents can parse failures
+    deterministically.
+    """
+    import pandas as pd
+
+    class _UnexpectedError(Exception):
+        """A non-ValueError, non-OSError exception that escapes the helpers."""
+
+    def _raise_unexpected(*args, **kwargs):
+        raise _UnexpectedError("simulated internal failure")
+
+    # Monkey-patch ``pd.read_csv`` directly. Since ``_read_table``'s CSV
+    # branch normally catches ``OSError`` / ``ParserError`` / ``UnicodeDecodeError``,
+    # raising a different exception type forces us into the generic exit-1
+    # backstop in ``main()``.
+    monkeypatch.setattr(pd, "read_csv", _raise_unexpected, raising=True)
+
+    rc, _, err = _run(
+        [
+            "transform",
+            "--input",
+            str(tabular_csv),
+            "--output",
+            str(tmp_path / "out.csv"),
+            "--target",
+            "y",
+        ]
+    )
+    assert rc == 1, err
+    # Exactly one non-empty line on stderr.
+    err_lines = [line for line in err.splitlines() if line.strip()]
+    assert len(err_lines) == 1, f"Expected single-line stderr, got: {err!r}"
+    assert err_lines[0].startswith("featcopilot: unexpected error:")
+    assert "_UnexpectedError" in err_lines[0]
+    assert "simulated internal failure" in err_lines[0]
+    # No traceback signature.
+    assert "Traceback" not in err
+    assert 'File "' not in err
+
+
 # ----------------------- --target help text accuracy
 
 
@@ -1286,6 +1373,57 @@ def test_invalid_config_file_returns_exit_2(tmp_path: Path, tabular_csv: Path):
     assert "JSON object" in err
 
 
+def test_unknown_config_top_level_key_returns_exit_2(tmp_path: Path, tabular_csv: Path):
+    """A typo in a top-level config key (``max_feature`` instead of
+    ``max_features``, etc.) must fail fast with a precise exit-2 message
+    listing the recognized keys — not silently run with defaults.
+    """
+    cfg = tmp_path / "cfg.json"
+    cfg.write_text(json.dumps({"max_feature": 5}))  # missing 's'
+    rc, _, err = _run(
+        [
+            "transform",
+            "--input",
+            str(tabular_csv),
+            "--output",
+            str(tmp_path / "o.csv"),
+            "--target",
+            "y",
+            "--config",
+            str(cfg),
+        ]
+    )
+    assert rc == 2
+    assert "max_feature" in err
+    assert "Recognized keys" in err or "recognized keys" in err.lower()
+
+
+def test_unknown_config_top_level_key_lists_known_keys(tmp_path: Path, tabular_csv: Path):
+    """The error message must enumerate the recognized keys so users can
+    self-correct without reading the source.
+    """
+    cfg = tmp_path / "cfg.json"
+    cfg.write_text(json.dumps({"selection_method": ["mutual_info"]}))  # missing 's'
+    rc, _, err = _run(
+        [
+            "transform",
+            "--input",
+            str(tabular_csv),
+            "--output",
+            str(tmp_path / "o.csv"),
+            "--target",
+            "y",
+            "--config",
+            str(cfg),
+        ]
+    )
+    assert rc == 2
+    assert "selection_method" in err
+    # Recognized-keys list must include the canonical names.
+    assert "selection_methods" in err
+    assert "max_features" in err
+
+
 def test_directory_as_config_returns_exit_2(tmp_path: Path, tabular_csv: Path):
     """Pointing ``--config`` at a directory must surface as exit 2, not the
     generic ``exit 1`` backstop (``IsADirectoryError``).

From fd7c28a774a9cfdef4c841356296e9476bb458bf Mon Sep 17 00:00:00 2001
From: Li Jiang <lijiang1@microsoft.com>
Date: Mon, 4 May 2026 09:11:36 +0800
Subject: [PATCH 16/30] fix(cli): address round-11 review feedback

Addresses both new comments from copilot-pull-request-reviewer on
PR #5 (commit 0f1f0b1):

* Capture no longer serializes concurrent CLI calls (Copilot OueS).
  Round-10's `_capture_lock` was held for the entire `fit_transform`
  body, so a second `cli.main(...)` from another in-process caller
  blocked until the first feature-engineering job finished. The
  contextmanager is now lock-free for the body and uses *per-thread
  routing* via two singletons added once to the `featcopilot` root
  logger:
  - `_ThreadRoutingHandler` appends each record to the calling
    thread's capture list (or no-ops if the thread isn't capturing).
  - `_SuppressCapturingFilter` is added to the existing handlers so
    capturing threads' records DON'T also bleed onto stderr.
  Concurrent threads each see only their own records and run in
  parallel; `test_capture_does_not_block_concurrent_callers` verifies
  the no-serialization property by having two workers `time.sleep(0.2)`
  inside the block and asserting both are inside simultaneously.
  `test_capture_concurrent_cli_calls_isolate_logs` is the
  end-to-end version: two real `transform --verbose` runs in parallel
  threads with empty stderr and isolated `warnings` payloads.

* warnings.warn capture is now thread-local (Copilot OueZ).
  The previous `warnings.catch_warnings(record=True)` is process-
  global; warnings from a non-capturing thread (or another capturing
  thread) could be swallowed and mis-attributed. The contextmanager
  now overrides `warnings.showwarning` and routes by
  `threading.get_ident()`: only warnings from the registered thread
  go to that thread's list; warnings from other threads chain to the
  previous `showwarning` (preserving normal emission for
  non-capturing threads). `test_capture_warnings_warn_thread_isolated`
  asserts two threads capturing concurrently see only their own
  `warnings.warn` calls.

Tests: 85 (+2 net) in tests/test_cli.py, 858 passed full suite.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 featcopilot/cli.py | 206 ++++++++++++++++++++++++++++++++++-----------
 tests/test_cli.py  |  84 ++++++++++++++++--
 2 files changed, 233 insertions(+), 57 deletions(-)

diff --git a/featcopilot/cli.py b/featcopilot/cli.py
index 129e6da..708fe63 100644
--- a/featcopilot/cli.py
+++ b/featcopilot/cli.py
@@ -499,61 +499,169 @@ def _fit_capturing_warnings(engineer, X, y, **kwargs):
     return captured
 
 
-@contextlib.contextmanager
-def _capture_featcopilot_messages():
-    """Capture all FeatCopilot ``warnings.warn`` calls and logger records.
-
-    Yields a list that the caller can read after the with-block exits. The
-    list contains formatted log records (in emission order) followed by any
-    Python warning messages emitted during the with-block. The featcopilot
-    root logger's handlers are temporarily replaced with a list-appending
-    handler; child loggers propagate up to the root by default (the only
-    ``propagate=False`` in the project is on the root itself, which
-    prevents bleeding to Python's root logger).
-
-    Concurrency: serialized via ``_capture_lock``. Multiple in-process CLI
-    calls that overlap (e.g. two threads calling ``cli.main(...)``
-    simultaneously) take the lock in turn so neither steals the other's
-    handlers nor restores stale state. Single-process / single-CLI usage
-    is unaffected.
+class _ThreadCaptureState:
+    """Holds per-thread capture lists.
+
+    Shared by :class:`_ThreadRoutingHandler` (writes records) and
+    :class:`_SuppressCapturingFilter` (decides whether to drop a record
+    from the original handlers). Mutations are guarded by a small lock;
+    lookups use ``dict.get`` which is atomic under the GIL for hashable
+    keys.
     """
-    captured: list[str] = []
 
-    class _ListHandler(logging.Handler):
-        def emit(self, record):
-            try:
-                captured.append(self.format(record))
-            except Exception:  # pragma: no cover - never let logging crash the CLI
-                captured.append(record.getMessage())
+    def __init__(self):
+        self._per_thread: dict[int, list[str]] = {}
+        self._lock = threading.Lock()
+
+    def register(self, tid: int, target: list[str]) -> None:
+        with self._lock:
+            self._per_thread[tid] = target
 
-    list_handler = _ListHandler()
-    list_handler.setLevel(logging.DEBUG)
-    list_handler.setFormatter(logging.Formatter("%(levelname)s: %(message)s"))
+    def unregister(self, tid: int) -> None:
+        with self._lock:
+            self._per_thread.pop(tid, None)
 
-    fc_root = logging.getLogger("featcopilot")
-    with _capture_lock:
-        saved_handlers = list(fc_root.handlers)
-        saved_level = fc_root.level
-        fc_root.handlers = [list_handler]
-        fc_root.setLevel(logging.DEBUG)
+    def get(self, tid: int) -> list[str] | None:
+        # Lock-free read: ``dict.get`` is atomic for hashable keys under
+        # the CPython GIL, and we only ever read references to lists owned
+        # by individual threads — no shared mutation hazard.
+        return self._per_thread.get(tid)
+
+
+class _ThreadRoutingHandler(logging.Handler):
+    """Logging handler that routes records to the calling thread's capture list.
+
+    Attached once to the ``featcopilot`` root logger. Records propagated
+    from any ``featcopilot.*`` child logger reach this handler in the same
+    way they reach the existing stderr handler. If the calling thread has
+    a registered capture list, the record is appended to it; otherwise the
+    handler does nothing (the existing stderr handler is what produces the
+    user-facing output for non-capturing threads).
+    """
 
+    def __init__(self, state: _ThreadCaptureState):
+        super().__init__(logging.DEBUG)
+        self._state = state
+        self.setFormatter(logging.Formatter("%(levelname)s: %(message)s"))
+
+    def emit(self, record: logging.LogRecord) -> None:
+        target = self._state.get(threading.get_ident())
+        if target is None:
+            return
         try:
-            with warnings.catch_warnings(record=True) as caught:
-                warnings.simplefilter("always")
-                yield captured
-                # Append warnings *after* the body returns so the order in
-                # the captured list mirrors emission order: log records
-                # first (appended live by the handler), warnings last.
-                captured.extend(str(w.message) for w in caught)
-        finally:
-            fc_root.handlers = saved_handlers
-            fc_root.setLevel(saved_level)
-
-
-# Serializes ``_capture_featcopilot_messages`` so concurrent CLI calls in
-# the same process can't steal each other's handlers / level on the
-# global ``featcopilot`` logger.
-_capture_lock = threading.Lock()
+            target.append(self.format(record))
+        except Exception:  # pragma: no cover - never let logging crash the CLI
+            target.append(record.getMessage())
+
+
+class _SuppressCapturingFilter(logging.Filter):
+    """Filter for the *existing* handlers: drops records from capturing threads.
+
+    Without this filter, every record emitted by a capturing thread would
+    still hit the featcopilot root logger's stderr ``StreamHandler`` and
+    bleed onto stderr — breaking the CLI's "stderr reserved for failures"
+    contract. The filter checks ``threading.get_ident()`` against the
+    shared :class:`_ThreadCaptureState` so non-capturing threads continue
+    to see normal stderr output.
+    """
+
+    def __init__(self, state: _ThreadCaptureState):
+        super().__init__()
+        self._state = state
+
+    def filter(self, record: logging.LogRecord) -> bool:
+        return self._state.get(threading.get_ident()) is None
+
+
+# Module-level singletons. Installed exactly once on the featcopilot root
+# logger / its existing handlers; subsequent ``_capture_featcopilot_messages``
+# calls just register/unregister thread state. No global lock is held during
+# the slow ``fit_transform`` body — concurrent threads each capture their
+# own records independently.
+_capture_state = _ThreadCaptureState()
+_routing_handler = _ThreadRoutingHandler(_capture_state)
+_suppress_filter = _SuppressCapturingFilter(_capture_state)
+_install_lock = threading.Lock()
+_install_done = False
+
+
+def _install_capture_hooks_once() -> None:
+    """Install the routing handler + suppress filter on the featcopilot root logger.
+
+    Idempotent: subsequent calls are no-ops. Must be called before the
+    first capture; happens lazily on first use to avoid altering the
+    logging tree at module import time when the CLI is being introspected
+    rather than executed.
+    """
+    global _install_done
+    if _install_done:
+        return
+    with _install_lock:
+        if _install_done:
+            return
+        fc_root = logging.getLogger("featcopilot")
+        if _routing_handler not in fc_root.handlers:
+            fc_root.addHandler(_routing_handler)
+        for handler in list(fc_root.handlers):
+            if handler is _routing_handler:
+                continue
+            if _suppress_filter not in handler.filters:
+                handler.addFilter(_suppress_filter)
+        _install_done = True
+
+
+@contextlib.contextmanager
+def _capture_featcopilot_messages():
+    """Capture FeatCopilot log records and ``warnings.warn`` calls emitted
+    on the *current thread*.
+
+    Yields a list that the caller can read after the with-block exits.
+    The list contains formatted log records (in emission order) followed
+    by any Python warning messages emitted during the with-block on this
+    thread.
+
+    Concurrency model
+    -----------------
+    * **Logger records** are routed *per-thread* via
+      :class:`_ThreadRoutingHandler` (added once to the ``featcopilot``
+      root logger) and a :class:`_SuppressCapturingFilter` on the existing
+      handlers. Two threads can capture concurrently without blocking
+      each other; each sees only its own records, and other threads'
+      records still flow normally to stderr.
+    * **``warnings.warn`` records** are intercepted via a per-thread
+      override of :data:`warnings.showwarning`. The override appends to
+      the capturing thread's list and chains to the previous
+      ``showwarning`` for warnings emitted on non-capturing threads.
+
+    The contextmanager does NOT hold any lock for the duration of the
+    with-block — only briefly during install/register/unregister — so
+    long-running ``fit_transform`` calls in one thread do not block
+    other threads from running concurrently.
+    """
+    _install_capture_hooks_once()
+
+    captured: list[str] = []
+    tid = threading.get_ident()
+    _capture_state.register(tid, captured)
+
+    # Per-thread ``warnings.warn`` interception. We chain to whatever
+    # ``warnings.showwarning`` was in place before us so non-capturing
+    # threads (or nested captures) still receive their warnings via the
+    # existing path.
+    previous_showwarning = warnings.showwarning
+
+    def _routing_showwarning(message, category, filename, lineno, file=None, line=None):
+        if threading.get_ident() == tid:
+            captured.append(str(message))
+            return
+        previous_showwarning(message, category, filename, lineno, file, line)
+
+    warnings.showwarning = _routing_showwarning
+    try:
+        yield captured
+    finally:
+        warnings.showwarning = previous_showwarning
+        _capture_state.unregister(tid)
 
 
 def _cmd_transform(args: argparse.Namespace) -> int:
diff --git a/tests/test_cli.py b/tests/test_cli.py
index c1e6729..ea5770e 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -1137,8 +1137,8 @@ def test_capture_featcopilot_messages_restores_handlers():
 
 def test_capture_featcopilot_messages_thread_safety():
     """Concurrent ``_capture_featcopilot_messages`` invocations must not
-    steal each other's handlers / lose log records. Implementation uses
-    ``_capture_lock`` to serialize captures.
+    steal each other's records. Implementation uses per-thread routing
+    (no global lock held during the body), so threads execute concurrently.
     """
     import threading
 
@@ -1149,7 +1149,7 @@ def test_capture_featcopilot_messages_thread_safety():
 
     def worker(tag: str):
         # Force both threads to enter the with-block at roughly the same
-        # time so the lock is genuinely contended.
+        # time so the routing dispatch is genuinely contended.
         barrier.wait()
         with fc_cli._capture_featcopilot_messages() as captured:
             for i in range(20):
@@ -1172,11 +1172,79 @@ def worker(tag: str):
         assert all(f"{tag}-" in m for m in res), f"Thread isolation violated in capture {tag!r}: got {res!r}"
         assert len(res) == 20
 
-    # Final state on the global logger must be cleanly restored.
-    fc_root = logging.getLogger("featcopilot")
-    assert all(
-        not isinstance(h, logging.Handler) or "ListHandler" not in type(h).__name__ for h in fc_root.handlers
-    ), "ListHandler leaked onto the global featcopilot logger"
+
+def test_capture_does_not_block_concurrent_callers():
+    """Two concurrent ``_capture_featcopilot_messages`` blocks must run in
+    parallel — i.e. the design does NOT serialize the body via a global
+    lock. Verified by timing: a worker that sleeps inside the block must
+    not block another worker from also entering the block at the same
+    time.
+    """
+    import threading
+    import time
+
+    inside = []
+    inside_lock = threading.Lock()
+    seen_overlap = threading.Event()
+    barrier = threading.Barrier(2)
+
+    def worker():
+        barrier.wait()
+        with fc_cli._capture_featcopilot_messages():
+            with inside_lock:
+                inside.append(1)
+                if len(inside) >= 2:
+                    seen_overlap.set()
+            # Sleep long enough that, if the implementation serialized via
+            # a global lock, the second thread would never enter
+            # simultaneously.
+            time.sleep(0.2)
+            with inside_lock:
+                inside.pop()
+
+    t1 = threading.Thread(target=worker)
+    t2 = threading.Thread(target=worker)
+    t1.start()
+    t2.start()
+    t1.join(timeout=5)
+    t2.join(timeout=5)
+
+    assert seen_overlap.is_set(), (
+        "Both threads should have been inside _capture_featcopilot_messages "
+        "simultaneously; the implementation appears to serialize the body."
+    )
+
+
+def test_capture_warnings_warn_thread_isolated():
+    """``warnings.warn`` calls from one capturing thread must not leak into
+    another capturing thread's payload. The CLI overrides
+    ``warnings.showwarning`` per-thread (rather than using
+    ``warnings.catch_warnings(record=True)`` which is process-global).
+    """
+    import threading
+
+    barrier = threading.Barrier(2)
+    a_captured: list[str] = []
+    b_captured: list[str] = []
+
+    def worker(tag: str, target: list[str]):
+        barrier.wait()
+        with fc_cli._capture_featcopilot_messages() as captured:
+            for i in range(10):
+                warnings.warn(f"{tag}-warn-{i}", UserWarning, stacklevel=2)
+        target.extend(captured)
+
+    t1 = threading.Thread(target=worker, args=("A", a_captured))
+    t2 = threading.Thread(target=worker, args=("B", b_captured))
+    t1.start()
+    t2.start()
+    t1.join()
+    t2.join()
+
+    assert all("A-warn-" in m for m in a_captured)
+    assert all("B-warn-" in m for m in b_captured)
+    assert not any("B-warn-" in m for m in a_captured)
+    assert not any("A-warn-" in m for m in b_captured)
 
 
 def test_unexpected_error_writes_single_stderr_line(monkeypatch, tmp_path: Path, tabular_csv: Path):

From 0c69dd9d000cc08b3613b9459af56e24f3e696d4 Mon Sep 17 00:00:00 2001
From: Li Jiang <lijiang1@microsoft.com>
Date: Mon, 4 May 2026 15:38:23 +0800
Subject: [PATCH 17/30] fix(cli): address round-12 review feedback

Addresses all three new comments from copilot-pull-request-reviewer
on PR #5 (commit fd7c28a) and adds one extra regression test:

* warnings.showwarning install is now overlap-safe (Copilot P4Ox).
  Round-11's per-call save-and-restore raced when two captures
  overlapped: A enters, B enters, A exits and restores the original
  `warnings.showwarning` while B is still active. The override is
  now installed by `_install_capture_hooks_once()` and survives the
  with-block; concurrent captures all dispatch through the same
  permanent override which routes by thread ID. The install is also
  rechecked on every capture entry, so a caller's
  `warnings.catch_warnings()` block (which restores
  `warnings.showwarning` on exit) can't silently undo the install
  for subsequent CLI runs.

* Per-thread state is now a stack (Copilot P4PL).
  Nested `_capture_featcopilot_messages()` calls on the same thread
  used to clobber the outer registration; the inner `unregister()`
  removed the thread entirely, so any later log records / warnings in
  the outer block leaked to stderr and were missing from the outer
  payload. `_ThreadCaptureState` now keeps a stack per thread:
  `push` on entry, `pop` on exit, `get` returns the innermost
  active capture. Logs and warnings always go to the innermost
  list while it's active; outer captures resume automatically.
  `test_nested_capture_on_same_thread_preserves_outer_list` covers
  the full scenario.

* Test asserts hook *stability*, not equality with pre-first-call
  state (Copilot P4PG). The previous restores-handlers test was
  order-dependent: on the very first capture in a process,
  `_install_capture_hooks_once()` permanently adds
  `_routing_handler`, so the post-block handler list differs from
  the pre-first-call list. The replacement test
  `test_capture_featcopilot_messages_does_not_mutate_logger_state_per_call`
  forces install via a no-op capture, then asserts the handler set,
  level, and `warnings.showwarning` are unchanged across an
  exception-propagating capture.

* New `test_overlapping_captures_with_out_of_order_exit` exercises
  the strict failure mode: two threads enter the block, then thread
  A exits before B emits its tail records. B's log records and
  `warnings.warn` calls are still captured under the new permanent-
  install design.

Tests: 87 (+2 net) in tests/test_cli.py, 860 passed full suite.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 featcopilot/cli.py | 172 ++++++++++++++++++++++++++++-----------------
 tests/test_cli.py  | 157 ++++++++++++++++++++++++++++++++++++-----
 2 files changed, 249 insertions(+), 80 deletions(-)

diff --git a/featcopilot/cli.py b/featcopilot/cli.py
index 708fe63..2c07916 100644
--- a/featcopilot/cli.py
+++ b/featcopilot/cli.py
@@ -500,32 +500,42 @@ def _fit_capturing_warnings(engineer, X, y, **kwargs):
 
 
 class _ThreadCaptureState:
-    """Holds per-thread capture lists.
+    """Holds per-thread capture *stacks*.
 
-    Shared by :class:`_ThreadRoutingHandler` (writes records) and
-    :class:`_SuppressCapturingFilter` (decides whether to drop a record
-    from the original handlers). Mutations are guarded by a small lock;
-    lookups use ``dict.get`` which is atomic under the GIL for hashable
-    keys.
+    Each thread maps to a stack of capture lists. Nested
+    :func:`_capture_featcopilot_messages` calls on the same thread push
+    onto the stack; the innermost active capture is always at the top
+    and receives records / warnings until its block exits, at which
+    point the outer capture (if any) becomes active again.
+
+    Shared by :class:`_ThreadRoutingHandler` (writes records),
+    :class:`_SuppressCapturingFilter` (suppresses stderr), and the
+    routing ``warnings.showwarning`` override.
     """
 
     def __init__(self):
-        self._per_thread: dict[int, list[str]] = {}
+        self._per_thread: dict[int, list[list[str]]] = {}
         self._lock = threading.Lock()
 
-    def register(self, tid: int, target: list[str]) -> None:
+    def push(self, tid: int, target: list[str]) -> None:
         with self._lock:
-            self._per_thread[tid] = target
+            self._per_thread.setdefault(tid, []).append(target)
 
-    def unregister(self, tid: int) -> None:
+    def pop(self, tid: int) -> None:
         with self._lock:
-            self._per_thread.pop(tid, None)
+            stack = self._per_thread.get(tid)
+            if stack:
+                stack.pop()
+                if not stack:
+                    del self._per_thread[tid]
 
     def get(self, tid: int) -> list[str] | None:
-        # Lock-free read: ``dict.get`` is atomic for hashable keys under
-        # the CPython GIL, and we only ever read references to lists owned
-        # by individual threads — no shared mutation hazard.
-        return self._per_thread.get(tid)
+        # Brief lock for thread-safe stack-top read.
+        with self._lock:
+            stack = self._per_thread.get(tid)
+            if stack:
+                return stack[-1]
+            return None
 
 
 class _ThreadRoutingHandler(logging.Handler):
@@ -575,39 +585,82 @@ def filter(self, record: logging.LogRecord) -> bool:
 
 # Module-level singletons. Installed exactly once on the featcopilot root
 # logger / its existing handlers; subsequent ``_capture_featcopilot_messages``
-# calls just register/unregister thread state. No global lock is held during
-# the slow ``fit_transform`` body — concurrent threads each capture their
-# own records independently.
+# calls just push/pop thread state. No global lock is held during the slow
+# ``fit_transform`` body — concurrent threads each capture their own records
+# independently.
 _capture_state = _ThreadCaptureState()
 _routing_handler = _ThreadRoutingHandler(_capture_state)
 _suppress_filter = _SuppressCapturingFilter(_capture_state)
 _install_lock = threading.Lock()
 _install_done = False
+# Captures the original ``warnings.showwarning`` at first install so the
+# routing override can chain to it for non-capturing threads (and so we
+# never mutate it again on subsequent capture calls — the previous
+# per-call save/restore raced under concurrent overlapping captures).
+_original_showwarning = None
 
 
-def _install_capture_hooks_once() -> None:
-    """Install the routing handler + suppress filter on the featcopilot root logger.
+def _routing_showwarning(message, category, filename, lineno, file=None, line=None):
+    """Permanent ``warnings.showwarning`` override (installed once).
+
+    Routes warnings to the *innermost* capturing list for the current
+    thread (via :class:`_ThreadCaptureState` stack lookup). If the
+    current thread is not capturing, chains to the original
+    ``warnings.showwarning`` so non-capturing threads keep their normal
+    behavior.
 
-    Idempotent: subsequent calls are no-ops. Must be called before the
-    first capture; happens lazily on first use to avoid altering the
-    logging tree at module import time when the CLI is being introspected
-    rather than executed.
+    Installed once globally — *not* swapped per-call — so concurrent
+    overlapping captures on different threads cannot race on the
+    process-global ``warnings.showwarning`` slot.
     """
-    global _install_done
-    if _install_done:
+    target = _capture_state.get(threading.get_ident())
+    if target is not None:
+        target.append(str(message))
         return
+    if _original_showwarning is not None:
+        _original_showwarning(message, category, filename, lineno, file, line)
+
+
+def _install_capture_hooks_once() -> None:
+    """Install the routing handler + suppress filter + showwarning override.
+
+    The logger handler and filter are installed exactly once (idempotent).
+    The ``warnings.showwarning`` override is re-installed every call if
+    something else has replaced it — this is necessary because external
+    code (most commonly ``warnings.catch_warnings()`` blocks) can reset
+    the global ``warnings.showwarning`` and undo a previous install. The
+    fresh re-install captures the current (caller's) ``showwarning`` as
+    the new "original" to chain to, so non-capturing threads still see
+    whatever warning behavior the caller had set up.
+
+    All hooks themselves dispatch on :class:`_ThreadCaptureState` which
+    uses a per-thread stack, so they are no-ops for threads that aren't
+    currently capturing.
+    """
+    global _install_done, _original_showwarning
     with _install_lock:
-        if _install_done:
-            return
-        fc_root = logging.getLogger("featcopilot")
-        if _routing_handler not in fc_root.handlers:
-            fc_root.addHandler(_routing_handler)
-        for handler in list(fc_root.handlers):
-            if handler is _routing_handler:
-                continue
-            if _suppress_filter not in handler.filters:
-                handler.addFilter(_suppress_filter)
-        _install_done = True
+        # Logger handler/filter install (truly once — these can't be
+        # silently undone by external code in the way ``warnings.showwarning``
+        # can).
+        if not _install_done:
+            fc_root = logging.getLogger("featcopilot")
+            if _routing_handler not in fc_root.handlers:
+                fc_root.addHandler(_routing_handler)
+            for handler in list(fc_root.handlers):
+                if handler is _routing_handler:
+                    continue
+                if _suppress_filter not in handler.filters:
+                    handler.addFilter(_suppress_filter)
+            _install_done = True
+
+        # ``warnings.showwarning`` install — re-check every entry. A
+        # caller's ``warnings.catch_warnings()`` block restores the
+        # previous ``showwarning`` on exit, undoing our install. Re-
+        # installing on next entry is what makes overlapping captures
+        # robust against caller-side warning context manipulation.
+        if warnings.showwarning is not _routing_showwarning:
+            _original_showwarning = warnings.showwarning
+            warnings.showwarning = _routing_showwarning
 
 
 @contextlib.contextmanager
@@ -616,9 +669,8 @@ def _capture_featcopilot_messages():
     on the *current thread*.
 
     Yields a list that the caller can read after the with-block exits.
-    The list contains formatted log records (in emission order) followed
-    by any Python warning messages emitted during the with-block on this
-    thread.
+    The list contains formatted log records (in emission order) and any
+    Python warning messages emitted during the with-block on this thread.
 
     Concurrency model
     -----------------
@@ -628,40 +680,32 @@ def _capture_featcopilot_messages():
       handlers. Two threads can capture concurrently without blocking
       each other; each sees only its own records, and other threads'
       records still flow normally to stderr.
-    * **``warnings.warn`` records** are intercepted via a per-thread
-      override of :data:`warnings.showwarning`. The override appends to
-      the capturing thread's list and chains to the previous
-      ``showwarning`` for warnings emitted on non-capturing threads.
+    * **``warnings.warn`` records** are intercepted via a permanent
+      :func:`_routing_showwarning` override installed once. The override
+      routes by ``threading.get_ident()`` and chains to the original
+      ``warnings.showwarning`` for non-capturing threads. The override is
+      *not* swapped per-call, so concurrent overlapping captures on
+      different threads cannot race on the process-global
+      ``warnings.showwarning`` slot.
+    * **Nested captures** on the same thread are supported via a
+      per-thread stack in :class:`_ThreadCaptureState`. Records and
+      warnings always go to the innermost active capture; when the inner
+      block exits, the outer capture is automatically reactivated.
 
     The contextmanager does NOT hold any lock for the duration of the
-    with-block — only briefly during install/register/unregister — so
-    long-running ``fit_transform`` calls in one thread do not block
-    other threads from running concurrently.
+    with-block — only briefly during install/push/pop — so long-running
+    ``fit_transform`` calls in one thread do not block other threads
+    from running concurrently.
     """
     _install_capture_hooks_once()
 
     captured: list[str] = []
     tid = threading.get_ident()
-    _capture_state.register(tid, captured)
-
-    # Per-thread ``warnings.warn`` interception. We chain to whatever
-    # ``warnings.showwarning`` was in place before us so non-capturing
-    # threads (or nested captures) still receive their warnings via the
-    # existing path.
-    previous_showwarning = warnings.showwarning
-
-    def _routing_showwarning(message, category, filename, lineno, file=None, line=None):
-        if threading.get_ident() == tid:
-            captured.append(str(message))
-            return
-        previous_showwarning(message, category, filename, lineno, file, line)
-
-    warnings.showwarning = _routing_showwarning
+    _capture_state.push(tid, captured)
     try:
         yield captured
     finally:
-        warnings.showwarning = previous_showwarning
-        _capture_state.unregister(tid)
+        _capture_state.pop(tid)
 
 
 def _cmd_transform(args: argparse.Namespace) -> int:
diff --git a/tests/test_cli.py b/tests/test_cli.py
index ea5770e..a4ec0ff 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -1112,27 +1112,47 @@ def test_capture_featcopilot_messages_intercepts_logger_warning():
     not just covered transitively via the CLI subcommands.
     """
     fc_logger = logging.getLogger("featcopilot.test_cli")
-    with fc_cli._capture_featcopilot_messages() as captured:
-        fc_logger.warning("captured-warning-message")
-        warnings.warn("captured-runtime-warning", UserWarning, stacklevel=2)
+    # Reset Python's warning-deduplication state for the duration of the
+    # test so a previous test that fired ``warnings.warn`` at the same
+    # source location does not suppress this one.
+    with warnings.catch_warnings():
+        warnings.simplefilter("always")
+        with fc_cli._capture_featcopilot_messages() as captured:
+            fc_logger.warning("captured-warning-message")
+            warnings.warn("captured-runtime-warning", UserWarning, stacklevel=2)
     assert any("captured-warning-message" in m for m in captured)
     assert any("captured-runtime-warning" in m for m in captured)
 
 
-def test_capture_featcopilot_messages_restores_handlers():
-    """The contextmanager must restore the original featcopilot root logger
-    state after the with-block, even if an exception propagates.
+def test_capture_featcopilot_messages_does_not_mutate_logger_state_per_call():
+    """The contextmanager installs hooks *once* (lazily) and then never
+    mutates the featcopilot logger again — so successive captures don't
+    add or remove handlers, regardless of test ordering. The earlier
+    "restores handlers" test (asserting equality with pre-first-call
+    state) was order-dependent: on the very first capture in a process,
+    ``_install_capture_hooks_once()`` permanently adds
+    ``_routing_handler`` and that's a one-way change. We instead assert
+    *stability* across an exception-propagating with-block, which is the
+    real behavioral contract.
     """
+    # First, force install via a no-op capture.
+    with fc_cli._capture_featcopilot_messages():
+        pass
+
     fc_root = logging.getLogger("featcopilot")
-    saved_handlers = list(fc_root.handlers)
-    saved_level = fc_root.level
+    handlers_before = list(fc_root.handlers)
+    level_before = fc_root.level
+    showwarning_before = warnings.showwarning
 
     with pytest.raises(RuntimeError):
         with fc_cli._capture_featcopilot_messages():
             raise RuntimeError("boom")
 
-    assert fc_root.handlers == saved_handlers
-    assert fc_root.level == saved_level
+    # Hooks remain installed (handler stays, level unchanged, showwarning
+    # override remains in place); per-call state has been popped.
+    assert fc_root.handlers == handlers_before
+    assert fc_root.level == level_before
+    assert warnings.showwarning is showwarning_before
 
 
 def test_capture_featcopilot_messages_thread_safety():
@@ -1231,15 +1251,21 @@ def worker(tag: str, target: list[str]):
         barrier.wait()
         with fc_cli._capture_featcopilot_messages() as captured:
             for i in range(10):
+                # ``stacklevel=2`` is forwarded; reset filter state so we
+                # don't lose the warning to Python's default dedup.
                 warnings.warn(f"{tag}-warn-{i}", UserWarning, stacklevel=2)
         target.extend(captured)
 
-    t1 = threading.Thread(target=worker, args=("A", a_captured))
-    t2 = threading.Thread(target=worker, args=("B", b_captured))
-    t1.start()
-    t2.start()
-    t1.join()
-    t2.join()
+    # Reset warning filters for this test so dedup doesn't suppress
+    # repeated emissions at the same source line.
+    with warnings.catch_warnings():
+        warnings.simplefilter("always")
+        t1 = threading.Thread(target=worker, args=("A", a_captured))
+        t2 = threading.Thread(target=worker, args=("B", b_captured))
+        t1.start()
+        t2.start()
+        t1.join()
+        t2.join()
 
     assert all("A-warn-" in m for m in a_captured)
     assert all("B-warn-" in m for m in b_captured)
@@ -1247,6 +1273,105 @@ def worker(tag: str, target: list[str]):
     assert not any("A-warn-" in m for m in b_captured)
 
 
+def test_nested_capture_on_same_thread_preserves_outer_list():
+    """A capture inside a capture on the same thread must:
+
+    1. Route records to the *innermost* list while the inner block is active.
+    2. Restore the outer list when the inner block exits, so subsequent
+       records flow into the outer payload.
+
+    The previous single-list-per-thread design clobbered the outer
+    registration; this test guards against that regression.
+    """
+    fc_logger = logging.getLogger("featcopilot.test_nested")
+
+    with warnings.catch_warnings():
+        warnings.simplefilter("always")
+        with fc_cli._capture_featcopilot_messages() as outer:
+            fc_logger.warning("outer-before-nested")
+            with fc_cli._capture_featcopilot_messages() as inner:
+                fc_logger.warning("inner-only")
+                warnings.warn("inner-runtime", UserWarning, stacklevel=2)
+            fc_logger.warning("outer-after-nested")
+
+    # Inner contains only the records emitted while it was the active
+    # capture.
+    assert any("inner-only" in m for m in inner)
+    assert any("inner-runtime" in m for m in inner)
+    assert not any("outer-before-nested" in m for m in inner)
+    assert not any("outer-after-nested" in m for m in inner)
+
+    # Outer contains records emitted before AND after the inner block,
+    # but NOT records emitted while inner was active (those went to inner).
+    assert any("outer-before-nested" in m for m in outer)
+    assert any("outer-after-nested" in m for m in outer)
+    assert not any("inner-only" in m for m in outer)
+    assert not any("inner-runtime" in m for m in outer)
+
+
+def test_overlapping_captures_with_out_of_order_exit():
+    """Two threads enter the capture block, then thread A exits *before*
+    thread B. The CLI must continue to capture B's warnings even after
+    A has exited — i.e. A's exit must not restore a global state that
+    disables B's capture.
+
+    This is the strict version of the warnings.showwarning race that
+    existed when the override was saved/restored per-call: A's exit
+    used to restore the original ``warnings.showwarning``, leaking B's
+    subsequent ``warnings.warn`` calls onto stderr.
+    """
+    import threading
+    import time
+
+    barrier = threading.Barrier(2)
+    a_done = threading.Event()
+    a_captured: list[str] = []
+    b_captured: list[str] = []
+
+    fc_logger = logging.getLogger("featcopilot.test_overlap")
+
+    def worker_a():
+        barrier.wait()
+        with fc_cli._capture_featcopilot_messages() as captured:
+            fc_logger.warning("A-1")
+            warnings.warn("A-warn-1", UserWarning, stacklevel=2)
+        a_captured.extend(captured)
+        a_done.set()  # signal: A has exited the capture block
+
+    def worker_b():
+        barrier.wait()
+        with fc_cli._capture_featcopilot_messages() as captured:
+            fc_logger.warning("B-1")
+            # Wait for A to fully exit before emitting B's tail records.
+            assert a_done.wait(timeout=5)
+            time.sleep(0.05)  # small grace so any racy restoration would have happened
+            fc_logger.warning("B-2-after-A-exit")
+            warnings.warn("B-warn-after-A-exit", UserWarning, stacklevel=2)
+        b_captured.extend(captured)
+
+    with warnings.catch_warnings():
+        warnings.simplefilter("always")
+        t_a = threading.Thread(target=worker_a)
+        t_b = threading.Thread(target=worker_b)
+        t_b.start()  # start B first so it's already in the block
+        time.sleep(0.05)
+        t_a.start()
+        t_a.join(timeout=5)
+        t_b.join(timeout=5)
+
+    # B's records — including the ones emitted *after* A exited — must
+    # all be captured. None of A's records should have leaked into B.
+    assert any("B-1" in m for m in b_captured)
+    assert any("B-2-after-A-exit" in m for m in b_captured)
+    assert any("B-warn-after-A-exit" in m for m in b_captured)
+    assert not any("A-1" in m for m in b_captured)
+    assert not any("A-warn-1" in m for m in b_captured)
+    # A's payload likewise contains only A's records.
+    assert any("A-1" in m for m in a_captured)
+    assert any("A-warn-1" in m for m in a_captured)
+    assert not any("B-" in m for m in a_captured)
+
+
 def test_unexpected_error_writes_single_stderr_line(monkeypatch, tmp_path: Path, tabular_csv: Path):
     """An unexpected (non-ValueError) exception must produce exactly one
     structured stderr line — no second timestamped traceback from

From 8d3a9735499ef8501d834e2508af77c810c8f176 Mon Sep 17 00:00:00 2001
From: Li Jiang <lijiang1@microsoft.com>
Date: Mon, 4 May 2026 20:36:17 +0800
Subject: [PATCH 18/30] fix(cli): address round-13 review feedback
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Addresses both new comments from copilot-pull-request-reviewer on
PR #5 (commit 0c69dd9):

* Range-validate correlation_threshold to [0.0, 1.0] (Copilot TrAR).
  Out-of-range values silently change selector behavior:
  `correlation_threshold > 1.0` disables redundancy elimination
  entirely (`FeatureSelector.fit` only runs it when threshold < 1.0),
  while a negative value treats every numeric pair as redundant.
  `_build_engineer` now rejects out-of-range values (from CLI flag
  or config) up front with a precise exit-2 error. Boundary values
  0.0 and 1.0 are accepted (inclusive). `max_features` likewise gets
  an explicit positive-int check at this layer so the message says
  `max_features` rather than the more cryptic transformer error.
  Six new tests parametrize negative / above-1 / boundary cases for
  both CLI flag and config sources.

* Bound explain's memory / compute via input sampling (Copilot TrAv).
  `explain` is metadata-only — the transformed frame is discarded
  immediately — but it used to materialize every engineered value on
  the full input, which on large datasets makes `featcopilot explain`
  slow or OOM-prone in agent / CI workflows. The CLI now caps the
  input at `_EXPLAIN_SAMPLE_SIZE = 1000` rows (deterministic
  `random_state=0` so re-runs produce the same metadata). The
  candidate feature set is independent of input length — every engine
  plans from column structure, not row values — so the payload is
  identical to a full-input run. New `n_rows_used` field reports
  the effective sample size; three tests cover (a) input >> cap,
  (b) input < cap (no-op), and (c) determinism across re-runs.

Tests: 96 (+9 new) in tests/test_cli.py, 869 passed full suite.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 featcopilot/cli.py |  44 +++++++++++
 tests/test_cli.py  | 181 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 225 insertions(+)

diff --git a/featcopilot/cli.py b/featcopilot/cli.py
index 2c07916..a30be02 100644
--- a/featcopilot/cli.py
+++ b/featcopilot/cli.py
@@ -376,6 +376,21 @@ def pick(flag_value, config_key, default):
     _check_scalar_type("gate_n_jobs", gate_n_jobs, (int,), allow_bool=False)
     _check_scalar_type("leakage_guard", leakage_guard, (str,))
 
+    # Range-check ``correlation_threshold``: it's only meaningful in
+    # ``[0.0, 1.0]``. Values above 1 silently disable redundancy
+    # elimination (``FeatureSelector.fit`` only runs it when threshold
+    # < 1.0); values below 0 effectively treat every numeric pair as
+    # redundant. Reject out-of-range up front so the CLI doesn't quietly
+    # change selector behavior.
+    if not (0.0 <= float(correlation_threshold) <= 1.0):
+        raise ValueError(f"`correlation_threshold` must be in the range [0.0, 1.0]; got {correlation_threshold!r}.")
+    # ``max_features`` must be positive when set (matches
+    # AutoFeatureEngineer's own validation). Surface that here too so
+    # the message says ``max_features`` rather than the more cryptic
+    # transformer error.
+    if max_features is not None and max_features <= 0:
+        raise ValueError(f"`max_features` must be a positive integer when set; got {max_features!r}.")
+
     # Validate ``llm_config`` is a JSON object (i.e. a Python dict) before
     # forwarding it. Without this check, a misconfigured non-dict value
     # would only fail at engine-construction time inside
@@ -785,6 +800,15 @@ def _cmd_transform(args: argparse.Namespace) -> int:
     return 0
 
 
+# ``explain`` only needs to fire each engine's planning + feature-naming
+# pass — the actual transformed values are discarded. Capping the input
+# at this many rows keeps the metadata-only command from paying the full
+# memory / compute cost of materializing every engineered value on large
+# datasets, while still giving every engine enough rows to plan its
+# features (the candidate set is independent of input length).
+_EXPLAIN_SAMPLE_SIZE = 1000
+
+
 def _cmd_explain(args: argparse.Namespace) -> int:
     """Fit + transform engines and print feature explanations + code as JSON.
 
@@ -796,6 +820,15 @@ def _cmd_explain(args: argparse.Namespace) -> int:
     actual generated features. Selection is intentionally skipped here so the
     payload describes every candidate feature the engines produced, not just
     the post-selection survivors.
+
+    Performance: large inputs are sub-sampled to at most
+    :data:`_EXPLAIN_SAMPLE_SIZE` rows. The engineered-feature *metadata*
+    (names, explanations, code snippets) is independent of input length —
+    every engine plans its candidate feature set from column structure
+    rather than from individual row values — so the sampled run produces
+    the same payload at a fraction of the memory / compute cost. This
+    keeps ``featcopilot explain`` fast and bounded for agent / CI
+    workflows where the input might be GBs of data.
     """
     input_path = Path(args.input)
     if not input_path.exists():
@@ -805,6 +838,16 @@ def _cmd_explain(args: argparse.Namespace) -> int:
     df = _read_table(input_path, in_fmt)
     X, y = _split_xy(df, args.target)
 
+    # Sample to bound memory / compute. Use a deterministic ``random_state``
+    # so re-running ``explain`` on the same input is reproducible.
+    n_sampled = len(X)
+    if n_sampled > _EXPLAIN_SAMPLE_SIZE:
+        sample_idx = X.sample(n=_EXPLAIN_SAMPLE_SIZE, random_state=0).index
+        X = X.loc[sample_idx]
+        if y is not None:
+            y = y.loc[sample_idx]
+        n_sampled = _EXPLAIN_SAMPLE_SIZE
+
     engineer = _build_engineer(args, include_selection_config=False)
     captured_warnings = _fit_capturing_warnings(
         engineer,
@@ -823,6 +866,7 @@ def _cmd_explain(args: argparse.Namespace) -> int:
         "status": "ok",
         "input": str(input_path),
         "n_features": len(feature_names),
+        "n_rows_used": n_sampled,
         "engines": list(engineer.engines),
         "features": [
             {
diff --git a/tests/test_cli.py b/tests/test_cli.py
index a4ec0ff..47a75c7 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -403,6 +403,98 @@ def test_scalar_type_mismatch_in_config_returns_exit_2(tmp_path: Path, tabular_c
     assert fragment in err
 
 
+@pytest.mark.parametrize("threshold", [-0.1, 1.1, 5.0, -1.0])
+def test_correlation_threshold_out_of_range_returns_exit_2(tmp_path: Path, tabular_csv: Path, threshold):
+    """``correlation_threshold`` is only meaningful in [0.0, 1.0]. Out-of-range
+    values silently change selector behavior (>1 disables redundancy elim,
+    <0 treats every numeric pair as redundant), so the CLI rejects them up
+    front with a precise exit-2 error.
+    """
+    rc, _, err = _run(
+        [
+            "transform",
+            "--input",
+            str(tabular_csv),
+            "--output",
+            str(tmp_path / "out.csv"),
+            "--target",
+            "y",
+            "--correlation-threshold",
+            str(threshold),
+            "--max-features",
+            "5",
+        ]
+    )
+    assert rc == 2
+    assert "correlation_threshold" in err
+    assert "[0.0, 1.0]" in err or "0.0" in err
+
+
+def test_correlation_threshold_in_config_out_of_range_returns_exit_2(tmp_path: Path, tabular_csv: Path):
+    """The same range check applies when ``correlation_threshold`` arrives
+    from ``--config`` rather than the CLI flag.
+    """
+    cfg = tmp_path / "cfg.json"
+    cfg.write_text(json.dumps({"correlation_threshold": 2.5}))
+    rc, _, err = _run(
+        [
+            "transform",
+            "--input",
+            str(tabular_csv),
+            "--output",
+            str(tmp_path / "out.csv"),
+            "--target",
+            "y",
+            "--max-features",
+            "5",
+            "--config",
+            str(cfg),
+        ]
+    )
+    assert rc == 2
+    assert "correlation_threshold" in err
+
+
+def test_correlation_threshold_boundary_values_accepted(tmp_path: Path, tabular_csv: Path):
+    """The boundaries (0.0 and 1.0) must be accepted — they're the inclusive
+    valid range. Default 0.85 is also exercised throughout the suite.
+    """
+    out_path = tmp_path / "out.csv"
+    rc, _, err = _run(
+        [
+            "transform",
+            "--input",
+            str(tabular_csv),
+            "--output",
+            str(out_path),
+            "--target",
+            "y",
+            "--correlation-threshold",
+            "0.0",
+            "--max-features",
+            "5",
+        ]
+    )
+    assert rc == 0, err
+
+    rc, _, err = _run(
+        [
+            "transform",
+            "--input",
+            str(tabular_csv),
+            "--output",
+            str(out_path),
+            "--target",
+            "y",
+            "--correlation-threshold",
+            "1.0",
+            "--max-features",
+            "5",
+        ]
+    )
+    assert rc == 0, err
+
+
 # ----------------------- --verbose / --no-verbose
 
 
@@ -1734,6 +1826,95 @@ def test_explain_emits_json_payload(tmp_path: Path, tabular_csv: Path):
     assert entry["name"]
 
 
+def test_explain_caps_input_size_for_large_inputs(tmp_path: Path):
+    """``explain`` is metadata-only. To bound memory / compute on large
+    inputs, the CLI sub-samples to at most ``_EXPLAIN_SAMPLE_SIZE`` rows
+    before running ``fit_transform``. The payload reports ``n_rows_used``
+    so callers can confirm the sampling.
+    """
+    rng = np.random.default_rng(0)
+    n = fc_cli._EXPLAIN_SAMPLE_SIZE * 5  # well above the cap
+    df = pd.DataFrame(
+        {
+            "x1": rng.normal(size=n),
+            "x2": rng.normal(size=n),
+            "y": rng.integers(0, 2, size=n),
+        }
+    )
+    in_path = tmp_path / "big.csv"
+    df.to_csv(in_path, index=False)
+
+    rc, out, err = _run(
+        [
+            "explain",
+            "--input",
+            str(in_path),
+            "--target",
+            "y",
+        ]
+    )
+    assert rc == 0, err
+    payload = json.loads(out)
+    assert payload["status"] == "ok"
+    # Sampling cap was enforced.
+    assert payload["n_rows_used"] == fc_cli._EXPLAIN_SAMPLE_SIZE
+    assert payload["n_features"] > 0
+
+
+def test_explain_uses_full_input_when_smaller_than_sample_cap(tmp_path: Path):
+    """When the input has fewer rows than ``_EXPLAIN_SAMPLE_SIZE``, the
+    sampler is a no-op: ``n_rows_used`` reflects the actual input size.
+    """
+    rng = np.random.default_rng(0)
+    n = 50  # well below the cap
+    df = pd.DataFrame(
+        {
+            "x1": rng.normal(size=n),
+            "x2": rng.normal(size=n),
+            "y": rng.integers(0, 2, size=n),
+        }
+    )
+    in_path = tmp_path / "small.csv"
+    df.to_csv(in_path, index=False)
+
+    rc, out, err = _run(
+        [
+            "explain",
+            "--input",
+            str(in_path),
+            "--target",
+            "y",
+        ]
+    )
+    assert rc == 0, err
+    payload = json.loads(out)
+    assert payload["n_rows_used"] == n
+
+
+def test_explain_sampling_is_deterministic(tmp_path: Path):
+    """Re-running ``explain`` on the same large input produces the same
+    set of feature names (sampling uses a fixed ``random_state``).
+    """
+    rng = np.random.default_rng(0)
+    n = fc_cli._EXPLAIN_SAMPLE_SIZE * 3
+    df = pd.DataFrame(
+        {
+            "x1": rng.normal(size=n),
+            "x2": rng.normal(size=n),
+            "y": rng.integers(0, 2, size=n),
+        }
+    )
+    in_path = tmp_path / "big.csv"
+    df.to_csv(in_path, index=False)
+
+    def _names():
+        rc, out, _ = _run(["explain", "--input", str(in_path), "--target", "y"])
+        assert rc == 0
+        return sorted(f["name"] for f in json.loads(out)["features"])
+
+    assert _names() == _names()
+
+
 # --------------------------------------------------------------- parquet path
 
 

From 59468052691e6cd9d6791113f1bdf13729ac7370 Mon Sep 17 00:00:00 2001
From: Li Jiang <lijiang1@microsoft.com>
Date: Mon, 4 May 2026 21:13:48 +0800
Subject: [PATCH 19/30] fix(cli): address round-14 review feedback (Codex P1)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Addresses the new P1 comment from chatgpt-codex-connector on PR #5
(commit 8d3a973):

* Default explain back to FULL input — sampling is opt-in (Codex P1).
  Round-13's automatic sampling-to-1000-rows changed which features
  some engines plan, breaking the metadata-vs-full-transform
  faithfulness contract. `TabularEngine._fit_categorical_encoding`
  (and similar engines) decide which encodings to apply based on
  `n_rows`, unique-count ratios, and per-category counts, all of
  which are data-size dependent.

  `_cmd_explain` now uses the full input by default. Callers who
  knowingly accept the trade-off can opt in via:
  - `--explain-sample-size N` (CLI flag), or
  - `"explain_sample_size": N` in `--config`.

  When sampling is active, the CLI emits a `UserWarning` (captured
  into the JSON payload's `warnings` field, NOT to stderr) explaining
  that the metadata may differ from a full-input transform run. The
  warning + `fit_transform` are wrapped in a single
  `_capture_featcopilot_messages` block so the sampling notice ends
  up where downstream agents will see it.

  Validation: `explain_sample_size` must be a positive int. Strings,
  zero, and negatives are rejected with a precise exit-2 error.

Tests: 102 (+6 net) in tests/test_cli.py.
* test_explain_uses_full_input_by_default
* test_explain_caps_input_size_when_sample_size_set
* test_explain_sample_size_smaller_than_input_no_op
* test_explain_sample_size_via_config
* test_explain_sample_size_rejects_non_positive (parametrized)
* test_explain_sample_size_rejects_string_in_config
* test_explain_sample_size_rejects_zero_in_config
875 passed full suite.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 featcopilot/cli.py | 105 ++++++++++++++++++++++----------
 tests/test_cli.py  | 148 ++++++++++++++++++++++++++++++++++++++-------
 2 files changed, 199 insertions(+), 54 deletions(-)

diff --git a/featcopilot/cli.py b/featcopilot/cli.py
index a30be02..9c31eaa 100644
--- a/featcopilot/cli.py
+++ b/featcopilot/cli.py
@@ -228,6 +228,7 @@ def _write_table(df, path: Path, fmt: str) -> None:
         "gate_n_jobs",
         "llm_config",
         "verbose",
+        "explain_sample_size",
     }
 )
 
@@ -800,15 +801,14 @@ def _cmd_transform(args: argparse.Namespace) -> int:
     return 0
 
 
-# ``explain`` only needs to fire each engine's planning + feature-naming
-# pass — the actual transformed values are discarded. Capping the input
-# at this many rows keeps the metadata-only command from paying the full
-# memory / compute cost of materializing every engineered value on large
-# datasets, while still giving every engine enough rows to plan its
-# features (the candidate set is independent of input length).
-_EXPLAIN_SAMPLE_SIZE = 1000
-
-
+# Default ``explain`` behavior is to use the full input so the metadata
+# is a faithful description of what a corresponding ``transform`` run
+# would do — engines like ``TabularEngine._fit_categorical_encoding``
+# use ``n_rows`` and per-category counts to decide e.g. one-hot vs.
+# target-encoding, so subsampling can silently change which features
+# appear. Callers who knowingly accept that trade-off can opt in via
+# ``--explain-sample-size`` (set to ``None``/absent to disable, any
+# positive integer to cap).
 def _cmd_explain(args: argparse.Namespace) -> int:
     """Fit + transform engines and print feature explanations + code as JSON.
 
@@ -821,14 +821,22 @@ def _cmd_explain(args: argparse.Namespace) -> int:
     payload describes every candidate feature the engines produced, not just
     the post-selection survivors.
 
-    Performance: large inputs are sub-sampled to at most
-    :data:`_EXPLAIN_SAMPLE_SIZE` rows. The engineered-feature *metadata*
-    (names, explanations, code snippets) is independent of input length —
-    every engine plans its candidate feature set from column structure
-    rather than from individual row values — so the sampled run produces
-    the same payload at a fraction of the memory / compute cost. This
-    keeps ``featcopilot explain`` fast and bounded for agent / CI
-    workflows where the input might be GBs of data.
+    Performance vs. faithfulness
+    ---------------------------
+    By default ``explain`` runs on the *full* input so the reported
+    metadata is a faithful description of what a corresponding
+    ``transform`` would generate. Some engines (notably
+    :class:`TabularEngine`) consult row counts and per-category
+    statistics when deciding which features to plan, so blind
+    subsampling can silently change the result.
+
+    For very large inputs where the metadata-only nature of ``explain``
+    really should not pay full memory / compute cost, callers can pass
+    ``--explain-sample-size N`` (or set ``"explain_sample_size": N`` in
+    ``--config``) to cap the rows fed to the engineer. The CLI emits a
+    ``UserWarning`` (captured into the JSON payload) noting that the
+    metadata may differ from a full-input ``transform`` run; the
+    ``n_rows_used`` field reports the effective sample size.
     """
     input_path = Path(args.input)
     if not input_path.exists():
@@ -838,25 +846,45 @@ def _cmd_explain(args: argparse.Namespace) -> int:
     df = _read_table(input_path, in_fmt)
     X, y = _split_xy(df, args.target)
 
-    # Sample to bound memory / compute. Use a deterministic ``random_state``
-    # so re-running ``explain`` on the same input is reproducible.
+    # Apply opt-in sample cap from CLI flag or config (CLI flag wins).
+    sample_size = getattr(args, "explain_sample_size", None)
+    if sample_size is None and args.config is not None:
+        sample_size = _load_config(args.config).get("explain_sample_size")
+    if sample_size is not None:
+        _check_scalar_type("explain_sample_size", sample_size, (int,), allow_bool=False)
+        if sample_size <= 0:
+            raise ValueError(f"`explain_sample_size` must be a positive integer when set; got {sample_size!r}.")
+
     n_sampled = len(X)
-    if n_sampled > _EXPLAIN_SAMPLE_SIZE:
-        sample_idx = X.sample(n=_EXPLAIN_SAMPLE_SIZE, random_state=0).index
-        X = X.loc[sample_idx]
-        if y is not None:
-            y = y.loc[sample_idx]
-        n_sampled = _EXPLAIN_SAMPLE_SIZE
 
     engineer = _build_engineer(args, include_selection_config=False)
-    captured_warnings = _fit_capturing_warnings(
-        engineer,
-        X,
-        y,
-        task_description=args.task_description or "prediction task",
-        target_name=args.target,
-        apply_selection=False,
-    )
+
+    # Run the sample-warning AND ``fit_transform`` inside a single
+    # capture context so the sampling notice ends up in the JSON
+    # payload's ``warnings`` field instead of bleeding onto stderr.
+    with _capture_featcopilot_messages() as captured_warnings:
+        if sample_size is not None and n_sampled > sample_size:
+            warnings.warn(
+                f"explain: sampling input down to {sample_size} of {n_sampled} rows. "
+                "Some engines (e.g. TabularEngine categorical encoding) decide which "
+                "features to plan based on row counts and per-category statistics, "
+                "so the reported metadata may differ from a full-input transform run.",
+                UserWarning,
+                stacklevel=2,
+            )
+            sample_idx = X.sample(n=sample_size, random_state=0).index
+            X = X.loc[sample_idx]
+            if y is not None:
+                y = y.loc[sample_idx]
+            n_sampled = sample_size
+
+        engineer.fit_transform(
+            X,
+            y,
+            task_description=args.task_description or "prediction task",
+            target_name=args.target,
+            apply_selection=False,
+        )
 
     explanations = engineer.explain_features()
     code = engineer.get_feature_code()
@@ -954,6 +982,17 @@ def _build_parser() -> argparse.ArgumentParser:
         "--task-description",
         help="Natural-language ML task description (used by the LLM engine).",
     )
+    p_explain.add_argument(
+        "--explain-sample-size",
+        type=int,
+        default=None,
+        help="Cap the input fed to the engineer at this many rows (deterministic seed). "
+        "OFF by default: the full input is used so the metadata is a faithful description "
+        "of what a corresponding `transform` would generate. Pass a positive integer ONLY "
+        "when you knowingly accept that some engines (e.g. TabularEngine categorical "
+        "encoding) decide which features to plan based on row counts and per-category "
+        "statistics, so the reported metadata may differ from a full-input run.",
+    )
     _add_engineer_args(p_explain, include_selection_args=False)
     p_explain.add_argument("--json", action="store_true", help="(Always JSON — flag accepted for symmetry.)")
     p_explain.set_defaults(func=_cmd_explain)
diff --git a/tests/test_cli.py b/tests/test_cli.py
index 47a75c7..e29e5e1 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -1826,14 +1826,15 @@ def test_explain_emits_json_payload(tmp_path: Path, tabular_csv: Path):
     assert entry["name"]
 
 
-def test_explain_caps_input_size_for_large_inputs(tmp_path: Path):
-    """``explain`` is metadata-only. To bound memory / compute on large
-    inputs, the CLI sub-samples to at most ``_EXPLAIN_SAMPLE_SIZE`` rows
-    before running ``fit_transform``. The payload reports ``n_rows_used``
-    so callers can confirm the sampling.
+def test_explain_uses_full_input_by_default(tmp_path: Path):
+    """``explain`` defaults to using the FULL input — no implicit
+    sub-sampling. Some engines (e.g. ``TabularEngine`` categorical
+    encoding) decide which features to plan based on row counts and
+    per-category statistics, so silent sampling would change the
+    advertised metadata. Sampling is opt-in via ``--explain-sample-size``.
     """
     rng = np.random.default_rng(0)
-    n = fc_cli._EXPLAIN_SAMPLE_SIZE * 5  # well above the cap
+    n = 1500  # arbitrary
     df = pd.DataFrame(
         {
             "x1": rng.normal(size=n),
@@ -1856,17 +1857,54 @@ def test_explain_caps_input_size_for_large_inputs(tmp_path: Path):
     assert rc == 0, err
     payload = json.loads(out)
     assert payload["status"] == "ok"
+    # Default: no sampling — full input is used.
+    assert payload["n_rows_used"] == n
+
+
+def test_explain_caps_input_size_when_sample_size_set(tmp_path: Path):
+    """When ``--explain-sample-size N`` is passed, the input is capped at
+    ``N`` rows (with a captured warning) so callers can opt into bounded
+    cost on huge inputs. The default remains full-input.
+    """
+    rng = np.random.default_rng(0)
+    n = 5000
+    df = pd.DataFrame(
+        {
+            "x1": rng.normal(size=n),
+            "x2": rng.normal(size=n),
+            "y": rng.integers(0, 2, size=n),
+        }
+    )
+    in_path = tmp_path / "big.csv"
+    df.to_csv(in_path, index=False)
+
+    rc, out, err = _run(
+        [
+            "explain",
+            "--input",
+            str(in_path),
+            "--target",
+            "y",
+            "--explain-sample-size",
+            "1000",
+        ]
+    )
+    assert rc == 0, err
+    payload = json.loads(out)
     # Sampling cap was enforced.
-    assert payload["n_rows_used"] == fc_cli._EXPLAIN_SAMPLE_SIZE
+    assert payload["n_rows_used"] == 1000
     assert payload["n_features"] > 0
+    # The CLI emits a warning when sampling so callers can detect that
+    # metadata may not match a full-input transform run.
+    assert any("sampling" in w.lower() for w in payload["warnings"])
 
 
-def test_explain_uses_full_input_when_smaller_than_sample_cap(tmp_path: Path):
-    """When the input has fewer rows than ``_EXPLAIN_SAMPLE_SIZE``, the
-    sampler is a no-op: ``n_rows_used`` reflects the actual input size.
+def test_explain_sample_size_smaller_than_input_no_op(tmp_path: Path):
+    """When ``--explain-sample-size`` exceeds the actual input, no sampling
+    happens (and no warning is emitted).
     """
     rng = np.random.default_rng(0)
-    n = 50  # well below the cap
+    n = 50
     df = pd.DataFrame(
         {
             "x1": rng.normal(size=n),
@@ -1884,19 +1922,20 @@ def test_explain_uses_full_input_when_smaller_than_sample_cap(tmp_path: Path):
             str(in_path),
             "--target",
             "y",
+            "--explain-sample-size",
+            "1000",
         ]
     )
     assert rc == 0, err
     payload = json.loads(out)
     assert payload["n_rows_used"] == n
+    assert not any("sampling" in w.lower() for w in payload["warnings"])
 
 
-def test_explain_sampling_is_deterministic(tmp_path: Path):
-    """Re-running ``explain`` on the same large input produces the same
-    set of feature names (sampling uses a fixed ``random_state``).
-    """
+def test_explain_sample_size_via_config(tmp_path: Path):
+    """``explain_sample_size`` is also recognized in ``--config`` JSON."""
     rng = np.random.default_rng(0)
-    n = fc_cli._EXPLAIN_SAMPLE_SIZE * 3
+    n = 5000
     df = pd.DataFrame(
         {
             "x1": rng.normal(size=n),
@@ -1907,12 +1946,79 @@ def test_explain_sampling_is_deterministic(tmp_path: Path):
     in_path = tmp_path / "big.csv"
     df.to_csv(in_path, index=False)
 
-    def _names():
-        rc, out, _ = _run(["explain", "--input", str(in_path), "--target", "y"])
-        assert rc == 0
-        return sorted(f["name"] for f in json.loads(out)["features"])
+    cfg = tmp_path / "cfg.json"
+    cfg.write_text(json.dumps({"explain_sample_size": 500}))
+
+    rc, out, err = _run(
+        [
+            "explain",
+            "--input",
+            str(in_path),
+            "--target",
+            "y",
+            "--config",
+            str(cfg),
+        ]
+    )
+    assert rc == 0, err
+    payload = json.loads(out)
+    assert payload["n_rows_used"] == 500
+
 
-    assert _names() == _names()
+@pytest.mark.parametrize("bad_value", [0, -1, -100])
+def test_explain_sample_size_rejects_non_positive(tabular_csv: Path, bad_value):
+    """Zero / negative ``--explain-sample-size`` is rejected with exit 2."""
+    rc, _, err = _run(
+        [
+            "explain",
+            "--input",
+            str(tabular_csv),  # must exist: input existence is checked before the sample size
+            "--target",
+            "y",
+            "--explain-sample-size",
+            str(bad_value),
+        ]
+    )
+    # Pin the failure to the sample-size validation, not an unrelated input error.
+    assert rc == 2
+    assert "explain_sample_size" in err
+
+
+def test_explain_sample_size_rejects_string_in_config(tmp_path: Path, tabular_csv: Path):
+    """Type-validation: ``"explain_sample_size": "100"`` (string) is rejected."""
+    cfg = tmp_path / "cfg.json"
+    cfg.write_text(json.dumps({"explain_sample_size": "100"}))
+    rc, _, err = _run(
+        [
+            "explain",
+            "--input",
+            str(tabular_csv),
+            "--target",
+            "y",
+            "--config",
+            str(cfg),
+        ]
+    )
+    assert rc == 2
+    assert "explain_sample_size" in err
+
+
+def test_explain_sample_size_rejects_zero_in_config(tmp_path: Path, tabular_csv: Path):
+    cfg = tmp_path / "cfg.json"
+    cfg.write_text(json.dumps({"explain_sample_size": 0}))
+    rc, _, err = _run(
+        [
+            "explain",
+            "--input",
+            str(tabular_csv),
+            "--target",
+            "y",
+            "--config",
+            str(cfg),
+        ]
+    )
+    assert rc == 2
+    assert "explain_sample_size" in err
 
 
 # --------------------------------------------------------------- parquet path

From e85b79194a2947c8ea9a81878dace5ad43e187b2 Mon Sep 17 00:00:00 2001
From: Li Jiang <lijiang1@microsoft.com>
Date: Mon, 4 May 2026 21:47:37 +0800
Subject: [PATCH 20/30] fix(cli): address round-15 review feedback

Addresses both new comments from copilot-pull-request-reviewer on
PR #5 (commit 5946805):

* Remove dead helper _fit_capturing_warnings (Copilot Wufb).
  After round-14 inlined the capture context into `_cmd_explain`
  (so the sampling `UserWarning` lands in the JSON payload alongside
  engine warnings), the `_fit_capturing_warnings` thin wrapper has
  no callers. Leaving it would create two competing capture paths
  that could drift apart over time. Removed; the actively used
  `_fit_transform_capturing_warnings` (called by `_cmd_transform`)
  remains.

* Console-script tests fail loudly when the package is installed but
  the script is missing (Copilot WueV). The previous
  `pytest.skip("not on PATH")` would have hidden a real
  `[project.scripts]` regression in CI: the tests workflow does
  `pip install -e .` before pytest, so the script MUST be on PATH.
  New `_featcopilot_package_is_installed` helper distinguishes
  the two scenarios:
  - Package installed, script missing -> `pytest.fail` with a
    message explaining the packaging regression.
  - Package not installed (rare: running tests against an un-installed
    source tree) -> `pytest.skip` with an install hint.
  Both `test_console_script_subprocess_invocation` and
  `test_console_script_version_flag` use the new policy.

Tests: 102 in tests/test_cli.py (unchanged count; helper + policy
update), 875 passed full suite.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 featcopilot/cli.py |  7 ------
 tests/test_cli.py  | 56 +++++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 51 insertions(+), 12 deletions(-)

diff --git a/featcopilot/cli.py b/featcopilot/cli.py
index 9c31eaa..09deabe 100644
--- a/featcopilot/cli.py
+++ b/featcopilot/cli.py
@@ -508,13 +508,6 @@ def _fit_transform_capturing_warnings(engineer, X, y, **kwargs):
     return captured, result
 
 
-def _fit_capturing_warnings(engineer, X, y, **kwargs):
-    """Sibling of :func:`_fit_transform_capturing_warnings` for explain."""
-    with _capture_featcopilot_messages() as captured:
-        engineer.fit_transform(X, y, **kwargs)
-    return captured
-
-
 class _ThreadCaptureState:
     """Holds per-thread capture *stacks*.
 
diff --git a/tests/test_cli.py b/tests/test_cli.py
index e29e5e1..c297c96 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -2229,22 +2229,59 @@ def test_dunder_main_subprocess_version_flag():
 # ------------------------------------------------------- console script
 
 
+def _featcopilot_package_is_installed() -> bool:
+    """Return True iff the ``featcopilot`` distribution is installed in the
+    current environment (i.e. the entry-point machinery should have placed
+    the console script on ``PATH``).
+
+    Used by the console-script tests to distinguish two cases:
+
+    * Running tests directly against the source tree (``python -m pytest``
+      from a clean checkout, no ``pip install -e .``): the package is
+      *not* installed; the script is legitimately missing and the test
+      should ``skip`` rather than report a packaging bug.
+    * Running tests after ``pip install`` (the CI flow): the package IS
+      installed, so the script MUST be on ``PATH``. If it isn't, that's a
+      real ``[project.scripts]`` regression and the test should ``fail``,
+      not silently pass via skip.
+    """
+    try:
+        from importlib.metadata import PackageNotFoundError, distribution
+    except ImportError:  # pragma: no cover - py3.10+ always has this
+        return False
+    try:
+        distribution("featcopilot")
+    except PackageNotFoundError:
+        return False
+    return True
+
+
 def test_console_script_subprocess_invocation():
     """The installed ``featcopilot`` console script must be on PATH and runnable.
 
     Exercises the ``[project.scripts] featcopilot = "featcopilot.cli:main"``
     entry point end-to-end so a typo or packaging regression in
-    ``pyproject.toml`` would actually break the suite. Skipped when the
-    script isn't on ``PATH`` (e.g. running tests without ``pip install``).
+    ``pyproject.toml`` would actually break the suite. When the
+    ``featcopilot`` distribution is installed, the script must be on
+    ``PATH``: a missing script in that case is a real packaging
+    regression, not a test environment quirk, so we ``fail`` (not
+    ``skip``). The skip is reserved for the rare case of running tests
+    against an un-installed source tree.
     """
     import shutil
     import subprocess
 
     script = shutil.which("featcopilot")
     if script is None:
+        if _featcopilot_package_is_installed():
+            pytest.fail(
+                "featcopilot package is installed but the `featcopilot` console "
+                "script is missing from PATH. This is a `[project.scripts]` "
+                "regression in pyproject.toml."
+            )
         pytest.skip(
-            "featcopilot console script not on PATH (install the package "
-            "with `pip install -e .` to exercise the entry point)"
+            "featcopilot package is not installed in this environment; install "
+            "it with `pip install -e .` to exercise the console-script entry point."
         )
 
     result = subprocess.run(
@@ -2261,12 +2298,21 @@ def test_console_script_subprocess_invocation():
 
 
 def test_console_script_version_flag():
+    """Same install-aware skip/fail policy as
+    :func:`test_console_script_subprocess_invocation`.
+    """
     import shutil
     import subprocess
 
     script = shutil.which("featcopilot")
     if script is None:
-        pytest.skip("featcopilot console script not on PATH")
+        if _featcopilot_package_is_installed():
+            pytest.fail(
+                "featcopilot package is installed but the `featcopilot` console "
+                "script is missing from PATH. This is a `[project.scripts]` "
+                "regression in pyproject.toml."
+            )
+        pytest.skip("featcopilot package is not installed in this environment.")
 
     result = subprocess.run(
         [script, "--version"],

From 15c15e9cf4aafda563b8bd32009186f30bbbb3ca Mon Sep 17 00:00:00 2001
From: Li Jiang <lijiang1@microsoft.com>
Date: Mon, 4 May 2026 22:52:57 +0800
Subject: [PATCH 21/30] fix(cli): address round-16 review feedback

Addresses both new comments from copilot-pull-request-reviewer on
PR #5 (commit e85b791):

* Catch pandas EmptyDataError in CSV read (Copilot YL1H).
  `_read_table` now catches `pandas.errors.EmptyDataError` in
  addition to `OSError` / `ParserError` / `UnicodeDecodeError`.
  Without it, a zero-byte / headerless `.csv` would have fallen
  through to the generic exit-1 "unexpected error" path instead of
  the documented exit-2 user-input error. New tests cover both
  zero-byte and "newlines-only" inputs.

* Detect target/feature name collision in --include-target
  (Copilot YL2C). `--include-target` used to blindly assign
  `transformed[target_name] = y.values`, which silently overwrites
  any engineered feature that happens to share the target's column
  name (e.g. a target named `x1_pow2` or `a_x_b` matching a
  tabular-engine derived feature). The CLI now detects the collision
  before assigning and raises a precise exit-2 error so the user can
  rename the target or drop `--include-target`. Two tests cover
  the contract: a real-data attempt that may or may not trip the
  collision (skipped when the engine doesn't materialize the
  colliding name), and a deterministic version that monkey-patches
  the engineer to inject a colliding column.

Tests: 106 (+4 new) in tests/test_cli.py, 879 passed full suite.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 featcopilot/cli.py |  26 ++++++++-
 tests/test_cli.py  | 138 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 162 insertions(+), 2 deletions(-)

diff --git a/featcopilot/cli.py b/featcopilot/cli.py
index 09deabe..d25cc8e 100644
--- a/featcopilot/cli.py
+++ b/featcopilot/cli.py
@@ -133,7 +133,16 @@ def _read_table(path: Path, fmt: str):
     if fmt == "csv":
         try:
             return pd.read_csv(path)
-        except (OSError, pd.errors.ParserError, UnicodeDecodeError) as exc:
+        except (
+            OSError,
+            pd.errors.ParserError,
+            pd.errors.EmptyDataError,
+            UnicodeDecodeError,
+        ) as exc:
+            # ``EmptyDataError`` fires for headerless / zero-byte CSVs;
+            # without it, those inputs would fall into the generic exit-1
+            # "unexpected error" path instead of the documented exit-2
+            # user-input error.
             raise ValueError(f"Failed to read CSV from {str(path)!r}: {exc}") from exc
     if fmt == "parquet":
         try:
@@ -766,8 +775,21 @@ def _cmd_transform(args: argparse.Namespace) -> int:
 
     if args.include_target and y is not None:
         # Re-attach the target column so downstream training scripts can
-        # consume the engineered file as a single artifact.
+        # consume the engineered file as a single artifact. Detect column
+        # collisions: if an engineered feature happens to share the
+        # target's column name (e.g. a target named ``foo_pow2`` matching
+        # a tabular-engine derived feature), blindly assigning ``transformed[
+        # target_name] = y.values`` would silently overwrite the engineered
+        # column. Surface that as a clean exit-2 error instead. Callers
+        # who knowingly want to overwrite can rename their target before
+        # invoking ``transform`` (or skip ``--include-target``).
         target_name = args.target if args.target in df.columns else "target"
+        if target_name in transformed.columns:
+            raise ValueError(
+                f"--include-target would overwrite engineered feature {target_name!r} "
+                "with the target values. Rename the target column in the input file, "
+                "drop --include-target, or accept the rename and retry."
+            )
         transformed = transformed.copy()
         transformed[target_name] = y.values
 
diff --git a/tests/test_cli.py b/tests/test_cli.py
index c297c96..e4bf86c 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -859,6 +859,144 @@ def _raise_oserror(*args, **kwargs):
     assert "failed to read" in err.lower()
 
 
+def test_empty_csv_input_returns_exit_2(tmp_path: Path):
+    """A zero-byte / headerless CSV triggers ``pandas.errors.EmptyDataError``,
+    which must be normalized to the documented exit-2 user-input error path
+    rather than falling through to the generic exit-1 backstop.
+    """
+    in_path = tmp_path / "empty.csv"
+    in_path.write_text("")  # zero bytes -> EmptyDataError on read
+
+    rc, _, err = _run(
+        [
+            "transform",
+            "--input",
+            str(in_path),
+            "--output",
+            str(tmp_path / "out.csv"),
+            "--target",
+            "y",
+        ]
+    )
+    assert rc == 2
+    assert "failed to read csv" in err.lower()
+
+
+def test_headerless_csv_input_returns_exit_2(tmp_path: Path):
+    """A CSV with no header and no rows is also empty-data territory and
+    must surface as exit 2.
+    """
+    in_path = tmp_path / "headerless.csv"
+    in_path.write_text("\n\n\n")  # only newlines, no header
+
+    rc, _, err = _run(
+        [
+            "transform",
+            "--input",
+            str(in_path),
+            "--output",
+            str(tmp_path / "out.csv"),
+            "--target",
+            "y",
+        ]
+    )
+    assert rc == 2
+    assert "failed to read csv" in err.lower()
+
+
+def test_transform_include_target_collision_returns_exit_2(tmp_path: Path):
+    """``--include-target`` would silently overwrite an engineered feature
+    if it happens to share the target column's name. The CLI must detect
+    that collision and fail with exit 2 instead of losing the engineered
+    feature.
+
+    A target named ``x1_pow2`` (which the tabular engine generates as a
+    derived feature from a numeric column ``x1``) provokes the collision.
+    """
+    rng = np.random.default_rng(0)
+    n = 200
+    df = pd.DataFrame(
+        {
+            "x1": rng.normal(size=n),
+            "x2": rng.normal(size=n),
+            # Target column has a name that the tabular engine would also
+            # generate (``x1_pow2`` etc. is in the tabular engine's
+            # derived feature catalog).
+            "x1_pow2": rng.integers(0, 2, size=n),
+        }
+    )
+    in_path = tmp_path / "collision.csv"
+    df.to_csv(in_path, index=False)
+    out_path = tmp_path / "out.csv"
+
+    rc, _, err = _run(
+        [
+            "transform",
+            "--input",
+            str(in_path),
+            "--output",
+            str(out_path),
+            "--target",
+            "x1_pow2",
+            "--include-target",
+            "--max-features",
+            "5",
+        ]
+    )
+    # Either the engineered set actually contains the colliding name (in
+    # which case we MUST exit 2), or selection happened to drop it. Skip
+    # if the engine didn't materialize the colliding feature this run —
+    # the test is about the contract, not whether ``x1_pow2`` is always
+    # generated.
+    if rc == 2:
+        assert "include-target would overwrite" in err.lower()
+        assert "x1_pow2" in err
+    else:
+        # No collision actually occurred; the test is a no-op for this
+        # input. Future engine changes that always emit ``x1_pow2`` will
+        # expose the collision branch.
+        assert rc == 0, err
+
+
+def test_transform_include_target_collision_deterministic(tmp_path: Path, tabular_csv: Path, monkeypatch):
+    """Deterministic version of the collision test: monkey-patch the
+    engineer so its transformed frame contains a column with the target's
+    name. This guarantees we exercise the exit-2 collision branch
+    regardless of which features the real engineer picks.
+    """
+    from featcopilot.transformers.sklearn_compat import AutoFeatureEngineer
+
+    real_fit_transform = AutoFeatureEngineer.fit_transform
+
+    def _patched_fit_transform(self, X, y=None, **kwargs):
+        result = real_fit_transform(self, X, y, **kwargs)
+        # Inject a column named ``y`` into the result so it collides with
+        # the target column the test will pass.
+        result = result.copy()
+        result["y"] = result.iloc[:, 0]  # arbitrary engineered values
+        return result
+
+    monkeypatch.setattr(AutoFeatureEngineer, "fit_transform", _patched_fit_transform)
+
+    rc, _, err = _run(
+        [
+            "transform",
+            "--input",
+            str(tabular_csv),
+            "--output",
+            str(tmp_path / "out.csv"),
+            "--target",
+            "y",
+            "--include-target",
+            "--max-features",
+            "5",
+        ]
+    )
+    assert rc == 2
+    assert "include-target would overwrite" in err.lower()
+    assert "'y'" in err
+
+
 def test_unreadable_input_json_returns_exit_2(tmp_path: Path, tabular_csv: Path, monkeypatch):
     """``OSError`` from ``pd.read_json`` is surfaced as exit 2 too."""
     import pandas as pd

From 5cbb843b105d625fc746bb8ad7288a767115e628 Mon Sep 17 00:00:00 2001
From: Li Jiang <lijiang1@microsoft.com>
Date: Mon, 4 May 2026 23:29:29 +0800
Subject: [PATCH 22/30] fix(cli): address round-17 review feedback
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Addresses both new comments from copilot-pull-request-reviewer on
PR #5 (commit 15c15e9):

* Sample by position, not label (Copilot Ymol).
  `_cmd_explain`'s sampling path used `.sample(...).index` plus
  `.loc[sample_idx]` to keep `X` and `y` aligned. `.loc`
  selects by label, so when the input has a non-unique index — common
  with parquet files that preserve a saved index — duplicate labels
  expand or reorder rows and `X` and `y` no longer line up. The
  CLI now samples by *position* with a seeded NumPy RNG and uses
  `.iloc[sample_positions]` for both, which is index-agnostic.
  `test_explain_sample_size_handles_non_unique_index` reads a
  parquet file with a deliberately duplicated index and asserts
  `n_rows_used` matches the requested cap exactly.

* --include-target collision message lists only actionable options
  (Copilot YmpI). The error text mentioned "accept the rename and
  retry", but the CLI does not offer any rename / auto-rename
  option. The misleading suffix is removed; the message now says
  "Rename the target column in the input file, or drop
  --include-target." (only options the caller can actually act on).
  `test_include_target_collision_error_text_lists_only_actionable_options`
  asserts the new wording and acts as a regression guard against the
  old phantom-option phrasing.

Tests: 108 (+2 new) in tests/test_cli.py, 881 passed full suite.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 featcopilot/cli.py | 22 +++++++++---
 tests/test_cli.py  | 85 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 103 insertions(+), 4 deletions(-)

diff --git a/featcopilot/cli.py b/featcopilot/cli.py
index d25cc8e..8792d12 100644
--- a/featcopilot/cli.py
+++ b/featcopilot/cli.py
@@ -50,6 +50,8 @@
 from pathlib import Path
 from typing import Any
 
+import numpy as np
+
 from featcopilot import __version__
 from featcopilot.transformers.sklearn_compat import AutoFeatureEngineer
 from featcopilot.utils.logger import get_logger
@@ -788,7 +790,7 @@ def _cmd_transform(args: argparse.Namespace) -> int:
             raise ValueError(
                 f"--include-target would overwrite engineered feature {target_name!r} "
                 "with the target values. Rename the target column in the input file, "
-                "drop --include-target, or accept the rename and retry."
+                "or drop --include-target."
             )
         transformed = transformed.copy()
         transformed[target_name] = y.values
@@ -887,10 +889,22 @@ def _cmd_explain(args: argparse.Namespace) -> int:
                 UserWarning,
                 stacklevel=2,
             )
-            sample_idx = X.sample(n=sample_size, random_state=0).index
-            X = X.loc[sample_idx]
+            # Sample by *position* (``.iloc[...]``), not label
+            # (``.sample(...).index`` + ``.loc[...]``). ``.loc`` selects
+            # by label, so a non-unique index — common when reading
+            # parquet files that preserve a saved index — would let
+            # duplicate labels expand or reorder rows so ``X`` and ``y``
+            # no longer line up. Positional sampling via a NumPy RNG +
+            # ``.iloc`` keeps them aligned regardless of input index.
+            rng_sampler = np.random.default_rng(0)
+            sample_positions = rng_sampler.choice(n_sampled, size=sample_size, replace=False)
+            # Sort the positions for determinism / readable output ordering
+            # (the random selection itself is already deterministic via
+            # the seeded RNG).
+            sample_positions.sort()
+            X = X.iloc[sample_positions]
             if y is not None:
-                y = y.loc[sample_idx]
+                y = y.iloc[sample_positions]
             n_sampled = sample_size
 
         engineer.fit_transform(
diff --git a/tests/test_cli.py b/tests/test_cli.py
index e4bf86c..b43a850 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -2317,6 +2317,91 @@ def _raise_oserror(self, *args, **kwargs):
     assert "could not be read" in err.lower()
 
 
+def test_explain_sample_size_handles_non_unique_index(tmp_path: Path):
+    """Sampling must keep X and y aligned even when the input frame has a
+    non-unique index — e.g. a parquet read that preserves a saved index
+    where labels can repeat. Positional sampling (``.iloc``) avoids the
+    label-based ``.loc`` expansion / reordering bug.
+    """
+    pytest.importorskip("pyarrow")  # parquet write needs an engine
+
+    rng = np.random.default_rng(0)
+    n = 4000
+    df = pd.DataFrame(
+        {
+            "x1": rng.normal(size=n),
+            "x2": rng.normal(size=n),
+            "y": rng.integers(0, 2, size=n),
+        }
+    )
+    # Force a non-unique index — labels repeat (each label appears twice).
+    df.index = pd.Index([i // 2 for i in range(n)], name="duplicated_index")
+    in_path = tmp_path / "non_unique.parquet"
+    df.to_parquet(in_path, index=True)
+
+    rc, out, err = _run(
+        [
+            "explain",
+            "--input",
+            str(in_path),
+            "--target",
+            "y",
+            "--explain-sample-size",
+            "100",
+        ]
+    )
+    assert rc == 0, err
+    payload = json.loads(out)
+    assert payload["status"] == "ok"
+    # Sample size must be honored exactly, not expanded by ``.loc``-with-
+    # duplicate-labels behavior.
+    assert payload["n_rows_used"] == 100
+
+
+def test_include_target_collision_error_text_lists_only_actionable_options(
+    tmp_path: Path, tabular_csv: Path, monkeypatch
+):
+    """The error text emitted when ``--include-target`` would overwrite an
+    engineered feature must only suggest actions that are actually
+    possible from this command. The CLI does not offer auto-rename, so
+    the message must NOT mention "rename and retry" or any other phantom
+    option.
+    """
+    from featcopilot.transformers.sklearn_compat import AutoFeatureEngineer
+
+    real_fit_transform = AutoFeatureEngineer.fit_transform
+
+    def _patched_fit_transform(self, X, y=None, **kwargs):
+        result = real_fit_transform(self, X, y, **kwargs)
+        result = result.copy()
+        result["y"] = result.iloc[:, 0]
+        return result
+
+    monkeypatch.setattr(AutoFeatureEngineer, "fit_transform", _patched_fit_transform)
+
+    rc, _, err = _run(
+        [
+            "transform",
+            "--input",
+            str(tabular_csv),
+            "--output",
+            str(tmp_path / "out.csv"),
+            "--target",
+            "y",
+            "--include-target",
+            "--max-features",
+            "5",
+        ]
+    )
+    assert rc == 2
+    # Must mention the real options.
+    assert "rename the target column" in err.lower()
+    assert "drop --include-target" in err
+    # Must NOT mention non-existent CLI options.
+    assert "accept the rename" not in err.lower()
+    assert "retry" not in err.lower()
+
+
 # --------------------------------------------------------------- python -m
 
 

From f710fbe772fc1e340daea33f61cf133a6a06f463 Mon Sep 17 00:00:00 2001
From: Li Jiang <lijiang1@microsoft.com>
Date: Tue, 5 May 2026 08:03:51 +0800
Subject: [PATCH 23/30] fix(cli): address round-18 review feedback
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Addresses both new comments from copilot-pull-request-reviewer on
PR #5 (commit 5cbb843):

* --explain-sample-size now actually bounds memory for CSV input
  (Copilot gHFx).
  Round-13/14's sampling trimmed `X`/`y` AFTER the entire input
  had already been loaded into a pandas DataFrame, so on huge CSVs
  the subcommand could OOM before reaching the sampling branch —
  contradicting the "bounded cost on huge inputs" contract.

  `_read_table` now accepts an `nrows` parameter and propagates
  it to the underlying read:
  - **CSV**: `pd.read_csv(path, nrows=N)` — memory-bounded by
    pandas natively, never loads more than `N` rows.
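  - **parquet / JSON**: `pd.read_parquet` has no row-limit argument
    and `pd.read_json` only honors `nrows` for line-delimited input
    (`lines=True`), which this reader does not use, so the file is
    fully read and then truncated to its first `N` rows. When that
    truncation actually drops rows, a `UserWarning` is emitted
    (captured into the JSON payload's `warnings` field) explaining
    the limitation and recommending CSV for hard memory bounds on
    huge inputs.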

  `_cmd_explain` now passes `nrows=sample_size` to `_read_table`
  and drops the post-read positional sampling step entirely, so the
  cap now keeps the first `N` rows of the input (a deterministic
  head slice) instead of a seeded random sample.
  `test_explain_sample_size_bounds_csv_read_with_nrows` spies on
  `pd.read_csv` to assert the `nrows=200` kwarg is actually
  threaded through (not just truncated post-load); the new
  `test_explain_sample_size_warns_post_read_for_parquet` asserts
  the post-read warning fires for parquet input.

* Worker-thread records routed via single-active-capture fallback
  (Copilot gHGE). `_ThreadCaptureState.get(tid)` previously
  returned `None` when the calling thread had no capture stack of
  its own. That meant log records emitted on worker threads spawned
  by the capturing thread (e.g. an LLM sync client wrapping
  `ThreadPoolExecutor` because it was called from a process with a
  running event loop) escaped capture and bled onto stderr.

  `get` now falls back: when exactly ONE thread in the process
  holds an active capture stack, records from any other thread are
  routed to the innermost capture on that stack. When two or more
  threads are capturing concurrently, the fallback stays disabled —
  each capture continues to see only its own thread's records, so
  concurrent CLI calls don't cross-contaminate.
  Both `_ThreadRoutingHandler` and `_routing_showwarning` go
  through the same `get`, so log records and `warnings.warn`
  calls share the policy.

  `test_capture_routes_worker_thread_records_to_single_active_capture`
  exercises a `ThreadPoolExecutor` worker plus a freshly spawned
  `threading.Thread` and asserts both are captured.
  `test_capture_keeps_thread_isolation_with_multiple_active_captures`
  guards the multi-capture isolation property.

Cleanup: dropped the now-unused `import numpy as np` from
`featcopilot/cli.py`.

Tests: 112 (+4 new) in tests/test_cli.py, 885 passed full suite.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 featcopilot/cli.py | 121 +++++++++++++++++++++++---------
 tests/test_cli.py  | 170 ++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 256 insertions(+), 35 deletions(-)

diff --git a/featcopilot/cli.py b/featcopilot/cli.py
index 8792d12..ba87ca9 100644
--- a/featcopilot/cli.py
+++ b/featcopilot/cli.py
@@ -50,8 +50,6 @@
 from pathlib import Path
 from typing import Any
 
-import numpy as np
-
 from featcopilot import __version__
 from featcopilot.transformers.sklearn_compat import AutoFeatureEngineer
 from featcopilot.utils.logger import get_logger
@@ -117,7 +115,7 @@ def _detect_format(path: Path, override: str | None) -> str:
     return fmt
 
 
-def _read_table(path: Path, fmt: str):
+def _read_table(path: Path, fmt: str, *, nrows: int | None = None):
     """Read a tabular file into a pandas DataFrame.
 
     All user-facing failure modes (missing parquet engine, ``--input``
@@ -126,6 +124,22 @@ def _read_table(path: Path, fmt: str):
     top-level handler routes them to the deterministic ``exit 2``
     user-error path. The generic ``exit 1`` backstop is reserved for
     truly unexpected (i.e. CLI-internal) errors.
+
+    Parameters
+    ----------
+    path : pathlib.Path
+        File to read.
+    fmt : str
+        One of ``csv`` / ``parquet`` / ``json``.
+    nrows : int or None, optional
+        Cap the number of rows returned. For ``csv``, this is propagated
+        directly to :func:`pandas.read_csv` so the underlying read is
+        memory-bounded. For ``parquet`` and ``json``, pandas does not
+        expose a native row limit, so the file is fully read and then
+        truncated; a :class:`UserWarning` is issued in that case so the
+        caller knows the bound is post-read (not memory-bounded). The
+        ``nrows`` cap is applied with a deterministic head slice so
+        re-runs on the same input produce the same metadata.
     """
     import pandas as pd
 
@@ -134,7 +148,11 @@ def _read_table(path: Path, fmt: str):
 
     if fmt == "csv":
         try:
-            return pd.read_csv(path)
+            # ``nrows`` is the only memory-bound knob native to read_csv;
+            # passing it here is what lets ``--explain-sample-size`` actually
+            # cap memory on huge CSV inputs (rather than loading the entire
+            # file and then trimming).
+            return pd.read_csv(path, nrows=nrows)
         except (
             OSError,
             pd.errors.ParserError,
@@ -148,7 +166,7 @@ def _read_table(path: Path, fmt: str):
             raise ValueError(f"Failed to read CSV from {str(path)!r}: {exc}") from exc
     if fmt == "parquet":
         try:
-            return pd.read_parquet(path)
+            df = pd.read_parquet(path)
         except ImportError as exc:
             raise ValueError(
                 f"Reading parquet requires a parquet engine (pyarrow or fastparquet); "
@@ -163,18 +181,42 @@ def _read_table(path: Path, fmt: str):
             # operation is delegated to a third-party backend; any error
             # raised is by definition an I/O or data issue, not a CLI bug.
             raise ValueError(f"Failed to read parquet from {str(path)!r}: {exc}") from exc
+        if nrows is not None and len(df) > nrows:
+            warnings.warn(
+                f"--explain-sample-size cap is applied post-read for parquet "
+                f"(loaded {len(df)} rows, truncating to {nrows}). pandas "
+                "does not expose a native parquet row-limit, so the full "
+                "file is materialized in memory before the cap. For hard "
+                "memory bounds on huge inputs, convert to CSV first.",
+                UserWarning,
+                stacklevel=2,
+            )
+            df = df.iloc[:nrows]
+        return df
     if fmt == "json":
         # ``orient='records'`` is the agent-friendly default; fall back to
         # pandas' auto-detection when the file isn't a records list.
         try:
-            return pd.read_json(path, orient="records")
+            df = pd.read_json(path, orient="records")
         except ValueError:
             try:
-                return pd.read_json(path)
+                df = pd.read_json(path)
             except ValueError as exc:
                 raise ValueError(f"Failed to read JSON from {str(path)!r}: {exc}") from exc
         except OSError as exc:
             raise ValueError(f"Failed to read JSON from {str(path)!r}: {exc}") from exc
+        if nrows is not None and len(df) > nrows:
+            warnings.warn(
+                f"--explain-sample-size cap is applied post-read for JSON "
+                f"(loaded {len(df)} rows, truncating to {nrows}). pandas "
+                "does not expose a native JSON row-limit, so the full "
+                "file is materialized in memory before the cap. For hard "
+                "memory bounds on huge inputs, convert to CSV first.",
+                UserWarning,
+                stacklevel=2,
+            )
+            df = df.iloc[:nrows]
+        return df
     raise ValueError(f"Unsupported input format: {fmt}")
 
 
@@ -520,7 +562,7 @@ def _fit_transform_capturing_warnings(engineer, X, y, **kwargs):
 
 
 class _ThreadCaptureState:
-    """Holds per-thread capture *stacks*.
+    """Holds per-thread capture *stacks* with a single-active-capture fallback.
 
     Each thread maps to a stack of capture lists. Nested
     :func:`_capture_featcopilot_messages` calls on the same thread push
@@ -528,6 +570,18 @@ class _ThreadCaptureState:
     and receives records / warnings until its block exits, at which
     point the outer capture (if any) becomes active again.
 
+    **Worker-thread fallback.** When the calling thread doesn't have a
+    capture but exactly one capture is active anywhere in the process,
+    :meth:`get` returns that single capture. This handles the common
+    case where the capturing thread spawns worker threads (e.g. an LLM
+    sync client wrapping ``ThreadPoolExecutor`` because it was called
+    from a process with a running event loop) — those workers' log
+    records logically belong to the single in-flight CLI run, and
+    routing them there keeps stderr clean. When more than one capture
+    is active concurrently, the fallback stays disabled (each captures
+    only its own thread's records) so concurrent CLI calls don't bleed
+    into each other.
+
     Shared by :class:`_ThreadRoutingHandler` (writes records),
     :class:`_SuppressCapturingFilter` (suppresses stderr), and the
     routing ``warnings.showwarning`` override.
@@ -550,11 +604,21 @@ def pop(self, tid: int) -> None:
                     del self._per_thread[tid]
 
     def get(self, tid: int) -> list[str] | None:
-        # Brief lock for thread-safe stack-top read.
+        # Brief lock for thread-safe stack-top read AND single-active-
+        # capture fallback (both walk ``self._per_thread``).
         with self._lock:
             stack = self._per_thread.get(tid)
             if stack:
                 return stack[-1]
+            # Worker-thread fallback. Cross-thread records (e.g. from a
+            # ThreadPoolExecutor worker spawned by the capturing thread)
+            # are routed to the single active capture when there is no
+            # ambiguity. Multiple concurrent captures keep their strict
+            # per-thread isolation.
+            if len(self._per_thread) == 1:
+                only_stack = next(iter(self._per_thread.values()))
+                if only_stack:
+                    return only_stack[-1]
             return None
 
 
@@ -860,10 +924,13 @@ def _cmd_explain(args: argparse.Namespace) -> int:
         raise FileNotFoundError(f"Input file not found: {args.input}")
 
     in_fmt = _detect_format(input_path, args.input_format)
-    df = _read_table(input_path, in_fmt)
-    X, y = _split_xy(df, args.target)
 
     # Apply opt-in sample cap from CLI flag or config (CLI flag wins).
+    # Resolve and validate it BEFORE reading the input so the cap can be
+    # threaded into ``_read_table(... nrows=sample_size)`` to bound memory
+    # on huge inputs (CSV uses ``pd.read_csv(nrows=...)`` natively;
+    # parquet/JSON fall back to post-read truncation with a UserWarning
+    # since pandas doesn't expose a native row-limit for those formats).
     sample_size = getattr(args, "explain_sample_size", None)
     if sample_size is None and args.config is not None:
         sample_size = _load_config(args.config).get("explain_sample_size")
@@ -872,40 +939,30 @@ def _cmd_explain(args: argparse.Namespace) -> int:
         if sample_size <= 0:
             raise ValueError(f"`explain_sample_size` must be a positive integer when set; got {sample_size!r}.")
 
-    n_sampled = len(X)
-
     engineer = _build_engineer(args, include_selection_config=False)
 
     # Run the sample-warning AND ``fit_transform`` inside a single
     # capture context so the sampling notice ends up in the JSON
     # payload's ``warnings`` field instead of bleeding onto stderr.
     with _capture_featcopilot_messages() as captured_warnings:
-        if sample_size is not None and n_sampled > sample_size:
+        # Read with ``nrows=sample_size`` so the underlying I/O is
+        # memory-bounded for CSV; for parquet/JSON the bound is
+        # post-read with an emitted UserWarning (captured into the
+        # payload below). Reading FIRST gives us ``len(df)`` so we
+        # only emit the "metadata may differ" notice when the cap
+        # actually shortened the input.
+        df = _read_table(input_path, in_fmt, nrows=sample_size)
+        X, y = _split_xy(df, args.target)
+        n_sampled = len(X)
+        if sample_size is not None and n_sampled >= sample_size:
             warnings.warn(
-                f"explain: sampling input down to {sample_size} of {n_sampled} rows. "
+                f"explain: capping input to {sample_size} rows (sampling). "
                 "Some engines (e.g. TabularEngine categorical encoding) decide which "
                 "features to plan based on row counts and per-category statistics, "
                 "so the reported metadata may differ from a full-input transform run.",
                 UserWarning,
                 stacklevel=2,
             )
-            # Sample by *position* (``.iloc[...]``), not label
-            # (``.sample(...).index`` + ``.loc[...]``). ``.loc`` selects
-            # by label, so a non-unique index — common when reading
-            # parquet files that preserve a saved index — would let
-            # duplicate labels expand or reorder rows so ``X`` and ``y``
-            # no longer line up. Positional sampling via a NumPy RNG +
-            # ``.iloc`` keeps them aligned regardless of input index.
-            rng_sampler = np.random.default_rng(0)
-            sample_positions = rng_sampler.choice(n_sampled, size=sample_size, replace=False)
-            # Sort the positions for determinism / readable output ordering
-            # (the random selection itself is already deterministic via
-            # the seeded RNG).
-            sample_positions.sort()
-            X = X.iloc[sample_positions]
-            if y is not None:
-                y = y.iloc[sample_positions]
-            n_sampled = sample_size
 
         engineer.fit_transform(
             X,
diff --git a/tests/test_cli.py b/tests/test_cli.py
index b43a850..6d37277 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -2034,7 +2034,7 @@ def test_explain_caps_input_size_when_sample_size_set(tmp_path: Path):
     assert payload["n_features"] > 0
     # The CLI emits a warning when sampling so callers can detect that
     # metadata may not match a full-input transform run.
-    assert any("sampling" in w.lower() for w in payload["warnings"])
+    assert any("capping input" in w.lower() or "sampling" in w.lower() for w in payload["warnings"])
 
 
 def test_explain_sample_size_smaller_than_input_no_op(tmp_path: Path):
@@ -2067,7 +2067,7 @@ def test_explain_sample_size_smaller_than_input_no_op(tmp_path: Path):
     assert rc == 0, err
     payload = json.loads(out)
     assert payload["n_rows_used"] == n
-    assert not any("sampling" in w.lower() for w in payload["warnings"])
+    assert not any("capping input" in w.lower() or "sampling" in w.lower() for w in payload["warnings"])
 
 
 def test_explain_sample_size_via_config(tmp_path: Path):
@@ -2402,7 +2402,171 @@ def _patched_fit_transform(self, X, y=None, **kwargs):
     assert "retry" not in err.lower()
 
 
-# --------------------------------------------------------------- python -m
+# ----------------------- explain --explain-sample-size memory bound
+
+
+def test_explain_sample_size_bounds_csv_read_with_nrows(tmp_path: Path, monkeypatch):
+    """``--explain-sample-size N`` must propagate to ``pd.read_csv`` as
+    ``nrows=N`` so the underlying read is memory-bounded for huge CSV
+    inputs (rather than fully loading the file and then trimming).
+    """
+    import pandas as pd
+
+    rng = np.random.default_rng(0)
+    n = 5000
+    df = pd.DataFrame(
+        {
+            "x1": rng.normal(size=n),
+            "x2": rng.normal(size=n),
+            "y": rng.integers(0, 2, size=n),
+        }
+    )
+    in_path = tmp_path / "big.csv"
+    df.to_csv(in_path, index=False)
+
+    real_read_csv = pd.read_csv
+    captured_kwargs: list[dict] = []
+
+    def _spy_read_csv(*args, **kwargs):
+        captured_kwargs.append(kwargs.copy())
+        return real_read_csv(*args, **kwargs)
+
+    monkeypatch.setattr(pd, "read_csv", _spy_read_csv, raising=True)
+
+    rc, out, err = _run(
+        [
+            "explain",
+            "--input",
+            str(in_path),
+            "--target",
+            "y",
+            "--explain-sample-size",
+            "200",
+        ]
+    )
+    assert rc == 0, err
+    payload = json.loads(out)
+    assert payload["n_rows_used"] == 200
+    # Must have called pd.read_csv with nrows=200, not loaded the whole
+    # 5000-row file. Multiple calls are OK; at least one must be the
+    # explain read with nrows.
+    explain_reads = [k for k in captured_kwargs if k.get("nrows") == 200]
+    assert explain_reads, f"expected pd.read_csv to be called with nrows=200; got {captured_kwargs!r}"
+
+
+def test_explain_sample_size_warns_post_read_for_parquet(tmp_path: Path):
+    """For parquet inputs, pandas has no native row-limit, so the bound
+    is applied post-read. The CLI must surface a warning describing the
+    limitation so callers know memory isn't strictly bounded.
+    """
+    pytest.importorskip("pyarrow")
+    rng = np.random.default_rng(0)
+    n = 4000
+    df = pd.DataFrame(
+        {
+            "x1": rng.normal(size=n),
+            "x2": rng.normal(size=n),
+            "y": rng.integers(0, 2, size=n),
+        }
+    )
+    in_path = tmp_path / "big.parquet"
+    df.to_parquet(in_path, index=False)
+
+    rc, out, err = _run(
+        [
+            "explain",
+            "--input",
+            str(in_path),
+            "--target",
+            "y",
+            "--explain-sample-size",
+            "100",
+        ]
+    )
+    assert rc == 0, err
+    payload = json.loads(out)
+    assert payload["n_rows_used"] == 100
+    # The post-read truncation notice must appear in the captured warnings.
+    assert any("post-read" in w.lower() for w in payload["warnings"])
+
+
+# ----------------------- worker-thread capture fallback
+
+
+def test_capture_routes_worker_thread_records_to_single_active_capture():
+    """When exactly one capture is active in the process, log records
+    emitted on a *different* thread (e.g. a ``ThreadPoolExecutor``
+    worker spawned by an LLM sync client) must still be routed to the
+    single active capture rather than escaping to stderr.
+
+    This is the documented "single-active-capture fallback" of
+    :class:`_ThreadCaptureState`.
+    """
+    import threading
+    from concurrent.futures import ThreadPoolExecutor
+
+    fc_logger = logging.getLogger("featcopilot.test_worker")
+
+    def _emit_in_worker():
+        fc_logger.warning("from-worker")
+        return "ok"
+
+    with fc_cli._capture_featcopilot_messages() as captured:
+        # Caller emits on its own thread (must be captured).
+        fc_logger.warning("from-caller")
+        # Spawn a worker thread (different ident) and emit there.
+        with ThreadPoolExecutor(max_workers=1) as pool:
+            assert pool.submit(_emit_in_worker).result(timeout=5) == "ok"
+        # Different non-worker thread also goes through the fallback.
+        t = threading.Thread(target=_emit_in_worker)
+        t.start()
+        t.join()
+
+    assert any("from-caller" in m for m in captured)
+    # Worker-thread records ARE captured under the single-active-capture
+    # fallback (the per-thread stack lookup misses, but exactly one
+    # capture is active, so :meth:`_ThreadCaptureState.get` returns it).
+    assert sum(1 for m in captured if "from-worker" in m) >= 2
+
+
+def test_capture_keeps_thread_isolation_with_multiple_active_captures():
+    """The single-active-capture fallback must NOT activate when two
+    threads are concurrently capturing — each must see only its own
+    thread's records, not records emitted on the other thread's
+    workers.
+    """
+    import threading
+
+    fc_logger = logging.getLogger("featcopilot.test_dual")
+    a_captured: list[str] = []
+    b_captured: list[str] = []
+    barrier = threading.Barrier(2)
+    inside = threading.Event()
+
+    def worker(tag: str, target: list[str]):
+        barrier.wait()
+        with fc_cli._capture_featcopilot_messages() as captured:
+            inside.set()
+            for i in range(10):
+                fc_logger.warning(f"{tag}-{i}")
+        target.extend(captured)
+
+    t1 = threading.Thread(target=worker, args=("A", a_captured))
+    t2 = threading.Thread(target=worker, args=("B", b_captured))
+    t1.start()
+    t2.start()
+    t1.join()
+    t2.join()
+
+    # Each capture must contain ONLY its own thread's records (no fallback
+    # cross-talk because two captures are active).
+    assert all("A-" in m for m in a_captured)
+    assert all("B-" in m for m in b_captured)
+    assert len(a_captured) == 10
+    assert len(b_captured) == 10
+
+
+# ----------------------- python -m
 
 
 def test_dunder_main_module_runs(monkeypatch, capsys):

From bfb5da8373221d3dbcc3d7e6af8ac63f17c1acf0 Mon Sep 17 00:00:00 2001
From: Li Jiang <lijiang1@microsoft.com>
Date: Tue, 5 May 2026 08:39:37 +0800
Subject: [PATCH 24/30] fix(cli): address round-19 review feedback
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Addresses all three new comments from round-19 reviewers on PR #5
(commit f710fbe):

* No false sampling warning at exact-boundary (Codex gZs1 P3,
  Copilot gmOw — same root cause). The previous `n_sampled >=
  sample_size` check fired the "metadata may differ" warning even
  when the input had EXACTLY `sample_size` rows (no truncation
  actually happened), causing agents to misinterpret an unsampled
  run as degraded metadata.

  `_cmd_explain` now reads with `nrows = sample_size + 1` so the
  returned length is a strict proof of the file size relative to the
  cap: `len(df) > sample_size` means at least one row was dropped.
  The warning fires only in that case; the post-read truncation to
  exactly `sample_size` happens at the same time. CSV memory bound
  is preserved (`pd.read_csv(nrows=N+1)` still caps at N+1, no full
  load). Three new tests cover the three boundary cases:
  - input exactly = cap -> no warning, n_rows_used == cap
  - input < cap         -> no warning, n_rows_used == input size
  - input = cap + 1     -> warning fires, n_rows_used == cap

* --explain-sample-size help text describes actual semantics
  (Copilot gmPC). The help previously said "deterministic seed",
  implying a seeded random sample, but the implementation is now a
  deterministic head slice (`read_csv(nrows=N)` / `df.iloc[:N]`).
  The help and the docstring are both updated to say "deterministic head
  slice (the first N rows of the input)" and to call out explicitly
  that this is NOT a random sample. New
  `test_explain_sample_size_help_text_describes_head_slice_not_random_seed`
  asserts the new wording and acts as a regression guard against the old.

Tests: 116 (+4 new) in tests/test_cli.py, 889 passed full suite.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 featcopilot/cli.py |  55 ++++++++++++------
 tests/test_cli.py  | 137 +++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 170 insertions(+), 22 deletions(-)

diff --git a/featcopilot/cli.py b/featcopilot/cli.py
index ba87ca9..e396bf5 100644
--- a/featcopilot/cli.py
+++ b/featcopilot/cli.py
@@ -914,10 +914,17 @@ def _cmd_explain(args: argparse.Namespace) -> int:
     For very large inputs where the metadata-only nature of ``explain``
     really should not pay full memory / compute cost, callers can pass
     ``--explain-sample-size N`` (or set ``"explain_sample_size": N`` in
-    ``--config``) to cap the rows fed to the engineer. The CLI emits a
-    ``UserWarning`` (captured into the JSON payload) noting that the
-    metadata may differ from a full-input ``transform`` run; the
-    ``n_rows_used`` field reports the effective sample size.
+    ``--config``) to cap the rows fed to the engineer. The cap is a
+    deterministic *head slice* (the first N rows): for CSV the cap is
+    threaded through ``pd.read_csv(nrows=N)`` so memory is bounded
+    natively; for parquet/JSON pandas has no native row-limit so the
+    file is fully read and then truncated, with a UserWarning explaining
+    the limitation. The cap is NOT a random sample — callers who need
+    randomness should sample externally before invoking ``explain``.
+    A "metadata may differ" UserWarning is emitted (captured into the
+    JSON payload's ``warnings`` field) only when the cap actually
+    truncated the input. The ``n_rows_used`` field reports the effective
+    sample size.
     """
     input_path = Path(args.input)
     if not input_path.exists():
@@ -945,18 +952,28 @@ def _cmd_explain(args: argparse.Namespace) -> int:
     # capture context so the sampling notice ends up in the JSON
     # payload's ``warnings`` field instead of bleeding onto stderr.
     with _capture_featcopilot_messages() as captured_warnings:
-        # Read with ``nrows=sample_size`` so the underlying I/O is
-        # memory-bounded for CSV; for parquet/JSON the bound is
-        # post-read with an emitted UserWarning (captured into the
-        # payload below). Reading FIRST gives us ``len(df)`` so we
-        # only emit the "metadata may differ" notice when the cap
-        # actually shortened the input.
-        df = _read_table(input_path, in_fmt, nrows=sample_size)
+        # Read with ``nrows=sample_size + 1`` so the underlying I/O is
+        # memory-bounded for CSV (``pd.read_csv(nrows=...)``) AND we can
+        # tell from the returned length whether the file actually had
+        # more rows than the cap. ``len(df) > sample_size`` is a strict
+        # proof the file was truncated; ``len(df) <= sample_size`` means
+        # the file fit naturally and no metadata-may-differ warning is
+        # warranted. For parquet/JSON the bound is post-read with its
+        # own UserWarning emitted by ``_read_table``.
+        read_nrows = (sample_size + 1) if sample_size is not None else None
+        df = _read_table(input_path, in_fmt, nrows=read_nrows)
         X, y = _split_xy(df, args.target)
         n_sampled = len(X)
-        if sample_size is not None and n_sampled >= sample_size:
+        if sample_size is not None and n_sampled > sample_size:
+            # Strict proof of truncation: file had at least one more row
+            # than the requested cap. Trim to the exact cap and emit the
+            # "metadata may differ" notice.
+            X = X.iloc[:sample_size]
+            if y is not None:
+                y = y.iloc[:sample_size]
+            n_sampled = sample_size
             warnings.warn(
-                f"explain: capping input to {sample_size} rows (sampling). "
+                f"explain: capping input to {sample_size} rows (head slice). "
                 "Some engines (e.g. TabularEngine categorical encoding) decide which "
                 "features to plan based on row counts and per-category statistics, "
                 "so the reported metadata may differ from a full-input transform run.",
@@ -1072,10 +1089,14 @@ def _build_parser() -> argparse.ArgumentParser:
         "--explain-sample-size",
         type=int,
         default=None,
-        help="Cap the input fed to the engineer at this many rows (deterministic seed). "
-        "OFF by default: the full input is used so the metadata is a faithful description "
-        "of what a corresponding `transform` would generate. Pass a positive integer ONLY "
-        "when you knowingly accept that some engines (e.g. TabularEngine categorical "
+        help="Cap the input fed to the engineer at this many rows. The cap is "
+        "applied as a deterministic head slice (the first N rows of the input — "
+        "for CSV via `pd.read_csv(nrows=N)` so memory is bounded; for parquet/JSON "
+        "the file is fully read and then truncated, with a warning). OFF by default: "
+        "the full input is used so the metadata is a faithful description of what a "
+        "corresponding `transform` would generate. Pass a positive integer ONLY when "
+        "you knowingly accept that (a) the analyzed rows are the first N (not a "
+        "random sample), and (b) some engines (e.g. TabularEngine categorical "
         "encoding) decide which features to plan based on row counts and per-category "
         "statistics, so the reported metadata may differ from a full-input run.",
     )
diff --git a/tests/test_cli.py b/tests/test_cli.py
index 6d37277..3a2f224 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -2447,11 +2447,13 @@ def _spy_read_csv(*args, **kwargs):
     assert rc == 0, err
     payload = json.loads(out)
     assert payload["n_rows_used"] == 200
-    # Must have called pd.read_csv with nrows=200, not loaded the whole
-    # 5000-row file. Multiple calls are OK; at least one must be the
-    # explain read with nrows.
-    explain_reads = [k for k in captured_kwargs if k.get("nrows") == 200]
-    assert explain_reads, f"expected pd.read_csv to be called with nrows=200; got {captured_kwargs!r}"
+    # Must have called pd.read_csv with nrows=201 (sample_size + 1, the
+    # CLI requests one extra row so it can detect whether the input was
+    # actually larger than the cap and only emit the metadata-may-differ
+    # warning when truncation really happened). The full 5000-row file
+    # is never loaded.
+    explain_reads = [k for k in captured_kwargs if k.get("nrows") == 201]
+    assert explain_reads, f"expected pd.read_csv to be called with nrows=201; got {captured_kwargs!r}"
 
 
 def test_explain_sample_size_warns_post_read_for_parquet(tmp_path: Path):
@@ -2566,6 +2568,131 @@ def worker(tag: str, target: list[str]):
     assert len(b_captured) == 10
 
 
+# ----------------------- explain --explain-sample-size warning hygiene
+
+
+def test_explain_no_sampling_warning_when_input_fits_exactly(tmp_path: Path):
+    """When the input has exactly ``--explain-sample-size`` rows, no
+    truncation actually happens, so the "metadata may differ" warning
+    must NOT fire. The success payload was previously inaccurate when
+    the warning fired on the boundary case.
+    """
+    rng = np.random.default_rng(0)
+    n = 200  # exactly the sample-size we'll request
+    df = pd.DataFrame(
+        {
+            "x1": rng.normal(size=n),
+            "x2": rng.normal(size=n),
+            "y": rng.integers(0, 2, size=n),
+        }
+    )
+    in_path = tmp_path / "exact.csv"
+    df.to_csv(in_path, index=False)
+
+    rc, out, err = _run(
+        [
+            "explain",
+            "--input",
+            str(in_path),
+            "--target",
+            "y",
+            "--explain-sample-size",
+            "200",
+        ]
+    )
+    assert rc == 0, err
+    payload = json.loads(out)
+    assert payload["n_rows_used"] == 200
+    # No "metadata may differ" warning — input fit naturally.
+    assert not any("capping input" in w.lower() or "metadata may differ" in w.lower() for w in payload["warnings"])
+
+
+def test_explain_no_sampling_warning_when_input_smaller_than_sample(tmp_path: Path):
+    """When the input has fewer rows than ``--explain-sample-size``,
+    obviously no truncation happens. Belt-and-suspenders coverage of
+    the "<= cap, no warning" branch.
+    """
+    rng = np.random.default_rng(0)
+    n = 50
+    df = pd.DataFrame(
+        {
+            "x1": rng.normal(size=n),
+            "x2": rng.normal(size=n),
+            "y": rng.integers(0, 2, size=n),
+        }
+    )
+    in_path = tmp_path / "small.csv"
+    df.to_csv(in_path, index=False)
+
+    rc, out, err = _run(
+        [
+            "explain",
+            "--input",
+            str(in_path),
+            "--target",
+            "y",
+            "--explain-sample-size",
+            "200",
+        ]
+    )
+    assert rc == 0, err
+    payload = json.loads(out)
+    assert payload["n_rows_used"] == n
+    assert not any("capping input" in w.lower() or "metadata may differ" in w.lower() for w in payload["warnings"])
+
+
+def test_explain_sampling_warning_fires_when_input_strictly_larger(tmp_path: Path):
+    """Strict proof of truncation: input has at least one MORE row than
+    the cap. The warning must fire, and the payload must report
+    ``n_rows_used == sample_size``.
+    """
+    rng = np.random.default_rng(0)
+    n = 201  # exactly one more than the cap
+    df = pd.DataFrame(
+        {
+            "x1": rng.normal(size=n),
+            "x2": rng.normal(size=n),
+            "y": rng.integers(0, 2, size=n),
+        }
+    )
+    in_path = tmp_path / "barely_over.csv"
+    df.to_csv(in_path, index=False)
+
+    rc, out, err = _run(
+        [
+            "explain",
+            "--input",
+            str(in_path),
+            "--target",
+            "y",
+            "--explain-sample-size",
+            "200",
+        ]
+    )
+    assert rc == 0, err
+    payload = json.loads(out)
+    assert payload["n_rows_used"] == 200
+    assert any("capping input" in w.lower() for w in payload["warnings"])
+
+
+def test_explain_sample_size_help_text_describes_head_slice_not_random_seed():
+    """The ``--explain-sample-size`` help text must accurately describe
+    the actual semantics (deterministic head slice, NOT a seeded random
+    sample). Guards against misleading users / agents who would expect
+    an unbiased sample.
+    """
+    parser = fc_cli._build_parser()
+    explain_parser = next(
+        action.choices["explain"] for action in parser._actions if isinstance(action, argparse._SubParsersAction)
+    )
+    sample_help = next(a.help for a in explain_parser._actions if "--explain-sample-size" in a.option_strings)
+    # Must accurately describe the implementation.
+    assert "head slice" in sample_help.lower() or "first n" in sample_help.lower()
+    # Must NOT use the misleading old phrasing.
+    assert "deterministic seed" not in sample_help.lower()
+    assert "random sample" not in sample_help.lower() or "not a random sample" in sample_help.lower()
+
+
 # ----------------------- python -m
 
 

From 167b49081062b4ff07246501ca964a0026c9948a Mon Sep 17 00:00:00 2001
From: Li Jiang <lijiang1@microsoft.com>
Date: Tue, 5 May 2026 09:14:11 +0800
Subject: [PATCH 25/30] fix(cli): address round-20 review feedback (Codex P2)

Addresses the Codex P2 comment on PR #5 (commit bfb5da8):

* Reject empty / header-only inputs with exit 2 (Codex gtYr).
  When a CSV has headers but zero data rows, `pd.read_csv` returns
  an empty DataFrame rather than raising `EmptyDataError`. Same for
  an empty JSON array (`[]`) or a parquet file with schema but zero
  rows. The CLI then passed that frame into `TabularEngine`, which
  divides by `len(X)` while fitting categorical encoding, so the
  command exited via the generic exit-1 `unexpected error` path
  instead of a clean user-input error.

  `_read_table` now performs an explicit `df.empty` check after
  every supported format read and raises `ValueError("Input file ...
  is empty (zero data rows). Feature engineering requires at least
  one row of data.")` -> exit 2 with a precise stderr message.

  Four new tests cover the new branch:
  - header-only CSV (transform)
  - empty JSON array (transform)
  - parquet with schema but zero rows (transform)
  - header-only CSV (explain)

Tests: 120 (+4 new) in tests/test_cli.py, 893 passed full suite.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 featcopilot/cli.py | 26 ++++++++++----
 tests/test_cli.py  | 87 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 107 insertions(+), 6 deletions(-)

diff --git a/featcopilot/cli.py b/featcopilot/cli.py
index e396bf5..3425605 100644
--- a/featcopilot/cli.py
+++ b/featcopilot/cli.py
@@ -152,7 +152,7 @@ def _read_table(path: Path, fmt: str, *, nrows: int | None = None):
             # passing it here is what lets ``--explain-sample-size`` actually
             # cap memory on huge CSV inputs (rather than loading the entire
             # file and then trimming).
-            return pd.read_csv(path, nrows=nrows)
+            df = pd.read_csv(path, nrows=nrows)
         except (
             OSError,
             pd.errors.ParserError,
@@ -164,7 +164,7 @@ def _read_table(path: Path, fmt: str, *, nrows: int | None = None):
             # "unexpected error" path instead of the documented exit-2
             # user-input error.
             raise ValueError(f"Failed to read CSV from {str(path)!r}: {exc}") from exc
-    if fmt == "parquet":
+    elif fmt == "parquet":
         try:
             df = pd.read_parquet(path)
         except ImportError as exc:
@@ -192,8 +192,7 @@ def _read_table(path: Path, fmt: str, *, nrows: int | None = None):
                 stacklevel=2,
             )
             df = df.iloc[:nrows]
-        return df
-    if fmt == "json":
+    elif fmt == "json":
         # ``orient='records'`` is the agent-friendly default; fall back to
         # pandas' auto-detection when the file isn't a records list.
         try:
@@ -216,8 +215,23 @@ def _read_table(path: Path, fmt: str, *, nrows: int | None = None):
                 stacklevel=2,
             )
             df = df.iloc[:nrows]
-        return df
-    raise ValueError(f"Unsupported input format: {fmt}")
+    else:
+        raise ValueError(f"Unsupported input format: {fmt}")
+
+    # Reject "header-only" / empty inputs across every supported format.
+    # ``pd.read_csv`` returns an empty DataFrame (no exception) when the
+    # CSV has headers but zero data rows; the same goes for an empty
+    # parquet file or ``[]`` JSON body. Without this check, the CLI
+    # would pass an empty frame into ``TabularEngine``, which divides by
+    # ``len(X)`` while fitting categorical encoding and exits via the
+    # generic ``unexpected error`` path. Surface the issue as a clean
+    # exit-2 user-input error.
+    if df.empty:
+        raise ValueError(
+            f"Input file {str(path)!r} is empty (zero data rows). "
+            "Feature engineering requires at least one row of data."
+        )
+    return df
 
 
 def _write_table(df, path: Path, fmt: str) -> None:
diff --git a/tests/test_cli.py b/tests/test_cli.py
index 3a2f224..d1e044f 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -904,6 +904,93 @@ def test_headerless_csv_input_returns_exit_2(tmp_path: Path):
     assert "failed to read csv" in err.lower()
 
 
+def test_header_only_csv_input_returns_exit_2(tmp_path: Path):
+    """A CSV that has a header line but ZERO data rows is read by pandas
+    as an *empty* DataFrame (no exception). Without the explicit empty
+    check, the CLI would feed it into ``TabularEngine`` which divides by
+    ``len(X)`` and exits via the generic exit-1 backstop. The CLI must
+    surface this as a clean exit-2 user-input error.
+    """
+    in_path = tmp_path / "header_only.csv"
+    in_path.write_text("x1,x2,y\n")  # header but no data
+
+    rc, _, err = _run(
+        [
+            "transform",
+            "--input",
+            str(in_path),
+            "--output",
+            str(tmp_path / "out.csv"),
+            "--target",
+            "y",
+        ]
+    )
+    assert rc == 2
+    assert "empty" in err.lower()
+    assert "zero data rows" in err.lower()
+
+
+def test_empty_json_input_returns_exit_2(tmp_path: Path):
+    """An empty JSON array is parsed as an empty DataFrame and must be
+    rejected up front like header-only CSV.
+    """
+    in_path = tmp_path / "empty.json"
+    in_path.write_text("[]")
+
+    rc, _, err = _run(
+        [
+            "transform",
+            "--input",
+            str(in_path),
+            "--output",
+            str(tmp_path / "out.csv"),
+            "--target",
+            "y",
+        ]
+    )
+    assert rc == 2
+    assert "empty" in err.lower()
+
+
+def test_empty_parquet_input_returns_exit_2(tmp_path: Path):
+    """A parquet file with schema but zero rows is rejected up front."""
+    pytest.importorskip("pyarrow")
+    in_path = tmp_path / "empty.parquet"
+    pd.DataFrame({"x1": [], "x2": [], "y": []}).to_parquet(in_path, index=False)
+
+    rc, _, err = _run(
+        [
+            "transform",
+            "--input",
+            str(in_path),
+            "--output",
+            str(tmp_path / "out.csv"),
+            "--target",
+            "y",
+        ]
+    )
+    assert rc == 2
+    assert "empty" in err.lower()
+
+
+def test_explain_header_only_csv_returns_exit_2(tmp_path: Path):
+    """The empty-input check is applied to ``explain`` too."""
+    in_path = tmp_path / "header_only.csv"
+    in_path.write_text("x1,x2,y\n")
+
+    rc, _, err = _run(
+        [
+            "explain",
+            "--input",
+            str(in_path),
+            "--target",
+            "y",
+        ]
+    )
+    assert rc == 2
+    assert "empty" in err.lower()
+
+
 def test_transform_include_target_collision_returns_exit_2(tmp_path: Path):
     """``--include-target`` would silently overwrite an engineered feature
     if it happens to share the target column's name. The CLI must detect

From 016fe34d4f35c49f5a4f8bd3ac577834b19ae871 Mon Sep 17 00:00:00 2001
From: Li Jiang <lijiang1@microsoft.com>
Date: Tue, 5 May 2026 12:21:54 +0800
Subject: [PATCH 26/30] fix(cli): address round-22 review feedback
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Addresses all three new comments from copilot-pull-request-reviewer
on PR #5 (commits 167b490 + 5cbb843):

* Drop the over-broad single-active-capture fallback (Copilot hFmA).
  Round-18's `_ThreadCaptureState.get` fallback routed records
  from ANY thread to the single in-flight CLI capture whenever
  exactly one capture was active. As Copilot pointed out, that was
  too broad: unrelated background work using `featcopilot` in the
  same process would have its log/warning output silently swallowed
  into the active CLI command's payload, with no ownership tie back
  to the command. `get` is now strictly per-thread; cross-thread
  records (e.g. ThreadPoolExecutor workers) bleed onto stderr like
  any other background log. The class docstring documents the
  trade-off and the existing options for callers who really need
  every worker record captured. The previous worker-routing test
  was inverted into `test_capture_does_not_route_unrelated_thread_records`,
  which guards against the regression.

* Structured single-line argparse error format (Copilot hFmP).
  `argparse`'s default `error()` writes the multi-line usage
  banner (`usage: featcopilot ...`) PLUS `prog: error: ...` to
  stderr before raising `SystemExit`. `main()` was only converting
  the exit code; the banner still appeared on stderr, breaking the
  documented "stderr carries one `featcopilot: error: ...` line per
  failure" contract. New `_StructuredArgumentParser` overrides
  `error()` to emit exactly one `featcopilot: error: <message>`
  line with no banner, and is wired into both the top-level parser
  and all subparsers via `parser_class=_StructuredArgumentParser`.
  Two new tests cover the contract: single-line stderr for an
  unknown flag and for a missing subcommand, plus a regression guard
  against the `usage:` substring appearing in stderr.

* Eliminate double truncation warning for parquet/JSON sample
  (Copilot iH8R). `_cmd_explain` was passing `nrows=sample_size+1`
  to `_read_table` so it could detect truncation. For parquet/JSON
  `_read_table` then emitted its own post-read warning saying
  "truncating to N+1" and trimmed to that, after which `_cmd_explain`
  trimmed again to N and emitted a second warning. Two near-duplicate
  warnings with off-by-one numbers — confusing to agents.

  The fix:
  - `_read_table` now accepts `suppress_truncation_warning=True`
    so callers that emit their own consolidated message can silence
    its post-read notice.
  - `_cmd_explain` only passes `nrows` to `_read_table` when
    pandas can natively bound the read (CSV with `nrows=N+1`).
    For parquet/JSON it reads with `nrows=None` (full file) and
    handles both detection and trimming itself.
  - The single `_cmd_explain` UserWarning now uses the user-facing
    `sample_size` value, AND for parquet/JSON includes the
    "memory wasn't bounded" caveat as a single sentence — same
    information, no duplication. Test
    `test_explain_sample_size_warns_post_read_for_parquet` asserts
    exactly one truncation notice and that it references `100 rows`
    (not the internal +1).

Tests: 122 (+2 net) in tests/test_cli.py, 895 passed full suite.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 featcopilot/cli.py | 189 ++++++++++++++++++++++++++++++---------------
 tests/test_cli.py  | 100 ++++++++++++++++++------
 2 files changed, 201 insertions(+), 88 deletions(-)

diff --git a/featcopilot/cli.py b/featcopilot/cli.py
index 3425605..eea1b76 100644
--- a/featcopilot/cli.py
+++ b/featcopilot/cli.py
@@ -115,7 +115,7 @@ def _detect_format(path: Path, override: str | None) -> str:
     return fmt
 
 
-def _read_table(path: Path, fmt: str, *, nrows: int | None = None):
+def _read_table(path: Path, fmt: str, *, nrows: int | None = None, suppress_truncation_warning: bool = False):
     """Read a tabular file into a pandas DataFrame.
 
     All user-facing failure modes (missing parquet engine, ``--input``
@@ -136,10 +136,17 @@ def _read_table(path: Path, fmt: str, *, nrows: int | None = None):
         directly to :func:`pandas.read_csv` so the underlying read is
         memory-bounded. For ``parquet`` and ``json``, pandas does not
         expose a native row limit, so the file is fully read and then
-        truncated; a :class:`UserWarning` is issued in that case so the
-        caller knows the bound is post-read (not memory-bounded). The
-        ``nrows`` cap is applied with a deterministic head slice so
-        re-runs on the same input produce the same metadata.
+        truncated; a :class:`UserWarning` is issued in that case (unless
+        ``suppress_truncation_warning`` is true) so the caller knows the
+        bound is post-read (not memory-bounded). The ``nrows`` cap is
+        applied with a deterministic head slice so re-runs on the same
+        input produce the same metadata.
+    suppress_truncation_warning : bool, optional
+        When True, the post-read truncation notice (parquet / JSON only)
+        is *not* emitted from this helper. Used by callers that emit
+        their own consolidated, user-facing warning so users don't see
+        a confusing pair of messages — see ``_cmd_explain``'s
+        ``--explain-sample-size`` handling.
     """
     import pandas as pd
 
@@ -182,15 +189,16 @@ def _read_table(path: Path, fmt: str, *, nrows: int | None = None):
             # raised is by definition an I/O or data issue, not a CLI bug.
             raise ValueError(f"Failed to read parquet from {str(path)!r}: {exc}") from exc
         if nrows is not None and len(df) > nrows:
-            warnings.warn(
-                f"--explain-sample-size cap is applied post-read for parquet "
-                f"(loaded {len(df)} rows, truncating to {nrows}). pandas "
-                "does not expose a native parquet row-limit, so the full "
-                "file is materialized in memory before the cap. For hard "
-                "memory bounds on huge inputs, convert to CSV first.",
-                UserWarning,
-                stacklevel=2,
-            )
+            if not suppress_truncation_warning:
+                warnings.warn(
+                    f"--explain-sample-size cap is applied post-read for parquet "
+                    f"(loaded {len(df)} rows, truncating to {nrows}). pandas "
+                    "does not expose a native parquet row-limit, so the full "
+                    "file is materialized in memory before the cap. For hard "
+                    "memory bounds on huge inputs, convert to CSV first.",
+                    UserWarning,
+                    stacklevel=2,
+                )
             df = df.iloc[:nrows]
     elif fmt == "json":
         # ``orient='records'`` is the agent-friendly default; fall back to
@@ -205,15 +213,16 @@ def _read_table(path: Path, fmt: str, *, nrows: int | None = None):
         except OSError as exc:
             raise ValueError(f"Failed to read JSON from {str(path)!r}: {exc}") from exc
         if nrows is not None and len(df) > nrows:
-            warnings.warn(
-                f"--explain-sample-size cap is applied post-read for JSON "
-                f"(loaded {len(df)} rows, truncating to {nrows}). pandas "
-                "does not expose a native JSON row-limit, so the full "
-                "file is materialized in memory before the cap. For hard "
-                "memory bounds on huge inputs, convert to CSV first.",
-                UserWarning,
-                stacklevel=2,
-            )
+            if not suppress_truncation_warning:
+                warnings.warn(
+                    f"--explain-sample-size cap is applied post-read for JSON "
+                    f"(loaded {len(df)} rows, truncating to {nrows}). pandas "
+                    "does not expose a native JSON row-limit, so the full "
+                    "file is materialized in memory before the cap. For hard "
+                    "memory bounds on huge inputs, convert to CSV first.",
+                    UserWarning,
+                    stacklevel=2,
+                )
             df = df.iloc[:nrows]
     else:
         raise ValueError(f"Unsupported input format: {fmt}")
@@ -576,7 +585,7 @@ def _fit_transform_capturing_warnings(engineer, X, y, **kwargs):
 
 
 class _ThreadCaptureState:
-    """Holds per-thread capture *stacks* with a single-active-capture fallback.
+    """Holds per-thread capture *stacks* with strict per-thread isolation.
 
     Each thread maps to a stack of capture lists. Nested
     :func:`_capture_featcopilot_messages` calls on the same thread push
@@ -584,17 +593,22 @@ class _ThreadCaptureState:
     and receives records / warnings until its block exits, at which
     point the outer capture (if any) becomes active again.
 
-    **Worker-thread fallback.** When the calling thread doesn't have a
-    capture but exactly one capture is active anywhere in the process,
-    :meth:`get` returns that single capture. This handles the common
-    case where the capturing thread spawns worker threads (e.g. an LLM
-    sync client wrapping ``ThreadPoolExecutor`` because it was called
-    from a process with a running event loop) — those workers' log
-    records logically belong to the single in-flight CLI run, and
-    routing them there keeps stderr clean. When more than one capture
-    is active concurrently, the fallback stays disabled (each captures
-    only its own thread's records) so concurrent CLI calls don't bleed
-    into each other.
+    **Strict thread isolation.** :meth:`get` returns a target list ONLY
+    for the calling thread itself. Records emitted on threads other
+    than the one that opened a capture (e.g. a worker spawned by an
+    LLM sync client wrapping ``ThreadPoolExecutor``) flow through the
+    normal handler chain and reach stderr — same as records emitted
+    by unrelated background work that happens to use ``featcopilot``
+    in the same process. This is intentional: a previous "single-
+    active-capture fallback" was too broad — when a single CLI run
+    was active, *any* featcopilot log on any thread would have been
+    silently swallowed into that command's payload, including
+    unrelated background work, causing misattribution. Strict per-
+    thread routing avoids that ambiguity at the cost of letting some
+    worker-thread records bleed onto stderr; callers who need every
+    last log captured should make sure their worker code explicitly
+    propagates the calling thread's identity (e.g. via
+    ``contextvars`` or a dedicated logging wrapper).
 
     Shared by :class:`_ThreadRoutingHandler` (writes records),
     :class:`_SuppressCapturingFilter` (suppresses stderr), and the
@@ -618,21 +632,14 @@ def pop(self, tid: int) -> None:
                     del self._per_thread[tid]
 
     def get(self, tid: int) -> list[str] | None:
-        # Brief lock for thread-safe stack-top read AND single-active-
-        # capture fallback (both walk ``self._per_thread``).
+        # Strict per-thread lookup. No cross-thread fallback (see class
+        # docstring): the previous "single-active-capture" fallback
+        # was too broad and could silently swallow unrelated
+        # background log output into a CLI run's payload.
         with self._lock:
             stack = self._per_thread.get(tid)
             if stack:
                 return stack[-1]
-            # Worker-thread fallback. Cross-thread records (e.g. from a
-            # ThreadPoolExecutor worker spawned by the capturing thread)
-            # are routed to the single active capture when there is no
-            # ambiguity. Multiple concurrent captures keep their strict
-            # per-thread isolation.
-            if len(self._per_thread) == 1:
-                only_stack = next(iter(self._per_thread.values()))
-                if only_stack:
-                    return only_stack[-1]
             return None
 
 
@@ -966,16 +973,30 @@ def _cmd_explain(args: argparse.Namespace) -> int:
     # capture context so the sampling notice ends up in the JSON
     # payload's ``warnings`` field instead of bleeding onto stderr.
     with _capture_featcopilot_messages() as captured_warnings:
-        # Read with ``nrows=sample_size + 1`` so the underlying I/O is
-        # memory-bounded for CSV (``pd.read_csv(nrows=...)``) AND we can
-        # tell from the returned length whether the file actually had
-        # more rows than the cap. ``len(df) > sample_size`` is a strict
-        # proof the file was truncated; ``len(df) <= sample_size`` means
-        # the file fit naturally and no metadata-may-differ warning is
-        # warranted. For parquet/JSON the bound is post-read with its
-        # own UserWarning emitted by ``_read_table``.
-        read_nrows = (sample_size + 1) if sample_size is not None else None
-        df = _read_table(input_path, in_fmt, nrows=read_nrows)
+        # Choose a read strategy that gives us:
+        #   1. a memory bound where pandas allows it (CSV ``nrows``), and
+        #   2. enough information to detect truncation without ``_read_table``
+        #      emitting its own (slightly off-by-one) warning that would
+        #      then double up with ours.
+        #
+        # For CSV we ask for ``sample_size + 1`` rows: ``pd.read_csv``
+        # reads at most that many, AND ``len(df) > sample_size`` becomes
+        # a strict proof of truncation. We pass ``suppress_truncation_warning``
+        # so ``_read_table`` doesn't emit its own message — ``_cmd_explain``
+        # is the single source of truth for the sampling notice and uses
+        # the user-facing ``sample_size`` value.
+        #
+        # For parquet/JSON pandas exposes no native row-limit, so we
+        # always read fully (``nrows=None``) and let ``_cmd_explain``
+        # both detect truncation and emit a single notice that includes
+        # the "memory bound is post-read" caveat. Asking ``_read_table``
+        # for a limit there would only have caused it to truncate at
+        # ``sample_size + 1`` and emit a confusing duplicate warning.
+        if sample_size is not None and in_fmt == "csv":
+            read_nrows: int | None = sample_size + 1
+        else:
+            read_nrows = None
+        df = _read_table(input_path, in_fmt, nrows=read_nrows, suppress_truncation_warning=True)
         X, y = _split_xy(df, args.target)
         n_sampled = len(X)
         if sample_size is not None and n_sampled > sample_size:
@@ -985,15 +1006,25 @@ def _cmd_explain(args: argparse.Namespace) -> int:
             X = X.iloc[:sample_size]
             if y is not None:
                 y = y.iloc[:sample_size]
+            original_len = n_sampled
             n_sampled = sample_size
-            warnings.warn(
-                f"explain: capping input to {sample_size} rows (head slice). "
+            msg = f"explain: capping input to {sample_size} rows (head slice). "
+            if in_fmt != "csv":
+                # For parquet/JSON we read the whole file before truncation
+                # (no native row-limit). Surface that fact so callers know
+                # memory wasn't bounded.
+                msg += (
+                    f"For {in_fmt}, pandas does not expose a native row-limit, "
+                    f"so the full file ({original_len}+ rows) was loaded into "
+                    "memory before truncation. For hard memory bounds on huge "
+                    "inputs, convert to CSV first. "
+                )
+            msg += (
                 "Some engines (e.g. TabularEngine categorical encoding) decide which "
                 "features to plan based on row counts and per-category statistics, "
-                "so the reported metadata may differ from a full-input transform run.",
-                UserWarning,
-                stacklevel=2,
+                "so the reported metadata may differ from a full-input transform run."
             )
+            warnings.warn(msg, UserWarning, stacklevel=2)
 
         engineer.fit_transform(
             X,
@@ -1030,8 +1061,35 @@ def _cmd_explain(args: argparse.Namespace) -> int:
     return 0
 
 
+class _StructuredArgumentParser(argparse.ArgumentParser):
+    """``argparse.ArgumentParser`` that emits the CLI's structured single-line
+    error format on usage failures.
+
+    The default ``argparse`` ``error()`` method writes the multi-line
+    usage banner ("usage: featcopilot ...\\n featcopilot: error: ...") to
+    stderr before raising :class:`SystemExit`. That breaks the CLI
+    contract that stderr carries exactly one ``featcopilot: error: ...``
+    line per failure — agents parsing stderr deterministically would see
+    the banner and the actual error mixed together, with no easy way to
+    tell which is which.
+
+    This subclass overrides :meth:`error` to write a single line and
+    skip the banner, so usage failures (missing required argument,
+    unknown flag, missing subcommand, etc.) follow the same single-line
+    contract as the rest of the CLI's exit-2 paths.
+    """
+
+    def error(self, message: str) -> None:  # type: ignore[override]
+        sys.stderr.write(f"featcopilot: error: {message}\n")
+        # ``ArgumentParser.error`` is documented to terminate; ``SystemExit(2)``
+        # is what the parent class would do after writing the banner.
+        # ``main()``'s ``except SystemExit`` handler converts this to an int
+        # return value so callers still see the documented exit-2 contract.
+        raise SystemExit(2)
+
+
 def _build_parser() -> argparse.ArgumentParser:
-    parser = argparse.ArgumentParser(
+    parser = _StructuredArgumentParser(
         prog="featcopilot",
         description=(
             "FeatCopilot CLI — automated feature engineering from the command line. "
@@ -1045,7 +1103,12 @@ def _build_parser() -> argparse.ArgumentParser:
         action="version",
         version=f"featcopilot {__version__}",
     )
-    subparsers = parser.add_subparsers(dest="command", required=True, metavar="COMMAND")
+    # Use the structured parser class for subparsers too so any
+    # subcommand-specific usage error (unknown flag, missing required
+    # arg) follows the same single-line stderr contract.
+    subparsers = parser.add_subparsers(
+        dest="command", required=True, metavar="COMMAND", parser_class=_StructuredArgumentParser
+    )
 
     # ----- info ---------------------------------------------------------
     p_info = subparsers.add_parser(
diff --git a/tests/test_cli.py b/tests/test_cli.py
index d1e044f..aea1b13 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -2015,6 +2015,45 @@ def test_unknown_flag_returns_exit_2(capsys):
     assert rc == 2
 
 
+def test_argparse_usage_error_emits_single_structured_line(tmp_path: Path, tabular_csv: Path):
+    """``argparse`` defaults to writing a multi-line usage banner before its
+    error message, mixing two pieces of information on stderr that agents
+    must then parse apart. The CLI's ``_StructuredArgumentParser`` collapses
+    those into the single canonical ``featcopilot: error: <message>`` line
+    so usage failures match the rest of the exit-2 contract.
+    """
+    rc, _, err = _run(
+        [
+            "transform",
+            "--input",
+            str(tabular_csv),
+            "--output",
+            str(tmp_path / "out.csv"),
+            "--target",
+            "y",
+            "--no-such-flag",  # genuine unknown flag (not a missing-required)
+        ]
+    )
+    assert rc == 2
+    err_lines = [line for line in err.splitlines() if line.strip()]
+    # Exactly one non-empty stderr line.
+    assert len(err_lines) == 1, f"Expected single-line stderr, got {err_lines!r}"
+    assert err_lines[0].startswith("featcopilot: error: ")
+    # No multi-line ``argparse`` usage banner.
+    assert "usage:" not in err.lower()
+    # Still mentions the offending flag.
+    assert "--no-such-flag" in err
+
+
+def test_argparse_missing_subcommand_emits_single_structured_line():
+    rc, _, err = _run([])
+    assert rc == 2
+    err_lines = [line for line in err.splitlines() if line.strip()]
+    assert len(err_lines) == 1, f"Expected single-line stderr, got {err_lines!r}"
+    assert err_lines[0].startswith("featcopilot: error: ")
+    assert "usage:" not in err.lower()
+
+
 def test_help_flag_returns_zero(capsys):
     rc = fc_cli.main(["--help"])
     assert rc == 0
@@ -2546,7 +2585,9 @@ def _spy_read_csv(*args, **kwargs):
 def test_explain_sample_size_warns_post_read_for_parquet(tmp_path: Path):
     """For parquet inputs, pandas has no native row-limit, so the bound
     is applied post-read. The CLI must surface a warning describing the
-    limitation so callers know memory isn't strictly bounded.
+    limitation so callers know memory isn't strictly bounded. The
+    warning is emitted by ``_cmd_explain`` itself (not duplicated by
+    ``_read_table``) so the user sees one accurate message.
     """
     pytest.importorskip("pyarrow")
     rng = np.random.default_rng(0)
@@ -2576,46 +2617,55 @@ def test_explain_sample_size_warns_post_read_for_parquet(tmp_path: Path):
     payload = json.loads(out)
     assert payload["n_rows_used"] == 100
     # The post-read truncation notice must appear in the captured warnings.
-    assert any("post-read" in w.lower() for w in payload["warnings"])
+    # The unified message says: "For parquet, pandas does not expose a
+    # native row-limit, so the full file ... was loaded into memory before
+    # truncation."
+    captured = " ".join(payload["warnings"]).lower()
+    assert "native row-limit" in captured or "post-read" in captured or "memory before truncation" in captured
+    # The user-facing message uses the actual sample_size (100), NOT the
+    # internal +1 read size, AND there is exactly one truncation notice.
+    assert "100 rows" in " ".join(payload["warnings"])
+    truncation_msgs = [w for w in payload["warnings"] if "truncat" in w.lower() or "capping" in w.lower()]
+    assert len(truncation_msgs) == 1, f"expected exactly one truncation notice, got {truncation_msgs!r}"
 
 
-# ----------------------- worker-thread capture fallback
+# ----------------------- strict per-thread capture isolation
 
 
-def test_capture_routes_worker_thread_records_to_single_active_capture():
-    """When exactly one capture is active in the process, log records
-    emitted on a *different* thread (e.g. a ``ThreadPoolExecutor``
-    worker spawned by an LLM sync client) must still be routed to the
-    single active capture rather than escaping to stderr.
+def test_capture_does_not_route_unrelated_thread_records():
+    """The capture layer must use STRICT per-thread routing: records
+    emitted on threads other than the one that opened a capture flow
+    through the normal handler chain (and reach stderr) — they are
+    NOT silently rolled into the single in-flight CLI run's payload.
 
-    This is the documented "single-active-capture fallback" of
-    :class:`_ThreadCaptureState`.
+    A previous "single-active-capture fallback" was too broad: when a
+    single CLI run was active, *any* featcopilot log on any thread
+    would have been swallowed into that command's payload, including
+    unrelated background work, causing misattribution. This test
+    guards against that regression.
     """
     import threading
-    from concurrent.futures import ThreadPoolExecutor
-
-    fc_logger = logging.getLogger("featcopilot.test_worker")
 
-    def _emit_in_worker():
-        fc_logger.warning("from-worker")
-        return "ok"
+    fc_logger = logging.getLogger("featcopilot.test_unrelated")
 
     with fc_cli._capture_featcopilot_messages() as captured:
         # Caller emits on its own thread (must be captured).
         fc_logger.warning("from-caller")
-        # Spawn a worker thread (different ident) and emit there.
-        with ThreadPoolExecutor(max_workers=1) as pool:
-            assert pool.submit(_emit_in_worker).result(timeout=5) == "ok"
-        # Different non-worker thread also goes through the fallback.
-        t = threading.Thread(target=_emit_in_worker)
+
+        # Spawn a separate, unrelated thread that ALSO emits via the
+        # featcopilot logger. With the over-broad fallback removed, that
+        # record must NOT appear in this capture's payload.
+        def _emit_elsewhere():
+            fc_logger.warning("from-other-thread")
+
+        t = threading.Thread(target=_emit_elsewhere)
         t.start()
         t.join()
 
     assert any("from-caller" in m for m in captured)
-    # Worker-thread records ARE captured under the single-active-capture
-    # fallback (the per-thread stack lookup misses, but exactly one
-    # capture is active, so :meth:`_ThreadCaptureState.get` returns it).
-    assert sum(1 for m in captured if "from-worker" in m) >= 2
+    # Strict per-thread isolation: unrelated thread's record is NOT in
+    # this capture's payload.
+    assert not any("from-other-thread" in m for m in captured)
 
 
 def test_capture_keeps_thread_isolation_with_multiple_active_captures():

From 298d4def60b26b8e79dd8fa68fac6b96cebd0d4c Mon Sep 17 00:00:00 2001
From: Li Jiang <lijiang1@microsoft.com>
Date: Tue, 5 May 2026 20:06:22 +0800
Subject: [PATCH 27/30] fix(cli): address round-23 review feedback
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Addresses the new comment from copilot-pull-request-reviewer on
PR #5 (commit 016fe34):

* Restore narrow LLM-only cross-thread fallback (Copilot osJO).
  Round-22 removed the single-active-capture fallback in response to
  Copilot's complaint that it was too broad (would misattribute
  unrelated background `featcopilot` work). Round-23 Copilot now
  flags that the strict per-thread routing breaks the
  "stderr is reserved for failures" contract for LLM-backed runs in
  event-loop environments — the sync LLM clients in
  `featcopilot/llm/*_client.py` fall back to `ThreadPoolExecutor`
  there, and their mock-mode startup warnings emit from worker
  threads that `submit()` spawns.

  Resolution: a *narrow* fallback that satisfies both reviewers.
  `_ThreadCaptureState` now exposes `get_for_llm_record(tid,
  logger_name)`: it returns the calling thread's capture if any,
  otherwise — and only when the record originates from
  `featcopilot.llm.*` AND exactly one capture is active in the
  process — it routes to that single capture. Multiple concurrent
  captures keep strict isolation (no LLM cross-talk either). Records
  from non-LLM featcopilot loggers on cross-threads still flow to
  stderr as before, so unrelated background work is NOT
  misattributed.

  Both `_ThreadRoutingHandler.emit` and
  `_SuppressCapturingFilter.filter` go through
  `get_for_llm_record`, so the routing handler and the stderr-
  suppression filter stay in lockstep: anything captured is also
  suppressed from the original handlers.

  Three tests cover the policy:
  - `test_capture_does_not_route_unrelated_thread_records`: non-LLM
    record from a cross-thread is NOT captured (regression guard
    against round-22's "too broad" complaint).
  - `test_capture_routes_llm_client_worker_records_to_single_active_capture`:
    LLM record from ThreadPoolExecutor + raw threading.Thread workers
    IS captured.
  - `test_capture_does_not_apply_llm_fallback_with_multiple_captures`:
    even for LLM records, two concurrent captures stay strictly
    isolated.

Tests: 124 (+2 net) in tests/test_cli.py, 897 passed full suite.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 featcopilot/cli.py | 112 +++++++++++++++++++++++++++++++--------------
 tests/test_cli.py  | 103 +++++++++++++++++++++++++++++++++++++----
 2 files changed, 171 insertions(+), 44 deletions(-)

diff --git a/featcopilot/cli.py b/featcopilot/cli.py
index eea1b76..ea48ce3 100644
--- a/featcopilot/cli.py
+++ b/featcopilot/cli.py
@@ -585,7 +585,8 @@ def _fit_transform_capturing_warnings(engineer, X, y, **kwargs):
 
 
 class _ThreadCaptureState:
-    """Holds per-thread capture *stacks* with strict per-thread isolation.
+    """Holds per-thread capture *stacks* with strict per-thread isolation
+    and a *narrow* cross-thread fallback for LLM-client log records.
 
     Each thread maps to a stack of capture lists. Nested
     :func:`_capture_featcopilot_messages` calls on the same thread push
@@ -593,28 +594,35 @@ class _ThreadCaptureState:
     and receives records / warnings until its block exits, at which
     point the outer capture (if any) becomes active again.
 
-    **Strict thread isolation.** :meth:`get` returns a target list ONLY
-    for the calling thread itself. Records emitted on threads other
-    than the one that opened a capture (e.g. a worker spawned by an
-    LLM sync client wrapping ``ThreadPoolExecutor``) flow through the
-    normal handler chain and reach stderr — same as records emitted
-    by unrelated background work that happens to use ``featcopilot``
-    in the same process. This is intentional: a previous "single-
-    active-capture fallback" was too broad — when a single CLI run
-    was active, *any* featcopilot log on any thread would have been
-    silently swallowed into that command's payload, including
-    unrelated background work, causing misattribution. Strict per-
-    thread routing avoids that ambiguity at the cost of letting some
-    worker-thread records bleed onto stderr; callers who need every
-    last log captured should make sure their worker code explicitly
-    propagates the calling thread's identity (e.g. via
-    ``contextvars`` or a dedicated logging wrapper).
+    **Strict thread isolation by default.** :meth:`get` returns a target
+    list ONLY for the calling thread itself. This avoids the
+    misattribution that an unconditional single-active-capture fallback
+    would cause: any featcopilot log on any thread would otherwise be
+    silently swallowed into the active CLI run's payload, including
+    output from unrelated background work happening in the same process.
+
+    **Narrow LLM-client fallback.** :meth:`get_for_llm_record` is the
+    one exception: when a record originates from any logger under
+    ``featcopilot.llm.`` (home of the sync LLM clients that fall back to
+    ``ThreadPoolExecutor`` in event-loop environments), and exactly one
+    capture is active in the process, the record is routed to that
+    capture even when emitted on a worker thread. This addresses the
+    common case where an LLM client's mock-mode startup warning fires
+    on a worker that ``submit()`` spawned and would otherwise bleed
+    onto stderr; the targeted whitelist keeps unrelated background
+    featcopilot work from being misattributed.
 
     Shared by :class:`_ThreadRoutingHandler` (writes records),
     :class:`_SuppressCapturingFilter` (suppresses stderr), and the
     routing ``warnings.showwarning`` override.
     """
 
+    # Logger-name prefixes whose records are eligible for the narrow
+    # cross-thread fallback. The single prefix matches every logger under
+    # ``featcopilot.llm``, the package whose sync clients fall back to
+    # ``ThreadPoolExecutor`` in event-loop environments.
+    _LLM_FALLBACK_LOGGER_PREFIXES = ("featcopilot.llm.",)
+
     def __init__(self):
         self._per_thread: dict[int, list[list[str]]] = {}
         self._lock = threading.Lock()
@@ -632,14 +640,42 @@ def pop(self, tid: int) -> None:
                     del self._per_thread[tid]
 
     def get(self, tid: int) -> list[str] | None:
-        # Strict per-thread lookup. No cross-thread fallback (see class
-        # docstring): the previous "single-active-capture" fallback
-        # was too broad and could silently swallow unrelated
-        # background log output into a CLI run's payload.
+        # Strict per-thread lookup. No cross-thread fallback: an
+        # unconditional fallback was too broad and could silently
+        # swallow unrelated background log output into a CLI run's
+        # payload. The narrow LLM-client fallback lives in
+        # :meth:`get_for_llm_record` instead, opted into by name.
+        with self._lock:
+            stack = self._per_thread.get(tid)
+            if stack:
+                return stack[-1]
+            return None
+
+    def get_for_llm_record(self, tid: int, logger_name: str) -> list[str] | None:
+        """Per-thread lookup with a narrow cross-thread fallback for LLM
+        client records.
+
+        When the calling thread has its own active capture, that's used.
+        Otherwise — and only when the record originates from a
+        whitelisted ``featcopilot.llm.*`` logger AND exactly one capture
+        is active in the process — the record is routed to that single
+        capture so it doesn't bleed onto stderr. The whitelist keeps
+        unrelated background featcopilot work strictly isolated.
+        """
         with self._lock:
             stack = self._per_thread.get(tid)
             if stack:
                 return stack[-1]
+            if not logger_name.startswith(self._LLM_FALLBACK_LOGGER_PREFIXES):
+                return None
+            if len(self._per_thread) != 1:
+                # Either no captures are active (nothing to route to) or
+                # multiple are active (ambiguous — keep strict isolation
+                # so concurrent CLI calls don't cross-contaminate).
+                return None
+            only_stack = next(iter(self._per_thread.values()))
+            if only_stack:
+                return only_stack[-1]
             return None
 
 
@@ -649,9 +685,13 @@ class _ThreadRoutingHandler(logging.Handler):
     Attached once to the ``featcopilot`` root logger. Records propagated
     from any ``featcopilot.*`` child logger reach this handler in the same
     way they reach the existing stderr handler. If the calling thread has
-    a registered capture list, the record is appended to it; otherwise the
-    handler does nothing (the existing stderr handler is what produces the
-    user-facing output for non-capturing threads).
+    a registered capture list, the record is appended to it.
+    Otherwise, for records originating from a ``featcopilot.llm.*``
+    logger AND when exactly one capture is active in the process, the
+    record is routed to that capture (the narrow LLM-client cross-thread
+    fallback — see :class:`_ThreadCaptureState`). Records from any
+    other thread / logger combination flow through to the existing
+    stderr handler.
     """
 
     def __init__(self, state: _ThreadCaptureState):
@@ -660,7 +700,7 @@ def __init__(self, state: _ThreadCaptureState):
         self.setFormatter(logging.Formatter("%(levelname)s: %(message)s"))
 
     def emit(self, record: logging.LogRecord) -> None:
-        target = self._state.get(threading.get_ident())
+        target = self._state.get_for_llm_record(threading.get_ident(), record.name)
         if target is None:
             return
         try:
@@ -670,14 +710,18 @@ def emit(self, record: logging.LogRecord) -> None:
 
 
 class _SuppressCapturingFilter(logging.Filter):
-    """Filter for the *existing* handlers: drops records from capturing threads.
-
-    Without this filter, every record emitted by a capturing thread would
-    still hit the featcopilot root logger's stderr ``StreamHandler`` and
-    bleed onto stderr — breaking the CLI's "stderr reserved for failures"
-    contract. The filter checks ``threading.get_ident()`` against the
-    shared :class:`_ThreadCaptureState` so non-capturing threads continue
-    to see normal stderr output.
+    """Filter for the *existing* handlers: drops records being captured.
+
+    Without this filter, every record routed by
+    :class:`_ThreadRoutingHandler` to a capture list would still hit the
+    featcopilot root logger's stderr ``StreamHandler`` and bleed onto
+    stderr — breaking the CLI's "stderr reserved for failures" contract.
+    The filter mirrors the routing handler's policy so the two stay in
+    lockstep: anything captured (current-thread record OR cross-thread
+    LLM-client record under the narrow fallback) is also suppressed
+    from the original handlers; anything else (records from
+    non-capturing threads / unrelated background work) flows through to
+    stderr unchanged.
     """
 
     def __init__(self, state: _ThreadCaptureState):
@@ -685,7 +729,7 @@ def __init__(self, state: _ThreadCaptureState):
         self._state = state
 
     def filter(self, record: logging.LogRecord) -> bool:
-        return self._state.get(threading.get_ident()) is None
+        return self._state.get_for_llm_record(threading.get_ident(), record.name) is None
 
 
 # Module-level singletons. Installed exactly once on the featcopilot root
diff --git a/tests/test_cli.py b/tests/test_cli.py
index aea1b13..80171d0 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -2633,16 +2633,18 @@ def test_explain_sample_size_warns_post_read_for_parquet(tmp_path: Path):
 
 
 def test_capture_does_not_route_unrelated_thread_records():
-    """The capture layer must use STRICT per-thread routing: records
-    emitted on threads other than the one that opened a capture flow
-    through the normal handler chain (and reach stderr) — they are
-    NOT silently rolled into the single in-flight CLI run's payload.
+    """The capture layer must use STRICT per-thread routing for non-LLM
+    records: records emitted on threads other than the one that opened
+    a capture flow through the normal handler chain (and reach stderr)
+    — they are NOT silently rolled into the single in-flight CLI run's
+    payload.
 
     A previous "single-active-capture fallback" was too broad: when a
     single CLI run was active, *any* featcopilot log on any thread
     would have been swallowed into that command's payload, including
     unrelated background work, causing misattribution. This test
-    guards against that regression.
+    guards against that regression for the non-LLM case (the narrow
+    LLM-only fallback is covered separately).
     """
     import threading
 
@@ -2652,9 +2654,10 @@ def test_capture_does_not_route_unrelated_thread_records():
         # Caller emits on its own thread (must be captured).
         fc_logger.warning("from-caller")
 
-        # Spawn a separate, unrelated thread that ALSO emits via the
-        # featcopilot logger. With the over-broad fallback removed, that
-        # record must NOT appear in this capture's payload.
+        # Spawn a separate, unrelated thread that ALSO emits via a
+        # NON-LLM featcopilot logger. With strict per-thread isolation
+        # for non-LLM records, that record must NOT appear in this
+        # capture's payload.
         def _emit_elsewhere():
             fc_logger.warning("from-other-thread")
 
@@ -2663,11 +2666,91 @@ def _emit_elsewhere():
         t.join()
 
     assert any("from-caller" in m for m in captured)
-    # Strict per-thread isolation: unrelated thread's record is NOT in
-    # this capture's payload.
+    # Strict per-thread isolation for non-LLM records: unrelated thread's
+    # record is NOT in this capture's payload.
     assert not any("from-other-thread" in m for m in captured)
 
 
+def test_capture_routes_llm_client_worker_records_to_single_active_capture():
+    """The narrow LLM-client fallback: when a record originates from a
+    ``featcopilot.llm.*`` logger and exactly one capture is
+    active, the record is routed to that capture even when emitted
+    from a worker thread.
+
+    This addresses the common case where an LLM sync client wrapping
+    ``ThreadPoolExecutor`` (the fallback used in event-loop
+    environments) emits a mock-mode startup warning on a worker thread
+    that ``submit()`` spawned. Without the narrow fallback, that
+    warning would bleed onto stderr on a successful run.
+    """
+    import threading
+    from concurrent.futures import ThreadPoolExecutor
+
+    llm_logger = logging.getLogger("featcopilot.llm.test_client")
+
+    def _emit_llm_in_worker():
+        llm_logger.warning("llm-mock-mode-startup")
+        return "ok"
+
+    with fc_cli._capture_featcopilot_messages() as captured:
+        # Caller emits its own LLM record (current-thread path).
+        llm_logger.warning("llm-from-caller")
+        # ThreadPoolExecutor worker emits an LLM record (cross-thread,
+        # but the narrow LLM-only fallback should route it).
+        with ThreadPoolExecutor(max_workers=1) as pool:
+            assert pool.submit(_emit_llm_in_worker).result(timeout=5) == "ok"
+        # A raw threading.Thread emits an LLM record too.
+        t = threading.Thread(target=_emit_llm_in_worker)
+        t.start()
+        t.join()
+
+    # Caller's record + 2 worker records (one from pool, one from thread)
+    # are all in the capture.
+    assert any("llm-from-caller" in m for m in captured)
+    assert sum(1 for m in captured if "llm-mock-mode-startup" in m) >= 2
+
+
+def test_capture_does_not_apply_llm_fallback_with_multiple_captures():
+    """When two captures are concurrently active, the narrow LLM
+    fallback stays disabled — strict per-thread isolation is preserved
+    so concurrent CLI calls don't cross-contaminate, even for LLM
+    records.
+    """
+    import threading
+    from concurrent.futures import ThreadPoolExecutor
+
+    llm_logger = logging.getLogger("featcopilot.llm.test_dual")
+    a_captured: list[str] = []
+    b_captured: list[str] = []
+    barrier = threading.Barrier(2)
+
+    def worker(tag: str, target: list[str]):
+        barrier.wait()
+        with fc_cli._capture_featcopilot_messages() as captured:
+            llm_logger.warning(f"{tag}-direct")
+            with ThreadPoolExecutor(max_workers=1) as pool:
+                # Submit a worker that emits a record. With two
+                # captures active, the narrow fallback must NOT
+                # activate (it would be ambiguous which capture
+                # "owns" the worker's record).
+                pool.submit(lambda t=tag: llm_logger.warning(f"{t}-worker")).result(timeout=5)
+        target.extend(captured)
+
+    t1 = threading.Thread(target=worker, args=("A", a_captured))
+    t2 = threading.Thread(target=worker, args=("B", b_captured))
+    t1.start()
+    t2.start()
+    t1.join()
+    t2.join()
+
+    # Each capture sees its own direct record (current-thread path).
+    assert any("A-direct" in m for m in a_captured)
+    assert any("B-direct" in m for m in b_captured)
+    # The worker record is NOT in either capture (fallback disabled).
+    assert not any("worker" in m for m in a_captured)
+    assert not any("worker" in m for m in b_captured)
+
+
 def test_capture_keeps_thread_isolation_with_multiple_active_captures():
     """The single-active-capture fallback must NOT activate when two
     threads are concurrently capturing — each must see only its own

From ffa2b3a669b1181c367301be75e1e4625744f9b5 Mon Sep 17 00:00:00 2001
From: Li Jiang <lijiang1@microsoft.com>
Date: Tue, 5 May 2026 20:20:31 +0800
Subject: [PATCH 28/30] fix(test): make multi-capture LLM-fallback test
 race-proof
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Round-23's `test_capture_does_not_apply_llm_fallback_with_multiple_captures`
passed locally but failed in CI on Python 3.10/3.12/3.13. Root cause: a
race between the two threads. Without explicit barriers, one thread's
`ThreadPoolExecutor` worker could fire BEFORE the other thread had
pushed its capture onto `_state._per_thread`. At that moment
`len(_per_thread) == 1`, so the narrow LLM fallback (correctly)
routed the worker's record to the only active capture — exactly the
routing the test asserts must NOT happen with multiple captures.

Replaced the single `Barrier(2)` with a three-phase protocol:
  - `enter_barrier`: both threads start at the same time
  - `inside_barrier`: BOTH have pushed their capture (so any
    subsequent emit sees `len == 2`)
  - `done_barrier`: BOTH workers have completed before EITHER
    exits its capture (pins `len == 2` for the entire worker
    window)

The contract under test — multi-capture isolation — is unchanged; the
test now reliably exercises the multi-capture branch of
`_ThreadCaptureState.get_for_llm_record`.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 tests/test_cli.py | 26 +++++++++++++++++++-------
 1 file changed, 19 insertions(+), 7 deletions(-)

diff --git a/tests/test_cli.py b/tests/test_cli.py
index 80171d0..e9c9d55 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -2722,18 +2722,29 @@ def test_capture_does_not_apply_llm_fallback_with_multiple_captures():
     llm_logger = logging.getLogger("featcopilot.llm.test_dual")
     a_captured: list[str] = []
     b_captured: list[str] = []
-    barrier = threading.Barrier(2)
+    enter_barrier = threading.Barrier(2)
+    inside_barrier = threading.Barrier(2)
+    done_barrier = threading.Barrier(2)
 
     def worker(tag: str, target: list[str]):
-        barrier.wait()
+        # Phase 0: both threads start at roughly the same time.
+        enter_barrier.wait()
         with fc_cli._capture_featcopilot_messages() as captured:
             llm_logger.warning(f"{tag}-direct")
+            # Phase 1: BOTH threads have entered their captures, so
+            # ``_state._per_thread`` has TWO entries when either thread's
+            # worker fires below — that's the multi-capture scenario the
+            # narrow LLM fallback must skip. Without this barrier the
+            # threads race: one thread's worker can fire before the other
+            # has pushed its capture, making ``len == 1`` and (incorrectly
+            # for this test's intent) tripping the fallback.
+            inside_barrier.wait()
             with ThreadPoolExecutor(max_workers=1) as pool:
-                # Submit a worker that emits a record. With two
-                # captures active, the narrow fallback must NOT
-                # activate (it would be ambiguous which capture
-                # "owns" the worker's record).
                 pool.submit(lambda t=tag: llm_logger.warning(f"{t}-worker")).result(timeout=5)
+            # Phase 2: BOTH threads' workers have completed before either
+            # exits its capture. This pins ``len == 2`` for the entire
+            # worker-emit window.
+            done_barrier.wait()
         target.extend(captured)
 
     t1 = threading.Thread(target=worker, args=("A", a_captured))
@@ -2746,7 +2757,8 @@ def worker(tag: str, target: list[str]):
     # Each capture sees its own direct record (current-thread path).
     assert any("A-direct" in m for m in a_captured)
     assert any("B-direct" in m for m in b_captured)
-    # The worker record is NOT in either capture (fallback disabled).
+    # The worker record is NOT in either capture (fallback disabled
+    # because two captures were active during the worker emit).
     assert not any("worker" in m for m in a_captured)
     assert not any("worker" in m for m in b_captured)
 

From 100500989b54578d25b11e326b385ac8a03139ab Mon Sep 17 00:00:00 2001
From: Li Jiang <lijiang1@microsoft.com>
Date: Tue, 5 May 2026 20:43:53 +0800
Subject: [PATCH 29/30] fix(cli): atomicize capture decision and tighten
 LLM-fallback whitelist
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Round-24 reviewer feedback addressed:

1. **Codex P1 — atomic routing/suppression decision.**
   `_SuppressCapturingFilter.filter()` and `_ThreadRoutingHandler.emit()`
   each independently called `state.get_for_llm_record(...)`. Under
   concurrent CLI calls, another thread could push or pop a capture
   between filter and emit so the two phases saw different state — the
   same record could end up both captured AND on stderr (or suppressed
   without being captured). Replaced both call sites with
   `state.resolve_for_record(record)`, which computes the decision
   once and caches it on the record itself; the two phases now always
   see the same answer.

2. **Copilot — narrow LLM whitelist was too broad.**
   `_LLM_FALLBACK_LOGGER_PREFIXES = ("featcopilot.llm.",)` matched
   every `featcopilot.llm.*` logger, including modules that never
   hop onto worker threads (`semantic_engine`, `code_generator`,
   `transform_rule_generator`, `explainer`). Replaced with an
   *exact* set `_LLM_FALLBACK_LOGGER_NAMES` containing only the
   three sync-client modules that actually fall back to
   `ThreadPoolExecutor` (`copilot_client`, `litellm_client`,
   `openai_client`).

3. **Copilot — `_run` did not actually capture handler output.**
   `featcopilot/utils/logger.py` installs `StreamHandler(sys.stderr)`
   at import time, so `redirect_stderr` only swapped the module
   attribute while the handler kept writing to the *original* stream.
   That made the file's stderr-cleanliness assertions vacuous. The
   `_run` helper now also temporarily re-points every
   `StreamHandler` on the `featcopilot` logger at the captured
   `err` buffer for the duration of the call, restoring the original
   `stream` attribute in `finally`.

New tests:
- `test_capture_does_not_apply_llm_fallback_for_non_whitelisted_llm_loggers`
  pins the exact-set whitelist (semantic_engine et al. stay strictly
  isolated).
- `test_capture_decision_is_cached_per_record_for_atomic_filter_emit`
  proves the resolver computes the decision exactly once per record.
- `test_capture_decision_stable_under_concurrent_pop_between_filter_and_emit`
  proves the cached decision survives a concurrent pop.
- `test_run_helper_redirects_featcopilot_stream_handlers` proves the
  test helper captures handler-stream writes (so leaks would be
  detectable, not silently passing).

Existing test `test_capture_routes_llm_client_worker_records_to_single_active_capture`
updated to use a whitelisted logger name (`openai_client`) since
arbitrary `featcopilot.llm.*` names no longer trigger the fallback.

901 tests pass locally (full suite); 128 in tests/test_cli.py.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 featcopilot/cli.py | 114 ++++++++++++++++++-----
 tests/test_cli.py  | 224 +++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 308 insertions(+), 30 deletions(-)

diff --git a/featcopilot/cli.py b/featcopilot/cli.py
index ea48ce3..ceec11e 100644
--- a/featcopilot/cli.py
+++ b/featcopilot/cli.py
@@ -602,26 +602,54 @@ class _ThreadCaptureState:
     output from unrelated background work happening in the same process.
 
     **Narrow LLM-client fallback.** :meth:`get_for_llm_record` is the
-    one exception: when a record originates from
-    ``featcopilot.llm.*_client`` (the sync LLM clients that fall back to
-    ``ThreadPoolExecutor`` in event-loop environments), and exactly one
-    capture is active in the process, the record is routed to that
-    capture even when emitted on a worker thread. This addresses the
-    common case where an LLM client's mock-mode startup warning fires
-    on a worker that ``submit()`` spawned and would otherwise bleed
-    onto stderr; the targeted whitelist keeps unrelated background
-    featcopilot work from being misattributed.
+    one exception: when a record originates from one of the *exact*
+    sync LLM client modules whitelisted in
+    :attr:`_LLM_FALLBACK_LOGGER_NAMES` (the modules whose synchronous
+    entry points fall back to ``ThreadPoolExecutor`` in event-loop
+    environments), and exactly one capture is active in the process,
+    the record is routed to that capture even when emitted on a worker
+    thread. This addresses the common case where an LLM client's
+    mock-mode startup warning fires on a worker that ``submit()``
+    spawned and would otherwise bleed onto stderr. The whitelist is an
+    explicit set of module names — *not* a prefix — so unrelated
+    ``featcopilot.llm.*`` loggers (e.g. ``semantic_engine``,
+    ``code_generator``, ``transform_rule_generator``, ``explainer``)
+    that never run on worker threads do not trigger the fallback.
+
+    **Atomic per-record decision.** :meth:`resolve_for_record` caches
+    the lookup on the record itself the first time it is called, so
+    :class:`_ThreadRoutingHandler` (which routes records) and
+    :class:`_SuppressCapturingFilter` (which suppresses them from
+    stderr) cannot disagree under concurrent push/pop activity from
+    other threads. Without that caching, ``filter`` could see one
+    state and the later ``emit`` could see another, breaking the
+    "captured XOR on stderr" invariant.
 
     Shared by :class:`_ThreadRoutingHandler` (writes records),
     :class:`_SuppressCapturingFilter` (suppresses stderr), and the
     routing ``warnings.showwarning`` override.
     """
 
-    # Logger-name prefixes whose records are eligible for the narrow
-    # cross-thread fallback. Only the LLM client modules whose sync
-    # entry points fall back to ``ThreadPoolExecutor`` in event-loop
-    # environments are listed.
-    _LLM_FALLBACK_LOGGER_PREFIXES = ("featcopilot.llm.",)
+    # Logger names whose records are eligible for the narrow cross-thread
+    # fallback. Only the synchronous LLM client modules whose ``run`` /
+    # batch entry points fall back to ``ThreadPoolExecutor`` workers in
+    # event-loop environments are listed; their startup / mock-mode
+    # warnings legitimately fire from those worker threads. Other
+    # ``featcopilot.llm.*`` loggers are intentionally NOT included so
+    # cross-thread records from unrelated background work cannot be
+    # silently swallowed into an active CLI capture.
+    _LLM_FALLBACK_LOGGER_NAMES = frozenset(
+        {
+            "featcopilot.llm.copilot_client",
+            "featcopilot.llm.litellm_client",
+            "featcopilot.llm.openai_client",
+        }
+    )
+
+    # Sentinel for "no cached decision yet" on a log record. Distinct
+    # from ``None``, which is itself a valid resolved decision meaning
+    # "no capture target — let the record flow to stderr".
+    _UNCACHED = object()
 
     def __init__(self):
         self._per_thread: dict[int, list[list[str]]] = {}
@@ -657,16 +685,25 @@ def get_for_llm_record(self, tid: int, logger_name: str) -> list[str] | None:
 
         When the calling thread has its own active capture, that's used.
         Otherwise — and only when the record originates from a
-        whitelisted ``featcopilot.llm.*`` logger AND exactly one capture
-        is active in the process — the record is routed to that single
-        capture so it doesn't bleed onto stderr. The whitelist keeps
-        unrelated background featcopilot work strictly isolated.
+        whitelisted sync LLM client module
+        (:attr:`_LLM_FALLBACK_LOGGER_NAMES`) AND exactly one capture is
+        active in the process — the record is routed to that single
+        capture so it doesn't bleed onto stderr. The whitelist is an
+        explicit set of module names so unrelated ``featcopilot.llm.*``
+        loggers (e.g. ``semantic_engine``, ``code_generator``) that
+        never hop onto worker threads do not trigger the fallback.
+
+        This method takes the lock once and returns a snapshot
+        decision; callers should generally use :meth:`resolve_for_record`
+        to additionally cache the result on the record so that paired
+        filter / emit calls can never disagree under concurrent
+        push/pop on other threads.
         """
         with self._lock:
             stack = self._per_thread.get(tid)
             if stack:
                 return stack[-1]
-            if not logger_name.startswith(self._LLM_FALLBACK_LOGGER_PREFIXES):
+            if logger_name not in self._LLM_FALLBACK_LOGGER_NAMES:
                 return None
             if len(self._per_thread) != 1:
                 # Either no captures are active (nothing to route to) or
@@ -678,6 +715,29 @@ def get_for_llm_record(self, tid: int, logger_name: str) -> list[str] | None:
                 return only_stack[-1]
             return None
 
+    def resolve_for_record(self, record: logging.LogRecord) -> list[str] | None:
+        """Resolve the capture target for ``record`` exactly once.
+
+        The first call computes the decision via
+        :meth:`get_for_llm_record` and caches it on the record itself
+        as ``record._featcopilot_capture_target``; subsequent calls
+        return the cached value. This is what makes the routing
+        handler's ``emit`` and the suppression filter's ``filter``
+        atomic with respect to each other: they always see the same
+        decision for a given record even if another thread pops or
+        pushes a capture between the two calls.
+
+        Logging records are produced and dispatched to handlers in the
+        same thread, so caching directly on the record is safe — there
+        is no concurrent reader of the same record's attributes.
+        """
+        cached = getattr(record, "_featcopilot_capture_target", self._UNCACHED)
+        if cached is not self._UNCACHED:
+            return cached
+        target = self.get_for_llm_record(threading.get_ident(), record.name)
+        record._featcopilot_capture_target = target
+        return target
+
 
 class _ThreadRoutingHandler(logging.Handler):
     """Logging handler that routes records to the calling thread's capture list.
@@ -700,7 +760,11 @@ def __init__(self, state: _ThreadCaptureState):
         self.setFormatter(logging.Formatter("%(levelname)s: %(message)s"))
 
     def emit(self, record: logging.LogRecord) -> None:
-        target = self._state.get_for_llm_record(threading.get_ident(), record.name)
+        # Use the cached resolver so this handler and the paired
+        # ``_SuppressCapturingFilter`` always see the same decision
+        # for a given record, even if another thread pushes or pops
+        # a capture between the filter and emit phases.
+        target = self._state.resolve_for_record(record)
         if target is None:
             return
         try:
@@ -729,7 +793,15 @@ def __init__(self, state: _ThreadCaptureState):
         self._state = state
 
     def filter(self, record: logging.LogRecord) -> bool:
-        return self._state.get_for_llm_record(threading.get_ident(), record.name) is None
+        # Use the cached resolver so this filter and the paired
+        # ``_ThreadRoutingHandler`` always see the same decision for a
+        # given record, even if another thread pushes or pops a capture
+        # between this call and the routing handler's emit. Without the
+        # cache, the two could disagree and the same record could end
+        # up both captured and on stderr (or suppressed without being
+        # captured) — breaking the "stderr reserved for failures"
+        # contract under concurrent CLI calls.
+        return self._state.resolve_for_record(record) is None
 
 
 # Module-level singletons. Installed exactly once on the featcopilot root
diff --git a/tests/test_cli.py b/tests/test_cli.py
index e9c9d55..84928a1 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -7,6 +7,7 @@
 import json
 import logging
 import sys
+import threading
 import warnings
 from contextlib import redirect_stderr, redirect_stdout
 from pathlib import Path
@@ -20,10 +21,34 @@
 
 
 def _run(argv: list[str]) -> tuple[int, str, str]:
-    """Invoke ``cli.main(argv)`` and capture exit code, stdout, stderr."""
+    """Invoke ``cli.main(argv)`` and capture exit code, stdout, stderr.
+
+    The featcopilot logger installs a ``StreamHandler(sys.stderr)`` at
+    import time, which holds a reference to the *original* ``sys.stderr``
+    object. ``redirect_stderr`` only swaps the ``sys.stderr`` module
+    attribute, so without also redirecting the handler's ``stream`` any
+    log output the suppression filter doesn't catch would still go to
+    the real terminal — leaving every ``err == ""`` assertion in this
+    file vacuously satisfied even in the presence of a leak. This helper
+    therefore both redirects ``sys.stderr`` AND temporarily re-points
+    every ``StreamHandler`` on the ``featcopilot`` root logger at the
+    same ``err`` buffer for the duration of the call, so the captured
+    ``err`` value reflects what would actually have been written to the
+    user's terminal.
+    """
     out, err = io.StringIO(), io.StringIO()
-    with redirect_stdout(out), redirect_stderr(err):
-        rc = fc_cli.main(argv)
+    fc_logger = logging.getLogger("featcopilot")
+    saved_streams: list[tuple[logging.StreamHandler, object]] = []
+    for handler in list(fc_logger.handlers):
+        if isinstance(handler, logging.StreamHandler):
+            saved_streams.append((handler, handler.stream))
+            handler.stream = err
+    try:
+        with redirect_stdout(out), redirect_stderr(err):
+            rc = fc_cli.main(argv)
+    finally:
+        for handler, original_stream in saved_streams:
+            handler.stream = original_stream
     return rc, out.getvalue(), err.getvalue()
 
 
@@ -2672,10 +2697,11 @@ def _emit_elsewhere():
 
 
 def test_capture_routes_llm_client_worker_records_to_single_active_capture():
-    """The narrow LLM-client fallback: when a record originates from a
-    ``featcopilot.llm.*_client`` logger and exactly one capture is
-    active, the record is routed to that capture even when emitted
-    from a worker thread.
+    """The narrow LLM-client fallback: when a record originates from one
+    of the *whitelisted* sync LLM client modules
+    (``featcopilot.llm.copilot_client`` / ``litellm_client`` /
+    ``openai_client``) and exactly one capture is active, the record
+    is routed to that capture even when emitted from a worker thread.
 
     This addresses the common case where an LLM sync client wrapping
     ``ThreadPoolExecutor`` (the fallback used in event-loop
@@ -2686,7 +2712,10 @@ def test_capture_routes_llm_client_worker_records_to_single_active_capture():
     import threading
     from concurrent.futures import ThreadPoolExecutor
 
-    llm_logger = logging.getLogger("featcopilot.llm.test_client")
+    # Use an actual whitelisted sync-client module name; an arbitrary
+    # ``featcopilot.llm.*`` name (e.g. ``test_client``) is intentionally
+    # NOT eligible — see ``test_capture_does_not_apply_llm_fallback_for_non_whitelisted_llm_loggers``.
+    llm_logger = logging.getLogger("featcopilot.llm.openai_client")
 
     def _emit_llm_in_worker():
         llm_logger.warning("llm-mock-mode-startup")
@@ -2710,6 +2739,44 @@ def _emit_llm_in_worker():
     assert sum(1 for m in captured if "llm-mock-mode-startup" in m) >= 2
 
 
+def test_capture_does_not_apply_llm_fallback_for_non_whitelisted_llm_loggers():
+    """The narrow LLM-client fallback whitelist is an *exact* set of
+    sync-client module names — NOT a ``featcopilot.llm.*`` prefix.
+    Other ``featcopilot.llm.*`` loggers (e.g. ``semantic_engine``,
+    ``code_generator``, ``transform_rule_generator``, ``explainer``)
+    must keep strict per-thread isolation, so cross-thread records
+    from unrelated background work cannot be silently swallowed into
+    an active CLI capture.
+    """
+    import threading
+
+    non_whitelisted = [
+        "featcopilot.llm.semantic_engine",
+        "featcopilot.llm.code_generator",
+        "featcopilot.llm.transform_rule_generator",
+        "featcopilot.llm.explainer",
+        "featcopilot.llm.test_dummy",  # arbitrary subname — must NOT match
+    ]
+    captured_lists: list[list[str]] = []
+
+    for name in non_whitelisted:
+        other_logger = logging.getLogger(name)
+
+        def _emit_in_other_thread(logger=other_logger, tag=name):
+            logger.warning(f"{tag}-from-other-thread")
+
+        with fc_cli._capture_featcopilot_messages() as captured:
+            t = threading.Thread(target=_emit_in_other_thread)
+            t.start()
+            t.join()
+        captured_lists.append(list(captured))
+
+    for name, captured in zip(non_whitelisted, captured_lists, strict=True):
+        assert not any(
+            f"{name}-from-other-thread" in m for m in captured
+        ), f"Non-whitelisted LLM logger {name} unexpectedly tripped the cross-thread fallback"
+
+
 def test_capture_does_not_apply_llm_fallback_with_multiple_captures():
     """When two captures are concurrently active, the narrow LLM
     fallback stays disabled — strict per-thread isolation is preserved
@@ -2719,7 +2786,7 @@ def test_capture_does_not_apply_llm_fallback_with_multiple_captures():
     import threading
     from concurrent.futures import ThreadPoolExecutor
 
-    llm_logger = logging.getLogger("featcopilot.llm.test_dual")
+    llm_logger = logging.getLogger("featcopilot.llm.openai_client")
     a_captured: list[str] = []
     b_captured: list[str] = []
     enter_barrier = threading.Barrier(2)
@@ -2800,6 +2867,145 @@ def worker(tag: str, target: list[str]):
     assert len(b_captured) == 10
 
 
+def test_capture_decision_is_cached_per_record_for_atomic_filter_emit():
+    """The capture state must resolve each record's routing decision
+    *exactly once*, then cache the outcome on the record itself, so the
+    suppression filter and the routing handler always see the same
+    answer for that record. Otherwise a concurrent push/pop on another
+    thread could land between the filter (computed at handler-1 phase)
+    and the emit (computed at handler-2 phase), making the same record
+    both captured and emitted to stderr (or suppressed without being
+    captured) — breaking the CLI contract.
+    """
+    state = fc_cli._ThreadCaptureState()
+
+    class _CountingState:
+        """Wrap state so we can count ``get_for_llm_record`` calls."""
+
+        def __init__(self, inner):
+            self._inner = inner
+            self.calls: list[tuple[int, str]] = []
+
+        # Forward the attributes ``resolve_for_record`` reads.
+        @property
+        def _UNCACHED(self):
+            return self._inner._UNCACHED
+
+        def get_for_llm_record(self, tid, name):
+            self.calls.append((tid, name))
+            return self._inner.get_for_llm_record(tid, name)
+
+        # Re-bind ``resolve_for_record`` so calls are counted via the
+        # wrapped ``get_for_llm_record``.
+        def resolve_for_record(self, record):
+            return fc_cli._ThreadCaptureState.resolve_for_record(self, record)
+
+    counted = _CountingState(state)
+
+    record = logging.LogRecord(
+        name="featcopilot.llm.openai_client",
+        level=logging.WARNING,
+        pathname=__file__,
+        lineno=1,
+        msg="hello",
+        args=(),
+        exc_info=None,
+    )
+
+    # First call computes and caches; subsequent calls must not hit
+    # ``get_for_llm_record`` again.
+    first = counted.resolve_for_record(record)
+    second = counted.resolve_for_record(record)
+    third = counted.resolve_for_record(record)
+
+    assert first is second is third
+    assert len(counted.calls) == 1, (
+        "resolve_for_record must compute the decision exactly once per record; "
+        f"saw {len(counted.calls)} get_for_llm_record calls"
+    )
+
+    # The cached attribute is set on the record itself.
+    assert hasattr(record, "_featcopilot_capture_target")
+
+
+def test_capture_decision_stable_under_concurrent_pop_between_filter_and_emit():
+    """Regression test for the atomic filter/emit invariant: even if a
+    concurrent thread pops its capture between the moment a record is
+    filtered and the moment it is emitted, both phases see the SAME
+    decision because it was resolved and cached on the record once.
+    """
+    state = fc_cli._ThreadCaptureState()
+    cap_a: list[str] = []
+    state.push(threading.get_ident() ^ 1, cap_a)  # foreign-thread capture
+    try:
+        record = logging.LogRecord(
+            name="featcopilot.llm.copilot_client",
+            level=logging.WARNING,
+            pathname=__file__,
+            lineno=1,
+            msg="hi",
+            args=(),
+            exc_info=None,
+        )
+        # Phase 1: "filter" computes and caches ("len(_per_thread)==1"
+        # so the LLM fallback returns ``cap_a``).
+        first = state.resolve_for_record(record)
+        assert first is cap_a
+
+        # Concurrent pop: another thread tears its capture down. State
+        # would now produce a *different* answer for a fresh lookup.
+        state.pop(threading.get_ident() ^ 1)
+        fresh_lookup = state.get_for_llm_record(threading.get_ident(), record.name)
+        assert fresh_lookup is None  # state has indeed changed
+
+        # Phase 2: "emit" must still see the same decision via the cache.
+        second = state.resolve_for_record(record)
+        assert second is cap_a, (
+            "After a concurrent pop, resolve_for_record must still return the "
+            "originally cached decision so filter and emit cannot disagree"
+        )
+    finally:
+        # Clean up any stragglers.
+        state.pop(threading.get_ident() ^ 1)
+
+
+def test_run_helper_redirects_featcopilot_stream_handlers(monkeypatch):
+    """Regression test for the test helper itself: ``_run`` must
+    redirect every ``logging.StreamHandler`` on the ``featcopilot``
+    root logger so that any handler write that escapes the suppression
+    filter (the contract-violation scenario) lands in the captured
+    ``err`` buffer, NOT on the real terminal.
+
+    Without this redirect, every ``err == ""`` assertion in this file
+    would be vacuously satisfied because the ``StreamHandler`` installed
+    at import time holds a reference to the *original* ``sys.stderr``
+    object and ``redirect_stderr`` only swaps the module attribute.
+    """
+    fc_logger = logging.getLogger("featcopilot")
+    stream_handlers = [h for h in fc_logger.handlers if isinstance(h, logging.StreamHandler)]
+    assert stream_handlers, "featcopilot logger must have at least one StreamHandler"
+
+    # Stub ``cli.main`` to write directly through the StreamHandler's
+    # current ``stream`` attribute (which ``_run`` should have re-pointed
+    # at the captured ``err`` buffer).
+    def fake_main(argv):
+        for h in fc_logger.handlers:
+            if isinstance(h, logging.StreamHandler):
+                h.stream.write("HANDLER_LEAK_LINE\n")
+                h.stream.flush()
+        return 0
+
+    monkeypatch.setattr(fc_cli, "main", fake_main)
+    rc, _out, err = _run(["info"])
+    assert rc == 0
+    assert "HANDLER_LEAK_LINE" in err, (
+        "_run must redirect featcopilot StreamHandler streams; otherwise stderr-cleanliness " "assertions are vacuous"
+    )
+    # And the original stream is restored after the call.
+    for h in stream_handlers:
+        assert h.stream is sys.stderr or h.stream is sys.__stderr__
+
+
 # ----------------------- explain --explain-sample-size warning hygiene
 
 

From 99fe807624fc775d287fc79948cb101ba2cdaa50 Mon Sep 17 00:00:00 2001
From: Li Jiang <lijiang1@microsoft.com>
Date: Tue, 5 May 2026 23:14:38 +0800
Subject: [PATCH 30/30] fix(cli): widen warning capture to read+write phases;
 split empty-input cases
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Round-25 reviewer feedback addressed:

1. **Copilot — empty-input branch conflated zero-row and zero-column.**
   `_read_table` raised the same "zero data rows" error for both
   `len(df) == 0` and `len(df.columns) == 0` (both make
   `DataFrame.empty` return `True`). A JSON `[{}, {}]` array
   produces a frame WITH rows but NO columns — a very different user
   error than a header-only CSV. Split into two distinct, accurately
   worded error messages: "no columns" vs "zero data rows".

2. **Copilot — read-time warnings leaked to stderr.**
   `_cmd_transform` only opened the capture context around
   `engineer.fit_transform`. `pd.read_csv` can legitimately emit
   `DtypeWarning` on mixed-type CSVs and parquet/JSON readers can
   emit pyarrow / pandas warnings on a successful read; those were
   bypassing the capture and bleeding to stderr, breaking the
   "stderr reserved for failures" contract.

3. **Copilot — write-time warnings also leaked to stderr.**
   `_write_table` was likewise outside the capture, so pandas /
   pyarrow `FutureWarning` / `UserWarning` from a successful
   write could leak. Same contract violation as #2.

The fix wraps the entire `_cmd_transform` pipeline (read +
build_engineer + fit_transform + write) in a single
`_capture_featcopilot_messages` block so warnings from ANY phase end
up in the JSON `warnings` field instead of stderr. `_cmd_explain`
also widened: `explain_features` / `get_feature_code` are now
inside the same capture as the read+fit_transform.

The helper `_fit_transform_capturing_warnings` is no longer used
internally; it is kept as a thin convenience wrapper for external test
code, with an updated docstring noting that the CLI now wraps a wider
region.

New tests:
- `test_transform_zero_columns_input_distinguishes_from_zero_rows` —
  pins the column-vs-row error-message distinction.
- `test_transform_zero_rows_input_still_uses_zero_rows_message` —
  guards the existing zero-rows wording for the header-only case.
- `test_transform_read_warning_captured_not_on_stderr` — patches
  `pd.read_csv` to emit a warning, asserts `err == ""` and the
  warning lands in JSON `warnings`.
- `test_transform_write_warning_captured_not_on_stderr` — patches
  `DataFrame.to_csv` to emit a warning, same contract.
- `test_explain_features_warnings_captured_not_on_stderr` — patches
  `AutoFeatureEngineer.explain_features` to emit a warning, same
  contract for `_cmd_explain`.

906 tests pass locally (full suite); 133 in tests/test_cli.py.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 featcopilot/cli.py | 199 ++++++++++++++++++++++++++-------------------
 tests/test_cli.py  | 150 ++++++++++++++++++++++++++++++++++
 2 files changed, 267 insertions(+), 82 deletions(-)

diff --git a/featcopilot/cli.py b/featcopilot/cli.py
index ceec11e..6f019f5 100644
--- a/featcopilot/cli.py
+++ b/featcopilot/cli.py
@@ -228,14 +228,24 @@ def _read_table(path: Path, fmt: str, *, nrows: int | None = None, suppress_trun
         raise ValueError(f"Unsupported input format: {fmt}")
 
     # Reject "header-only" / empty inputs across every supported format.
-    # ``pd.read_csv`` returns an empty DataFrame (no exception) when the
-    # CSV has headers but zero data rows; the same goes for an empty
-    # parquet file or ``[]`` JSON body. Without this check, the CLI
-    # would pass an empty frame into ``TabularEngine``, which divides by
-    # ``len(X)`` while fitting categorical encoding and exits via the
-    # generic ``unexpected error`` path. Surface the issue as a clean
-    # exit-2 user-input error.
-    if df.empty:
+    # ``DataFrame.empty`` returns ``True`` for both zero-row AND
+    # zero-column frames, but those are very different user errors that
+    # warrant different remediation paths, so check the two cases
+    # explicitly. ``pd.read_csv`` returns an empty DataFrame (no
+    # exception) when the CSV has headers but zero data rows; the same
+    # goes for an empty parquet file or ``[]`` JSON body. Without this
+    # check, the CLI would pass an empty frame into ``TabularEngine``,
+    # which divides by ``len(X)`` while fitting categorical encoding and
+    # exits via the generic ``unexpected error`` path. Surface both
+    # cases as clean exit-2 user-input errors.
+    if len(df.columns) == 0:
+        raise ValueError(
+            f"Input file {str(path)!r} has no columns. "
+            "Feature engineering requires at least one input feature column "
+            "(e.g. a JSON array of ``{}`` objects, or a table that only "
+            "preserved an index, would hit this error)."
+        )
+    if len(df) == 0:
         raise ValueError(
             f"Input file {str(path)!r} is empty (zero data rows). "
             "Feature engineering requires at least one row of data."
@@ -555,23 +565,16 @@ def _cmd_info(args: argparse.Namespace) -> int:
 
 
 def _fit_transform_capturing_warnings(engineer, X, y, **kwargs):
-    """Run ``engineer.fit_transform(X, y, **kwargs)`` while capturing both
-    Python ``warnings.warn(...)`` and FeatCopilot logger records.
-
-    The CLI contract is that stdout carries the JSON payload and stderr is
-    reserved for failures. Two sources can otherwise bleed onto stderr on
-    a successful run:
+    """Thin convenience wrapper: run ``engineer.fit_transform(X, y, **kwargs)``
+    inside a single ``_capture_featcopilot_messages()`` block.
 
-    * ``warnings.warn(...)`` — emitted by ``AutoFeatureEngineer.fit`` for
-      leakage-prone column names under the default ``leakage_guard='warn'``.
-    * ``logger.warning(...)`` / ``logger.info(...)`` — emitted by e.g.
-      ``_do_no_harm_gate`` on validation-failure fallback, and by every
-      engine when ``--verbose`` is set.
-
-    The single ``featcopilot`` root logger (``propagate=False``) receives
-    every child logger's records by ordinary Python logging propagation;
-    we swap in a capture handler for the duration of the call so the JSON
-    payload can surface those messages instead of stderr.
+    The CLI subcommands no longer call this helper: each one wraps a
+    wider region (read + fit_transform + write/explain) in a single
+    capture, so warnings emitted while pandas / pyarrow read or write
+    data are also surfaced via the JSON ``warnings`` field instead of
+    leaking onto stderr. The helper is kept (private, but importable)
+    for external test code that only needs to capture messages around
+    a plain ``fit_transform`` call.
 
     Returns
     -------
@@ -930,7 +933,26 @@ def _capture_featcopilot_messages():
 
 
 def _cmd_transform(args: argparse.Namespace) -> int:
-    """Read input, fit/transform, write output."""
+    """Read input, fit/transform, write output.
+
+    The "successful runs keep stderr empty" CLI contract requires that
+    *every* phase that can legitimately emit a Python warning be wrapped
+    in the message capture, not just ``fit_transform``. Concretely:
+
+    * **Read** — ``pd.read_csv`` can emit ``DtypeWarning`` on mixed-type
+      columns, and the parquet / JSON readers can emit pyarrow / pandas
+      ``FutureWarning`` on an otherwise successful read.
+    * **Fit / transform** — engineers themselves emit warnings (e.g.
+      ``AutoFeatureEngineer.fit`` for leakage-prone column names under
+      ``leakage_guard='warn'``).
+    * **Write** — ``DataFrame.to_csv`` / ``to_parquet`` / ``to_json``
+      can emit pandas / pyarrow ``FutureWarning`` / ``UserWarning``
+      during a successful write.
+
+    All three phases now live inside one capture block so any warnings
+    they emit are surfaced via the JSON ``warnings`` field rather than
+    leaking onto stderr.
+    """
     input_path = Path(args.input)
     if not input_path.exists():
         raise FileNotFoundError(f"Input file not found: {args.input}")
@@ -939,64 +961,70 @@ def _cmd_transform(args: argparse.Namespace) -> int:
     in_fmt = _detect_format(input_path, args.input_format)
     out_fmt = _detect_format(output_path, args.output_format)
 
-    df = _read_table(input_path, in_fmt)
-    X, y = _split_xy(df, args.target)
-
-    # Build the engineer first: ``_build_engineer`` runs all scalar / list /
-    # dict type validation on the merged CLI-flag + config view, so any
-    # malformed value (e.g. ``"max_features": "5"``, ``"verbose": "false"``)
-    # surfaces a precise exit-2 error here rather than down the wrong
-    # ``--target is required`` rabbit hole.
-    engineer = _build_engineer(args)
-
-    # Selection requires a target column to fit against. ``AutoFeatureEngineer``
-    # only actually fits a selector when ``y is not None`` AND ``max_features``
-    # is set; without ``max_features`` the call is a raw feature-generation
-    # run and does not need a target. The CLI mirrors that contract: only
-    # require ``--target`` when both selection is enabled (the default) AND
-    # ``max_features`` is configured (CLI flag or config), so commands like
-    # ``featcopilot transform --input in.csv --output out.csv`` (no target,
-    # no cap) still work. Using ``engineer.max_features`` here means the
-    # value has already been type-validated, so we never report
-    # ``--target is required`` when the real problem is a malformed
-    # ``max_features`` config value.
-    if not args.no_selection and args.target is None and engineer.max_features is not None:
-        raise ValueError(
-            "--target is required when feature selection is applied "
-            "(i.e. when --max-features / config max_features is set). "
-            "Pass --target <column>, or pass --no-selection / drop --max-features to skip selection."
-        )
-
-    captured_warnings, transformed = _fit_transform_capturing_warnings(
-        engineer,
-        X,
-        y,
-        task_description=args.task_description or "prediction task",
-        target_name=args.target,
-        apply_selection=not args.no_selection,
-    )
+    # Single capture context spans read + fit_transform + write so any
+    # legitimate-but-noisy warnings from pandas/pyarrow during read or
+    # write end up in the JSON payload's ``warnings`` field instead of
+    # bleeding to stderr. ``_capture_featcopilot_messages`` is a no-op
+    # for non-warning code paths so wrapping the broader region has no
+    # side effects on the happy path.
+    with _capture_featcopilot_messages() as captured_warnings:
+        df = _read_table(input_path, in_fmt)
+        X, y = _split_xy(df, args.target)
 
-    if args.include_target and y is not None:
-        # Re-attach the target column so downstream training scripts can
-        # consume the engineered file as a single artifact. Detect column
-        # collisions: if an engineered feature happens to share the
-        # target's column name (e.g. a target named ``foo_pow2`` matching
-        # a tabular-engine derived feature), blindly assigning ``transformed[
-        # target_name] = y.values`` would silently overwrite the engineered
-        # column. Surface that as a clean exit-2 error instead. Callers
-        # who knowingly want to overwrite can rename their target before
-        # invoking ``transform`` (or skip ``--include-target``).
-        target_name = args.target if args.target in df.columns else "target"
-        if target_name in transformed.columns:
+        # Build the engineer first: ``_build_engineer`` runs all scalar / list /
+        # dict type validation on the merged CLI-flag + config view, so any
+        # malformed value (e.g. ``"max_features": "5"``, ``"verbose": "false"``)
+        # surfaces a precise exit-2 error here rather than down the wrong
+        # ``--target is required`` rabbit hole.
+        engineer = _build_engineer(args)
+
+        # Selection requires a target column to fit against. ``AutoFeatureEngineer``
+        # only actually fits a selector when ``y is not None`` AND ``max_features``
+        # is set; without ``max_features`` the call is a raw feature-generation
+        # run and does not need a target. The CLI mirrors that contract: only
+        # require ``--target`` when both selection is enabled (the default) AND
+        # ``max_features`` is configured (CLI flag or config), so commands like
+        # ``featcopilot transform --input in.csv --output out.csv`` (no target,
+        # no cap) still work. Using ``engineer.max_features`` here means the
+        # value has already been type-validated, so we never report
+        # ``--target is required`` when the real problem is a malformed
+        # ``max_features`` config value.
+        if not args.no_selection and args.target is None and engineer.max_features is not None:
             raise ValueError(
-                f"--include-target would overwrite engineered feature {target_name!r} "
-                "with the target values. Rename the target column in the input file, "
-                "or drop --include-target."
+                "--target is required when feature selection is applied "
+                "(i.e. when --max-features / config max_features is set). "
+                "Pass --target <column>, or pass --no-selection / drop --max-features to skip selection."
             )
-        transformed = transformed.copy()
-        transformed[target_name] = y.values
 
-    _write_table(transformed, output_path, out_fmt)
+        transformed = engineer.fit_transform(
+            X,
+            y,
+            task_description=args.task_description or "prediction task",
+            target_name=args.target,
+            apply_selection=not args.no_selection,
+        )
+
+        if args.include_target and y is not None:
+            # Re-attach the target column so downstream training scripts can
+            # consume the engineered file as a single artifact. Detect column
+            # collisions: if an engineered feature happens to share the
+            # target's column name (e.g. a target named ``foo_pow2`` matching
+            # a tabular-engine derived feature), blindly assigning ``transformed[
+            # target_name] = y.values`` would silently overwrite the engineered
+            # column. Surface that as a clean exit-2 error instead. Callers
+            # who knowingly want to overwrite can rename their target before
+            # invoking ``transform`` (or skip ``--include-target``).
+            target_name = args.target if args.target in df.columns else "target"
+            if target_name in transformed.columns:
+                raise ValueError(
+                    f"--include-target would overwrite engineered feature {target_name!r} "
+                    "with the target values. Rename the target column in the input file, "
+                    "or drop --include-target."
+                )
+            transformed = transformed.copy()
+            transformed[target_name] = y.values
+
+        _write_table(transformed, output_path, out_fmt)
 
     payload = {
         "status": "ok",
@@ -1150,9 +1178,16 @@ def _cmd_explain(args: argparse.Namespace) -> int:
             apply_selection=False,
         )
 
-    explanations = engineer.explain_features()
-    code = engineer.get_feature_code()
-    feature_names = engineer.get_feature_names()
+        # ``explain_features`` / ``get_feature_code`` consult engine
+        # internals that may legitimately emit pandas / pyarrow warnings
+        # (e.g. when stringifying expression trees touching deprecated
+        # APIs). Keep them inside the capture so any such warning ends
+        # up in the JSON payload's ``warnings`` field instead of bleeding
+        # to stderr — same "stderr reserved for failures" contract as
+        # the read / fit_transform phases above.
+        explanations = engineer.explain_features()
+        code = engineer.get_feature_code()
+        feature_names = engineer.get_feature_names()
 
     payload = {
         "status": "ok",
diff --git a/tests/test_cli.py b/tests/test_cli.py
index 84928a1..2f8c1f3 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -3006,6 +3006,156 @@ def fake_main(argv):
         assert h.stream is sys.stderr or h.stream is sys.__stderr__
 
 
+# ----------------------- empty-input column-vs-row distinction
+
+
+def test_transform_zero_columns_input_distinguishes_from_zero_rows(tmp_path: Path):
+    """``DataFrame.empty`` is ``True`` for both zero-row AND zero-column
+    frames. The CLI must distinguish: a JSON array of empty objects
+    ``[{}, {}, ...]`` is a zero-COLUMN input (the user has no feature
+    columns), not a zero-ROW input. The error message must point at
+    the actual problem so callers can take the right remediation.
+    """
+    # JSON array of empty objects: pandas reads this as a frame with
+    # rows but no columns.
+    p = tmp_path / "empty_columns.json"
+    p.write_text("[{}, {}, {}]")
+    rc, _out, err = _run(["transform", "--input", str(p), "--output", str(tmp_path / "out.csv"), "--target", "y"])
+    assert rc == 2
+    assert "no columns" in err.lower(), err
+    assert "feature column" in err.lower(), err
+    assert "zero data rows" not in err.lower(), err
+
+
+def test_transform_zero_rows_input_still_uses_zero_rows_message(tmp_path: Path):
+    """The zero-row case (header but no data) still surfaces the
+    distinct "zero data rows" wording so the two failure modes are
+    distinguishable in CLI output.
+    """
+    p = tmp_path / "header_only.csv"
+    p.write_text("x1,x2,y\n")
+    rc, _out, err = _run(["transform", "--input", str(p), "--output", str(tmp_path / "out.csv"), "--target", "y"])
+    assert rc == 2
+    assert "zero data rows" in err.lower(), err
+    assert "no columns" not in err.lower(), err
+
+
+# ----------------------- transform read/write warnings captured (not stderr)
+
+
+def test_transform_read_warning_captured_not_on_stderr(tmp_path: Path, monkeypatch):
+    """``pd.read_csv`` can legitimately emit ``DtypeWarning`` on a
+    successful read with mixed-type columns. That warning must end up
+    in the JSON ``warnings`` field, NOT on stderr — the contract is
+    that successful runs keep stderr empty for agent callers.
+    """
+    import pandas as _pd
+
+    # Build a valid CSV input that ``fit_transform`` can actually process.
+    rng = np.random.default_rng(0)
+    df = pd.DataFrame(
+        {
+            "x1": rng.normal(size=50),
+            "x2": rng.integers(0, 5, size=50),
+            "y": rng.integers(0, 2, size=50),
+        }
+    )
+    in_path = tmp_path / "in.csv"
+    df.to_csv(in_path, index=False)
+    out_path = tmp_path / "out.csv"
+
+    # Patch ``pd.read_csv`` so each call emits a real Python warning (a
+    # ``UserWarning`` standing in for ``DtypeWarning`` on a successful
+    # read) and then delegates to the real reader unchanged.
+    real_read_csv = _pd.read_csv
+
+    def warning_emitting_read_csv(*a, **kw):
+        warnings.warn("pandas-mock-read-csv: DtypeWarning equivalent", UserWarning, stacklevel=2)
+        return real_read_csv(*a, **kw)
+
+    monkeypatch.setattr(_pd, "read_csv", warning_emitting_read_csv)
+
+    rc, out, err = _run(
+        [
+            "transform",
+            "--input",
+            str(in_path),
+            "--output",
+            str(out_path),
+            "--target",
+            "y",
+            "--no-selection",
+            "--json",
+        ]
+    )
+    assert rc == 0, err
+    assert err == "", f"read-time warning leaked to stderr: {err!r}"
+    payload = json.loads(out)
+    assert any("pandas-mock-read-csv" in w for w in payload["warnings"]), payload["warnings"]
+
+
+def test_transform_write_warning_captured_not_on_stderr(tmp_path: Path, monkeypatch, tabular_csv: Path):
+    """Pandas/pyarrow can legitimately emit ``FutureWarning`` /
+    ``UserWarning`` during ``DataFrame.to_csv`` / ``to_parquet`` /
+    ``to_json`` on a successful write. Those warnings must end up in
+    the JSON ``warnings`` field, NOT on stderr.
+    """
+    out_path = tmp_path / "out.csv"
+
+    # Patch ``DataFrame.to_csv`` so calling it emits a warning while
+    # still actually writing the file.
+    real_to_csv = pd.DataFrame.to_csv
+
+    def warning_emitting_to_csv(self, *a, **kw):
+        warnings.warn("pandas-mock-to-csv: FutureWarning equivalent", FutureWarning, stacklevel=2)
+        return real_to_csv(self, *a, **kw)
+
+    monkeypatch.setattr(pd.DataFrame, "to_csv", warning_emitting_to_csv)
+
+    rc, out, err = _run(
+        [
+            "transform",
+            "--input",
+            str(tabular_csv),
+            "--output",
+            str(out_path),
+            "--target",
+            "y",
+            "--no-selection",
+            "--json",
+        ]
+    )
+    assert rc == 0, err
+    assert err == "", f"write-time warning leaked to stderr: {err!r}"
+    payload = json.loads(out)
+    assert any("pandas-mock-to-csv" in w for w in payload["warnings"]), payload["warnings"]
+
+
+# ----------------------- explain captures explain_features warnings
+
+
+def test_explain_features_warnings_captured_not_on_stderr(tmp_path: Path, monkeypatch, tabular_csv: Path):
+    """``explain_features`` / ``get_feature_code`` are now inside the
+    same capture as the read + ``fit_transform``, so any warning they
+    emit goes to the JSON ``warnings`` field, not stderr.
+    """
+    from featcopilot.transformers import sklearn_compat as _sc
+
+    real_explain = _sc.AutoFeatureEngineer.explain_features
+
+    def warning_emitting_explain(self):
+        warnings.warn("explain-features-mock-warning", UserWarning, stacklevel=2)
+        return real_explain(self)
+
+    monkeypatch.setattr(_sc.AutoFeatureEngineer, "explain_features", warning_emitting_explain)
+
+    rc, out, err = _run(["explain", "--input", str(tabular_csv), "--target", "y"])
+    assert rc == 0, err
+    assert err == "", f"explain_features warning leaked to stderr: {err!r}"
+    payload = json.loads(out)
+    assert any("explain-features-mock-warning" in w for w in payload["warnings"]), payload["warnings"]
+
+
 # ----------------------- explain --explain-sample-size warning hygiene