
Commit 07b4cf4

test: add staged pytest integration suite (#182)
* test: add staged pytest integration suite
* fix: clean up verbose integration logging
* chore: bump version to 1.3.5
* fix: make staged integration framework-aware on resume
* chore: clean catboost artifacts in staged integration runner
* chore: address review feedback on staged integration runner
1 parent 89f8083 commit 07b4cf4

20 files changed

Lines changed: 843 additions & 7 deletions

.gitignore

Lines changed: 1 addition & 0 deletions
```diff
@@ -174,6 +174,7 @@ cython_debug/
 
 # Working directory for model generation
 workdir/
+catboost_info/
 
 # Files generated by running the MLE Bench script
 mle-bench-config.yaml
```

AGENTS.md

Lines changed: 2 additions & 0 deletions
```diff
@@ -81,6 +81,8 @@ make build-databricks # Databricks Connect
 
 # Run tests
 poetry run pytest tests/unit/
+make test-integration          # Staged pytest integration suite (seed -> search -> eval)
+make test-integration-verbose  # Same suite with live test logs in terminal
 
 # Format and lint
 poetry run black .
```

CONTRIBUTING.md

Lines changed: 15 additions & 0 deletions
````diff
@@ -91,6 +91,21 @@ To set up the development environment:
    poetry run pytest
    ```
 
+4. **Run staged integration tests before opening a PR**:
+
+   ```bash
+   # Requires ANTHROPIC_API_KEY and local Spark/Java setup
+   bash scripts/tests/run_integration_staged.sh
+   ```
+
+   The staged suite runs three pytest phases with hard barriers:
+   - `integration_seed`: builds reusable checkpoints through phase 3
+   - `integration_search`: resumes from seeds and runs model search
+   - `integration_eval`: resumes from search checkpoints, runs evaluation, and validates predictor inference
+
+   This `tests/integration` suite is the primary pre-PR integration workflow.
+   Makefile Docker targets remain optional/manual end-to-end checks.
+
 Ensure all tests pass before making contributions.
 
 ## Style Guides
````
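The hard-barrier behaviour described above — a later stage runs only if every earlier stage passed — can be sketched in a few lines of Python (names like `STAGES`, `stage_command`, and `run_staged` are illustrative, not part of the repo):

```python
import subprocess
import sys

# The three stage markers, in dependency order.
STAGES = ["integration_seed", "integration_search", "integration_eval"]


def stage_command(marker: str, path: str = "tests/integration") -> list[str]:
    """Build the pytest invocation for one stage."""
    return ["poetry", "run", "pytest", path, "-m", marker, "--maxfail=1"]


def run_staged() -> None:
    """Run each stage as a hard barrier: stop at the first failure."""
    for marker in STAGES:
        result = subprocess.run(stage_command(marker))
        if result.returncode != 0:
            sys.exit(result.returncode)
```

The real entry point is `scripts/tests/run_integration_staged.sh`, which adds worker selection and verbose-logging options on top of this core loop.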

Makefile

Lines changed: 26 additions & 0 deletions
```diff
@@ -4,6 +4,8 @@
 # Quick reference for developers:
 #   make help                      Show all available commands
 #   make test-quick                Fast test (~30s, 1 iteration)
+#   make test-integration          Staged pytest integration suite
+#   make test-integration-verbose  Staged suite with live logs
 #   make test-xgboost              Test XGBoost only
 #   make test-catboost             Test CatBoost only
 #   make test-all-models           Test all model types
@@ -36,6 +38,8 @@ help:
 	@echo "  make test-lightgbm    Test LightGBM model type"
 	@echo "  make test-pytorch     Test PyTorch model type"
 	@echo "  make test-keras       Test Keras model type"
+	@echo "  make test-integration          Run staged pytest integration suite"
+	@echo "  make test-integration-verbose  Run staged suite with live logs"
 	@echo "  make test-all-models  Test all model types (sequential)"
 	@echo "  make test-full        Full test run (3 iterations + evaluation)"
 	@echo ""
@@ -61,6 +65,28 @@ help:
 # Quick Development Tests
 # ============================================
 
+# Staged pytest-native integration suite (seed -> search -> eval).
+# Optional: make test-integration INTEGRATION_RUN_ID=my_run_id
+.PHONY: test-integration
+test-integration:
+	@echo "🧪 Running staged pytest integration suite..."
+	@if [ -n "$(INTEGRATION_RUN_ID)" ]; then \
+		echo "Using integration run id: $(INTEGRATION_RUN_ID)"; \
+		PLEXE_IT_RUN_ID="$(INTEGRATION_RUN_ID)" bash scripts/tests/run_integration_staged.sh; \
+	else \
+		bash scripts/tests/run_integration_staged.sh; \
+	fi
+
+.PHONY: test-integration-verbose
+test-integration-verbose:
+	@echo "🧪 Running staged pytest integration suite (verbose)..."
+	@if [ -n "$(INTEGRATION_RUN_ID)" ]; then \
+		echo "Using integration run id: $(INTEGRATION_RUN_ID)"; \
+		PLEXE_IT_RUN_ID="$(INTEGRATION_RUN_ID)" PLEXE_IT_VERBOSE=1 bash scripts/tests/run_integration_staged.sh; \
+	else \
+		PLEXE_IT_VERBOSE=1 bash scripts/tests/run_integration_staged.sh; \
+	fi
+
 # Fast sanity check - 1 iteration, minimal config
 .PHONY: test-quick
 test-quick: build
```

plexe/config.py

Lines changed: 3 additions & 0 deletions
```diff
@@ -562,6 +562,9 @@ def setup_logging(config: Config) -> logging.Logger:
     # Get package root logger
     package_logger = logging.getLogger("plexe")
     package_logger.setLevel(getattr(logging, config.log_level.upper()))
+    # Avoid duplicate output when external handlers (e.g., pytest live logging)
+    # are attached to the root logger.
+    package_logger.propagate = False
 
     # Clear existing handlers to avoid duplicates
     package_logger.handlers = []
```
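For context on why this one-line change works: a logger with `propagate = True` (the default) hands every record to its ancestors' handlers in addition to its own, regardless of the levels set on the ancestor loggers. A minimal, self-contained illustration (the `demo_pkg` logger name is arbitrary):

```python
import logging


class CountingHandler(logging.Handler):
    """Handler that simply counts emitted records."""

    def __init__(self):
        super().__init__()
        self.count = 0

    def emit(self, record):
        self.count += 1


root_handler = CountingHandler()
pkg_handler = CountingHandler()

logging.getLogger().addHandler(root_handler)  # stands in for pytest live logging
pkg = logging.getLogger("demo_pkg")
pkg.addHandler(pkg_handler)
pkg.setLevel(logging.INFO)

pkg.info("hello")        # propagate=True (default): both handlers fire
pkg.propagate = False
pkg.info("hello again")  # now only the package handler fires

# root_handler.count == 1, pkg_handler.count == 2
```

This is why attaching handlers to both `plexe` and the root logger used to print each line twice under pytest's live logging.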

plexe/execution/dataproc/session.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -109,8 +109,8 @@ def _create_local_spark(config) -> SparkSession:
         logger.info("Using pre-bundled Spark JARs from /opt/spark-jars/")
         builder = builder.config("spark.jars", spark_jars_env)
     else:
-        # Fallback: Download JARs at runtime via Maven (local development)
-        logger.info("Downloading Spark JARs from Maven Central (first run may take ~40s)")
+        # Fallback: Resolve JARs via Maven (download occurs only on cache miss)
+        logger.info("Resolving Spark JARs via Maven Central (download only on first run/cache miss)")
         builder = builder.config(
             "spark.jars.packages",
             "org.apache.hadoop:hadoop-aws:3.3.6,com.amazonaws:aws-java-sdk-bundle:1.12.367",
```

plexe/workflow.py

Lines changed: 29 additions & 0 deletions
```diff
@@ -76,6 +76,33 @@
 # ============================================
 
 
+def _apply_allowed_model_types_on_resume(context: BuildContext, config: Config, start_phase: int) -> None:
+    """Restrict checkpoint-resumed model types to config.allowed_model_types when provided."""
+    if start_phase <= 1 or not config.allowed_model_types:
+        return
+
+    allowed_types = list(dict.fromkeys(config.allowed_model_types))
+    if not context.viable_model_types:
+        context.viable_model_types = allowed_types
+        logger.info(f"Checkpoint missing viable model types; using allowed model types: {allowed_types}")
+        return
+
+    filtered_model_types = [m for m in context.viable_model_types if m in allowed_types]
+    if not filtered_model_types:
+        raise ValueError(
+            "No model types remain after applying allowed_model_types on resume: "
+            f"checkpoint={context.viable_model_types}, allowed={allowed_types}"
+        )
+
+    if filtered_model_types != context.viable_model_types:
+        logger.info(
+            "Restricting resumed model types from checkpoint %s to %s",
+            context.viable_model_types,
+            filtered_model_types,
+        )
+        context.viable_model_types = filtered_model_types
+
+
 def build_model(
     spark: SparkSession,
     train_dataset_uri: str,
@@ -182,6 +209,8 @@ def build_model(
         context.scratch["_user_feedback"] = user_feedback
         logger.info("📝 User feedback injected - agents will incorporate guidance into their work")
 
+    _apply_allowed_model_types_on_resume(context, config, start_phase)
+
     # Wrap entire workflow in top-level trace span
     with tracer.start_as_current_span("ModelBuilder") as root_span:
         root_span.set_attribute("experiment_id", experiment_id)
```
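Stripped of the `BuildContext`/`Config` plumbing, the resume-time restriction is a small pure function; a hypothetical standalone sketch of the same semantics:

```python
def filter_resumed_model_types(checkpoint_types: list[str], allowed_types: list[str]) -> list[str]:
    """Mirror of _apply_allowed_model_types_on_resume, minus context/config plumbing."""
    allowed = list(dict.fromkeys(allowed_types))  # dedupe while preserving order
    if not checkpoint_types:
        return allowed                            # checkpoint had none: fall back to allowed
    filtered = [m for m in checkpoint_types if m in allowed]
    if not filtered:
        # Disjoint sets are a configuration error, not something to silently ignore.
        raise ValueError("No model types remain after applying allowed_model_types on resume")
    return filtered
```

For example, a checkpoint holding `["xgboost", "catboost"]` resumed under `allowed_model_types=["catboost", "keras"]` is narrowed to `["catboost"]`, while a fully disjoint combination raises.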

poetry.lock

Lines changed: 37 additions & 1 deletion

pyproject.toml

Lines changed: 9 additions & 1 deletion
```diff
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "plexe"
-version = "1.3.4"
+version = "1.3.5"
 description = "An agentic framework for building ML models from natural language"
 authors = [
     "Marcello De Bernardi <mdebernardi@plexe.ai>",
@@ -84,13 +84,21 @@ vision = ["torch"]
 
 [tool.poetry.group.dev.dependencies]
 pytest = "^8.3.4"
+pytest-xdist = "^3.8.0"
 pre-commit = "^4.0.1"
 ruff = "^0.14.9"
 black = ">=23.0.0"
 streamlit = ">=1.52.1,<2.0.0"
 plotly = ">=6.5.0,<7.0.0"
 boto3 = "^1.42.44"
 
+[tool.pytest.ini_options]
+markers = [
+    "integration_seed: stage 1 integration tests that build reusable checkpoints through phase 3",
+    "integration_search: stage 2 integration tests that resume from seeds and pause after phase 4",
+    "integration_eval: stage 3 integration tests that resume from search checkpoints and run evaluation + packaging",
+]
+
 [tool.semantic_release]
 version_variables = ["pyproject.toml:version"]
 commit_parser = "angular"
```
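Registering the markers under `[tool.pytest.ini_options]` silences pytest's unknown-mark warnings and lets test modules opt into a stage with standard decorators; a hypothetical test module might look like:

```python
import pytest


@pytest.mark.integration_seed
def test_build_seed_checkpoint():
    """Stage 1: would build a reusable checkpoint through phase 3."""
    assert True  # placeholder body


@pytest.mark.integration_eval
def test_predictor_inference():
    """Stage 3: would validate predictor inference from a search checkpoint."""
    assert True  # placeholder body
```

Selection then happens per stage with `pytest -m integration_seed` (and so on), which is exactly how the staged runner invokes each barrier.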
scripts/tests/run_integration_staged.sh (new file)

Lines changed: 87 additions & 0 deletions

```bash
#!/usr/bin/env bash
set -euo pipefail

ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
cd "$ROOT_DIR"

CATBOOST_INFO_DIR="$ROOT_DIR/catboost_info"

cleanup_catboost_info() {
  if [[ "${PLEXE_IT_KEEP_CATBOOST_INFO:-0}" == "1" ]]; then
    return
  fi
  rm -rf "$CATBOOST_INFO_DIR"
}

# Remove stale CatBoost local artifacts from previous runs.
cleanup_catboost_info
# Keep repo clean even if a stage fails midway.
trap cleanup_catboost_info EXIT

if [[ -z "${PLEXE_IT_RUN_ID:-}" ]]; then
  PLEXE_IT_RUN_ID="$(date +%Y%m%d_%H%M%S)"
fi
export PLEXE_IT_RUN_ID

ARTIFACT_ROOT="$ROOT_DIR/.pytest_cache/integration/$PLEXE_IT_RUN_ID"
mkdir -p "$ARTIFACT_ROOT"

if ! poetry run python -c "import importlib.util,sys; sys.exit(0 if importlib.util.find_spec('xdist') else 1)"; then
  echo "ERROR: pytest-xdist is required for staged integration tests."
  echo "Install dependencies with: poetry install"
  echo "Then verify with: poetry run pytest --help | grep -E '(^| )-n( |$)'"
  exit 2
fi

if [[ -n "${PLEXE_IT_WORKERS:-}" ]]; then
  WORKERS="${PLEXE_IT_WORKERS}"
elif [[ "${PLEXE_IT_VERBOSE:-0}" == "1" ]]; then
  # In verbose mode, default to main-process execution for reliable live logs.
  WORKERS="0"
else
  WORKERS="auto"
fi
PYTEST_PARALLEL_ARGS=(-n "$WORKERS")
PYTEST_LOG_DISABLE_ARGS=(
  --log-disable=LiteLLM
  --log-disable=litellm
  --log-disable=httpx
  --log-disable=httpcore
  --log-disable=urllib3
  --log-disable=py4j
  --log-disable=py4j.clientserver
  --log-disable=py4j.java_gateway
)

run_stage() {
  local marker="$1"
  local cmd=(poetry run pytest tests/integration -m "$marker" "${PYTEST_PARALLEL_ARGS[@]}" --maxfail=1)

  if [[ "${PLEXE_IT_VERBOSE:-0}" == "1" ]]; then
    cmd+=(-s -vv -o log_cli=true -o log_cli_level=INFO --capture=tee-sys "${PYTEST_LOG_DISABLE_ARGS[@]}")
  fi

  "${cmd[@]}"
}

echo "Running staged integration tests with run id: $PLEXE_IT_RUN_ID"
echo "Artifacts: $ARTIFACT_ROOT"
echo "Workers: $WORKERS"
if [[ "${PLEXE_IT_VERBOSE:-0}" == "1" ]]; then
  echo "Verbose mode: enabled (live logs and test output)"
fi

echo ""
echo "Stage 1/3: building reusable seeds through phase 3"
run_stage "integration_seed"

echo ""
echo "Stage 2/3: resuming from seeds through phase 4"
run_stage "integration_search"

echo ""
echo "Stage 3/3: final evaluation, packaging, and predictor checks"
run_stage "integration_eval"

echo ""
echo "Staged integration suite completed successfully."
```
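The pytest-xdist guard near the top of the script relies on `importlib.util.find_spec`, which reports whether a module is importable without actually importing it; the same check in plain Python (the helper name is illustrative):

```python
import importlib.util


def module_available(name: str) -> bool:
    """Return True if `name` can be imported, without importing it."""
    return importlib.util.find_spec(name) is not None


# The script exits with status 2 when this is False for "xdist".
print(module_available("json"))                      # stdlib module, always importable
print(module_available("no_such_module_hopefully"))  # missing top-level module
```

Checking the spec instead of running `import xdist` keeps the guard cheap and avoids triggering any import-time side effects.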
