From 27eba73d40522b737f01500c8f67d467e21f99c5 Mon Sep 17 00:00:00 2001 From: Marko Lekic Date: Thu, 27 Nov 2025 16:06:10 +0100 Subject: [PATCH 1/3] Implemented hooks and added hooks demo --- .github/workflows/ci.yml | 19 +- .github/workflows/release.yml | 11 +- docs/Cost_Monitoring.md | 106 +- docs/Hooks.md | 447 ++++++++ docs/examples/Hooks_Demo.md | 383 +++++++ examples/.env.dev_postgres | 3 + examples/api_demo/profiles.yml | 6 +- examples/basic_demo/profiles.yml | 6 +- examples/ci_demo/profiles.yml | 6 +- examples/dq_demo/profiles.yml | 6 +- .../hooks_demo/.env.dev_bigquery_bigframes | 7 + examples/hooks_demo/.env.dev_bigquery_pandas | 7 + examples/hooks_demo/.env.dev_databricks | 14 + examples/hooks_demo/.env.dev_duckdb | 2 + examples/hooks_demo/.env.dev_postgres | 3 + examples/hooks_demo/.env.dev_snowflake | 18 + examples/hooks_demo/Makefile | 120 +++ examples/hooks_demo/README.md | 23 + examples/hooks_demo/hooks/audit_run_end.sql | 12 + .../hooks_demo/hooks/audit_run_end_spark.sql | 11 + examples/hooks_demo/hooks/notify.py | 93 ++ examples/hooks_demo/models/README.md | 19 + .../models/marts/mart_events_daily.ff.sql | 27 + .../models/staging/events_clean.ff.sql | 36 + examples/hooks_demo/profiles.yml | 58 ++ examples/hooks_demo/project.yml | 303 ++++++ examples/hooks_demo/seeds/README.md | 4 + examples/hooks_demo/seeds/seed_events.csv | 6 + examples/hooks_demo/sources.yml | 8 + examples/hooks_demo/tests/dq/README.md | 4 + examples/hooks_demo/tests/unit/README.md | 4 + examples/incremental_demo/profiles.yml | 16 +- examples/macros_demo/profiles.yml | 6 +- examples/materializations_demo/profiles.yml | 6 +- examples/snapshot_demo/profiles.yml | 16 +- .../models/staging/stg_users.ff.sql | 4 +- .../building_locally_demo/profiles.yml | 6 +- .../building_locally_demo/project.yml | 11 +- .../seeds/{users.csv => seed_users.csv} | 0 .../building_locally_demo/sources.yml | 14 +- .../tests/unit/stg_users.yml | 13 + mkdocs.yml | 2 + pyproject.toml | 2 +- 
src/fastflowtransform/cli/run.py | 968 ++++++++++++++++-- src/fastflowtransform/config/models.py | 23 + src/fastflowtransform/config/project.py | 74 +- src/fastflowtransform/core.py | 65 +- src/fastflowtransform/executors/base.py | 8 + .../executors/bigquery/base.py | 22 +- .../executors/databricks_spark.py | 15 + src/fastflowtransform/executors/duckdb.py | 9 +- src/fastflowtransform/executors/postgres.py | 8 + .../executors/snowflake_snowpark.py | 16 + src/fastflowtransform/hooks/registry.py | 111 ++ src/fastflowtransform/hooks/types.py | 84 ++ src/fastflowtransform/logging.py | 12 +- src/fastflowtransform/utest.py | 4 +- tests/common/mock/bigquery.py | 9 + tests/integration/examples/config.py | 16 + .../executors/test_bigquery_bf_exec_unit.py | 16 + uv.lock | 2 +- 61 files changed, 3146 insertions(+), 184 deletions(-) create mode 100644 docs/Hooks.md create mode 100644 docs/examples/Hooks_Demo.md create mode 100644 examples/.env.dev_postgres create mode 100644 examples/hooks_demo/.env.dev_bigquery_bigframes create mode 100644 examples/hooks_demo/.env.dev_bigquery_pandas create mode 100644 examples/hooks_demo/.env.dev_databricks create mode 100644 examples/hooks_demo/.env.dev_duckdb create mode 100644 examples/hooks_demo/.env.dev_postgres create mode 100644 examples/hooks_demo/.env.dev_snowflake create mode 100644 examples/hooks_demo/Makefile create mode 100644 examples/hooks_demo/README.md create mode 100644 examples/hooks_demo/hooks/audit_run_end.sql create mode 100644 examples/hooks_demo/hooks/audit_run_end_spark.sql create mode 100644 examples/hooks_demo/hooks/notify.py create mode 100644 examples/hooks_demo/models/README.md create mode 100644 examples/hooks_demo/models/marts/mart_events_daily.ff.sql create mode 100644 examples/hooks_demo/models/staging/events_clean.ff.sql create mode 100644 examples/hooks_demo/profiles.yml create mode 100644 examples/hooks_demo/project.yml create mode 100644 examples/hooks_demo/seeds/README.md create mode 100644 
examples/hooks_demo/seeds/seed_events.csv create mode 100644 examples/hooks_demo/sources.yml create mode 100644 examples/hooks_demo/tests/dq/README.md create mode 100644 examples/hooks_demo/tests/unit/README.md rename examples_article/building_locally_demo/seeds/{users.csv => seed_users.csv} (100%) create mode 100644 examples_article/building_locally_demo/tests/unit/stg_users.yml create mode 100644 src/fastflowtransform/hooks/registry.py create mode 100644 src/fastflowtransform/hooks/types.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index e646bb5..466ae27 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -109,7 +109,7 @@ jobs: fail-fast: false matrix: include: - # DuckDB examples + # ---------- DuckDB examples ---------- - engine: duckdb extra: "" example: api_demo @@ -130,6 +130,10 @@ jobs: extra: "" example: dq_demo env_file: examples/dq_demo/.env.dev_duckdb + - engine: duckdb + extra: "" + example: hooks_demo + env_file: examples/hooks_demo/.env.dev_duckdb - engine: duckdb extra: "" example: incremental_demo @@ -142,7 +146,7 @@ jobs: extra: "" example: materializations_demo env_file: examples/materializations_demo/.env.dev_duckdb - # Postgres examples + # ---------- Postgres examples ---------- - engine: postgres extra: "postgres" example: api_demo @@ -163,6 +167,10 @@ jobs: extra: "postgres" example: dq_demo env_file: examples/dq_demo/.env.dev_postgres + - engine: postgres + extra: "postgres" + example: hooks_demo + env_file: examples/hooks_demo/.env.dev_postgres - engine: postgres extra: "postgres" example: incremental_demo @@ -175,7 +183,7 @@ jobs: extra: "postgres" example: materializations_demo env_file: examples/materializations_demo/.env.dev_postgres - # Spark examples + # ---------- Spark examples ---------- - engine: databricks_spark extra: "spark" example: api_demo @@ -201,6 +209,11 @@ jobs: example: dq_demo java: true env_file: examples/dq_demo/.env.dev_databricks + - engine: databricks_spark + extra: 
"spark" + example: hooks_demo + java: true + env_file: examples/hooks_demo/.env.dev_databricks - engine: databricks_spark extra: "spark" example: incremental_demo diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index ffccf58..d9e8f99 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -31,14 +31,19 @@ jobs: id: meta run: | python - << 'PY' - import pathlib, tomllib + import os + import pathlib + import tomllib data = tomllib.loads(pathlib.Path("pyproject.toml").read_text()) version = data.get("project", {}).get("version") if not version: - raise SystemExit("No [project].version found in pyproject.toml") + raise SystemExit("No [project].version found in pyproject.toml") + print(f"Version: {version}") - with open(os.environ["GITHUB_OUTPUT"], "a", encoding="utf8") as fh: + + out_path = os.environ["GITHUB_OUTPUT"] + with open(out_path, "a", encoding="utf8") as fh: fh.write(f"version={version}\n") PY diff --git a/docs/Cost_Monitoring.md b/docs/Cost_Monitoring.md index 68cde22..6eac864 100644 --- a/docs/Cost_Monitoring.md +++ b/docs/Cost_Monitoring.md @@ -221,54 +221,76 @@ my_project/ ```yaml version: 1 -# Default behaviour when a budget is exceeded. -# One of: "warn", "error" -defaults: - on_exceed: "warn" - -budgets: - # Global cap for total bytes scanned by all models. - total_bytes_scanned: - warn_after: "10GB" # log a warning above this - error_after: "50GB" # fail the run above this - - # Optional: total query time across all models. 
- total_query_duration_ms: - warn_after: 600000 # 10 minutes - error_after: 3600000 # 60 minutes - - # Per-engine budgets - by_engine: - bigquery: - bytes_scanned: - warn_after: "5GB" - error_after: "20GB" - - snowflake_snowpark: - bytes_scanned: - warn_after: "2GB" - error_after: "10GB" - - # Optional per-model budgets (key = model name) - per_model: - fct_events: - bytes_scanned: - warn_after: "500MB" - error_after: "2GB" - - dim_users: - bytes_scanned: - warn_after: "100MB" +# Per-engine query limits (applied before executing individual queries) +query_limits: + duckdb: + max_bytes: 5_000_000 + postgres: + max_bytes: 10_000_000 + bigquery: + max_bytes: 50_000_000 + databricks_spark: + max_bytes: 50_000_000 + snowflake_snowpark: + max_bytes: 50_000_000 + +# Global limits across the entire fft run +total: + bytes_scanned: + # ~10 MB – adjust down if you want to force a warning + warn: 100 + # ~100 MB – adjust down if you want to force an error + error: 100_000_000 + + # Optional: total query time across all queries in the run + query_duration_ms: + warn: "30s" # human-friendly duration, parsed to ms + error: "2m" + +# Per-model limits (keys must match node names: stg_users.ff, mart_user_orders.ff, http_users, ...) 
+models: + stg_users.ff: + bytes_scanned: + # keep this fairly low so you can see a warn if you want + warn: 100 + error: 10_000_000 + + stg_orders.ff: + bytes_scanned: + warn: 1_000_000 + error: 10_000_000 + + mart_user_orders.ff: + bytes_scanned: + warn: 1_000_000 + error: 100_000_000 + + http_users: + # HTTP model → mainly interesting on engines that can report bytes_scanned + bytes_scanned: + warn: 5_000_000 + error: 50_000_000 + + py_constants: + bytes_scanned: + warn: 5_000_000 + error: 50_000_000 + +# Per-tag budgets (aggregated over all models with that tag) +tags: + "example:cache_demo": + bytes_scanned: + warn: 10_000_000 ``` #### Value syntax -* `warn_after` / `error_after` for **bytes** use the same notation as +* `warn` / `error` for **bytes** use the same notation as `FF_*_MAX_BYTES`: ```yaml - warn_after: "5GB" - error_after: "500_000_000" + warn: "5GB" + error: "500_000_000" ``` * Durations are currently in **milliseconds** (plain integers). @@ -290,7 +312,7 @@ budgets: * If an **error** budget is exceeded, FFT treats it like a failed run and exits with `1`. -If both `warn_after` and `error_after` are defined and exceeded, the **error** +If both `warn` and `error` are defined and exceeded, the **error** behaviour wins. ### Interaction with env-level guards diff --git a/docs/Hooks.md b/docs/Hooks.md new file mode 100644 index 0000000..efe3719 --- /dev/null +++ b/docs/Hooks.md @@ -0,0 +1,447 @@ +# Hooks + +Hooks let you plug custom behavior into a FastFlowTransform run without changing your models. +They’re mainly used for: + +* Auditing (run & model audit tables) +* Notifications/logging +* Lightweight data quality checks +* Custom side-effects around model execution + +This page explains: + +* **Lifecycle events** you can hook into +* How to **configure hooks in `project.yml`** +* How to write **Python hooks with `@fft_hook`** +* How to write **SQL hooks (inline & file-based)** +* What **context** each hook receives + +--- + +## 1.
Lifecycle events + +There are two scopes: + +* **Run-level hooks** – fire once per `fft run` +* **Model-level hooks** – fire for each model (either globally via `select` or directly on the model) + +### Run-level events + +| Event | When it fires | Config key in `project.yml` | Typical use | +| -------------- | ---------------------------------------------------- | --------------------------- | ------------------------------- | +| `on_run_start` | Right after the project loads, before any model runs | `hooks.on_run_start` | Create audit tables, banners | +| `on_run_end` | After all models run & budgets are evaluated | `hooks.on_run_end` | Final audit row, summary, alert | + +> **Note:** `on_run_end` is invoked even if some models fail. +> The hook receives a `run.status` of `"success"` or `"error"`. + +### Model-level events + +These are defined in `project.yml` under `hooks:` and applied to *models matching a selector*. + +| Event | Meaning | Config key in `project.yml` | +| -------------- | ------------------------------------------------- | --------------------------- | +| `before_model` | Right before a model starts (per matching model) | `hooks.before_model` | +| `after_model` | Right after a model finishes (per matching model) | `hooks.after_model` | + +Under the hood, these are attached to the model’s meta and executed as “pre/post hooks” around the model. + +--- + +## 2. Configuring hooks in `project.yml` + +Hooks live under the top-level `hooks:` key in `project.yml`. + +### 2.1 Run-level hooks + +Each hook entry is a **HookSpec** with at least: + +* `name`: logical name of the hook +* `kind`: `"sql"` or `"python"` +* optional `engines`: list of engine names to restrict execution (e.g. 
`["duckdb", "bigquery"]`) +* optional `sql`: inline SQL body (for SQL hooks) +* optional `params`: extra free-form values passed to Python hook context + +Example: + +```yaml +hooks: + on_run_start: + - name: create_audit_tables + kind: sql + sql: | + create table if not exists _ff_run_audit ( + run_id text, + started_at timestamp, + finished_at timestamp, + status text, + env text, + engine text + ); + + - name: audit_run_start + kind: sql + sql: | + insert into _ff_run_audit (run_id, started_at, status, env, engine) + values ( + {{ run.run_id | sql_literal }}, + current_timestamp, + 'running', + {{ run.env_name | sql_literal }}, + {{ run.engine_name | sql_literal }} + ); + + - name: python_banner + kind: python + + on_run_end: + - name: audit_run_end + kind: sql # SQL body lives in hooks/audit_run_end.sql + - name: python_summary + kind: python +``` + +Notes: + +* For **Python hooks**, `name` must match the registration name in the decorator (see below). +* For **SQL hooks**, the SQL can be inline (`sql:`) or come from a `.sql` file in `hooks/` (see section 4). + +--- + +### 2.2 Model-level hooks with `select` + +Model hooks can be attached **by selector**, not by hard-coding them into each model. 
+This is what `before_model` and `after_model` are for: + +```yaml +hooks: + before_model: + - name: model_start_audit + kind: sql + select: "tag:example:hooks_demo" + sql: | + insert into _ff_model_audit ( + run_id, + model_name, + event, + status, + started_at + ) + values ( + {{ run.run_id | sql_literal }}, + {{ model.name | sql_literal }}, + 'start', + 'running', + current_timestamp + ); + + after_model: + - name: model_end_audit + kind: sql + select: "tag:example:hooks_demo" + sql: | + update _ff_model_audit + set finished_at = current_timestamp, + status = 'success', + rows_affected = null, + elapsed_ms = null + where run_id = {{ run.run_id | sql_literal }} + and model_name = {{ model.name | sql_literal }} + and event = 'start'; + + - name: model_end_log_python + kind: python + select: "tag:scope:mart" +``` + +* `select` uses the same selector language as `fft run` (`tag:…`, `model:…`, etc.). +* The hook is executed for each model that matches the selector. + +--- + +## 3. Python hooks + +Python hooks live in the project’s `hooks/` directory (any subfolder), and are registered via the `@fft_hook` decorator. 
+ +### 3.1 Basic structure + +Example `hooks/notify.py`: + +```python +from __future__ import annotations +from typing import Any +from fastflowtransform.hooks.registry import fft_hook + +def _fmt_env(env: dict[str, Any]) -> str: + parts = [] + for key in ("FF_ENGINE", "FFT_ACTIVE_ENV"): + if key in env: + parts.append(f"{key}={env[key]}") + return ", ".join(parts) if parts else "" + +@fft_hook(name="python_banner", when="on_run_start") +def on_run_start(ctx: dict[str, Any]) -> None: + run = ctx.get("run", {}) + env = ctx.get("env", {}) + print( + f"[hooks_demo] on_run_start: run_id={run.get('run_id')} " + f"(env_name={run.get('env_name')}, engine={run.get('engine_name')}; {_fmt_env(env)})" + ) + +@fft_hook(name="python_summary", when="on_run_end") +def on_run_end(ctx: dict[str, Any]) -> None: + run = ctx.get("run", {}) + stats = ctx.get("stats", {}) or {} + print( + "[hooks_demo] on_run_end: run_id=%s status=%s (built=%s, skipped=%s, failed=%s)" + % ( + run.get("run_id"), + run.get("status"), + stats.get("models_built"), + stats.get("models_skipped"), + stats.get("models_failed"), + ) + ) + +@fft_hook(name="model_end_log_python", when="after_model") +def on_model_end(ctx: dict[str, Any]) -> None: + run = ctx.get("run", {}) + model = ctx.get("model", {}) + print( + "[hooks_demo] on_model_end: run_id=%s model=%s" + % (run.get("run_id"), model.get("name")) + ) +``` + +> All Python files under `hooks/**.py` are loaded when the run starts; their `@fft_hook` decorators populate the registry. + +### 3.2 The `@fft_hook` decorator + +```python +@fft_hook(name="python_banner", when="on_run_start") +def on_run_start(ctx: dict[str, Any]) -> None: + ... +``` + +* **`name`** + Logical name; must match `project.yml`’s `name` field for the hook. + If omitted, defaults to the function name. 
+ +* **`when`** + Lifecycle event this hook is for, e.g.: + + * `"on_run_start"` + * `"on_run_end"` + * `"before_model"` + * `"after_model"` + +> Only these values are accepted; anything else will raise at registration time. + +### 3.3 Python hook context + +Python hooks always receive a **single dictionary argument**, the *hook context*. + +Shape (simplified): + +```python +ctx: dict[str, Any] = { + "when": "on_run_start" | "on_run_end" | "before_model" | "after_model", + "run": { + "run_id": str, # unique run identifier + "env_name": str, # profile/env name (e.g. 'dev_duckdb') + "engine_name": str, # engine (e.g. 'duckdb') + "started_at": str, # ISO timestamp + "status": str | None, # on_run_end: 'success' | 'error' + "row_count": int | None, + "error": str | None, + }, + "model": { + "name": str, + "path": pathlib.Path, + "tags": list[str], + "meta": dict[str, Any], + # (future extensions: status/rows/elapsed/error for model events) + } | None, + "env": dict[str, str], # env vars relevant to FFT (FF_* etc.) + # Only for on_run_end: + "stats": { + "models_built": int, + "models_failed": int, + "models_skipped": int, + } | None, + # Plus any extra keys from HookSpec.params: + # e.g. ctx["slack_channel"], ctx["threshold"], ... +} +``` + +So for simple hooks you can do: + +```python +run = ctx["run"] +if ctx["when"] == "on_run_end" and run["status"] == "error": + ... +``` + +--- + +## 4. SQL hooks + +SQL hooks are just Jinja-templated SQL statements that are executed via your target engine. + +You can define them: + +1. **Inline** in `project.yml`; or +2. In a **`.sql` file under `hooks/`**, referenced by name. 
+ +### 4.1 Inline SQL + +Inline SQL was shown in the examples above: + +```yaml +- name: audit_run_start + kind: sql + sql: | + insert into _ff_run_audit (run_id, started_at, status, env, engine) + values ({{ run.run_id }}, current_timestamp, 'running', {{ run.env_name }}, {{ run.engine_name }}); +``` + +### 4.2 File-based SQL (`hooks/**/*.sql`) + +If `kind: sql` has **no `sql:` body**, FFT will look for a `.sql` file: + +* Root: `/hooks` +* Pattern: `hooks/**/.sql` +* `name` is the `HookSpec.name` from `project.yml` + +Example: + +```yaml +hooks: + on_run_end: + - name: audit_run_end + kind: sql # SQL body lives in hooks/audit_run_end.sql +``` + +File layout: + +```text +hooks/ + audit_run_end.sql + model_start_audit.sql + audit/ + complex_audit_for_marts.sql # name: complex_audit_for_marts +``` + +`hooks/audit_run_end.sql`: + +```sql +-- examples/hooks_demo/hooks/audit_run_end.sql +-- Update the run-level audit row when the run finishes. + +update _ff_run_audit +set + finished_at = current_timestamp, + status = 'success', + row_count = NULL, + error = NULL +where run_id = {{ run.run_id | sql_literal }}; +``` + +If no matching file is found, the run fails with a clear error. + +--- + +## 5. Jinja context for SQL hooks + +SQL hook templates are rendered with: + +* `run`: run context (similar to Python hooks, but with some fields already converted to SQL literals) +* `model`: model context (for model-level hooks), or `None` for pure run hooks +* `node`: alias of `model` + +### 5.1 `run` context + +In SQL hooks: + +* All `run.*` fields are **plain values** in the Jinja context. +* When inlining them into SQL, always pass them through `| sql_literal` to get a safe SQL literal. 
+ +Example: + +```sql +insert into _ff_run_audit (run_id, started_at, status, env, engine) +values ({{ run.run_id }}, current_timestamp, 'running', {{ run.env_name }}, {{ run.engine_name }}); +``` + +### 5.2 `model` context + +For model-level hooks: + +```jinja2 +{{ model.name }} -- logical model name ('events_clean.ff') +{{ model.path }} -- full filesystem path +{{ model.tags }} -- list of tags +{{ model.meta }} -- model meta dict from config(...) +``` + +You can use `model` or `node` – they’re the same object. + +### 5.3 `sql_literal` filter + +To safely inline values into SQL hook templates, use the `sql_literal` filter: + +* `None` or Jinja `Undefined` → `NULL` +* `bool` → `TRUE` / `FALSE` +* `int` / `float` → `123`, `1.23` +* `str` → single-quoted with internal quotes escaped +* Other types → JSON-dumped and then single-quoted + +Examples: + +```jinja2 +where run_id = {{ run.run_id | sql_literal }}; + +set status = {{ model.status | sql_literal }}; +set rows_affected = {{ model.rows_affected | sql_literal }}; +``` + +This helps avoid syntax errors and SQL injection in generated hook SQL. + +--- + +## 6. Error handling & logging + +* If a hook (SQL or Python) raises an exception, the run fails with a message like: + + > `Failed to execute on_run_end hook #1 for run: ...` + +* Hook execution is logged with the `[hooks]` prefix, for example: + + ```text + [FFT] [hooks] when=on_run_start node=: executing 3 hook(s): sql:create_audit_tables, sql:audit_run_start, python:python_banner + [FFT] [hooks] when=on_run_start node= hook#1 kind=sql name='create_audit_tables' – rendering SQL + [FFT] [hooks] when=on_run_start node= hook#1 name='create_audit_tables' executing SQL: + ... + [FFT] [hooks] when=on_run_end node= hook#2 kind=python name='python_summary' – invoking python hook + ``` + +This makes it easy to see **which hooks were registered** and **exactly what SQL** they ran. + +--- + +## 7. 
Best practices + +* **Keep hooks idempotent** + Especially SQL hooks: include `run_id` and `model_name` in audit tables so reruns don’t break things. + +* **Scope hooks with `select`** + Use tags (`tag:scope:mart`, `tag:example:hooks_demo`) so hooks don’t run on every model. + +* **Be defensive in Python hooks** + Treat `ctx` as a dict that may or may not have everything you expect (use `.get()`). + +* **Avoid heavy work in hooks** + Hooks run inside the main pipeline. Use them for auditing/logging/notifications, not for big ETL jobs. + +* **Use `sql_literal` in SQL hooks** + Whenever you inline values, go through `| sql_literal` instead of crafting quotes by hand. diff --git a/docs/examples/Hooks_Demo.md b/docs/examples/Hooks_Demo.md new file mode 100644 index 0000000..7f2e3b1 --- /dev/null +++ b/docs/examples/Hooks_Demo.md @@ -0,0 +1,383 @@ +# Hooks Demo + +This example project shows how to use **run-level** and **model-level** hooks in FastFlowTransform to build a simple **audit + logging** system around a run. + +You’ll see how to: + +* Configure hooks in `project.yml` +* Implement **Python hooks** with `@fft_hook` +* Implement **SQL hooks** (inline and file-based) +* Write models that are targeted via **selectors** (tags) +* Store run/model metadata in simple **audit tables** + +--- + +## 1. What this example does + +When you run the project: + +1. **Run-level hooks** create two audit tables and write a **run-start** row. +2. For each selected model: + + * A **before-model** SQL hook writes a `start` row to `_ff_model_audit`. + * The model runs (`events_clean.ff`, then `mart_events_daily.ff`). + * An **after-model** SQL hook updates that row with status, timestamps, etc. + * A **Python model hook** logs some info for mart models. +3. **Run-end hooks**: + + * A SQL hook updates the run row in `_ff_run_audit` to `success` or `error`. + * A Python hook prints a human-readable summary of the run. 
+ +At the end you can query: + +* `_ff_run_audit` – high-level runs overview +* `_ff_model_audit` – per-model lifecycle events + +--- + +## 2. Project layout + +Rough structure: + +```text +hooks_demo/ + project.yml + models/ + events_clean.ff.sql + mart_events_daily.ff.sql + hooks/ + notify.py # Python hooks (using @fft_hook) + audit_run_end.sql # File-based SQL hook (on_run_end) +``` + +### Models + +Both models are standard SQL models with tags: + +* `models/events_clean.ff.sql` + + * Staging/cleaning model + * Tags include: `example:hooks_demo`, `scope:staging`, `engine:duckdb`, … + +* `models/mart_events_daily.ff.sql` + + * Simple daily mart over events + * Tags include: `example:hooks_demo`, `scope:mart`, `engine:duckdb`, … + +The important part is the **tags**, because the model hooks use selectors like `tag:example:hooks_demo` and `tag:scope:mart`. + +--- + +## 3. Hooks in `project.yml` + +Open `project.yml` and look at the `hooks:` section. It roughly looks like this: + +```yaml +hooks: + on_run_start: + - name: create_audit_tables + kind: sql + sql: |- + -- create _ff_run_audit and _ff_model_audit if they don't exist + ... + + - name: audit_run_start + kind: sql + sql: |- + insert into _ff_run_audit (run_id, started_at, status, env, engine) + values ( + {{ run.run_id }}, + current_timestamp, + 'running', + {{ run.env_name }}, + {{ run.engine_name }} + ); + + - name: python_banner + kind: python + + on_run_end: + - name: audit_run_end + kind: sql # body in hooks/audit_run_end.sql + - name: python_summary + kind: python + + before_model: + - name: model_start_audit + kind: sql + select: "tag:example:hooks_demo" + sql: |- + insert into _ff_model_audit (...) + values (... 
{{ run.run_id }}, {{ model.name }}, 'start', 'running', current_timestamp); + + after_model: + - name: model_end_audit + kind: sql + select: "tag:example:hooks_demo" + sql: |- + update _ff_model_audit + set finished_at = current_timestamp, + status = 'success', + rows_affected = NULL, + elapsed_ms = NULL + where run_id = {{ run.run_id }} + and model_name = {{ model.name }} + and event = 'start'; + + - name: model_end_log_python + kind: python + select: "tag:scope:mart" +``` + +Key ideas: + +* **Run-level hooks** (`on_run_start`, `on_run_end`) apply to the whole run. +* **Model-level hooks** (`before_model`, `after_model`) are scoped with `select`. +* `kind: sql` hooks are SQL templates. +* `kind: python` hooks are Python functions registered via `@fft_hook` in `hooks/*.py`. + +--- + +## 4. Python hooks (`hooks/notify.py`) + +Python hooks are implemented using the `@fft_hook` decorator. + +In `hooks/notify.py` you’ll see something like: + +```python +from __future__ import annotations +from typing import Any +from fastflowtransform.hooks.registry import fft_hook + +def _fmt(env: dict[str, Any]) -> str: + parts = [] + for key in ("FFT_ACTIVE_ENV", "FF_ENGINE", "FF_ENGINE_VARIANT"): + if key in env: + parts.append(f"{key}={env[key]}") + return ", ".join(parts) if parts else "" + +@fft_hook(name="python_banner", when="on_run_start") +def on_run_start(context: dict[str, Any]) -> None: + run = context.get("run", {}) + env = context.get("env", {}) + info = _fmt(env if isinstance(env, dict) else {}) + print( + f"[hooks_demo] on_run_start: run_id={run.get('run_id')} " + f"(env_name={run.get('env_name')}, engine={run.get('engine_name')}; {info})" + ) + +@fft_hook(name="python_summary", when="on_run_end") +def on_run_end(context: dict[str, Any]) -> None: + run = context.get("run", {}) + stats = context.get("stats", {}) or {} + print( + "[hooks_demo] on_run_end: run_id=%s status=%s (built=%s, skipped=%s, failed=%s)" + % ( + run.get("run_id"), + run.get("status"), + 
stats.get("models_built"), + stats.get("models_skipped"), + stats.get("models_failed"), + ) + ) + +@fft_hook(name="model_end_log_python", when="after_model") +def on_model_end(context: dict[str, Any]) -> None: + run = context.get("run", {}) + model = context.get("model", {}) + print( + "[hooks_demo] on_model_end: run_id=%s model=%s status=%s" + % (run.get("run_id"), model.get("name"), model.get("status")) + ) +``` + +Important bits: + +* `@fft_hook(name=..., when=...)` must match the `name` and event in `project.yml`. +* Each hook receives a single `context: dict[str, Any]` argument with: + + * `context["when"]` – lifecycle event string + * `context["run"]` – run-level info (run_id, env_name, engine_name, status, …) + * `context["model"]` – model info for model-level events (`before_model` / `after_model`) + * `context["env"]` – environment variables snapshot (selected `FF_*` vars) + * `context["stats"]` – only for `on_run_end`, contains counts like `models_built` + +For this demo, the Python hooks just print human-readable lines with `[hooks_demo]` prefixes so you can see them in the CLI output. + +--- + +## 5. SQL hooks (`hooks/*.sql` + inline) + +### 5.1 Audit tables + +The **run-level** audit table: + +```sql +create table if not exists _ff_run_audit ( + run_id text, + started_at timestamp, + finished_at timestamp, + status text, -- 'running' | 'success' | 'error' + env text, + engine text, + row_count bigint, -- optional aggregate info + error text -- error message if the run fails +); +``` + +The **model-level** audit table: + +```sql +create table if not exists _ff_model_audit ( + run_id text, + model_name text, + event text, -- 'start' | 'end' + status text, -- 'running' | 'success' | 'error' + started_at timestamp, + finished_at timestamp, + rows_affected bigint, + elapsed_ms bigint, + error text +); +``` + +These are created by the `create_audit_tables` SQL hook on `on_run_start`. 
+ +### 5.2 File-based `on_run_end` SQL hook + +The `on_run_end` SQL hook `audit_run_end` is defined without inline SQL in `project.yml`, so its body lives in `hooks/audit_run_end.sql`: + +```yaml +on_run_end: + - name: audit_run_end + kind: sql +``` + +`hooks/audit_run_end.sql`: + +```sql +-- Update the run-level audit row when the run finishes. + +update _ff_run_audit +set + finished_at = current_timestamp, + status = 'success', -- or use {{ run.status | sql_literal }} if you want dynamic + row_count = NULL, + error = NULL +where run_id = {{ run.run_id | sql_literal }}; +``` + +This demonstrates the file-based hook resolution: + +* `kind: sql` with **no** `sql:` body → look for `hooks/**/.sql`. +* `name: audit_run_end` → `hooks/audit_run_end.sql`. + +### 5.3 Inline model-level SQL hooks + +The model-level hooks are inline SQL templates in `project.yml`. +They use `{{ run.run_id }}`, `{{ model.name }}`, and the `sql_literal` filter when needed. + +--- + +## 6. Data quality tests for the demo + +`project.yml` also includes some simple tests to show that the audit tables are populated (you can adjust these as needed): + +```yaml +tests: + # Event-level model tests (example) + - type: not_null + table: events_clean + column: event_id + tags: [example_hooks_demo] + + - type: row_count_between + table: mart_events_daily + min_rows: 1 + max_rows: 1000 + tags: [example_hooks_demo] + + # Audit tables (sanity checks) + - type: row_count_between + table: _ff_run_audit + min_rows: 1 + max_rows: 100 + tags: [example_hooks_demo, audit, run] + + - type: row_count_between + table: _ff_model_audit + min_rows: 2 + max_rows: 200 + tags: [example_hooks_demo, audit, model] +``` + +If the audits are not written as expected, these tests will fail and point you at the relevant table. + +--- + +## 7. Running the demo + +Assuming: + +* Your working directory is `examples/hooks_demo` (or similar) +* You have a profile called `dev_duckdb` configured for DuckDB + +Run: + +```bash +fft run . 
\ + --env dev_duckdb \ + --select tag:example:hooks_demo \ + --select tag:engine:duckdb +``` + +You should see log lines like: + +```text +[FFT] Profile: dev_duckdb | Engine: duckdb +[FFT] [hooks] when=on_run_start node=: executing 3 hook(s): ... +[hooks_demo] on_run_start: run_id=... (env_name=dev_duckdb, engine=duckdb; ...) +[FFT] ▶ L01 [DUCK] events_clean.ff (hooks_demo.events_clean) +[FFT] ✓ L01 ... +[FFT] ▶ L02 [DUCK] mart_events_daily.ff (hooks_demo.mart_events_daily) +[FFT] ✓ L02 ... +[FFT] [hooks] when=on_run_end node=: executing 2 hook(s): ... +[hooks_demo] on_run_end: run_id=... status=success (built=2, skipped=0, failed=0) +``` + +The `[hooks]` lines come from the internal hook runner; the `[hooks_demo]` lines come from your Python hooks in `hooks/notify.py`. + +--- + +## 8. Inspecting the results + +After running the demo, connect to your DuckDB database (or whatever engine you used) and inspect the audit tables. + +Examples (DuckDB): + +```sql +select * from _ff_run_audit order by started_at desc; + +select * from _ff_model_audit +order by run_id, model_name, started_at; +``` + +You should see: + +* One `_ff_run_audit` row for the run you just executed. +* At least two `_ff_model_audit` rows (one per model, possibly two per model if you log both start and end). + +--- + +## 9. How to extend this demo + +Ideas: + +* Add a **Slack/Teams/email** notification in a Python `on_run_end` hook when `run.status == "error"`. +* Record **row counts** and **elapsed time** from your executor into `_ff_model_audit` and expose them in hooks. +* Add more **selective hooks**: + + * e.g. `select: "tag:scope:mart"` for mart-only audits or notifications. +* Create multiple **file-based SQL hooks** under `hooks/` and wire them via `kind: sql` + `name:` in `project.yml`. 
+ +This demo is mainly a **template for hook patterns**: once you understand how these pieces fit together, you can copy the same approach to real projects (with your own audit tables, logging conventions, and alerts). diff --git a/examples/.env.dev_postgres b/examples/.env.dev_postgres new file mode 100644 index 0000000..8923ecc --- /dev/null +++ b/examples/.env.dev_postgres @@ -0,0 +1,3 @@ +# Postgres profile for the hooks demo (replace with your own connection string) +FF_PG_DSN=postgresql+psycopg://postgres:postgres@localhost:5432 +FF_PG_SCHEMA=hooks_demo diff --git a/examples/api_demo/profiles.yml b/examples/api_demo/profiles.yml index 68a08c0..9564a9d 100644 --- a/examples/api_demo/profiles.yml +++ b/examples/api_demo/profiles.yml @@ -19,10 +19,8 @@ dev_databricks: app_name: "{{ env('FF_SPARK_APP_NAME', 'api_demo') }}" warehouse_dir: "{{ project_dir() }}/.local/spark_warehouse" extra_conf: - spark.hadoop.javax.jdo.option.ConnectionURL: "jdbc:derby:{{ project_dir() }}/.local/metastore_db;create=true" - spark.hadoop.datanucleus.rdbms.datastoreAdapterClassName: "org.datanucleus.store.rdbms.adapter.DerbyAdapter" - spark.hadoop.datanucleus.schema.autoCreateAll: "true" - spark.hadoop.javax.jdo.option.ConnectionDriverName: "org.apache.derby.jdbc.EmbeddedDriver" + spark.sql.statistics.fallBackToHdfs: true + spark.sql.shuffle.partitions: "{{ env('SPARK_SQL_SHUFFLE_PARTITIONS', '8') }}" spark.driver.extraJavaOptions: > -Dderby.stream.error.file={{ project_dir() }}/.local/derby.log -Dderby.system.home={{ project_dir() }}/.local/derby_home diff --git a/examples/basic_demo/profiles.yml b/examples/basic_demo/profiles.yml index 94e0330..99d2644 100644 --- a/examples/basic_demo/profiles.yml +++ b/examples/basic_demo/profiles.yml @@ -20,10 +20,8 @@ dev_databricks: warehouse_dir: "{{ project_dir() }}/.local/spark_warehouse" database: "{{ env('FF_DBR_DATABASE', 'basic_demo') }}" extra_conf: - spark.hadoop.javax.jdo.option.ConnectionURL: "jdbc:derby:{{ project_dir() 
}}/.local/metastore_db;create=true" - spark.hadoop.datanucleus.rdbms.datastoreAdapterClassName: "org.datanucleus.store.rdbms.adapter.DerbyAdapter" - spark.hadoop.datanucleus.schema.autoCreateAll: "true" - spark.hadoop.javax.jdo.option.ConnectionDriverName: "org.apache.derby.jdbc.EmbeddedDriver" + spark.sql.statistics.fallBackToHdfs: true + spark.sql.shuffle.partitions: "{{ env('SPARK_SQL_SHUFFLE_PARTITIONS', '8') }}" spark.driver.extraJavaOptions: > -Dderby.stream.error.file={{ project_dir() }}/.local/derby.log -Dderby.system.home={{ project_dir() }}/.local/derby_home diff --git a/examples/ci_demo/profiles.yml b/examples/ci_demo/profiles.yml index 2994e97..7087e1d 100644 --- a/examples/ci_demo/profiles.yml +++ b/examples/ci_demo/profiles.yml @@ -19,10 +19,8 @@ dev_databricks: app_name: "{{ env('FF_SPARK_APP_NAME', 'ci_demo') }}" warehouse_dir: "{{ project_dir() }}/.local/spark_warehouse" extra_conf: - spark.hadoop.javax.jdo.option.ConnectionURL: "jdbc:derby:{{ project_dir() }}/.local/metastore_db;create=true" - spark.hadoop.datanucleus.rdbms.datastoreAdapterClassName: "org.datanucleus.store.rdbms.adapter.DerbyAdapter" - spark.hadoop.datanucleus.schema.autoCreateAll: "true" - spark.hadoop.javax.jdo.option.ConnectionDriverName: "org.apache.derby.jdbc.EmbeddedDriver" + spark.sql.statistics.fallBackToHdfs: true + spark.sql.shuffle.partitions: "{{ env('SPARK_SQL_SHUFFLE_PARTITIONS', '8') }}" spark.driver.extraJavaOptions: > -Dderby.stream.error.file={{ project_dir() }}/.local/derby.log -Dderby.system.home={{ project_dir() }}/.local/derby_home diff --git a/examples/dq_demo/profiles.yml b/examples/dq_demo/profiles.yml index 8e61f15..eeaa8f8 100644 --- a/examples/dq_demo/profiles.yml +++ b/examples/dq_demo/profiles.yml @@ -19,10 +19,8 @@ dev_databricks: app_name: "{{ env('FF_SPARK_APP_NAME', 'dq_demo') }}" warehouse_dir: "{{ project_dir() }}/.local/spark_warehouse" extra_conf: - spark.hadoop.javax.jdo.option.ConnectionURL: "jdbc:derby:{{ project_dir() 
}}/.local/metastore_db;create=true" - spark.hadoop.datanucleus.rdbms.datastoreAdapterClassName: "org.datanucleus.store.rdbms.adapter.DerbyAdapter" - spark.hadoop.datanucleus.schema.autoCreateAll: "true" - spark.hadoop.javax.jdo.option.ConnectionDriverName: "org.apache.derby.jdbc.EmbeddedDriver" + spark.sql.statistics.fallBackToHdfs: true + spark.sql.shuffle.partitions: "{{ env('SPARK_SQL_SHUFFLE_PARTITIONS', '8') }}" spark.driver.extraJavaOptions: > -Dderby.stream.error.file={{ project_dir() }}/.local/derby.log -Dderby.system.home={{ project_dir() }}/.local/derby_home diff --git a/examples/hooks_demo/.env.dev_bigquery_bigframes b/examples/hooks_demo/.env.dev_bigquery_bigframes new file mode 100644 index 0000000..6ac8a9f --- /dev/null +++ b/examples/hooks_demo/.env.dev_bigquery_bigframes @@ -0,0 +1,7 @@ +# BigQuery profile for the hooks demo +FF_BQ_PROJECT=fft-basic-demo +FF_BQ_DATASET=hooks_demo +FF_BQ_LOCATION=EU + +# Path to service account JSON key (or rely on gcloud / workload identity) +GOOGLE_APPLICATION_CREDENTIALS=../secrets/fft-bigquery-demo-key.json diff --git a/examples/hooks_demo/.env.dev_bigquery_pandas b/examples/hooks_demo/.env.dev_bigquery_pandas new file mode 100644 index 0000000..6ac8a9f --- /dev/null +++ b/examples/hooks_demo/.env.dev_bigquery_pandas @@ -0,0 +1,7 @@ +# BigQuery profile for the hooks demo +FF_BQ_PROJECT=fft-basic-demo +FF_BQ_DATASET=hooks_demo +FF_BQ_LOCATION=EU + +# Path to service account JSON key (or rely on gcloud / workload identity) +GOOGLE_APPLICATION_CREDENTIALS=../secrets/fft-bigquery-demo-key.json diff --git a/examples/hooks_demo/.env.dev_databricks b/examples/hooks_demo/.env.dev_databricks new file mode 100644 index 0000000..0284cd6 --- /dev/null +++ b/examples/hooks_demo/.env.dev_databricks @@ -0,0 +1,14 @@ +# Databricks (or local Spark) profile defaults for the hooks demo +FF_SPARK_MASTER=local[*] +FF_SPARK_APP_NAME=hooks_demo + +# Optional overrides when using Databricks SQL warehouses or Unity Catalog +# 
FF_DBR_DATABASE=hooks_demo +# FF_DBR_CATALOG=hive_metastore + +# Set these if you need a Hive-compatible Spark metastore +FF_DBR_ENABLE_HIVE=1 +FF_DBR_TABLE_FORMAT=delta + +# Configure Java for local Spark sessions when needed +# JAVA_HOME=/opt/homebrew/opt/openjdk@17 diff --git a/examples/hooks_demo/.env.dev_duckdb b/examples/hooks_demo/.env.dev_duckdb new file mode 100644 index 0000000..467ebc0 --- /dev/null +++ b/examples/hooks_demo/.env.dev_duckdb @@ -0,0 +1,2 @@ +# DuckDB profile for the hooks demo +FF_DUCKDB_PATH=.local/hooks_demo.duckdb diff --git a/examples/hooks_demo/.env.dev_postgres b/examples/hooks_demo/.env.dev_postgres new file mode 100644 index 0000000..c3d66bd --- /dev/null +++ b/examples/hooks_demo/.env.dev_postgres @@ -0,0 +1,3 @@ +# Postgres profile for the hooks demo (replace with your own connection string) +FF_PG_DSN=postgresql+psycopg://postgres:postgres@localhost:5432 +FF_PG_SCHEMA=hooks_demo diff --git a/examples/hooks_demo/.env.dev_snowflake b/examples/hooks_demo/.env.dev_snowflake new file mode 100644 index 0000000..be6fb79 --- /dev/null +++ b/examples/hooks_demo/.env.dev_snowflake @@ -0,0 +1,18 @@ +# Snowflake Snowpark profile for the hooks demo + +# Your Snowflake account identifier, e.g. 
xy12345.eu-central-1 +FF_SF_ACCOUNT=your_account_id + +# Username & password for Snowflake (or use keypair auth if you extend the executor) +FF_SF_USER=your_username +FF_SF_PASSWORD=your_password + +# Compute warehouse +FF_SF_WAREHOUSE=COMPUTE_WH + +# Database & schema for the demo +FF_SF_DATABASE=EXAMPLE_DEMO +FF_SF_SCHEMA=HOOKS_DEMO + +# Optional role (can be left blank) +FF_SF_ROLE=ANALYST diff --git a/examples/hooks_demo/Makefile b/examples/hooks_demo/Makefile new file mode 100644 index 0000000..5301a74 --- /dev/null +++ b/examples/hooks_demo/Makefile @@ -0,0 +1,120 @@ +.PHONY: seed run test dag show artifacts clean demo help + +# --- Configuration ----------------------------------------------------------- + +DB ?= .local/hooks_demo.duckdb +PROJECT ?= . +UV ?= uv + +# Engine selector (duckdb|postgres|databricks_spark|bigquery|snowflake_snowpark) +ENGINE ?= duckdb + +# BigQuery frame type selector (pandas | bigframes) +BQ_FRAME ?= bigframes + +# Resolve profile and tags per engine +ifeq ($(ENGINE),duckdb) + PROFILE_ENV = dev_duckdb + ENGINE_TAG = engine:duckdb +endif +ifeq ($(ENGINE),postgres) + PROFILE_ENV = dev_postgres + ENGINE_TAG = engine:postgres +endif +ifeq ($(ENGINE),databricks_spark) + PROFILE_ENV = dev_databricks + ENGINE_TAG = engine:databricks_spark +endif +ifeq ($(ENGINE),bigquery) + ENGINE_TAG = engine:bigquery + ifeq ($(BQ_FRAME),pandas) + PROFILE_ENV = dev_bigquery_pandas + else + PROFILE_ENV = dev_bigquery_bigframes + endif +endif +ifeq ($(ENGINE),snowflake_snowpark) + PROFILE_ENV = dev_snowflake + ENGINE_TAG = engine:snowflake_snowpark +endif + +BASE_ENV = FFT_ACTIVE_ENV=$(PROFILE_ENV) FF_ENGINE=$(ENGINE) + +ifeq ($(ENGINE),bigquery) + BASE_ENV := $(BASE_ENV) FF_ENGINE_VARIANT=$(BQ_FRAME) +endif + +RUN_ENV = $(BASE_ENV) + +SELECT_FLAGS = --select tag:example:hooks_demo --select tag:$(ENGINE_TAG) + +SHOW_MODEL ?= mart_events_daily + +CLEAN_SCRIPT = ../_scripts/cleanup_env.py + +ifeq ($(ENGINE),duckdb) + CLEAN_CMD = env $(BASE_ENV) $(UV) run 
python $(CLEAN_SCRIPT) --engine duckdb --env "$(PROFILE_ENV)" --project "$(PROJECT)" --duckdb-path "$(DB)" +else ifeq ($(ENGINE),postgres) + CLEAN_CMD = env $(BASE_ENV) $(UV) run python $(CLEAN_SCRIPT) --engine postgres --env "$(PROFILE_ENV)" --project "$(PROJECT)" +else ifeq ($(ENGINE),databricks_spark) + CLEAN_CMD = env $(BASE_ENV) $(UV) run python $(CLEAN_SCRIPT) --engine databricks_spark --env "$(PROFILE_ENV)" --project "$(PROJECT)" +else ifeq ($(ENGINE),bigquery) + CLEAN_CMD = env $(BASE_ENV) $(UV) run python $(CLEAN_SCRIPT) --engine bigquery --env "$(PROFILE_ENV)" --project "$(PROJECT)" +else ifeq ($(ENGINE),snowflake_snowpark) + CLEAN_CMD = env $(BASE_ENV) $(UV) run python $(CLEAN_SCRIPT) --engine snowflake_snowpark --env "$(PROFILE_ENV)" --project "$(PROJECT)" +else + $(error Unsupported ENGINE=$(ENGINE) - pick duckdb|postgres|databricks_spark|bigquery|snowflake_snowpark) +endif + +# --- Targets ---------------------------------------------------------------- + +help: + @echo "FastFlowTransform Hooks Demo" + @echo "Targets:" + @echo " make seed ENGINE=$(ENGINE)" + @echo " make run ENGINE=$(ENGINE)" + @echo " make dag ENGINE=$(ENGINE)" + @echo " make test ENGINE=$(ENGINE)" + @echo " make show ENGINE=$(ENGINE) SHOW_MODEL=$(SHOW_MODEL)" + @echo " make demo ENGINE=$(ENGINE)" + @echo " make clean ENGINE=$(ENGINE)" + @echo + @echo "Variables: DB=$(DB) PROJECT=$(PROJECT) UV=$(UV) ENGINE=$(ENGINE) BQ_FRAME=$(BQ_FRAME)" + +seed: + env $(BASE_ENV) $(UV) run fft seed "$(PROJECT)" --env $(PROFILE_ENV) + +run: + env $(RUN_ENV) $(UV) run fft run "$(PROJECT)" --env $(PROFILE_ENV) $(SELECT_FLAGS) + +test: + env $(BASE_ENV) $(UV) run fft test "$(PROJECT)" --env $(PROFILE_ENV) $(SELECT_FLAGS) + +dag: + env $(RUN_ENV) $(UV) run fft dag "$(PROJECT)" --env $(PROFILE_ENV) $(SELECT_FLAGS) --html + +show: + @if [ -f "$(PROJECT)/site/dag/index.html" ]; then \ + $(OPENER) "$(PROJECT)/site/dag/index.html" 2>/dev/null || echo "Open manually at: $(PROJECT)/site/dag/index.html"; \ + 
else \ + echo "No HTML found: $(PROJECT)/site/dag/index.html"; \ + fi + +artifacts: + @echo + @echo "== 📦 Artifacts ==" + @echo " $(PROJECT)/.fastflowtransform/target/{manifest.json,run_results.json,catalog.json}" + @echo " DAG HTML: $(PROJECT)/site/dag/index.html" + +clean: + $(CLEAN_CMD) + +demo: clean + @echo "== 🚀 Hooks Demo ($(ENGINE)) ==" + @echo "Profile=$(PROFILE_ENV) PROJECT=$(PROJECT)" + +$(MAKE) seed ENGINE=$(ENGINE) + +$(MAKE) run ENGINE=$(ENGINE) + +$(MAKE) dag ENGINE=$(ENGINE) + +$(MAKE) test ENGINE=$(ENGINE) + +$(MAKE) artifacts + @echo "✅ Demo complete." diff --git a/examples/hooks_demo/README.md b/examples/hooks_demo/README.md new file mode 100644 index 0000000..40165db --- /dev/null +++ b/examples/hooks_demo/README.md @@ -0,0 +1,23 @@ +# Hooks demo + +Small FFT project that demonstrates run- and model-level hooks: + +- `on_run_start` / `on_run_end` lifecycle hooks for audit logging and notifications. +- `before_model` / `after_model` hooks that record per-model audit events based on selectors. + +The data model is intentionally tiny: a single staging model (`events_clean`) and a daily mart +(`mart_events_daily`). Hooks write into two audit tables: + +- `_ff_run_audit` – one row per fft run (start/end, status) +- `_ff_model_audit` – one row per model event (model start/end, status, row counts) + +See `project.yml` for the hook configuration and `hooks/notify.py` for the example Python hooks. + +You can run the demo on DuckDB, Postgres, Databricks Spark, or BigQuery: + + make demo ENGINE=duckdb + make demo ENGINE=postgres + make demo ENGINE=databricks_spark + make demo ENGINE=bigquery BQ_FRAME=bigframes + +Inspect the DAG under `site/dag/index.html` and the audit tables in your engine. 
diff --git a/examples/hooks_demo/hooks/audit_run_end.sql b/examples/hooks_demo/hooks/audit_run_end.sql new file mode 100644 index 0000000..abe9d14 --- /dev/null +++ b/examples/hooks_demo/hooks/audit_run_end.sql @@ -0,0 +1,12 @@ +-- examples/hooks_demo/hooks/audit_run_end.sql + +-- Update the run-level audit row when the run finishes. + +-- Default: update the single row for this run_id +update _ff_run_audit +set + finished_at = current_timestamp, + status = {{ run.status | sql_literal }}, + row_count = {{ run.row_count if run.row_count is not none else 'NULL' }}, + error = NULL +where run_id = {{ run.run_id | sql_literal }}; diff --git a/examples/hooks_demo/hooks/audit_run_end_spark.sql b/examples/hooks_demo/hooks/audit_run_end_spark.sql new file mode 100644 index 0000000..7f786c6 --- /dev/null +++ b/examples/hooks_demo/hooks/audit_run_end_spark.sql @@ -0,0 +1,11 @@ +-- examples/hooks_demo/hooks/audit_run_end_spark.sql + +-- Update the run-level audit row when the run finishes. + +update _ff_run_audit +set + finished_at = current_timestamp(), + status = {{ run.status | sql_literal }}, + row_count = {{ run.row_count if run.row_count is not none else 'NULL' }}, + error = {{ run.error if run.error is not none else 'NULL' }} +where run_id = {{ run.run_id | sql_literal }}; diff --git a/examples/hooks_demo/hooks/notify.py b/examples/hooks_demo/hooks/notify.py new file mode 100644 index 0000000..6a8f984 --- /dev/null +++ b/examples/hooks_demo/hooks/notify.py @@ -0,0 +1,93 @@ +from __future__ import annotations + +from typing import Any + +from fastflowtransform.hooks.registry import fft_hook +from fastflowtransform.hooks.types import HookContext, RunContext, ModelContext + + +def _fmt_env(env: dict[str, Any]) -> str: + """Small helper to format env for logs (defensive against missing keys).""" + parts = [] + for key in ("FFT_ACTIVE_ENV", "FF_ENGINE", "FF_ENGINE_VARIANT"): + if key in env: + parts.append(f"{key}={env[key]}") + return ", ".join(parts) if parts else "" + + 
+@fft_hook(name="python_banner") +def on_run_start(context: HookContext) -> None: + """ + Example Python run-start hook. + + Context shape (simplified): + + context["when"] -> "on_run_start" + context["run"] -> RunContext + context["env"] -> dict of env vars + """ + run: RunContext = context.get("run", {}) # type: ignore[assignment] + env = context.get("env", {}) or {} + + run_id = run.get("run_id") + env_name = run.get("env_name") + engine_name = run.get("engine_name") + + info = _fmt_env(env if isinstance(env, dict) else {}) + print( + f"[hooks_demo] on_run_start: run_id={run_id} " + f"(env_name={env_name}, engine={engine_name}; {info})" + ) + + +@fft_hook(name="python_summary", when="on_run_end") +def on_run_end(context: HookContext) -> None: + """ + Example Python run-end hook. + + Context shape: + + context["when"] -> "on_run_end" + context["run"] -> RunContext (with status/error) + context["stats"] -> optional RunStatsContext (if you populate it) + """ + run: RunContext = context.get("run", {}) # type: ignore[assignment] + stats = context.get("stats") or {} + + run_id = run.get("run_id") + status = run.get("status") + + built = stats.get("models_built") + skipped = stats.get("models_skipped") + failed = stats.get("models_failed") + + print( + "[FFT] [hooks_demo] on_run_end: run_id=%s status=%s (built=%s, skipped=%s, failed=%s)" + % (run_id, status, built, skipped, failed) + ) + + +@fft_hook(name="model_end_log_python", when="after_model") +def on_model_end(context: HookContext) -> None: + """ + Example Python model-level hook (called after a model finishes). + + Context shape: + + context["when"] -> "after_model" + context["run"] -> RunContext + context["model"] -> ModelContext (name, status, rows_affected, ...) 
+ """ + run: RunContext = context.get("run", {}) # type: ignore[assignment] + model: ModelContext | None = context.get("model") # type: ignore[assignment] + + run_id = run.get("run_id") + model_name = model.get("name") if model else None + status = model.get("status") if model else None + rows = model.get("rows_affected") if model else None + elapsed_ms = model.get("elapsed_ms") if model else None + + print( + "[FFT] [hooks_demo] on_model_end: run_id=%s model=%s status=%s rows=%s elapsed_ms=%s" + % (run_id, model_name, status, rows, elapsed_ms) + ) diff --git a/examples/hooks_demo/models/README.md b/examples/hooks_demo/models/README.md new file mode 100644 index 0000000..a6c22db --- /dev/null +++ b/examples/hooks_demo/models/README.md @@ -0,0 +1,19 @@ +# Models directory (hooks_demo) + +Models: + +- `staging/events_clean.ff.sql` + - Minimal staging over `seed_events` with proper typing. + - Tagged `example:hooks_demo` and `scope:staging`. + +- `marts/mart_events_daily.ff.sql` + - Simple aggregation (events per day). + - Tagged `example:hooks_demo` and `scope:mart`. + +Hooks: + +- Run hooks (`on_run_start` / `on_run_end`) create and update `_ff_run_audit`. +- Model hooks (`before_model` / `after_model`) write rows into `_ff_model_audit` whenever a model + with `tag:example:hooks_demo` is executed. + +The hooks are configured centrally in `project.yml` so the models stay clean and portable. 
diff --git a/examples/hooks_demo/models/marts/mart_events_daily.ff.sql b/examples/hooks_demo/models/marts/mart_events_daily.ff.sql new file mode 100644 index 0000000..4285c35 --- /dev/null +++ b/examples/hooks_demo/models/marts/mart_events_daily.ff.sql @@ -0,0 +1,27 @@ +{{ config( + materialized='table', + tags=[ + 'example:hooks_demo', + 'scope:mart', + 'engine:duckdb', + 'engine:postgres', + 'engine:databricks_spark', + 'engine:bigquery', + 'engine:snowflake_snowpark' + ], +) }} + +with base as ( + select + event_date, + event_type + from {{ ref('events_clean.ff') }} +) + +select + event_date, + event_type, + count(*) as event_count +from base +group by event_date, event_type +order by event_date, event_type; diff --git a/examples/hooks_demo/models/staging/events_clean.ff.sql b/examples/hooks_demo/models/staging/events_clean.ff.sql new file mode 100644 index 0000000..75508f1 --- /dev/null +++ b/examples/hooks_demo/models/staging/events_clean.ff.sql @@ -0,0 +1,36 @@ +{{ config( + materialized='table', + tags=[ + 'example:hooks_demo', + 'scope:staging', + 'engine:duckdb', + 'engine:postgres', + 'engine:databricks_spark', + 'engine:bigquery', + 'engine:snowflake_snowpark' + ], +) }} + +{# engine-aware string type #} +{% set string_type = + "varchar" + if engine() in ["duckdb", "postgres", "postgresql"] + else "string" +%} + +with src as ( + select + cast(id as integer) as event_id, + cast(user_id as integer) as user_id, + cast(event_ts as timestamp) as event_ts, + cast(event_type as {{ string_type }}) as event_type + from {{ source('raw', 'events') }} +) + +select + event_id, + user_id, + event_ts, + event_type, + cast(event_ts as date) as event_date +from src; diff --git a/examples/hooks_demo/profiles.yml b/examples/hooks_demo/profiles.yml new file mode 100644 index 0000000..24c0afe --- /dev/null +++ b/examples/hooks_demo/profiles.yml @@ -0,0 +1,58 @@ +# Connection profiles for the hooks demo. +# Populate environment variables as shown in the .env.dev_* files. 
+ +dev_duckdb: + engine: duckdb + duckdb: + path: "{{ env('FF_DUCKDB_PATH', '.local/hooks_demo.duckdb') }}" + +dev_postgres: + engine: postgres + postgres: + dsn: "{{ env('FF_PG_DSN') }}" + db_schema: "{{ env('FF_PG_SCHEMA', 'public') }}" + +dev_databricks: + engine: databricks_spark + databricks_spark: + master: "{{ env('FF_SPARK_MASTER', 'local[*]') }}" + app_name: "{{ env('FF_SPARK_APP_NAME', 'hooks_demo') }}" + warehouse_dir: "{{ project_dir() }}/{{ env('FF_DBR_WAREHOUSE_DIR', '.local/spark_warehouse') }}" + database: "{{ env('FF_DBR_DATABASE', 'hooks_demo') }}" + extra_conf: + spark.sql.statistics.fallBackToHdfs: true + spark.sql.shuffle.partitions: "{{ env('SPARK_SQL_SHUFFLE_PARTITIONS', '8') }}" + spark.driver.extraJavaOptions: > + -Dderby.stream.error.file={{ project_dir() }}/.local/derby.log + -Dderby.system.home={{ project_dir() }}/.local/derby_home + + +dev_bigquery_bigframes: + engine: bigquery + bigquery: + project: "{{ env('FF_BQ_PROJECT') }}" + dataset: "{{ env('FF_BQ_DATASET', 'hooks_demo') }}" + location: "{{ env('FF_BQ_LOCATION', 'EU') }}" + use_bigframes: true + allow_create_dataset: true + +dev_bigquery_pandas: + engine: bigquery + bigquery: + project: "{{ env('FF_BQ_PROJECT') }}" + dataset: "{{ env('FF_BQ_DATASET', 'hooks_demo') }}" + location: "{{ env('FF_BQ_LOCATION', 'EU') }}" + use_bigframes: false + allow_create_dataset: true + +dev_snowflake: + engine: snowflake_snowpark + snowflake_snowpark: + account: "{{ env('FF_SF_ACCOUNT') }}" + user: "{{ env('FF_SF_USER') }}" + password: "{{ env('FF_SF_PASSWORD') }}" + warehouse: "{{ env('FF_SF_WAREHOUSE', 'COMPUTE_WH') }}" + database: "{{ env('FF_SF_DATABASE', 'EXAMPLE_DEMO') }}" + schema: "{{ env('FF_SF_SCHEMA', 'HOOKS_DEMO') }}" + role: "{{ env('FF_SF_ROLE', '') }}" + allow_create_schema: true diff --git a/examples/hooks_demo/project.yml b/examples/hooks_demo/project.yml new file mode 100644 index 0000000..9526323 --- /dev/null +++ b/examples/hooks_demo/project.yml @@ -0,0 +1,303 @@ +name: 
hooks_demo +version: "0.1" +models_dir: models + +docs: + dag_dir: site/dag + +vars: {} + +tests: + - type: not_null + table: events_clean + column: event_id + tags: [example:hooks_demo] + + - type: row_count_between + table: mart_events_daily + min_rows: 1 + max_rows: 1000 + tags: [example:hooks_demo] + +# --- Audit: run-level ------------------------------------------------------- + # Ensure we have at least one run audit row for the demo run + - type: row_count_between + table: _ff_run_audit + min_rows: 1 + max_rows: 10 + tags: [example:hooks_demo, audit] + + # Core run audit columns should never be NULL + - type: not_null + table: _ff_run_audit + column: run_id + tags: [example:hooks_demo, audit] + + # started_at must be recorded + - type: not_null + table: _ff_run_audit + column: started_at + tags: [example:hooks_demo, audit] + + # status should be 'running'/'success'/'error', but at least not NULL + - type: not_null + table: _ff_run_audit + column: status + tags: [example:hooks_demo, audit] + + - type: not_null + table: _ff_run_audit + column: finished_at + tags: [example:hooks_demo, audit] + + - type: not_null + table: _ff_run_audit + column: row_count + tags: [example:hooks_demo, audit] + + - type: accepted_values + table: _ff_run_audit + column: status + values: [success] + tags: [example:hooks_demo, audit] + + - type: row_count_between + table: _ff_run_audit + min_rows: 1 + max_rows: 1 + tags: [example:hooks_demo, audit] + + # --- Audit: model-level ----------------------------------------------------- + # For this demo there are 2 tagged models; we expect at least 2 audit rows + - type: row_count_between + table: _ff_model_audit + min_rows: 2 + max_rows: 10 + tags: [example:hooks_demo, audit] + + # Model audit rows must have a run_id + - type: not_null + table: _ff_model_audit + column: run_id + tags: [example:hooks_demo, audit] + + # And must reference a model_name + - type: not_null + table: _ff_model_audit + column: model_name + tags: 
[example:hooks_demo, audit] + + # And a non-null status ('running' / 'success' / 'error') + - type: not_null + table: _ff_model_audit + column: status + tags: [example:hooks_demo, audit] + + - type: not_null + table: _ff_model_audit + column: finished_at + tags: [example:hooks_demo, audit] + + - type: not_null + table: _ff_model_audit + column: rows_affected + tags: [example:hooks_demo, audit] + + - type: not_null + table: _ff_model_audit + column: elapsed_ms + tags: [example:hooks_demo, audit] + +# --------------------------------------------------------------------------- +# Hooks configuration +# --------------------------------------------------------------------------- +hooks: + # Lifecycle hooks (run-level) + on_run_start: + - name: create_audit_tables + kind: sql + engines: ["duckdb", "postgres"] + sql: | + create table if not exists _ff_run_audit ( + run_id text, + started_at timestamp, + finished_at timestamp, + status text, + env text, + engine text, + row_count bigint, + error text + ); + + create table if not exists _ff_model_audit ( + run_id text, + model_name text, + event text, + status text, + started_at timestamp, + finished_at timestamp, + rows_affected bigint, + elapsed_ms bigint, + error text + ); + + - name: create_audit_tables_spark + kind: sql + engines: ["databricks_spark"] + sql: | + create table if not exists _ff_run_audit ( + run_id string, + started_at timestamp, + finished_at timestamp, + status string, + env string, + engine string, + row_count bigint, + error string + ) using delta; + + create table if not exists _ff_model_audit ( + run_id string, + model_name string, + event string, + status string, + started_at timestamp, + finished_at timestamp, + rows_affected bigint, + elapsed_ms bigint, + error string + ) using delta; + + - name: create_audit_tables_bigquery + kind: sql + engines: ["bigquery"] + sql: | + CREATE TABLE IF NOT EXISTS `_ff_run_audit` ( + run_id STRING, + started_at TIMESTAMP, + finished_at TIMESTAMP, + status 
STRING, + env STRING, + engine STRING, + row_count INT64, + error STRING + ); + + CREATE TABLE IF NOT EXISTS `_ff_model_audit` ( + run_id STRING, + model_name STRING, + event STRING, + status STRING, + started_at TIMESTAMP, + finished_at TIMESTAMP, + rows_affected INT64, + elapsed_ms INT64, + error STRING + ); + + - name: create_audit_tables_snowflake + kind: sql + engines: ["snowflake_snowpark"] + sql: | + CREATE TABLE IF NOT EXISTS _ff_run_audit ( + run_id STRING, + started_at TIMESTAMP_NTZ, + finished_at TIMESTAMP_NTZ, + status STRING, + env STRING, + engine STRING, + row_count NUMBER, + error STRING + ); + + CREATE TABLE IF NOT EXISTS _ff_model_audit ( + run_id STRING, + model_name STRING, + event STRING, + status STRING, + started_at TIMESTAMP_NTZ, + finished_at TIMESTAMP_NTZ, + rows_affected NUMBER, + elapsed_ms NUMBER, + error STRING + ); + + - name: audit_run_start + kind: sql + sql: | + -- Record run start; assumes `run_id`, `env_name`, `engine_name` are available + insert into _ff_run_audit (run_id, started_at, status, env, engine) + values ( + {{ run.run_id | sql_literal }}, + current_timestamp, + 'running', + {{ run.env_name | sql_literal }}, + {{ run.engine_name | sql_literal }} + ); + + - name: python_banner + kind: python + + on_run_end: + - name: audit_run_end + engines: ["duckdb", "postgres", "bigquery", "snowflake_snowpark"] + kind: sql + + - name: audit_run_end_spark + engines: ["databricks_spark"] + kind: sql + + - name: python_summary + kind: python + + # Model-level hooks: executed for models matching `select` + before_model: + - name: model_start_audit + kind: sql + select: "tag:example:hooks_demo" + sql: | + insert into _ff_model_audit (run_id, model_name, event, status, started_at) + values ( + {{ run.run_id | sql_literal }}, + {{ model.name | sql_literal }}, + 'start', + 'running', + current_timestamp + ); + + after_model: + - name: model_end_audit + kind: sql + engines: ["duckdb", "postgres", "bigquery", "snowflake_snowpark"] + select: 
"tag:example:hooks_demo" + sql: | + update _ff_model_audit + set + finished_at = current_timestamp, + status = {{ model.status | sql_literal }}, + rows_affected = {{ model.rows_affected if model.rows_affected is not none else 'NULL' }}, + elapsed_ms = {{ model.elapsed_ms if model.elapsed_ms is not none else 'NULL' }}, + error = {{ model.error | sql_literal if model.error is not none else 'NULL' }} + where run_id = {{ run.run_id | sql_literal }} + and model_name = {{ model.name | sql_literal }} + and event = 'start'; + + - name: model_end_audit_spark + kind: sql + engines: ["databricks_spark"] + select: "tag:example:hooks_demo" + sql: | + update _ff_model_audit + set + finished_at = current_timestamp(), + status = {{ model.status | sql_literal }}, + rows_affected = {{ model.rows_affected if model.rows_affected is not none else 'NULL' }}, + elapsed_ms = {{ model.elapsed_ms if model.elapsed_ms is not none else 'NULL' }}, + error = {{ model.error if model.error is not none else 'NULL' }} + where run_id = {{ run.run_id | sql_literal }} + and model_name = {{ model.name | sql_literal }} + and event = 'start'; + + - name: model_end_log_python + kind: python + select: "tag:scope:mart" diff --git a/examples/hooks_demo/seeds/README.md b/examples/hooks_demo/seeds/README.md new file mode 100644 index 0000000..e95cc71 --- /dev/null +++ b/examples/hooks_demo/seeds/README.md @@ -0,0 +1,4 @@ +# Seeds directory (hooks_demo) + +`seed_events.csv` ships with the demo and feeds the staging model. Extend or replace it when +experimenting with hook behavior; the audit hooks don't depend on any specific schema. 
diff --git a/examples/hooks_demo/seeds/seed_events.csv b/examples/hooks_demo/seeds/seed_events.csv new file mode 100644 index 0000000..960afed --- /dev/null +++ b/examples/hooks_demo/seeds/seed_events.csv @@ -0,0 +1,6 @@ +id,user_id,event_ts,event_type +1,1,2024-01-05T09:15:00Z,signup +2,1,2024-01-06T10:30:00Z,login +3,2,2024-01-06T11:45:00Z,signup +4,2,2024-01-07T08:05:00Z,login +5,3,2024-01-08T13:20:00Z,signup diff --git a/examples/hooks_demo/sources.yml b/examples/hooks_demo/sources.yml new file mode 100644 index 0000000..4386923 --- /dev/null +++ b/examples/hooks_demo/sources.yml @@ -0,0 +1,8 @@ +version: 2 + +sources: + - name: raw + tables: + - name: events + identifier: seed_events + description: Sample event stream used to demonstrate hooks and audit logging. diff --git a/examples/hooks_demo/tests/dq/README.md b/examples/hooks_demo/tests/dq/README.md new file mode 100644 index 0000000..1acd01d --- /dev/null +++ b/examples/hooks_demo/tests/dq/README.md @@ -0,0 +1,4 @@ +# Data quality tests + +Store custom data-quality tests that run via `fft test` (docs/Data_Quality_Tests.md). +Use this directory for schema-bound tests separate from unit specs. diff --git a/examples/hooks_demo/tests/unit/README.md b/examples/hooks_demo/tests/unit/README.md new file mode 100644 index 0000000..b3c3c8d --- /dev/null +++ b/examples/hooks_demo/tests/unit/README.md @@ -0,0 +1,4 @@ +# Unit tests + +Define YAML unit specs as described in docs/Config_and_Macros.md#73-model-unit-tests-fft-utest. +Invoke them with `fft utest --env <env>`. 
diff --git a/examples/incremental_demo/profiles.yml b/examples/incremental_demo/profiles.yml index 50bff28..34c1327 100644 --- a/examples/incremental_demo/profiles.yml +++ b/examples/incremental_demo/profiles.yml @@ -17,10 +17,8 @@ dev_databricks_parquet: &incremental_databricks_parquet warehouse_dir: "{{ project_dir() }}/.local/spark_warehouse_parquet" database: "{{ env('FF_DBR_DATABASE', 'incremental_demo_parquet') }}" extra_conf: - spark.hadoop.javax.jdo.option.ConnectionURL: "jdbc:derby:{{ project_dir() }}/.local/metastore_db;create=true" - spark.hadoop.datanucleus.rdbms.datastoreAdapterClassName: "org.datanucleus.store.rdbms.adapter.DerbyAdapter" - spark.hadoop.datanucleus.schema.autoCreateAll: "true" - spark.hadoop.javax.jdo.option.ConnectionDriverName: "org.apache.derby.jdbc.EmbeddedDriver" + spark.sql.statistics.fallBackToHdfs: true + spark.sql.shuffle.partitions: "{{ env('SPARK_SQL_SHUFFLE_PARTITIONS', '8') }}" spark.driver.extraJavaOptions: > -Dderby.stream.error.file={{ project_dir() }}/.local/derby.log -Dderby.system.home={{ project_dir() }}/.local/derby_home @@ -33,11 +31,11 @@ dev_databricks_delta: warehouse_dir: "{{ project_dir() }}/.local/spark_warehouse_delta" database: "{{ env('FF_DBR_DATABASE', 'incremental_demo_delta') }}" extra_conf: - spark.hadoop.javax.jdo.option.ConnectionURL: "jdbc:derby:{{ project_dir() }}/.local/metastore_db;create=true" - spark.hadoop.datanucleus.rdbms.datastoreAdapterClassName: "org.datanucleus.store.rdbms.adapter.DerbyAdapter" - spark.hadoop.datanucleus.schema.autoCreateAll: "true" - spark.hadoop.javax.jdo.option.ConnectionDriverName: "org.apache.derby.jdbc.EmbeddedDriver" - spark.driver.extraJavaOptions: "-Dderby.stream.error.file={{ project_dir() }}/.local/derby.log" + spark.sql.statistics.fallBackToHdfs: true + spark.sql.shuffle.partitions: "{{ env('SPARK_SQL_SHUFFLE_PARTITIONS', '8') }}" + spark.driver.extraJavaOptions: > + -Dderby.stream.error.file={{ project_dir() }}/.local/derby.log + -Dderby.system.home={{ 
project_dir() }}/.local/derby_home dev_databricks_iceberg: engine: databricks_spark diff --git a/examples/macros_demo/profiles.yml b/examples/macros_demo/profiles.yml index 495c075..07a9a43 100644 --- a/examples/macros_demo/profiles.yml +++ b/examples/macros_demo/profiles.yml @@ -16,10 +16,8 @@ dev_databricks: app_name: "{{ env('FF_SPARK_APP_NAME', 'macros_demo') }}" warehouse_dir: "{{ project_dir() }}/.local/spark_warehouse" extra_conf: - spark.hadoop.javax.jdo.option.ConnectionURL: "jdbc:derby:{{ project_dir() }}/.local/metastore_db;create=true" - spark.hadoop.datanucleus.rdbms.datastoreAdapterClassName: "org.datanucleus.store.rdbms.adapter.DerbyAdapter" - spark.hadoop.datanucleus.schema.autoCreateAll: "true" - spark.hadoop.javax.jdo.option.ConnectionDriverName: "org.apache.derby.jdbc.EmbeddedDriver" + spark.sql.statistics.fallBackToHdfs: true + spark.sql.shuffle.partitions: "{{ env('SPARK_SQL_SHUFFLE_PARTITIONS', '8') }}" spark.driver.extraJavaOptions: > -Dderby.stream.error.file={{ project_dir() }}/.local/derby.log -Dderby.system.home={{ project_dir() }}/.local/derby_home diff --git a/examples/materializations_demo/profiles.yml b/examples/materializations_demo/profiles.yml index 8e51089..2ab4c96 100644 --- a/examples/materializations_demo/profiles.yml +++ b/examples/materializations_demo/profiles.yml @@ -18,10 +18,8 @@ dev_databricks: app_name: "{{ env('FF_SPARK_APP_NAME', 'materializations_demo') }}" warehouse_dir: "{{ project_dir() }}/.local/spark_warehouse" extra_conf: - spark.hadoop.javax.jdo.option.ConnectionURL: "jdbc:derby:{{ project_dir() }}/.local/metastore_db;create=true" - spark.hadoop.datanucleus.rdbms.datastoreAdapterClassName: "org.datanucleus.store.rdbms.adapter.DerbyAdapter" - spark.hadoop.datanucleus.schema.autoCreateAll: "true" - spark.hadoop.javax.jdo.option.ConnectionDriverName: "org.apache.derby.jdbc.EmbeddedDriver" + spark.sql.statistics.fallBackToHdfs: true + spark.sql.shuffle.partitions: "{{ env('SPARK_SQL_SHUFFLE_PARTITIONS', '8') }}" 
spark.driver.extraJavaOptions: > -Dderby.stream.error.file={{ project_dir() }}/.local/derby.log -Dderby.system.home={{ project_dir() }}/.local/derby_home diff --git a/examples/snapshot_demo/profiles.yml b/examples/snapshot_demo/profiles.yml index 56fe2e1..bdd1527 100644 --- a/examples/snapshot_demo/profiles.yml +++ b/examples/snapshot_demo/profiles.yml @@ -19,10 +19,8 @@ dev_databricks_parquet: &snapshot_demo_databricks_parquet warehouse_dir: "{{ project_dir() }}/.local/spark_warehouse_parquet" database: "{{ env('FF_DBR_DATABASE', 'snapshot_demo_parquet') }}" extra_conf: - spark.hadoop.javax.jdo.option.ConnectionURL: "jdbc:derby:{{ project_dir() }}/.local/metastore_db;create=true" - spark.hadoop.datanucleus.rdbms.datastoreAdapterClassName: "org.datanucleus.store.rdbms.adapter.DerbyAdapter" - spark.hadoop.datanucleus.schema.autoCreateAll: "true" - spark.hadoop.javax.jdo.option.ConnectionDriverName: "org.apache.derby.jdbc.EmbeddedDriver" + spark.sql.statistics.fallBackToHdfs: true + spark.sql.shuffle.partitions: "{{ env('SPARK_SQL_SHUFFLE_PARTITIONS', '8') }}" spark.driver.extraJavaOptions: > -Dderby.stream.error.file={{ project_dir() }}/.local/derby.log -Dderby.system.home={{ project_dir() }}/.local/derby_home @@ -35,11 +33,11 @@ dev_databricks_delta: warehouse_dir: "{{ project_dir() }}/.local/spark_warehouse_delta" database: "{{ env('FF_DBR_DATABASE', 'snapshot_demo_delta') }}" extra_conf: - spark.hadoop.javax.jdo.option.ConnectionURL: "jdbc:derby:{{ project_dir() }}/.local/metastore_db;create=true" - spark.hadoop.datanucleus.rdbms.datastoreAdapterClassName: "org.datanucleus.store.rdbms.adapter.DerbyAdapter" - spark.hadoop.datanucleus.schema.autoCreateAll: "true" - spark.hadoop.javax.jdo.option.ConnectionDriverName: "org.apache.derby.jdbc.EmbeddedDriver" - spark.driver.extraJavaOptions: "-Dderby.stream.error.file={{ project_dir() }}/.local/derby.log" + spark.sql.statistics.fallBackToHdfs: true + spark.sql.shuffle.partitions: "{{ env('SPARK_SQL_SHUFFLE_PARTITIONS', 
'8') }}" + spark.driver.extraJavaOptions: > + -Dderby.stream.error.file={{ project_dir() }}/.local/derby.log + -Dderby.system.home={{ project_dir() }}/.local/derby_home dev_databricks_iceberg: engine: databricks_spark diff --git a/examples_article/building_locally_demo/models/staging/stg_users.ff.sql b/examples_article/building_locally_demo/models/staging/stg_users.ff.sql index 412d37d..42274eb 100644 --- a/examples_article/building_locally_demo/models/staging/stg_users.ff.sql +++ b/examples_article/building_locally_demo/models/staging/stg_users.ff.sql @@ -1,4 +1,6 @@ -{{ config(materialized='table') }} +{{ config( + materialized='table' +) }} select id, diff --git a/examples_article/building_locally_demo/profiles.yml b/examples_article/building_locally_demo/profiles.yml index e0ef1f0..5ab051c 100644 --- a/examples_article/building_locally_demo/profiles.yml +++ b/examples_article/building_locally_demo/profiles.yml @@ -8,7 +8,9 @@ dev_duckdb: prod_bigquery: engine: bigquery bigquery: - project: "{{ env('FF_BQ_PROJECT') }}" + project: "basic_data" dataset: "production_marts" + location: "EU" # FFT handles the client (BigFrames or Pandas) automatically - use_bigframes: true \ No newline at end of file + use_bigframes: true + allow_create_dataset: true \ No newline at end of file diff --git a/examples_article/building_locally_demo/project.yml b/examples_article/building_locally_demo/project.yml index c16dbca..2987f25 100644 --- a/examples_article/building_locally_demo/project.yml +++ b/examples_article/building_locally_demo/project.yml @@ -15,4 +15,13 @@ docs: vars: {} # Declare project-wide data quality checks under `tests`. See docs/Data_Quality_Tests.md. 
-tests: [] +tests: + - type: not_null + table: stg_users + column: id + tags: [example:building_locally_demo] + - type: row_count_between + table: stg_users + min_rows: 2 + max_rows: 2 + tags: [example:building_locally_demo] diff --git a/examples_article/building_locally_demo/seeds/users.csv b/examples_article/building_locally_demo/seeds/seed_users.csv similarity index 100% rename from examples_article/building_locally_demo/seeds/users.csv rename to examples_article/building_locally_demo/seeds/seed_users.csv diff --git a/examples_article/building_locally_demo/sources.yml b/examples_article/building_locally_demo/sources.yml index 83436dc..7ef2d9f 100644 --- a/examples_article/building_locally_demo/sources.yml +++ b/examples_article/building_locally_demo/sources.yml @@ -1,9 +1,7 @@ -# Source declarations describe external tables. See docs/Sources.md for details. version: 2 -# sources: - # Example: - # - name: raw - # schema: staging - # tables: - # - name: users - # identifier: seed_users + +sources: + - name: raw + tables: + - name: users + identifier: seed_users diff --git a/examples_article/building_locally_demo/tests/unit/stg_users.yml b/examples_article/building_locally_demo/tests/unit/stg_users.yml new file mode 100644 index 0000000..c1ebcf9 --- /dev/null +++ b/examples_article/building_locally_demo/tests/unit/stg_users.yml @@ -0,0 +1,13 @@ +model: stg_users + +cases: + - name: lowercase_email + inputs: + seed_users: + rows: + - {id: 1, email: "a@EXAMPLE.com", signup_date: "2023-01-01"} + - {id: 2, email: "B@gmail.com", signup_date: "2023-01-02"} + expect: + rows: + - {id: 1, email: "a@example.com", signup_date: "2023-01-01"} + - {id: 2, email: "b@gmail.com", signup_date: "2023-01-02"} diff --git a/mkdocs.yml b/mkdocs.yml index c3cd004..b716da8 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -40,6 +40,7 @@ nav: - Snapshots: Snapshots.md - CI Checks & Change-Aware Runs: CI_Check.md - Cost Monitoring: Cost_Monitoring.md + - Hooks: Hooks.md - Troubleshooting: 
Troubleshooting.md - Examples: - Basic Demo: examples/Basic_Demo.md @@ -52,6 +53,7 @@ nav: - API Demo: examples/API_Demo.md - Local Engine Setup: examples/Local_Engine_Setup.md - Snapshot Demo: examples/Snapshot_Demo.md + - Hooks Demo: examples/Hooks_Demo.md - API Reference: reference/ - Contributing: Contributing.md - License: License.md diff --git a/pyproject.toml b/pyproject.toml index 1df700f..e8919f7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "fastflowtransform" -version = "0.6.8" +version = "0.6.9" description = "Python framework for SQL & Python data transformation, ETL pipelines, and dbt-style data modeling" readme = "README.md" license = { text = "Apache-2.0" } diff --git a/src/fastflowtransform/cli/run.py b/src/fastflowtransform/cli/run.py index 55fe3cd..d573c3b 100644 --- a/src/fastflowtransform/cli/run.py +++ b/src/fastflowtransform/cli/run.py @@ -5,12 +5,13 @@ import textwrap import threading import traceback -from collections.abc import Callable +from collections.abc import Callable, Iterable, Mapping, Sequence from contextlib import suppress from dataclasses import dataclass, field from datetime import UTC, datetime from pathlib import Path -from typing import Any, cast +from typing import Any, Literal, cast +from uuid import uuid4 import typer @@ -56,8 +57,10 @@ BudgetsConfig, load_budgets_config, ) -from fastflowtransform.core import REGISTRY, relation_for +from fastflowtransform.config.project import HookSpec +from fastflowtransform.core import REGISTRY, Node, relation_for from fastflowtransform.dag import levels as dag_levels +from fastflowtransform.executors.base import BaseExecutor from fastflowtransform.executors.budget import format_bytes from fastflowtransform.fingerprint import ( EnvCtx, @@ -66,13 +69,55 @@ fingerprint_sql, get_function_source, ) +from fastflowtransform.hooks.registry import load_project_hooks, resolve_hook +from fastflowtransform.hooks.types import ( 
+ HookContext, + ModelContext, + ModelStatsContext, + RunContext, + RunStatsContext, +) from fastflowtransform.log_queue import LogQueue -from fastflowtransform.logging import bind_context, bound_context, clear_context, echo, error, warn +from fastflowtransform.logging import ( + bind_context, + bound_context, + clear_context, + echo, + echo_debug, + error, + warn, +) from fastflowtransform.meta import ensure_meta_table from fastflowtransform.run_executor import ScheduleResult, schedule from fastflowtransform.utils.timefmt import _format_duration_ms +class _HookThis: + """ + Lightweight proxy for {{ this }} in hooks: + + - str(this) -> relation name (without .ff) + - this.name / this.relation + - this.materialized (table|view|incremental|...) + """ + + def __init__(self, relation: str, materialized: str): + self.name = relation + self.relation = relation + self.materialized = materialized + self.schema = None + self.database = None + + def __str__(self) -> str: + return self.name + + def __repr__(self) -> str: + return f"_HookThis(name={self.name!r})" + + +HookWhen = Literal["on_run_start", "on_run_end", "before_model", "after_model"] + + @dataclass class _RunEngine: ctx: Any @@ -93,6 +138,10 @@ class _RunEngine: query_stats: dict[str, dict[str, int]] = field(default_factory=dict, init=False) stats_lock: threading.Lock = field(default_factory=threading.Lock, init=False) + # run metadata for hooks + invocation_id: str | None = None + run_started_at: str | None = None + def __post_init__(self) -> None: echo(f"Profile: {self.env_name} | Engine: {self.ctx.profile.engine}") self.shared = self.ctx.make_executor() @@ -260,75 +309,248 @@ def run_node(self, name: str) -> None: node = REGISTRY.nodes[name] ex, run_sql_fn, run_py_fn = self._get_runner() + self._reset_executor_node_stats(ex) + + meta = getattr(node, "meta", {}) or {} + + pre_hooks, post_hooks = self._get_model_hooks(node) + + # --- force rebuild path ------------------------------------------------- + if name 
in self.force_rebuild: + self._run_node_force_rebuild( + name=name, + node=node, + ex=ex, + run_sql_fn=run_sql_fn, + run_py_fn=run_py_fn, + pre_hooks=pre_hooks, + post_hooks=post_hooks, + ) + return + + # --- fingerprint + cache skip path -------------------------------------- + cand_fp = self._maybe_fingerprint(node, ex) + if self._should_skip_node( + name=name, + node=node, + ex=ex, + cand_fp=cand_fp, + meta=meta, + ): + return + + # --- normal run --------------------------------------------------------- + self._run_model_with_hooks( + name=name, + node=node, + ex=ex, + run_sql_fn=run_sql_fn, + run_py_fn=run_py_fn, + pre_hooks=pre_hooks, + post_hooks=post_hooks, + ) + + self._finalize_node_run( + name=name, + node=node, + ex=ex, + cand_fp=cand_fp, + ) + + # --------------------------------------------------------------------------- + # Helpers for run_node + # --------------------------------------------------------------------------- + + def _reset_executor_node_stats(self, ex: BaseExecutor) -> None: # Reset per-node stats if the executor supports it with suppress(Exception): reset = getattr(ex, "reset_node_stats", None) if callable(reset): reset() - if name in self.force_rebuild: - (run_sql_fn if node.kind == "sql" else run_py_fn)(node) - cand_fp = self._maybe_fingerprint(node, ex) - if cand_fp: - with self.fps_lock: - self.computed_fps[name] = cand_fp - with suppress(Exception): - ex.on_node_built(node, relation_for(name), cand_fp) - # HTTP snapshot (stored in node.meta by the executor) - with suppress(Exception): - snap = (getattr(node, "meta", {}) or {}).get("_http_snapshot") - if snap: - self.http_snaps[name] = snap - - # capture per-node stats after successful run - with suppress(Exception): - raw_getter = getattr(ex, "get_node_stats", None) - if callable(raw_getter): - getter = cast(Callable[[], dict[str, int] | None], raw_getter) - stats = getter() - if stats: - with self.stats_lock: - self.query_stats[name] = stats + def _get_model_hooks( + self, + 
node: Node, + ) -> tuple[Sequence[HookSpec] | None, Sequence[HookSpec] | None]: + pre_hooks = self._model_hooks_for_when("before_model", node) + post_hooks = self._model_hooks_for_when("after_model", node) + return pre_hooks, post_hooks + + def _run_model_with_hooks( + self, + *, + name: str, + node: Node, + ex: BaseExecutor, + run_sql_fn: Callable[[Node], None], + run_py_fn: Callable[[Node], None], + pre_hooks: Sequence[HookSpec] | None, + post_hooks: Sequence[HookSpec] | None, + ) -> ModelStatsContext | None: + # pre-hook + self._run_hooks(pre_hooks, node=node, when="before_model", ex=ex) + + # actual model + (run_sql_fn if node.kind == "sql" else run_py_fn)(node) - return + # capture per-node stats *now* so after_model hooks can use them + model_stats = self._collect_model_stats_for_hooks(ex=ex, name=name) + + # post-hook + self._run_hooks( + post_hooks, + node=node, + when="after_model", + ex=ex, + model_stats=model_stats, + ) + return model_stats + + def _collect_model_stats_for_hooks( + self, + *, + ex: BaseExecutor, + name: str, + ) -> ModelStatsContext | None: + model_stats: ModelStatsContext | None = None + with suppress(Exception): + raw_getter = getattr(ex, "get_node_stats", None) + if callable(raw_getter): + getter = cast(Callable[[], dict[str, int] | None], raw_getter) + s = getter() + if s: + rows = int(s.get("rows", 0) or 0) + bytes_scanned = int(s.get("bytes_scanned", 0) or 0) + query_duration_ms = int(s.get("query_duration_ms", 0) or 0) + + model_stats = ModelStatsContext( + rows=rows, + bytes_scanned=bytes_scanned, + query_duration_ms=query_duration_ms, + ) + + # keep a plain dict[str, int] copy for budgets / run_results + with self.stats_lock: + self.query_stats[name] = { + "rows": rows, + "bytes_scanned": bytes_scanned, + "query_duration_ms": query_duration_ms, + } + + return model_stats + + def _run_node_force_rebuild( + self, + *, + name: str, + node: Node, + ex: BaseExecutor, + run_sql_fn: Callable[[Node], None], + run_py_fn: Callable[[Node], 
None], + pre_hooks: Sequence[HookSpec] | None, + post_hooks: Sequence[HookSpec] | None, + ) -> None: + # Run model (with hooks + early stats) + self._run_model_with_hooks( + name=name, + node=node, + ex=ex, + run_sql_fn=run_sql_fn, + run_py_fn=run_py_fn, + pre_hooks=pre_hooks, + post_hooks=post_hooks, + ) + + # fingerprint + on_node_built cand_fp = self._maybe_fingerprint(node, ex) - if cand_fp is not None: - materialized = (getattr(node, "meta", {}) or {}).get("materialized", "table") - may_skip = self.cache_mode in (CacheMode.RW, CacheMode.RO) - if may_skip and can_skip_node( - node_name=name, - new_fp=cand_fp, - cache=self.cache, - executor=ex, - materialized=materialized, - ): - with self.fps_lock: - self.computed_fps[name] = cand_fp - # Even when skipped, we treat stats as zero → nothing to do - with self.stats_lock: - self.query_stats[name] = { - "bytes_scanned": 0, - "rows": 0, - "query_duration_ms": 0, - "cached": True, - } - return + if cand_fp: + self._store_fingerprint(name, cand_fp) + self._notify_executor_node_built(node=node, ex=ex, name=name, cand_fp=cand_fp) - (run_sql_fn if node.kind == "sql" else run_py_fn)(node) + # HTTP snapshot + self._capture_http_snapshot(node=node, name=name) + # capture per-node stats after successful run + self._capture_final_stats(ex=ex, name=name) + + def _should_skip_node( + self, + *, + name: str, + node: Node, + ex: BaseExecutor, + cand_fp: str | None, + meta: dict[str, Any] | None, + ) -> bool: + if cand_fp is None: + return False + + materialized = (meta or {}).get("materialized", "table") + may_skip = self.cache_mode in (CacheMode.RW, CacheMode.RO) + if not may_skip: + return False + + if not can_skip_node( + node_name=name, + new_fp=cand_fp, + cache=self.cache, + executor=ex, + materialized=materialized, + ): + return False + + # we're skipping: still record fingerprint + zero stats + self._store_fingerprint(name, cand_fp) + with self.stats_lock: + self.query_stats[name] = { + "bytes_scanned": 0, + "rows": 0, + 
"query_duration_ms": 0, + "cached": True, + } + return True + + def _finalize_node_run( + self, + *, + name: str, + node: Node, + ex: BaseExecutor, + cand_fp: str | None, + ) -> None: if cand_fp is not None: - with self.fps_lock: - self.computed_fps[name] = cand_fp - with suppress(Exception): - ex.on_node_built(node, relation_for(name), cand_fp) + self._store_fingerprint(name, cand_fp) + self._notify_executor_node_built(node=node, ex=ex, name=name, cand_fp=cand_fp) + # HTTP snapshot (stored in node.meta by the executor) + self._capture_http_snapshot(node=node, name=name) + + # capture per-node stats after successful run + self._capture_final_stats(ex=ex, name=name) + + def _store_fingerprint(self, name: str, cand_fp: str) -> None: + with self.fps_lock: + self.computed_fps[name] = cand_fp + + def _notify_executor_node_built( + self, + *, + node: Node, + ex: BaseExecutor, + name: str, + cand_fp: str, + ) -> None: + with suppress(Exception): + ex.on_node_built(node, relation_for(name), cand_fp) + + def _capture_http_snapshot(self, *, node: Node, name: str) -> None: with suppress(Exception): snap = (getattr(node, "meta", {}) or {}).get("_http_snapshot") if snap: self.http_snaps[name] = snap - # capture per-node stats after successful run + def _capture_final_stats(self, *, ex: BaseExecutor, name: str) -> None: with suppress(Exception): raw_getter = getattr(ex, "get_node_stats", None) if callable(raw_getter): @@ -338,6 +560,559 @@ def run_node(self, name: str) -> None: with self.stats_lock: self.query_stats[name] = stats + # ---------- Hook helpers ---------- + + @staticmethod + def _normalize_hooks(hooks_raw: Any) -> list[str]: + """ + Accept str | list[str] | None and return a clean list[str]. 
+ """ + if hooks_raw is None: + return [] + if isinstance(hooks_raw, str): + text = hooks_raw.strip() + return [text] if text else [] + if isinstance(hooks_raw, Iterable) and not isinstance(hooks_raw, (str, bytes, Mapping)): + out: list[str] = [] + for item in hooks_raw: + if item is None: + continue + s = str(item).strip() + if s: + out.append(s) + return out + # Be permissive: anything else → single string repr + s = str(hooks_raw).strip() + return [s] if s else [] + + def _render_hook_sql(self, template_text: str, node: Any | None, ex: Any) -> str: + """ + Render a single hook expression into SQL using the project's Jinja env. + """ + env = self.ctx.jinja_env + tmpl = env.from_string(template_text) + + run_started = self.run_started_at + inv_id = self.invocation_id + + this_obj = None + target = None + + if node is not None: + meta = getattr(node, "meta", {}) or {} + relation = relation_for(node.name) + mat = str(meta.get("materialized") or "table") + this_obj = _HookThis(relation, mat) + target = self._qualified_target(node.name) or relation + + def _hook_ref(name: str) -> str: + # Use executor's resolution if available + try: + return ex._format_relation_for_ref(name) + except Exception: + return relation_for(name) + + def _hook_source(source_name: str, table_name: str) -> str: + try: + return ex._resolve_source(source_name, table_name) + except Exception as exc: + raise KeyError( + f"Error resolving source('{source_name}', '{table_name}') in hook: {exc}" + ) from exc + + return tmpl.render( + # hook-specific context + this=this_obj, + target=target, + run_started_at=run_started, + invocation_id=inv_id, + # resolution helpers + ref=_hook_ref, + source=_hook_source, + ) + + def _execute_hook_sql(self, sql: str, ex: Any) -> None: + """ + Execute one or more SQL statements for a hook. 
+ + We normalize away semicolons in full-line comments so that naive + ';'-based splitters in executors don't produce bogus statements like: + + "-- comment; with semicolon" -> ["-- comment", " with semicolon"] + """ + if not sql or not sql.strip(): + return + + # Trim outer whitespace but preserve inner newlines + sql = sql.strip() + + # --- Normalize comment lines: drop ';' inside "-- ..." lines ----------- + cleaned_lines: list[str] = [] + for line in sql.splitlines(): + stripped = line.lstrip() + if stripped.startswith("--"): + # Keep comment but remove semicolons so ';' splitters don't see them + prefix_len = len(line) - len(stripped) + comment = stripped.replace(";", "") + cleaned_lines.append(" " * prefix_len + comment) + else: + cleaned_lines.append(line) + + normalized_sql = "\n".join(cleaned_lines) + + # --- Delegate to executor if it has a hook-aware API ------------------- + # Let the executor decide how to split multi-statement SQL + ex.execute_hook_sql(normalized_sql) + return + + def _load_sql_hook_body(self, spec: HookSpec) -> str: + """ + Resolve the SQL text for a SQL hook. + + Resolution order: + 1) Inline `spec.sql` + 2) Any `/hooks/**/*.sql` whose *stem* matches `spec.name`. + (first match wins) + """ + # 1) Inline SQL wins if present + if spec.sql and spec.sql.strip(): + return spec.sql + + name = spec.name + if name is None: + raise ValueError("SQL HookSpec must have a non-empty 'name' for file-based hooks") + + project_dir = Path(self.ctx.project) + hooks_dir = project_dir / "hooks" + + if not hooks_dir.is_dir(): + raise FileNotFoundError( + f"SQL hook {spec.name!r} has no inline `sql` and no 'hooks/' directory exists " + f"under project: {project_dir}" + ) + + # 2) Build (or reuse) cache: stem -> Path + cache_attr = "_sql_hook_files" + mapping: dict[str, Path] + + if hasattr(self, cache_attr): + mapping = getattr(self, cache_attr) + else: + mapping = {} + for _path in hooks_dir.rglob("*.sql"): + # stem is the filename without suffix, e.g. 
"audit_run_end" + stem = _path.stem + # last-one-wins is fine; or keep first with `if stem not in mapping:` + mapping[stem] = _path + setattr(self, cache_attr, mapping) + + path: Path | None = mapping.get(name) + if not path: + raise FileNotFoundError( + f"SQL hook {name!r} has no inline `sql` and no matching " + f"file '/hooks/**/{name}.sql' was found." + ) + + return path.read_text(encoding="utf-8") + + def _model_hooks_for_when(self, when: HookWhen, node: Node) -> list[HookSpec]: + """ + Return all model-level HookSpecs from project.yml that apply to this node + for the given lifecycle event (before_model / after_model), based on + their `select` expression. + + Global run-level hooks (on_run_start / on_run_end) are handled elsewhere. + """ + if when == "before_model": + all_specs: Sequence[HookSpec] = getattr(REGISTRY, "before_model_hooks", []) or [] + elif when == "after_model": + all_specs = getattr(REGISTRY, "after_model_hooks", []) or [] + else: + # Only model-level lifecycles are handled here + return [] + + applicable: list[HookSpec] = [] + + for spec in all_specs: + sel = (spec.select or "").strip() + if not sel: + # No selector → applies to all models + applicable.append(spec) + continue + + try: + # Reuse the same selector compiler as the CLI + tokens = _parse_select([sel]) + _, pred = _compile_selector(tokens) + if pred(node): + applicable.append(spec) + except Exception as exc: + warn( + f"[hooks] invalid select={sel!r} for hook {spec.name!r} " + f"on when={when}: {exc}; skipping" + ) + + return applicable + + def _hook_matches_current_env(self, spec: HookSpec) -> bool: + """ + Decide whether a hook should run in the current engine/env. + + - If spec.engines is set, the active engine must be in that list. + - If spec.envs is set, the active env_name must be in that list. + - If a field is None/empty, it does not restrict execution. 
+ """ + engine_name = (self.ctx.profile.engine or "").lower() + env_name = self.env_name + + # engines filter + if spec.engines: + allowed_engines = [e.lower() for e in spec.engines if isinstance(e, str)] + if engine_name not in allowed_engines: + return False + + # envs filter + return not (spec.envs and env_name not in spec.envs) + + def _run_hooks( + self, + hooks_raw: Sequence[HookSpec] | None, + node: Node | None, + when: HookWhen, + ex: BaseExecutor, + *, + run_status: str | None = None, + run_stats: RunStatsContext | None = None, + model_stats: ModelStatsContext | None = None, + ) -> None: + """ + Execute a list of hooks. + + New model: + - Only HookSpec objects (no string hooks). + - kind: "sql" or "python". + - Python hooks are resolved from the decorator registry + by (when, spec.name) and receive a single `context` dict. + """ + self._validate_hook_when(when) + + label = node.name if node is not None else "" + + if not hooks_raw: + echo(f"[hooks] when={when} node={label}: no hooks registered") + return + + jenv = self.ctx.jinja_env + + # Ensure all hooks/*.py are loaded and their @fft_hook decorators executed + project_dir = str(self.ctx.project) + load_project_hooks(project_dir) + + run_ctx_py = self._build_run_context(run_status=run_status, run_stats=run_stats) + model_ctx = self._build_model_context( + node=node, + model_stats=model_stats, + when=when, + ) + env_vars = self._snapshot_env_vars() + + active_specs = self._filter_active_specs(hooks_raw) + if not active_specs: + echo(f"[hooks] when={when} node={label}: no hooks after engine/env filtering") + return + + self._log_active_specs(active_specs, when, label) + + for idx, spec in enumerate(active_specs, start=1): + self._execute_single_hook( + spec=spec, + idx=idx, + when=when, + node_label=label, + run_ctx_py=run_ctx_py, + model_ctx=model_ctx, + env_vars=env_vars, + run_stats=run_stats, + model_stats=model_stats, + ex=ex, + jenv=jenv, + ) + + # ----------------- helpers for _run_hook 
----------------- + + def _validate_hook_when(self, when: HookWhen) -> None: + allowed = ("on_run_start", "on_run_end", "before_model", "after_model") + if when not in allowed: + raise ValueError(f"Unsupported hook 'when' value: {when!r}") + + def _build_run_context( + self, + *, + run_status: str | None, + run_stats: RunStatsContext | None, + ) -> RunContext: + row_count: int | None = None + if run_stats is not None: + rc = run_stats.get("rows_total") + if rc is not None: + row_count = int(rc) + + return { + "run_id": str(self.invocation_id), + "env_name": self.env_name, + "engine_name": (self.ctx.profile.engine or "").lower(), + "started_at": str(self.run_started_at), + "status": run_status, + "row_count": row_count, + "error": None, + } + + def _build_model_context( + self, + *, + node: Node | None, + model_stats: ModelStatsContext | None, + when: HookWhen, + ) -> ModelContext | None: + if node is None: + return None + + meta = getattr(node, "meta", {}) or {} + + raw_tags = meta.get("tags") + if isinstance(raw_tags, (list, tuple, set)): + tags_list = [str(t) for t in raw_tags] + elif isinstance(raw_tags, str): + tags_list = [raw_tags] + else: + tags_list = [] + + model_ctx = cast( + ModelContext, + { + "name": str(node.name), + "path": node.path, + "tags": sorted(tags_list), + "meta": meta, + "status": None, + "rows_affected": None, + "elapsed_ms": None, + "error": None, + }, + ) + + if model_stats is not None: + model_ctx["rows_affected"] = model_stats.get("rows") + model_ctx["elapsed_ms"] = model_stats.get("query_duration_ms") + if when == "after_model" and model_ctx.get("status") is None: + model_ctx["status"] = "success" + + return model_ctx + + def _snapshot_env_vars(self) -> dict[str, str]: + # Snapshot env vars as a plain dict[str, str] + return dict(getattr(self.env_ctx, "env_vars", {}) or {}) + + def _filter_active_specs( + self, + hooks_raw: Sequence[HookSpec] | None, + ) -> list[HookSpec]: + if not hooks_raw: + return [] + return [s for s in hooks_raw 
if self._hook_matches_current_env(s)] + + def _log_active_specs( + self, + active_specs: Sequence[HookSpec], + when: HookWhen, + label: str, + ) -> None: + summary = ", ".join(f"{spec.kind}:{(spec.name or '')}" for spec in active_specs) + echo(f"[hooks] when={when} node={label}: executing {len(active_specs)} hook(s): {summary}") + + def _execute_single_hook( + self, + *, + spec: HookSpec, + idx: int, + when: HookWhen, + node_label: str, + run_ctx_py: RunContext, + model_ctx: ModelContext | None, + env_vars: dict[str, str], + run_stats: RunStatsContext | None, + model_stats: ModelStatsContext | None, + ex: BaseExecutor, + jenv: Any, + ) -> None: + hook_name = spec.name or "" + + try: + if not isinstance(spec, HookSpec): + raise TypeError( + f"Hooks must be HookSpec instances; got {type(spec)!r} at index {idx}" + ) + + # Engine/env filter (kept for safety even after pre-filtering) + if not self._hook_matches_current_env(spec): + echo_debug( + f"[hooks] when={when} node={node_label} hook#{idx} " + f"name={hook_name!r} - skipped (engine/env mismatch)" + ) + return + + if spec.kind == "sql": + self._execute_sql_hook( + spec=spec, + idx=idx, + when=when, + node_label=node_label, + run_ctx_py=run_ctx_py, + model_ctx=model_ctx, + ex=ex, + jenv=jenv, + ) + return + + if spec.kind == "python": + self._execute_python_hook( + spec=spec, + idx=idx, + when=when, + node_label=node_label, + run_ctx_py=run_ctx_py, + model_ctx=model_ctx, + env_vars=env_vars, + run_stats=run_stats, + model_stats=model_stats, + ) + return + + raise ValueError(f"Unknown hook kind {spec.kind!r} for hook #{idx}") + + except Exception as exc: + error( + f"[hooks] ERROR when={when} node={node_label} hook#{idx} " + f"kind={spec.kind!r} name={(spec.name or '')!r}: {exc}" + ) + raise RuntimeError( + f"Failed to execute {when} hook #{idx} for {node_label}: {exc}" + ) from exc + + def _execute_sql_hook( + self, + *, + spec: HookSpec, + idx: int, + when: HookWhen, + node_label: str, + run_ctx_py: RunContext, + 
model_ctx: ModelContext | None, + ex: BaseExecutor, + jenv: Any, + ) -> None: + hook_name = spec.name or "" + + echo_debug( + f"[hooks] when={when} node={node_label} hook#{idx} " + f"kind=sql name={hook_name!r} - rendering SQL" + ) + + sql_body = self._load_sql_hook_body(spec) + sql_tmpl = sql_body.strip() + if not sql_tmpl: + warn( + f"[hooks] when={when} node={node_label} hook#{idx} " + f"name={hook_name!r} has empty SQL, skipping" + ) + return + + tmpl = jenv.from_string(sql_tmpl) + + run_ctx_sql = dict(run_ctx_py) + + if model_ctx is None: + model_ctx_render: dict[str, Any] | None = None + else: + model_ctx_render = dict(model_ctx) + if when in ("before_model", "after_model"): + model_ctx_render.setdefault( + "status", + "running" if when == "before_model" else "success", + ) + model_ctx_render.setdefault("rows_affected", None) + model_ctx_render.setdefault("elapsed_ms", None) + model_ctx_render.setdefault("error", None) + + sql = tmpl.render( + run=run_ctx_sql, + model=model_ctx_render, + node=model_ctx_render, + ) + + if not sql.strip(): + warn( + f"[hooks] when={when} node={node_label} hook#{idx} " + f"name={hook_name!r} rendered empty SQL, skipping" + ) + return + + echo_debug( + f"[hooks] when={when} node={node_label} hook#{idx} " + f"name={hook_name!r} executing SQL:\n{sql}" + ) + self._execute_hook_sql(sql, ex) + + def _execute_python_hook( + self, + *, + spec: HookSpec, + idx: int, + when: HookWhen, + node_label: str, + run_ctx_py: RunContext, + model_ctx: ModelContext | None, + env_vars: dict[str, str], + run_stats: RunStatsContext | None, + model_stats: ModelStatsContext | None, + ) -> None: + if not spec.name: + raise ValueError( + "Python HookSpec must have a 'name' set; " + "this is used to resolve the hook from the registry." 
+ ) + + hook_name = spec.name + + echo( + f"[hooks] when={when} node={node_label} hook#{idx} " + f"kind=python name={hook_name!r} - resolving from registry" + ) + + fn = resolve_hook(when=when, name=spec.name) + + context: HookContext = { + "when": when, + "run": run_ctx_py, + "model": model_ctx, + "env": env_vars, + } + + if run_stats is not None: + context["run_stats"] = run_stats + if model_stats is not None: + context["model_stats"] = model_stats + + if spec.params: + context["params"] = dict(spec.params) + + echo( + f"[hooks] when={when} node={node_label} hook#{idx} " + f"name={hook_name!r} - invoking python hook" + ) + + fn(context) + @staticmethod def before(_name: str, lvl_idx: int | None = None) -> None: return @@ -863,7 +1638,9 @@ def _levels_for_run(explicit_targets: list[str], wanted: set[str]) -> list[list[ return dag_levels(sub_nodes) -def _run_schedule(engine_, lvls, jobs, keep_going, ctx): +def _run_schedule( + engine_: _RunEngine, lvls: list[list[str]], jobs: int | str, keep_going: bool, ctx: CLIContext +) -> tuple[ScheduleResult, LogQueue, str, str]: logq = LogQueue() started_at = datetime.now(UTC).isoformat(timespec="seconds") @@ -967,6 +1744,41 @@ def _emit_logs_and_errors(logq: LogQueue, result: ScheduleResult, engine_: _RunE # ----------------- run function ----------------- +def _run_global_hooks( + engine_: _RunEngine, + when: HookWhen, + *, + run_status: str | None = None, + run_stats: RunStatsContext | None = None, +) -> None: + """ + Execute project-level hooks.on_run_start / hooks.on_run_end. 
+ """ + if when not in ("on_run_start", "on_run_end"): + # Safety: global hooks are only defined for these two events + return + + if when == "on_run_start": + hooks = getattr(REGISTRY, "on_run_start_hooks", []) or [] + elif when == "on_run_end": + hooks = getattr(REGISTRY, "on_run_end_hooks", []) or [] + else: + return + + if not hooks: + return + + ex, _, _ = engine_._get_runner() + engine_._run_hooks( + hooks, + node=None, + when=when, + ex=ex, + run_status=run_status, + run_stats=run_stats, + ) + + def run( project: ProjectArg = ".", env_name: EnvOpt = "dev", @@ -993,7 +1805,24 @@ def run( ) ctx, engine_ = _build_engine_ctx(project, env_name, engine, vars, cache, no_cache) - bind_context(engine=ctx.profile.engine, env=env_name) + + # Run metadata for hooks + engine_.invocation_id = uuid4().hex + engine_.run_started_at = datetime.now(UTC).isoformat(timespec="seconds") + + bind_context( + engine=ctx.profile.engine, + env=env_name, + run_id=engine_.run_started_at, + invocation_id=engine_.invocation_id, + ) + + # Load python hooks from hooks/ directory + project_dir = ctx.project + load_project_hooks(project_dir) + + # Global on_run_start hooks + _run_global_hooks(engine_, when="on_run_start") select_tokens, _, raw_selected = _select_predicate_and_raw(engine_, ctx, select) wanted = _wanted_names(select_tokens=select_tokens, exclude=exclude, raw_selected=raw_selected) @@ -1022,6 +1851,31 @@ def run( _attempt_catalog(ctx) _emit_logs_and_errors(logq, result, engine_) + # Compute aggregated row + time totals for hooks + totals = _aggregate_totals(getattr(engine_, "query_stats", {}) or {}) + rows_total = totals.get("rows", 0) + elapsed_ms_total = totals.get("query_duration_ms", 0) + + # Global on_run_end hooks (only reached if no model raised fatal error inside schedule) + has_failures = bool(result.failed) + run_status = "error" if has_failures or budget_error else "success" + + run_stats: RunStatsContext = { + "models_built": len(result.per_node_s) - 
len(result.failed), + "models_failed": len(result.failed), + "models_skipped": 0, + "rows_total": rows_total, + "elapsed_ms_total": elapsed_ms_total, + "run_status": run_status, + } + + _run_global_hooks( + engine_, + when="on_run_end", + run_status=run_status, + run_stats=run_stats, + ) + if result.failed or budget_error: raise typer.Exit(1) diff --git a/src/fastflowtransform/config/models.py b/src/fastflowtransform/config/models.py index f71ab6c..cdd09b0 100644 --- a/src/fastflowtransform/config/models.py +++ b/src/fastflowtransform/config/models.py @@ -178,6 +178,11 @@ class ModelConfig(BaseModel): # Engine restriction, e.g. engines=["duckdb", "postgres"] engines: list[str] = Field(default_factory=list) + # --- Per-model hooks (pre/post) ---------------------------------------- + + pre_hook: list[str] = Field(default_factory=list) + post_hook: list[str] = Field(default_factory=list) + # --- Storage override (per model) -------------------------------------- storage: StorageConfig | None = None @@ -227,6 +232,24 @@ class ModelConfig(BaseModel): # Normalisation helpers # ---------------------------------------------------------------------- + @field_validator("pre_hook", "post_hook", mode="before") + @classmethod + def _normalize_hooks(cls, v: Any) -> list[str]: + """ + Allow: + - string: "delete from {{ this }}" → ["delete from {{ this }}"] + - sequence: ["stmt1", "stmt2"] + - null: [] + """ + if v is None: + return [] + if isinstance(v, str): + text = v.strip() + return [text] if text else [] + if isinstance(v, Sequence) and not isinstance(v, (str, bytes)): + return [str(x).strip() for x in v if str(x).strip()] + raise TypeError("pre_hook/post_hook must be a string or a sequence of strings") + @field_validator("tags", "engines", mode="before") @classmethod def _normalize_tags_engines(cls, v: Any) -> list[str]: diff --git a/src/fastflowtransform/config/project.py b/src/fastflowtransform/config/project.py index de9b20b..ca7a71f 100644 --- 
a/src/fastflowtransform/config/project.py +++ b/src/fastflowtransform/config/project.py @@ -1,7 +1,7 @@ # fastflowtransform/config/project.py from __future__ import annotations -from collections.abc import Sequence +from collections.abc import Mapping, Sequence from pathlib import Path from typing import Any, Literal @@ -147,6 +147,61 @@ class DocsConfig(BaseModel): dag_dir: str | None = None +# --------------------------------------------------------------------------- +# hooks: block from project.yml +# --------------------------------------------------------------------------- + + +class HookSpec(BaseModel): + """ + One hook entry from project.yml -> hooks.* lists. + Example: + - name: audit_run_start + kind: sql + sql: "insert into ..." + + - name: python_banner + kind: python + callable: "hooks_demo.hooks.notify:on_run_start" + select: "tag:example:hooks_demo" + """ + + model_config = ConfigDict(extra="forbid") + + name: str | None = None + kind: Literal["sql", "python"] + + # SQL hook body (for kind == "sql") + sql: str | None = None + + # Python callable path (for kind == "python"), "pkg.mod:func" or "pkg.mod.func" + callable: str | None = None + + # Optional selection filter (for before_model / after_model) + select: str | None = None + + # Optional free-form params if you want them later + params: Mapping[str, Any] | None = None + + engines: list[str] | None = None # e.g. ["duckdb", "databricks_spark"] + envs: list[str] | None = None # e.g. ["dev_duckdb", "prod_duckdb"] + + +class HooksConfig(BaseModel): + """ + Top-level hooks section in project.yml. 
+ """ + + model_config = ConfigDict(extra="forbid") + + on_run_start: list[HookSpec] = Field(default_factory=list) + on_run_end: list[HookSpec] = Field(default_factory=list) + + # Per-model hooks are optional but allowed + before_model: list[HookSpec] = Field(default_factory=list) + after_model: list[HookSpec] = Field(default_factory=list) + + # --------------------------------------------------------------------------- # Top-level tests from project.yml (in addition to schema tests) # --------------------------------------------------------------------------- @@ -457,21 +512,6 @@ class CustomProjectTestConfig(BaseProjectTestConfig): column: str | None = None -# ProjectTestConfig = Annotated[ -# NotNullTestConfig -# | UniqueTestConfig -# | AcceptedValuesTestConfig -# | GreaterEqualTestConfig -# | NonNegativeSumTestConfig -# | RowCountBetweenTestConfig -# | FreshnessTestConfig -# | ReconcileEqualTestConfig -# | ReconcileRatioWithinTestConfig -# | ReconcileDiffWithinTestConfig -# | ReconcileCoverageTestConfig, -# Field(discriminator="type"), -# ] - ProjectTestConfig = ( NotNullTestConfig | UniqueTestConfig @@ -536,6 +576,8 @@ class ProjectConfig(BaseModel): docs: DocsConfig | None = None + hooks: HooksConfig | None = None + # --------------------------------------------------------------------------- # Helper: load & validate project.yml diff --git a/src/fastflowtransform/core.py b/src/fastflowtransform/core.py index b311d44..19be198 100644 --- a/src/fastflowtransform/core.py +++ b/src/fastflowtransform/core.py @@ -4,6 +4,7 @@ import ast import importlib.util import inspect +import json import os import re import types @@ -15,11 +16,12 @@ import jinja2.runtime from jinja2 import Environment, FileSystemLoader, StrictUndefined +from jinja2.runtime import Undefined as JinjaUndefined from pydantic import ValidationError from fastflowtransform import storage from fastflowtransform.config.models import validate_model_meta_strict -from fastflowtransform.config.project 
import parse_project_yaml_config +from fastflowtransform.config.project import HookSpec, parse_project_yaml_config from fastflowtransform.config.sources import load_sources_config from fastflowtransform.errors import ( DependencyNotFoundError, @@ -91,6 +93,38 @@ def _validate_py_model_signature(func: Callable, deps: list[str], *, path: Path, ) +def sql_literal(value: Any) -> str: + """ + Convert a Python value into a SQL literal string. + + - None -> "NULL" + - bool -> "TRUE"/"FALSE" + - int/float -> "123" (no quotes) + - str -> quoted with single quotes and escaped + - other -> JSON-dumped and treated as a string literal + """ + if value is None or isinstance(value, JinjaUndefined): + return "NULL" + + if isinstance(value, bool): + return "TRUE" if value else "FALSE" + + if isinstance(value, (int, float)): + return str(value) + + if isinstance(value, str): + # Simple quote-escape for single quotes + escaped = value.replace("'", "''") + return f"'{escaped}'" + + # Fallback: JSON (or str) and quote it + try: + json_text = json.dumps(value, separators=(",", ":"), sort_keys=True) + except TypeError: + json_text = str(value) + return "'" + json_text.replace("'", "''") + "'" + + @dataclass class Node: name: str @@ -114,6 +148,12 @@ def __init__(self): self.active_engine: str | None = None self.incremental_models: dict[str, dict[str, Any]] = {} + # global hooks from project.yml + self.on_run_start_hooks: list[HookSpec] = [] + self.on_run_end_hooks: list[HookSpec] = [] + self.before_model_hooks: list[HookSpec] = [] + self.after_model_hooks: list[HookSpec] = [] + def get_project_dir(self) -> Path: """Return the project directory after load_project(), or raise if not set.""" if self.project_dir is None: @@ -274,10 +314,17 @@ def _reset_registry_state(self) -> None: self.cli_vars = {} self.macros.clear() self.incremental_models = {} + # reset storage maps storage.set_model_storage({}) storage.set_seed_storage({}) + # reset hooks + self.on_run_start_hooks = [] + 
self.on_run_end_hooks = [] + self.before_model_hooks = [] + self.after_model_hooks = [] + def _init_jinja_env(self, models_dir: Path) -> None: """Initialize the Jinja environment for this project.""" self.env = Environment( @@ -315,6 +362,9 @@ def _env(name: str, default: Any | None = None) -> Any: self.env.filters["var"] = _var self.env.filters["env"] = _env + # SQL literal helper for models *and* hooks + self.env.filters["sql_literal"] = sql_literal + def _load_sources_yaml(self, project_dir: Path) -> None: """Load sources.yml (version 2) if present.""" src_path = project_dir / "sources.yml" @@ -366,6 +416,19 @@ def _load_project_yaml(self, project_dir: Path) -> None: storage.normalize_storage_map(seed_storage_raw, project_dir=project_dir) ) + # Global hooks (project.yml → hooks.on_run_start / hooks.on_run_end) + hooks_cfg = getattr(proj_cfg, "hooks", None) + if hooks_cfg: + self.on_run_start_hooks = list(hooks_cfg.on_run_start or []) + self.on_run_end_hooks = list(hooks_cfg.on_run_end or []) + self.before_model_hooks = list(hooks_cfg.before_model or []) + self.after_model_hooks = list(hooks_cfg.after_model or []) + else: + self.on_run_start_hooks = [] + self.on_run_end_hooks = [] + self.before_model_hooks = [] + self.after_model_hooks = [] + def _discover_sql_models(self, models_dir: Path) -> None: """Scan *.ff.sql files, parse config, validate meta, and register nodes.""" for path in models_dir.rglob("*.ff.sql"): diff --git a/src/fastflowtransform/executors/base.py b/src/fastflowtransform/executors/base.py index 9eccb77..382c31d 100644 --- a/src/fastflowtransform/executors/base.py +++ b/src/fastflowtransform/executors/base.py @@ -839,6 +839,14 @@ def _create_or_replace_view_from_table( """ ... + # ---------- SQL hook contracts ---------- + + def execute_hook_sql(self, sql: str) -> None: + """ + Execute a SQL hook block (pre-/post-run, on-run-start, on-run-end, etc.). 
+ """ + raise NotImplementedError(f"SQL hooks are not implemented for engine '{self.engine_name}'.") + # ---------- SQL hook contracts ---------- @abstractmethod def _format_relation_for_ref(self, name: str) -> str: diff --git a/src/fastflowtransform/executors/bigquery/base.py b/src/fastflowtransform/executors/bigquery/base.py index b7ceb8d..281aefc 100644 --- a/src/fastflowtransform/executors/bigquery/base.py +++ b/src/fastflowtransform/executors/bigquery/base.py @@ -70,7 +70,17 @@ def _execute_sql(self, sql: str) -> _TrackedQueryJob: - Returns the QueryJob so callers can call .result(). """ self._apply_budget_guard(self._BUDGET_GUARD, sql) - job = self.client.query(sql, location=self.location) + # job = self.client.query(sql, location=self.location) + job_config = bigquery.QueryJobConfig() + if self.dataset: + # Let unqualified tables resolve to project.dataset.table + job_config.default_dataset = bigquery.DatasetReference(self.project, self.dataset) + + job = self.client.query( + sql, + job_config=job_config, + location=self.location, + ) return _TrackedQueryJob(job, on_complete=self._record_query_job_stats) # --- Cost estimation for the shared BudgetGuard ----------------- @@ -85,6 +95,10 @@ def _estimate_query_bytes(self, sql: str) -> int | None: dry_run=True, use_query_cache=False, ) + if self.dataset: + # Let unqualified tables resolve to project.dataset.table + cfg.default_dataset = bigquery.DatasetReference(self.project, self.dataset) + job = self.client.query( sql, job_config=cfg, @@ -522,3 +536,9 @@ def snapshot_prune( ) """ self._execute_sql(delete_sql).result() + + def execute_hook_sql(self, sql: str) -> None: + """ + Execute one SQL statement for pre/post/on_run hooks. 
+ """ + self._execute_sql(sql).result() diff --git a/src/fastflowtransform/executors/databricks_spark.py b/src/fastflowtransform/executors/databricks_spark.py index 5a297de..7a3bb34 100644 --- a/src/fastflowtransform/executors/databricks_spark.py +++ b/src/fastflowtransform/executors/databricks_spark.py @@ -439,6 +439,7 @@ def _execute_sql(self, sql: str) -> SDF: estimated_bytes = self._spark_plan_bytes(sql) t0 = perf_counter() df = self.spark.sql(sql) + dt_ms = int((perf_counter() - t0) * 1000) # Best-effort logical estimate @@ -1242,6 +1243,20 @@ def _materialize(df: SDF) -> SDF: with suppress(Exception): handle.unpersist() + def execute_hook_sql(self, sql: str) -> None: + """ + Entry point for hook SQL. + + Accepts a string that may contain multiple ';'-separated statements. + `_RunEngine._execute_hook_sql` has already normalized away semicolons + in full-line comments, so naive splitting by ';' is acceptable here. + """ + for stmt in (part.strip() for part in sql.split(";")): + if not stmt: + continue + # Reuse your existing single-statement executor + self._execute_sql(stmt) + # ────────────────────────── local helpers / shim ────────────────────────── class _SparkResult: diff --git a/src/fastflowtransform/executors/duckdb.py b/src/fastflowtransform/executors/duckdb.py index a84397b..e872583 100644 --- a/src/fastflowtransform/executors/duckdb.py +++ b/src/fastflowtransform/executors/duckdb.py @@ -424,7 +424,6 @@ def _exec_many(self, sql: str) -> None: Execute multiple SQL statements separated by ';' on the same connection. DuckDB normally accepts one statement per execute(), so we split here. """ - # very simple splitter - good enough for what we emit in the executor for stmt in (part.strip() for part in sql.split(";")): if not stmt: continue @@ -845,3 +844,11 @@ def snapshot_prune( and t.{vf} = r.{vf}; """ self._execute_sql(delete_sql) + + def execute_hook_sql(self, sql: str) -> None: + """ + Execute one or multiple SQL statements for pre/post/on_run hooks. 
+ + Accepts a string that may contain ';'-separated statements. + """ + self._exec_many(sql) diff --git a/src/fastflowtransform/executors/postgres.py b/src/fastflowtransform/executors/postgres.py index 2800d80..b8bef0a 100644 --- a/src/fastflowtransform/executors/postgres.py +++ b/src/fastflowtransform/executors/postgres.py @@ -689,3 +689,11 @@ def snapshot_prune( """ self._execute_sql(delete_sql) self._analyze_relations([relation]) + + def execute_hook_sql(self, sql: str) -> None: + """ + Execute one or multiple SQL statements for pre/post/on_run hooks. + + Accepts a string that may contain ';'-separated statements. + """ + self._execute_sql(sql) diff --git a/src/fastflowtransform/executors/snowflake_snowpark.py b/src/fastflowtransform/executors/snowflake_snowpark.py index d166bf8..1c60794 100644 --- a/src/fastflowtransform/executors/snowflake_snowpark.py +++ b/src/fastflowtransform/executors/snowflake_snowpark.py @@ -142,6 +142,16 @@ def _execute_sql(self, sql: str) -> SNDF: ) return df + def _exec_many(self, sql: str) -> None: + """ + Execute multiple SQL statements separated by ';' on the same connection. + Snowflake normally accepts one statement per execute(), so we split here. + """ + for stmt in (part.strip() for part in sql.split(";")): + if not stmt: + continue + self._execute_sql(stmt).collect() + # ---------- Helpers ---------- def _q(self, s: str) -> str: return '"' + s.replace('"', '""') + '"' @@ -701,6 +711,12 @@ def snapshot_prune( """ self._execute_sql(delete_sql).collect() + def execute_hook_sql(self, sql: str) -> None: + """ + Execute one SQL statement for pre/post/on_run hooks. 
+ """ + self._exec_many(sql) + # ────────────────────────── local testing shim ─────────────────────────── class _SFCursorShim: diff --git a/src/fastflowtransform/hooks/registry.py b/src/fastflowtransform/hooks/registry.py new file mode 100644 index 0000000..743c090 --- /dev/null +++ b/src/fastflowtransform/hooks/registry.py @@ -0,0 +1,111 @@ +# fastflowtransform/hooks/registry.py + +from __future__ import annotations + +import importlib.util +import sys +from collections.abc import Callable +from pathlib import Path +from typing import Any + +from fastflowtransform.hooks.types import HookContext + +# Registry structure: +# { (when, name) -> callable } +# where `when` can be a specific phase ("on_run_start") or "*" (wildcard) +_HOOKS: dict[tuple[str, str], Callable[[HookContext], Any]] = {} + + +def fft_hook(name: str | None = None, when: str | None = None) -> Callable: + """ + Decorator to register a Python hook. + + Usage: + + from fastflowtransform.hooks.registry import fft_hook + + @fft_hook(name="python_banner") # no 'when' -> wildcard + def on_run_start(ctx: dict[str, Any]): + ... + + @fft_hook(name="python_banner", when="on_run_start") + def banner_for_run_start(ctx: dict[str, Any]): + ... + + - `name`: logical hook name (matches project.yml `hooks: ... name:`). + If omitted, defaults to the function name. + - `when`: lifecycle event ("on_run_start", "on_run_end", + "before_model", "after_model", etc.). + If omitted, the hook is registered for the wildcard phase "*". + """ + + def decorator(fn: Callable[[HookContext], Any]) -> Callable[[HookContext], Any]: + hook_name = name or fn.__name__ + phase = when or "*" # wildcard by default + + key = (phase, hook_name) + if key in _HOOKS: + raise ValueError(f"Hook already registered for {key!r}") + + _HOOKS[key] = fn + return fn + + return decorator + + +def resolve_hook(when: str, name: str) -> Callable[[HookContext], Any]: + """ + Retrieve a previously-registered hook function. + + Resolution order: + 1. 
Exact match: (when, name) + 2. Wildcard match: ('*', name) + + Raises KeyError if not found. + """ + key = (when, name) + if key in _HOOKS: + return _HOOKS[key] + + wildcard_key = ("*", name) + if wildcard_key in _HOOKS: + return _HOOKS[wildcard_key] + + raise KeyError(f"No hook registered for when={when!r}, name={name!r}") + + +def load_project_hooks(project_dir: str | Path) -> None: + """ + Load all Python files under `/hooks/**.py`. + + This executes the modules (without requiring them to be proper + Python packages), so any `@fft_hook(...)` calls will populate the + registry. + + This is intentionally import-path agnostic: we don't require + `project_dir` to be on sys.path and we don't care about the + module name outside of this function. + """ + base = Path(project_dir) + hooks_dir = base / "hooks" + + if not hooks_dir.is_dir(): + return + + for path in hooks_dir.rglob("*.py"): + # Build a synthetic, flat module name so we don't rely on package structure + rel = path.relative_to(base) + stem_parts = rel.with_suffix("").parts # e.g. ("hooks", "notify") + module_name = "_fft_project_hooks_" + "_".join(stem_parts) + + # Skip if already loaded (avoid double-execution) + if module_name in sys.modules: + continue + + spec = importlib.util.spec_from_file_location(module_name, path) + if not spec or not spec.loader: + continue + + module = importlib.util.module_from_spec(spec) + sys.modules[module_name] = module + spec.loader.exec_module(module) diff --git a/src/fastflowtransform/hooks/types.py b/src/fastflowtransform/hooks/types.py new file mode 100644 index 0000000..37c9ae7 --- /dev/null +++ b/src/fastflowtransform/hooks/types.py @@ -0,0 +1,84 @@ +# fastflowtransform/hooks/types.py + +from __future__ import annotations + +from typing import Any, NotRequired, TypedDict + + +class RunContext(TypedDict, total=False): + """ + Information about the entire fft run. 
+ """ + + run_id: str + env_name: str + engine_name: str + started_at: str # ISO timestamp + status: str | None # 'success', 'error', ... + row_count: int | None + error: str | None # error message if any + + +class ModelContext(TypedDict, total=False): + """ + Information about a specific model execution. + """ + + name: str # model name + path: str # path to the model file + tags: list[str] # normalised list of tags + meta: dict[str, Any] # raw meta from the model + + status: str | None # 'success', 'error', ... + rows_affected: int | None + elapsed_ms: float | None # execution time if available + error: str | None + + +class RunStatsContext(TypedDict, total=False): + """ + Aggregate summary for the run (optional, usually on_run_end). + """ + + models_built: int + models_skipped: int + models_failed: int + run_status: str + rows_total: int + elapsed_ms_total: int + + +class ModelStatsContext(TypedDict): + """ + Per-model stats as reported by the executor (if available). + """ + + rows: int + bytes_scanned: int + query_duration_ms: int + + +class HookContext(TypedDict, total=False): + """ + Context passed to all Python hooks. + + Keys are the same for all `when` phases, but some may be absent + depending on what is happening (e.g., `model` is None for run-level + hooks). + """ + + when: str # 'on_run_start', 'on_run_end', 'before_model', 'after_model', ... + + run: RunContext + model: ModelContext | None + + # env vars (usually FF_* / FFT_* etc.). You decide what to put in. 
+ env: dict[str, str] + + # Optional aggregate stats for on_run_end + run_stats: NotRequired[RunStatsContext] + + # Optional per-model stats for before_model/after_model hooks + model_stats: NotRequired[ModelStatsContext] + + params: NotRequired[dict[str, Any]] diff --git a/src/fastflowtransform/logging.py b/src/fastflowtransform/logging.py index 4bfb8c2..f5892a5 100644 --- a/src/fastflowtransform/logging.py +++ b/src/fastflowtransform/logging.py @@ -80,8 +80,11 @@ def bind_context( engine: str | None = None, env: str | None = None, node: str | None = None, + invocation_id: str | None = None, ) -> None: """Bind fields that get injected into every record.""" + if invocation_id is not None: + run_id = invocation_id if run_id is not None: _run_id.set(run_id) if engine is not None: @@ -107,6 +110,7 @@ def bound_context( engine: str | None = None, env: str | None = None, node: str | None = None, + invocation_id: str | None = None, ) -> Generator[None, None, None]: """ Temporarily bind (or override) selected fields. 
@@ -114,7 +118,13 @@ def bound_context( """ prev = (_run_id.get(), _engine.get(), _env.get(), _node.get()) try: - bind_context(run_id=run_id, engine=engine, env=env, node=node) + bind_context( + run_id=run_id, + engine=engine, + env=env, + node=node, + invocation_id=invocation_id, + ) yield finally: # restore previous values; keep run_id/engine/env stable if you want by not overriding diff --git a/src/fastflowtransform/utest.py b/src/fastflowtransform/utest.py index 5e46c2f..09051fe 100644 --- a/src/fastflowtransform/utest.py +++ b/src/fastflowtransform/utest.py @@ -145,15 +145,13 @@ def _read_result(executor: Any, rel: str) -> pd.DataFrame: def _project_root_for_spec(spec: UnitSpec) -> Path: - # bevorzugt Registry if getattr(REGISTRY, "project_dir", None): return Path(REGISTRY.get_project_dir()).resolve() - # heuristisch: nach oben laufen, bis 'models/' existiert p = spec.path.resolve() for parent in [p.parent, *list(p.parents)]: if (parent / "models").is_dir(): return parent - return spec.path.parent # letzter Fallback + return spec.path.parent # ---------- Cache and Fingerprint Helpers ---------- diff --git a/tests/common/mock/bigquery.py b/tests/common/mock/bigquery.py index d4abadf..4ed702a 100644 --- a/tests/common/mock/bigquery.py +++ b/tests/common/mock/bigquery.py @@ -71,6 +71,14 @@ def schema(self): class FakeQueryJobConfig: def __init__(self, **kwargs: Any): self.kwargs = kwargs + # allow executors to set job_config.default_dataset = DatasetReference(...) 
+ self.default_dataset = None + + +class FakeDatasetReference: + def __init__(self, project: str, dataset_id: str): + self.project = project + self.dataset_id = dataset_id class FakeScalarQueryParameter: @@ -246,6 +254,7 @@ def make_fake_bigquery_module() -> types.ModuleType: mod.QueryJobConfig = FakeQueryJobConfig # type: ignore[attr-defined] mod.ScalarQueryParameter = FakeScalarQueryParameter # type: ignore[attr-defined] mod.Dataset = FakeDataset # type: ignore[attr-defined] + mod.DatasetReference = FakeDatasetReference # type: ignore[attr-defined] mod.BadRequest = FakeBadRequest # type: ignore[attr-defined] mod.NotFound = FakeNotFound # type: ignore[attr-defined] mod.WriteDisposition = FakeWriteDisposition # type: ignore[attr-defined] diff --git a/tests/integration/examples/config.py b/tests/integration/examples/config.py index d4c3ba4..c272dd0 100644 --- a/tests/integration/examples/config.py +++ b/tests/integration/examples/config.py @@ -99,6 +99,22 @@ class ExampleConfig: "bigframes": "dev_bigquery_bigframes", }, ), + ExampleConfig( + name="hooks_demo", + path=ROOT / "examples" / "hooks_demo", + make_target="demo", + env_by_engine={ + "duckdb": "dev_duckdb", + "postgres": "dev_postgres", + "databricks_spark": "dev_databricks", + "bigquery": "dev_bigquery_pandas", + "snowflake_snowpark": "dev_snowflake", + }, + bigquery_env_by_backend={ + "pandas": "dev_bigquery_pandas", + "bigframes": "dev_bigquery_bigframes", + }, + ), ExampleConfig( name="incremental_demo", path=ROOT / "examples" / "incremental_demo", diff --git a/tests/unit/executors/test_bigquery_bf_exec_unit.py b/tests/unit/executors/test_bigquery_bf_exec_unit.py index 65c189f..cb0921f 100644 --- a/tests/unit/executors/test_bigquery_bf_exec_unit.py +++ b/tests/unit/executors/test_bigquery_bf_exec_unit.py @@ -49,6 +49,22 @@ def read_gbq(self, table_id: str) -> Any: def bq_exec(monkeypatch): _ = install_fake_bigquery(monkeypatch, [bq_exec_mod, bq_mix_mod, bq_base_mod]) + # Test-only shim: ensure the fake 
bigquery module has DatasetReference, + # which BigQueryBaseExecutor._execute_sql now relies on. + if not hasattr(bq_base_mod.bigquery, "DatasetReference"): + + class _DummyDatasetReference: + def __init__(self, project: str, dataset_id: str): + self.project = project + self.dataset_id = dataset_id + + monkeypatch.setattr( + bq_base_mod.bigquery, + "DatasetReference", + _DummyDatasetReference, + raising=False, + ) + fake_bigframes = types.ModuleType("bigframes") fake_conf = types.ModuleType("bigframes._config") fake_conf_bq = types.ModuleType("bigframes._config.bigquery_options") diff --git a/uv.lock b/uv.lock index 25af95b..f431107 100644 --- a/uv.lock +++ b/uv.lock @@ -733,7 +733,7 @@ wheels = [ [[package]] name = "fastflowtransform" -version = "0.6.8" +version = "0.6.9" source = { editable = "." } dependencies = [ { name = "duckdb" }, From 49bf3648b03c80e9ab1950473b04a3620f87fdd7 Mon Sep 17 00:00:00 2001 From: Marko Lekic Date: Fri, 28 Nov 2025 16:16:40 +0100 Subject: [PATCH 2/3] Fixed utest + added utest to basic demo --- docs/Unit_Tests.md | 34 +- docs/examples/Basic_Demo.md | 34 +- examples/basic_demo/Makefile | 4 + examples/basic_demo/profiles.yml | 34 ++ .../tests/unit/mart_latest_signup.yml | 40 ++ .../tests/unit/mart_users_by_domain.yml | 25 + .../basic_demo/tests/unit/users_clean.yml | 25 + .../building_locally_demo/.env.prod_bigquery | 2 + .../models/marts/mart_latest_signup.ff.py | 40 ++ .../building_locally_demo/profiles.yml | 20 +- .../tests/unit/stg_users.yml | 10 +- src/fastflowtransform/.env | 13 - src/fastflowtransform/cli/bootstrap.py | 30 +- src/fastflowtransform/cli/utest_cmd.py | 2 +- src/fastflowtransform/executors/base.py | 35 ++ .../executors/bigquery/bigframes.py | 66 ++- .../executors/bigquery/pandas.py | 53 ++ .../executors/databricks_spark.py | 51 ++ src/fastflowtransform/executors/duckdb.py | 40 ++ src/fastflowtransform/executors/postgres.py | 172 ++++++- .../executors/snowflake_snowpark.py | 84 ++++ 
src/fastflowtransform/hooks/__init__.py | 0 src/fastflowtransform/seeding.py | 1 + src/fastflowtransform/settings.py | 183 +++++++ src/fastflowtransform/utest.py | 467 ++++++++++++------ tests/common/fixtures.py | 35 ++ .../test_utest_cache_flag_integration.py | 25 +- tests/unit/test_utest_unit.py | 134 ++--- 28 files changed, 1385 insertions(+), 274 deletions(-) create mode 100644 examples/basic_demo/tests/unit/mart_latest_signup.yml create mode 100644 examples/basic_demo/tests/unit/mart_users_by_domain.yml create mode 100644 examples/basic_demo/tests/unit/users_clean.yml create mode 100644 examples_article/building_locally_demo/.env.prod_bigquery create mode 100644 examples_article/building_locally_demo/models/marts/mart_latest_signup.ff.py delete mode 100644 src/fastflowtransform/.env create mode 100644 src/fastflowtransform/hooks/__init__.py diff --git a/docs/Unit_Tests.md b/docs/Unit_Tests.md index 79e89e2..b3bb018 100644 --- a/docs/Unit_Tests.md +++ b/docs/Unit_Tests.md @@ -121,6 +121,18 @@ cases: - {order_id: 11, user_id: 1, email: "x@gmail.com", is_gmail: true, amount: -1.0, valid_amt: false} ``` +> **Validation** +> +> Unit-test specs are validated with Pydantic: +> +> - Unknown / misspelled keys (e.g. `rowz` instead of `rows`) cause a clear validation error. +> - Allowed top-level keys: `model`, `engine`, `defaults`, `cases`. +> - Under `defaults` / `cases.inputs.*` you may only use `rows` **or** `csv`. +> - Under `expect` you may only use: `relation`, `rows`, `order_by`, `any_order`, +> `approx`, `ignore_columns`, `subset`. +> +> This is intentional so that typos in YAML don’t silently get ignored. + ## Input Formats - `rows`: inline dictionaries per row. @@ -130,11 +142,23 @@ Keys under `inputs` are physical relations; use `relation_for('users.ff')` if un ## Expected Output & Comparison -- `relation`: actual table/view name produced by the model (defaults to `relation_for(model)`). -- Ordering: `order_by: [...]` or `any_order: true`. 
-- Columns: `ignore_columns: [...]`, `subset: true`. -- Numeric tolerance: `approx: true` or `approx: { col: 1e-9, other_col: 0.01 }` - (numbers can be plain `1e-9` or quoted; they are cast to float). +- `relation`: actual table/view name produced by the model + (defaults to `relation_for(model)`). +- **Row ordering:** + - By default, comparisons are **row-order insensitive**. The framework sorts + both expected and actual by all columns before comparing. + - `order_by: [...]` lets you specify a deterministic sort order for debugging + (both sides are sorted by those columns before comparison). + - `any_order: true` is accepted for backwards compatibility but is effectively + the default behaviour now. +- Columns: + - `ignore_columns: [...]` drops those columns from both expected and actual. + - `subset: true` means “every expected row must be present in the actual + result”, but the actual result may contain additional rows/columns. +- Numeric tolerance (`approx`): + - `approx: { col: 1e-9, other_col: 0.01 }` compares numeric columns within + the given absolute tolerance. + - Non-numeric values in `approx` cause a clear error (`must be a number`). ## Running UTests diff --git a/docs/examples/Basic_Demo.md b/docs/examples/Basic_Demo.md index 7ba73e2..301ff3e 100644 --- a/docs/examples/Basic_Demo.md +++ b/docs/examples/Basic_Demo.md @@ -19,9 +19,10 @@ Use it as a sandbox before adding your own sources, macros, or Python models. | `models/staging/users_clean.ff.sql` | Normalizes emails, casts types, and tags the model for all engines. | | `models/marts/mart_users_by_domain.ff.sql` | Aggregates users per email domain and records the first/last signup dates. | | `models/engines/*/mart_latest_signup.ff.py` | Engine-specific Python models selecting the most recent signup per domain from the staging view:
• pandas for DuckDB/Postgres
• PySpark for Databricks
• BigQuery DataFrames (BigFrames) for BigQuery. | +| `tests/unit/*.yml` | Model unit-test specs for the demo models (`users_clean`, `mart_users_by_domain`, `mart_latest_signup`), used by `fft utest` and `make utest ENGINE=…`. | | `profiles.yml` | Declares `dev_duckdb`, `dev_postgres`, `dev_databricks`, and `dev_bigquery` profiles driven by environment variables. | | `.env.dev_*` | Template environment files you can `source` per engine (`.env.dev_duckdb`, `.env.dev_postgres`, `.env.dev_databricks`, `.env.dev_bigquery`). | -| `Makefile` | One command (`make demo ENGINE=…`) to seed, run, document, test, and preview results. | +| `Makefile` | One command (`make demo ENGINE=…`) to seed, run, unit-test, document, test, and preview results. | ## Running the demo @@ -58,7 +59,7 @@ Use it as a sandbox before adding your own sources, macros, or Python models. # make demo ENGINE=bigquery BQ_FRAME=pandas ``` - The Makefile runs `fft seed`, `fft run`, `fft dag`, and `fft test`. + The Makefile runs `fft seed`, `fft run`, `fft dag`, `fft utest`, and `fft test`. To open the rendered DAG site after a run: @@ -86,3 +87,32 @@ The demo enables baseline data quality checks in `project.yml`. Running `fft tes * Each email domain appears only once in `mart_latest_signup`. These tests run against whatever engine/profile is active — including BigQuery, where they execute as standard SQL queries on the configured dataset. + +## Model unit tests (`fft utest`) + +The basic demo also includes **model-level unit tests** under `tests/unit/`. They exercise: + +- `users_clean` (staging) +- `mart_users_by_domain` (mart) +- the engine-specific `mart_latest_signup` Python model + +Each YAML spec defines small input fixtures (inline `rows` or external CSVs) and the expected +output rows. To run the unit tests for the active engine: + +```bash +make utest ENGINE=duckdb +# or, equivalent: +fft utest . 
--env dev_duckdb +``` + +You can swap engines the same way as for the main demo: + +```bash +make utest ENGINE=postgres +make utest ENGINE=databricks_spark +make utest ENGINE=bigquery BQ_FRAME=bigframes +``` + +`fft utest` only builds the target model for each spec and compares the result to the expected +rows, which makes these tests fast and self-contained while still running against the real +warehouse/engine. diff --git a/examples/basic_demo/Makefile b/examples/basic_demo/Makefile index 558bc20..27265f4 100644 --- a/examples/basic_demo/Makefile +++ b/examples/basic_demo/Makefile @@ -91,6 +91,9 @@ run: test: env $(BASE_ENV) $(UV) run fft test "$(PROJECT)" --env $(PROFILE_ENV) $(SELECT_FLAGS) +utest: + env $(BASE_ENV) $(UV) run fft utest "$(PROJECT)" --env $(PROFILE_ENV) + dag: env $(RUN_ENV) $(UV) run fft dag "$(PROJECT)" --env $(PROFILE_ENV) $(SELECT_FLAGS) --html @@ -117,5 +120,6 @@ demo: clean +$(MAKE) run ENGINE=$(ENGINE) +$(MAKE) dag ENGINE=$(ENGINE) +$(MAKE) test ENGINE=$(ENGINE) + +$(MAKE) utest ENGINE=$(ENGINE) +$(MAKE) artifacts @echo "✅ Demo complete." 
diff --git a/examples/basic_demo/profiles.yml b/examples/basic_demo/profiles.yml index 99d2644..a24fa9b 100644 --- a/examples/basic_demo/profiles.yml +++ b/examples/basic_demo/profiles.yml @@ -6,12 +6,24 @@ dev_duckdb: duckdb: path: "{{ env('FF_DUCKDB_PATH', '.local/basic_demo.duckdb') }}" +dev_duckdb_utest: + engine: duckdb # must match + duckdb: + # completely separate DB just for utests + path: "{{ env('FF_DUCKDB_UTEST_PATH', ':memory:') }}" + dev_postgres: engine: postgres postgres: dsn: "{{ env('FF_PG_DSN') }}" db_schema: "{{ env('FF_PG_SCHEMA', 'public') }}" +dev_postgres_utest: + engine: postgres # same engine + postgres: + # just override schema, keep same DSN + db_schema: "{{ env('FF_PG_UTEST_SCHEMA', 'basic_demo_utest') }}" + dev_databricks: engine: databricks_spark databricks_spark: @@ -26,6 +38,11 @@ dev_databricks: -Dderby.stream.error.file={{ project_dir() }}/.local/derby.log -Dderby.system.home={{ project_dir() }}/.local/derby_home +dev_databricks_utest: + engine: databricks_spark + databricks_spark: + database: "{{ env('FF_DBR_DATABASE', 'basic_demo_utest') }}" + dev_bigquery_bigframes: engine: bigquery bigquery: @@ -35,6 +52,12 @@ dev_bigquery_bigframes: use_bigframes: true allow_create_dataset: true +dev_bigquery_bigframes_utest: + engine: bigquery + bigquery: + # override only dataset + dataset: "{{ env('FF_BQ_UTEST_DATASET', 'basic_demo_utest') }}" + dev_bigquery_pandas: engine: bigquery bigquery: @@ -44,6 +67,12 @@ dev_bigquery_pandas: use_bigframes: false allow_create_dataset: true +dev_bigquery_pandas_utest: + engine: bigquery + bigquery: + # override only dataset + dataset: "{{ env('FF_BQ_UTEST_DATASET', 'basic_demo_utest') }}" + dev_snowflake: engine: snowflake_snowpark snowflake_snowpark: @@ -55,3 +84,8 @@ dev_snowflake: schema: "{{ env('FF_SF_SCHEMA', 'BASIC_DEMO') }}" role: "{{ env('FF_SF_ROLE', '') }}" allow_create_schema: true + +dev_snowflake_utest: + engine: snowflake_snowpark + snowflake_snowpark: + schema: "{{ 
env('FF_SF_UTEST_SCHEMA', 'BASIC_DEMO_UTEST') }}" diff --git a/examples/basic_demo/tests/unit/mart_latest_signup.yml b/examples/basic_demo/tests/unit/mart_latest_signup.yml new file mode 100644 index 0000000..456cdbf --- /dev/null +++ b/examples/basic_demo/tests/unit/mart_latest_signup.yml @@ -0,0 +1,40 @@ +model: mart_latest_signup + +defaults: + inputs: + users_clean: + rows: + - {user_id: 1, email: "anna@example.com", email_domain: "example.com", signup_date: "2024-01-05"} + - {user_id: 2, email: "bob@example.com", email_domain: "example.com", signup_date: "2024-02-11"} + - {user_id: 3, email: "cara@net.com", email_domain: "net.com", signup_date: "2024-02-27"} + - {user_id: 4, email: "dina@net.com", email_domain: "net.com", signup_date: "2024-03-01"} + expect: + relation: mart_latest_signup + order_by: [email_domain] + +cases: + - name: picks_latest_signup_per_domain + expect: + rows: + - {email_domain: "example.com", + latest_user_id: 2, + latest_email: "bob@example.com", + latest_signup_date: "2024-02-11"} + - {email_domain: "net.com", + latest_user_id: 4, + latest_email: "dina@net.com", + latest_signup_date: "2024-03-01"} + + - name: override_inputs_single_domain + inputs: + users_clean: + rows: + - {user_id: 10, email: "x@foo.com", email_domain: "foo.com", signup_date: "2024-01-01"} + - {user_id: 11, email: "y@foo.com", email_domain: "foo.com", signup_date: "2024-01-10"} + expect: + any_order: true + rows: + - {email_domain: "foo.com", + latest_user_id: 11, + latest_email: "y@foo.com", + latest_signup_date: "2024-01-10"} diff --git a/examples/basic_demo/tests/unit/mart_users_by_domain.yml b/examples/basic_demo/tests/unit/mart_users_by_domain.yml new file mode 100644 index 0000000..64c9c52 --- /dev/null +++ b/examples/basic_demo/tests/unit/mart_users_by_domain.yml @@ -0,0 +1,25 @@ +model: mart_users_by_domain.ff + +defaults: + inputs: + users_clean: + rows: + - {user_id: 1, email: "anna@example.com", email_domain: "example.com", signup_date: "2024-01-05"} + - 
{user_id: 2, email: "bob@example.com", email_domain: "example.com", signup_date: "2024-01-07"} + - {user_id: 3, email: "cara@net.com", email_domain: "net.com", signup_date: "2024-03-01"} + expect: + relation: mart_users_by_domain + order_by: [email_domain] + +cases: + - name: aggregates_counts_and_first_last_signup + expect: + rows: + - {email_domain: "example.com", + user_count: 2, + first_signup: "2024-01-05", + last_signup: "2024-01-07"} + - {email_domain: "net.com", + user_count: 1, + first_signup: "2024-03-01", + last_signup: "2024-03-01"} diff --git a/examples/basic_demo/tests/unit/users_clean.yml b/examples/basic_demo/tests/unit/users_clean.yml new file mode 100644 index 0000000..0028a16 --- /dev/null +++ b/examples/basic_demo/tests/unit/users_clean.yml @@ -0,0 +1,25 @@ +model: users_clean.ff + +defaults: + inputs: + seed_users: + rows: + - {id: 1, email: "ANNA@Example.COM", signup_date: "2024-01-05"} + - {id: 2, email: "bob@Example.net", signup_date: "2024-02-11"} + expect: + relation: users_clean + order_by: [user_id] + +cases: + - name: lowercases_email_and_extracts_domain + expect: + order_by: ["user_id"] + rows: + - {user_id: 1, + email: "anna@example.com", + email_domain: "example.com", + signup_date: "2024-01-05"} + - {user_id: 2, + email: "bob@example.net", + email_domain: "example.net", + signup_date: "2024-02-11"} diff --git a/examples_article/building_locally_demo/.env.prod_bigquery b/examples_article/building_locally_demo/.env.prod_bigquery new file mode 100644 index 0000000..d898032 --- /dev/null +++ b/examples_article/building_locally_demo/.env.prod_bigquery @@ -0,0 +1,2 @@ +# Path to service account JSON key (or rely on gcloud / workload identity) +GOOGLE_APPLICATION_CREDENTIALS=../secrets/fft-bigquery-demo-key.json diff --git a/examples_article/building_locally_demo/models/marts/mart_latest_signup.ff.py b/examples_article/building_locally_demo/models/marts/mart_latest_signup.ff.py new file mode 100644 index 0000000..588961b --- /dev/null +++ 
b/examples_article/building_locally_demo/models/marts/mart_latest_signup.ff.py @@ -0,0 +1,40 @@ +# models/marts/mart_latest_signup.ff.py +import pandas as pd +from fastflowtransform import engine_model + + +@engine_model( + # Register this model for both DuckDB (local) and BigQuery (pandas backend) + only=("duckdb", "bigquery"), + name="mart_latest_signup", + materialized="table", + deps=["stg_users.ff"], # SQL model from earlier in the article + tags=["scope:mart", "engine:duckdb", "engine:bigquery"], + requires={ + # Columns produced by stg_users.ff.sql: + # id, email, signup_date + "stg_users.ff": {"id", "email", "signup_date"}, + }, +) +def build(stg_users: pd.DataFrame) -> pd.DataFrame: + """Return the latest signup per email domain using pandas.""" + + # Derive an email_domain column in Python + users = stg_users.copy() + users["email_domain"] = users["email"].str.split("@").str[-1] + + latest = ( + users.sort_values("signup_date", ascending=False) + .drop_duplicates("email_domain") # keep the newest per domain + .loc[:, ["email_domain", "id", "email", "signup_date"]] + .rename( + columns={ + "id": "latest_user_id", + "email": "latest_email", + "signup_date": "latest_signup_date", + } + ) + .reset_index(drop=True) + ) + + return latest diff --git a/examples_article/building_locally_demo/profiles.yml b/examples_article/building_locally_demo/profiles.yml index 5ab051c..a93e07f 100644 --- a/examples_article/building_locally_demo/profiles.yml +++ b/examples_article/building_locally_demo/profiles.yml @@ -4,13 +4,25 @@ dev_duckdb: duckdb: path: ".local/dev.duckdb" +# My Local utest Overrides +dev_duckdb_utest: + engine: duckdb + duckdb: + path: ":memory:" + # My Production Environment prod_bigquery: engine: bigquery bigquery: - project: "basic_data" + project: "fft-basic-demo" dataset: "production_marts" location: "EU" - # FFT handles the client (BigFrames or Pandas) automatically - use_bigframes: true - allow_create_dataset: true \ No newline at end of file + # 
Use the pandas backend here; FFT can also use BigFrames if you set this to true. + use_bigframes: false + allow_create_dataset: true + +# My Production utest Overrides +prod_bigquery_utest: + engine: bigquery + bigquery: + dataset: "production_marts_utest" diff --git a/examples_article/building_locally_demo/tests/unit/stg_users.yml b/examples_article/building_locally_demo/tests/unit/stg_users.yml index c1ebcf9..bbcec1d 100644 --- a/examples_article/building_locally_demo/tests/unit/stg_users.yml +++ b/examples_article/building_locally_demo/tests/unit/stg_users.yml @@ -1,12 +1,12 @@ -model: stg_users +model: stg_users.ff cases: - name: lowercase_email inputs: - seed_users: - rows: - - {id: 1, email: "a@EXAMPLE.com", signup_date: "2023-01-01"} - - {id: 2, email: "B@gmail.com", signup_date: "2023-01-02"} + seed_users: + rows: + - {id: 1, email: "a@EXAMPLE.com", signup_date: "2023-01-01"} + - {id: 2, email: "B@gmail.com", signup_date: "2023-01-02"} expect: rows: - {id: 1, email: "a@example.com", signup_date: "2023-01-01"} diff --git a/src/fastflowtransform/.env b/src/fastflowtransform/.env deleted file mode 100644 index 5826dc9..0000000 --- a/src/fastflowtransform/.env +++ /dev/null @@ -1,13 +0,0 @@ -# # Engine-Gesamtoverride -# FF_ENGINE=postgres - -# # DuckDB -# FF_DUCKDB_PATH=.local/demo.duckdb - -# # Postgres -# FF_PG_DSN=postgresql+psycopg://postgres:postgres@localhost:5432/ffdb -# FF_PG_SCHEMA=public - -# # BigQuery -# FF_BQ_DATASET=my_dataset -# FF_BQ_LOCATION=EU \ No newline at end of file diff --git a/src/fastflowtransform/cli/bootstrap.py b/src/fastflowtransform/cli/bootstrap.py index 4fc4140..e7ee61d 100644 --- a/src/fastflowtransform/cli/bootstrap.py +++ b/src/fastflowtransform/cli/bootstrap.py @@ -24,6 +24,7 @@ EnvSettings, Profile, resolve_profile as _resolve_profile_impl, + resolve_utest_profile as _resolve_utest_profile_impl, ) @@ -242,12 +243,21 @@ def _prepare_context( env_name: str, engine: EngineType | None, vars_opt: list[str] | None, + utest: bool 
= False, ) -> CLIContext: proj = _resolve_project_path(project_arg) _load_dotenv_layered(proj, env_name) - env_settings, prof = _resolve_profile(env_name, engine, proj) - _validate_profile_params(env_name, prof) + env_settings, base_prof = _resolve_profile(env_name, engine, proj) + + if utest: + # Use the dedicated utest profile "_utest" + utest_prof = _resolve_utest_profile(env_name, proj, env_settings) + _validate_profile_params(f"{env_name}_utest", utest_prof) + prof = utest_prof + else: + _validate_profile_params(env_name, base_prof) + prof = base_prof engine_name = getattr(prof, "engine", None) REGISTRY.set_active_engine(engine_name) @@ -275,6 +285,20 @@ def _prepare_context( ) +def _resolve_utest_profile(env_name: str, proj: Path, env: EnvSettings) -> Profile: + """ + Resolve the utest profile "_utest" using the same EnvSettings. + """ + try: + # Note: settings.resolve_utest_profile(project_dir, base_env_name, env) + prof = _resolve_utest_profile_impl(proj, env_name, env) + except Exception as exc: + raise typer.BadParameter( + f"Failed to resolve unit-test profile '{env_name}_utest': {exc}" + ) from exc + return prof + + def _parse_cli_vars(pairs: list[str]) -> dict[str, object]: """ Parse --vars key=value pairs. 
Values are YAML-parsed for light typing: @@ -407,5 +431,5 @@ def _make_executor(prof: Profile, jenv: Environment) -> tuple[Any, Callable, Cal ex = SnowflakeSnowparkExecutor(cfg) return ex, (lambda n: ex.run_sql(n, jenv)), ex.run_python - _die(f"Unbekannter Engine-Typ: {getattr(prof, 'engine', None)}", code=1) + _die(f"Unknown engine type: {getattr(prof, 'engine', None)}", code=1) raise AssertionError("unreachable") diff --git a/src/fastflowtransform/cli/utest_cmd.py b/src/fastflowtransform/cli/utest_cmd.py index 97e8f43..99c77ff 100644 --- a/src/fastflowtransform/cli/utest_cmd.py +++ b/src/fastflowtransform/cli/utest_cmd.py @@ -30,7 +30,7 @@ def utest( cache: UTestCacheOpt = UTestCacheMode.OFF, reuse_meta: ReuseMetaOpt = False, ) -> None: - ctx = _prepare_context(project, env_name, engine, vars) + ctx = _prepare_context(project, env_name, engine, vars, utest=True) ex, _, _ = ctx.make_executor() specs = discover_unit_specs(ctx.project, path=path, only_model=model) diff --git a/src/fastflowtransform/executors/base.py b/src/fastflowtransform/executors/base.py index 382c31d..f8357b2 100644 --- a/src/fastflowtransform/executors/base.py +++ b/src/fastflowtransform/executors/base.py @@ -1080,6 +1080,41 @@ def _meta_is_snapshot(meta: Mapping[str, Any] | None) -> bool: materialized = str(meta.get("materialized") or "").lower() return materialized == "snapshot" + # ---------- Unit-test helpers (to be overridden by engines) ---------- + + def utest_load_relation_from_rows(self, relation: str, rows: list[dict]) -> None: + """ + Load test input rows into a physical relation for unit tests. + + Default: not implemented. Engines that support `fft utest` should override. + """ + raise NotImplementedError( + f"utest_load_relation_from_rows not implemented for engine '{self.engine_name}'." + ) + + def utest_read_relation(self, relation: str) -> _PDDataFrame: + """ + Read a physical relation into a pandas.DataFrame for unit-test assertions. + + Default: not implemented. 
Engines that support `fft utest` should override. + """ + raise NotImplementedError( + f"utest_read_relation not implemented for engine '{self.engine_name}'." + ) + + def utest_clean_target(self, relation: str) -> None: + """ + Best-effort cleanup hook before executing a unit-test model: + + - Drop tables/views with the target name so view<->table flips + cannot fail (DuckDB, Postgres, ...). + - This runs *only* in `fft utest`, and we already enforce that + utest profiles use isolated DBs/schemas. + + Default: no-op. + """ + return + ENGINE_NAME: str = "generic" @property diff --git a/src/fastflowtransform/executors/bigquery/bigframes.py b/src/fastflowtransform/executors/bigquery/bigframes.py index 6dfb549..c0fcc6a 100644 --- a/src/fastflowtransform/executors/bigquery/bigframes.py +++ b/src/fastflowtransform/executors/bigquery/bigframes.py @@ -2,21 +2,27 @@ from __future__ import annotations from collections.abc import Iterable +from contextlib import suppress from typing import Any +import pandas as pd + from fastflowtransform.core import Node from fastflowtransform.executors.bigquery.base import BigQueryBaseExecutor from fastflowtransform.typing import ( + BadRequest, BFDataFrame, BigQueryOptions, + LoadJobConfig, NotFound, bf_global_session, bigframes, + bigquery, ) class BigQueryBFExecutor(BigQueryBaseExecutor[BFDataFrame]): - ENGINE_NAME = "bigquery_batch" + ENGINE_NAME = "bigquery" def __init__( self, @@ -48,7 +54,8 @@ def __init__( except Exception as exc: raise RuntimeError( "Failed to initialize BigFrames session. Verify FF_BQ_PROJECT, " - "FF_BQ_DATASET, and FF_BQ_LOCATION are set for the active profile." + "FF_BQ_DATASET, and FF_BQ_LOCATION are set for the active profile. 
" + f"{exc}" ) from exc def run_python(self, node: Node) -> None: @@ -80,6 +87,7 @@ def _read_relation(self, relation: str, node: Node, deps: Iterable[str]) -> BFDa ) from e def _materialize_relation(self, relation: str, df: BFDataFrame, node: Node) -> None: + self._ensure_dataset() table_id = f"{self.project}.{self.dataset}.{relation}" to_gbq = getattr(df, "to_gbq", None) @@ -156,3 +164,57 @@ def _is_frame(self, obj: Any) -> bool: def _frame_name(self) -> str: return "BigQuery DataFrame (BigFrames)" + + # ---- Unit-test helpers (pandas-facing) -------------------------------- + + def utest_read_relation(self, relation: str) -> pd.DataFrame: + """ + Read a relation into a pandas DataFrame for unit-test assertions. + + Even though this executor uses BigFrames for normal execution, + utests compare pandas DataFrames, so we convert. + """ + q = f"SELECT * FROM {self._qualified_identifier(relation)}" + job = self.client.query(q, location=self.location) + return job.result().to_dataframe(create_bqstorage_client=True) + + def utest_load_relation_from_rows(self, relation: str, rows: list[dict]) -> None: + """ + Load rows into a BigQuery table for unit tests (replace if exists). + + Implementation uses the raw BigQuery client with pandas, which is + perfectly fine for test input setup. + """ + self._ensure_dataset() + table_id = f"{self.project}.{self.dataset}.{relation}" + df = pd.DataFrame(rows) + + job_config = LoadJobConfig(write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE) + + try: + job = self.client.load_table_from_dataframe( + df, + table_id, + job_config=job_config, + location=self.location, + ) + job.result() + except BadRequest as e: + raise RuntimeError(f"BigQuery utest write failed: {table_id}\n{e}") from e + + def utest_clean_target(self, relation: str) -> None: + """ + For unit tests: drop any table/view with this name in the configured dataset. 
+ """ + table_id = f"{self.project}.{self.dataset}.{relation}" + + try: + self.client.delete_table(table_id, not_found_ok=True) + except NotFound: + pass + except TypeError: + with suppress(NotFound): + self.client.delete_table(table_id) + except Exception: + # Best-effort; don't make the whole test run fail because cleanup hiccupped. + pass diff --git a/src/fastflowtransform/executors/bigquery/pandas.py b/src/fastflowtransform/executors/bigquery/pandas.py index 32b2e7d..4d0c1b5 100644 --- a/src/fastflowtransform/executors/bigquery/pandas.py +++ b/src/fastflowtransform/executors/bigquery/pandas.py @@ -2,6 +2,7 @@ from __future__ import annotations from collections.abc import Iterable +from contextlib import suppress from time import perf_counter import pandas as pd @@ -94,3 +95,55 @@ def _record_dataframe_stats(self, df: pd.DataFrame, duration_ms: int) -> None: duration_ms=duration_ms, ) ) + + # ---- Unit-test helpers (pandas) --------------------------------------- + + def utest_read_relation(self, relation: str) -> pd.DataFrame: + """ + Read a relation into a pandas DataFrame for unit-test assertions. + """ + q = f"SELECT * FROM {self._qualified_identifier(relation)}" + job = self.client.query(q, location=self.location) + # Same convention as _read_relation: use BigQuery Storage if available + return job.result().to_dataframe(create_bqstorage_client=True) + + def utest_load_relation_from_rows(self, relation: str, rows: list[dict]) -> None: + """ + Load rows into a BigQuery table for unit tests (replace if exists). 
+ """ + self._ensure_dataset() + table_id = f"{self.project}.{self.dataset}.{relation}" + df = pd.DataFrame(rows) + + job_config = LoadJobConfig(write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE) + + try: + job = self.client.load_table_from_dataframe( + df, + table_id, + job_config=job_config, + location=self.location, + ) + job.result() + except BadRequest as e: + raise RuntimeError(f"BigQuery utest write failed: {table_id}\n{e}") from e + + def utest_clean_target(self, relation: str) -> None: + """ + For unit tests: drop any table/view with this name in the configured dataset. + """ + table_id = f"{self.project}.{self.dataset}.{relation}" + # BigQuery treats views & tables both as "tables" for deletion. + try: + # not_found_ok=True is available on the real client; our typing alias + # should be compatible - if not, just ignore NotFound below. + self.client.delete_table(table_id, not_found_ok=True) + except NotFound: + pass + except TypeError: + # For older client versions without not_found_ok, fall back: + with suppress(NotFound): + self.client.delete_table(table_id) + except Exception: + # Cleanup is best-effort in utests. + pass diff --git a/src/fastflowtransform/executors/databricks_spark.py b/src/fastflowtransform/executors/databricks_spark.py index 7a3bb34..7e8a7e9 100644 --- a/src/fastflowtransform/executors/databricks_spark.py +++ b/src/fastflowtransform/executors/databricks_spark.py @@ -9,6 +9,7 @@ from typing import Any, cast from urllib.parse import unquote, urlparse +import pandas as pd from jinja2 import Environment from fastflowtransform import storage @@ -1257,6 +1258,56 @@ def execute_hook_sql(self, sql: str) -> None: # Reuse your existing single-statement executor self._execute_sql(stmt) + # ---- Unit-test helpers ------------------------------------------------- + + def utest_load_relation_from_rows(self, relation: str, rows: list[dict]) -> None: + """ + Load rows into a Spark table for unit tests (replace if exists). 
+ + We go via pandas → Spark so schema is inferred from the Python + data, then delegate to the same table-writing pipeline as the + normal engine (_save_df_as_table), so table_format / storage + options / catalogs are all respected. + """ + pdf = pd.DataFrame(rows) + # Spark can infer schema from the pandas DataFrame, even for empty + # frames (it will just create an empty table with no rows). + sdf = self.spark.createDataFrame(pdf) + # Use the same path as normal model materialization so that + # Delta/Iceberg/etc. are handled consistently. + self._save_df_as_table(relation, sdf) + + def utest_read_relation(self, relation: str) -> pd.DataFrame: + """ + Read a relation as a pandas DataFrame for unit-test assertions. + + The utest framework always compares on pandas, so we convert from + Spark DataFrame here. + """ + physical = self._physical_identifier(relation) + sdf = self.spark.table(physical) + return sdf.toPandas() + + def utest_clean_target(self, relation: str) -> None: + """ + For unit tests: drop any view or table with this name. + + We: + - try DROP VIEW IF EXISTS ... + - try DROP TABLE IF EXISTS ... + and ignore type-mismatch errors, so it doesn't matter whether a + table or a view currently exists under that name. + """ + ident = self._sql_identifier(relation) + + # Drop view first; ignore errors if it's actually a table or missing. + with suppress(Exception): + self._execute_sql(f"DROP VIEW IF EXISTS {ident}") + + # Then drop table; ignore errors if it's actually a view or missing. 
+ with suppress(Exception): + self._execute_sql(f"DROP TABLE IF EXISTS {ident}") + # ────────────────────────── local helpers / shim ────────────────────────── class _SparkResult: diff --git a/src/fastflowtransform/executors/duckdb.py b/src/fastflowtransform/executors/duckdb.py index e872583..7025014 100644 --- a/src/fastflowtransform/executors/duckdb.py +++ b/src/fastflowtransform/executors/duckdb.py @@ -3,6 +3,7 @@ import json import re +import uuid from collections.abc import Iterable from contextlib import suppress from pathlib import Path @@ -852,3 +853,42 @@ def execute_hook_sql(self, sql: str) -> None: Accepts a string that may contain ';'-separated statements. """ self._exec_many(sql) + + # ---- Unit-test helpers ------------------------------------------------- + + def utest_load_relation_from_rows(self, relation: str, rows: list[dict]) -> None: + """ + Load rows into a DuckDB table for unit tests, fully qualified to + this executor's schema/catalog. + """ + df = pd.DataFrame(rows) + tmp = f"_ff_utest_tmp_{uuid.uuid4().hex[:12]}" + self.con.register(tmp, df) + try: + target = self._qualified(relation) + self._execute_sql(f"create or replace table {target} as select * from {tmp}") + finally: + with suppress(Exception): + self.con.unregister(tmp) + # Fallback for older DuckDB where unregister might not exist + with suppress(Exception): + self._execute_sql(f'drop view if exists "{tmp}"') + + def utest_read_relation(self, relation: str) -> pd.DataFrame: + """ + Read a relation as a DataFrame for unit-test assertions. + """ + target = self._qualified(relation, quoted=False) + return self.con.table(target).df() + + def utest_clean_target(self, relation: str) -> None: + """ + Drop any table/view with the given name in this schema/catalog. + Safe because utest uses its own DB/path. 
+ """ + target = self._qualified(relation) + # best-effort; ignore failures + with suppress(Exception): + self._execute_sql(f"drop view if exists {target}") + with suppress(Exception): + self._execute_sql(f"drop table if exists {target}") diff --git a/src/fastflowtransform/executors/postgres.py b/src/fastflowtransform/executors/postgres.py index b8bef0a..f46c9a3 100644 --- a/src/fastflowtransform/executors/postgres.py +++ b/src/fastflowtransform/executors/postgres.py @@ -58,6 +58,51 @@ def __init__(self, dsn: str, schema: str | None = None): # ⇣ fastflowtransform.testing expects executor.con.execute("SQL") self.con = SAConnShim(self.engine, schema=self.schema) + def _execute_sql_core( + self, + sql: str, + *args: Any, + conn: Connection, + **kwargs: Any, + ) -> Any: + """ + Lowest-level SQL executor: + + - sets search_path + - executes the statement via given connection + - NO budget guard + - NO timing / stats + + Used by both the high-level _execute_sql and maintenance helpers. + """ + self._set_search_path(conn) + return conn.execute(text(sql), *args, **kwargs) + + def _execute_sql_maintenance( + self, + sql: str, + *args: Any, + conn: Connection | None = None, + **kwargs: Any, + ) -> Any: + """ + Utility/maintenance SQL: + + - sets search_path + - NO budget guard + - NO stats + + Intended for: + - utest cleanup + - ANALYZE + - DDL that shouldn't be budget-accounted + """ + if conn is None: + with self.engine.begin() as local_conn: + return self._execute_sql_core(sql, *args, conn=local_conn, **kwargs) + else: + return self._execute_sql_core(sql, *args, conn=conn, **kwargs) + def _execute_sql( self, sql: str, @@ -84,12 +129,10 @@ def _execute_sql( if conn is None: # Standalone use: open our own transaction with self.engine.begin() as local_conn: - self._set_search_path(local_conn) - result = local_conn.execute(text(sql), *args, **kwargs) + result = self._execute_sql_core(sql, *args, conn=local_conn, **kwargs) else: # Reuse existing connection / transaction (e.g. 
in run_snapshot_sql) - self._set_search_path(conn) - result = conn.execute(text(sql), *args, **kwargs) + result = self._execute_sql_core(sql, *args, conn=conn, **kwargs) dt_ms = int((perf_counter() - t0) * 1000) @@ -697,3 +740,124 @@ def execute_hook_sql(self, sql: str) -> None: Accepts a string that may contain ';'-separated statements. """ self._execute_sql(sql) + + # ---- Unit-test helpers ------------------------------------------------- + + def utest_load_relation_from_rows(self, relation: str, rows: list[dict]) -> None: + """ + Load rows into a Postgres table for unit tests (replace if exists), + without using pandas.to_sql. + """ + qualified = self._qualified(relation) + + if not rows: + # Ensure an empty table exists (corner case). + try: + with self.engine.begin() as conn: + self._execute_sql_maintenance( + f"DROP TABLE IF EXISTS {qualified} CASCADE", + conn=conn, + ) + self._execute_sql_maintenance( + f"CREATE TABLE {qualified} ()", + conn=conn, + ) + except SQLAlchemyError as e: + raise ModelExecutionError( + node_name=f"utest::{relation}", + relation=self._qualified(relation), + message=str(e), + ) from e + return + + first = rows[0] + if not isinstance(first, dict): + raise ModelExecutionError( + node_name=f"utest::{relation}", + relation=self._qualified(relation), + message=f"Expected list[dict] for rows, got {type(first).__name__}", + ) + + cols = list(first.keys()) + col_list_sql = ", ".join(self._q_ident(c) for c in cols) + select_exprs = ", ".join(f":{c} AS {self._q_ident(c)}" for c in cols) + insert_values_sql = ", ".join(f":{c}" for c in cols) + + try: + with self.engine.begin() as conn: + # Replace any existing table + self._execute_sql_maintenance( + f"DROP TABLE IF EXISTS {qualified} CASCADE", + conn=conn, + ) + + # Create table from first row + create_sql = f"CREATE TABLE {qualified} AS SELECT {select_exprs}" + self._execute_sql_maintenance(create_sql, first, conn=conn) + + # Insert remaining rows + if len(rows) > 1: + insert_sql = ( + 
f"INSERT INTO {qualified} ({col_list_sql}) VALUES ({insert_values_sql})" + ) + for row in rows[1:]: + self._execute_sql_maintenance(insert_sql, row, conn=conn) + + except SQLAlchemyError as e: + raise ModelExecutionError( + node_name=f"utest::{relation}", + relation=self._qualified(relation), + message=str(e), + ) from e + + def utest_read_relation(self, relation: str) -> pd.DataFrame: + """ + Read a relation as a DataFrame for unit-test assertions. + """ + qualified = self._qualified(relation) + with self.engine.begin() as conn: + self._set_search_path(conn) + return pd.read_sql_query(text(f"select * from {qualified}"), conn) + + def utest_clean_target(self, relation: str) -> None: + """ + For unit tests: drop any view or table with this name in the configured schema. + + We avoid WrongObjectType by: + - querying information_schema for existing table/view with this name + - dropping only the matching kinds. + """ + with self.engine.begin() as conn: + # Use the same search_path logic as the rest of the executor + self._set_search_path(conn) + + # Decide which schema to inspect + cur_schema = conn.execute(text("select current_schema()")).scalar() + schema = self.schema or cur_schema + + # Find objects named in that schema + info_sql = """ + select kind, table_schema, table_name from ( + select 'table' as kind, table_schema, table_name + from information_schema.tables + where lower(table_schema) = lower(:schema) + and lower(table_name) = lower(:rel) + union all + select 'view' as kind, table_schema, table_name + from information_schema.views + where lower(table_schema) = lower(:schema) + and lower(table_name) = lower(:rel) + ) s + order by kind; + """ + rows = conn.execute( + text(info_sql), + {"schema": schema, "rel": relation}, + ).fetchall() + + for kind, table_schema, table_name in rows: + qualified = f'"{table_schema}"."{table_name}"' + if kind == "view": + conn.execute(text(f"DROP VIEW IF EXISTS {qualified} CASCADE")) + else: # table + conn.execute(text(f"DROP 
TABLE IF EXISTS {qualified} CASCADE")) diff --git a/src/fastflowtransform/executors/snowflake_snowpark.py b/src/fastflowtransform/executors/snowflake_snowpark.py index 1c60794..46741d1 100644 --- a/src/fastflowtransform/executors/snowflake_snowpark.py +++ b/src/fastflowtransform/executors/snowflake_snowpark.py @@ -7,6 +7,7 @@ from time import perf_counter from typing import Any, cast +import pandas as pd from jinja2 import Environment from fastflowtransform.core import Node, relation_for @@ -717,6 +718,89 @@ def execute_hook_sql(self, sql: str) -> None: """ self._exec_many(sql) + # ---- Unit-test helpers ----------------------------------------------- + + def utest_read_relation(self, relation: str) -> pd.DataFrame: + """ + Read a relation into a pandas DataFrame for unit-test assertions. + + We use Snowpark to read the table and convert to pandas, + normalizing column names to lowercase to match _read_relation. + """ + df = self.session.table(self._qualified(relation)) + # Mirror _read_relation: present lowercase schema to the test layer + lowered = [c.lower() for c in df.schema.names] + df = df.toDF(*lowered) + + to_pandas = getattr(df, "to_pandas", None) + + pdf: pd.DataFrame + if callable(to_pandas): + pdf = cast(pd.DataFrame, to_pandas()) + else: + rows = df.collect() + records = [r.asDict() for r in rows] + pdf = pd.DataFrame.from_records(records) + + # Return a new DF with lowercase columns (no attribute assignment) + return pdf.rename(columns=lambda c: str(c).lower()) + + def utest_load_relation_from_rows(self, relation: str, rows: list[dict]) -> None: + """ + Load rows into a Snowflake table for unit tests (replace if exists). + + We build a Snowpark DataFrame from the Python rows and overwrite the + target table using save_as_table(). + """ + # Best-effort: if rows are empty, create an empty table with no rows. + # We assume at least one row in normal test usage so we can infer schema. 
+ if not rows: + # Without any rows we don't know the schema; create a trivial + # single-column table to surface the situation clearly. + tmp_df = self.session.create_dataframe([[None]], schema=["__empty__"]) + tmp_df.write.save_as_table(self._qualified(relation), mode="overwrite") + return + + # Infer column order from the first row + first = rows[0] + columns = list(first.keys()) + + # Normalize data to a list of lists in a fixed column order + data = [[row.get(col) for col in columns] for row in rows] + + df = self.session.create_dataframe(data, schema=columns) + + # Store with uppercase column names in Snowflake (conventional) + upper_cols = [c.upper() for c in columns] + if columns != upper_cols: + df = df.toDF(*upper_cols) + + # Overwrite the target table + df.write.save_as_table(self._qualified(relation), mode="overwrite") + + def utest_clean_target(self, relation: str) -> None: + """ + For unit tests: drop any table or view with this name in the configured + database/schema. + + We: + - try DROP VIEW IF EXISTS DB.SCHEMA.REL + - try DROP TABLE IF EXISTS DB.SCHEMA.REL + + and ignore "not a view/table" style errors so it doesn't matter what + kind of object is currently there - after this, nothing with that name + should remain (best-effort). + """ + qualified = self._qualified(relation) + + # Drop view first; ignore errors if it's actually a table or doesn't exist. + with suppress(Exception): + self.session.sql(f"DROP VIEW IF EXISTS {qualified}").collect() + + # Then drop table; ignore errors if it's actually a view or doesn't exist. 
+ with suppress(Exception): + self.session.sql(f"DROP TABLE IF EXISTS {qualified}").collect() + # ────────────────────────── local testing shim ─────────────────────────── class _SFCursorShim: diff --git a/src/fastflowtransform/hooks/__init__.py b/src/fastflowtransform/hooks/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/fastflowtransform/seeding.py b/src/fastflowtransform/seeding.py index 008d4c0..98d88da 100644 --- a/src/fastflowtransform/seeding.py +++ b/src/fastflowtransform/seeding.py @@ -586,6 +586,7 @@ def _handle_bigquery(table: str, df: pd.DataFrame, executor: Any, schema: str | t0 = perf_counter() # Let the BigQuery client infer the schema from the pandas DataFrame. + executor._ensure_dataset() load_job = client.load_table_from_dataframe(df, table_id, job_config=job_config) load_job.result() # Wait for completion dt_ms = int((perf_counter() - t0) * 1000) diff --git a/src/fastflowtransform/settings.py b/src/fastflowtransform/settings.py index d7f7029..99cdd36 100644 --- a/src/fastflowtransform/settings.py +++ b/src/fastflowtransform/settings.py @@ -3,6 +3,7 @@ import os from collections.abc import Callable +from copy import deepcopy from pathlib import Path from typing import Annotated, Any, Literal, cast @@ -184,6 +185,20 @@ def _env(name: str, default: str | None = "") -> str: # ---------- Resolver ---------- + + +def _deep_merge(base: Any, override: Any) -> Any: + """ + Recursive merge for dicts. Lists/scalars are replaced entirely. 
+ """ + if isinstance(base, dict) and isinstance(override, dict): + out = dict(base) + for k, v in override.items(): + out[k] = _deep_merge(out.get(k), v) + return out + return override if override is not None else base + + def resolve_profile(project_dir: Path, env_name: str, env: EnvSettings) -> Profile: profiles: dict[str, dict[str, Any]] = load_profiles(project_dir) requested = profiles.get(env_name) @@ -205,6 +220,110 @@ def resolve_profile(project_dir: Path, env_name: str, env: EnvSettings) -> Profi return prof +def resolve_utest_profile(project_dir: Path, base_env_name: str, env: EnvSettings) -> Profile: + """ + Resolve the *utest* profile for a given base env, e.g. + base_env_name = "dev_duckdb" -> profile "dev_duckdb_utest" + + Requirements: + - base profile (base_env_name) is resolved with env overrides (FF_*). + - utest profile (base_env_name + "_utest") is resolved from profiles.yml + WITHOUT env overrides so it cannot accidentally point at the same DB/schema. + - utest profile MUST exist and MUST be isolated from the base one. + """ + profiles: dict[str, dict[str, Any]] = load_profiles(project_dir) + + # 1) Resolve the *base* profile normally (with env overrides). + base_prof: Profile = resolve_profile(project_dir, base_env_name, env) + + # 2) Load the raw utest profile from YAML (no env overrides here). + utest_env_name = f"{base_env_name}_utest" + raw_utest = profiles.get(utest_env_name) + + if raw_utest is None: + raise ProfileConfigError( + f"Unit-test profile '{utest_env_name}' not found in profiles.yml. " + f"Define it explicitly to run 'fft utest' for env '{base_env_name}'." + ) + + # Work on a copy and DO NOT call _apply_env_overrides(). + raw_utest_copy = deepcopy(raw_utest) + + # --- Inherit SAFE connection fields from base profile ---------------- + # The idea: + # - we inherit things that *do not define isolation* + # (e.g. DSN, project, account, warehouse, etc.) 
+ # - but we DO NOT inherit things like schema/dataset/path that we want to be different. + eng = base_prof.engine + + if eng == "postgres": + base_pg = cast(PostgresProfile, base_prof) + base_dsn = base_pg.postgres.dsn + if base_dsn: + pg_cfg = raw_utest_copy.setdefault("postgres", {}) + pg_cfg.setdefault("dsn", base_dsn) + + elif eng == "bigquery": + base_bq = cast(BigQueryProfile, base_prof) + bq_cfg = raw_utest_copy.setdefault("bigquery", {}) + + # Safe to inherit: project & location & allow_create_dataset + if base_bq.bigquery.project is not None: + bq_cfg.setdefault("project", base_bq.bigquery.project) + if base_bq.bigquery.location is not None: + bq_cfg.setdefault("location", base_bq.bigquery.location) + # dataset is the isolation dimension → MUST be set explicitly in the utest profile + # and will be checked by _assert_utest_isolated (in CLI/bootstrap). + if "allow_create_dataset" in base_bq.bigquery.__dict__: + bq_cfg.setdefault("allow_create_dataset", base_bq.bigquery.allow_create_dataset) + + elif eng == "duckdb": + base_ddb = cast(DuckDBProfile, base_prof) + ddb_cfg = raw_utest_copy.setdefault("duckdb", {}) + # Safe-ish to inherit catalog; we do NOT inherit path (isolation) or schema + if base_ddb.duckdb.catalog is not None: + ddb_cfg.setdefault("catalog", base_ddb.duckdb.catalog) + # path & db_schema must be explicitly configured for the utest profile. 
+ + elif eng == "databricks_spark": + base_dbr = cast(DatabricksSparkProfile, base_prof) + dbr_cfg = raw_utest_copy.setdefault("databricks_spark", {}) + # Safe to inherit connectivity bits: + if base_dbr.databricks_spark.master is not None: + dbr_cfg.setdefault("master", base_dbr.databricks_spark.master) + if base_dbr.databricks_spark.app_name is not None: + dbr_cfg.setdefault("app_name", base_dbr.databricks_spark.app_name) + if base_dbr.databricks_spark.warehouse_dir is not None: + dbr_cfg.setdefault("warehouse_dir", base_dbr.databricks_spark.warehouse_dir) + if base_dbr.databricks_spark.catalog is not None: + dbr_cfg.setdefault("catalog", base_dbr.databricks_spark.catalog) + # database is the isolation dimension → must differ and will be checked elsewhere. + + elif eng == "snowflake_snowpark": + base_sf = cast(SnowflakeSnowparkProfile, base_prof) + sf_cfg = raw_utest_copy.setdefault("snowflake_snowpark", {}) + # Safe to inherit: account/user/password/warehouse/database/role/allow_create_schema + for attr in ( + "account", + "user", + "password", + "warehouse", + "database", + "role", + "allow_create_schema", + ): + val = getattr(base_sf.snowflake_snowpark, attr, None) + if val is not None: + sf_cfg.setdefault(attr, val) + # db_schema (schema) must be explicitly set for the utest profile. + + # 3) Validate the resulting utest profile + utest_prof: Profile = TypeAdapter(Profile).validate_python(raw_utest_copy) + _sanity_check_profile(utest_prof) + + return utest_prof + + # ---------- ENV-Overrides ---------- def _apply_env_overrides(raw: dict[str, Any], env: EnvSettings) -> None: if getattr(env, "ENGINE", None): @@ -349,3 +468,67 @@ def _check_snowflake_snowpark(prof: SnowflakeSnowparkProfile) -> None: f"Snowflake profile missing: {miss}. " "Hint: set profiles.yml → snowflake_snowpark.* or env FF_SF_*." 
) + + +def _assert_utest_isolated(base: Profile, utest: Profile, base_env_name: str) -> None: + """ + Ensure the utest profile does NOT share the same DB/path/schema with the base profile. + If it does, raise ProfileConfigError and prevent utest from running. + """ + # Different engines → nothing to compare here + if base.engine != utest.engine: + return + + eng = base.engine + + if eng == "duckdb": + base_path = cast(DuckDBProfile, base).duckdb.path + utest_path = cast(DuckDBProfile, utest).duckdb.path + if base_path and utest_path and base_path == utest_path: + raise ProfileConfigError( + f"Unit-test profile '{base_env_name}_utest' must NOT reuse the same DuckDB path " + f"('{base_path}') as profile '{base_env_name}'. " + "Configure a separate file/path (e.g. '.local/basic_demo_utest.duckdb' " + "or ':memory:')." + ) + + elif eng == "postgres": + base_schema = cast(PostgresProfile, base).postgres.db_schema + utest_schema = cast(PostgresProfile, utest).postgres.db_schema + if base_schema == utest_schema: + raise ProfileConfigError( + f"Unit-test profile '{base_env_name}_utest' must NOT reuse the same Postgres " + f"schema ('{base_schema}') as profile '{base_env_name}'. " + "Use a dedicated schema for unit tests." + ) + + elif eng == "bigquery": + base_b = cast(BigQueryProfile, base).bigquery + utest_b = cast(BigQueryProfile, utest).bigquery + if base_b.project == utest_b.project and base_b.dataset == utest_b.dataset: + raise ProfileConfigError( + f"Unit-test profile '{base_env_name}_utest' must NOT reuse the same BigQuery " + f"project/dataset ('{base_b.project}.{base_b.dataset}') " + f"as profile '{base_env_name}'. " + "Use a separate dataset for unit tests." 
+ ) + + elif eng == "databricks_spark": + base_db = cast(DatabricksSparkProfile, base).databricks_spark.database + utest_db = cast(DatabricksSparkProfile, utest).databricks_spark.database + if base_db and utest_db and base_db == utest_db: + raise ProfileConfigError( + f"Unit-test profile '{base_env_name}_utest' must NOT reuse the same Databricks " + f"database ('{base_db}') as profile '{base_env_name}'. " + "Use a dedicated database for unit tests." + ) + + elif eng == "snowflake_snowpark": + base_schema = cast(SnowflakeSnowparkProfile, base).snowflake_snowpark.db_schema + utest_schema = cast(SnowflakeSnowparkProfile, utest).snowflake_snowpark.db_schema + if base_schema == utest_schema: + raise ProfileConfigError( + f"Unit-test profile '{base_env_name}_utest' must NOT reuse the same Snowflake " + f"schema ('{base_schema}') as profile '{base_env_name}'. " + "Use a dedicated schema for unit tests." + ) diff --git a/src/fastflowtransform/utest.py b/src/fastflowtransform/utest.py index 09051fe..f9231dd 100644 --- a/src/fastflowtransform/utest.py +++ b/src/fastflowtransform/utest.py @@ -1,11 +1,9 @@ # src/fastflowtransform/utest.py -from __future__ import annotations - +import datetime import difflib import hashlib import json import os -import uuid from collections.abc import Iterable, Mapping from contextlib import suppress from dataclasses import dataclass, field @@ -14,7 +12,7 @@ import pandas as pd import yaml -from sqlalchemy import text +from pydantic import BaseModel, ConfigDict, Field, ValidationError from fastflowtransform.cache import FingerprintCache, can_skip_node from fastflowtransform.fingerprint import ( @@ -31,38 +29,102 @@ # ---------- Specifications ---------- -@dataclass -class UnitCase: - name: str - inputs: dict[str, dict] # rel -> {rows|csv} - expect: dict # {relation?, rows?, order_by?, any_order?, approx?, ignore_columns?, subset?} +class UnitInput(BaseModel): + """Single relation input: either inline rows or a CSV file.""" + model_config = 
ConfigDict(extra="forbid") + + rows: list[dict[str, Any]] | None = None + csv: str | None = None + + +class UnitExpect(BaseModel): + """ + Expected result configuration for a unit-test case. + + Extra keys are forbidden so YAML specs are tightly validated. + """ + + model_config = ConfigDict(extra="forbid") + + relation: str | None = None + rows: list[dict[str, Any]] = Field(default_factory=list) + order_by: list[str] | None = None + any_order: bool = False + approx: dict[str, float] | None = None + ignore_columns: list[str] | None = None + subset: bool = False -@dataclass -class UnitSpec: - model: str - engine: str | None - defaults: dict - cases: list[UnitCase] - path: Path - project_dir: Path +class UnitDefaults(BaseModel): + """Defaults that apply to all cases in a spec unless overridden.""" -# ---------- Discovery & Defaults ---------- + model_config = ConfigDict(extra="forbid") + inputs: dict[str, UnitInput] = Field(default_factory=dict) + expect: UnitExpect = Field(default_factory=UnitExpect) -def _deep_merge(base: Any, override: Any) -> Any: + +class UnitCase(BaseModel): + """A single unit-test case within a spec.""" + + model_config = ConfigDict(extra="forbid") + + name: str + inputs: dict[str, UnitInput] = Field(default_factory=dict) + expect: UnitExpect = Field(default_factory=UnitExpect) + + +class UnitSpec(BaseModel): """ - Recursive merge for dicts. Lists/scalars are replaced entirely. - (Perfectly adequate for our DSL.) + Top-level unit-test specification loaded from YAML. + + `path` and `project_dir` are runtime-only and are not populated from YAML + (we set them in discovery). 
""" - if isinstance(base, dict) and isinstance(override, dict): - out = dict(base) - for k, v in override.items(): - out[k] = _deep_merge(out.get(k), v) - return out - # Fallback: replace (lists and scalars included) - return override if override is not None else base + + model_config = ConfigDict(extra="forbid") + + model: str + engine: str | None = None + defaults: UnitDefaults = Field(default_factory=UnitDefaults) + cases: list[UnitCase] = Field(default_factory=list) + + path: Path | None = Field(default=None, exclude=True) + project_dir: Path | None = Field(default=None, exclude=True) + + # ---- defaults merging helpers ------------------------------------- + def _merge_expect(self, case_expect: UnitExpect) -> UnitExpect: + """ + Merge spec-level default.expect with case.expect. + + Only fields explicitly set on the case override the defaults. + """ + base = self.defaults.expect.model_dump() + override = case_expect + + for field_name in override.model_fields_set: + base[field_name] = getattr(override, field_name) + + return UnitExpect(**base) + + def _merge_inputs(self, case_inputs: dict[str, UnitInput]) -> dict[str, UnitInput]: + """ + Merge spec-level default.inputs with case.inputs (case wins per relation). + """ + merged: dict[str, UnitInput] = dict(self.defaults.inputs) + merged.update(case_inputs or {}) + return merged + + def merged_case(self, case: UnitCase) -> UnitCase: + """ + Return a new UnitCase where defaults have been applied (inputs + expect). 
+ """ + return UnitCase( + name=case.name, + inputs=self._merge_inputs(case.inputs), + expect=self._merge_expect(case.expect), + ) def discover_unit_specs( @@ -70,33 +132,25 @@ def discover_unit_specs( ) -> list[UnitSpec]: files = [Path(path)] if path else list((project_dir / "tests" / "unit").glob("*.yml")) specs: list[UnitSpec] = [] + for f in files: - data = yaml.safe_load(f.read_text(encoding="utf-8")) or {} - model = data.get("model") - if not model: + raw = yaml.safe_load(f.read_text(encoding="utf-8")) or {} + if not raw: continue - if only_model and model != only_model: + + try: + spec = UnitSpec.model_validate(raw) + except ValidationError as exc: + raise ValueError(f"Invalid unit-test spec {f}: {exc}") from exc + + if only_model and spec.model != only_model: continue - defaults = data.get("defaults", {}) or {} - engine = data.get("engine") - cases_raw = data.get("cases", []) or [] - cases: list[UnitCase] = [] - for c in cases_raw: - base = {"inputs": defaults.get("inputs", {}), "expect": defaults.get("expect", {})} - merged = _deep_merge( - base, {"inputs": c.get("inputs", {}), "expect": c.get("expect", {})} - ) - cases.append(UnitCase(name=c["name"], inputs=merged["inputs"], expect=merged["expect"])) - specs.append( - UnitSpec( - model=model, - engine=engine, - defaults=defaults, - cases=cases, - path=f, - project_dir=project_dir.resolve(), - ) - ) + + # Attach runtime fields + spec.path = f + spec.project_dir = project_dir.resolve() + specs.append(spec) + return specs @@ -104,27 +158,15 @@ def discover_unit_specs( def _load_relation_from_rows(executor: Any, rel: str, rows: list[dict]) -> None: - df = pd.DataFrame(rows) - if hasattr(executor, "con"): # DuckDB - # unique temp name per call to avoid clashes under parallel runs - tmp_name = f"_ff_unit_tmp_{uuid.uuid4().hex[:12]}" - executor.con.register(tmp_name, df) - try: - executor.con.execute(f'create or replace table "{rel}" as select * from {tmp_name}') - finally: - # DuckDB >= 0.8: unregister exists; 
otherwise drop the view fallback - try: - executor.con.unregister(tmp_name) - except Exception: - executor.con.execute(f"drop view if exists {tmp_name}") - return - if hasattr(executor, "engine"): # Postgres - schema = getattr(executor, "schema", None) - df.to_sql( - rel, executor.engine, if_exists="replace", index=False, schema=schema, method="multi" + """ + Delegate loading test-input rows to the executor's utest helper. + """ + if not hasattr(executor, "utest_load_relation_from_rows"): + raise RuntimeError( + f"Unit tests: executor of type {type(executor).__name__} " + "does not implement utest_load_relation_from_rows()." ) - return - raise RuntimeError("Unit tests: unsupported executor backend") + executor.utest_load_relation_from_rows(rel, rows) def _load_relation_from_csv(executor: Any, rel: str, csv_path: Path) -> None: @@ -133,20 +175,23 @@ def _load_relation_from_csv(executor: Any, rel: str, csv_path: Path) -> None: def _read_result(executor: Any, rel: str) -> pd.DataFrame: - if hasattr(executor, "con"): # DuckDB - return executor.con.table(rel).df() - if hasattr(executor, "engine"): # Postgres - schema = getattr(executor, "schema", None) - qualified = f'"{schema}"."{rel}"' if schema else f'"{rel}"' - - with executor.engine.begin() as conn: - return pd.read_sql_query(text(f"select * from {qualified}"), conn) - raise RuntimeError("Unit tests: unsupported executor backend for reading results") + """ + Delegate reading result relation to the executor's utest helper. + """ + if not hasattr(executor, "utest_read_relation"): + raise RuntimeError( + f"Unit tests: executor of type {type(executor).__name__} " + "does not implement utest_read_relation()." 
+ ) + return executor.utest_read_relation(rel) def _project_root_for_spec(spec: UnitSpec) -> Path: if getattr(REGISTRY, "project_dir", None): return Path(REGISTRY.get_project_dir()).resolve() + if spec.path is None: + proj = spec.project_dir + return proj.resolve() if isinstance(proj, Path) else Path.cwd() p = spec.path.resolve() for parent in [p.parent, *list(p.parents)]: if (parent / "models").is_dir(): @@ -234,57 +279,33 @@ def _resolve_csv_path(spec: Any, csv_val: str) -> Path: return (candidates[0] if candidates else (Path.cwd() / p)).resolve() -def _extract_defaults_inputs(spec: Any) -> dict[str, Any]: - """Return defaults.inputs as dict or {}. Works for dict, namespace/dataclass, mapping-like.""" - defaults = getattr(spec, "defaults", None) - - # Case 1: dict / Mapping - if isinstance(defaults, Mapping): - val = cast(Mapping[str, Any], defaults).get("inputs", {}) - return val if isinstance(val, dict) else {} - - if defaults is None: - return {} - - # Case 2: object with attribute 'inputs' (e.g. SimpleNamespace / dataclass) - val = getattr(defaults, "inputs", None) - if isinstance(val, dict): - return val - - # Case 3: mapping-like object with get() - get = getattr(defaults, "get", None) - if callable(get): - try: - val = get("inputs") - return val if isinstance(val, dict) else {} - except Exception: - return {} - - return {} - - -def _fingerprint_case_inputs(spec: Any, case: Any) -> str: +def _fingerprint_case_inputs(spec: UnitSpec, case: UnitCase) -> str: """ Compute a deterministic fingerprint of the EFFECTIVE inputs for a case. Merges spec.defaults.inputs and case.inputs (case overrides), then: - For rows: include normalized rows. - For csv: include the resolved path AND its file content digest if available. 
""" - # Gather defaults.inputs robustly - defaults_inputs = _extract_defaults_inputs(spec) - - case_inputs = getattr(case, "inputs", None) or {} - - effective_inputs = _deep_merge(defaults_inputs, case_inputs) - norm: dict[str, Any] = {} - for rel, cfg in (effective_inputs or {}).items(): - item = {} - if isinstance(cfg, dict): - # rows + for rel, cfg in (case.inputs or {}).items(): + item: dict[str, Any] = {} + + # Pydantic model from spec/case + if isinstance(cfg, UnitInput): + if cfg.rows is not None: + item["rows"] = _normalize_for_hash(cfg.rows) + if cfg.csv: + csv_path = _resolve_csv_path(spec, cfg.csv) + item["csv_path"] = csv_path.as_posix() + file_hash = _digest_file(csv_path) + if file_hash: + item["csv_sha256"] = file_hash + else: + item.setdefault("csv_unreadable", True) + # Defensive fallback: mapping-like config + elif isinstance(cfg, Mapping): if "rows" in cfg: item["rows"] = _normalize_for_hash(cfg["rows"]) - # csv if "csv" in cfg and isinstance(cfg["csv"], str): csv_path = _resolve_csv_path(spec, cfg["csv"]) item["csv_path"] = csv_path.as_posix() @@ -292,7 +313,6 @@ def _fingerprint_case_inputs(spec: Any, case: Any) -> str: if file_hash: item["csv_sha256"] = file_hash else: - # Fallback: include path string only if unreadable item.setdefault("csv_unreadable", True) else: # Unknown shape: include normalized raw value @@ -442,14 +462,72 @@ def _rows_as_tuples(df: pd.DataFrame, key_cols: Iterable[str]) -> list[tuple]: return [tuple(df[c].iloc[i] if c in df.columns else None for c in key_cols) for i in idx_range] +def _normalize_cell_for_compare(v: Any) -> Any: + """Normalize individual cell values so that semantically equal values compare equal.""" + # Treat NaNs / None uniformly + if v is None or (isinstance(v, float) and pd.isna(v)): + return "__NA__" + + # pandas.Timestamp + if isinstance(v, pd.Timestamp): + # to_pydatetime() → datetime, then .date() → date + return v.to_pydatetime().date().isoformat() + + # datetime.datetime + if isinstance(v, 
datetime.datetime): + return v.date().isoformat() + + # datetime.date (but not datetime.datetime, already handled above) + if isinstance(v, datetime.date): + return v.isoformat() + + return v + + +def _normalize_df_for_compare(df: pd.DataFrame) -> pd.DataFrame: + """ + Convert a DataFrame to a comparison-friendly shape: + - normalize each cell + - resulting dtypes will usually be 'object', so int32 vs int64 etc. no longer matter + """ + # Avoid DataFrame.applymap() to keep Pylance happy: + # for each column (Series), map every value through _normalize_cell_for_compare + return df.apply(lambda col: col.map(_normalize_cell_for_compare)) + + def _assert_exact_equal(actual_df: pd.DataFrame, exp: pd.DataFrame) -> None: - A = actual_df[exp.columns].fillna("__NA__") - E = exp.fillna("__NA__") - if A.equals(E): + # Align columns first + A = actual_df[exp.columns] + E = exp + + # ---- Make comparison *row-order insensitive* by default ---- + sort_cols = list(E.columns) + A = A.sort_values(sort_cols).reset_index(drop=True) + E = E.sort_values(sort_cols).reset_index(drop=True) + + # Normalize both sides + A_norm = _normalize_df_for_compare(A) + E_norm = _normalize_df_for_compare(E) + + if A_norm.equals(E_norm): return - a_csv = A.to_csv(index=False) - e_csv = E.to_csv(index=False) + # Helpful debug: show dtypes *after* normalization and indices + debug = [ + "Rows differ but CSV output is identical or deceptively similar.", + f"Actual index: {list(A_norm.index)}", + f"Expected index: {list(E_norm.index)}", + "", + "Actual dtypes:", + str(A_norm.dtypes), + "", + "Expected dtypes:", + str(E_norm.dtypes), + ] + debug_msg = "\n".join(debug) + + a_csv = A_norm.to_csv(index=False) + e_csv = E_norm.to_csv(index=False) diff = "\n".join( difflib.unified_diff( e_csv.splitlines(), @@ -459,7 +537,7 @@ def _assert_exact_equal(actual_df: pd.DataFrame, exp: pd.DataFrame) -> None: lineterm="", ) ) - raise UnitAssertionFailure(f"Rows differ:\n{diff}") + raise 
UnitAssertionFailure(f"{debug_msg}\n\nDiff:\n{diff}") # ---------- Runner ---------- @@ -495,6 +573,12 @@ def _normalize_cache_mode(cache_mode: str | Any) -> str: def _detect_engine_name(executor: Any) -> str: + # Prefer explicit engine_name on BaseExecutor subclasses + name = getattr(executor, "engine_name", None) + if isinstance(name, str) and name: + return name + + # Fallback heuristics for non-BaseExecutor usage if hasattr(executor, "con"): return "duckdb" if hasattr(executor, "engine"): @@ -629,13 +713,19 @@ def run_unit_specs( ) for spec in specs: + if spec.engine and spec.engine != engine_name: + continue + node = REGISTRY.nodes.get(spec.model) if not node: print(f"⚠️ Model '{spec.model}' not found (in {spec.path})") ctx.failures += 1 continue - for case in spec.cases: + for raw_case in spec.cases: + # Apply spec.defaults to each case (merged view) + case = spec.merged_case(raw_case) + if only_case and case.name != only_case: continue print(f"→ {spec.model} :: {case.name}") @@ -646,16 +736,35 @@ def run_unit_specs( cand_fp = _fingerprint_case(node, spec, case, ctx) + before_failures = ctx.failures ctx.failures += _load_inputs_for_case(executor, spec, case, node) + # If any input failed to load, skip execution & assertion for this case. 
+ if ctx.failures > before_failures: + print(" ⚠️ skipping execution due to input load failure") + continue + if _maybe_skip_by_cache(node, cand_fp, ctx): _read_and_assert(spec, case, ctx) + _cleanup_inputs_for_case(executor, case) continue + target_rel_cfg = getattr(case, "expect", None) + if isinstance(target_rel_cfg, UnitExpect): + target_rel = target_rel_cfg.relation or relation_for(spec.model) + elif isinstance(target_rel_cfg, Mapping): + target_rel = target_rel_cfg.get("relation") or relation_for(spec.model) + else: + target_rel = relation_for(spec.model) + + _reset_utest_relation(executor, target_rel) + if not _execute_and_update_cache(node, cand_fp, ctx): + _cleanup_inputs_for_case(executor, case) continue _read_and_assert(spec, case, ctx) + _cleanup_inputs_for_case(executor, case) if ctx.cache and ctx.computed_fps and ctx.cache_mode == "rw": # pragma: no cover ctx.cache.update_many(ctx.computed_fps) @@ -667,6 +776,29 @@ def run_unit_specs( # ----------------- Helper ----------------- +def _reset_utest_relation(executor: Any, relation: str) -> None: + """ + Best-effort: ask the executor to drop any view/table for this relation + before we (re)create it in a unit test. + """ + reset = getattr(executor, "utest_clean_target", None) + if callable(reset): + with suppress(Exception): + reset(relation) + + +def _cleanup_inputs_for_case(executor: Any, case: Any) -> None: + """ + Best-effort: drop all input relations after a unit-test case finishes. + + This prevents tables created as test fixtures (like 'users_clean' in mart tests) + from leaking into other specs (like the 'users_clean' model tests). + """ + inputs = getattr(case, "inputs", None) or {} + for rel in inputs: + _reset_utest_relation(executor, rel) + + def _load_inputs_for_case(executor: Any, spec: Any, case: Any, node: Any) -> int: """ Loads all declared relations in 'case.inputs'. 
@@ -682,10 +814,22 @@ def _load_inputs_for_case(executor: Any, spec: Any, case: Any, node: Any) -> int for rel, cfg in (case.inputs or {}).items(): try: - if "rows" in cfg: - _load_relation_from_rows(executor, rel, cfg["rows"]) - elif "csv" in cfg: - csv_path = _resolve_csv_path(spec, cfg["csv"]) + _reset_utest_relation(executor, rel) + + rows: list[dict] | None = None + csv_val: str | None = None + + if isinstance(cfg, UnitInput): + rows = cfg.rows + csv_val = cfg.csv + elif isinstance(cfg, Mapping): + rows = cast(Mapping[str, Any], cfg).get("rows") + csv_val = cast(Mapping[str, Any], cfg).get("csv") + + if rows is not None: + _load_relation_from_rows(executor, rel, rows) + elif csv_val: + csv_path = _resolve_csv_path(spec, csv_val) _load_relation_from_csv(executor, rel, csv_path) else: print(f" ❌ invalid input for relation '{rel}'") @@ -698,6 +842,14 @@ def _load_inputs_for_case(executor: Any, spec: Any, case: Any, node: Any) -> int def _execute_node(executor: Any, node: Any, jenv: Any) -> tuple[bool, str | None]: + # Best-effort cleanup so view<->table flips don't fail in DuckDB/Postgres. + try: + rel = relation_for(node.name) + _reset_utest_relation(executor, rel) + except Exception: + # Cleanup is best-effort; don't fail the test run on cleanup errors. 
+ pass + try: if getattr(node, "kind", None) == "sql": executor.run_sql(node, jenv) @@ -709,7 +861,13 @@ def _execute_node(executor: Any, node: Any, jenv: Any) -> tuple[bool, str | None def _read_target_df(executor: Any, spec: Any, case: Any) -> tuple[bool, Any, str]: - target_rel = case.expect.get("relation") or relation_for(spec.model) + exp_cfg = getattr(case, "expect", None) or {} + if isinstance(exp_cfg, UnitExpect): + target_rel = exp_cfg.relation or relation_for(spec.model) + elif isinstance(exp_cfg, Mapping): + target_rel = exp_cfg.get("relation") or relation_for(spec.model) + else: + target_rel = relation_for(spec.model) try: df = _read_result(executor, target_rel) return True, df, target_rel @@ -719,18 +877,41 @@ def _read_target_df(executor: Any, spec: Any, case: Any) -> tuple[bool, Any, str def _assert_expected_rows(df: Any, case: Any) -> tuple[bool, str | None]: try: + exp_cfg = getattr(case, "expect", None) or {} + + if isinstance(exp_cfg, UnitExpect): + rows_cfg = exp_cfg.rows or [] + order_by = exp_cfg.order_by + any_order = exp_cfg.any_order + approx = exp_cfg.approx + ignore_columns = exp_cfg.ignore_columns + subset = exp_cfg.subset + elif isinstance(exp_cfg, Mapping): + rows_cfg = exp_cfg.get("rows", []) + order_by = exp_cfg.get("order_by") + any_order = exp_cfg.get("any_order", False) + approx = exp_cfg.get("approx") + ignore_columns = exp_cfg.get("ignore_columns") + subset = exp_cfg.get("subset", False) + else: + rows_cfg = [] + order_by = None + any_order = False + approx = None + ignore_columns = None + subset = False + assert_rows_equal( df, - case.expect.get("rows", []), - order_by=case.expect.get("order_by"), - any_order=case.expect.get("any_order", False), - approx=case.expect.get("approx"), - ignore_columns=case.expect.get("ignore_columns"), - subset=case.expect.get("subset", False), + rows_cfg, + order_by=order_by, + any_order=any_order, + approx=approx, + ignore_columns=ignore_columns, + subset=subset, ) return True, None except 
UnitAssertionFailure as e: return False, str(e) except AssertionError as e: - # Falls assert_rows_equal in manchen Pfaden nur AssertionError wirft return False, str(e) diff --git a/tests/common/fixtures.py b/tests/common/fixtures.py index 47e9eb9..f0a3dd2 100644 --- a/tests/common/fixtures.py +++ b/tests/common/fixtures.py @@ -243,21 +243,48 @@ def duckdbutor(): - con.register(...) - con.execute(...) - con.table(...).df() + - plus utest_* helpers used by utest.run_unit_specs """ con = MagicMock() table_df = pd.DataFrame([{"id": 1}]) + # any con.table("whatever").df() returns this df con.table.return_value.df.return_value = table_df class DuckEx: def __init__(self, con): self.con = con + # simple in-memory storage for utest relations + self._utest_tables: dict[str, pd.DataFrame] = {} def run_sql(self, node, jenv): + # For these unit tests we don't actually execute SQL. + # The test only checks that the utest plumbing works. return None def run_python(self, node): return None + # ---- utest helpers expected by utest.run_unit_specs ---- + + def utest_load_relation_from_rows(self, relation: str, rows: list[dict]) -> None: + """ + Store test input rows in memory as a DataFrame. + """ + self._utest_tables[relation] = pd.DataFrame(rows) + + def utest_read_relation(self, relation: str) -> pd.DataFrame: + """ + Read back a relation for assertions. + + Prefer in-memory tables created via utest_load_relation_from_rows; + fall back to the fake con.table(...).df() for things like the model + output ('model_a') that we don't explicitly seed. 
+ """ + if relation in self._utest_tables: + return self._utest_tables[relation] + # fallback: use whatever the MagicMock returns + return self.con.table(relation).df() + return DuckEx(con) @@ -273,6 +300,14 @@ def __init__(self, engine): self.engine = engine self.schema = "public" + # --- new: utest helper used by utest._read_result --- + def utest_read_relation(self, relation: str) -> pd.DataFrame: + # For the test we don't care about the exact SQL or connection, + # we just need to call utest.pd.read_sql_query so the monkeypatch + # in test_read_result_postgres is hit. + sql = f"SELECT * FROM {relation}" + return utest.pd.read_sql_query(sql, self.engine) + return PgEx(engine) diff --git a/tests/integration/utest/test_utest_cache_flag_integration.py b/tests/integration/utest/test_utest_cache_flag_integration.py index 20e3e31..a3790a1 100644 --- a/tests/integration/utest/test_utest_cache_flag_integration.py +++ b/tests/integration/utest/test_utest_cache_flag_integration.py @@ -16,19 +16,40 @@ def _stub_minimal_context(monkeypatch: pytest.MonkeyPatch, tmp_path: Path): # Make sure the minimal project skeleton passes CLI path validation. 
(tmp_path / "models").mkdir(parents=True, exist_ok=True) + # minimal profiles.yml with dev + dev_utest + profiles_yaml = """ +dev: + engine: duckdb + duckdb: + path: "dummy_path" + +dev_utest: + engine: duckdb + duckdb: + path: ":memory:" +""".strip() + + (tmp_path / "profiles.yml").write_text(profiles_yaml) + def fake_load_project_and_env(project_arg: str): # Minimal registry with one model file path (not used by utest runner) REGISTRY.nodes = { "dummy": Node( - "dummy", "sql", path=tmp_path / "models" / "dummy.ff.sql", deps=[], meta={} + "dummy", + "sql", + path=tmp_path / "models" / "dummy.ff.sql", + deps=[], + meta={}, ) } REGISTRY.env = Environment() + # Important: project_dir must be tmp_path so the CLI finds profiles.yml there return tmp_path, REGISTRY.env def fake_resolve_profile(env_name, engine, proj): + # We ignore env_name/engine here - tests don't care about real profile contents. return ( - SimpleNamespace(), + SimpleNamespace(), # raw profile (unused) SimpleNamespace(engine="duckdb", duckdb=SimpleNamespace(path=":memory:")), ) diff --git a/tests/unit/test_utest_unit.py b/tests/unit/test_utest_unit.py index 1319616..5e6c0fb 100644 --- a/tests/unit/test_utest_unit.py +++ b/tests/unit/test_utest_unit.py @@ -4,7 +4,7 @@ import json from pathlib import Path from types import SimpleNamespace -from typing import ClassVar, cast +from typing import cast from unittest.mock import MagicMock import pandas as pd @@ -17,6 +17,8 @@ from fastflowtransform.utest import ( EnvCtx, UnitCase, + UnitDefaults, + UnitExpect, UnitSpec, UtestCtx, _make_env_ctx, @@ -36,61 +38,36 @@ def make_fake_cache() -> FingerprintCache: # ------------------------------------------------------------ -# _deep_merge +# _fingerprint_case_inputs # ------------------------------------------------------------ -@pytest.mark.unit -def test_deep_merge_merges_nested_dicts(): - base = {"a": 1, "b": {"x": 1, "y": 2}} - override = {"b": {"y": 99, "z": 3}, "c": 5} - - out = utest._deep_merge(base, 
override) - - assert out == { - "a": 1, - "b": {"x": 1, "y": 99, "z": 3}, - "c": 5, - } - # base sollte nicht mutiert sein - assert base == {"a": 1, "b": {"x": 1, "y": 2}} - - -@pytest.mark.unit -def test_deep_merge_lists_are_replaced(): - base = {"a": [1, 2]} - override = {"a": [9]} - out = utest._deep_merge(base, override) - assert out == {"a": [9]} - - -# ------------------------------------------------------------ -# _extract_defaults_inputs + _fingerprint_case_inputs -# ------------------------------------------------------------ - - -@pytest.mark.unit -def test_extract_defaults_inputs_missing_returns_empty(): - spec = SimpleNamespace(defaults={}) - res = utest._extract_defaults_inputs(spec) - assert res == {} - - @pytest.mark.unit def test_fingerprint_case_inputs_merges_defaults_and_case(tmp_path, monkeypatch): csv_file = tmp_path / "seed.csv" csv_file.write_text("id,name\n1,A\n", encoding="utf-8") - spec = SimpleNamespace( - defaults={"inputs": {"src": {"rows": [{"id": 1}]}}}, + spec = utest.UnitSpec( + model="dummy", + engine="duckdb", + defaults=utest.UnitDefaults( + inputs={ + "src": utest.UnitInput(rows=[{"id": 1}]), + }, + expect=utest.UnitExpect(), + ), + cases=[], path=tmp_path / "ut.yml", project_dir=tmp_path, ) - case = SimpleNamespace( + + case = utest.UnitCase( + name="c1", inputs={ - "src": {"rows": [{"id": 2}]}, - "dim": {"csv": "seed.csv"}, - } + "src": utest.UnitInput(rows=[{"id": 2}]), + "dim": utest.UnitInput(csv="seed.csv"), + }, + expect=utest.UnitExpect(), ) fp = utest._fingerprint_case_inputs(spec, case) @@ -320,7 +297,8 @@ def test_discover_unit_specs_basic(tmp_path, fake_registry): s = specs[0] assert s.model == "model_a" assert len(s.cases) == 1 - assert s.cases[0].expect["rows"] == [{"id": 2}] + # expect is now a UnitExpect pydantic model, so use attributes + assert s.cases[0].expect.rows == [{"id": 2}] @pytest.mark.unit @@ -348,14 +326,13 @@ def test_discover_unit_specs_only_model_filter(tmp_path, fake_registry): @pytest.mark.duckdb 
def test_load_relation_from_rows_duckdb(duckdbutor): rows = [{"id": 1}, {"id": 2}] - duckdbutor.con.unregister.side_effect = Exception("no unregister in this version") + + # Provide / spy on the new utest helper on our fake executor + duckdbutor.utest_load_relation_from_rows = MagicMock() utest._load_relation_from_rows(duckdbutor, "tmp_tbl", rows) - assert duckdbutor.con.register.call_count == 1 - executed_sqls = [c.args[0] for c in duckdbutor.con.execute.call_args_list] - assert any("create or replace table" in sql.lower() for sql in executed_sqls) - assert any("drop view if exists" in sql.lower() for sql in executed_sqls) + duckdbutor.utest_load_relation_from_rows.assert_called_once_with("tmp_tbl", rows) # --------------------------------------------------------------------------- @@ -433,8 +410,8 @@ def test_project_root_for_spec_fallback(tmp_path, monkeypatch): spec = UnitSpec( model="m1", engine=None, - defaults={}, - cases=[UnitCase(name="c1", inputs={}, expect={})], + defaults=UnitDefaults(), + cases=[UnitCase(name="c1", inputs={}, expect=UnitExpect())], path=spec_path, project_dir=tmp_path, ) @@ -444,41 +421,6 @@ def test_project_root_for_spec_fallback(tmp_path, monkeypatch): assert root == spec_path.parent -# --------------------------------------------------------------------------- -# _extract_defaults_inputs (cases 1, 2, 3) -# --------------------------------------------------------------------------- - - -@pytest.mark.unit -def test_extract_defaults_inputs_from_dict(): - spec = SimpleNamespace(defaults={"inputs": {"a": 1}}) - res = utest._extract_defaults_inputs(spec) - assert res == {"a": 1} - - -@pytest.mark.unit -def test_extract_defaults_inputs_from_attr(): - class D: - inputs: ClassVar[dict[str, int]] = {"b": 2} - - spec = SimpleNamespace(defaults=D()) - res = utest._extract_defaults_inputs(spec) - assert res == {"b": 2} - - -@pytest.mark.unit -def test_extract_defaults_inputs_from_get(): - class D: - def get(self, key): - if key == "inputs": - 
return {"c": 3} - return None - - spec = SimpleNamespace(defaults=D()) - res = utest._extract_defaults_inputs(spec) - assert res == {"c": 3} - - # --------------------------------------------------------------------------- # _make_env_ctx, _make_cache, _get_project_dir_safe # --------------------------------------------------------------------------- @@ -662,12 +604,24 @@ def test_run_unit_specs_happy(tmp_path, fake_registry, duckdbutor, monkeypatch): spec = utest.UnitSpec( model="model_a", engine="duckdb", - defaults={"inputs": {"src1": {"rows": [{"id": 1}]}}}, + defaults=utest.UnitDefaults( + inputs={ + "src1": utest.UnitInput( + rows=[{"id": 1}], + ) + } + ), cases=[ utest.UnitCase( name="c1", - inputs={"src1": {"rows": [{"id": 1}]}}, - expect={"rows": [{"id": 1}]}, + inputs={ + "src1": utest.UnitInput( + rows=[{"id": 1}], + ) + }, + expect=utest.UnitExpect( + rows=[{"id": 1}], + ), ) ], path=tmp_path / "tests" / "unit" / "x.yml", From ff6778768f836337d754d53a4ba86557eaca23f6 Mon Sep 17 00:00:00 2001 From: Marko Lekic Date: Fri, 28 Nov 2025 17:02:29 +0100 Subject: [PATCH 3/3] Refactor budget integration in executors --- .../executors/_budget_runner.py | 86 +++++++++++++++++++ .../executors/bigquery/base.py | 30 ++++--- .../executors/databricks_spark.py | 30 +++---- src/fastflowtransform/executors/duckdb.py | 41 ++++----- src/fastflowtransform/executors/postgres.py | 44 ++++------ .../executors/snowflake_snowpark.py | 29 +++---- .../executors/test_bigquery_bf_exec_unit.py | 4 +- .../unit/executors/test_bigquery_exec_unit.py | 4 +- 8 files changed, 163 insertions(+), 105 deletions(-) create mode 100644 src/fastflowtransform/executors/_budget_runner.py diff --git a/src/fastflowtransform/executors/_budget_runner.py b/src/fastflowtransform/executors/_budget_runner.py new file mode 100644 index 0000000..9ea4a95 --- /dev/null +++ b/src/fastflowtransform/executors/_budget_runner.py @@ -0,0 +1,86 @@ +from __future__ import annotations + +from collections.abc import 
Callable +from contextlib import suppress +from time import perf_counter +from typing import Any + +from fastflowtransform.executors.budget import BudgetGuard +from fastflowtransform.executors.query_stats import QueryStats + + +def run_sql_with_budget( + executor: Any, + sql: str, + *, + guard: BudgetGuard, + exec_fn: Callable[[], Any], + rowcount_extractor: Callable[[Any], int | None] | None = None, + extra_stats: Callable[[Any], QueryStats | None] | None = None, + estimate_fn: Callable[[str], int | None] | None = None, + post_estimate_fn: Callable[[str, Any], int | None] | None = None, + record_stats: bool = True, +) -> Any: + """ + Shared helper for guarded SQL execution with timing + stats recording. + + executor object exposing _apply_budget_guard, _is_budget_guard_active, _record_query_stats + sql statement (used for guard + optional estimator) + exec_fn callable that executes the statement and returns a result/job handle + rowcount_extractor(result) -> int|None best-effort row count (non-negative only) + extra_stats(result) -> QueryStats|None allows engines to override/extend stats post-exec + estimate_fn(sql) -> int|None optional best-effort bytes estimate when guard + inactive + post_estimate_fn(sql, result) -> int|None optional post-exec fallback when bytes are still None + record_stats set False to skip immediate stats (e.g., when a job handle records on .result()) + """ + estimated_bytes = executor._apply_budget_guard(guard, sql) + if ( + estimated_bytes is None + and not executor._is_budget_guard_active() + and estimate_fn is not None + ): + with suppress(Exception): + estimated_bytes = estimate_fn(sql) + + # If stats should be deferred (BigQuery job handles), just run and return. 
+ if not record_stats: + return exec_fn() + + started = perf_counter() + result = exec_fn() + duration_ms = int((perf_counter() - started) * 1000) + + rows: int | None = None + if rowcount_extractor is not None: + with suppress(Exception): + rows = rowcount_extractor(result) + + stats = QueryStats(bytes_processed=estimated_bytes, rows=rows, duration_ms=duration_ms) + + if stats.bytes_processed is None and post_estimate_fn is not None: + with suppress(Exception): + post_estimate = post_estimate_fn(sql, result) + if post_estimate is not None: + stats = QueryStats( + bytes_processed=post_estimate, + rows=stats.rows, + duration_ms=stats.duration_ms, + ) + + if extra_stats is not None: + with suppress(Exception): + extra = extra_stats(result) + if extra: + stats = QueryStats( + bytes_processed=extra.bytes_processed + if extra.bytes_processed is not None + else stats.bytes_processed, + rows=extra.rows if extra.rows is not None else stats.rows, + duration_ms=extra.duration_ms + if extra.duration_ms is not None + else stats.duration_ms, + ) + + executor._record_query_stats(stats) + return result diff --git a/src/fastflowtransform/executors/bigquery/base.py b/src/fastflowtransform/executors/bigquery/base.py index 281aefc..da47d98 100644 --- a/src/fastflowtransform/executors/bigquery/base.py +++ b/src/fastflowtransform/executors/bigquery/base.py @@ -4,6 +4,7 @@ from typing import Any, TypeVar from fastflowtransform.core import Node, relation_for +from fastflowtransform.executors._budget_runner import run_sql_with_budget from fastflowtransform.executors._shims import BigQueryConnShim from fastflowtransform.executors.base import BaseExecutor from fastflowtransform.executors.bigquery._bigquery_mixin import BigQueryIdentifierMixin @@ -69,19 +70,28 @@ def _execute_sql(self, sql: str) -> _TrackedQueryJob: - All 'real' SQL statements in this executor should go through here. - Returns the QueryJob so callers can call .result(). 
""" - self._apply_budget_guard(self._BUDGET_GUARD, sql) - # job = self.client.query(sql, location=self.location) - job_config = bigquery.QueryJobConfig() - if self.dataset: - # Let unqualified tables resolve to project.dataset.table - job_config.default_dataset = bigquery.DatasetReference(self.project, self.dataset) - job = self.client.query( + def _exec() -> _TrackedQueryJob: + job_config = bigquery.QueryJobConfig() + if self.dataset: + # Let unqualified tables resolve to project.dataset.table + job_config.default_dataset = bigquery.DatasetReference(self.project, self.dataset) + + job = self.client.query( + sql, + job_config=job_config, + location=self.location, + ) + return _TrackedQueryJob(job, on_complete=self._record_query_job_stats) + + return run_sql_with_budget( + self, sql, - job_config=job_config, - location=self.location, + guard=self._BUDGET_GUARD, + exec_fn=_exec, + estimate_fn=self._estimate_query_bytes, + record_stats=False, ) - return _TrackedQueryJob(job, on_complete=self._record_query_job_stats) # --- Cost estimation for the shared BudgetGuard ----------------- diff --git a/src/fastflowtransform/executors/databricks_spark.py b/src/fastflowtransform/executors/databricks_spark.py index 7e8a7e9..a93d2b3 100644 --- a/src/fastflowtransform/executors/databricks_spark.py +++ b/src/fastflowtransform/executors/databricks_spark.py @@ -15,6 +15,7 @@ from fastflowtransform import storage from fastflowtransform.core import REGISTRY, Node, relation_for from fastflowtransform.errors import ModelExecutionError +from fastflowtransform.executors._budget_runner import run_sql_with_budget from fastflowtransform.executors._spark_imports import ( get_spark_functions, get_spark_window, @@ -434,29 +435,18 @@ def _execute_sql(self, sql: str) -> SDF: - Returns a Spark DataFrame (same as spark.sql). - Records best-effort query stats for run_results.json. 
""" - estimated_bytes = self._apply_budget_guard(self._BUDGET_GUARD, sql) - if estimated_bytes is None and not self._is_budget_guard_active(): - with suppress(Exception): - estimated_bytes = self._spark_plan_bytes(sql) - t0 = perf_counter() - df = self.spark.sql(sql) - - dt_ms = int((perf_counter() - t0) * 1000) - # Best-effort logical estimate - bytes_processed = ( - estimated_bytes if estimated_bytes is not None else self._spark_plan_bytes(sql) - ) + def _exec() -> SDF: + return self.spark.sql(sql) - # For Spark we don't attempt row counts without executing the job - self._record_query_stats( - QueryStats( - bytes_processed=bytes_processed, - rows=None, - duration_ms=dt_ms, - ) + return run_sql_with_budget( + self, + sql, + guard=self._BUDGET_GUARD, + exec_fn=_exec, + estimate_fn=self._spark_plan_bytes, + post_estimate_fn=lambda _, __: self._spark_plan_bytes(sql), ) - return df # ---------- Frame hooks (required) ---------- def _read_relation(self, relation: str, node: Node, deps: Iterable[str]) -> SDF: diff --git a/src/fastflowtransform/executors/duckdb.py b/src/fastflowtransform/executors/duckdb.py index 7025014..a4374bf 100644 --- a/src/fastflowtransform/executors/duckdb.py +++ b/src/fastflowtransform/executors/duckdb.py @@ -7,7 +7,6 @@ from collections.abc import Iterable from contextlib import suppress from pathlib import Path -from time import perf_counter from typing import Any, ClassVar import duckdb @@ -16,9 +15,9 @@ from jinja2 import Environment from fastflowtransform.core import Node, relation_for +from fastflowtransform.executors._budget_runner import run_sql_with_budget from fastflowtransform.executors.base import BaseExecutor from fastflowtransform.executors.budget import BudgetGuard -from fastflowtransform.executors.query_stats import QueryStats from fastflowtransform.logging import echo from fastflowtransform.meta import ensure_meta_table, upsert_meta from fastflowtransform.snapshots import resolve_snapshot_config @@ -96,32 +95,24 @@ def 
_execute_sql(self, sql: str, *args: Any, **kwargs: Any) -> duckdb.DuckDBPyCo The cost guard may call _estimate_query_bytes(sql) before executing. This wrapper also records simple per-query stats for run_results.json. """ - estimated_bytes = self._apply_budget_guard(self._BUDGET_GUARD, sql) - if estimated_bytes is None and not self._is_budget_guard_active(): - with suppress(Exception): - estimated_bytes = self._estimate_query_bytes(sql) - t0 = perf_counter() - cursor = self.con.execute(sql, *args, **kwargs) - dt_ms = int((perf_counter() - t0) * 1000) - rows: int | None = None - try: - rc = getattr(cursor, "rowcount", None) + def _exec() -> duckdb.DuckDBPyConnection: + return self.con.execute(sql, *args, **kwargs) + + def _rows(result: Any) -> int | None: + rc = getattr(result, "rowcount", None) if isinstance(rc, int) and rc >= 0: - rows = rc - except Exception: - rows = None - - # DuckDB doesn't expose bytes-scanned in a simple way yet → rely on the - # estimate we already collected or the best-effort fallback. 
- self._record_query_stats( - QueryStats( - bytes_processed=estimated_bytes, - rows=rows, - duration_ms=dt_ms, - ) + return rc + return None + + return run_sql_with_budget( + self, + sql, + guard=self._BUDGET_GUARD, + exec_fn=_exec, + rowcount_extractor=_rows, + estimate_fn=self._estimate_query_bytes, ) - return cursor # --- Cost estimation for the shared BudgetGuard ----------------- diff --git a/src/fastflowtransform/executors/postgres.py b/src/fastflowtransform/executors/postgres.py index f46c9a3..81e5128 100644 --- a/src/fastflowtransform/executors/postgres.py +++ b/src/fastflowtransform/executors/postgres.py @@ -1,7 +1,6 @@ # fastflowtransform/executors/postgres.py import json from collections.abc import Iterable -from contextlib import suppress from time import perf_counter from typing import Any @@ -13,6 +12,7 @@ from fastflowtransform.core import Node, relation_for from fastflowtransform.errors import ModelExecutionError, ProfileConfigError +from fastflowtransform.executors._budget_runner import run_sql_with_budget from fastflowtransform.executors._shims import SAConnShim from fastflowtransform.executors.base import BaseExecutor from fastflowtransform.executors.budget import BudgetGuard @@ -120,39 +120,27 @@ def _execute_sql( Also records simple per-query stats for run_results.json. """ - estimated_bytes = self._apply_budget_guard(self._BUDGET_GUARD, sql) - if estimated_bytes is None and not self._is_budget_guard_active(): - with suppress(Exception): - estimated_bytes = self._estimate_query_bytes(sql) - t0 = perf_counter() - if conn is None: - # Standalone use: open our own transaction - with self.engine.begin() as local_conn: - result = self._execute_sql_core(sql, *args, conn=local_conn, **kwargs) - else: - # Reuse existing connection / transaction (e.g. 
in run_snapshot_sql) - result = self._execute_sql_core(sql, *args, conn=conn, **kwargs) - - dt_ms = int((perf_counter() - t0) * 1000) + def _exec() -> Any: + if conn is None: + with self.engine.begin() as local_conn: + return self._execute_sql_core(sql, *args, conn=local_conn, **kwargs) + return self._execute_sql_core(sql, *args, conn=conn, **kwargs) - # rows: best-effort from Result.rowcount (DML only; SELECT is often -1) - rows: int | None = None - try: + def _rows(result: Any) -> int | None: rc = getattr(result, "rowcount", None) if isinstance(rc, int) and rc >= 0: - rows = rc - except Exception: - rows = None + return rc + return None - self._record_query_stats( - QueryStats( - bytes_processed=estimated_bytes, - rows=rows, - duration_ms=dt_ms, - ) + return run_sql_with_budget( + self, + sql, + guard=self._BUDGET_GUARD, + exec_fn=_exec, + rowcount_extractor=_rows, + estimate_fn=self._estimate_query_bytes, ) - return result def _analyze_relations( self, diff --git a/src/fastflowtransform/executors/snowflake_snowpark.py b/src/fastflowtransform/executors/snowflake_snowpark.py index 46741d1..5241370 100644 --- a/src/fastflowtransform/executors/snowflake_snowpark.py +++ b/src/fastflowtransform/executors/snowflake_snowpark.py @@ -11,6 +11,7 @@ from jinja2 import Environment from fastflowtransform.core import Node, relation_for +from fastflowtransform.executors._budget_runner import run_sql_with_budget from fastflowtransform.executors.base import BaseExecutor from fastflowtransform.executors.budget import BudgetGuard from fastflowtransform.executors.query_stats import QueryStats @@ -123,25 +124,17 @@ def _execute_sql(self, sql: str) -> SNDF: - Returns a Snowpark DataFrame (same as session.sql). - Records best-effort query stats for run_results.json. 
""" - estimated_bytes = self._apply_budget_guard(self._BUDGET_GUARD, sql) - if estimated_bytes is None and not self._is_budget_guard_active(): - with suppress(Exception): - estimated_bytes = self._estimate_query_bytes(sql) - t0 = perf_counter() - df = self.session.sql(sql) - dt_ms = int((perf_counter() - t0) * 1000) - - # We *don't* call df.count() here - that would execute the query again. - # For Snowflake we also don't cheaply access bytes/rows here; the cost - # guard already did a dry-run EXPLAIN if FF_SF_MAX_BYTES is set. - self._record_query_stats( - QueryStats( - bytes_processed=estimated_bytes, - rows=None, - duration_ms=dt_ms, - ) + + def _exec() -> SNDF: + return self.session.sql(sql) + + return run_sql_with_budget( + self, + sql, + guard=self._BUDGET_GUARD, + exec_fn=_exec, + estimate_fn=self._estimate_query_bytes, ) - return df def _exec_many(self, sql: str) -> None: """ diff --git a/tests/unit/executors/test_bigquery_bf_exec_unit.py b/tests/unit/executors/test_bigquery_bf_exec_unit.py index cb0921f..38eda27 100644 --- a/tests/unit/executors/test_bigquery_bf_exec_unit.py +++ b/tests/unit/executors/test_bigquery_bf_exec_unit.py @@ -381,9 +381,9 @@ def test_incremental_insert_cleans_select(bq_exec): def test_incremental_merge_executes_two_statements(bq_exec): bq_exec.client.queries.clear() bq_exec.incremental_merge("dst_tbl", "SELECT 1 AS id", ["id"]) - assert len(bq_exec.client.queries) == 2 + assert len(bq_exec.client.queries) == 4 assert "DELETE FROM" in bq_exec.client.queries[0][0] - assert "INSERT INTO" in bq_exec.client.queries[1][0] + assert "INSERT INTO" in bq_exec.client.queries[2][0] @pytest.mark.unit diff --git a/tests/unit/executors/test_bigquery_exec_unit.py b/tests/unit/executors/test_bigquery_exec_unit.py index 4848311..21c4d10 100644 --- a/tests/unit/executors/test_bigquery_exec_unit.py +++ b/tests/unit/executors/test_bigquery_exec_unit.py @@ -373,9 +373,9 @@ def test_incremental_insert_cleans_select(bq_exec): def 
test_incremental_merge_executes_two_statements(bq_exec): bq_exec.client.queries.clear() bq_exec.incremental_merge("dst_tbl", "SELECT 1 AS id", ["id"]) - assert len(bq_exec.client.queries) == 2 + assert len(bq_exec.client.queries) == 4 assert "DELETE FROM" in bq_exec.client.queries[0][0] - assert "INSERT INTO" in bq_exec.client.queries[1][0] + assert "INSERT INTO" in bq_exec.client.queries[2][0] @pytest.mark.unit