From 6d365ad7807e64e6f95c00ce3daeaf5e323dff6b Mon Sep 17 00:00:00 2001 From: suryaiyer95 Date: Thu, 26 Mar 2026 17:39:50 -0700 Subject: [PATCH 01/20] feat: add data-parity cross-database table comparison - Add DataParity engine integration via native Rust bindings - Add data-diff tool for LLM agent (profile, joindiff, hashdiff, cascade, auto) - Add ClickHouse driver support - Add data-parity skill: profile-first workflow, algorithm selection guide, CRITICAL warning that joindiff cannot run cross-database (always returns 0 diffs), output style rules (facts only, no editorializing) - Gitignore .altimate-code/ (credentials) and *.node (platform binaries) --- .gitignore | 6 + .opencode/skills/data-parity/SKILL.md | 290 ++++++++++++++++++ packages/drivers/src/clickhouse.ts | 6 +- .../altimate/native/connections/data-diff.ts | 268 ++++++++++++++++ .../altimate/native/connections/register.ts | 8 + .../opencode/src/altimate/native/types.ts | 34 ++ .../opencode/src/altimate/tools/data-diff.ts | 174 +++++++++++ packages/opencode/src/tool/registry.ts | 2 + 8 files changed, 785 insertions(+), 3 deletions(-) create mode 100644 .opencode/skills/data-parity/SKILL.md create mode 100644 packages/opencode/src/altimate/native/connections/data-diff.ts create mode 100644 packages/opencode/src/altimate/tools/data-diff.ts diff --git a/.gitignore b/.gitignore index b10c1bb043..4dfe62f9ee 100644 --- a/.gitignore +++ b/.gitignore @@ -28,6 +28,12 @@ target # Commit message scratch files .github/meta/ +# Local connections config (may contain credentials) +.altimate-code/ + +# Pre-built native binaries (platform-specific, not for source control) +packages/opencode/*.node + # Local dev files opencode-dev logs/ diff --git a/.opencode/skills/data-parity/SKILL.md b/.opencode/skills/data-parity/SKILL.md new file mode 100644 index 0000000000..4d7b7460c9 --- /dev/null +++ b/.opencode/skills/data-parity/SKILL.md @@ -0,0 +1,290 @@ +--- +name: data-parity +description: Validate that two tables or query 
results are identical — or diagnose exactly how they differ. Discover schema, identify keys, profile cheaply, then diff. Use for migration validation, ETL regression, and query refactor verification. +--- + +# Data Parity (Table Diff) + +## Output Style + +**Report facts only. No editorializing.** +- Show counts, changed values, missing rows, new rows — that's it. +- Do NOT explain why row-level diffing is valuable, why COUNT(*) is insufficient, or pitch the tool. +- Do NOT add "the dangerous one", "this is exactly why", "this matters" style commentary. +- The user asked for a diff result, not a lecture. + +## Requirements +**Agent:** any +**Tools used:** `sql_query` (for schema discovery), `data_diff` + +## When to Use This Skill + +**Use when the user wants to:** +- Confirm two tables contain the same data after a migration +- Find rows added, deleted, or modified between source and target +- Validate that a dbt model produces the same output as the old query +- Run regression checks after a pipeline change + +**Do NOT use for:** +- Schema comparison (column names, types) — check DDL instead +- Performance benchmarking — this runs SELECT queries + +--- + +## The `data_diff` Tool + +`data_diff` takes table names and key columns. It generates SQL, routes it through the specified warehouse connections, and reports differences. It **does not discover schema** — you must provide key columns and relevant comparison columns. 
+ +**Key parameters:** +- `source` — table name (`orders`, `db.schema.orders`) or full SELECT/WITH query +- `target` — table name or SELECT query +- `key_columns` — primary key(s) uniquely identifying each row (required) +- `source_warehouse` — connection name for source +- `target_warehouse` — connection name for target (omit = same as source) +- `extra_columns` — columns to compare beyond keys (omit = compare all) +- `algorithm` — `auto`, `joindiff`, `hashdiff`, `profile`, `cascade` +- `where_clause` — filter applied to both tables + +> **CRITICAL — Algorithm choice:** +> - If `source_warehouse` ≠ `target_warehouse` → **always use `hashdiff`** (or `auto`). +> - `joindiff` runs a single SQL JOIN on ONE connection — it physically cannot see the other table. +> Using `joindiff` across different servers always reports 0 differences (both sides look identical). +> - When in doubt, use `algorithm="auto"` — it picks `joindiff` for same-warehouse and `hashdiff` for cross-warehouse automatically. 
+ +--- + +## Workflow + +The key principle: **the LLM does the identification work using SQL tools first, then calls data_diff with informed parameters.** + +### Step 1: Inspect the tables + +Before calling `data_diff`, use `sql_query` to understand what you're comparing: + +```sql +-- Get columns and types +SELECT column_name, data_type, is_nullable +FROM information_schema.columns +WHERE table_schema = 'public' AND table_name = 'orders' +ORDER BY ordinal_position +``` + +For ClickHouse: +```sql +DESCRIBE TABLE source_db.events +``` + +For Snowflake: +```sql +SHOW COLUMNS IN TABLE orders +``` + +**Look for:** +- Columns that look like primary keys (named `id`, `*_id`, `*_key`, `uuid`) +- Columns with `NOT NULL` constraints +- Whether there are composite keys + +### Step 2: Identify the key columns + +If the primary key isn't obvious from the schema, run a cardinality check: + +```sql +SELECT + COUNT(*) AS total_rows, + COUNT(DISTINCT order_id) AS distinct_order_id, + COUNT(DISTINCT customer_id) AS distinct_customer_id, + COUNT(DISTINCT created_at) AS distinct_created_at +FROM orders +``` + +**A good key column:** `distinct_count = total_rows` (fully unique) and `null_count = 0`. + +If no single column is unique, find a composite key: +```sql +SELECT order_id, line_item_id, COUNT(*) as cnt +FROM order_lines +GROUP BY order_id, line_item_id +HAVING COUNT(*) > 1 +LIMIT 5 +``` +If this returns 0 rows, `(order_id, line_item_id)` is a valid composite key. 
+ +### Step 3: Estimate table size + +```sql +SELECT COUNT(*) FROM orders +``` + +Use this to choose the algorithm: +- **< 1M rows**: `joindiff` (same DB) or `hashdiff` (cross-DB) — either is fine +- **1M–100M rows**: `hashdiff` or `cascade` +- **> 100M rows**: `hashdiff` with a `where_clause` date filter to validate a recent window first + +### Step 4: Profile first for unknown tables + +If you don't know what to expect (first-time validation, unfamiliar pipeline), start cheap: + +``` +data_diff( + source="orders", + target="orders_migrated", + key_columns=["order_id"], + source_warehouse="postgres_prod", + target_warehouse="snowflake_dw", + algorithm="profile" +) +``` + +Profile output tells you: +- Row count on each side (mismatch = load completeness problem) +- Which columns have null count differences (mismatch = NULL handling bug) +- Min/max divergence per column (mismatch = value transformation bug) +- Which columns match exactly (safe to skip in row-level diff) + +**Interpret profile to narrow the diff:** +``` +Column Profile Comparison + + ✓ order_id: match + ✓ customer_id: match + ✗ amount: DIFFER ← source min=10.00, target min=10.01 — rounding issue? + ✗ status: DIFFER ← source nulls=0, target nulls=47 — NULL mapping bug? + ✓ created_at: match +``` +→ Only diff `amount` and `status` in the next step. + +### Step 5: Run targeted row-level diff + +``` +data_diff( + source="orders", + target="orders_migrated", + key_columns=["order_id"], + extra_columns=["amount", "status"], // only the columns profile said differ + source_warehouse="postgres_prod", + target_warehouse="snowflake_dw", + algorithm="hashdiff" +) +``` + +--- + +## Algorithm Selection + +| Algorithm | When to use | +|-----------|-------------| +| `profile` | First pass — column stats (count, min, max, nulls). No row scan. | +| `joindiff` | Same database — single FULL OUTER JOIN query. Fast. | +| `hashdiff` | Cross-database, or large tables — bisection with checksums. Scales. 
| +| `cascade` | Auto-escalate: profile → hashdiff on diverging columns. | +| `auto` | JoinDiff if same warehouse, HashDiff if cross-database. | + +**JoinDiff constraint:** Both tables must be on the **same database connection**. If source and target are on different servers, JoinDiff will always report 0 diffs (it only sees one side). Use `hashdiff` or `auto` for cross-database. + +--- + +## Output Interpretation + +### IDENTICAL +``` +✓ Tables are IDENTICAL + Rows checked: 1,000,000 +``` +→ Migration validated. Data is identical. + +### DIFFER — Diagnose by pattern + +``` +✗ Tables DIFFER + + Only in source: 2 → rows deleted in target (ETL missed deletes) + Only in target: 2 → rows added to target (dedup issue or new data) + Updated rows: 3 → values changed (transform bug, type casting, rounding) + Identical rows: 15 +``` + +| Pattern | Root cause hypothesis | +|---------|----------------------| +| `only_in_source > 0`, `only_in_target = 0` | ETL dropped rows — check filters, incremental logic | +| `only_in_source = 0`, `only_in_target > 0` | Target has extra rows — check dedup or wrong join | +| `updated_rows > 0`, row counts match | Silent value corruption — check transforms, type casts | +| Row count differs | Load completeness issue — check ETL watermarks | + +Sample diffs point to the specific key + column + old→new value: +``` +key={"order_id":"4"} col=amount: 300.00 → 305.00 +``` +Use this to query the source systems directly and trace the discrepancy. + +--- + +## Usage Examples + +### Full workflow: unknown migration +``` +// 1. Discover schema +sql_query("SELECT column_name, data_type FROM information_schema.columns WHERE table_name='orders'", warehouse="postgres_prod") + +// 2. Check row count +sql_query("SELECT COUNT(*), COUNT(DISTINCT order_id) FROM orders", warehouse="postgres_prod") + +// 3. 
Profile to find which columns differ +data_diff(source="orders", target="orders", key_columns=["order_id"], + source_warehouse="postgres_prod", target_warehouse="snowflake_dw", algorithm="profile") + +// 4. Row-level diff on diverging columns only +data_diff(source="orders", target="orders", key_columns=["order_id"], + extra_columns=["amount", "status"], + source_warehouse="postgres_prod", target_warehouse="snowflake_dw", algorithm="hashdiff") +``` + +### Same-database query refactor +``` +data_diff( + source="SELECT id, amount, status FROM orders WHERE region = 'us-east'", + target="SELECT id, amount, status FROM orders_v2 WHERE region = 'us-east'", + key_columns=["id"] +) +``` + +### Large table — filter to recent window first +``` +data_diff( + source="fact_events", + target="fact_events_v2", + key_columns=["event_id"], + where_clause="event_date >= '2024-01-01'", + algorithm="hashdiff" +) +``` + +### ClickHouse — always qualify with database.table +``` +data_diff( + source="source_db.events", + target="target_db.events", + key_columns=["event_id"], + source_warehouse="clickhouse_source", + target_warehouse="clickhouse_target", + algorithm="hashdiff" +) +``` + +--- + +## Common Mistakes + +**Calling data_diff without knowing the key** +→ Run `sql_query` to check cardinality first. A bad key gives meaningless results. + +**Using joindiff for cross-database tables** +→ JoinDiff runs one SQL query on one connection. It can't see the other table. Use `hashdiff` or `auto`. + +**Diffing a 1B row table without a date filter** +→ Add `where_clause` to scope to recent data. Validate a window first, then expand. + +**Ignoring profile output and jumping to full diff** +→ Profile is free. It tells you which columns actually differ so you can avoid scanning all columns across all rows. + +**Forgetting to check row counts before diffing** +→ If source has 1M rows and target has 900K, row-level diff is misleading. Fix the load completeness issue first. 
diff --git a/packages/drivers/src/clickhouse.ts b/packages/drivers/src/clickhouse.ts index 256d060180..cfce48ed2f 100644 --- a/packages/drivers/src/clickhouse.ts +++ b/packages/drivers/src/clickhouse.ts @@ -5,7 +5,7 @@ * Uses the official ClickHouse JS client which communicates over HTTP(S). */ -import type { ConnectionConfig, Connector, ConnectorResult, SchemaColumn } from "./types" +import type { ConnectionConfig, Connector, ConnectorResult, ExecuteOptions, SchemaColumn } from "./types" export async function connect(config: ConnectionConfig): Promise { let createClient: any @@ -60,11 +60,11 @@ export async function connect(config: ConnectionConfig): Promise { client = createClient(clientConfig) }, - async execute(sql: string, limit?: number, _binds?: any[]): Promise { + async execute(sql: string, limit?: number, _binds?: any[], options?: ExecuteOptions): Promise { if (!client) { throw new Error("ClickHouse client not connected — call connect() first") } - const effectiveLimit = limit === undefined ? 1000 : limit + const effectiveLimit = options?.noLimit ? 0 : (limit ?? 1000) let query = sql // Strip string literals, then comments, for accurate SQL heuristic checks. diff --git a/packages/opencode/src/altimate/native/connections/data-diff.ts b/packages/opencode/src/altimate/native/connections/data-diff.ts new file mode 100644 index 0000000000..035df6b4ca --- /dev/null +++ b/packages/opencode/src/altimate/native/connections/data-diff.ts @@ -0,0 +1,268 @@ +/** + * DataParity orchestrator — runs the cooperative Rust state machine against + * live database connections. + * + * The Rust engine (DataParitySession) never touches databases — it emits SQL + * for us to execute, we feed results back, and it decides the next step. + * This file is the bridge between that engine and altimate-code's drivers. 
+ */ + +import type { DataDiffParams, DataDiffResult } from "../types" +import * as Registry from "./registry" + +// --------------------------------------------------------------------------- +// Query-source detection +// --------------------------------------------------------------------------- + +const SQL_KEYWORDS = /^\s*(SELECT|WITH|VALUES)\b/i + +/** + * Detect whether a string is an arbitrary SQL query (vs a plain table name). + * Plain table names may contain dots (schema.table, db.schema.table) but not spaces. + */ +function isQuery(input: string): boolean { + return SQL_KEYWORDS.test(input) +} + +/** + * If either source or target is an arbitrary query, wrap them in CTEs so the + * DataParity engine can treat them as tables named `__diff_source` / `__diff_target`. + * + * Returns `{ table1Name, table2Name, ctePrefix | null }`. + * + * When a CTE prefix is returned, it must be prepended to every SQL task emitted + * by the engine before execution. + */ +export function resolveTableSources( + source: string, + target: string, +): { table1Name: string; table2Name: string; ctePrefix: string | null } { + const source_is_query = isQuery(source) + const target_is_query = isQuery(target) + + if (!source_is_query && !target_is_query) { + // Both are plain table names — pass through unchanged + return { table1Name: source, table2Name: target, ctePrefix: null } + } + + // At least one is a query — wrap both in CTEs + const srcExpr = source_is_query ? source : `SELECT * FROM ${source}` + const tgtExpr = target_is_query ? target : `SELECT * FROM ${target}` + + const ctePrefix = `WITH __diff_source AS (\n${srcExpr}\n), __diff_target AS (\n${tgtExpr}\n)` + return { + table1Name: "__diff_source", + table2Name: "__diff_target", + ctePrefix, + } +} + +/** + * Inject a CTE prefix into a SQL statement from the engine. + * + * The engine emits standalone SELECT statements. We need to prepend our CTE + * definitions so `__diff_source`/`__diff_target` resolve correctly. 
+ * + * Handles the case where the engine itself emits CTEs (starts with WITH …): + * WITH engine_cte AS (…) SELECT … FROM __diff_source + * becomes: + * WITH __diff_source AS (…), __diff_target AS (…), engine_cte AS (…) SELECT … + */ +export function injectCte(sql: string, ctePrefix: string): string { + const trimmed = sql.trimStart() + const withMatch = trimmed.match(/^WITH\s+/i) + + if (withMatch) { + // Engine also has CTEs — merge them: our CTEs first, then engine CTEs + const afterWith = trimmed.slice(withMatch[0].length) + // ctePrefix already starts with "WITH …" — strip "WITH " and append ", " + const ourDefs = ctePrefix.replace(/^WITH\s+/i, "") + return `WITH ${ourDefs},\n${afterWith}` + } + + // Plain SELECT — just prepend our CTE block + return `${ctePrefix}\n${trimmed}` +} + +// --------------------------------------------------------------------------- +// Executor +// --------------------------------------------------------------------------- + +type Rows = (string | null)[][] + +/** + * Execute a SQL statement against a named warehouse and return rows as string[][]. + */ +async function executeQuery(sql: string, warehouseName: string | undefined): Promise { + let connector + if (warehouseName) { + connector = await Registry.get(warehouseName) + } else { + const warehouses = Registry.list().warehouses + if (warehouses.length === 0) { + throw new Error("No default warehouse configured.") + } + connector = await Registry.get(warehouses[0].name) + } + + const result = await connector.execute(sql) + + // Normalise to string[][] — drivers return mixed types + return result.rows.map((row: unknown[]) => + row.map((v) => (v === null || v === undefined ? 
null : String(v))), + ) +} + +// --------------------------------------------------------------------------- +// Main orchestrator +// --------------------------------------------------------------------------- + +const MAX_STEPS = 200 + +export async function runDataDiff(params: DataDiffParams): Promise { + // Dynamically import NAPI module (not available in test environments without the binary) + let DataParitySession: new (specJson: string) => { + start(): string + step(responsesJson: string): string + } + + try { + const core = await import("@altimateai/altimate-core") + DataParitySession = (core as any).DataParitySession + if (!DataParitySession) throw new Error("DataParitySession not exported from @altimateai/altimate-core") + } catch (e) { + return { + success: false, + error: `altimate-core NAPI module unavailable: ${e}`, + steps: 0, + } + } + + // Resolve sources (plain table names vs arbitrary queries) + const { table1Name, table2Name, ctePrefix } = resolveTableSources( + params.source, + params.target, + ) + + // Parse optional qualified names: "db.schema.table" → { database, schema, table } + const parseQualified = (name: string) => { + const parts = name.split(".") + if (parts.length === 3) return { database: parts[0], schema: parts[1], table: parts[2] } + if (parts.length === 2) return { schema: parts[0], table: parts[1] } + return { table: name } + } + + const table1Ref = parseQualified(table1Name) + const table2Ref = parseQualified(table2Name) + + // Resolve dialect from warehouse config + const resolveDialect = (warehouse: string | undefined): string => { + if (warehouse) { + const cfg = Registry.getConfig(warehouse) + return cfg?.type ?? "generic" + } + const warehouses = Registry.list().warehouses + return warehouses[0]?.type ?? "generic" + } + + const dialect1 = resolveDialect(params.source_warehouse) + const dialect2 = resolveDialect(params.target_warehouse ?? 
params.source_warehouse) + + // Build session spec + const spec = { + table1: table1Ref, + table2: table2Ref, + dialect1, + dialect2, + config: { + algorithm: params.algorithm ?? "auto", + key_columns: params.key_columns, + extra_columns: params.extra_columns ?? [], + ...(params.where_clause ? { where_clause: params.where_clause } : {}), + ...(params.numeric_tolerance != null ? { numeric_tolerance: params.numeric_tolerance } : {}), + ...(params.timestamp_tolerance_ms != null + ? { timestamp_tolerance_ms: params.timestamp_tolerance_ms } + : {}), + }, + } + + // Create session + let session: InstanceType + try { + session = new DataParitySession(JSON.stringify(spec)) + } catch (e) { + return { + success: false, + error: `Failed to create DataParitySession: ${e}`, + steps: 0, + } + } + + // Route SQL tasks to the correct warehouse + const warehouseFor = (tableSide: string): string | undefined => + tableSide === "Table2" ? (params.target_warehouse ?? params.source_warehouse) : params.source_warehouse + + // Cooperative loop + let actionJson = session.start() + let stepCount = 0 + + while (stepCount < MAX_STEPS) { + const action = JSON.parse(actionJson) as { + type: string + tasks?: Array<{ id: string; table_side: string; sql: string; expected_shape: string }> + outcome?: unknown + message?: string + } + + if (action.type === "Done") { + return { + success: true, + steps: stepCount, + outcome: action.outcome, + } + } + + if (action.type === "Error") { + return { + success: false, + error: action.message ?? "Unknown engine error", + steps: stepCount, + } + } + + if (action.type !== "ExecuteSql") { + return { + success: false, + error: `Unexpected action type: ${action.type}`, + steps: stepCount, + } + } + + stepCount++ + + // Execute all SQL tasks in parallel + const tasks = action.tasks ?? 
[] + const responses = await Promise.all( + tasks.map(async (task) => { + const warehouse = warehouseFor(task.table_side) + // Inject CTE definitions if we're in query-comparison mode + const sql = ctePrefix ? injectCte(task.sql, ctePrefix) : task.sql + try { + const rows = await executeQuery(sql, warehouse) + return { id: task.id, rows } + } catch (e) { + // Return error shape — engine will produce an Error action on next step + return { id: task.id, rows: [], error: String(e) } + } + }), + ) + + actionJson = session.step(JSON.stringify(responses)) + } + + return { + success: false, + error: `Exceeded maximum step limit (${MAX_STEPS}). The diff may require more iterations for this table size.`, + steps: stepCount, + } +} diff --git a/packages/opencode/src/altimate/native/connections/register.ts b/packages/opencode/src/altimate/native/connections/register.ts index ef8ac86861..4f2d83086c 100644 --- a/packages/opencode/src/altimate/native/connections/register.ts +++ b/packages/opencode/src/altimate/native/connections/register.ts @@ -10,6 +10,7 @@ import { register } from "../dispatcher" import * as Registry from "./registry" import { discoverContainers } from "./docker-discovery" import { parseDbtProfiles } from "./dbt-profiles" +import { runDataDiff } from "./data-diff" import type { SqlExecuteParams, SqlExecuteResult, @@ -29,6 +30,8 @@ import type { SchemaInspectResult, DbtProfilesParams, DbtProfilesResult, + DataDiffParams, + DataDiffResult, } from "../types" import type { ConnectionConfig } from "@altimateai/drivers" import { Telemetry } from "../../../telemetry" @@ -425,6 +428,11 @@ register("dbt.profiles", async (params: DbtProfilesParams): Promise => { + return runDataDiff(params) +}) + } // end registerAll // Auto-register on module load diff --git a/packages/opencode/src/altimate/native/types.ts b/packages/opencode/src/altimate/native/types.ts index 16a7f4e062..f88482beb3 100644 --- a/packages/opencode/src/altimate/native/types.ts +++ 
b/packages/opencode/src/altimate/native/types.ts @@ -964,6 +964,38 @@ export interface LocalTestResult { error?: string } +// --- Data Diff --- + +export interface DataDiffParams { + /** Source table name (e.g. "orders", "db.schema.orders") or full SQL query */ + source: string + /** Target table name or SQL query */ + target: string + /** Primary key columns that uniquely identify each row */ + key_columns: string[] + /** Source warehouse connection name */ + source_warehouse?: string + /** Target warehouse connection name (defaults to source_warehouse) */ + target_warehouse?: string + /** Extra columns to compare beyond the key */ + extra_columns?: string[] + /** Algorithm: "auto" | "joindiff" | "hashdiff" | "profile" | "cascade" */ + algorithm?: string + /** Optional WHERE filter applied to both tables */ + where_clause?: string + /** Absolute numeric tolerance */ + numeric_tolerance?: number + /** Timestamp tolerance in milliseconds */ + timestamp_tolerance_ms?: number +} + +export interface DataDiffResult { + success: boolean + steps: number + outcome?: unknown + error?: string +} + // --- Method registry --- export const BridgeMethods = { @@ -1007,6 +1039,8 @@ export const BridgeMethods = { // --- local testing --- "local.schema_sync": {} as { params: LocalSchemaSyncParams; result: LocalSchemaSyncResult }, "local.test": {} as { params: LocalTestParams; result: LocalTestResult }, + // --- data diff --- + "data.diff": {} as { params: DataDiffParams; result: DataDiffResult }, // --- altimate-core (existing) --- "altimate_core.validate": {} as { params: AltimateCoreValidateParams; result: AltimateCoreResult }, "altimate_core.lint": {} as { params: AltimateCoreLintParams; result: AltimateCoreResult }, diff --git a/packages/opencode/src/altimate/tools/data-diff.ts b/packages/opencode/src/altimate/tools/data-diff.ts new file mode 100644 index 0000000000..0719361dbe --- /dev/null +++ b/packages/opencode/src/altimate/tools/data-diff.ts @@ -0,0 +1,174 @@ +import z from 
"zod" +import { Tool } from "../../tool/tool" +import { Dispatcher } from "../native" + +export const DataDiffTool = Tool.define("data_diff", { + description: [ + "Compare two database tables or query results row-by-row to find differences.", + "", + "Two use cases:", + "1. Migration validation — compare the same table across two databases:", + ' source="orders" source_warehouse="postgres_prod" target_warehouse="snowflake_dw"', + "2. Query optimization — compare results of two SQL queries on the same database:", + ' source="SELECT id, amount FROM orders WHERE ..." target="SELECT id, amount FROM orders_v2 WHERE ..."', + "", + "Algorithms:", + "- auto: JoinDiff if same dialect, HashDiff if cross-database (default)", + "- joindiff: FULL OUTER JOIN (fast, same-database only)", + "- hashdiff: Bisection with checksums (cross-database, any scale)", + "- profile: Column-level statistics comparison", + ].join("\n"), + parameters: z.object({ + source: z.string().describe( + "Source table name (e.g. 'orders', 'db.schema.orders') or a full SQL query starting with SELECT/WITH", + ), + target: z.string().describe( + "Target table name or SQL query to compare against source", + ), + key_columns: z + .array(z.string()) + .describe("Primary key columns that uniquely identify each row (e.g. ['id'] or ['order_id', 'line_item'])"), + source_warehouse: z.string().optional().describe("Source warehouse connection name"), + target_warehouse: z.string().optional().describe( + "Target warehouse connection name. Omit to use the same warehouse as source (query comparison mode)", + ), + extra_columns: z + .array(z.string()) + .optional() + .describe("Additional columns to compare beyond the key columns. 
Omit to compare all columns"), + algorithm: z + .enum(["auto", "joindiff", "hashdiff", "profile", "cascade"]) + .optional() + .default("auto") + .describe("Comparison algorithm"), + where_clause: z.string().optional().describe("Optional WHERE filter applied to both tables"), + numeric_tolerance: z + .number() + .optional() + .describe("Absolute tolerance for numeric comparisons (e.g. 0.01 for cent-level tolerance)"), + timestamp_tolerance_ms: z + .number() + .optional() + .describe("Tolerance for timestamp comparisons in milliseconds"), + }), + async execute(args, ctx) { + // Require read permission — data diff executes SELECT queries + await ctx.ask({ + permission: "sql_execute_read", + patterns: [args.source.slice(0, 120), args.target.slice(0, 120)], + always: ["*"], + metadata: {}, + }) + + try { + const result = await Dispatcher.call("data.diff", { + source: args.source, + target: args.target, + key_columns: args.key_columns, + source_warehouse: args.source_warehouse, + target_warehouse: args.target_warehouse, + extra_columns: args.extra_columns, + algorithm: args.algorithm, + where_clause: args.where_clause, + numeric_tolerance: args.numeric_tolerance, + timestamp_tolerance_ms: args.timestamp_tolerance_ms, + }) + + if (!result.success) { + return { + title: "Data diff: ERROR", + metadata: { success: false, steps: result.steps }, + output: `Data diff failed: ${result.error}`, + } + } + + const outcome = result.outcome as any + const output = formatOutcome(outcome, args.source, args.target) + + return { + title: `Data diff: ${summarize(outcome)}`, + metadata: { success: true, steps: result.steps }, + output, + } + } catch (e) { + const msg = e instanceof Error ? 
e.message : String(e) + return { + title: "Data diff: ERROR", + metadata: { success: false, steps: 0, error: msg }, + output: `Data diff failed: ${msg}`, + } + } + }, +}) + +function summarize(outcome: any): string { + if (!outcome) return "complete" + if (outcome.Match) return "IDENTICAL ✓" + if (outcome.Diff) { + const r = outcome.Diff + const parts: string[] = [] + if (r.rows_only_in_source > 0) parts.push(`${r.rows_only_in_source} only in source`) + if (r.rows_only_in_target > 0) parts.push(`${r.rows_only_in_target} only in target`) + if (r.rows_updated > 0) parts.push(`${r.rows_updated} updated`) + return parts.length ? parts.join(", ") : "differences found" + } + if (outcome.Profile) return "profile complete" + return "complete" +} + +function formatOutcome(outcome: any, source: string, target: string): string { + if (!outcome) return "Comparison complete." + + const lines: string[] = [] + + if (outcome.Match) { + lines.push(`✓ Tables are IDENTICAL`) + const m = outcome.Match + if (m.row_count != null) lines.push(` Rows checked: ${m.row_count.toLocaleString()}`) + if (m.algorithm) lines.push(` Algorithm: ${m.algorithm}`) + return lines.join("\n") + } + + if (outcome.Diff) { + const r = outcome.Diff + lines.push(`✗ Tables DIFFER`) + lines.push(``) + lines.push(` Source: ${source}`) + lines.push(` Target: ${target}`) + lines.push(``) + + if (r.total_source_rows != null) lines.push(` Source rows: ${r.total_source_rows.toLocaleString()}`) + if (r.total_target_rows != null) lines.push(` Target rows: ${r.total_target_rows.toLocaleString()}`) + if (r.rows_only_in_source > 0) lines.push(` Only in source: ${r.rows_only_in_source.toLocaleString()}`) + if (r.rows_only_in_target > 0) lines.push(` Only in target: ${r.rows_only_in_target.toLocaleString()}`) + if (r.rows_updated > 0) lines.push(` Updated rows: ${r.rows_updated.toLocaleString()}`) + if (r.rows_identical > 0) lines.push(` Identical rows: ${r.rows_identical.toLocaleString()}`) + + if (r.sample_diffs?.length) { 
+ lines.push(``) + lines.push(` Sample differences (first ${r.sample_diffs.length}):`) + for (const d of r.sample_diffs.slice(0, 5)) { + lines.push(` key=${JSON.stringify(d.key)} col=${d.column}: ${d.source_value} → ${d.target_value}`) + } + } + + return lines.join("\n") + } + + if (outcome.Profile) { + const p = outcome.Profile + lines.push(`Column Profile Comparison`) + lines.push(``) + for (const col of p.columns ?? []) { + const verdict = col.verdict === "match" ? "✓" : col.verdict === "within_tolerance" ? "~" : "✗" + lines.push(` ${verdict} ${col.column}: ${col.verdict}`) + if (col.source_stats && col.target_stats) { + lines.push(` source: count=${col.source_stats.count} nulls=${col.source_stats.null_count} min=${col.source_stats.min} max=${col.source_stats.max}`) + lines.push(` target: count=${col.target_stats.count} nulls=${col.target_stats.null_count} min=${col.target_stats.min} max=${col.target_stats.max}`) + } + } + return lines.join("\n") + } + + return JSON.stringify(outcome, null, 2) +} diff --git a/packages/opencode/src/tool/registry.ts b/packages/opencode/src/tool/registry.ts index 075291248f..e5fc1bf9c9 100644 --- a/packages/opencode/src/tool/registry.ts +++ b/packages/opencode/src/tool/registry.ts @@ -57,6 +57,7 @@ import { SqlFormatTool } from "../altimate/tools/sql-format" import { SqlFixTool } from "../altimate/tools/sql-fix" import { SqlAutocompleteTool } from "../altimate/tools/sql-autocomplete" import { SqlDiffTool } from "../altimate/tools/sql-diff" +import { DataDiffTool } from "../altimate/tools/data-diff" import { FinopsQueryHistoryTool } from "../altimate/tools/finops-query-history" import { FinopsAnalyzeCreditsTool } from "../altimate/tools/finops-analyze-credits" import { FinopsExpensiveQueriesTool } from "../altimate/tools/finops-expensive-queries" @@ -233,6 +234,7 @@ export namespace ToolRegistry { SqlFixTool, SqlAutocompleteTool, SqlDiffTool, + DataDiffTool, FinopsQueryHistoryTool, FinopsAnalyzeCreditsTool, 
FinopsExpensiveQueriesTool, From 44d76689cef4342fdd846c1990628c8073ebe682 Mon Sep 17 00:00:00 2001 From: suryaiyer95 Date: Thu, 26 Mar 2026 18:21:06 -0700 Subject: [PATCH 02/20] feat: add partition support to data_diff Split large tables by a date or numeric column before diffing. Each partition is diffed independently then results are aggregated. New params: - partition_column: column to split on (date or numeric) - partition_granularity: day | week | month | year (for dates) - partition_bucket_size: bucket width for numeric columns New output field: - partition_results: per-partition breakdown (identical / differ / error) Dialect-aware SQL: Postgres, Snowflake, BigQuery, ClickHouse, MySQL. Skill updated with partition guidance and examples. --- .opencode/skills/data-parity/SKILL.md | 30 ++- .../altimate/native/connections/data-diff.ts | 233 +++++++++++++++++- .../opencode/src/altimate/native/types.ts | 35 +++ .../opencode/src/altimate/tools/data-diff.ts | 54 +++- 4 files changed, 348 insertions(+), 4 deletions(-) diff --git a/.opencode/skills/data-parity/SKILL.md b/.opencode/skills/data-parity/SKILL.md index 4d7b7460c9..3f739eda4b 100644 --- a/.opencode/skills/data-parity/SKILL.md +++ b/.opencode/skills/data-parity/SKILL.md @@ -44,6 +44,9 @@ description: Validate that two tables or query results are identical — or diag - `extra_columns` — columns to compare beyond keys (omit = compare all) - `algorithm` — `auto`, `joindiff`, `hashdiff`, `profile`, `cascade` - `where_clause` — filter applied to both tables +- `partition_column` — split the table by this column and diff each group independently (recommended for large tables) +- `partition_granularity` — `day` | `week` | `month` | `year` for date columns (default: `month`) +- `partition_bucket_size` — for numeric columns: bucket width (e.g. `100000` splits by ranges of 100K) > **CRITICAL — Algorithm choice:** > - If `source_warehouse` ≠ `target_warehouse` → **always use `hashdiff`** (or `auto`). 
@@ -117,8 +120,31 @@ SELECT COUNT(*) FROM orders Use this to choose the algorithm: - **< 1M rows**: `joindiff` (same DB) or `hashdiff` (cross-DB) — either is fine -- **1M–100M rows**: `hashdiff` or `cascade` -- **> 100M rows**: `hashdiff` with a `where_clause` date filter to validate a recent window first +- **1M–100M rows**: `hashdiff` with `partition_column` for faster, more precise results +- **> 100M rows**: `hashdiff` + `partition_column` — required; bisection alone may miss rows at this scale + +**When to use `partition_column`:** +- Table has a natural time or key column (e.g. `created_at`, `order_id`, `event_date`) +- Table has > 500K rows and bisection is slow or returning incomplete results +- You need per-partition visibility (which month/range has the problem) + +``` +// Date column — partition by month +data_diff(source="lineitem", target="lineitem", + key_columns=["l_orderkey", "l_linenumber"], + source_warehouse="pg_source", target_warehouse="pg_target", + partition_column="l_shipdate", partition_granularity="month", + algorithm="hashdiff") + +// Numeric column — partition by key ranges of 100K +data_diff(source="orders", target="orders", + key_columns=["o_orderkey"], + source_warehouse="pg_source", target_warehouse="pg_target", + partition_column="o_orderkey", partition_bucket_size=100000, + algorithm="hashdiff") +``` + +Output includes an aggregate diff plus a per-partition table showing exactly which ranges differ. ### Step 4: Profile first for unknown tables diff --git a/packages/opencode/src/altimate/native/connections/data-diff.ts b/packages/opencode/src/altimate/native/connections/data-diff.ts index 035df6b4ca..fe1c926f92 100644 --- a/packages/opencode/src/altimate/native/connections/data-diff.ts +++ b/packages/opencode/src/altimate/native/connections/data-diff.ts @@ -7,7 +7,7 @@ * This file is the bridge between that engine and altimate-code's drivers. 
*/ -import type { DataDiffParams, DataDiffResult } from "../types" +import type { DataDiffParams, DataDiffResult, PartitionDiffResult } from "../types" import * as Registry from "./registry" // --------------------------------------------------------------------------- @@ -119,7 +119,238 @@ async function executeQuery(sql: string, warehouseName: string | undefined): Pro const MAX_STEPS = 200 +// --------------------------------------------------------------------------- +// Partition support +// --------------------------------------------------------------------------- + +/** + * Build a DATE_TRUNC expression appropriate for the warehouse dialect. + */ +function dateTruncExpr(granularity: string, column: string, dialect: string): string { + const g = granularity.toLowerCase() + switch (dialect) { + case "bigquery": + return `DATE_TRUNC(${column}, ${g.toUpperCase()})` + case "clickhouse": + return `toStartOf${g.charAt(0).toUpperCase() + g.slice(1)}(${column})` + case "mysql": + case "mariadb": { + const fmt = { day: "%Y-%m-%d", week: "%Y-%u", month: "%Y-%m-01", year: "%Y-01-01" }[g] ?? "%Y-%m-01" + return `DATE_FORMAT(${column}, '${fmt}')` + } + default: + // Postgres, Snowflake, Redshift, DuckDB, etc. + return `DATE_TRUNC('${g}', ${column})` + } +} + +/** + * Build SQL to discover distinct partition values from the source table. + */ +function buildPartitionDiscoverySQL( + table: string, + partitionColumn: string, + granularity: string | undefined, + bucketSize: number | undefined, + dialect: string, + whereClause?: string, +): string { + const isNumeric = bucketSize != null + + let expr: string + if (isNumeric) { + expr = `FLOOR(${partitionColumn} / ${bucketSize}) * ${bucketSize}` + } else { + expr = dateTruncExpr(granularity ?? "month", partitionColumn, dialect) + } + + const where = whereClause ? 
`WHERE ${whereClause}` : "" + return `SELECT DISTINCT ${expr} AS _p FROM ${table} ${where} ORDER BY _p` +} + +/** + * Build a WHERE clause that scopes to a single partition. + */ +function buildPartitionWhereClause( + partitionColumn: string, + partitionValue: string, + granularity: string | undefined, + bucketSize: number | undefined, + dialect: string, +): string { + if (bucketSize != null) { + const lo = Number(partitionValue) + const hi = lo + bucketSize + return `${partitionColumn} >= ${lo} AND ${partitionColumn} < ${hi}` + } + + const expr = dateTruncExpr(granularity ?? "month", partitionColumn, dialect) + + // Cast the literal appropriately per dialect + switch (dialect) { + case "bigquery": + return `${expr} = '${partitionValue}'` + case "clickhouse": + return `${expr} = toDate('${partitionValue}')` + case "mysql": + case "mariadb": + return `${expr} = '${partitionValue}'` + default: + return `${expr} = '${partitionValue}'` + } +} + +/** + * Extract DiffStats from a successful outcome (if present). + */ +function extractStats(outcome: unknown): { + rows_source: number + rows_target: number + differences: number + status: "identical" | "differ" +} { + const o = outcome as any + if (!o) return { rows_source: 0, rows_target: 0, differences: 0, status: "identical" } + + if (o.Match) { + return { + rows_source: o.Match.row_count ?? 0, + rows_target: o.Match.row_count ?? 0, + differences: 0, + status: "identical", + } + } + + if (o.Diff) { + const d = o.Diff + return { + rows_source: d.total_source_rows ?? 0, + rows_target: d.total_target_rows ?? 0, + differences: (d.rows_only_in_source ?? 0) + (d.rows_only_in_target ?? 0) + (d.rows_updated ?? 0), + status: "differ", + } + } + + return { rows_source: 0, rows_target: 0, differences: 0, status: "identical" } +} + +/** + * Merge two Diff outcomes into one aggregated Diff outcome. 
+ */ +function mergeOutcomes(accumulated: unknown, next: unknown): unknown { + const a = accumulated as any + const n = next as any + + const aD = a?.Diff ?? (a?.Match ? { total_source_rows: a.Match.row_count, total_target_rows: a.Match.row_count, rows_only_in_source: 0, rows_only_in_target: 0, rows_updated: 0, rows_identical: a.Match.row_count, sample_diffs: [] } : null) + const nD = n?.Diff ?? (n?.Match ? { total_source_rows: n.Match.row_count, total_target_rows: n.Match.row_count, rows_only_in_source: 0, rows_only_in_target: 0, rows_updated: 0, rows_identical: n.Match.row_count, sample_diffs: [] } : null) + + if (!aD && !nD) return { Match: { row_count: 0 } } + if (!aD) return next + if (!nD) return accumulated + + const merged = { + total_source_rows: (aD.total_source_rows ?? 0) + (nD.total_source_rows ?? 0), + total_target_rows: (aD.total_target_rows ?? 0) + (nD.total_target_rows ?? 0), + rows_only_in_source: (aD.rows_only_in_source ?? 0) + (nD.rows_only_in_source ?? 0), + rows_only_in_target: (aD.rows_only_in_target ?? 0) + (nD.rows_only_in_target ?? 0), + rows_updated: (aD.rows_updated ?? 0) + (nD.rows_updated ?? 0), + rows_identical: (aD.rows_identical ?? 0) + (nD.rows_identical ?? 0), + sample_diffs: [...(aD.sample_diffs ?? []), ...(nD.sample_diffs ?? [])].slice(0, 20), + } + + const totalDiff = merged.rows_only_in_source + merged.rows_only_in_target + merged.rows_updated + if (totalDiff === 0) { + return { Match: { row_count: merged.total_source_rows, algorithm: "partitioned" } } + } + return { Diff: merged } +} + +/** + * Run a partitioned diff: discover partition values, diff each partition independently, + * then aggregate results. + */ +async function runPartitionedDiff(params: DataDiffParams): Promise { + const resolveDialect = (warehouse: string | undefined): string => { + if (warehouse) { + const cfg = Registry.getConfig(warehouse) + return cfg?.type ?? "generic" + } + const warehouses = Registry.list().warehouses + return warehouses[0]?.type ?? 
"generic" + } + + const sourceDialect = resolveDialect(params.source_warehouse) + const { table1Name } = resolveTableSources(params.source, params.target) + + // Discover partition values from source + const discoverySql = buildPartitionDiscoverySQL( + table1Name, + params.partition_column!, + params.partition_granularity, + params.partition_bucket_size, + sourceDialect, + params.where_clause, + ) + + let partitionValues: string[] + try { + const rows = await executeQuery(discoverySql, params.source_warehouse) + partitionValues = rows.map((r) => String(r[0] ?? "")).filter(Boolean) + } catch (e) { + return { success: false, error: `Partition discovery failed: ${e}`, steps: 0 } + } + + if (partitionValues.length === 0) { + return { success: true, steps: 1, outcome: { Match: { row_count: 0, algorithm: "partitioned" } }, partition_results: [] } + } + + // Diff each partition + const partitionResults: PartitionDiffResult[] = [] + let aggregatedOutcome: unknown = null + let totalSteps = 1 + + for (const pVal of partitionValues) { + const partWhere = buildPartitionWhereClause( + params.partition_column!, + pVal, + params.partition_granularity, + params.partition_bucket_size, + sourceDialect, + ) + const fullWhere = params.where_clause ? `(${params.where_clause}) AND (${partWhere})` : partWhere + + const result = await runDataDiff({ + ...params, + where_clause: fullWhere, + partition_column: undefined, // prevent recursion + }) + + totalSteps += result.steps + + if (!result.success) { + partitionResults.push({ partition: pVal, rows_source: 0, rows_target: 0, differences: 0, status: "error", error: result.error }) + continue + } + + const stats = extractStats(result.outcome) + partitionResults.push({ partition: pVal, ...stats }) + aggregatedOutcome = aggregatedOutcome == null ? result.outcome : mergeOutcomes(aggregatedOutcome, result.outcome) + } + + return { + success: true, + steps: totalSteps, + outcome: aggregatedOutcome ?? 
{ Match: { row_count: 0, algorithm: "partitioned" } }, + partition_results: partitionResults, + } +} + export async function runDataDiff(params: DataDiffParams): Promise { + // Dispatch to partitioned diff if partition_column is set + if (params.partition_column) { + return runPartitionedDiff(params) + } + // Dynamically import NAPI module (not available in test environments without the binary) let DataParitySession: new (specJson: string) => { start(): string diff --git a/packages/opencode/src/altimate/native/types.ts b/packages/opencode/src/altimate/native/types.ts index f88482beb3..c5074d7b98 100644 --- a/packages/opencode/src/altimate/native/types.ts +++ b/packages/opencode/src/altimate/native/types.ts @@ -987,6 +987,39 @@ export interface DataDiffParams { numeric_tolerance?: number /** Timestamp tolerance in milliseconds */ timestamp_tolerance_ms?: number + /** + * Column to partition on before diffing. The table is split into groups by + * this column and each group is diffed independently. Results are aggregated. + * Use for large tables where bisection alone is too slow or imprecise. + * + * Examples: "l_shipdate" (date column), "l_orderkey" (numeric column) + */ + partition_column?: string + /** + * Granularity for date partition columns: "day" | "week" | "month" | "year". + * For numeric columns, ignored — use partition_bucket_size instead. + * Defaults to "month". + */ + partition_granularity?: "day" | "week" | "month" | "year" + /** + * For numeric partition columns: size of each bucket. + * E.g. 
100000 splits l_orderkey into [0, 100000), [100000, 200000), … + */ + partition_bucket_size?: number +} + +export interface PartitionDiffResult { + /** The partition value (date string or numeric bucket start) */ + partition: string + /** Source row count in this partition */ + rows_source: number + /** Target row count in this partition */ + rows_target: number + /** Total differences found (exclusive + updated) */ + differences: number + /** "identical" | "differ" | "error" */ + status: "identical" | "differ" | "error" + error?: string } export interface DataDiffResult { @@ -994,6 +1027,8 @@ export interface DataDiffResult { steps: number outcome?: unknown error?: string + /** Per-partition breakdown when partition_column is used */ + partition_results?: PartitionDiffResult[] } // --- Method registry --- diff --git a/packages/opencode/src/altimate/tools/data-diff.ts b/packages/opencode/src/altimate/tools/data-diff.ts index 0719361dbe..767921e2e8 100644 --- a/packages/opencode/src/altimate/tools/data-diff.ts +++ b/packages/opencode/src/altimate/tools/data-diff.ts @@ -50,6 +50,23 @@ export const DataDiffTool = Tool.define("data_diff", { .number() .optional() .describe("Tolerance for timestamp comparisons in milliseconds"), + partition_column: z + .string() + .optional() + .describe( + "Column to partition on before diffing. Splits the table into groups and diffs each independently. " + + "Use for large tables to get faster, more precise results. " + + "Examples: 'l_shipdate' (date), 'l_orderkey' (numeric). " + + "Results are aggregated with a per-partition breakdown showing which groups have differences.", + ), + partition_granularity: z + .enum(["day", "week", "month", "year"]) + .optional() + .describe("Granularity for date partition columns. Defaults to 'month'."), + partition_bucket_size: z + .number() + .optional() + .describe("For numeric partition columns: size of each bucket. E.g. 
100000 splits orders into ranges of 100K keys."), }), async execute(args, ctx) { // Require read permission — data diff executes SELECT queries @@ -72,6 +89,9 @@ export const DataDiffTool = Tool.define("data_diff", { where_clause: args.where_clause, numeric_tolerance: args.numeric_tolerance, timestamp_tolerance_ms: args.timestamp_tolerance_ms, + partition_column: args.partition_column, + partition_granularity: args.partition_granularity, + partition_bucket_size: args.partition_bucket_size, }) if (!result.success) { @@ -83,7 +103,11 @@ export const DataDiffTool = Tool.define("data_diff", { } const outcome = result.outcome as any - const output = formatOutcome(outcome, args.source, args.target) + let output = formatOutcome(outcome, args.source, args.target) + + if (result.partition_results?.length) { + output += formatPartitionResults(result.partition_results, args.partition_column!) + } return { title: `Data diff: ${summarize(outcome)}`, @@ -172,3 +196,31 @@ function formatOutcome(outcome: any, source: string, target: string): string { return JSON.stringify(outcome, null, 2) } + +function formatPartitionResults( + partitions: Array<{ partition: string; rows_source: number; rows_target: number; differences: number; status: string; error?: string }>, + partitionColumn: string, +): string { + const lines: string[] = ["", `Partition breakdown (by ${partitionColumn}):`] + + const clean = partitions.filter((p) => p.status === "identical") + const dirty = partitions.filter((p) => p.status === "differ") + const errored = partitions.filter((p) => p.status === "error") + + if (dirty.length === 0 && errored.length === 0) { + lines.push(` ✓ All ${partitions.length} partitions identical`) + return lines.join("\n") + } + + for (const p of dirty) { + lines.push(` ✗ ${p.partition} source=${p.rows_source.toLocaleString()} target=${p.rows_target.toLocaleString()} diff=${p.differences.toLocaleString()}`) + } + for (const p of errored) { + lines.push(` ! 
${p.partition} ERROR: ${p.error}`) + } + if (clean.length > 0) { + lines.push(` ✓ ${clean.length} partition${clean.length === 1 ? "" : "s"} identical`) + } + + return lines.join("\n") +} From e177f2d01bfa4ba069f563c854c9dddb488a66e2 Mon Sep 17 00:00:00 2001 From: suryaiyer95 Date: Thu, 26 Mar 2026 18:23:05 -0700 Subject: [PATCH 03/20] feat: add categorical partition mode (string, enum, boolean) When partition_column is set without partition_granularity or partition_bucket_size, groups by raw DISTINCT values. Works for any non-date, non-numeric column: status, region, country, etc. WHERE clause uses equality: col = 'value' with proper escaping. --- .opencode/skills/data-parity/SKILL.md | 16 ++++++-- .../altimate/native/connections/data-diff.ts | 41 +++++++++++++++---- .../opencode/src/altimate/tools/data-diff.ts | 10 +++-- 3 files changed, 53 insertions(+), 14 deletions(-) diff --git a/.opencode/skills/data-parity/SKILL.md b/.opencode/skills/data-parity/SKILL.md index 3f739eda4b..4d47be8036 100644 --- a/.opencode/skills/data-parity/SKILL.md +++ b/.opencode/skills/data-parity/SKILL.md @@ -44,9 +44,12 @@ description: Validate that two tables or query results are identical — or diag - `extra_columns` — columns to compare beyond keys (omit = compare all) - `algorithm` — `auto`, `joindiff`, `hashdiff`, `profile`, `cascade` - `where_clause` — filter applied to both tables -- `partition_column` — split the table by this column and diff each group independently (recommended for large tables) -- `partition_granularity` — `day` | `week` | `month` | `year` for date columns (default: `month`) -- `partition_bucket_size` — for numeric columns: bucket width (e.g. 
`100000` splits by ranges of 100K) +- `partition_column` — split the table by this column and diff each group independently (recommended for large tables); three modes: + - **Date column**: set `partition_granularity` → groups by truncated date periods + - **Numeric column**: set `partition_bucket_size` → groups by equal-width key ranges + - **Categorical column**: set neither → groups by distinct values (strings, enums, booleans like `status`, `region`, `country`) +- `partition_granularity` — `day` | `week` | `month` | `year` — only for date columns +- `partition_bucket_size` — bucket width for numeric columns (e.g. `100000`) > **CRITICAL — Algorithm choice:** > - If `source_warehouse` ≠ `target_warehouse` → **always use `hashdiff`** (or `auto`). @@ -142,6 +145,13 @@ data_diff(source="orders", target="orders", source_warehouse="pg_source", target_warehouse="pg_target", partition_column="o_orderkey", partition_bucket_size=100000, algorithm="hashdiff") + +// Categorical column — partition by distinct status values ('O', 'F', 'P') +data_diff(source="orders", target="orders", + key_columns=["o_orderkey"], + source_warehouse="pg_source", target_warehouse="pg_target", + partition_column="o_orderstatus", // no granularity or bucket_size needed + algorithm="hashdiff") ``` Output includes an aggregate diff plus a per-partition table showing exactly which ranges differ. diff --git a/packages/opencode/src/altimate/native/connections/data-diff.ts b/packages/opencode/src/altimate/native/connections/data-diff.ts index fe1c926f92..98609b744a 100644 --- a/packages/opencode/src/altimate/native/connections/data-diff.ts +++ b/packages/opencode/src/altimate/native/connections/data-diff.ts @@ -144,6 +144,21 @@ function dateTruncExpr(granularity: string, column: string, dialect: string): st } } +/** + * Determine the partition mode based on which params are provided. 
+ * - "date" → partition_granularity is set
+ * - "numeric" → partition_bucket_size is set
+ * - "categorical" → neither — use DISTINCT values directly (string, enum, boolean)
+ */
+function partitionMode(
+  granularity: string | undefined,
+  bucketSize: number | undefined,
+): "date" | "numeric" | "categorical" {
+  if (bucketSize != null) return "numeric"
+  if (granularity != null) return "date"
+  return "categorical"
+}
+
 /**
  * Build SQL to discover distinct partition values from the source table.
  */
@@ -155,16 +170,19 @@ function buildPartitionDiscoverySQL(
   dialect: string,
   whereClause?: string,
 ): string {
-  const isNumeric = bucketSize != null
+  const where = whereClause ? `WHERE ${whereClause}` : ""
+  const mode = partitionMode(granularity, bucketSize)
 
   let expr: string
-  if (isNumeric) {
+  if (mode === "numeric") {
     expr = `FLOOR(${partitionColumn} / ${bucketSize}) * ${bucketSize}`
+  } else if (mode === "date") {
+    expr = dateTruncExpr(granularity!, partitionColumn, dialect)
   } else {
-    expr = dateTruncExpr(granularity ?? "month", partitionColumn, dialect)
+    // categorical — raw distinct values, no transformation
+    expr = partitionColumn
   }
 
-  const where = whereClause ? `WHERE ${whereClause}` : ""
   return `SELECT DISTINCT ${expr} AS _p FROM ${table} ${where} ORDER BY _p`
 }
 
@@ -178,13 +196,22 @@ function buildPartitionWhereClause(
   bucketSize: number | undefined,
   dialect: string,
 ): string {
-  if (bucketSize != null) {
+  const mode = partitionMode(granularity, bucketSize)
+
+  if (mode === "numeric") {
     const lo = Number(partitionValue)
-    const hi = lo + bucketSize
+    const hi = lo + bucketSize!
     return `${partitionColumn} >= ${lo} AND ${partitionColumn} < ${hi}`
   }
 
-  const expr = dateTruncExpr(granularity ?? 
"month", partitionColumn, dialect) + if (mode === "categorical") { + // Quote the value — works for strings, enums, booleans + const escaped = partitionValue.replace(/'/g, "''") + return `${partitionColumn} = '${escaped}'` + } + + // date mode + const expr = dateTruncExpr(granularity!, partitionColumn, dialect) // Cast the literal appropriately per dialect switch (dialect) { diff --git a/packages/opencode/src/altimate/tools/data-diff.ts b/packages/opencode/src/altimate/tools/data-diff.ts index 767921e2e8..fc56e0da6d 100644 --- a/packages/opencode/src/altimate/tools/data-diff.ts +++ b/packages/opencode/src/altimate/tools/data-diff.ts @@ -55,18 +55,20 @@ export const DataDiffTool = Tool.define("data_diff", { .optional() .describe( "Column to partition on before diffing. Splits the table into groups and diffs each independently. " + - "Use for large tables to get faster, more precise results. " + - "Examples: 'l_shipdate' (date), 'l_orderkey' (numeric). " + + "Three modes depending on which other params you set:\n" + + " • Date column → set partition_granularity (day/week/month/year). E.g. partition_column='l_shipdate', partition_granularity='month'\n" + + " • Numeric column → set partition_bucket_size. E.g. partition_column='l_orderkey', partition_bucket_size=100000\n" + + " • Categorical → set neither. Works for string/enum/boolean columns like 'status', 'region', 'country'. Groups by distinct values.\n" + "Results are aggregated with a per-partition breakdown showing which groups have differences.", ), partition_granularity: z .enum(["day", "week", "month", "year"]) .optional() - .describe("Granularity for date partition columns. Defaults to 'month'."), + .describe("For date partition columns: truncation granularity. Omit for numeric or categorical columns."), partition_bucket_size: z .number() .optional() - .describe("For numeric partition columns: size of each bucket. E.g. 
100000 splits orders into ranges of 100K keys."), + .describe("For numeric partition columns: size of each bucket. E.g. 100000 splits l_orderkey into ranges of 100K. Omit for date or categorical columns."), }), async execute(args, ctx) { // Require read permission — data diff executes SELECT queries From d1cc9325d24153821a92e315f40ec9c8446c92d1 Mon Sep 17 00:00:00 2001 From: suryaiyer95 Date: Thu, 26 Mar 2026 18:41:15 -0700 Subject: [PATCH 04/20] fix: correct outcome shape handling in extractStats and formatOutcome Rust serializes ReladiffOutcome with serde tag 'mode', producing: {mode: 'diff', diff_rows: [...], stats: {rows_table1, rows_table2, exclusive_table1, exclusive_table2, updated, unchanged}} Previous code checked for {Match: {...}} / {Diff: {...}} shapes that never matched, causing partitioned diff to report all partitions as 'identical' with 0 rows. - extractStats(): check outcome.mode === 'diff', read from stats fields - mergeOutcomes(): aggregate mode-based outcomes correctly - summarize()/formatOutcome(): display mode-based shape with correct labels --- .../altimate/native/connections/data-diff.ts | 72 +++++++++--------- .../opencode/src/altimate/tools/data-diff.ts | 76 +++++++++++-------- 2 files changed, 82 insertions(+), 66 deletions(-) diff --git a/packages/opencode/src/altimate/native/connections/data-diff.ts b/packages/opencode/src/altimate/native/connections/data-diff.ts index 98609b744a..6c4f2e7a61 100644 --- a/packages/opencode/src/altimate/native/connections/data-diff.ts +++ b/packages/opencode/src/altimate/native/connections/data-diff.ts @@ -229,6 +229,9 @@ function buildPartitionWhereClause( /** * Extract DiffStats from a successful outcome (if present). 
+ * + * Rust serializes ReladiffOutcome as: {mode: "diff", diff_rows: [...], stats: {...}} + * stats fields: rows_table1, rows_table2, exclusive_table1, exclusive_table2, updated, unchanged */ function extractStats(outcome: unknown): { rows_source: number @@ -239,22 +242,17 @@ function extractStats(outcome: unknown): { const o = outcome as any if (!o) return { rows_source: 0, rows_target: 0, differences: 0, status: "identical" } - if (o.Match) { - return { - rows_source: o.Match.row_count ?? 0, - rows_target: o.Match.row_count ?? 0, - differences: 0, - status: "identical", - } - } - - if (o.Diff) { - const d = o.Diff + if (o.mode === "diff") { + const s = o.stats ?? {} + const exclusive1 = Number(s.exclusive_table1 ?? 0) + const exclusive2 = Number(s.exclusive_table2 ?? 0) + const updated = Number(s.updated ?? 0) + const differences = exclusive1 + exclusive2 + updated return { - rows_source: d.total_source_rows ?? 0, - rows_target: d.total_target_rows ?? 0, - differences: (d.rows_only_in_source ?? 0) + (d.rows_only_in_target ?? 0) + (d.rows_updated ?? 0), - status: "differ", + rows_source: Number(s.rows_table1 ?? 0), + rows_target: Number(s.rows_table2 ?? 0), + differences, + status: differences > 0 ? "differ" : "identical", } } @@ -262,34 +260,36 @@ function extractStats(outcome: unknown): { } /** - * Merge two Diff outcomes into one aggregated Diff outcome. + * Merge two diff outcomes into one aggregated outcome. + * + * Both outcomes use the Rust shape: {mode: "diff", diff_rows: [...], stats: {...}} */ function mergeOutcomes(accumulated: unknown, next: unknown): unknown { + if (!accumulated) return next + if (!next) return accumulated + const a = accumulated as any const n = next as any - const aD = a?.Diff ?? (a?.Match ? { total_source_rows: a.Match.row_count, total_target_rows: a.Match.row_count, rows_only_in_source: 0, rows_only_in_target: 0, rows_updated: 0, rows_identical: a.Match.row_count, sample_diffs: [] } : null) - const nD = n?.Diff ?? (n?.Match ? 
{ total_source_rows: n.Match.row_count, total_target_rows: n.Match.row_count, rows_only_in_source: 0, rows_only_in_target: 0, rows_updated: 0, rows_identical: n.Match.row_count, sample_diffs: [] } : null) - - if (!aD && !nD) return { Match: { row_count: 0 } } - if (!aD) return next - if (!nD) return accumulated - - const merged = { - total_source_rows: (aD.total_source_rows ?? 0) + (nD.total_source_rows ?? 0), - total_target_rows: (aD.total_target_rows ?? 0) + (nD.total_target_rows ?? 0), - rows_only_in_source: (aD.rows_only_in_source ?? 0) + (nD.rows_only_in_source ?? 0), - rows_only_in_target: (aD.rows_only_in_target ?? 0) + (nD.rows_only_in_target ?? 0), - rows_updated: (aD.rows_updated ?? 0) + (nD.rows_updated ?? 0), - rows_identical: (aD.rows_identical ?? 0) + (nD.rows_identical ?? 0), - sample_diffs: [...(aD.sample_diffs ?? []), ...(nD.sample_diffs ?? [])].slice(0, 20), - } + const aS = a.stats ?? {} + const nS = n.stats ?? {} + + const rows_table1 = (Number(aS.rows_table1) || 0) + (Number(nS.rows_table1) || 0) + const rows_table2 = (Number(aS.rows_table2) || 0) + (Number(nS.rows_table2) || 0) + const exclusive_table1 = (Number(aS.exclusive_table1) || 0) + (Number(nS.exclusive_table1) || 0) + const exclusive_table2 = (Number(aS.exclusive_table2) || 0) + (Number(nS.exclusive_table2) || 0) + const updated = (Number(aS.updated) || 0) + (Number(nS.updated) || 0) + const unchanged = (Number(aS.unchanged) || 0) + (Number(nS.unchanged) || 0) - const totalDiff = merged.rows_only_in_source + merged.rows_only_in_target + merged.rows_updated - if (totalDiff === 0) { - return { Match: { row_count: merged.total_source_rows, algorithm: "partitioned" } } + const totalRows = rows_table1 + rows_table2 + const totalDiff = exclusive_table1 + exclusive_table2 + updated + const diff_percent = totalRows > 0 ? (totalDiff / totalRows) * 100 : 0 + + return { + mode: "diff", + diff_rows: [...(a.diff_rows ?? []), ...(n.diff_rows ?? 
[])].slice(0, 100), + stats: { rows_table1, rows_table2, exclusive_table1, exclusive_table2, updated, unchanged, diff_percent }, } - return { Diff: merged } } /** diff --git a/packages/opencode/src/altimate/tools/data-diff.ts b/packages/opencode/src/altimate/tools/data-diff.ts index fc56e0da6d..d498eefe7e 100644 --- a/packages/opencode/src/altimate/tools/data-diff.ts +++ b/packages/opencode/src/altimate/tools/data-diff.ts @@ -129,16 +129,23 @@ export const DataDiffTool = Tool.define("data_diff", { function summarize(outcome: any): string { if (!outcome) return "complete" - if (outcome.Match) return "IDENTICAL ✓" - if (outcome.Diff) { - const r = outcome.Diff + + // Rust serializes ReladiffOutcome as {mode: "diff"|"profile"|..., stats: {...}, diff_rows: [...]} + if (outcome.mode === "diff") { + const s = outcome.stats ?? {} + const e1 = Number(s.exclusive_table1 ?? 0) + const e2 = Number(s.exclusive_table2 ?? 0) + const upd = Number(s.updated ?? 0) + if (e1 === 0 && e2 === 0 && upd === 0) return "IDENTICAL ✓" const parts: string[] = [] - if (r.rows_only_in_source > 0) parts.push(`${r.rows_only_in_source} only in source`) - if (r.rows_only_in_target > 0) parts.push(`${r.rows_only_in_target} only in target`) - if (r.rows_updated > 0) parts.push(`${r.rows_updated} updated`) - return parts.length ? 
parts.join(", ") : "differences found" + if (e1 > 0) parts.push(`${e1} only in source`) + if (e2 > 0) parts.push(`${e2} only in target`) + if (upd > 0) parts.push(`${upd} updated`) + return parts.join(", ") } - if (outcome.Profile) return "profile complete" + if (outcome.mode === "profile") return "profile complete" + if (outcome.mode === "cascade") return "cascade complete" + return "complete" } @@ -147,45 +154,54 @@ function formatOutcome(outcome: any, source: string, target: string): string { const lines: string[] = [] - if (outcome.Match) { - lines.push(`✓ Tables are IDENTICAL`) - const m = outcome.Match - if (m.row_count != null) lines.push(` Rows checked: ${m.row_count.toLocaleString()}`) - if (m.algorithm) lines.push(` Algorithm: ${m.algorithm}`) - return lines.join("\n") - } + // Rust serializes ReladiffOutcome as {mode: "diff", diff_rows: [...], stats: {...}} + // stats: rows_table1, rows_table2, exclusive_table1, exclusive_table2, updated, unchanged + if (outcome.mode === "diff") { + const s = outcome.stats ?? {} + const rows1 = Number(s.rows_table1 ?? 0) + const rows2 = Number(s.rows_table2 ?? 0) + const e1 = Number(s.exclusive_table1 ?? 0) + const e2 = Number(s.exclusive_table2 ?? 0) + const updated = Number(s.updated ?? 0) + const unchanged = Number(s.unchanged ?? 
0) + + if (e1 === 0 && e2 === 0 && updated === 0) { + lines.push(`✓ Tables are IDENTICAL`) + if (rows1 > 0) lines.push(` Rows checked: ${rows1.toLocaleString()}`) + return lines.join("\n") + } - if (outcome.Diff) { - const r = outcome.Diff lines.push(`✗ Tables DIFFER`) lines.push(``) lines.push(` Source: ${source}`) lines.push(` Target: ${target}`) lines.push(``) - if (r.total_source_rows != null) lines.push(` Source rows: ${r.total_source_rows.toLocaleString()}`) - if (r.total_target_rows != null) lines.push(` Target rows: ${r.total_target_rows.toLocaleString()}`) - if (r.rows_only_in_source > 0) lines.push(` Only in source: ${r.rows_only_in_source.toLocaleString()}`) - if (r.rows_only_in_target > 0) lines.push(` Only in target: ${r.rows_only_in_target.toLocaleString()}`) - if (r.rows_updated > 0) lines.push(` Updated rows: ${r.rows_updated.toLocaleString()}`) - if (r.rows_identical > 0) lines.push(` Identical rows: ${r.rows_identical.toLocaleString()}`) + if (rows1 > 0) lines.push(` Source rows: ${rows1.toLocaleString()}`) + if (rows2 > 0) lines.push(` Target rows: ${rows2.toLocaleString()}`) + if (e1 > 0) lines.push(` Only in source: ${e1.toLocaleString()}`) + if (e2 > 0) lines.push(` Only in target: ${e2.toLocaleString()}`) + if (updated > 0) lines.push(` Updated rows: ${updated.toLocaleString()}`) + if (unchanged > 0) lines.push(` Identical rows: ${unchanged.toLocaleString()}`) - if (r.sample_diffs?.length) { + const diffRows = outcome.diff_rows ?? [] + if (diffRows.length > 0) { lines.push(``) - lines.push(` Sample differences (first ${r.sample_diffs.length}):`) - for (const d of r.sample_diffs.slice(0, 5)) { - lines.push(` key=${JSON.stringify(d.key)} col=${d.column}: ${d.source_value} → ${d.target_value}`) + lines.push(` Sample differences (first ${Math.min(diffRows.length, 5)}):`) + for (const d of diffRows.slice(0, 5)) { + const label = d.sign === "-" ? 
"source only" : "target only" + lines.push(` [${label}] ${d.values?.join(" | ")}`) } } return lines.join("\n") } - if (outcome.Profile) { - const p = outcome.Profile + if (outcome.mode === "profile") { + const cols = outcome.column_stats ?? outcome.columns ?? [] lines.push(`Column Profile Comparison`) lines.push(``) - for (const col of p.columns ?? []) { + for (const col of cols) { const verdict = col.verdict === "match" ? "✓" : col.verdict === "within_tolerance" ? "~" : "✗" lines.push(` ${verdict} ${col.column}: ${col.verdict}`) if (col.source_stats && col.target_stats) { From 149066b4ed90fc05efa4eb6edb7f721025f7139a Mon Sep 17 00:00:00 2001 From: suryaiyer95 Date: Fri, 27 Mar 2026 12:28:29 -0700 Subject: [PATCH 05/20] feat: rewrite data-parity skill with interactive, plan-first workflow Key changes based on feedback: - Always generate TODO plan before any tool is called - Enforce data_diff tool usage (never manual EXCEPT/JOIN SQL) - Add PK discovery + explicit user confirmation step - Profile pass is now mandatory before row-level diff - Ask user before expensive row-level diff on large tables: - <100K rows: proceed automatically - 100K-10M rows: ask with where_clause option - >10M rows: offer window/partition/full choices - Document partition modes (date/numeric/categorical) with examples - Add warehouse_list as first step to confirm connections --- .opencode/skills/data-parity/SKILL.md | 344 +++++++++++--------------- 1 file changed, 146 insertions(+), 198 deletions(-) diff --git a/.opencode/skills/data-parity/SKILL.md b/.opencode/skills/data-parity/SKILL.md index 4d47be8036..1ecc37f399 100644 --- a/.opencode/skills/data-parity/SKILL.md +++ b/.opencode/skills/data-parity/SKILL.md @@ -5,109 +5,85 @@ description: Validate that two tables or query results are identical — or diag # Data Parity (Table Diff) -## Output Style +## CRITICAL: Always Start With a Plan -**Report facts only. 
No editorializing.** -- Show counts, changed values, missing rows, new rows — that's it. -- Do NOT explain why row-level diffing is valuable, why COUNT(*) is insufficient, or pitch the tool. -- Do NOT add "the dangerous one", "this is exactly why", "this matters" style commentary. -- The user asked for a diff result, not a lecture. +**Before doing anything else**, generate a numbered TODO list for the user: -## Requirements -**Agent:** any -**Tools used:** `sql_query` (for schema discovery), `data_diff` +``` +Here's my plan: +1. [ ] List available warehouse connections +2. [ ] Inspect schema and discover primary key candidates +3. [ ] Confirm primary keys with you +4. [ ] Check row counts on both sides +5. [ ] Run column-level profile (cheap — no row scan) +6. [ ] Ask whether to proceed with row-level diff (may be expensive for large tables) +7. [ ] Run targeted row-level diff on diverging columns only +8. [ ] Report findings +``` -## When to Use This Skill +Update each item to `[x]` as you complete it. This plan should be visible before any tool is called. -**Use when the user wants to:** -- Confirm two tables contain the same data after a migration -- Find rows added, deleted, or modified between source and target -- Validate that a dbt model produces the same output as the old query -- Run regression checks after a pipeline change +--- -**Do NOT use for:** -- Schema comparison (column names, types) — check DDL instead -- Performance benchmarking — this runs SELECT queries +## CRITICAL: Use `data_diff` Tool — Never Write Manual Diff SQL ---- +**NEVER** write SQL to diff tables manually (e.g., `EXCEPT`, `FULL OUTER JOIN`, `MINUS`). +**ALWAYS** use the `data_diff` tool for any comparison operation. -## The `data_diff` Tool - -`data_diff` takes table names and key columns. It generates SQL, routes it through the specified warehouse connections, and reports differences. It **does not discover schema** — you must provide key columns and relevant comparison columns. 
- -**Key parameters:** -- `source` — table name (`orders`, `db.schema.orders`) or full SELECT/WITH query -- `target` — table name or SELECT query -- `key_columns` — primary key(s) uniquely identifying each row (required) -- `source_warehouse` — connection name for source -- `target_warehouse` — connection name for target (omit = same as source) -- `extra_columns` — columns to compare beyond keys (omit = compare all) -- `algorithm` — `auto`, `joindiff`, `hashdiff`, `profile`, `cascade` -- `where_clause` — filter applied to both tables -- `partition_column` — split the table by this column and diff each group independently (recommended for large tables); three modes: - - **Date column**: set `partition_granularity` → groups by truncated date periods - - **Numeric column**: set `partition_bucket_size` → groups by equal-width key ranges - - **Categorical column**: set neither → groups by distinct values (strings, enums, booleans like `status`, `region`, `country`) -- `partition_granularity` — `day` | `week` | `month` | `year` — only for date columns -- `partition_bucket_size` — bucket width for numeric columns (e.g. `100000`) - -> **CRITICAL — Algorithm choice:** -> - If `source_warehouse` ≠ `target_warehouse` → **always use `hashdiff`** (or `auto`). -> - `joindiff` runs a single SQL JOIN on ONE connection — it physically cannot see the other table. -> Using `joindiff` across different servers always reports 0 differences (both sides look identical). -> - When in doubt, use `algorithm="auto"` — it picks `joindiff` for same-warehouse and `hashdiff` for cross-warehouse automatically. +`sql_query` is only for: +- Schema inspection (`information_schema`, `SHOW COLUMNS`, `DESCRIBE`) +- Cardinality checks to identify keys +- Row count estimates + +Everything else — profile, row diff, value comparison — goes through `data_diff`. 
--- -## Workflow +## Step 1: List Connections + +Use `warehouse_list` to show the user what connections are available and which warehouses map to source and target. -The key principle: **the LLM does the identification work using SQL tools first, then calls data_diff with informed parameters.** +--- -### Step 1: Inspect the tables +## Step 2: Inspect Schema and Discover Primary Keys -Before calling `data_diff`, use `sql_query` to understand what you're comparing: +Use `sql_query` to get columns and identify key candidates: ```sql --- Get columns and types +-- Postgres / Redshift / DuckDB SELECT column_name, data_type, is_nullable FROM information_schema.columns WHERE table_schema = 'public' AND table_name = 'orders' ORDER BY ordinal_position ``` -For ClickHouse: ```sql -DESCRIBE TABLE source_db.events +-- Snowflake +SHOW COLUMNS IN TABLE orders ``` -For Snowflake: ```sql -SHOW COLUMNS IN TABLE orders +-- ClickHouse +DESCRIBE TABLE source_db.events ``` -**Look for:** -- Columns that look like primary keys (named `id`, `*_id`, `*_key`, `uuid`) -- Columns with `NOT NULL` constraints -- Whether there are composite keys - -### Step 2: Identify the key columns +**Look for:** columns named `id`, `*_id`, `*_key`, `uuid`, or with `NOT NULL` + unique index. -If the primary key isn't obvious from the schema, run a cardinality check: +If no obvious PK, run a cardinality check: ```sql SELECT COUNT(*) AS total_rows, COUNT(DISTINCT order_id) AS distinct_order_id, - COUNT(DISTINCT customer_id) AS distinct_customer_id, - COUNT(DISTINCT created_at) AS distinct_created_at + COUNT(DISTINCT customer_id) AS distinct_customer_id FROM orders ``` -**A good key column:** `distinct_count = total_rows` (fully unique) and `null_count = 0`. +A valid key column: `distinct_count = total_rows`. 
-If no single column is unique, find a composite key: +For composite keys: ```sql -SELECT order_id, line_item_id, COUNT(*) as cnt +SELECT order_id, line_item_id, COUNT(*) AS cnt FROM order_lines GROUP BY order_id, line_item_id HAVING COUNT(*) > 1 @@ -115,55 +91,37 @@ LIMIT 5 ``` If this returns 0 rows, `(order_id, line_item_id)` is a valid composite key. -### Step 3: Estimate table size +## Step 3: Confirm Keys With the User -```sql -SELECT COUNT(*) FROM orders -``` +**Always confirm** the identified key columns before proceeding: -Use this to choose the algorithm: -- **< 1M rows**: `joindiff` (same DB) or `hashdiff` (cross-DB) — either is fine -- **1M–100M rows**: `hashdiff` with `partition_column` for faster, more precise results -- **> 100M rows**: `hashdiff` + `partition_column` — required; bisection alone may miss rows at this scale +> "I identified `order_id` as the primary key (150,000 distinct values = 150,000 rows, no NULLs). Does that look right, or should I use a different column?" -**When to use `partition_column`:** -- Table has a natural time or key column (e.g. `created_at`, `order_id`, `event_date`) -- Table has > 500K rows and bisection is slow or returning incomplete results -- You need per-partition visibility (which month/range has the problem) +Do not proceed to diff until the user confirms or corrects. 
-``` -// Date column — partition by month -data_diff(source="lineitem", target="lineitem", - key_columns=["l_orderkey", "l_linenumber"], - source_warehouse="pg_source", target_warehouse="pg_target", - partition_column="l_shipdate", partition_granularity="month", - algorithm="hashdiff") +--- -// Numeric column — partition by key ranges of 100K -data_diff(source="orders", target="orders", - key_columns=["o_orderkey"], - source_warehouse="pg_source", target_warehouse="pg_target", - partition_column="o_orderkey", partition_bucket_size=100000, - algorithm="hashdiff") +## Step 4: Check Row Counts -// Categorical column — partition by distinct status values ('O', 'F', 'P') -data_diff(source="orders", target="orders", - key_columns=["o_orderkey"], - source_warehouse="pg_source", target_warehouse="pg_target", - partition_column="o_orderstatus", // no granularity or bucket_size needed - algorithm="hashdiff") +```sql +SELECT COUNT(*) FROM orders -- run on both source and target ``` -Output includes an aggregate diff plus a per-partition table showing exactly which ranges differ. +Use counts to: +- Detect load completeness issues before row-level diff +- Choose the algorithm and decide whether to ask about cost +- If counts differ significantly (>5%), flag it immediately -### Step 4: Profile first for unknown tables +--- + +## Step 5: Column-Level Profile (Always Run This First) -If you don't know what to expect (first-time validation, unfamiliar pipeline), start cheap: +Profile is cheap — it runs aggregates, not row scans. 
**Always run profile before row-level diff.** ``` data_diff( source="orders", - target="orders_migrated", + target="orders", key_columns=["order_id"], source_warehouse="postgres_prod", target_warehouse="snowflake_dw", @@ -171,51 +129,101 @@ data_diff( ) ``` -Profile output tells you: -- Row count on each side (mismatch = load completeness problem) -- Which columns have null count differences (mismatch = NULL handling bug) -- Min/max divergence per column (mismatch = value transformation bug) -- Which columns match exactly (safe to skip in row-level diff) +Profile tells you: +- Row count on each side +- Which columns have null count differences → NULL handling bug +- Min/max divergence per column → value transformation bug +- Which columns match exactly → safe to skip in row-level diff -**Interpret profile to narrow the diff:** +**Example output:** ``` Column Profile Comparison ✓ order_id: match ✓ customer_id: match - ✗ amount: DIFFER ← source min=10.00, target min=10.01 — rounding issue? + ✗ amount: DIFFER ← source min=10.00, target min=10.01 — rounding? ✗ status: DIFFER ← source nulls=0, target nulls=47 — NULL mapping bug? ✓ created_at: match ``` -→ Only diff `amount` and `status` in the next step. -### Step 5: Run targeted row-level diff +--- + +## Step 6: Ask Before Running Row-Level Diff on Large Tables + +After profiling, check row count and **ask the user** before proceeding: + +**If table has < 100K rows:** proceed automatically. + +**If table has 100K–10M rows:** +> "The table has 1.2M rows. Row-level diff will scan all rows on both sides — this may take 30–60 seconds and consume warehouse compute. Do you want to proceed? You can also provide a `where_clause` to limit the scope (e.g., `created_at >= '2024-01-01'`)." + +**If table has > 10M rows:** +> "The table has 50M rows. Full row-level diff could be expensive. Options: +> 1. Diff a recent window only (e.g., last 30 days) +> 2. 
Partition by a date/key column — shows which partition has problems without scanning everything +> 3. Proceed with full diff (may take several minutes) +> Which would you prefer?" + +--- + +## Step 7: Run Targeted Row-Level Diff + +Use only the columns that the profile said differ. This is faster and produces cleaner output. ``` data_diff( source="orders", - target="orders_migrated", + target="orders", key_columns=["order_id"], - extra_columns=["amount", "status"], // only the columns profile said differ + extra_columns=["amount", "status"], // only diverging columns from profile source_warehouse="postgres_prod", target_warehouse="snowflake_dw", algorithm="hashdiff" ) ``` +### For large tables — use partition_column + +Split the table into groups and diff each independently. Three modes: + +``` +// Date column — partition by month +data_diff(source="lineitem", target="lineitem", + key_columns=["l_orderkey", "l_linenumber"], + source_warehouse="pg_source", target_warehouse="pg_target", + partition_column="l_shipdate", partition_granularity="month", + algorithm="hashdiff") + +// Numeric column — partition by key ranges of 100K +data_diff(source="orders", target="orders", + key_columns=["o_orderkey"], + source_warehouse="pg_source", target_warehouse="pg_target", + partition_column="o_orderkey", partition_bucket_size=100000, + algorithm="hashdiff") + +// Categorical column — partition by distinct values (string, enum, boolean) +data_diff(source="orders", target="orders", + key_columns=["o_orderkey"], + source_warehouse="pg_source", target_warehouse="pg_target", + partition_column="o_orderstatus", + algorithm="hashdiff") +``` + +Output includes aggregate diff + per-partition breakdown showing which group has problems. + --- ## Algorithm Selection | Algorithm | When to use | |-----------|-------------| -| `profile` | First pass — column stats (count, min, max, nulls). No row scan. | -| `joindiff` | Same database — single FULL OUTER JOIN query. Fast. 
| -| `hashdiff` | Cross-database, or large tables — bisection with checksums. Scales. | +| `profile` | **Always run first** — column stats (count, min, max, nulls). No row scan. | +| `joindiff` | Same database — single FULL OUTER JOIN. Fast, exact. | +| `hashdiff` | Cross-database or large tables — bisection with checksums. Scales to billions. | | `cascade` | Auto-escalate: profile → hashdiff on diverging columns. | | `auto` | JoinDiff if same warehouse, HashDiff if cross-database. | -**JoinDiff constraint:** Both tables must be on the **same database connection**. If source and target are on different servers, JoinDiff will always report 0 diffs (it only sees one side). Use `hashdiff` or `auto` for cross-database. +> **CRITICAL:** If `source_warehouse` ≠ `target_warehouse`, **never use `joindiff`** — it only sees one connection and always reports 0 differences. Use `hashdiff` or `auto`. --- @@ -226,101 +234,41 @@ data_diff( ✓ Tables are IDENTICAL Rows checked: 1,000,000 ``` -→ Migration validated. Data is identical. - -### DIFFER — Diagnose by pattern +### DIFFER ``` ✗ Tables DIFFER - Only in source: 2 → rows deleted in target (ETL missed deletes) - Only in target: 2 → rows added to target (dedup issue or new data) - Updated rows: 3 → values changed (transform bug, type casting, rounding) - Identical rows: 15 -``` - -| Pattern | Root cause hypothesis | -|---------|----------------------| -| `only_in_source > 0`, `only_in_target = 0` | ETL dropped rows — check filters, incremental logic | -| `only_in_source = 0`, `only_in_target > 0` | Target has extra rows — check dedup or wrong join | -| `updated_rows > 0`, row counts match | Silent value corruption — check transforms, type casts | -| Row count differs | Load completeness issue — check ETL watermarks | - -Sample diffs point to the specific key + column + old→new value: -``` -key={"order_id":"4"} col=amount: 300.00 → 305.00 -``` -Use this to query the source systems directly and trace the discrepancy. 
- ---- - -## Usage Examples - -### Full workflow: unknown migration + Source rows: 150,000 + Target rows: 149,950 + Only in source: 50 → rows deleted in target (ETL missed deletes) + Only in target: 0 + Updated rows: 0 + Identical rows: 149,950 ``` -// 1. Discover schema -sql_query("SELECT column_name, data_type FROM information_schema.columns WHERE table_name='orders'", warehouse="postgres_prod") - -// 2. Check row count -sql_query("SELECT COUNT(*), COUNT(DISTINCT order_id) FROM orders", warehouse="postgres_prod") -// 3. Profile to find which columns differ -data_diff(source="orders", target="orders", key_columns=["order_id"], - source_warehouse="postgres_prod", target_warehouse="snowflake_dw", algorithm="profile") - -// 4. Row-level diff on diverging columns only -data_diff(source="orders", target="orders", key_columns=["order_id"], - extra_columns=["amount", "status"], - source_warehouse="postgres_prod", target_warehouse="snowflake_dw", algorithm="hashdiff") -``` - -### Same-database query refactor -``` -data_diff( - source="SELECT id, amount, status FROM orders WHERE region = 'us-east'", - target="SELECT id, amount, status FROM orders_v2 WHERE region = 'us-east'", - key_columns=["id"] -) -``` - -### Large table — filter to recent window first -``` -data_diff( - source="fact_events", - target="fact_events_v2", - key_columns=["event_id"], - where_clause="event_date >= '2024-01-01'", - algorithm="hashdiff" -) -``` - -### ClickHouse — always qualify with database.table -``` -data_diff( - source="source_db.events", - target="target_db.events", - key_columns=["event_id"], - source_warehouse="clickhouse_source", - target_warehouse="clickhouse_target", - algorithm="hashdiff" -) -``` +| Pattern | Root cause | +|---------|-----------| +| `only_in_source > 0`, target = 0 | ETL dropped rows — check filters, incremental logic | +| `only_in_target > 0`, source = 0 | Target has extra rows — dedup issue or wrong join | +| `updated_rows > 0`, counts match | Silent value 
corruption — check type casts, rounding | +| Row counts differ significantly | Load completeness — check ETL watermarks | --- ## Common Mistakes -**Calling data_diff without knowing the key** -→ Run `sql_query` to check cardinality first. A bad key gives meaningless results. +**Writing manual diff SQL instead of calling data_diff** +→ Never use EXCEPT, MINUS, or FULL OUTER JOIN to diff tables. Use `data_diff`. -**Using joindiff for cross-database tables** -→ JoinDiff runs one SQL query on one connection. It can't see the other table. Use `hashdiff` or `auto`. +**Calling data_diff without confirming the key** +→ Confirm cardinality with the user first. A bad key gives meaningless results. -**Diffing a 1B row table without a date filter** -→ Add `where_clause` to scope to recent data. Validate a window first, then expand. +**Using joindiff for cross-database tables** +→ JoinDiff can't see the remote table. Always returns 0 diffs. Use `hashdiff` or `auto`. -**Ignoring profile output and jumping to full diff** -→ Profile is free. It tells you which columns actually differ so you can avoid scanning all columns across all rows. +**Skipping the profile step and jumping to full row diff** +→ Profile is free. It tells you which columns actually differ so you avoid scanning everything. -**Forgetting to check row counts before diffing** -→ If source has 1M rows and target has 900K, row-level diff is misleading. Fix the load completeness issue first. +**Running full diff on a billion-row table without asking** +→ Always ask the user before expensive operations. Offer filtering and partition options. 
From 3caab3039490e70a6531a587d46a7ad81d2e8c68 Mon Sep 17 00:00:00 2001 From: Aditya Pandey Date: Fri, 27 Mar 2026 21:43:47 -0700 Subject: [PATCH 06/20] fix: auto-discover extra_columns and exclude audit/timestamp columns from data diff MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Rust engine only compares columns explicitly listed in extra_columns. When omitted, it was silently reporting all key-matched rows as 'identical' even when non-key values differed — a false positive bug. Changes: - Auto-discover columns from information_schema when extra_columns is omitted and source is a plain table name (not a SQL query) - Exclude audit/timestamp columns (updated_at, created_at, inserted_at, modified_at, _fivetran_*, _airbyte_*, publisher_last_updated_*, etc.) from comparison by default since they typically differ due to ETL timing - Report excluded columns in tool output so users know what was skipped - Fix misleading tool description that said 'Omit to compare all columns' - Update SKILL.md with critical guidance on extra_columns behavior --- .opencode/skills/data-parity/SKILL.md | 15 ++ .../altimate/native/connections/data-diff.ts | 155 +++++++++++++++++- .../opencode/src/altimate/native/types.ts | 2 + .../opencode/src/altimate/tools/data-diff.ts | 14 +- 4 files changed, 184 insertions(+), 2 deletions(-) diff --git a/.opencode/skills/data-parity/SKILL.md b/.opencode/skills/data-parity/SKILL.md index 1ecc37f399..9302e50b66 100644 --- a/.opencode/skills/data-parity/SKILL.md +++ b/.opencode/skills/data-parity/SKILL.md @@ -256,6 +256,18 @@ Output includes aggregate diff + per-partition breakdown showing which group has --- +## CRITICAL: `extra_columns` Behavior + +The Rust engine **only compares columns listed in `extra_columns`**. If the list is empty, it compares key existence only — rows that match on key but differ in values will be silently reported as "identical". This is the most common source of false positives. 
+ +**Auto-discovery (default for table names):** When `extra_columns` is omitted and the source is a plain table name, `data_diff` auto-discovers all non-key columns from `information_schema` and excludes audit/timestamp columns (like `updated_at`, `created_at`, `inserted_at`, `modified_at`, `publisher_last_updated_epoch_ms`, ETL metadata columns like `_fivetran_synced`, etc.). The output will list which columns were auto-excluded. + +**SQL queries:** When source is a SQL query (not a table name), auto-discovery cannot work. You **must** provide `extra_columns` explicitly. If you don't, only key-level matching occurs. + +**When to override auto-exclusion:** If the user specifically wants to compare audit columns (e.g., verifying that `created_at` was preserved during migration), pass those columns explicitly in `extra_columns`. + +--- + ## Common Mistakes **Writing manual diff SQL instead of calling data_diff** @@ -272,3 +284,6 @@ Output includes aggregate diff + per-partition breakdown showing which group has **Running full diff on a billion-row table without asking** → Always ask the user before expensive operations. Offer filtering and partition options. + +**Omitting extra_columns when source is a SQL query** +→ Auto-discovery only works for table names. For SQL queries, always list the columns to compare explicitly. 
diff --git a/packages/opencode/src/altimate/native/connections/data-diff.ts b/packages/opencode/src/altimate/native/connections/data-diff.ts index 6c4f2e7a61..a5c009aff9 100644 --- a/packages/opencode/src/altimate/native/connections/data-diff.ts +++ b/packages/opencode/src/altimate/native/connections/data-diff.ts @@ -113,6 +113,138 @@ async function executeQuery(sql: string, warehouseName: string | undefined): Pro ) } +// --------------------------------------------------------------------------- +// Column auto-discovery and audit column exclusion +// --------------------------------------------------------------------------- + +/** + * Patterns that match audit/timestamp columns which should be excluded from + * value comparison by default. These columns typically differ between source + * and target due to ETL timing, sync metadata, or pipeline bookkeeping — + * not because of actual data discrepancies. + */ +const AUDIT_COLUMN_PATTERNS = [ + // Exact common names + /^(created|updated|modified|inserted|deleted|synced|published|ingested|loaded|extracted|refreshed)_(at|on|date|time|timestamp|ts|dt|epoch)$/i, + // Suffix patterns: *_at, *_on with temporal prefix + /_(created|updated|modified|inserted|deleted|synced|published|ingested|loaded|extracted|refreshed)$/i, + // ETL metadata columns + /^(etl|elt|dbt|pipeline|batch|sync|publish|ingest)_(created|updated|modified|loaded|run|timestamp|ts|time|at|epoch)/i, + /^(_sdc_|_airbyte_|_fivetran_|_stitch_|__hevo_)/i, + // Generic timestamp metadata + /^(last_updated|last_modified|date_updated|date_modified|date_created|row_updated|row_created)$/i, + /^(publisher_last_updated|publisher_updated)/i, + // Epoch variants + /(updated|modified|created|inserted|published|loaded|synced)_epoch/i, + /epoch_ms$/i, +] + +/** + * Check whether a column name matches known audit/timestamp patterns. 
+ */ +function isAuditColumn(columnName: string): boolean { + return AUDIT_COLUMN_PATTERNS.some((pattern) => pattern.test(columnName)) +} + +/** + * Build a query to discover column names for a table, appropriate for the dialect. + */ +function buildColumnDiscoverySQL(tableName: string, dialect: string): string { + // Parse schema.table or db.schema.table + const parts = tableName.split(".") + let schemaFilter = "" + let tableFilter = "" + + if (parts.length === 3) { + schemaFilter = `table_schema = '${parts[1]}'` + tableFilter = `table_name = '${parts[2]}'` + } else if (parts.length === 2) { + schemaFilter = `table_schema = '${parts[0]}'` + tableFilter = `table_name = '${parts[1]}'` + } else { + tableFilter = `table_name = '${parts[0]}'` + } + + switch (dialect) { + case "clickhouse": + return `DESCRIBE TABLE ${tableName}` + case "snowflake": + return `SHOW COLUMNS IN TABLE ${tableName}` + default: { + // Postgres, MySQL, Redshift, DuckDB, etc. — use information_schema + const conditions = [tableFilter] + if (schemaFilter) conditions.push(schemaFilter) + return `SELECT column_name FROM information_schema.columns WHERE ${conditions.join(" AND ")} ORDER BY ordinal_position` + } + } +} + +/** + * Parse column names from the discovery query result, handling dialect differences. + */ +function parseColumnNames(rows: (string | null)[][], dialect: string): string[] { + switch (dialect) { + case "clickhouse": + // DESCRIBE returns: name, type, default_type, default_expression, ... + return rows.map((r) => r[0] ?? "").filter(Boolean) + case "snowflake": + // SHOW COLUMNS returns: table_name, schema_name, column_name, data_type, ... + // column_name is at index 2 + return rows.map((r) => r[2] ?? "").filter(Boolean) + default: + // information_schema returns: column_name + return rows.map((r) => r[0] ?? "").filter(Boolean) + } +} + +/** + * Auto-discover non-key, non-audit columns for a table. 
+ * + * When the caller omits `extra_columns`, we query the source table's schema to + * find all columns, then exclude: + * 1. Key columns (already used for matching) + * 2. Audit/timestamp columns (updated_at, created_at, etc.) that typically + * differ between source and target due to ETL timing + * + * Returns the list of columns to compare, or undefined if discovery fails + * (in which case the engine falls back to key-only comparison). + */ +async function discoverExtraColumns( + tableName: string, + keyColumns: string[], + dialect: string, + warehouseName: string | undefined, +): Promise<{ columns: string[]; excludedAudit: string[] } | undefined> { + // Only works for plain table names, not SQL queries + if (SQL_KEYWORDS.test(tableName)) return undefined + + try { + const sql = buildColumnDiscoverySQL(tableName, dialect) + const rows = await executeQuery(sql, warehouseName) + const allColumns = parseColumnNames(rows, dialect) + + if (allColumns.length === 0) return undefined + + const keySet = new Set(keyColumns.map((k) => k.toLowerCase())) + const extraColumns: string[] = [] + const excludedAudit: string[] = [] + + for (const col of allColumns) { + if (keySet.has(col.toLowerCase())) continue + if (isAuditColumn(col)) { + excludedAudit.push(col) + } else { + extraColumns.push(col) + } + } + + return { columns: extraColumns, excludedAudit } + } catch { + // Schema discovery failed — fall back to engine default (key-only) + return undefined + } +} + // --------------------------------------------------------------------------- // Main orchestrator // --------------------------------------------------------------------------- @@ -426,6 +558,26 @@ export async function runDataDiff(params: DataDiffParams): Promise 0 ? 
{ excluded_audit_columns: excludedAuditColumns } : {}), } } diff --git a/packages/opencode/src/altimate/native/types.ts b/packages/opencode/src/altimate/native/types.ts index c5074d7b98..6f7307e242 100644 --- a/packages/opencode/src/altimate/native/types.ts +++ b/packages/opencode/src/altimate/native/types.ts @@ -1029,6 +1029,8 @@ export interface DataDiffResult { error?: string /** Per-partition breakdown when partition_column is used */ partition_results?: PartitionDiffResult[] + /** Columns auto-excluded from comparison (audit/timestamp columns like updated_at, created_at) */ + excluded_audit_columns?: string[] } // --- Method registry --- diff --git a/packages/opencode/src/altimate/tools/data-diff.ts b/packages/opencode/src/altimate/tools/data-diff.ts index d498eefe7e..97a4085169 100644 --- a/packages/opencode/src/altimate/tools/data-diff.ts +++ b/packages/opencode/src/altimate/tools/data-diff.ts @@ -35,7 +35,13 @@ export const DataDiffTool = Tool.define("data_diff", { extra_columns: z .array(z.string()) .optional() - .describe("Additional columns to compare beyond the key columns. Omit to compare all columns"), + .describe( + "Columns to compare beyond the key columns. " + + "IMPORTANT: If omitted AND source is a plain table name, columns are auto-discovered from the schema " + + "(excluding key columns and audit/timestamp columns like updated_at, created_at, inserted_at, modified_at). " + + "If omitted AND source is a SQL query, ONLY key columns are compared — value changes in non-key columns will NOT be detected. " + + "Always provide explicit extra_columns when comparing SQL queries to ensure value-level comparison." + ), algorithm: z .enum(["auto", "joindiff", "hashdiff", "profile", "cascade"]) .optional() @@ -111,6 +117,12 @@ export const DataDiffTool = Tool.define("data_diff", { output += formatPartitionResults(result.partition_results, args.partition_column!) 
} + // Report auto-excluded audit columns so the LLM and user know what was skipped + const excluded = (result as any).excluded_audit_columns as string[] | undefined + if (excluded && excluded.length > 0) { + output += `\n\n Note: ${excluded.length} audit/timestamp column${excluded.length === 1 ? "" : "s"} auto-excluded from comparison: ${excluded.join(", ")}` + } + return { title: `Data diff: ${summarize(outcome)}`, metadata: { success: true, steps: result.steps }, From 550d431b7b57bab27437e50fd34f5d9686aa1632 Mon Sep 17 00:00:00 2001 From: Aditya Pandey Date: Sat, 28 Mar 2026 01:05:19 -0700 Subject: [PATCH 07/20] fix: add `noLimit` option to driver `execute()` to prevent silent result truncation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All drivers default to `LIMIT 1001` on SELECT queries and post-truncate to 1000 rows. This silently drops rows when the data-diff engine needs complete result sets — a FULL OUTER JOIN returning >1000 diff rows would be truncated, causing the engine to undercount differences. - Add `ExecuteOptions { noLimit?: boolean }` to the `Connector` interface - When `noLimit: true`, set `effectiveLimit = 0` (falsy) so the existing LIMIT injection guard is skipped, and add `effectiveLimit > 0` to the truncation check so rows aren't sliced to zero - Update all 12 drivers: postgres, clickhouse, snowflake, bigquery, mysql, redshift, databricks, duckdb, oracle, sqlserver, sqlite, mongodb - Pass `{ noLimit: true }` from `data-diff.ts` `executeQuery()` Interactive SQL callers are unaffected — they continue to get the default 1000-row limit. Only the data-diff pipeline opts out. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- packages/drivers/src/bigquery.ts | 8 ++++---- packages/drivers/src/databricks.ts | 8 ++++---- packages/drivers/src/duckdb.ts | 8 ++++---- packages/drivers/src/mysql.ts | 8 ++++---- packages/drivers/src/oracle.ts | 8 ++++---- packages/drivers/src/postgres.ts | 8 ++++---- packages/drivers/src/redshift.ts | 8 ++++---- packages/drivers/src/snowflake.ts | 8 ++++---- packages/drivers/src/sqlite.ts | 8 ++++---- packages/drivers/src/sqlserver.ts | 8 ++++---- packages/drivers/src/types.ts | 8 +++++++- .../opencode/src/altimate/native/connections/data-diff.ts | 3 ++- 12 files changed, 49 insertions(+), 42 deletions(-) diff --git a/packages/drivers/src/bigquery.ts b/packages/drivers/src/bigquery.ts index f14e3b681d..abc7a8f05f 100644 --- a/packages/drivers/src/bigquery.ts +++ b/packages/drivers/src/bigquery.ts @@ -2,7 +2,7 @@ * BigQuery driver using the `@google-cloud/bigquery` package. */ -import type { ConnectionConfig, Connector, ConnectorResult, SchemaColumn } from "./types" +import type { ConnectionConfig, Connector, ConnectorResult, ExecuteOptions, SchemaColumn } from "./types" export async function connect(config: ConnectionConfig): Promise { let BigQueryModule: any @@ -37,8 +37,8 @@ export async function connect(config: ConnectionConfig): Promise { client = new BigQuery(options) }, - async execute(sql: string, limit?: number, binds?: any[]): Promise { - const effectiveLimit = limit ?? 1000 + async execute(sql: string, limit?: number, binds?: any[], execOptions?: ExecuteOptions): Promise { + const effectiveLimit = execOptions?.noLimit ? 0 : (limit ?? 1000) const query = sql.replace(/;\s*$/, "") const isSelectLike = /^\s*(SELECT|WITH|VALUES)\b/i.test(sql) @@ -58,7 +58,7 @@ export async function connect(config: ConnectionConfig): Promise { const [rows] = await client.query(options) const columns = rows.length > 0 ? 
Object.keys(rows[0]) : [] - const truncated = rows.length > effectiveLimit + const truncated = effectiveLimit > 0 && rows.length > effectiveLimit const limitedRows = truncated ? rows.slice(0, effectiveLimit) : rows return { diff --git a/packages/drivers/src/databricks.ts b/packages/drivers/src/databricks.ts index ccb3d5f8f7..83e75dcd7c 100644 --- a/packages/drivers/src/databricks.ts +++ b/packages/drivers/src/databricks.ts @@ -2,7 +2,7 @@ * Databricks driver using the `@databricks/sql` package. */ -import type { ConnectionConfig, Connector, ConnectorResult, SchemaColumn } from "./types" +import type { ConnectionConfig, Connector, ConnectorResult, ExecuteOptions, SchemaColumn } from "./types" export async function connect(config: ConnectionConfig): Promise { let databricksModule: any @@ -44,8 +44,8 @@ export async function connect(config: ConnectionConfig): Promise { }) }, - async execute(sql: string, limit?: number, binds?: any[]): Promise { - const effectiveLimit = limit ?? 1000 + async execute(sql: string, limit?: number, binds?: any[], options?: ExecuteOptions): Promise { + const effectiveLimit = options?.noLimit ? 0 : (limit ?? 1000) let query = sql const isSelectLike = /^\s*(SELECT|WITH|VALUES)\b/i.test(sql) if ( @@ -65,7 +65,7 @@ export async function connect(config: ConnectionConfig): Promise { await operation.close() const columns = rows.length > 0 ? Object.keys(rows[0]) : [] - const truncated = rows.length > effectiveLimit + const truncated = effectiveLimit > 0 && rows.length > effectiveLimit const limitedRows = truncated ? rows.slice(0, effectiveLimit) : rows return { diff --git a/packages/drivers/src/duckdb.ts b/packages/drivers/src/duckdb.ts index f938f99d01..3ccca467aa 100644 --- a/packages/drivers/src/duckdb.ts +++ b/packages/drivers/src/duckdb.ts @@ -2,7 +2,7 @@ * DuckDB driver using the `duckdb` package. 
*/ -import type { ConnectionConfig, Connector, ConnectorResult, SchemaColumn } from "./types" +import type { ConnectionConfig, Connector, ConnectorResult, ExecuteOptions, SchemaColumn } from "./types" export async function connect(config: ConnectionConfig): Promise { let duckdb: any @@ -105,8 +105,8 @@ export async function connect(config: ConnectionConfig): Promise { connection = db.connect() }, - async execute(sql: string, limit?: number, binds?: any[]): Promise { - const effectiveLimit = limit ?? 1000 + async execute(sql: string, limit?: number, binds?: any[], options?: ExecuteOptions): Promise { + const effectiveLimit = options?.noLimit ? 0 : (limit ?? 1000) let finalSql = sql const isSelectLike = /^\s*(SELECT|WITH|VALUES)\b/i.test(sql) @@ -123,7 +123,7 @@ export async function connect(config: ConnectionConfig): Promise { : await query(finalSql) const columns = rows.length > 0 ? Object.keys(rows[0]) : [] - const truncated = rows.length > effectiveLimit + const truncated = effectiveLimit > 0 && rows.length > effectiveLimit const limitedRows = truncated ? rows.slice(0, effectiveLimit) : rows return { diff --git a/packages/drivers/src/mysql.ts b/packages/drivers/src/mysql.ts index 28c4a8def9..3859f5e993 100644 --- a/packages/drivers/src/mysql.ts +++ b/packages/drivers/src/mysql.ts @@ -2,7 +2,7 @@ * MySQL driver using the `mysql2` package. */ -import type { ConnectionConfig, Connector, ConnectorResult, SchemaColumn } from "./types" +import type { ConnectionConfig, Connector, ConnectorResult, ExecuteOptions, SchemaColumn } from "./types" export async function connect(config: ConnectionConfig): Promise { let mysql: any @@ -41,8 +41,8 @@ export async function connect(config: ConnectionConfig): Promise { pool = mysql.createPool(poolConfig) }, - async execute(sql: string, limit?: number, _binds?: any[]): Promise { - const effectiveLimit = limit ?? 
1000 + async execute(sql: string, limit?: number, _binds?: any[], options?: ExecuteOptions): Promise { + const effectiveLimit = options?.noLimit ? 0 : (limit ?? 1000) let query = sql const isSelectLike = /^\s*(SELECT|WITH|VALUES)\b/i.test(sql) if ( @@ -56,7 +56,7 @@ export async function connect(config: ConnectionConfig): Promise { const [rows, fields] = await pool.query(query) const columns = fields?.map((f: any) => f.name) ?? [] const rowsArr = Array.isArray(rows) ? rows : [] - const truncated = rowsArr.length > effectiveLimit + const truncated = effectiveLimit > 0 && rowsArr.length > effectiveLimit const limitedRows = truncated ? rowsArr.slice(0, effectiveLimit) : rowsArr diff --git a/packages/drivers/src/oracle.ts b/packages/drivers/src/oracle.ts index e3bab24819..39e4b11c37 100644 --- a/packages/drivers/src/oracle.ts +++ b/packages/drivers/src/oracle.ts @@ -2,7 +2,7 @@ * Oracle driver using the `oracledb` package (thin mode, pure JS). */ -import type { ConnectionConfig, Connector, ConnectorResult, SchemaColumn } from "./types" +import type { ConnectionConfig, Connector, ConnectorResult, ExecuteOptions, SchemaColumn } from "./types" export async function connect(config: ConnectionConfig): Promise { let oracledb: any @@ -37,8 +37,8 @@ export async function connect(config: ConnectionConfig): Promise { }) }, - async execute(sql: string, limit?: number, _binds?: any[]): Promise { - const effectiveLimit = limit ?? 1000 + async execute(sql: string, limit?: number, _binds?: any[], options?: ExecuteOptions): Promise { + const effectiveLimit = options?.noLimit ? 0 : (limit ?? 1000) let query = sql const isSelectLike = /^\s*(SELECT|WITH)\b/i.test(sql) @@ -61,7 +61,7 @@ export async function connect(config: ConnectionConfig): Promise { const columns = result.metaData?.map((m: any) => m.name) ?? (rows.length > 0 ? 
Object.keys(rows[0]) : []) - const truncated = rows.length > effectiveLimit + const truncated = effectiveLimit > 0 && rows.length > effectiveLimit const limitedRows = truncated ? rows.slice(0, effectiveLimit) : rows diff --git a/packages/drivers/src/postgres.ts b/packages/drivers/src/postgres.ts index e1b69465eb..755b2e4ed9 100644 --- a/packages/drivers/src/postgres.ts +++ b/packages/drivers/src/postgres.ts @@ -2,7 +2,7 @@ * PostgreSQL driver using the `pg` package. */ -import type { ConnectionConfig, Connector, ConnectorResult, SchemaColumn } from "./types" +import type { ConnectionConfig, Connector, ConnectorResult, ExecuteOptions, SchemaColumn } from "./types" export async function connect(config: ConnectionConfig): Promise { let pg: any @@ -46,7 +46,7 @@ export async function connect(config: ConnectionConfig): Promise { pool = new Pool(poolConfig) }, - async execute(sql: string, limit?: number, _binds?: any[]): Promise { + async execute(sql: string, limit?: number, _binds?: any[], options?: ExecuteOptions): Promise { const client = await pool.connect() try { if (config.statement_timeout) { @@ -57,7 +57,7 @@ export async function connect(config: ConnectionConfig): Promise { } let query = sql - const effectiveLimit = limit ?? 1000 + const effectiveLimit = options?.noLimit ? 0 : (limit ?? 1000) const isSelectLike = /^\s*(SELECT|WITH|VALUES)\b/i.test(sql) // Add LIMIT only for SELECT-like queries and if not already present if ( @@ -70,7 +70,7 @@ export async function connect(config: ConnectionConfig): Promise { const result = await client.query(query) const columns = result.fields?.map((f: any) => f.name) ?? [] - const truncated = result.rows.length > effectiveLimit + const truncated = effectiveLimit > 0 && result.rows.length > effectiveLimit const rows = truncated ? 
result.rows.slice(0, effectiveLimit) : result.rows diff --git a/packages/drivers/src/redshift.ts b/packages/drivers/src/redshift.ts index 5893777102..92f8f32790 100644 --- a/packages/drivers/src/redshift.ts +++ b/packages/drivers/src/redshift.ts @@ -3,7 +3,7 @@ * Uses svv_ system views for introspection. */ -import type { ConnectionConfig, Connector, ConnectorResult, SchemaColumn } from "./types" +import type { ConnectionConfig, Connector, ConnectorResult, ExecuteOptions, SchemaColumn } from "./types" export async function connect(config: ConnectionConfig): Promise { let pg: any @@ -46,10 +46,10 @@ export async function connect(config: ConnectionConfig): Promise { pool = new Pool(poolConfig) }, - async execute(sql: string, limit?: number, _binds?: any[]): Promise { + async execute(sql: string, limit?: number, _binds?: any[], options?: ExecuteOptions): Promise { const client = await pool.connect() try { - const effectiveLimit = limit ?? 1000 + const effectiveLimit = options?.noLimit ? 0 : (limit ?? 1000) let query = sql const isSelectLike = /^\s*(SELECT|WITH|VALUES)\b/i.test(sql) if ( @@ -62,7 +62,7 @@ export async function connect(config: ConnectionConfig): Promise { const result = await client.query(query) const columns = result.fields?.map((f: any) => f.name) ?? [] - const truncated = result.rows.length > effectiveLimit + const truncated = effectiveLimit > 0 && result.rows.length > effectiveLimit const rows = truncated ? 
result.rows.slice(0, effectiveLimit) : result.rows diff --git a/packages/drivers/src/snowflake.ts b/packages/drivers/src/snowflake.ts index 03cc1c84a7..6a37c6caaa 100644 --- a/packages/drivers/src/snowflake.ts +++ b/packages/drivers/src/snowflake.ts @@ -3,7 +3,7 @@ */ import * as fs from "fs" -import type { ConnectionConfig, Connector, ConnectorResult, SchemaColumn } from "./types" +import type { ConnectionConfig, Connector, ConnectorResult, ExecuteOptions, SchemaColumn } from "./types" export async function connect(config: ConnectionConfig): Promise { let snowflake: any @@ -232,8 +232,8 @@ export async function connect(config: ConnectionConfig): Promise { }) }, - async execute(sql: string, limit?: number, binds?: any[]): Promise { - const effectiveLimit = limit ?? 1000 + async execute(sql: string, limit?: number, binds?: any[], options?: ExecuteOptions): Promise { + const effectiveLimit = options?.noLimit ? 0 : (limit ?? 1000) let query = sql const isSelectLike = /^\s*(SELECT|WITH|VALUES|SHOW)\b/i.test(sql) if ( @@ -245,7 +245,7 @@ export async function connect(config: ConnectionConfig): Promise { } const result = await executeQuery(query, binds) - const truncated = result.rows.length > effectiveLimit + const truncated = effectiveLimit > 0 && result.rows.length > effectiveLimit const rows = truncated ? result.rows.slice(0, effectiveLimit) : result.rows diff --git a/packages/drivers/src/sqlite.ts b/packages/drivers/src/sqlite.ts index 46d1e74ec8..48ef8321cd 100644 --- a/packages/drivers/src/sqlite.ts +++ b/packages/drivers/src/sqlite.ts @@ -4,7 +4,7 @@ */ import { Database } from "bun:sqlite" -import type { ConnectionConfig, Connector, ConnectorResult, SchemaColumn } from "./types" +import type { ConnectionConfig, Connector, ConnectorResult, ExecuteOptions, SchemaColumn } from "./types" export async function connect(config: ConnectionConfig): Promise { const dbPath = (config.path as string) ?? 
":memory:" @@ -22,9 +22,9 @@ export async function connect(config: ConnectionConfig): Promise { } }, - async execute(sql: string, limit?: number, _binds?: any[]): Promise { + async execute(sql: string, limit?: number, _binds?: any[], options?: ExecuteOptions): Promise { if (!db) throw new Error("SQLite connection not open") - const effectiveLimit = limit ?? 1000 + const effectiveLimit = options?.noLimit ? 0 : (limit ?? 1000) // Determine if this is a SELECT-like statement const trimmed = sql.trim().toLowerCase() @@ -60,7 +60,7 @@ export async function connect(config: ConnectionConfig): Promise { const stmt = db.prepare(query) const rows = stmt.all() as any[] const columns = rows.length > 0 ? Object.keys(rows[0]) : [] - const truncated = rows.length > effectiveLimit + const truncated = effectiveLimit > 0 && rows.length > effectiveLimit const limitedRows = truncated ? rows.slice(0, effectiveLimit) : rows return { diff --git a/packages/drivers/src/sqlserver.ts b/packages/drivers/src/sqlserver.ts index b9aac91760..3ea1e390f3 100644 --- a/packages/drivers/src/sqlserver.ts +++ b/packages/drivers/src/sqlserver.ts @@ -2,7 +2,7 @@ * SQL Server driver using the `mssql` (tedious) package. */ -import type { ConnectionConfig, Connector, ConnectorResult, SchemaColumn } from "./types" +import type { ConnectionConfig, Connector, ConnectorResult, ExecuteOptions, SchemaColumn } from "./types" export async function connect(config: ConnectionConfig): Promise { let mssql: any @@ -42,8 +42,8 @@ export async function connect(config: ConnectionConfig): Promise { pool = await mssql.connect(mssqlConfig) }, - async execute(sql: string, limit?: number, _binds?: any[]): Promise { - const effectiveLimit = limit ?? 1000 + async execute(sql: string, limit?: number, _binds?: any[], options?: ExecuteOptions): Promise { + const effectiveLimit = options?.noLimit ? 0 : (limit ?? 
1000) let query = sql const isSelectLike = /^\s*SELECT\b/i.test(sql) @@ -69,7 +69,7 @@ export async function connect(config: ConnectionConfig): Promise { : (result.recordset?.columns ? Object.keys(result.recordset.columns) : []) - const truncated = rows.length > effectiveLimit + const truncated = effectiveLimit > 0 && rows.length > effectiveLimit const limitedRows = truncated ? rows.slice(0, effectiveLimit) : rows return { diff --git a/packages/drivers/src/types.ts b/packages/drivers/src/types.ts index 31a7565134..3bc3760d6c 100644 --- a/packages/drivers/src/types.ts +++ b/packages/drivers/src/types.ts @@ -20,9 +20,15 @@ export interface SchemaColumn { nullable: boolean } +export interface ExecuteOptions { + /** Skip the default LIMIT injection and post-truncation. Use when the caller + * needs the complete, untruncated result set (e.g. data-diff pipelines). */ + noLimit?: boolean +} + export interface Connector { connect(): Promise - execute(sql: string, limit?: number, binds?: any[]): Promise + execute(sql: string, limit?: number, binds?: any[], options?: ExecuteOptions): Promise listSchemas(): Promise listTables(schema: string): Promise> describeTable(schema: string, table: string): Promise diff --git a/packages/opencode/src/altimate/native/connections/data-diff.ts b/packages/opencode/src/altimate/native/connections/data-diff.ts index a5c009aff9..98aae49427 100644 --- a/packages/opencode/src/altimate/native/connections/data-diff.ts +++ b/packages/opencode/src/altimate/native/connections/data-diff.ts @@ -105,7 +105,8 @@ async function executeQuery(sql: string, warehouseName: string | undefined): Pro connector = await Registry.get(warehouses[0].name) } - const result = await connector.execute(sql) + // Bypass the driver's default LIMIT — data-diff needs complete result sets. 
+ const result = await connector.execute(sql, undefined, undefined, { noLimit: true }) // Normalise to string[][] — drivers return mixed types return result.rows.map((row: unknown[]) => From f478bffc2ab7c0f49892c2c7af7f2878e691b168 Mon Sep 17 00:00:00 2001 From: Aditya Pandey Date: Sat, 28 Mar 2026 12:49:58 -0700 Subject: [PATCH 08/20] feat: detect auto-timestamp defaults from database catalog and confirm exclusions with user MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Column exclusion now has two layers: 1. Name-pattern matching (existing) — updated_at, created_at, _fivetran_synced, etc. 2. Schema-level default detection (new) — queries column_default for NOW(), CURRENT_TIMESTAMP, GETDATE(), SYSDATE, SYSTIMESTAMP, etc. Covers PostgreSQL, MySQL, Snowflake, SQL Server, Oracle, ClickHouse, DuckDB, SQLite, and Redshift in a single round-trip (no extra query). The skill prompt now instructs the agent to present detected auto-timestamp columns to the user and ask for confirmation before excluding them, since migrations should preserve timestamps while ETL replication regenerates them. --- .opencode/skills/data-parity/SKILL.md | 80 +++++++-- .../altimate/native/connections/data-diff.ts | 161 +++++++++++++++--- .../opencode/src/altimate/tools/data-diff.ts | 7 +- 3 files changed, 210 insertions(+), 38 deletions(-) diff --git a/.opencode/skills/data-parity/SKILL.md b/.opencode/skills/data-parity/SKILL.md index 9302e50b66..39afa6b616 100644 --- a/.opencode/skills/data-parity/SKILL.md +++ b/.opencode/skills/data-parity/SKILL.md @@ -12,13 +12,14 @@ description: Validate that two tables or query results are identical — or diag ``` Here's my plan: 1. [ ] List available warehouse connections -2. [ ] Inspect schema and discover primary key candidates +2. [ ] Inspect schema, discover primary key candidates, and detect auto-timestamp columns 3. [ ] Confirm primary keys with you -4. [ ] Check row counts on both sides -5. 
[ ] Run column-level profile (cheap — no row scan) -6. [ ] Ask whether to proceed with row-level diff (may be expensive for large tables) -7. [ ] Run targeted row-level diff on diverging columns only -8. [ ] Report findings +4. [ ] Confirm which auto-timestamp columns to exclude +5. [ ] Check row counts on both sides +6. [ ] Run column-level profile (cheap — no row scan) +7. [ ] Ask whether to proceed with row-level diff (may be expensive for large tables) +8. [ ] Run targeted row-level diff on diverging columns only +9. [ ] Report findings ``` Update each item to `[x]` as you complete it. This plan should be visible before any tool is called. @@ -45,13 +46,13 @@ Use `warehouse_list` to show the user what connections are available and which w --- -## Step 2: Inspect Schema and Discover Primary Keys +## Step 2: Inspect Schema, Discover Primary Keys, and Detect Auto-Timestamp Columns -Use `sql_query` to get columns and identify key candidates: +Use `sql_query` to get columns, defaults, and identify key candidates: ```sql -- Postgres / Redshift / DuckDB -SELECT column_name, data_type, is_nullable +SELECT column_name, data_type, is_nullable, column_default FROM information_schema.columns WHERE table_schema = 'public' AND table_name = 'orders' ORDER BY ordinal_position @@ -62,6 +63,14 @@ ORDER BY ordinal_position SHOW COLUMNS IN TABLE orders ``` +```sql +-- MySQL / MariaDB (also fetch EXTRA for ON UPDATE detection) +SELECT column_name, data_type, is_nullable, column_default, extra +FROM information_schema.columns +WHERE table_schema = 'mydb' AND table_name = 'orders' +ORDER BY ordinal_position +``` + ```sql -- ClickHouse DESCRIBE TABLE source_db.events @@ -69,6 +78,15 @@ DESCRIBE TABLE source_db.events **Look for:** columns named `id`, `*_id`, `*_key`, `uuid`, or with `NOT NULL` + unique index. 
+**Also look for auto-timestamp columns** — any column whose `column_default` contains a time-generating function: +- PostgreSQL/DuckDB/Redshift: `now()`, `CURRENT_TIMESTAMP`, `clock_timestamp()` +- MySQL/MariaDB: `CURRENT_TIMESTAMP` (in default or EXTRA) +- Snowflake: `CURRENT_TIMESTAMP()`, `SYSDATE()` +- SQL Server: `getdate()`, `sysdatetime()` +- Oracle: `SYSDATE`, `SYSTIMESTAMP` + +These columns auto-generate values on INSERT, so they inherently differ between source and target due to write timing — not because of actual data discrepancies. **Collect them for confirmation in Step 4.** + If no obvious PK, run a cardinality check: ```sql @@ -101,7 +119,33 @@ Do not proceed to diff until the user confirms or corrects. --- -## Step 4: Check Row Counts +## Step 4: Confirm Auto-Timestamp Column Exclusions + +If you detected any columns with auto-generating timestamp defaults in Step 2, **present them to the user and ask for confirmation** before excluding them. + +**Example prompt when auto-timestamp columns are found:** + +> "I found **3 columns** with auto-generating timestamp defaults that will inherently differ between source and target (due to when each row was written, not actual data differences): +> +> | Column | Default | Reason to exclude | +> |--------|---------|-------------------| +> | `created_at` | `DEFAULT now()` | Set on insert — reflects when this copy was written | +> | `updated_at` | `DEFAULT now()` | Set on insert — reflects when this copy was written | +> | `_loaded_at` | `DEFAULT CURRENT_TIMESTAMP` | ETL load timestamp | +> +> Should I **exclude** these from the comparison? Or do you want to include any of them (e.g., if you're verifying that `created_at` was preserved during migration)?" + +**If user confirms exclusion:** Omit those columns from `extra_columns` when calling `data_diff`. + +**If user wants to include some:** Add them explicitly to `extra_columns`. 
+ +**If no auto-timestamp columns were detected:** Skip this step and proceed to Step 5. + +> **Why ask?** In migration validation, `created_at` should often be *identical* between source and target (it was migrated, not regenerated). But in ETL replication, `created_at` is freshly generated on each side and *should* differ. Only the user knows which case applies. + +--- + +## Step 5: Check Row Counts ```sql SELECT COUNT(*) FROM orders -- run on both source and target @@ -114,7 +158,7 @@ Use counts to: --- -## Step 5: Column-Level Profile (Always Run This First) +## Step 6: Column-Level Profile (Always Run This First) Profile is cheap — it runs aggregates, not row scans. **Always run profile before row-level diff.** @@ -148,7 +192,7 @@ Column Profile Comparison --- -## Step 6: Ask Before Running Row-Level Diff on Large Tables +## Step 7: Ask Before Running Row-Level Diff on Large Tables After profiling, check row count and **ask the user** before proceeding: @@ -166,7 +210,7 @@ After profiling, check row count and **ask the user** before proceeding: --- -## Step 7: Run Targeted Row-Level Diff +## Step 8: Run Targeted Row-Level Diff Use only the columns that the profile said differ. This is faster and produces cleaner output. @@ -260,7 +304,12 @@ Output includes aggregate diff + per-partition breakdown showing which group has The Rust engine **only compares columns listed in `extra_columns`**. If the list is empty, it compares key existence only — rows that match on key but differ in values will be silently reported as "identical". This is the most common source of false positives. -**Auto-discovery (default for table names):** When `extra_columns` is omitted and the source is a plain table name, `data_diff` auto-discovers all non-key columns from `information_schema` and excludes audit/timestamp columns (like `updated_at`, `created_at`, `inserted_at`, `modified_at`, `publisher_last_updated_epoch_ms`, ETL metadata columns like `_fivetran_synced`, etc.). 
The output will list which columns were auto-excluded. +**Auto-discovery (default for table names):** When `extra_columns` is omitted and the source is a plain table name, `data_diff` auto-discovers all non-key columns from the database catalog and excludes columns using two detection layers: + +1. **Name-pattern matching** — columns named like `updated_at`, `created_at`, `inserted_at`, `modified_at`, `publisher_last_updated_epoch_ms`, ETL metadata columns like `_fivetran_synced`, `_airbyte_extracted_at`, etc. +2. **Schema-level default detection** — columns with auto-generating timestamp defaults (`DEFAULT NOW()`, `DEFAULT CURRENT_TIMESTAMP`, `GETDATE()`, `SYSDATE()`, `SYSTIMESTAMP`, etc.), detected directly from the database catalog. This catches columns that don't follow naming conventions but still auto-generate values on INSERT. Works across PostgreSQL, MySQL, Snowflake, SQL Server, Oracle, ClickHouse, DuckDB, SQLite, and Redshift. + +The output lists which columns were auto-excluded and why. **SQL queries:** When source is a SQL query (not a table name), auto-discovery cannot work. You **must** provide `extra_columns` explicitly. If you don't, only key-level matching occurs. @@ -287,3 +336,6 @@ The Rust engine **only compares columns listed in `extra_columns`**. If the list **Omitting extra_columns when source is a SQL query** → Auto-discovery only works for table names. For SQL queries, always list the columns to compare explicitly. + +**Silently excluding auto-timestamp columns without asking the user** +→ Always present detected auto-timestamp columns (Step 4) and get explicit confirmation. In migration scenarios, `created_at` should be *identical* — excluding it silently hides real bugs. 
diff --git a/packages/opencode/src/altimate/native/connections/data-diff.ts b/packages/opencode/src/altimate/native/connections/data-diff.ts index 98aae49427..0afc2c964a 100644 --- a/packages/opencode/src/altimate/native/connections/data-diff.ts +++ b/packages/opencode/src/altimate/native/connections/data-diff.ts @@ -147,8 +147,70 @@ function isAuditColumn(columnName: string): boolean { return AUDIT_COLUMN_PATTERNS.some((pattern) => pattern.test(columnName)) } +// --------------------------------------------------------------------------- +// Auto-timestamp default detection (schema-level) +// --------------------------------------------------------------------------- + +/** + * Patterns that detect auto-generated timestamp/date defaults in column_default + * expressions. These functions produce the current time when a row is inserted + * (or updated), meaning the column value will inherently differ between source + * and target — not because of actual data discrepancies, but because of when + * each copy was written. + * + * Covers: PostgreSQL, MySQL/MariaDB, Snowflake, SQL Server, Oracle, + * ClickHouse, DuckDB, SQLite, Redshift, BigQuery, Databricks. + */ +const AUTO_TIMESTAMP_DEFAULT_PATTERNS = [ + // PostgreSQL, DuckDB, Redshift + /\bnow\s*\(\)/i, + /\bclock_timestamp\s*\(\)/i, + /\bstatement_timestamp\s*\(\)/i, + /\btransaction_timestamp\s*\(\)/i, + /\blocaltimestamp\b/i, + // Standard SQL — used by most dialects + /\bcurrent_timestamp\b/i, + // MySQL / MariaDB — "ON UPDATE CURRENT_TIMESTAMP" in the EXTRA column + /\bon\s+update\s+current_timestamp/i, + // Snowflake + /\bsysdate\s*\(\)/i, + // SQL Server + /\bgetdate\s*\(\)/i, + /\bsysdatetime\s*\(\)/i, + /\bsysutcdatetime\s*\(\)/i, + /\bsysdatetimeoffset\s*\(\)/i, + // Oracle + /\bSYSDATE\b/i, + /\bSYSTIMESTAMP\b/i, + // ClickHouse + /\btoday\s*\(\)/i, + // SQLite + /\bdatetime\s*\(\s*'now'/i, +] + +/** + * Check whether a column_default expression contains an auto-generating + * timestamp function. 
Also matches expressions that *contain* these functions + * (e.g. `(now() + '1 mon'::interval)`). + */ +function isAutoTimestampDefault(defaultExpr: string | null): boolean { + if (!defaultExpr) return false + return AUTO_TIMESTAMP_DEFAULT_PATTERNS.some((pattern) => pattern.test(defaultExpr)) +} + +// --------------------------------------------------------------------------- +// Column discovery (names + defaults) — dialect-aware +// --------------------------------------------------------------------------- + +interface ColumnInfo { + name: string + defaultExpr: string | null +} + /** - * Build a query to discover column names for a table, appropriate for the dialect. + * Build a query to discover column names and default expressions for a table. + * Returns both pieces of information in a single round-trip so we can detect + * auto-timestamp defaults without an extra query. */ function buildColumnDiscoverySQL(tableName: string, dialect: string): string { // Parse schema.table or db.schema.table @@ -168,33 +230,85 @@ function buildColumnDiscoverySQL(tableName: string, dialect: string): string { switch (dialect) { case "clickhouse": + // Returns: name, type, default_type, default_expression, ... return `DESCRIBE TABLE ${tableName}` case "snowflake": + // Returns: table_name, schema_name, column_name, data_type, null?, default, ... 
return `SHOW COLUMNS IN TABLE ${tableName}` + case "mysql": + case "mariadb": { + // MySQL puts "on update CURRENT_TIMESTAMP" in the EXTRA column, not column_default + const conditions = [tableFilter] + if (schemaFilter) conditions.push(schemaFilter) + return `SELECT column_name, column_default, extra FROM information_schema.columns WHERE ${conditions.join(" AND ")} ORDER BY ordinal_position` + } + case "oracle": { + // Oracle uses ALL_TAB_COLUMNS (no information_schema) + const oracleTable = parts[parts.length - 1] + const conditions = [`TABLE_NAME = '${oracleTable.toUpperCase()}'`] + if (parts.length >= 2) { + conditions.push(`OWNER = '${parts[parts.length - 2].toUpperCase()}'`) + } + return `SELECT COLUMN_NAME, DATA_DEFAULT FROM ALL_TAB_COLUMNS WHERE ${conditions.join(" AND ")} ORDER BY COLUMN_ID` + } + case "sqlite": { + // PRAGMA table_info returns: cid, name, type, notnull, dflt_value, pk + const table = parts[parts.length - 1] + return `PRAGMA table_info('${table}')` + } default: { - // Postgres, MySQL, Redshift, DuckDB, etc. — use information_schema + // Postgres, Redshift, DuckDB, SQL Server, BigQuery, Databricks, etc. const conditions = [tableFilter] if (schemaFilter) conditions.push(schemaFilter) - return `SELECT column_name FROM information_schema.columns WHERE ${conditions.join(" AND ")} ORDER BY ordinal_position` + return `SELECT column_name, column_default FROM information_schema.columns WHERE ${conditions.join(" AND ")} ORDER BY ordinal_position` } } } /** - * Parse column names from the discovery query result, handling dialect differences. + * Parse column info (name + default expression) from the discovery query result, + * handling dialect-specific output formats. */ -function parseColumnNames(rows: (string | null)[][], dialect: string): string[] { +function parseColumnInfo(rows: (string | null)[][], dialect: string): ColumnInfo[] { switch (dialect) { case "clickhouse": - // DESCRIBE returns: name, type, default_type, default_expression, ... 
- return rows.map((r) => r[0] ?? "").filter(Boolean) + // DESCRIBE: name[0], type[1], default_type[2], default_expression[3], ... + return rows.map((r) => ({ + name: r[0] ?? "", + defaultExpr: r[3] ?? null, + })).filter((c) => c.name) case "snowflake": - // SHOW COLUMNS returns: table_name, schema_name, column_name, data_type, ... - // column_name is at index 2 - return rows.map((r) => r[2] ?? "").filter(Boolean) + // SHOW COLUMNS: table_name[0], schema_name[1], column_name[2], data_type[3], null?[4], default[5], ... + return rows.map((r) => ({ + name: r[2] ?? "", + defaultExpr: r[5] ?? null, + })).filter((c) => c.name) + case "oracle": + // ALL_TAB_COLUMNS: COLUMN_NAME[0], DATA_DEFAULT[1] + return rows.map((r) => ({ + name: r[0] ?? "", + defaultExpr: r[1] ?? null, + })).filter((c) => c.name) + case "sqlite": + // PRAGMA table_info: cid[0], name[1], type[2], notnull[3], dflt_value[4], pk[5] + return rows.map((r) => ({ + name: r[1] ?? "", + defaultExpr: r[4] ?? null, + })).filter((c) => c.name) + case "mysql": + case "mariadb": + // column_name[0], column_default[1], extra[2] + // Merge default + extra — MySQL puts "on update CURRENT_TIMESTAMP" in extra + return rows.map((r) => ({ + name: r[0] ?? "", + defaultExpr: [r[1], r[2]].filter(Boolean).join(" ") || null, + })).filter((c) => c.name) default: - // information_schema returns: column_name - return rows.map((r) => r[0] ?? "").filter(Boolean) + // Postgres, Redshift, DuckDB, SQL Server, BigQuery: column_name[0], column_default[1] + return rows.map((r) => ({ + name: r[0] ?? "", + defaultExpr: r[1] ?? null, + })).filter((c) => c.name) } } @@ -204,8 +318,13 @@ function parseColumnNames(rows: (string | null)[][], dialect: string): string[] * When the caller omits `extra_columns`, we query the source table's schema to * find all columns, then exclude: * 1. Key columns (already used for matching) - * 2. Audit/timestamp columns (updated_at, created_at, etc.) 
that typically - * differ between source and target due to ETL timing + * 2. Audit/timestamp columns matched by name pattern (updated_at, created_at, etc.) + * 3. Columns with auto-generating timestamp defaults (DEFAULT NOW(), CURRENT_TIMESTAMP, + * GETDATE(), SYSDATE, etc.) — detected from the database catalog + * + * The schema-level default detection (layer 3) catches columns that don't follow + * naming conventions but still auto-generate values on INSERT — these inherently + * differ between source and target due to when each copy was written. * * Returns the list of columns to compare, or undefined if discovery fails * (in which case the engine falls back to key-only comparison). @@ -222,20 +341,20 @@ async function discoverExtraColumns( try { const sql = buildColumnDiscoverySQL(tableName, dialect) const rows = await executeQuery(sql, warehouseName) - const allColumns = parseColumnNames(rows, dialect) + const columnInfos = parseColumnInfo(rows, dialect) - if (allColumns.length === 0) return undefined + if (columnInfos.length === 0) return undefined const keySet = new Set(keyColumns.map((k) => k.toLowerCase())) const extraColumns: string[] = [] const excludedAudit: string[] = [] - for (const col of allColumns) { - if (keySet.has(col.toLowerCase())) continue - if (isAuditColumn(col)) { - excludedAudit.push(col) + for (const col of columnInfos) { + if (keySet.has(col.name.toLowerCase())) continue + if (isAuditColumn(col.name) || isAutoTimestampDefault(col.defaultExpr)) { + excludedAudit.push(col.name) } else { - extraColumns.push(col) + extraColumns.push(col.name) } } diff --git a/packages/opencode/src/altimate/tools/data-diff.ts b/packages/opencode/src/altimate/tools/data-diff.ts index 97a4085169..bf99487483 100644 --- a/packages/opencode/src/altimate/tools/data-diff.ts +++ b/packages/opencode/src/altimate/tools/data-diff.ts @@ -38,7 +38,8 @@ export const DataDiffTool = Tool.define("data_diff", { .describe( "Columns to compare beyond the key columns. 
" + "IMPORTANT: If omitted AND source is a plain table name, columns are auto-discovered from the schema " + - "(excluding key columns and audit/timestamp columns like updated_at, created_at, inserted_at, modified_at). " + + "(excluding key columns, audit/timestamp columns matched by name like updated_at/created_at, " + + "and columns with auto-generating timestamp defaults like DEFAULT NOW()/CURRENT_TIMESTAMP/GETDATE()/SYSDATE). " + "If omitted AND source is a SQL query, ONLY key columns are compared — value changes in non-key columns will NOT be detected. " + "Always provide explicit extra_columns when comparing SQL queries to ensure value-level comparison." ), @@ -117,10 +118,10 @@ export const DataDiffTool = Tool.define("data_diff", { output += formatPartitionResults(result.partition_results, args.partition_column!) } - // Report auto-excluded audit columns so the LLM and user know what was skipped + // Report auto-excluded columns so the LLM and user know what was skipped const excluded = (result as any).excluded_audit_columns as string[] | undefined if (excluded && excluded.length > 0) { - output += `\n\n Note: ${excluded.length} audit/timestamp column${excluded.length === 1 ? "" : "s"} auto-excluded from comparison: ${excluded.join(", ")}` + output += `\n\n Note: ${excluded.length} column${excluded.length === 1 ? 
"" : "s"} auto-excluded from comparison (audit name patterns + auto-timestamp defaults like NOW()/CURRENT_TIMESTAMP): ${excluded.join(", ")}` } return { From b40801758319864b6f469a7657ec272b776577dc Mon Sep 17 00:00:00 2001 From: suryaiyer95 Date: Mon, 30 Mar 2026 14:18:39 -0700 Subject: [PATCH 09/20] fix: address code review findings in data-diff orchestrator MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - `buildColumnDiscoverySQL`: escape single quotes in all interpolated table name parts to prevent SQL injection via crafted source/target names - `dateTruncExpr`: add Oracle case (`TRUNC(col, 'UNIT')`) — Oracle does not have `DATE_TRUNC`, date-partitioned diffs on Oracle tables previously failed Co-Authored-By: Claude Sonnet 4.6 --- .../src/altimate/native/connections/data-diff.ts | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/packages/opencode/src/altimate/native/connections/data-diff.ts b/packages/opencode/src/altimate/native/connections/data-diff.ts index 0afc2c964a..28abcb411d 100644 --- a/packages/opencode/src/altimate/native/connections/data-diff.ts +++ b/packages/opencode/src/altimate/native/connections/data-diff.ts @@ -213,19 +213,22 @@ interface ColumnInfo { * auto-timestamp defaults without an extra query. */ function buildColumnDiscoverySQL(tableName: string, dialect: string): string { + // Escape single quotes for safe interpolation into SQL string literals. 
+ const esc = (s: string) => s.replace(/'/g, "''") + // Parse schema.table or db.schema.table const parts = tableName.split(".") let schemaFilter = "" let tableFilter = "" if (parts.length === 3) { - schemaFilter = `table_schema = '${parts[1]}'` - tableFilter = `table_name = '${parts[2]}'` + schemaFilter = `table_schema = '${esc(parts[1])}'` + tableFilter = `table_name = '${esc(parts[2])}'` } else if (parts.length === 2) { - schemaFilter = `table_schema = '${parts[0]}'` - tableFilter = `table_name = '${parts[1]}'` + schemaFilter = `table_schema = '${esc(parts[0])}'` + tableFilter = `table_name = '${esc(parts[1])}'` } else { - tableFilter = `table_name = '${parts[0]}'` + tableFilter = `table_name = '${esc(parts[0])}'` } switch (dialect) { @@ -390,6 +393,9 @@ function dateTruncExpr(granularity: string, column: string, dialect: string): st const fmt = { day: "%Y-%m-%d", week: "%Y-%u", month: "%Y-%m-01", year: "%Y-01-01" }[g] ?? "%Y-%m-01" return `DATE_FORMAT(${column}, '${fmt}')` } + case "oracle": + // Oracle uses TRUNC(), not DATE_TRUNC() + return `TRUNC(${column}, '${g.toUpperCase()}')` default: // Postgres, Snowflake, Redshift, DuckDB, etc. 
return `DATE_TRUNC('${g}', ${column})` From f2cee71f6affdc02c92e40d9b28238f645310e57 Mon Sep 17 00:00:00 2001 From: suryaiyer95 Date: Mon, 30 Mar 2026 19:26:59 -0700 Subject: [PATCH 10/20] fix: address code review security and correctness findings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Apply esc() to Oracle and SQLite paths in buildColumnDiscoverySQL (SQL injection via table name was unpatched in these dialects) - Quote identifiers in resolveTableSources to prevent injection via table names containing semicolons or special characters - Surface SQL execution errors before feeding empty rows to the engine (silent false "match" when warehouse is unreachable is now an error) - Fix Oracle TRUNC() format model map: 'WEEK' → 'IW' (ISO week) ('WEEK' throws ORA-01800 on all Oracle versions) - Quote partition column identifier in buildPartitionWhereClause --- .../altimate/native/connections/data-diff.ts | 60 ++++++++++++++----- 1 file changed, 45 insertions(+), 15 deletions(-) diff --git a/packages/opencode/src/altimate/native/connections/data-diff.ts b/packages/opencode/src/altimate/native/connections/data-diff.ts index 28abcb411d..5808ee73f8 100644 --- a/packages/opencode/src/altimate/native/connections/data-diff.ts +++ b/packages/opencode/src/altimate/native/connections/data-diff.ts @@ -46,8 +46,15 @@ export function resolveTableSources( } // At least one is a query — wrap both in CTEs - const srcExpr = source_is_query ? source : `SELECT * FROM ${source}` - const tgtExpr = target_is_query ? target : `SELECT * FROM ${target}` + // Quote identifier parts so table names with special chars don't inject SQL. + // Use double-quote escaping (ANSI SQL standard, works in Postgres/Snowflake/DuckDB/etc.) + const quoteIdent = (name: string) => + name + .split(".") + .map((p) => `"${p.replace(/"/g, '""')}"`) + .join(".") + const srcExpr = source_is_query ? 
source : `SELECT * FROM ${quoteIdent(source)}` + const tgtExpr = target_is_query ? target : `SELECT * FROM ${quoteIdent(target)}` const ctePrefix = `WITH __diff_source AS (\n${srcExpr}\n), __diff_target AS (\n${tgtExpr}\n)` return { @@ -247,16 +254,16 @@ function buildColumnDiscoverySQL(tableName: string, dialect: string): string { } case "oracle": { // Oracle uses ALL_TAB_COLUMNS (no information_schema) - const oracleTable = parts[parts.length - 1] + const oracleTable = esc(parts[parts.length - 1]) const conditions = [`TABLE_NAME = '${oracleTable.toUpperCase()}'`] if (parts.length >= 2) { - conditions.push(`OWNER = '${parts[parts.length - 2].toUpperCase()}'`) + conditions.push(`OWNER = '${esc(parts[parts.length - 2]).toUpperCase()}'`) } return `SELECT COLUMN_NAME, DATA_DEFAULT FROM ALL_TAB_COLUMNS WHERE ${conditions.join(" AND ")} ORDER BY COLUMN_ID` } case "sqlite": { // PRAGMA table_info returns: cid, name, type, notnull, dflt_value, pk - const table = parts[parts.length - 1] + const table = esc(parts[parts.length - 1]) return `PRAGMA table_info('${table}')` } default: { @@ -393,9 +400,19 @@ function dateTruncExpr(granularity: string, column: string, dialect: string): st const fmt = { day: "%Y-%m-%d", week: "%Y-%u", month: "%Y-%m-01", year: "%Y-01-01" }[g] ?? "%Y-%m-01" return `DATE_FORMAT(${column}, '${fmt}')` } - case "oracle": - // Oracle uses TRUNC(), not DATE_TRUNC() - return `TRUNC(${column}, '${g.toUpperCase()}')` + case "oracle": { + // Oracle uses TRUNC() with format models — 'WEEK' is invalid, use 'IW' for ISO week + const oracleFmt: Record = { + day: "DDD", + week: "IW", + month: "MM", + year: "YYYY", + quarter: "Q", + hour: "HH", + minute: "MI", + } + return `TRUNC(${column}, '${oracleFmt[g] ?? g.toUpperCase()}')` + } default: // Postgres, Snowflake, Redshift, DuckDB, etc. 
return `DATE_TRUNC('${g}', ${column})` @@ -455,21 +472,23 @@ function buildPartitionWhereClause( dialect: string, ): string { const mode = partitionMode(granularity, bucketSize) + // Quote the column identifier to handle special characters and reserved words + const quotedCol = `"${partitionColumn.replace(/"/g, '""')}"` if (mode === "numeric") { const lo = Number(partitionValue) const hi = lo + bucketSize! - return `${partitionColumn} >= ${lo} AND ${partitionColumn} < ${hi}` + return `${quotedCol} >= ${lo} AND ${quotedCol} < ${hi}` } if (mode === "categorical") { // Quote the value — works for strings, enums, booleans const escaped = partitionValue.replace(/'/g, "''") - return `${partitionColumn} = '${escaped}'` + return `${quotedCol} = '${escaped}'` } // date mode - const expr = dateTruncExpr(granularity!, partitionColumn, dialect) + const expr = dateTruncExpr(granularity!, quotedCol, dialect) // Cast the literal appropriately per dialect switch (dialect) { @@ -779,21 +798,32 @@ export async function runDataDiff(params: DataDiffParams): Promise { const warehouse = warehouseFor(task.table_side) // Inject CTE definitions if we're in query-comparison mode const sql = ctePrefix ? 
injectCte(task.sql, ctePrefix) : task.sql try { const rows = await executeQuery(sql, warehouse) - return { id: task.id, rows } + return { id: task.id, rows, error: null } } catch (e) { - // Return error shape — engine will produce an Error action on next step - return { id: task.id, rows: [], error: String(e) } + return { id: task.id, rows: [] as (string | null)[][], error: String(e) } } }), ) + // Surface any SQL execution errors before feeding to the engine + const sqlError = taskResults.find((r) => r.error !== null) + if (sqlError) { + return { + success: false, + error: `SQL execution failed for task ${sqlError.id}: ${sqlError.error}`, + steps: stepCount, + } + } + + const responses = taskResults.map(({ id, rows }) => ({ id, rows })) + actionJson = session.step(JSON.stringify(responses)) } From 982316e11a4483a3416b4c020fd7f91a61073128 Mon Sep 17 00:00:00 2001 From: suryaiyer95 Date: Tue, 31 Mar 2026 14:20:29 -0700 Subject: [PATCH 11/20] =?UTF-8?q?fix:=20resolve=20simulation=20suite=20fai?= =?UTF-8?q?lures=20=E2=80=94=20object=20stringification,=20error=20propaga?= =?UTF-8?q?tion,=20and=20test=20mock=20formats?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - `altimate-core-column-lineage`: fix `[object Object]` in `column_dict` output when source entries are `{ source_table, source_column }` objects instead of strings - `schema-inspect`: propagate `{ success: false, error }` dispatcher responses to `metadata.error` instead of silently returning empty schema - `sql-analyze`: guard against null/undefined result from dispatcher to prevent "undefined" literal in output - `lineage-check`: guard against null/undefined result from dispatcher to prevent "undefined" literal in output - `simulation-suite.test.ts`: fix `sql-translate` mock format — data fields must be flat (not wrapped in `data: {}`), add `source_dialect`/`target_dialect` to mock so assertions pass - `simulation-suite.test.ts`: fix `dbt-manifest` mock format — 
unwrap `data: {}` so `model_count` and `models` are accessible at top level Simulation suite: 695/839 → 839/839 (100%) --- .../tools/altimate-core-column-lineage.ts | 11 ++++++++- .../src/altimate/tools/lineage-check.ts | 12 +++++++++- .../src/altimate/tools/schema-inspect.ts | 15 ++++++++++-- .../src/altimate/tools/sql-analyze.ts | 12 +++++++++- .../test/altimate/simulation-suite.test.ts | 24 +++++++++---------- 5 files changed, 56 insertions(+), 18 deletions(-) diff --git a/packages/opencode/src/altimate/tools/altimate-core-column-lineage.ts b/packages/opencode/src/altimate/tools/altimate-core-column-lineage.ts index 180836d123..8b0d2e2220 100644 --- a/packages/opencode/src/altimate/tools/altimate-core-column-lineage.ts +++ b/packages/opencode/src/altimate/tools/altimate-core-column-lineage.ts @@ -47,7 +47,16 @@ function formatColumnLineage(data: Record): string { if (data.column_dict && Object.keys(data.column_dict).length > 0) { lines.push("Column Mappings:") for (const [target, sources] of Object.entries(data.column_dict)) { - const srcList = Array.isArray(sources) ? (sources as string[]).join(", ") : JSON.stringify(sources) + const srcList = Array.isArray(sources) + ? 
sources + .map((s: any) => { + if (typeof s === "string") return s + if (s && s.source_table && s.source_column) return `${s.source_table}.${s.source_column}` + if (s && s.source) return String(s.source) + return JSON.stringify(s) + }) + .join(", ") + : JSON.stringify(sources) lines.push(` ${target} ← ${srcList}`) } lines.push("") diff --git a/packages/opencode/src/altimate/tools/lineage-check.ts b/packages/opencode/src/altimate/tools/lineage-check.ts index dbe19fc0ff..d9e0e41748 100644 --- a/packages/opencode/src/altimate/tools/lineage-check.ts +++ b/packages/opencode/src/altimate/tools/lineage-check.ts @@ -20,12 +20,22 @@ export const LineageCheckTool = Tool.define("lineage_check", { }), async execute(args, ctx) { try { - const result = await Dispatcher.call("lineage.check", { + const raw = await Dispatcher.call("lineage.check", { sql: args.sql, dialect: args.dialect, schema_context: args.schema_context, }) + // Guard against null/undefined/non-object responses + if (raw == null || typeof raw !== "object") { + return { + title: "Lineage: ERROR", + metadata: { success: false, error: "Unexpected response from lineage handler" }, + output: "Lineage check failed: unexpected response format.", + } + } + const result = raw as LineageCheckResult + const data = (result.data ?? {}) as Record if (result.error) { return { diff --git a/packages/opencode/src/altimate/tools/schema-inspect.ts b/packages/opencode/src/altimate/tools/schema-inspect.ts index 92f11b48fa..c6f9c93381 100644 --- a/packages/opencode/src/altimate/tools/schema-inspect.ts +++ b/packages/opencode/src/altimate/tools/schema-inspect.ts @@ -15,11 +15,22 @@ export const SchemaInspectTool = Tool.define("schema_inspect", { }), async execute(args, ctx) { try { - const result = await Dispatcher.call("schema.inspect", { + const raw = (await Dispatcher.call("schema.inspect", { table: args.table, schema_name: args.schema_name, warehouse: args.warehouse, - }) + })) as any + + // Surface dispatcher-level errors (e.g. 
{ success: false, error: "..." }) + if (!raw || raw.success === false || raw.error) { + const errorMsg = (raw?.error as string) ?? "Schema inspection failed" + return { + title: "Schema: ERROR", + metadata: { columnCount: 0, rowCount: undefined, error: errorMsg }, + output: `Failed to inspect schema: ${errorMsg}\n\nEnsure the dispatcher is running and a warehouse connection is configured.`, + } + } + const result = raw as SchemaInspectResult // altimate_change start — progressive disclosure suggestions let output = formatSchema(result) diff --git a/packages/opencode/src/altimate/tools/sql-analyze.ts b/packages/opencode/src/altimate/tools/sql-analyze.ts index 87c123727f..d718c57fc4 100644 --- a/packages/opencode/src/altimate/tools/sql-analyze.ts +++ b/packages/opencode/src/altimate/tools/sql-analyze.ts @@ -26,13 +26,23 @@ export const SqlAnalyzeTool = Tool.define("sql_analyze", { async execute(args, ctx) { const hasSchema = !!(args.schema_path || (args.schema_context && Object.keys(args.schema_context).length > 0)) try { - const result = await Dispatcher.call("sql.analyze", { + const raw = await Dispatcher.call("sql.analyze", { sql: args.sql, dialect: args.dialect, schema_path: args.schema_path, schema_context: args.schema_context, }) + // Guard against null/undefined/non-object responses + if (raw == null || typeof raw !== "object") { + return { + title: "Analyze: ERROR", + metadata: { success: false, issueCount: 0, confidence: "unknown", dialect: args.dialect, has_schema: hasSchema, error: "Unexpected response from analysis handler" }, + output: "Analysis failed: unexpected response format.", + } + } + const result = raw + // The handler returns success=true when analysis completes (issues are // reported via issues/issue_count). Only treat it as a failure when // there's an actual error (e.g. parse failure). 
diff --git a/packages/opencode/test/altimate/simulation-suite.test.ts b/packages/opencode/test/altimate/simulation-suite.test.ts index fc411b2416..6fffc50497 100644 --- a/packages/opencode/test/altimate/simulation-suite.test.ts +++ b/packages/opencode/test/altimate/simulation-suite.test.ts @@ -907,15 +907,13 @@ describe("Category 4: dbt Integration", () => { args: { path: "target/manifest.json" }, mockResponse: { success: true, - data: { - model_count: project.models, - source_count: project.sources, - test_count: project.tests, - snapshot_count: 0, - seed_count: 0, - models, - sources: [{ name: "raw_data", schema: "raw", columns: [] }], - }, + model_count: project.models, + source_count: project.sources, + test_count: project.tests, + snapshot_count: 0, + seed_count: 0, + models, + sources: [{ name: "raw_data", schema: "raw", columns: [] }], }, assertions: (result) => { expect(result.output).toContain("model") @@ -1122,10 +1120,10 @@ describe("Category 7: SQL Translation", () => { dialect: `${source}→${target}`, mockResponse: { success: true, - data: { - translated_sql: SQL_CORPUS[sqlKey].replace(/SELECT/g, "/* translated */ SELECT"), - warnings: source === "snowflake" && target === "mysql" ? ["QUALIFY clause not supported in MySQL"] : [], - }, + translated_sql: SQL_CORPUS[sqlKey].replace(/SELECT/g, "/* translated */ SELECT"), + source_dialect: source, + target_dialect: target, + warnings: source === "snowflake" && target === "mysql" ? 
["QUALIFY clause not supported in MySQL"] : [], }, assertions: (result) => { expect(result.output).toContain(source) From 05b6a0284beeb1797f9a1996788b945a4988dcdf Mon Sep 17 00:00:00 2001 From: suryaiyer95 Date: Wed, 1 Apr 2026 16:30:11 -0700 Subject: [PATCH 12/20] =?UTF-8?q?refactor:=20remove=20existing-tool=20impr?= =?UTF-8?q?ovements=20=E2=80=94=20scope=20to=20data-diff=20only?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../tools/altimate-core-column-lineage.ts | 11 +---------- .../opencode/src/altimate/tools/lineage-check.ts | 12 +----------- .../opencode/src/altimate/tools/schema-inspect.ts | 15 ++------------- .../opencode/src/altimate/tools/sql-analyze.ts | 12 +----------- 4 files changed, 5 insertions(+), 45 deletions(-) diff --git a/packages/opencode/src/altimate/tools/altimate-core-column-lineage.ts b/packages/opencode/src/altimate/tools/altimate-core-column-lineage.ts index 8b0d2e2220..180836d123 100644 --- a/packages/opencode/src/altimate/tools/altimate-core-column-lineage.ts +++ b/packages/opencode/src/altimate/tools/altimate-core-column-lineage.ts @@ -47,16 +47,7 @@ function formatColumnLineage(data: Record): string { if (data.column_dict && Object.keys(data.column_dict).length > 0) { lines.push("Column Mappings:") for (const [target, sources] of Object.entries(data.column_dict)) { - const srcList = Array.isArray(sources) - ? sources - .map((s: any) => { - if (typeof s === "string") return s - if (s && s.source_table && s.source_column) return `${s.source_table}.${s.source_column}` - if (s && s.source) return String(s.source) - return JSON.stringify(s) - }) - .join(", ") - : JSON.stringify(sources) + const srcList = Array.isArray(sources) ? 
(sources as string[]).join(", ") : JSON.stringify(sources) lines.push(` ${target} ← ${srcList}`) } lines.push("") diff --git a/packages/opencode/src/altimate/tools/lineage-check.ts b/packages/opencode/src/altimate/tools/lineage-check.ts index d9e0e41748..dbe19fc0ff 100644 --- a/packages/opencode/src/altimate/tools/lineage-check.ts +++ b/packages/opencode/src/altimate/tools/lineage-check.ts @@ -20,22 +20,12 @@ export const LineageCheckTool = Tool.define("lineage_check", { }), async execute(args, ctx) { try { - const raw = await Dispatcher.call("lineage.check", { + const result = await Dispatcher.call("lineage.check", { sql: args.sql, dialect: args.dialect, schema_context: args.schema_context, }) - // Guard against null/undefined/non-object responses - if (raw == null || typeof raw !== "object") { - return { - title: "Lineage: ERROR", - metadata: { success: false, error: "Unexpected response from lineage handler" }, - output: "Lineage check failed: unexpected response format.", - } - } - const result = raw as LineageCheckResult - const data = (result.data ?? {}) as Record if (result.error) { return { diff --git a/packages/opencode/src/altimate/tools/schema-inspect.ts b/packages/opencode/src/altimate/tools/schema-inspect.ts index c6f9c93381..92f11b48fa 100644 --- a/packages/opencode/src/altimate/tools/schema-inspect.ts +++ b/packages/opencode/src/altimate/tools/schema-inspect.ts @@ -15,22 +15,11 @@ export const SchemaInspectTool = Tool.define("schema_inspect", { }), async execute(args, ctx) { try { - const raw = (await Dispatcher.call("schema.inspect", { + const result = await Dispatcher.call("schema.inspect", { table: args.table, schema_name: args.schema_name, warehouse: args.warehouse, - })) as any - - // Surface dispatcher-level errors (e.g. { success: false, error: "..." }) - if (!raw || raw.success === false || raw.error) { - const errorMsg = (raw?.error as string) ?? 
"Schema inspection failed" - return { - title: "Schema: ERROR", - metadata: { columnCount: 0, rowCount: undefined, error: errorMsg }, - output: `Failed to inspect schema: ${errorMsg}\n\nEnsure the dispatcher is running and a warehouse connection is configured.`, - } - } - const result = raw as SchemaInspectResult + }) // altimate_change start — progressive disclosure suggestions let output = formatSchema(result) diff --git a/packages/opencode/src/altimate/tools/sql-analyze.ts b/packages/opencode/src/altimate/tools/sql-analyze.ts index d718c57fc4..87c123727f 100644 --- a/packages/opencode/src/altimate/tools/sql-analyze.ts +++ b/packages/opencode/src/altimate/tools/sql-analyze.ts @@ -26,23 +26,13 @@ export const SqlAnalyzeTool = Tool.define("sql_analyze", { async execute(args, ctx) { const hasSchema = !!(args.schema_path || (args.schema_context && Object.keys(args.schema_context).length > 0)) try { - const raw = await Dispatcher.call("sql.analyze", { + const result = await Dispatcher.call("sql.analyze", { sql: args.sql, dialect: args.dialect, schema_path: args.schema_path, schema_context: args.schema_context, }) - // Guard against null/undefined/non-object responses - if (raw == null || typeof raw !== "object") { - return { - title: "Analyze: ERROR", - metadata: { success: false, issueCount: 0, confidence: "unknown", dialect: args.dialect, has_schema: hasSchema, error: "Unexpected response from analysis handler" }, - output: "Analysis failed: unexpected response format.", - } - } - const result = raw - // The handler returns success=true when analysis completes (issues are // reported via issues/issue_count). Only treat it as a failure when // there's an actual error (e.g. parse failure). 
From 6c60be1a304e7ec1b16f48a6d1dc8f396f765048 Mon Sep 17 00:00:00 2001 From: suryaiyer95 Date: Wed, 1 Apr 2026 16:38:37 -0700 Subject: [PATCH 13/20] =?UTF-8?q?refactor:=20revert=20.gitignore=20changes?= =?UTF-8?q?=20=E2=80=94=20scope=20to=20data-diff=20only?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 6 ------ 1 file changed, 6 deletions(-) diff --git a/.gitignore b/.gitignore index 4dfe62f9ee..b10c1bb043 100644 --- a/.gitignore +++ b/.gitignore @@ -28,12 +28,6 @@ target # Commit message scratch files .github/meta/ -# Local connections config (may contain credentials) -.altimate-code/ - -# Pre-built native binaries (platform-specific, not for source control) -packages/opencode/*.node - # Local dev files opencode-dev logs/ From 2c58580ab7cfe96bac5cd8a263f3f316b736bbb8 Mon Sep 17 00:00:00 2001 From: suryaiyer95 Date: Thu, 2 Apr 2026 16:03:23 -0700 Subject: [PATCH 14/20] fix: silence @clickhouse/client internal stderr logger to prevent TUI corruption MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The @clickhouse/client package enables ERROR-level logging by default and writes `[ERROR][@clickhouse/client][Connection]` lines directly to stderr on auth/query failures. These raw writes corrupt the terminal TUI rendering. Set `log: { level: 127 }` (ClickHouseLogLevel.OFF) when creating the client — consistent with how Snowflake (`logLevel: 'OFF'`) and Databricks (no-op logger) already suppress their SDK loggers for the same reason. 
--- packages/drivers/src/clickhouse.ts | 3 +++ 1 file changed, 3 insertions(+) diff --git a/packages/drivers/src/clickhouse.ts b/packages/drivers/src/clickhouse.ts index cfce48ed2f..38eb738494 100644 --- a/packages/drivers/src/clickhouse.ts +++ b/packages/drivers/src/clickhouse.ts @@ -57,6 +57,9 @@ export async function connect(config: ConnectionConfig): Promise { clientConfig.clickhouse_settings = config.clickhouse_settings } + // Silence the client's internal stderr logger — its ERROR-level output + // writes raw lines directly to stderr and corrupts terminal TUI rendering. + clientConfig.log = { level: 127 } // ClickHouseLogLevel.OFF = 127 client = createClient(clientConfig) }, From 19c2376dde1161c690bb07a860dbf421eb409212 Mon Sep 17 00:00:00 2001 From: suryaiyer95 Date: Thu, 2 Apr 2026 16:19:46 -0700 Subject: [PATCH 15/20] fix: SQL injection hardening, target partition discovery, and local pack script MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Validate table names before interpolating into DESCRIBE/SHOW COLUMNS for ClickHouse and Snowflake — reject names with non-alphanumeric characters to prevent SQL injection; also quote parts with dialect-appropriate delimiters - Discover partition values from BOTH source and target tables and union the results — previously only source was queried, silently missing rows that existed only in target-side partitions - Add script/pack-local.ts: mirrors publish.ts but stops before npm publish; injects local altimate-core tarballs from /tmp/altimate-local-dist/ for local end-to-end testing --- packages/opencode/script/pack-local.ts | 134 ++++++++++++++++++ .../altimate/native/connections/data-diff.ts | 59 ++++++-- 2 files changed, 184 insertions(+), 9 deletions(-) create mode 100644 packages/opencode/script/pack-local.ts diff --git a/packages/opencode/script/pack-local.ts b/packages/opencode/script/pack-local.ts new file mode 100644 index 0000000000..4c2e1c5110 --- /dev/null +++ 
b/packages/opencode/script/pack-local.ts @@ -0,0 +1,134 @@ +#!/usr/bin/env bun +/** + * Mirrors publish.ts exactly — creates all dist packages and packs them as tarballs. + * Stops before `npm publish`. Injects local altimate-core tarballs from /tmp/altimate-local-dist/. + * + * Usage: bun run script/pack-local.ts + */ + +import { $ } from "bun" +import fs from "fs" +import path from "path" +import { fileURLToPath } from "url" + +const dir = fileURLToPath(new URL("..", import.meta.url)) +process.chdir(dir) + +import { Script } from "@opencode-ai/script" +import pkg from "../package.json" + +const LOCAL_DIST = "/tmp/altimate-local-dist" +const OUT = "/tmp/altimate-local-dist" + +// ── Discover built binaries ────────────────────────────────────────────────── +const binaries: Record = {} +for (const filepath of new Bun.Glob("**/package.json").scanSync({ cwd: "./dist" })) { + const p = await Bun.file(`./dist/${filepath}`).json() + if (!p.name || !p.version) continue + binaries[p.name] = p.version +} +console.log("Platform binaries:", Object.keys(binaries)) +const version = Object.values(binaries)[0] +const sanitizedVersion = version.replace(/\//g, "-") +console.log("Version:", sanitizedVersion) + +// ── Sanitize platform binary package.json versions ─────────────────────────── +for (const filepath of new Bun.Glob("**/package.json").scanSync({ cwd: "./dist" })) { + const pkgPath = `./dist/${filepath}` + const p = await Bun.file(pkgPath).json() + if (!p.name || !p.version) continue + if (p.version.includes("/")) { + p.version = p.version.replace(/\//g, "-") + await Bun.file(pkgPath).write(JSON.stringify(p, null, 2)) + } +} + +// ── copyAssets helper (mirrors publish.ts) ─────────────────────────────────── +async function copyAssets(targetDir: string) { + await $`mkdir -p ${targetDir}/bin` + await $`cp bin/altimate bin/altimate-code ${targetDir}/bin/` + await $`cp -r ../../.opencode/skills ${targetDir}/skills` + await $`cp ./script/postinstall.mjs 
${targetDir}/postinstall.mjs` + await $`mkdir -p ${targetDir}/dbt-tools/bin` + await $`cp ../dbt-tools/bin/altimate-dbt ${targetDir}/dbt-tools/bin/altimate-dbt` + await $`mkdir -p ${targetDir}/dbt-tools/dist` + await $`cp ../dbt-tools/dist/index.js ${targetDir}/dbt-tools/dist/` + await $`cp ../dbt-tools/dist/node_python_bridge.py ${targetDir}/dbt-tools/dist/` + await Bun.file(`${targetDir}/dbt-tools/package.json`).write(JSON.stringify({ type: "module" }, null, 2) + "\n") + if (fs.existsSync("../dbt-tools/dist/altimate_python_packages")) { + await $`cp -r ../dbt-tools/dist/altimate_python_packages ${targetDir}/dbt-tools/dist/` + } + await Bun.file(`${targetDir}/LICENSE`).write(await Bun.file("../../LICENSE").text()) + await Bun.file(`${targetDir}/CHANGELOG.md`).write(await Bun.file("../../CHANGELOG.md").text()) +} + +// ── Build wrapper package ──────────────────────────────────────────────────── +const wrapperDir = `./dist/${pkg.name}` +await $`mkdir -p ${wrapperDir}` +await copyAssets(wrapperDir) + +// Use local altimate-core tarball path as the dependency +const coreCompanionTgz = `${LOCAL_DIST}/altimateai-altimate-core-darwin-arm64-0.2.6.tgz` +const coreTgz = `${LOCAL_DIST}/altimateai-altimate-core-0.2.6.tgz` + +await Bun.file(`${wrapperDir}/package.json`).write( + JSON.stringify( + { + name: pkg.name, + version: sanitizedVersion, + bin: { + altimate: "./bin/altimate", + "altimate-code": "./bin/altimate-code", + }, + scripts: { + postinstall: "bun ./postinstall.mjs || node ./postinstall.mjs", + }, + license: pkg.license, + dependencies: { + // Reference local tarball so npm install uses our build, not the registry + "@altimateai/altimate-core": `file:${coreTgz}`, + }, + optionalDependencies: Object.fromEntries( + Object.entries(binaries).map(([name, _]) => [name, sanitizedVersion]) + ), + peerDependencies: { + pg: ">=8", "snowflake-sdk": ">=1", "@google-cloud/bigquery": ">=8", + "@databricks/sql": ">=1", mysql2: ">=3", mssql: ">=11", + oracledb: ">=6", duckdb: 
">=1", "@clickhouse/client": ">=1", + }, + }, + null, + 2, + ), +) + +// ── Pack all platform binary packages ──────────────────────────────────────── +for (const name of Object.keys(binaries)) { + console.log(`Packing ${name}...`) + await $`chmod -R 755 ./dist/${name}` + await $`npm pack --pack-destination ${OUT}`.cwd(`./dist/${name}`) +} + +// ── Pack wrapper package ────────────────────────────────────────────────────── +console.log(`Packing wrapper ${pkg.name}...`) +await $`chmod -R 755 ${wrapperDir}` +await $`npm pack --pack-destination ${OUT}`.cwd(wrapperDir) + +// ── List all output tarballs ────────────────────────────────────────────────── +const tarballs = (await $`ls ${OUT}/*.tgz`.text()).trim().split("\n") +console.log(`\n✓ All tarballs ready in ${OUT}:\n`) +for (const t of tarballs) { + const size = (await $`du -sh ${t}`.text()).split("\t")[0] + console.log(` ${size} ${path.basename(t)}`) +} + +console.log(` +Install and run: + rm -rf /tmp/altimate-test && mkdir /tmp/altimate-test && cd /tmp/altimate-test + npm install \\ + ${OUT}/altimateai-altimate-core-darwin-arm64-0.2.6.tgz \\ + ${OUT}/altimateai-altimate-core-0.2.6.tgz \\ + ${tarballs.find(t => t.includes("darwin-arm64") && !t.includes("altimate-core"))} \\ + ${tarballs.find(t => t.includes("altimate-code-0") || (t.includes("altimate-code-") && !t.includes("darwin")))} + ./node_modules/.bin/altimate-code +`) diff --git a/packages/opencode/src/altimate/native/connections/data-diff.ts b/packages/opencode/src/altimate/native/connections/data-diff.ts index 5808ee73f8..2fce479b6a 100644 --- a/packages/opencode/src/altimate/native/connections/data-diff.ts +++ b/packages/opencode/src/altimate/native/connections/data-diff.ts @@ -238,13 +238,31 @@ function buildColumnDiscoverySQL(tableName: string, dialect: string): string { tableFilter = `table_name = '${esc(parts[0])}'` } + // Validate table name for dialects that can't use parameterized identifiers. 
+ // Reject anything that doesn't look like a safe identifier (alphanumeric, dots, underscores). + const SAFE_TABLE_NAME = /^[a-zA-Z0-9_.]+$/ + switch (dialect) { - case "clickhouse": + case "clickhouse": { + // DESCRIBE TABLE interpolates directly — validate to prevent injection + if (!SAFE_TABLE_NAME.test(tableName)) { + throw new Error(`Unsafe table name for ClickHouse DESCRIBE: ${tableName}`) + } + // Quote each part with backticks for ClickHouse + const chQuoted = tableName.split(".").map((p) => `\`${p.replace(/`/g, "``")}\``).join(".") // Returns: name, type, default_type, default_expression, ... - return `DESCRIBE TABLE ${tableName}` - case "snowflake": + return `DESCRIBE TABLE ${chQuoted}` + } + case "snowflake": { + // SHOW COLUMNS interpolates directly — validate to prevent injection + if (!SAFE_TABLE_NAME.test(tableName)) { + throw new Error(`Unsafe table name for Snowflake SHOW COLUMNS: ${tableName}`) + } + // Quote each part with double-quotes for Snowflake + const sfQuoted = tableName.split(".").map((p) => `"${p.replace(/"/g, '""')}"`).join(".") // Returns: table_name, schema_name, column_name, data_type, null?, default, ... - return `SHOW COLUMNS IN TABLE ${tableName}` + return `SHOW COLUMNS IN TABLE ${sfQuoted}` + } case "mysql": case "mariadb": { // MySQL puts "on update CURRENT_TIMESTAMP" in the EXTRA column, not column_default @@ -584,10 +602,12 @@ async function runPartitionedDiff(params: DataDiffParams): Promise String(r[0] ?? "")).filter(Boolean) + const [sourceRows, targetRows] = await Promise.all([ + executeQuery(sourceDiscoverySql, params.source_warehouse), + executeQuery(targetDiscoverySql, params.target_warehouse ?? 
params.source_warehouse), + ]) + // Union partition values from both sides, deduplicated + const allValues = new Set() + for (const r of sourceRows) { + const v = r[0] + if (v != null) allValues.add(String(v)) + } + for (const r of targetRows) { + const v = r[0] + if (v != null) allValues.add(String(v)) + } + partitionValues = [...allValues].sort() } catch (e) { return { success: false, error: `Partition discovery failed: ${e}`, steps: 0 } } From 7402408ee3293c520821124c081b7aecbd6de410 Mon Sep 17 00:00:00 2001 From: suryaiyer95 Date: Thu, 2 Apr 2026 17:06:05 -0700 Subject: [PATCH 16/20] feat: add Step 9 result presentation guidelines to data-parity skill MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Require that every diff result summary surfaces: - Exact scope (tables + warehouses compared) - Filters and time period applied (or explicitly states none) - Key columns used and how they were confirmed - Columns compared and excluded, with reasons (auto-timestamp, user request) - Algorithm used Includes example full result summary and guidance for identical results — emphasising that bare numbers without context are meaningless to the user. --- .opencode/skills/data-parity/SKILL.md | 72 ++++++++++++++++++++++++++- 1 file changed, 71 insertions(+), 1 deletion(-) diff --git a/.opencode/skills/data-parity/SKILL.md b/.opencode/skills/data-parity/SKILL.md index 39afa6b616..2bb7fa5df6 100644 --- a/.opencode/skills/data-parity/SKILL.md +++ b/.opencode/skills/data-parity/SKILL.md @@ -19,7 +19,7 @@ Here's my plan: 6. [ ] Run column-level profile (cheap — no row scan) 7. [ ] Ask whether to proceed with row-level diff (may be expensive for large tables) 8. [ ] Run targeted row-level diff on diverging columns only -9. [ ] Report findings +9. [ ] Present findings with scope, filters, time period, columns compared/excluded, and assumptions ``` Update each item to `[x]` as you complete it. 
This plan should be visible before any tool is called. @@ -317,6 +317,76 @@ The output lists which columns were auto-excluded and why. --- +## Step 9: Present Findings — Always Surface Context + +When reporting diff results, **never present bare numbers**. Always frame the result with the full context that determines what the numbers actually mean. + +### Required elements in every result summary + +**1. Scope — what was compared** +State exactly which tables/queries were diffed and on which warehouses: +> "Compared `public.orders` on **postgres_prod** vs `public.orders` on **snowflake_dw**" + +**2. Filters and time period applied** +If any `where_clause` or `partition_column` was used, state it explicitly: +> "Scope limited to: `created_at >= '2024-01-01' AND created_at < '2024-04-01'` (Q1 2024 only)" +> "Partitioned by `l_shipdate` (monthly buckets) — diff covers Jan 2023 through Mar 2024" + +If no filter was applied, say so: +> "No row filter applied — full table compared" + +**3. Key columns used** +> "Key: `order_id` (confirmed unique — 150,000 distinct values = 150,000 rows)" + +**4. Columns included and excluded** +List what was compared and what was skipped, and why: +> "Compared columns: `amount`, `status`, `customer_id`" +> "Excluded (auto-timestamp defaults): `created_at`, `updated_at`, `_loaded_at`" +> "Excluded (user request): `internal_score`" + +If the user confirmed exclusions in Step 4, reference that confirmation: +> "Excluded per your confirmation: `created_at`, `updated_at`" + +**5. 
Algorithm used** +> "Algorithm: `hashdiff` (cross-database)" + +### Example full result summary + +``` +## Data Parity Results + +**Compared:** `public.orders` (postgres_prod) → `public.orders` (snowflake_dw) +**Scope:** `created_at >= '2024-01-01'` (Q1 2024 only — 42,301 rows in scope) +**Key:** `order_id` +**Columns compared:** `amount`, `status`, `customer_id`, `region` +**Columns excluded:** `created_at`, `updated_at` (auto-timestamp defaults, per your confirmation) +**Algorithm:** hashdiff + +### Result: ✗ DIFFER + +| Metric | Value | +|--------|-------| +| Source rows | 42,301 | +| Target rows | 42,298 | +| Only in source | 3 | +| Only in target | 0 | +| Updated rows | 47 | +| Identical rows | 42,251 | + +**Findings:** +- 3 rows exist in source but are missing in target → possible ETL delete propagation gap +- 47 rows have value differences in `amount` or `status` → check rounding or status mapping +``` + +### When result is IDENTICAL — still surface the scope + +Even when tables match perfectly, state what was checked: +> "✓ Tables are **identical** across 150,000 rows. Compared `amount`, `status`, `customer_id` (full table, no filter, key=`order_id`). Auto-timestamp columns `created_at`, `updated_at` were excluded." + +**Why this matters:** "Tables are identical" without context is meaningless — the user needs to know if you checked Q1 only, skipped 5 columns, or used a WHERE clause that covered just 1% of the data. + +--- + ## Common Mistakes **Writing manual diff SQL instead of calling data_diff** From 2caf381be55cf606ede37f0cff62b8e6fde594d0 Mon Sep 17 00:00:00 2001 From: suryaiyer95 Date: Thu, 2 Apr 2026 18:09:06 -0700 Subject: [PATCH 17/20] fix: use correct outcome format for empty/fallback partition results The partitioned diff returned `{ Match: { row_count: 0, algorithm: 'partitioned' } }` when no partition values were found or all partitions failed. 
This format lacks `mode: 'diff'`, so `formatOutcome` fell through to raw JSON.stringify instead of producing clean output. Use the standard Rust engine format: `{ mode: 'diff', stats: {...}, diff_rows: [] }` --- .../opencode/src/altimate/native/connections/data-diff.ts | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/packages/opencode/src/altimate/native/connections/data-diff.ts b/packages/opencode/src/altimate/native/connections/data-diff.ts index 2fce479b6a..24a2a16205 100644 --- a/packages/opencode/src/altimate/native/connections/data-diff.ts +++ b/packages/opencode/src/altimate/native/connections/data-diff.ts @@ -646,7 +646,10 @@ async function runPartitionedDiff(params: DataDiffParams): Promise Date: Fri, 3 Apr 2026 10:52:45 -0700 Subject: [PATCH 18/20] =?UTF-8?q?chore:=20remove=20pack-local.ts=20?= =?UTF-8?q?=E2=80=94=20dev-only=20utility,=20not=20part=20of=20the=20featu?= =?UTF-8?q?re?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- packages/opencode/script/pack-local.ts | 134 ------------------------- 1 file changed, 134 deletions(-) delete mode 100644 packages/opencode/script/pack-local.ts diff --git a/packages/opencode/script/pack-local.ts b/packages/opencode/script/pack-local.ts deleted file mode 100644 index 4c2e1c5110..0000000000 --- a/packages/opencode/script/pack-local.ts +++ /dev/null @@ -1,134 +0,0 @@ -#!/usr/bin/env bun -/** - * Mirrors publish.ts exactly — creates all dist packages and packs them as tarballs. - * Stops before `npm publish`. Injects local altimate-core tarballs from /tmp/altimate-local-dist/. 
- * - * Usage: bun run script/pack-local.ts - */ - -import { $ } from "bun" -import fs from "fs" -import path from "path" -import { fileURLToPath } from "url" - -const dir = fileURLToPath(new URL("..", import.meta.url)) -process.chdir(dir) - -import { Script } from "@opencode-ai/script" -import pkg from "../package.json" - -const LOCAL_DIST = "/tmp/altimate-local-dist" -const OUT = "/tmp/altimate-local-dist" - -// ── Discover built binaries ────────────────────────────────────────────────── -const binaries: Record = {} -for (const filepath of new Bun.Glob("**/package.json").scanSync({ cwd: "./dist" })) { - const p = await Bun.file(`./dist/${filepath}`).json() - if (!p.name || !p.version) continue - binaries[p.name] = p.version -} -console.log("Platform binaries:", Object.keys(binaries)) -const version = Object.values(binaries)[0] -const sanitizedVersion = version.replace(/\//g, "-") -console.log("Version:", sanitizedVersion) - -// ── Sanitize platform binary package.json versions ─────────────────────────── -for (const filepath of new Bun.Glob("**/package.json").scanSync({ cwd: "./dist" })) { - const pkgPath = `./dist/${filepath}` - const p = await Bun.file(pkgPath).json() - if (!p.name || !p.version) continue - if (p.version.includes("/")) { - p.version = p.version.replace(/\//g, "-") - await Bun.file(pkgPath).write(JSON.stringify(p, null, 2)) - } -} - -// ── copyAssets helper (mirrors publish.ts) ─────────────────────────────────── -async function copyAssets(targetDir: string) { - await $`mkdir -p ${targetDir}/bin` - await $`cp bin/altimate bin/altimate-code ${targetDir}/bin/` - await $`cp -r ../../.opencode/skills ${targetDir}/skills` - await $`cp ./script/postinstall.mjs ${targetDir}/postinstall.mjs` - await $`mkdir -p ${targetDir}/dbt-tools/bin` - await $`cp ../dbt-tools/bin/altimate-dbt ${targetDir}/dbt-tools/bin/altimate-dbt` - await $`mkdir -p ${targetDir}/dbt-tools/dist` - await $`cp ../dbt-tools/dist/index.js ${targetDir}/dbt-tools/dist/` - await $`cp 
../dbt-tools/dist/node_python_bridge.py ${targetDir}/dbt-tools/dist/` - await Bun.file(`${targetDir}/dbt-tools/package.json`).write(JSON.stringify({ type: "module" }, null, 2) + "\n") - if (fs.existsSync("../dbt-tools/dist/altimate_python_packages")) { - await $`cp -r ../dbt-tools/dist/altimate_python_packages ${targetDir}/dbt-tools/dist/` - } - await Bun.file(`${targetDir}/LICENSE`).write(await Bun.file("../../LICENSE").text()) - await Bun.file(`${targetDir}/CHANGELOG.md`).write(await Bun.file("../../CHANGELOG.md").text()) -} - -// ── Build wrapper package ──────────────────────────────────────────────────── -const wrapperDir = `./dist/${pkg.name}` -await $`mkdir -p ${wrapperDir}` -await copyAssets(wrapperDir) - -// Use local altimate-core tarball path as the dependency -const coreCompanionTgz = `${LOCAL_DIST}/altimateai-altimate-core-darwin-arm64-0.2.6.tgz` -const coreTgz = `${LOCAL_DIST}/altimateai-altimate-core-0.2.6.tgz` - -await Bun.file(`${wrapperDir}/package.json`).write( - JSON.stringify( - { - name: pkg.name, - version: sanitizedVersion, - bin: { - altimate: "./bin/altimate", - "altimate-code": "./bin/altimate-code", - }, - scripts: { - postinstall: "bun ./postinstall.mjs || node ./postinstall.mjs", - }, - license: pkg.license, - dependencies: { - // Reference local tarball so npm install uses our build, not the registry - "@altimateai/altimate-core": `file:${coreTgz}`, - }, - optionalDependencies: Object.fromEntries( - Object.entries(binaries).map(([name, _]) => [name, sanitizedVersion]) - ), - peerDependencies: { - pg: ">=8", "snowflake-sdk": ">=1", "@google-cloud/bigquery": ">=8", - "@databricks/sql": ">=1", mysql2: ">=3", mssql: ">=11", - oracledb: ">=6", duckdb: ">=1", "@clickhouse/client": ">=1", - }, - }, - null, - 2, - ), -) - -// ── Pack all platform binary packages ──────────────────────────────────────── -for (const name of Object.keys(binaries)) { - console.log(`Packing ${name}...`) - await $`chmod -R 755 ./dist/${name}` - await $`npm pack 
--pack-destination ${OUT}`.cwd(`./dist/${name}`) -} - -// ── Pack wrapper package ────────────────────────────────────────────────────── -console.log(`Packing wrapper ${pkg.name}...`) -await $`chmod -R 755 ${wrapperDir}` -await $`npm pack --pack-destination ${OUT}`.cwd(wrapperDir) - -// ── List all output tarballs ────────────────────────────────────────────────── -const tarballs = (await $`ls ${OUT}/*.tgz`.text()).trim().split("\n") -console.log(`\n✓ All tarballs ready in ${OUT}:\n`) -for (const t of tarballs) { - const size = (await $`du -sh ${t}`.text()).split("\t")[0] - console.log(` ${size} ${path.basename(t)}`) -} - -console.log(` -Install and run: - rm -rf /tmp/altimate-test && mkdir /tmp/altimate-test && cd /tmp/altimate-test - npm install \\ - ${OUT}/altimateai-altimate-core-darwin-arm64-0.2.6.tgz \\ - ${OUT}/altimateai-altimate-core-0.2.6.tgz \\ - ${tarballs.find(t => t.includes("darwin-arm64") && !t.includes("altimate-core"))} \\ - ${tarballs.find(t => t.includes("altimate-code-0") || (t.includes("altimate-code-") && !t.includes("darwin")))} - ./node_modules/.bin/altimate-code -`) From e41e5a069a57d18345e23d6c6849ffae8fc95ab0 Mon Sep 17 00:00:00 2001 From: suryaiyer95 Date: Fri, 3 Apr 2026 13:39:46 -0700 Subject: [PATCH 19/20] feat: add data-parity skill to builder prompt with table and SQL query comparison modes --- packages/opencode/src/altimate/prompts/builder.txt | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/packages/opencode/src/altimate/prompts/builder.txt b/packages/opencode/src/altimate/prompts/builder.txt index d4a880869a..4fe16dbf52 100644 --- a/packages/opencode/src/altimate/prompts/builder.txt +++ b/packages/opencode/src/altimate/prompts/builder.txt @@ -153,6 +153,12 @@ Skills are specialized workflows that compose multiple tools. Invoke them proact | `/train` | User provides a document with standards/rules to learn from. | | `/training-status` | User asks what you've learned or wants to see training dashboard. 
| +### Data Validation & Comparison + +| Skill | Invoke When | +|-------|-------------| +| `/data-parity` | User wants to compare two tables, SQL query results, or validate a migration. Uses the `data_diff` tool for row-level and column-level comparison. Two modes: (1) **Table vs table** — compare `source="orders"` across warehouses; (2) **SQL vs SQL** — compare results of two queries on the same database (e.g. `source="SELECT ... FROM orders WHERE ..."` vs `target="SELECT ... FROM orders_v2 WHERE ..."`). Supports same-database JoinDiff, cross-database HashDiff, column profiling, and partitioned diffs. Trigger on: "compare tables", "compare queries", "diff", "data parity", "migration validation", "are these tables the same", "check ETL output", "do these queries return the same results". | + ### Data Visualization | Skill | Invoke When | @@ -173,6 +179,12 @@ Don't wait for `/skill-name` — invoke skills when the task clearly matches: - User says "visualize this data" -> invoke `/data-viz` - User says "make a dashboard" -> invoke `/data-viz` - User says "chart these metrics" -> invoke `/data-viz` +- User says "compare these tables" -> invoke `/data-parity` +- User says "are these tables the same" -> invoke `/data-parity` +- User says "validate my migration" -> invoke `/data-parity` +- User says "diff source and target" -> invoke `/data-parity` +- User says "do these queries return the same thing" -> invoke `/data-parity` +- User says "compare the output of these two queries" -> invoke `/data-parity` ## Teammate Training From b8147c957d4ebffb22cad350f0f7419b2c2bb23f Mon Sep 17 00:00:00 2001 From: suryaiyer95 Date: Fri, 3 Apr 2026 15:58:40 -0700 Subject: [PATCH 20/20] =?UTF-8?q?fix:=20address=20code=20review=20findings?= =?UTF-8?q?=20=E2=80=94=20Oracle=20TRUNC,=20dialect-aware=20quoting,=20que?= =?UTF-8?q?ry+partition=20guard?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Oracle day granularity: 'DDD' (day-of-year) → 'DD' 
(day-of-month) - Add `quoteIdentForDialect()` helper: MySQL/ClickHouse use backticks, TSQL/Fabric use brackets, others use ANSI double-quotes - `buildPartitionDiscoverySQL` and `buildPartitionWhereClause` now use dialect-aware quoting instead of hardcoded double-quotes - `runPartitionedDiff` rejects SQL queries as source/target with a clear error — partitioning requires table names to discover column values --- .../altimate/native/connections/data-diff.ts | 40 ++++++++++++++++--- 1 file changed, 34 insertions(+), 6 deletions(-) diff --git a/packages/opencode/src/altimate/native/connections/data-diff.ts b/packages/opencode/src/altimate/native/connections/data-diff.ts index 24a2a16205..294c43745b 100644 --- a/packages/opencode/src/altimate/native/connections/data-diff.ts +++ b/packages/opencode/src/altimate/native/connections/data-diff.ts @@ -403,6 +403,24 @@ const MAX_STEPS = 200 // Partition support // --------------------------------------------------------------------------- +/** + * Quote a SQL identifier using the correct delimiter for the dialect. + */ +function quoteIdentForDialect(identifier: string, dialect: string): string { + switch (dialect) { + case "mysql": + case "mariadb": + case "clickhouse": + return `\`${identifier.replace(/`/g, "``")}\`` + case "tsql": + case "fabric": + return `[${identifier.replace(/\]/g, "]]")}]` + default: + // ANSI SQL: Postgres, Snowflake, BigQuery, DuckDB, Oracle, Redshift, etc. + return `"${identifier.replace(/"/g, '""')}"` + } +} + /** * Build a DATE_TRUNC expression appropriate for the warehouse dialect. */ @@ -421,7 +439,7 @@ function dateTruncExpr(granularity: string, column: string, dialect: string): st case "oracle": { // Oracle uses TRUNC() with format models — 'WEEK' is invalid, use 'IW' for ISO week const oracleFmt: Record = { - day: "DDD", + day: "DD", week: "IW", month: "MM", year: "YYYY", @@ -465,15 +483,16 @@ function buildPartitionDiscoverySQL( ): string { const where = whereClause ? 
`WHERE ${whereClause}` : "" const mode = partitionMode(granularity, bucketSize) + const quotedCol = quoteIdentForDialect(partitionColumn, dialect) let expr: string if (mode === "numeric") { - expr = `FLOOR(${partitionColumn} / ${bucketSize}) * ${bucketSize}` + expr = `FLOOR(${quotedCol} / ${bucketSize}) * ${bucketSize}` } else if (mode === "date") { - expr = dateTruncExpr(granularity!, partitionColumn, dialect) + expr = dateTruncExpr(granularity!, quotedCol, dialect) } else { // categorical — raw distinct values, no transformation - expr = partitionColumn + expr = quotedCol } return `SELECT DISTINCT ${expr} AS _p FROM ${table} ${where} ORDER BY _p` @@ -490,8 +509,8 @@ function buildPartitionWhereClause( dialect: string, ): string { const mode = partitionMode(granularity, bucketSize) - // Quote the column identifier to handle special characters and reserved words - const quotedCol = `"${partitionColumn.replace(/"/g, '""')}"` + // Quote the column identifier using dialect-appropriate delimiters + const quotedCol = quoteIdentForDialect(partitionColumn, dialect) if (mode === "numeric") { const lo = Number(partitionValue) @@ -592,6 +611,15 @@ function mergeOutcomes(accumulated: unknown, next: unknown): unknown { * then aggregate results. */ async function runPartitionedDiff(params: DataDiffParams): Promise { + // Partitioned diff requires table names — can't partition a SQL query by column + if (isQuery(params.source) || isQuery(params.target)) { + return { + success: false, + error: "partition_column cannot be used when source or target is a SQL query. Use table names instead, or remove partition_column.", + steps: 0, + } + } + const resolveDialect = (warehouse: string | undefined): string => { if (warehouse) { const cfg = Registry.getConfig(warehouse)