From 6d365ad7807e64e6f95c00ce3daeaf5e323dff6b Mon Sep 17 00:00:00 2001 From: suryaiyer95 Date: Thu, 26 Mar 2026 17:39:50 -0700 Subject: [PATCH 01/20] feat: add data-parity cross-database table comparison - Add DataParity engine integration via native Rust bindings - Add data-diff tool for LLM agent (profile, joindiff, hashdiff, cascade, auto) - Add ClickHouse driver support - Add data-parity skill: profile-first workflow, algorithm selection guide, CRITICAL warning that joindiff cannot run cross-database (always returns 0 diffs), output style rules (facts only, no editorializing) - Gitignore .altimate-code/ (credentials) and *.node (platform binaries) --- .gitignore | 6 + .opencode/skills/data-parity/SKILL.md | 290 ++++++++++++++++++ packages/drivers/src/clickhouse.ts | 6 +- .../altimate/native/connections/data-diff.ts | 268 ++++++++++++++++ .../altimate/native/connections/register.ts | 8 + .../opencode/src/altimate/native/types.ts | 34 ++ .../opencode/src/altimate/tools/data-diff.ts | 174 +++++++++++ packages/opencode/src/tool/registry.ts | 2 + 8 files changed, 785 insertions(+), 3 deletions(-) create mode 100644 .opencode/skills/data-parity/SKILL.md create mode 100644 packages/opencode/src/altimate/native/connections/data-diff.ts create mode 100644 packages/opencode/src/altimate/tools/data-diff.ts diff --git a/.gitignore b/.gitignore index b10c1bb043..4dfe62f9ee 100644 --- a/.gitignore +++ b/.gitignore @@ -28,6 +28,12 @@ target # Commit message scratch files .github/meta/ +# Local connections config (may contain credentials) +.altimate-code/ + +# Pre-built native binaries (platform-specific, not for source control) +packages/opencode/*.node + # Local dev files opencode-dev logs/ diff --git a/.opencode/skills/data-parity/SKILL.md b/.opencode/skills/data-parity/SKILL.md new file mode 100644 index 0000000000..4d7b7460c9 --- /dev/null +++ b/.opencode/skills/data-parity/SKILL.md @@ -0,0 +1,290 @@ +--- +name: data-parity +description: Validate that two tables or query 
results are identical — or diagnose exactly how they differ. Discover schema, identify keys, profile cheaply, then diff. Use for migration validation, ETL regression, and query refactor verification. +--- + +# Data Parity (Table Diff) + +## Output Style + +**Report facts only. No editorializing.** +- Show counts, changed values, missing rows, new rows — that's it. +- Do NOT explain why row-level diffing is valuable, why COUNT(*) is insufficient, or pitch the tool. +- Do NOT add "the dangerous one", "this is exactly why", "this matters" style commentary. +- The user asked for a diff result, not a lecture. + +## Requirements +**Agent:** any +**Tools used:** `sql_query` (for schema discovery), `data_diff` + +## When to Use This Skill + +**Use when the user wants to:** +- Confirm two tables contain the same data after a migration +- Find rows added, deleted, or modified between source and target +- Validate that a dbt model produces the same output as the old query +- Run regression checks after a pipeline change + +**Do NOT use for:** +- Schema comparison (column names, types) — check DDL instead +- Performance benchmarking — this runs SELECT queries + +--- + +## The `data_diff` Tool + +`data_diff` takes table names and key columns. It generates SQL, routes it through the specified warehouse connections, and reports differences. It **does not discover schema** — you must provide key columns and relevant comparison columns. 
+ +**Key parameters:** +- `source` — table name (`orders`, `db.schema.orders`) or full SELECT/WITH query +- `target` — table name or SELECT query +- `key_columns` — primary key(s) uniquely identifying each row (required) +- `source_warehouse` — connection name for source +- `target_warehouse` — connection name for target (omit = same as source) +- `extra_columns` — columns to compare beyond keys (omit = compare all) +- `algorithm` — `auto`, `joindiff`, `hashdiff`, `profile`, `cascade` +- `where_clause` — filter applied to both tables + +> **CRITICAL — Algorithm choice:** +> - If `source_warehouse` ≠ `target_warehouse` → **always use `hashdiff`** (or `auto`). +> - `joindiff` runs a single SQL JOIN on ONE connection — it physically cannot see the other table. +> Using `joindiff` across different servers always reports 0 differences (both sides look identical). +> - When in doubt, use `algorithm="auto"` — it picks `joindiff` for same-warehouse and `hashdiff` for cross-warehouse automatically. 
+ +--- + +## Workflow + +The key principle: **the LLM does the identification work using SQL tools first, then calls data_diff with informed parameters.** + +### Step 1: Inspect the tables + +Before calling `data_diff`, use `sql_query` to understand what you're comparing: + +```sql +-- Get columns and types +SELECT column_name, data_type, is_nullable +FROM information_schema.columns +WHERE table_schema = 'public' AND table_name = 'orders' +ORDER BY ordinal_position +``` + +For ClickHouse: +```sql +DESCRIBE TABLE source_db.events +``` + +For Snowflake: +```sql +SHOW COLUMNS IN TABLE orders +``` + +**Look for:** +- Columns that look like primary keys (named `id`, `*_id`, `*_key`, `uuid`) +- Columns with `NOT NULL` constraints +- Whether there are composite keys + +### Step 2: Identify the key columns + +If the primary key isn't obvious from the schema, run a cardinality check: + +```sql +SELECT + COUNT(*) AS total_rows, + COUNT(DISTINCT order_id) AS distinct_order_id, + COUNT(DISTINCT customer_id) AS distinct_customer_id, + COUNT(DISTINCT created_at) AS distinct_created_at +FROM orders +``` + +**A good key column:** `distinct_count = total_rows` (fully unique) and `null_count = 0`. + +If no single column is unique, find a composite key: +```sql +SELECT order_id, line_item_id, COUNT(*) as cnt +FROM order_lines +GROUP BY order_id, line_item_id +HAVING COUNT(*) > 1 +LIMIT 5 +``` +If this returns 0 rows, `(order_id, line_item_id)` is a valid composite key. 
+ +### Step 3: Estimate table size + +```sql +SELECT COUNT(*) FROM orders +``` + +Use this to choose the algorithm: +- **< 1M rows**: `joindiff` (same DB) or `hashdiff` (cross-DB) — either is fine +- **1M–100M rows**: `hashdiff` or `cascade` +- **> 100M rows**: `hashdiff` with a `where_clause` date filter to validate a recent window first + +### Step 4: Profile first for unknown tables + +If you don't know what to expect (first-time validation, unfamiliar pipeline), start cheap: + +``` +data_diff( + source="orders", + target="orders_migrated", + key_columns=["order_id"], + source_warehouse="postgres_prod", + target_warehouse="snowflake_dw", + algorithm="profile" +) +``` + +Profile output tells you: +- Row count on each side (mismatch = load completeness problem) +- Which columns have null count differences (mismatch = NULL handling bug) +- Min/max divergence per column (mismatch = value transformation bug) +- Which columns match exactly (safe to skip in row-level diff) + +**Interpret profile to narrow the diff:** +``` +Column Profile Comparison + + ✓ order_id: match + ✓ customer_id: match + ✗ amount: DIFFER ← source min=10.00, target min=10.01 — rounding issue? + ✗ status: DIFFER ← source nulls=0, target nulls=47 — NULL mapping bug? + ✓ created_at: match +``` +→ Only diff `amount` and `status` in the next step. + +### Step 5: Run targeted row-level diff + +``` +data_diff( + source="orders", + target="orders_migrated", + key_columns=["order_id"], + extra_columns=["amount", "status"], // only the columns profile said differ + source_warehouse="postgres_prod", + target_warehouse="snowflake_dw", + algorithm="hashdiff" +) +``` + +--- + +## Algorithm Selection + +| Algorithm | When to use | +|-----------|-------------| +| `profile` | First pass — column stats (count, min, max, nulls). No row scan. | +| `joindiff` | Same database — single FULL OUTER JOIN query. Fast. | +| `hashdiff` | Cross-database, or large tables — bisection with checksums. Scales. 
| +| `cascade` | Auto-escalate: profile → hashdiff on diverging columns. | +| `auto` | JoinDiff if same warehouse, HashDiff if cross-database. | + +**JoinDiff constraint:** Both tables must be on the **same database connection**. If source and target are on different servers, JoinDiff will always report 0 diffs (it only sees one side). Use `hashdiff` or `auto` for cross-database. + +--- + +## Output Interpretation + +### IDENTICAL +``` +✓ Tables are IDENTICAL + Rows checked: 1,000,000 +``` +→ Migration validated. Data is identical. + +### DIFFER — Diagnose by pattern + +``` +✗ Tables DIFFER + + Only in source: 2 → rows deleted in target (ETL missed deletes) + Only in target: 2 → rows added to target (dedup issue or new data) + Updated rows: 3 → values changed (transform bug, type casting, rounding) + Identical rows: 15 +``` + +| Pattern | Root cause hypothesis | +|---------|----------------------| +| `only_in_source > 0`, `only_in_target = 0` | ETL dropped rows — check filters, incremental logic | +| `only_in_source = 0`, `only_in_target > 0` | Target has extra rows — check dedup or wrong join | +| `updated_rows > 0`, row counts match | Silent value corruption — check transforms, type casts | +| Row count differs | Load completeness issue — check ETL watermarks | + +Sample diffs point to the specific key + column + old→new value: +``` +key={"order_id":"4"} col=amount: 300.00 → 305.00 +``` +Use this to query the source systems directly and trace the discrepancy. + +--- + +## Usage Examples + +### Full workflow: unknown migration +``` +// 1. Discover schema +sql_query("SELECT column_name, data_type FROM information_schema.columns WHERE table_name='orders'", warehouse="postgres_prod") + +// 2. Check row count +sql_query("SELECT COUNT(*), COUNT(DISTINCT order_id) FROM orders", warehouse="postgres_prod") + +// 3. 
Profile to find which columns differ +data_diff(source="orders", target="orders", key_columns=["order_id"], + source_warehouse="postgres_prod", target_warehouse="snowflake_dw", algorithm="profile") + +// 4. Row-level diff on diverging columns only +data_diff(source="orders", target="orders", key_columns=["order_id"], + extra_columns=["amount", "status"], + source_warehouse="postgres_prod", target_warehouse="snowflake_dw", algorithm="hashdiff") +``` + +### Same-database query refactor +``` +data_diff( + source="SELECT id, amount, status FROM orders WHERE region = 'us-east'", + target="SELECT id, amount, status FROM orders_v2 WHERE region = 'us-east'", + key_columns=["id"] +) +``` + +### Large table — filter to recent window first +``` +data_diff( + source="fact_events", + target="fact_events_v2", + key_columns=["event_id"], + where_clause="event_date >= '2024-01-01'", + algorithm="hashdiff" +) +``` + +### ClickHouse — always qualify with database.table +``` +data_diff( + source="source_db.events", + target="target_db.events", + key_columns=["event_id"], + source_warehouse="clickhouse_source", + target_warehouse="clickhouse_target", + algorithm="hashdiff" +) +``` + +--- + +## Common Mistakes + +**Calling data_diff without knowing the key** +→ Run `sql_query` to check cardinality first. A bad key gives meaningless results. + +**Using joindiff for cross-database tables** +→ JoinDiff runs one SQL query on one connection. It can't see the other table. Use `hashdiff` or `auto`. + +**Diffing a 1B row table without a date filter** +→ Add `where_clause` to scope to recent data. Validate a window first, then expand. + +**Ignoring profile output and jumping to full diff** +→ Profile is free. It tells you which columns actually differ so you can avoid scanning all columns across all rows. + +**Forgetting to check row counts before diffing** +→ If source has 1M rows and target has 900K, row-level diff is misleading. Fix the load completeness issue first. 
diff --git a/packages/drivers/src/clickhouse.ts b/packages/drivers/src/clickhouse.ts index 256d060180..cfce48ed2f 100644 --- a/packages/drivers/src/clickhouse.ts +++ b/packages/drivers/src/clickhouse.ts @@ -5,7 +5,7 @@ * Uses the official ClickHouse JS client which communicates over HTTP(S). */ -import type { ConnectionConfig, Connector, ConnectorResult, SchemaColumn } from "./types" +import type { ConnectionConfig, Connector, ConnectorResult, ExecuteOptions, SchemaColumn } from "./types" export async function connect(config: ConnectionConfig): Promise { let createClient: any @@ -60,11 +60,11 @@ export async function connect(config: ConnectionConfig): Promise { client = createClient(clientConfig) }, - async execute(sql: string, limit?: number, _binds?: any[]): Promise { + async execute(sql: string, limit?: number, _binds?: any[], options?: ExecuteOptions): Promise { if (!client) { throw new Error("ClickHouse client not connected — call connect() first") } - const effectiveLimit = limit === undefined ? 1000 : limit + const effectiveLimit = options?.noLimit ? 0 : (limit ?? 1000) let query = sql // Strip string literals, then comments, for accurate SQL heuristic checks. diff --git a/packages/opencode/src/altimate/native/connections/data-diff.ts b/packages/opencode/src/altimate/native/connections/data-diff.ts new file mode 100644 index 0000000000..035df6b4ca --- /dev/null +++ b/packages/opencode/src/altimate/native/connections/data-diff.ts @@ -0,0 +1,268 @@ +/** + * DataParity orchestrator — runs the cooperative Rust state machine against + * live database connections. + * + * The Rust engine (DataParitySession) never touches databases — it emits SQL + * for us to execute, we feed results back, and it decides the next step. + * This file is the bridge between that engine and altimate-code's drivers. 
+ */ + +import type { DataDiffParams, DataDiffResult } from "../types" +import * as Registry from "./registry" + +// --------------------------------------------------------------------------- +// Query-source detection +// --------------------------------------------------------------------------- + +const SQL_KEYWORDS = /^\s*(SELECT|WITH|VALUES)\b/i + +/** + * Detect whether a string is an arbitrary SQL query (vs a plain table name). + * Plain table names may contain dots (schema.table, db.schema.table) but not spaces. + */ +function isQuery(input: string): boolean { + return SQL_KEYWORDS.test(input) +} + +/** + * If either source or target is an arbitrary query, wrap them in CTEs so the + * DataParity engine can treat them as tables named `__diff_source` / `__diff_target`. + * + * Returns `{ table1Name, table2Name, ctePrefix | null }`. + * + * When a CTE prefix is returned, it must be prepended to every SQL task emitted + * by the engine before execution. + */ +export function resolveTableSources( + source: string, + target: string, +): { table1Name: string; table2Name: string; ctePrefix: string | null } { + const source_is_query = isQuery(source) + const target_is_query = isQuery(target) + + if (!source_is_query && !target_is_query) { + // Both are plain table names — pass through unchanged + return { table1Name: source, table2Name: target, ctePrefix: null } + } + + // At least one is a query — wrap both in CTEs + const srcExpr = source_is_query ? source : `SELECT * FROM ${source}` + const tgtExpr = target_is_query ? target : `SELECT * FROM ${target}` + + const ctePrefix = `WITH __diff_source AS (\n${srcExpr}\n), __diff_target AS (\n${tgtExpr}\n)` + return { + table1Name: "__diff_source", + table2Name: "__diff_target", + ctePrefix, + } +} + +/** + * Inject a CTE prefix into a SQL statement from the engine. + * + * The engine emits standalone SELECT statements. We need to prepend our CTE + * definitions so `__diff_source`/`__diff_target` resolve correctly. 
+ * + * Handles the case where the engine itself emits CTEs (starts with WITH …): + * WITH engine_cte AS (…) SELECT … FROM __diff_source + * becomes: + * WITH __diff_source AS (…), __diff_target AS (…), engine_cte AS (…) SELECT … + */ +export function injectCte(sql: string, ctePrefix: string): string { + const trimmed = sql.trimStart() + const withMatch = trimmed.match(/^WITH\s+/i) + + if (withMatch) { + // Engine also has CTEs — merge them: our CTEs first, then engine CTEs + const afterWith = trimmed.slice(withMatch[0].length) + // ctePrefix already starts with "WITH …" — strip "WITH " and append ", " + const ourDefs = ctePrefix.replace(/^WITH\s+/i, "") + return `WITH ${ourDefs},\n${afterWith}` + } + + // Plain SELECT — just prepend our CTE block + return `${ctePrefix}\n${trimmed}` +} + +// --------------------------------------------------------------------------- +// Executor +// --------------------------------------------------------------------------- + +type Rows = (string | null)[][] + +/** + * Execute a SQL statement against a named warehouse and return rows as string[][]. + */ +async function executeQuery(sql: string, warehouseName: string | undefined): Promise { + let connector + if (warehouseName) { + connector = await Registry.get(warehouseName) + } else { + const warehouses = Registry.list().warehouses + if (warehouses.length === 0) { + throw new Error("No default warehouse configured.") + } + connector = await Registry.get(warehouses[0].name) + } + + const result = await connector.execute(sql) + + // Normalise to string[][] — drivers return mixed types + return result.rows.map((row: unknown[]) => + row.map((v) => (v === null || v === undefined ? 
null : String(v))), + ) +} + +// --------------------------------------------------------------------------- +// Main orchestrator +// --------------------------------------------------------------------------- + +const MAX_STEPS = 200 + +export async function runDataDiff(params: DataDiffParams): Promise { + // Dynamically import NAPI module (not available in test environments without the binary) + let DataParitySession: new (specJson: string) => { + start(): string + step(responsesJson: string): string + } + + try { + const core = await import("@altimateai/altimate-core") + DataParitySession = (core as any).DataParitySession + if (!DataParitySession) throw new Error("DataParitySession not exported from @altimateai/altimate-core") + } catch (e) { + return { + success: false, + error: `altimate-core NAPI module unavailable: ${e}`, + steps: 0, + } + } + + // Resolve sources (plain table names vs arbitrary queries) + const { table1Name, table2Name, ctePrefix } = resolveTableSources( + params.source, + params.target, + ) + + // Parse optional qualified names: "db.schema.table" → { database, schema, table } + const parseQualified = (name: string) => { + const parts = name.split(".") + if (parts.length === 3) return { database: parts[0], schema: parts[1], table: parts[2] } + if (parts.length === 2) return { schema: parts[0], table: parts[1] } + return { table: name } + } + + const table1Ref = parseQualified(table1Name) + const table2Ref = parseQualified(table2Name) + + // Resolve dialect from warehouse config + const resolveDialect = (warehouse: string | undefined): string => { + if (warehouse) { + const cfg = Registry.getConfig(warehouse) + return cfg?.type ?? "generic" + } + const warehouses = Registry.list().warehouses + return warehouses[0]?.type ?? "generic" + } + + const dialect1 = resolveDialect(params.source_warehouse) + const dialect2 = resolveDialect(params.target_warehouse ?? 
params.source_warehouse) + + // Build session spec + const spec = { + table1: table1Ref, + table2: table2Ref, + dialect1, + dialect2, + config: { + algorithm: params.algorithm ?? "auto", + key_columns: params.key_columns, + extra_columns: params.extra_columns ?? [], + ...(params.where_clause ? { where_clause: params.where_clause } : {}), + ...(params.numeric_tolerance != null ? { numeric_tolerance: params.numeric_tolerance } : {}), + ...(params.timestamp_tolerance_ms != null + ? { timestamp_tolerance_ms: params.timestamp_tolerance_ms } + : {}), + }, + } + + // Create session + let session: InstanceType + try { + session = new DataParitySession(JSON.stringify(spec)) + } catch (e) { + return { + success: false, + error: `Failed to create DataParitySession: ${e}`, + steps: 0, + } + } + + // Route SQL tasks to the correct warehouse + const warehouseFor = (tableSide: string): string | undefined => + tableSide === "Table2" ? (params.target_warehouse ?? params.source_warehouse) : params.source_warehouse + + // Cooperative loop + let actionJson = session.start() + let stepCount = 0 + + while (stepCount < MAX_STEPS) { + const action = JSON.parse(actionJson) as { + type: string + tasks?: Array<{ id: string; table_side: string; sql: string; expected_shape: string }> + outcome?: unknown + message?: string + } + + if (action.type === "Done") { + return { + success: true, + steps: stepCount, + outcome: action.outcome, + } + } + + if (action.type === "Error") { + return { + success: false, + error: action.message ?? "Unknown engine error", + steps: stepCount, + } + } + + if (action.type !== "ExecuteSql") { + return { + success: false, + error: `Unexpected action type: ${action.type}`, + steps: stepCount, + } + } + + stepCount++ + + // Execute all SQL tasks in parallel + const tasks = action.tasks ?? 
[] + const responses = await Promise.all( + tasks.map(async (task) => { + const warehouse = warehouseFor(task.table_side) + // Inject CTE definitions if we're in query-comparison mode + const sql = ctePrefix ? injectCte(task.sql, ctePrefix) : task.sql + try { + const rows = await executeQuery(sql, warehouse) + return { id: task.id, rows } + } catch (e) { + // Return error shape — engine will produce an Error action on next step + return { id: task.id, rows: [], error: String(e) } + } + }), + ) + + actionJson = session.step(JSON.stringify(responses)) + } + + return { + success: false, + error: `Exceeded maximum step limit (${MAX_STEPS}). The diff may require more iterations for this table size.`, + steps: stepCount, + } +} diff --git a/packages/opencode/src/altimate/native/connections/register.ts b/packages/opencode/src/altimate/native/connections/register.ts index ef8ac86861..4f2d83086c 100644 --- a/packages/opencode/src/altimate/native/connections/register.ts +++ b/packages/opencode/src/altimate/native/connections/register.ts @@ -10,6 +10,7 @@ import { register } from "../dispatcher" import * as Registry from "./registry" import { discoverContainers } from "./docker-discovery" import { parseDbtProfiles } from "./dbt-profiles" +import { runDataDiff } from "./data-diff" import type { SqlExecuteParams, SqlExecuteResult, @@ -29,6 +30,8 @@ import type { SchemaInspectResult, DbtProfilesParams, DbtProfilesResult, + DataDiffParams, + DataDiffResult, } from "../types" import type { ConnectionConfig } from "@altimateai/drivers" import { Telemetry } from "../../../telemetry" @@ -425,6 +428,11 @@ register("dbt.profiles", async (params: DbtProfilesParams): Promise => { + return runDataDiff(params) +}) + } // end registerAll // Auto-register on module load diff --git a/packages/opencode/src/altimate/native/types.ts b/packages/opencode/src/altimate/native/types.ts index 16a7f4e062..f88482beb3 100644 --- a/packages/opencode/src/altimate/native/types.ts +++ 
b/packages/opencode/src/altimate/native/types.ts @@ -964,6 +964,38 @@ export interface LocalTestResult { error?: string } +// --- Data Diff --- + +export interface DataDiffParams { + /** Source table name (e.g. "orders", "db.schema.orders") or full SQL query */ + source: string + /** Target table name or SQL query */ + target: string + /** Primary key columns that uniquely identify each row */ + key_columns: string[] + /** Source warehouse connection name */ + source_warehouse?: string + /** Target warehouse connection name (defaults to source_warehouse) */ + target_warehouse?: string + /** Extra columns to compare beyond the key */ + extra_columns?: string[] + /** Algorithm: "auto" | "joindiff" | "hashdiff" | "profile" | "cascade" */ + algorithm?: string + /** Optional WHERE filter applied to both tables */ + where_clause?: string + /** Absolute numeric tolerance */ + numeric_tolerance?: number + /** Timestamp tolerance in milliseconds */ + timestamp_tolerance_ms?: number +} + +export interface DataDiffResult { + success: boolean + steps: number + outcome?: unknown + error?: string +} + // --- Method registry --- export const BridgeMethods = { @@ -1007,6 +1039,8 @@ export const BridgeMethods = { // --- local testing --- "local.schema_sync": {} as { params: LocalSchemaSyncParams; result: LocalSchemaSyncResult }, "local.test": {} as { params: LocalTestParams; result: LocalTestResult }, + // --- data diff --- + "data.diff": {} as { params: DataDiffParams; result: DataDiffResult }, // --- altimate-core (existing) --- "altimate_core.validate": {} as { params: AltimateCoreValidateParams; result: AltimateCoreResult }, "altimate_core.lint": {} as { params: AltimateCoreLintParams; result: AltimateCoreResult }, diff --git a/packages/opencode/src/altimate/tools/data-diff.ts b/packages/opencode/src/altimate/tools/data-diff.ts new file mode 100644 index 0000000000..0719361dbe --- /dev/null +++ b/packages/opencode/src/altimate/tools/data-diff.ts @@ -0,0 +1,174 @@ +import z from 
"zod" +import { Tool } from "../../tool/tool" +import { Dispatcher } from "../native" + +export const DataDiffTool = Tool.define("data_diff", { + description: [ + "Compare two database tables or query results row-by-row to find differences.", + "", + "Two use cases:", + "1. Migration validation — compare the same table across two databases:", + ' source="orders" source_warehouse="postgres_prod" target_warehouse="snowflake_dw"', + "2. Query optimization — compare results of two SQL queries on the same database:", + ' source="SELECT id, amount FROM orders WHERE ..." target="SELECT id, amount FROM orders_v2 WHERE ..."', + "", + "Algorithms:", + "- auto: JoinDiff if same dialect, HashDiff if cross-database (default)", + "- joindiff: FULL OUTER JOIN (fast, same-database only)", + "- hashdiff: Bisection with checksums (cross-database, any scale)", + "- profile: Column-level statistics comparison", + ].join("\n"), + parameters: z.object({ + source: z.string().describe( + "Source table name (e.g. 'orders', 'db.schema.orders') or a full SQL query starting with SELECT/WITH", + ), + target: z.string().describe( + "Target table name or SQL query to compare against source", + ), + key_columns: z + .array(z.string()) + .describe("Primary key columns that uniquely identify each row (e.g. ['id'] or ['order_id', 'line_item'])"), + source_warehouse: z.string().optional().describe("Source warehouse connection name"), + target_warehouse: z.string().optional().describe( + "Target warehouse connection name. Omit to use the same warehouse as source (query comparison mode)", + ), + extra_columns: z + .array(z.string()) + .optional() + .describe("Additional columns to compare beyond the key columns. 
Omit to compare all columns"), + algorithm: z + .enum(["auto", "joindiff", "hashdiff", "profile", "cascade"]) + .optional() + .default("auto") + .describe("Comparison algorithm"), + where_clause: z.string().optional().describe("Optional WHERE filter applied to both tables"), + numeric_tolerance: z + .number() + .optional() + .describe("Absolute tolerance for numeric comparisons (e.g. 0.01 for cent-level tolerance)"), + timestamp_tolerance_ms: z + .number() + .optional() + .describe("Tolerance for timestamp comparisons in milliseconds"), + }), + async execute(args, ctx) { + // Require read permission — data diff executes SELECT queries + await ctx.ask({ + permission: "sql_execute_read", + patterns: [args.source.slice(0, 120), args.target.slice(0, 120)], + always: ["*"], + metadata: {}, + }) + + try { + const result = await Dispatcher.call("data.diff", { + source: args.source, + target: args.target, + key_columns: args.key_columns, + source_warehouse: args.source_warehouse, + target_warehouse: args.target_warehouse, + extra_columns: args.extra_columns, + algorithm: args.algorithm, + where_clause: args.where_clause, + numeric_tolerance: args.numeric_tolerance, + timestamp_tolerance_ms: args.timestamp_tolerance_ms, + }) + + if (!result.success) { + return { + title: "Data diff: ERROR", + metadata: { success: false, steps: result.steps }, + output: `Data diff failed: ${result.error}`, + } + } + + const outcome = result.outcome as any + const output = formatOutcome(outcome, args.source, args.target) + + return { + title: `Data diff: ${summarize(outcome)}`, + metadata: { success: true, steps: result.steps }, + output, + } + } catch (e) { + const msg = e instanceof Error ? 
e.message : String(e) + return { + title: "Data diff: ERROR", + metadata: { success: false, steps: 0, error: msg }, + output: `Data diff failed: ${msg}`, + } + } + }, +}) + +function summarize(outcome: any): string { + if (!outcome) return "complete" + if (outcome.Match) return "IDENTICAL ✓" + if (outcome.Diff) { + const r = outcome.Diff + const parts: string[] = [] + if (r.rows_only_in_source > 0) parts.push(`${r.rows_only_in_source} only in source`) + if (r.rows_only_in_target > 0) parts.push(`${r.rows_only_in_target} only in target`) + if (r.rows_updated > 0) parts.push(`${r.rows_updated} updated`) + return parts.length ? parts.join(", ") : "differences found" + } + if (outcome.Profile) return "profile complete" + return "complete" +} + +function formatOutcome(outcome: any, source: string, target: string): string { + if (!outcome) return "Comparison complete." + + const lines: string[] = [] + + if (outcome.Match) { + lines.push(`✓ Tables are IDENTICAL`) + const m = outcome.Match + if (m.row_count != null) lines.push(` Rows checked: ${m.row_count.toLocaleString()}`) + if (m.algorithm) lines.push(` Algorithm: ${m.algorithm}`) + return lines.join("\n") + } + + if (outcome.Diff) { + const r = outcome.Diff + lines.push(`✗ Tables DIFFER`) + lines.push(``) + lines.push(` Source: ${source}`) + lines.push(` Target: ${target}`) + lines.push(``) + + if (r.total_source_rows != null) lines.push(` Source rows: ${r.total_source_rows.toLocaleString()}`) + if (r.total_target_rows != null) lines.push(` Target rows: ${r.total_target_rows.toLocaleString()}`) + if (r.rows_only_in_source > 0) lines.push(` Only in source: ${r.rows_only_in_source.toLocaleString()}`) + if (r.rows_only_in_target > 0) lines.push(` Only in target: ${r.rows_only_in_target.toLocaleString()}`) + if (r.rows_updated > 0) lines.push(` Updated rows: ${r.rows_updated.toLocaleString()}`) + if (r.rows_identical > 0) lines.push(` Identical rows: ${r.rows_identical.toLocaleString()}`) + + if (r.sample_diffs?.length) { 
+ lines.push(``) + lines.push(` Sample differences (first ${r.sample_diffs.length}):`) + for (const d of r.sample_diffs.slice(0, 5)) { + lines.push(` key=${JSON.stringify(d.key)} col=${d.column}: ${d.source_value} → ${d.target_value}`) + } + } + + return lines.join("\n") + } + + if (outcome.Profile) { + const p = outcome.Profile + lines.push(`Column Profile Comparison`) + lines.push(``) + for (const col of p.columns ?? []) { + const verdict = col.verdict === "match" ? "✓" : col.verdict === "within_tolerance" ? "~" : "✗" + lines.push(` ${verdict} ${col.column}: ${col.verdict}`) + if (col.source_stats && col.target_stats) { + lines.push(` source: count=${col.source_stats.count} nulls=${col.source_stats.null_count} min=${col.source_stats.min} max=${col.source_stats.max}`) + lines.push(` target: count=${col.target_stats.count} nulls=${col.target_stats.null_count} min=${col.target_stats.min} max=${col.target_stats.max}`) + } + } + return lines.join("\n") + } + + return JSON.stringify(outcome, null, 2) +} diff --git a/packages/opencode/src/tool/registry.ts b/packages/opencode/src/tool/registry.ts index 075291248f..e5fc1bf9c9 100644 --- a/packages/opencode/src/tool/registry.ts +++ b/packages/opencode/src/tool/registry.ts @@ -57,6 +57,7 @@ import { SqlFormatTool } from "../altimate/tools/sql-format" import { SqlFixTool } from "../altimate/tools/sql-fix" import { SqlAutocompleteTool } from "../altimate/tools/sql-autocomplete" import { SqlDiffTool } from "../altimate/tools/sql-diff" +import { DataDiffTool } from "../altimate/tools/data-diff" import { FinopsQueryHistoryTool } from "../altimate/tools/finops-query-history" import { FinopsAnalyzeCreditsTool } from "../altimate/tools/finops-analyze-credits" import { FinopsExpensiveQueriesTool } from "../altimate/tools/finops-expensive-queries" @@ -233,6 +234,7 @@ export namespace ToolRegistry { SqlFixTool, SqlAutocompleteTool, SqlDiffTool, + DataDiffTool, FinopsQueryHistoryTool, FinopsAnalyzeCreditsTool, 
FinopsExpensiveQueriesTool, From 44d76689cef4342fdd846c1990628c8073ebe682 Mon Sep 17 00:00:00 2001 From: suryaiyer95 Date: Thu, 26 Mar 2026 18:21:06 -0700 Subject: [PATCH 02/20] feat: add partition support to data_diff Split large tables by a date or numeric column before diffing. Each partition is diffed independently then results are aggregated. New params: - partition_column: column to split on (date or numeric) - partition_granularity: day | week | month | year (for dates) - partition_bucket_size: bucket width for numeric columns New output field: - partition_results: per-partition breakdown (identical / differ / error) Dialect-aware SQL: Postgres, Snowflake, BigQuery, ClickHouse, MySQL. Skill updated with partition guidance and examples. --- .opencode/skills/data-parity/SKILL.md | 30 ++- .../altimate/native/connections/data-diff.ts | 233 +++++++++++++++++- .../opencode/src/altimate/native/types.ts | 35 +++ .../opencode/src/altimate/tools/data-diff.ts | 54 +++- 4 files changed, 348 insertions(+), 4 deletions(-) diff --git a/.opencode/skills/data-parity/SKILL.md b/.opencode/skills/data-parity/SKILL.md index 4d7b7460c9..3f739eda4b 100644 --- a/.opencode/skills/data-parity/SKILL.md +++ b/.opencode/skills/data-parity/SKILL.md @@ -44,6 +44,9 @@ description: Validate that two tables or query results are identical — or diag - `extra_columns` — columns to compare beyond keys (omit = compare all) - `algorithm` — `auto`, `joindiff`, `hashdiff`, `profile`, `cascade` - `where_clause` — filter applied to both tables +- `partition_column` — split the table by this column and diff each group independently (recommended for large tables) +- `partition_granularity` — `day` | `week` | `month` | `year` for date columns (default: `month`) +- `partition_bucket_size` — for numeric columns: bucket width (e.g. `100000` splits by ranges of 100K) > **CRITICAL — Algorithm choice:** > - If `source_warehouse` ≠ `target_warehouse` → **always use `hashdiff`** (or `auto`). 
@@ -117,8 +120,31 @@ SELECT COUNT(*) FROM orders Use this to choose the algorithm: - **< 1M rows**: `joindiff` (same DB) or `hashdiff` (cross-DB) — either is fine -- **1M–100M rows**: `hashdiff` or `cascade` -- **> 100M rows**: `hashdiff` with a `where_clause` date filter to validate a recent window first +- **1M–100M rows**: `hashdiff` with `partition_column` for faster, more precise results +- **> 100M rows**: `hashdiff` + `partition_column` — required; bisection alone may miss rows at this scale + +**When to use `partition_column`:** +- Table has a natural time or key column (e.g. `created_at`, `order_id`, `event_date`) +- Table has > 500K rows and bisection is slow or returning incomplete results +- You need per-partition visibility (which month/range has the problem) + +``` +// Date column — partition by month +data_diff(source="lineitem", target="lineitem", + key_columns=["l_orderkey", "l_linenumber"], + source_warehouse="pg_source", target_warehouse="pg_target", + partition_column="l_shipdate", partition_granularity="month", + algorithm="hashdiff") + +// Numeric column — partition by key ranges of 100K +data_diff(source="orders", target="orders", + key_columns=["o_orderkey"], + source_warehouse="pg_source", target_warehouse="pg_target", + partition_column="o_orderkey", partition_bucket_size=100000, + algorithm="hashdiff") +``` + +Output includes an aggregate diff plus a per-partition table showing exactly which ranges differ. ### Step 4: Profile first for unknown tables diff --git a/packages/opencode/src/altimate/native/connections/data-diff.ts b/packages/opencode/src/altimate/native/connections/data-diff.ts index 035df6b4ca..fe1c926f92 100644 --- a/packages/opencode/src/altimate/native/connections/data-diff.ts +++ b/packages/opencode/src/altimate/native/connections/data-diff.ts @@ -7,7 +7,7 @@ * This file is the bridge between that engine and altimate-code's drivers. 
*/ -import type { DataDiffParams, DataDiffResult } from "../types" +import type { DataDiffParams, DataDiffResult, PartitionDiffResult } from "../types" import * as Registry from "./registry" // --------------------------------------------------------------------------- @@ -119,7 +119,238 @@ async function executeQuery(sql: string, warehouseName: string | undefined): Pro const MAX_STEPS = 200 +// --------------------------------------------------------------------------- +// Partition support +// --------------------------------------------------------------------------- + +/** + * Build a DATE_TRUNC expression appropriate for the warehouse dialect. + */ +function dateTruncExpr(granularity: string, column: string, dialect: string): string { + const g = granularity.toLowerCase() + switch (dialect) { + case "bigquery": + return `DATE_TRUNC(${column}, ${g.toUpperCase()})` + case "clickhouse": + return `toStartOf${g.charAt(0).toUpperCase() + g.slice(1)}(${column})` + case "mysql": + case "mariadb": { + const fmt = { day: "%Y-%m-%d", week: "%Y-%u", month: "%Y-%m-01", year: "%Y-01-01" }[g] ?? "%Y-%m-01" + return `DATE_FORMAT(${column}, '${fmt}')` + } + default: + // Postgres, Snowflake, Redshift, DuckDB, etc. + return `DATE_TRUNC('${g}', ${column})` + } +} + +/** + * Build SQL to discover distinct partition values from the source table. + */ +function buildPartitionDiscoverySQL( + table: string, + partitionColumn: string, + granularity: string | undefined, + bucketSize: number | undefined, + dialect: string, + whereClause?: string, +): string { + const isNumeric = bucketSize != null + + let expr: string + if (isNumeric) { + expr = `FLOOR(${partitionColumn} / ${bucketSize}) * ${bucketSize}` + } else { + expr = dateTruncExpr(granularity ?? "month", partitionColumn, dialect) + } + + const where = whereClause ? 
`WHERE ${whereClause}` : "" + return `SELECT DISTINCT ${expr} AS _p FROM ${table} ${where} ORDER BY _p` +} + +/** + * Build a WHERE clause that scopes to a single partition. + */ +function buildPartitionWhereClause( + partitionColumn: string, + partitionValue: string, + granularity: string | undefined, + bucketSize: number | undefined, + dialect: string, +): string { + if (bucketSize != null) { + const lo = Number(partitionValue) + const hi = lo + bucketSize + return `${partitionColumn} >= ${lo} AND ${partitionColumn} < ${hi}` + } + + const expr = dateTruncExpr(granularity ?? "month", partitionColumn, dialect) + + // Cast the literal appropriately per dialect + switch (dialect) { + case "bigquery": + return `${expr} = '${partitionValue}'` + case "clickhouse": + return `${expr} = toDate('${partitionValue}')` + case "mysql": + case "mariadb": + return `${expr} = '${partitionValue}'` + default: + return `${expr} = '${partitionValue}'` + } +} + +/** + * Extract DiffStats from a successful outcome (if present). + */ +function extractStats(outcome: unknown): { + rows_source: number + rows_target: number + differences: number + status: "identical" | "differ" +} { + const o = outcome as any + if (!o) return { rows_source: 0, rows_target: 0, differences: 0, status: "identical" } + + if (o.Match) { + return { + rows_source: o.Match.row_count ?? 0, + rows_target: o.Match.row_count ?? 0, + differences: 0, + status: "identical", + } + } + + if (o.Diff) { + const d = o.Diff + return { + rows_source: d.total_source_rows ?? 0, + rows_target: d.total_target_rows ?? 0, + differences: (d.rows_only_in_source ?? 0) + (d.rows_only_in_target ?? 0) + (d.rows_updated ?? 0), + status: "differ", + } + } + + return { rows_source: 0, rows_target: 0, differences: 0, status: "identical" } +} + +/** + * Merge two Diff outcomes into one aggregated Diff outcome. 
+ */ +function mergeOutcomes(accumulated: unknown, next: unknown): unknown { + const a = accumulated as any + const n = next as any + + const aD = a?.Diff ?? (a?.Match ? { total_source_rows: a.Match.row_count, total_target_rows: a.Match.row_count, rows_only_in_source: 0, rows_only_in_target: 0, rows_updated: 0, rows_identical: a.Match.row_count, sample_diffs: [] } : null) + const nD = n?.Diff ?? (n?.Match ? { total_source_rows: n.Match.row_count, total_target_rows: n.Match.row_count, rows_only_in_source: 0, rows_only_in_target: 0, rows_updated: 0, rows_identical: n.Match.row_count, sample_diffs: [] } : null) + + if (!aD && !nD) return { Match: { row_count: 0 } } + if (!aD) return next + if (!nD) return accumulated + + const merged = { + total_source_rows: (aD.total_source_rows ?? 0) + (nD.total_source_rows ?? 0), + total_target_rows: (aD.total_target_rows ?? 0) + (nD.total_target_rows ?? 0), + rows_only_in_source: (aD.rows_only_in_source ?? 0) + (nD.rows_only_in_source ?? 0), + rows_only_in_target: (aD.rows_only_in_target ?? 0) + (nD.rows_only_in_target ?? 0), + rows_updated: (aD.rows_updated ?? 0) + (nD.rows_updated ?? 0), + rows_identical: (aD.rows_identical ?? 0) + (nD.rows_identical ?? 0), + sample_diffs: [...(aD.sample_diffs ?? []), ...(nD.sample_diffs ?? [])].slice(0, 20), + } + + const totalDiff = merged.rows_only_in_source + merged.rows_only_in_target + merged.rows_updated + if (totalDiff === 0) { + return { Match: { row_count: merged.total_source_rows, algorithm: "partitioned" } } + } + return { Diff: merged } +} + +/** + * Run a partitioned diff: discover partition values, diff each partition independently, + * then aggregate results. + */ +async function runPartitionedDiff(params: DataDiffParams): Promise { + const resolveDialect = (warehouse: string | undefined): string => { + if (warehouse) { + const cfg = Registry.getConfig(warehouse) + return cfg?.type ?? "generic" + } + const warehouses = Registry.list().warehouses + return warehouses[0]?.type ?? 
"generic" + } + + const sourceDialect = resolveDialect(params.source_warehouse) + const { table1Name } = resolveTableSources(params.source, params.target) + + // Discover partition values from source + const discoverySql = buildPartitionDiscoverySQL( + table1Name, + params.partition_column!, + params.partition_granularity, + params.partition_bucket_size, + sourceDialect, + params.where_clause, + ) + + let partitionValues: string[] + try { + const rows = await executeQuery(discoverySql, params.source_warehouse) + partitionValues = rows.map((r) => String(r[0] ?? "")).filter(Boolean) + } catch (e) { + return { success: false, error: `Partition discovery failed: ${e}`, steps: 0 } + } + + if (partitionValues.length === 0) { + return { success: true, steps: 1, outcome: { Match: { row_count: 0, algorithm: "partitioned" } }, partition_results: [] } + } + + // Diff each partition + const partitionResults: PartitionDiffResult[] = [] + let aggregatedOutcome: unknown = null + let totalSteps = 1 + + for (const pVal of partitionValues) { + const partWhere = buildPartitionWhereClause( + params.partition_column!, + pVal, + params.partition_granularity, + params.partition_bucket_size, + sourceDialect, + ) + const fullWhere = params.where_clause ? `(${params.where_clause}) AND (${partWhere})` : partWhere + + const result = await runDataDiff({ + ...params, + where_clause: fullWhere, + partition_column: undefined, // prevent recursion + }) + + totalSteps += result.steps + + if (!result.success) { + partitionResults.push({ partition: pVal, rows_source: 0, rows_target: 0, differences: 0, status: "error", error: result.error }) + continue + } + + const stats = extractStats(result.outcome) + partitionResults.push({ partition: pVal, ...stats }) + aggregatedOutcome = aggregatedOutcome == null ? result.outcome : mergeOutcomes(aggregatedOutcome, result.outcome) + } + + return { + success: true, + steps: totalSteps, + outcome: aggregatedOutcome ?? 
{ Match: { row_count: 0, algorithm: "partitioned" } }, + partition_results: partitionResults, + } +} + export async function runDataDiff(params: DataDiffParams): Promise { + // Dispatch to partitioned diff if partition_column is set + if (params.partition_column) { + return runPartitionedDiff(params) + } + // Dynamically import NAPI module (not available in test environments without the binary) let DataParitySession: new (specJson: string) => { start(): string diff --git a/packages/opencode/src/altimate/native/types.ts b/packages/opencode/src/altimate/native/types.ts index f88482beb3..c5074d7b98 100644 --- a/packages/opencode/src/altimate/native/types.ts +++ b/packages/opencode/src/altimate/native/types.ts @@ -987,6 +987,39 @@ export interface DataDiffParams { numeric_tolerance?: number /** Timestamp tolerance in milliseconds */ timestamp_tolerance_ms?: number + /** + * Column to partition on before diffing. The table is split into groups by + * this column and each group is diffed independently. Results are aggregated. + * Use for large tables where bisection alone is too slow or imprecise. + * + * Examples: "l_shipdate" (date column), "l_orderkey" (numeric column) + */ + partition_column?: string + /** + * Granularity for date partition columns: "day" | "week" | "month" | "year". + * For numeric columns, ignored — use partition_bucket_size instead. + * Defaults to "month". + */ + partition_granularity?: "day" | "week" | "month" | "year" + /** + * For numeric partition columns: size of each bucket. + * E.g. 
100000 splits l_orderkey into [0, 100000), [100000, 200000), … + */ + partition_bucket_size?: number +} + +export interface PartitionDiffResult { + /** The partition value (date string or numeric bucket start) */ + partition: string + /** Source row count in this partition */ + rows_source: number + /** Target row count in this partition */ + rows_target: number + /** Total differences found (exclusive + updated) */ + differences: number + /** "identical" | "differ" | "error" */ + status: "identical" | "differ" | "error" + error?: string } export interface DataDiffResult { @@ -994,6 +1027,8 @@ export interface DataDiffResult { steps: number outcome?: unknown error?: string + /** Per-partition breakdown when partition_column is used */ + partition_results?: PartitionDiffResult[] } // --- Method registry --- diff --git a/packages/opencode/src/altimate/tools/data-diff.ts b/packages/opencode/src/altimate/tools/data-diff.ts index 0719361dbe..767921e2e8 100644 --- a/packages/opencode/src/altimate/tools/data-diff.ts +++ b/packages/opencode/src/altimate/tools/data-diff.ts @@ -50,6 +50,23 @@ export const DataDiffTool = Tool.define("data_diff", { .number() .optional() .describe("Tolerance for timestamp comparisons in milliseconds"), + partition_column: z + .string() + .optional() + .describe( + "Column to partition on before diffing. Splits the table into groups and diffs each independently. " + + "Use for large tables to get faster, more precise results. " + + "Examples: 'l_shipdate' (date), 'l_orderkey' (numeric). " + + "Results are aggregated with a per-partition breakdown showing which groups have differences.", + ), + partition_granularity: z + .enum(["day", "week", "month", "year"]) + .optional() + .describe("Granularity for date partition columns. Defaults to 'month'."), + partition_bucket_size: z + .number() + .optional() + .describe("For numeric partition columns: size of each bucket. E.g. 
100000 splits orders into ranges of 100K keys."), }), async execute(args, ctx) { // Require read permission — data diff executes SELECT queries @@ -72,6 +89,9 @@ export const DataDiffTool = Tool.define("data_diff", { where_clause: args.where_clause, numeric_tolerance: args.numeric_tolerance, timestamp_tolerance_ms: args.timestamp_tolerance_ms, + partition_column: args.partition_column, + partition_granularity: args.partition_granularity, + partition_bucket_size: args.partition_bucket_size, }) if (!result.success) { @@ -83,7 +103,11 @@ export const DataDiffTool = Tool.define("data_diff", { } const outcome = result.outcome as any - const output = formatOutcome(outcome, args.source, args.target) + let output = formatOutcome(outcome, args.source, args.target) + + if (result.partition_results?.length) { + output += formatPartitionResults(result.partition_results, args.partition_column!) + } return { title: `Data diff: ${summarize(outcome)}`, @@ -172,3 +196,31 @@ function formatOutcome(outcome: any, source: string, target: string): string { return JSON.stringify(outcome, null, 2) } + +function formatPartitionResults( + partitions: Array<{ partition: string; rows_source: number; rows_target: number; differences: number; status: string; error?: string }>, + partitionColumn: string, +): string { + const lines: string[] = ["", `Partition breakdown (by ${partitionColumn}):`] + + const clean = partitions.filter((p) => p.status === "identical") + const dirty = partitions.filter((p) => p.status === "differ") + const errored = partitions.filter((p) => p.status === "error") + + if (dirty.length === 0 && errored.length === 0) { + lines.push(` ✓ All ${partitions.length} partitions identical`) + return lines.join("\n") + } + + for (const p of dirty) { + lines.push(` ✗ ${p.partition} source=${p.rows_source.toLocaleString()} target=${p.rows_target.toLocaleString()} diff=${p.differences.toLocaleString()}`) + } + for (const p of errored) { + lines.push(` ! 
${p.partition} ERROR: ${p.error}`) + } + if (clean.length > 0) { + lines.push(` ✓ ${clean.length} partition${clean.length === 1 ? "" : "s"} identical`) + } + + return lines.join("\n") +} From e177f2d01bfa4ba069f563c854c9dddb488a66e2 Mon Sep 17 00:00:00 2001 From: suryaiyer95 Date: Thu, 26 Mar 2026 18:23:05 -0700 Subject: [PATCH 03/20] feat: add categorical partition mode (string, enum, boolean) When partition_column is set without partition_granularity or partition_bucket_size, groups by raw DISTINCT values. Works for any non-date, non-numeric column: status, region, country, etc. WHERE clause uses equality: col = 'value' with proper escaping. --- .opencode/skills/data-parity/SKILL.md | 16 ++++++-- .../altimate/native/connections/data-diff.ts | 41 +++++++++++++++---- .../opencode/src/altimate/tools/data-diff.ts | 10 +++-- 3 files changed, 53 insertions(+), 14 deletions(-) diff --git a/.opencode/skills/data-parity/SKILL.md b/.opencode/skills/data-parity/SKILL.md index 3f739eda4b..4d47be8036 100644 --- a/.opencode/skills/data-parity/SKILL.md +++ b/.opencode/skills/data-parity/SKILL.md @@ -44,9 +44,12 @@ description: Validate that two tables or query results are identical — or diag - `extra_columns` — columns to compare beyond keys (omit = compare all) - `algorithm` — `auto`, `joindiff`, `hashdiff`, `profile`, `cascade` - `where_clause` — filter applied to both tables -- `partition_column` — split the table by this column and diff each group independently (recommended for large tables) -- `partition_granularity` — `day` | `week` | `month` | `year` for date columns (default: `month`) -- `partition_bucket_size` — for numeric columns: bucket width (e.g. 
`100000` splits by ranges of 100K) +- `partition_column` — split the table by this column and diff each group independently (recommended for large tables); three modes: + - **Date column**: set `partition_granularity` → groups by truncated date periods + - **Numeric column**: set `partition_bucket_size` → groups by equal-width key ranges + - **Categorical column**: set neither → groups by distinct values (strings, enums, booleans like `status`, `region`, `country`) +- `partition_granularity` — `day` | `week` | `month` | `year` — only for date columns +- `partition_bucket_size` — bucket width for numeric columns (e.g. `100000`) > **CRITICAL — Algorithm choice:** > - If `source_warehouse` ≠ `target_warehouse` → **always use `hashdiff`** (or `auto`). @@ -142,6 +145,13 @@ data_diff(source="orders", target="orders", source_warehouse="pg_source", target_warehouse="pg_target", partition_column="o_orderkey", partition_bucket_size=100000, algorithm="hashdiff") + +// Categorical column — partition by distinct status values ('O', 'F', 'P') +data_diff(source="orders", target="orders", + key_columns=["o_orderkey"], + source_warehouse="pg_source", target_warehouse="pg_target", + partition_column="o_orderstatus", // no granularity or bucket_size needed + algorithm="hashdiff") ``` Output includes an aggregate diff plus a per-partition table showing exactly which ranges differ. diff --git a/packages/opencode/src/altimate/native/connections/data-diff.ts b/packages/opencode/src/altimate/native/connections/data-diff.ts index fe1c926f92..98609b744a 100644 --- a/packages/opencode/src/altimate/native/connections/data-diff.ts +++ b/packages/opencode/src/altimate/native/connections/data-diff.ts @@ -144,6 +144,21 @@ function dateTruncExpr(granularity: string, column: string, dialect: string): st } } +/** + * Determine the partition mode based on which params are provided. 
+ * - "date" → partition_granularity is set
+ * - "numeric" → partition_bucket_size is set
+ * - "categorical" → neither — use DISTINCT values directly (string, enum, boolean)
+ */
+function partitionMode(
+  granularity: string | undefined,
+  bucketSize: number | undefined,
+): "date" | "numeric" | "categorical" {
+  if (bucketSize != null) return "numeric"
+  if (granularity != null) return "date"
+  return "categorical"
+}
+
 /**
  * Build SQL to discover distinct partition values from the source table.
  */
@@ -155,16 +170,19 @@ function buildPartitionDiscoverySQL(
   dialect: string,
   whereClause?: string,
 ): string {
-  const isNumeric = bucketSize != null
+  const where = whereClause ? `WHERE ${whereClause}` : ""
+  const mode = partitionMode(granularity, bucketSize)
 
   let expr: string
-  if (isNumeric) {
+  if (mode === "numeric") {
     expr = `FLOOR(${partitionColumn} / ${bucketSize}) * ${bucketSize}`
+  } else if (mode === "date") {
+    expr = dateTruncExpr(granularity!, partitionColumn, dialect)
   } else {
-    expr = dateTruncExpr(granularity ?? "month", partitionColumn, dialect)
+    // categorical — raw distinct values, no transformation
+    expr = partitionColumn
   }
 
-  const where = whereClause ? `WHERE ${whereClause}` : ""
   return `SELECT DISTINCT ${expr} AS _p FROM ${table} ${where} ORDER BY _p`
 }
 
@@ -178,13 +196,22 @@ function buildPartitionWhereClause(
   bucketSize: number | undefined,
   dialect: string,
 ): string {
-  if (bucketSize != null) {
+  const mode = partitionMode(granularity, bucketSize)
+
+  if (mode === "numeric") {
     const lo = Number(partitionValue)
-    const hi = lo + bucketSize
+    const hi = lo + bucketSize!
     return `${partitionColumn} >= ${lo} AND ${partitionColumn} < ${hi}`
   }
 
-  const expr = dateTruncExpr(granularity ?? 
"month", partitionColumn, dialect) + if (mode === "categorical") { + // Quote the value — works for strings, enums, booleans + const escaped = partitionValue.replace(/'/g, "''") + return `${partitionColumn} = '${escaped}'` + } + + // date mode + const expr = dateTruncExpr(granularity!, partitionColumn, dialect) // Cast the literal appropriately per dialect switch (dialect) { diff --git a/packages/opencode/src/altimate/tools/data-diff.ts b/packages/opencode/src/altimate/tools/data-diff.ts index 767921e2e8..fc56e0da6d 100644 --- a/packages/opencode/src/altimate/tools/data-diff.ts +++ b/packages/opencode/src/altimate/tools/data-diff.ts @@ -55,18 +55,20 @@ export const DataDiffTool = Tool.define("data_diff", { .optional() .describe( "Column to partition on before diffing. Splits the table into groups and diffs each independently. " + - "Use for large tables to get faster, more precise results. " + - "Examples: 'l_shipdate' (date), 'l_orderkey' (numeric). " + + "Three modes depending on which other params you set:\n" + + " • Date column → set partition_granularity (day/week/month/year). E.g. partition_column='l_shipdate', partition_granularity='month'\n" + + " • Numeric column → set partition_bucket_size. E.g. partition_column='l_orderkey', partition_bucket_size=100000\n" + + " • Categorical → set neither. Works for string/enum/boolean columns like 'status', 'region', 'country'. Groups by distinct values.\n" + "Results are aggregated with a per-partition breakdown showing which groups have differences.", ), partition_granularity: z .enum(["day", "week", "month", "year"]) .optional() - .describe("Granularity for date partition columns. Defaults to 'month'."), + .describe("For date partition columns: truncation granularity. Omit for numeric or categorical columns."), partition_bucket_size: z .number() .optional() - .describe("For numeric partition columns: size of each bucket. E.g. 
100000 splits orders into ranges of 100K keys."), + .describe("For numeric partition columns: size of each bucket. E.g. 100000 splits l_orderkey into ranges of 100K. Omit for date or categorical columns."), }), async execute(args, ctx) { // Require read permission — data diff executes SELECT queries From d1cc9325d24153821a92e315f40ec9c8446c92d1 Mon Sep 17 00:00:00 2001 From: suryaiyer95 Date: Thu, 26 Mar 2026 18:41:15 -0700 Subject: [PATCH 04/20] fix: correct outcome shape handling in extractStats and formatOutcome Rust serializes ReladiffOutcome with serde tag 'mode', producing: {mode: 'diff', diff_rows: [...], stats: {rows_table1, rows_table2, exclusive_table1, exclusive_table2, updated, unchanged}} Previous code checked for {Match: {...}} / {Diff: {...}} shapes that never matched, causing partitioned diff to report all partitions as 'identical' with 0 rows. - extractStats(): check outcome.mode === 'diff', read from stats fields - mergeOutcomes(): aggregate mode-based outcomes correctly - summarize()/formatOutcome(): display mode-based shape with correct labels --- .../altimate/native/connections/data-diff.ts | 72 +++++++++--------- .../opencode/src/altimate/tools/data-diff.ts | 76 +++++++++++-------- 2 files changed, 82 insertions(+), 66 deletions(-) diff --git a/packages/opencode/src/altimate/native/connections/data-diff.ts b/packages/opencode/src/altimate/native/connections/data-diff.ts index 98609b744a..6c4f2e7a61 100644 --- a/packages/opencode/src/altimate/native/connections/data-diff.ts +++ b/packages/opencode/src/altimate/native/connections/data-diff.ts @@ -229,6 +229,9 @@ function buildPartitionWhereClause( /** * Extract DiffStats from a successful outcome (if present). 
+ * + * Rust serializes ReladiffOutcome as: {mode: "diff", diff_rows: [...], stats: {...}} + * stats fields: rows_table1, rows_table2, exclusive_table1, exclusive_table2, updated, unchanged */ function extractStats(outcome: unknown): { rows_source: number @@ -239,22 +242,17 @@ function extractStats(outcome: unknown): { const o = outcome as any if (!o) return { rows_source: 0, rows_target: 0, differences: 0, status: "identical" } - if (o.Match) { - return { - rows_source: o.Match.row_count ?? 0, - rows_target: o.Match.row_count ?? 0, - differences: 0, - status: "identical", - } - } - - if (o.Diff) { - const d = o.Diff + if (o.mode === "diff") { + const s = o.stats ?? {} + const exclusive1 = Number(s.exclusive_table1 ?? 0) + const exclusive2 = Number(s.exclusive_table2 ?? 0) + const updated = Number(s.updated ?? 0) + const differences = exclusive1 + exclusive2 + updated return { - rows_source: d.total_source_rows ?? 0, - rows_target: d.total_target_rows ?? 0, - differences: (d.rows_only_in_source ?? 0) + (d.rows_only_in_target ?? 0) + (d.rows_updated ?? 0), - status: "differ", + rows_source: Number(s.rows_table1 ?? 0), + rows_target: Number(s.rows_table2 ?? 0), + differences, + status: differences > 0 ? "differ" : "identical", } } @@ -262,34 +260,36 @@ function extractStats(outcome: unknown): { } /** - * Merge two Diff outcomes into one aggregated Diff outcome. + * Merge two diff outcomes into one aggregated outcome. + * + * Both outcomes use the Rust shape: {mode: "diff", diff_rows: [...], stats: {...}} */ function mergeOutcomes(accumulated: unknown, next: unknown): unknown { + if (!accumulated) return next + if (!next) return accumulated + const a = accumulated as any const n = next as any - const aD = a?.Diff ?? (a?.Match ? { total_source_rows: a.Match.row_count, total_target_rows: a.Match.row_count, rows_only_in_source: 0, rows_only_in_target: 0, rows_updated: 0, rows_identical: a.Match.row_count, sample_diffs: [] } : null) - const nD = n?.Diff ?? (n?.Match ? 
{ total_source_rows: n.Match.row_count, total_target_rows: n.Match.row_count, rows_only_in_source: 0, rows_only_in_target: 0, rows_updated: 0, rows_identical: n.Match.row_count, sample_diffs: [] } : null) - - if (!aD && !nD) return { Match: { row_count: 0 } } - if (!aD) return next - if (!nD) return accumulated - - const merged = { - total_source_rows: (aD.total_source_rows ?? 0) + (nD.total_source_rows ?? 0), - total_target_rows: (aD.total_target_rows ?? 0) + (nD.total_target_rows ?? 0), - rows_only_in_source: (aD.rows_only_in_source ?? 0) + (nD.rows_only_in_source ?? 0), - rows_only_in_target: (aD.rows_only_in_target ?? 0) + (nD.rows_only_in_target ?? 0), - rows_updated: (aD.rows_updated ?? 0) + (nD.rows_updated ?? 0), - rows_identical: (aD.rows_identical ?? 0) + (nD.rows_identical ?? 0), - sample_diffs: [...(aD.sample_diffs ?? []), ...(nD.sample_diffs ?? [])].slice(0, 20), - } + const aS = a.stats ?? {} + const nS = n.stats ?? {} + + const rows_table1 = (Number(aS.rows_table1) || 0) + (Number(nS.rows_table1) || 0) + const rows_table2 = (Number(aS.rows_table2) || 0) + (Number(nS.rows_table2) || 0) + const exclusive_table1 = (Number(aS.exclusive_table1) || 0) + (Number(nS.exclusive_table1) || 0) + const exclusive_table2 = (Number(aS.exclusive_table2) || 0) + (Number(nS.exclusive_table2) || 0) + const updated = (Number(aS.updated) || 0) + (Number(nS.updated) || 0) + const unchanged = (Number(aS.unchanged) || 0) + (Number(nS.unchanged) || 0) - const totalDiff = merged.rows_only_in_source + merged.rows_only_in_target + merged.rows_updated - if (totalDiff === 0) { - return { Match: { row_count: merged.total_source_rows, algorithm: "partitioned" } } + const totalRows = rows_table1 + rows_table2 + const totalDiff = exclusive_table1 + exclusive_table2 + updated + const diff_percent = totalRows > 0 ? (totalDiff / totalRows) * 100 : 0 + + return { + mode: "diff", + diff_rows: [...(a.diff_rows ?? []), ...(n.diff_rows ?? 
[])].slice(0, 100), + stats: { rows_table1, rows_table2, exclusive_table1, exclusive_table2, updated, unchanged, diff_percent }, } - return { Diff: merged } } /** diff --git a/packages/opencode/src/altimate/tools/data-diff.ts b/packages/opencode/src/altimate/tools/data-diff.ts index fc56e0da6d..d498eefe7e 100644 --- a/packages/opencode/src/altimate/tools/data-diff.ts +++ b/packages/opencode/src/altimate/tools/data-diff.ts @@ -129,16 +129,23 @@ export const DataDiffTool = Tool.define("data_diff", { function summarize(outcome: any): string { if (!outcome) return "complete" - if (outcome.Match) return "IDENTICAL ✓" - if (outcome.Diff) { - const r = outcome.Diff + + // Rust serializes ReladiffOutcome as {mode: "diff"|"profile"|..., stats: {...}, diff_rows: [...]} + if (outcome.mode === "diff") { + const s = outcome.stats ?? {} + const e1 = Number(s.exclusive_table1 ?? 0) + const e2 = Number(s.exclusive_table2 ?? 0) + const upd = Number(s.updated ?? 0) + if (e1 === 0 && e2 === 0 && upd === 0) return "IDENTICAL ✓" const parts: string[] = [] - if (r.rows_only_in_source > 0) parts.push(`${r.rows_only_in_source} only in source`) - if (r.rows_only_in_target > 0) parts.push(`${r.rows_only_in_target} only in target`) - if (r.rows_updated > 0) parts.push(`${r.rows_updated} updated`) - return parts.length ? 
parts.join(", ") : "differences found" + if (e1 > 0) parts.push(`${e1} only in source`) + if (e2 > 0) parts.push(`${e2} only in target`) + if (upd > 0) parts.push(`${upd} updated`) + return parts.join(", ") } - if (outcome.Profile) return "profile complete" + if (outcome.mode === "profile") return "profile complete" + if (outcome.mode === "cascade") return "cascade complete" + return "complete" } @@ -147,45 +154,54 @@ function formatOutcome(outcome: any, source: string, target: string): string { const lines: string[] = [] - if (outcome.Match) { - lines.push(`✓ Tables are IDENTICAL`) - const m = outcome.Match - if (m.row_count != null) lines.push(` Rows checked: ${m.row_count.toLocaleString()}`) - if (m.algorithm) lines.push(` Algorithm: ${m.algorithm}`) - return lines.join("\n") - } + // Rust serializes ReladiffOutcome as {mode: "diff", diff_rows: [...], stats: {...}} + // stats: rows_table1, rows_table2, exclusive_table1, exclusive_table2, updated, unchanged + if (outcome.mode === "diff") { + const s = outcome.stats ?? {} + const rows1 = Number(s.rows_table1 ?? 0) + const rows2 = Number(s.rows_table2 ?? 0) + const e1 = Number(s.exclusive_table1 ?? 0) + const e2 = Number(s.exclusive_table2 ?? 0) + const updated = Number(s.updated ?? 0) + const unchanged = Number(s.unchanged ?? 
0) + + if (e1 === 0 && e2 === 0 && updated === 0) { + lines.push(`✓ Tables are IDENTICAL`) + if (rows1 > 0) lines.push(` Rows checked: ${rows1.toLocaleString()}`) + return lines.join("\n") + } - if (outcome.Diff) { - const r = outcome.Diff lines.push(`✗ Tables DIFFER`) lines.push(``) lines.push(` Source: ${source}`) lines.push(` Target: ${target}`) lines.push(``) - if (r.total_source_rows != null) lines.push(` Source rows: ${r.total_source_rows.toLocaleString()}`) - if (r.total_target_rows != null) lines.push(` Target rows: ${r.total_target_rows.toLocaleString()}`) - if (r.rows_only_in_source > 0) lines.push(` Only in source: ${r.rows_only_in_source.toLocaleString()}`) - if (r.rows_only_in_target > 0) lines.push(` Only in target: ${r.rows_only_in_target.toLocaleString()}`) - if (r.rows_updated > 0) lines.push(` Updated rows: ${r.rows_updated.toLocaleString()}`) - if (r.rows_identical > 0) lines.push(` Identical rows: ${r.rows_identical.toLocaleString()}`) + if (rows1 > 0) lines.push(` Source rows: ${rows1.toLocaleString()}`) + if (rows2 > 0) lines.push(` Target rows: ${rows2.toLocaleString()}`) + if (e1 > 0) lines.push(` Only in source: ${e1.toLocaleString()}`) + if (e2 > 0) lines.push(` Only in target: ${e2.toLocaleString()}`) + if (updated > 0) lines.push(` Updated rows: ${updated.toLocaleString()}`) + if (unchanged > 0) lines.push(` Identical rows: ${unchanged.toLocaleString()}`) - if (r.sample_diffs?.length) { + const diffRows = outcome.diff_rows ?? [] + if (diffRows.length > 0) { lines.push(``) - lines.push(` Sample differences (first ${r.sample_diffs.length}):`) - for (const d of r.sample_diffs.slice(0, 5)) { - lines.push(` key=${JSON.stringify(d.key)} col=${d.column}: ${d.source_value} → ${d.target_value}`) + lines.push(` Sample differences (first ${Math.min(diffRows.length, 5)}):`) + for (const d of diffRows.slice(0, 5)) { + const label = d.sign === "-" ? 
"source only" : "target only" + lines.push(` [${label}] ${d.values?.join(" | ")}`) } } return lines.join("\n") } - if (outcome.Profile) { - const p = outcome.Profile + if (outcome.mode === "profile") { + const cols = outcome.column_stats ?? outcome.columns ?? [] lines.push(`Column Profile Comparison`) lines.push(``) - for (const col of p.columns ?? []) { + for (const col of cols) { const verdict = col.verdict === "match" ? "✓" : col.verdict === "within_tolerance" ? "~" : "✗" lines.push(` ${verdict} ${col.column}: ${col.verdict}`) if (col.source_stats && col.target_stats) { From 149066b4ed90fc05efa4eb6edb7f721025f7139a Mon Sep 17 00:00:00 2001 From: suryaiyer95 Date: Fri, 27 Mar 2026 12:28:29 -0700 Subject: [PATCH 05/20] feat: rewrite data-parity skill with interactive, plan-first workflow Key changes based on feedback: - Always generate TODO plan before any tool is called - Enforce data_diff tool usage (never manual EXCEPT/JOIN SQL) - Add PK discovery + explicit user confirmation step - Profile pass is now mandatory before row-level diff - Ask user before expensive row-level diff on large tables: - <100K rows: proceed automatically - 100K-10M rows: ask with where_clause option - >10M rows: offer window/partition/full choices - Document partition modes (date/numeric/categorical) with examples - Add warehouse_list as first step to confirm connections --- .opencode/skills/data-parity/SKILL.md | 344 +++++++++++--------------- 1 file changed, 146 insertions(+), 198 deletions(-) diff --git a/.opencode/skills/data-parity/SKILL.md b/.opencode/skills/data-parity/SKILL.md index 4d47be8036..1ecc37f399 100644 --- a/.opencode/skills/data-parity/SKILL.md +++ b/.opencode/skills/data-parity/SKILL.md @@ -5,109 +5,85 @@ description: Validate that two tables or query results are identical — or diag # Data Parity (Table Diff) -## Output Style +## CRITICAL: Always Start With a Plan -**Report facts only. 
No editorializing.** -- Show counts, changed values, missing rows, new rows — that's it. -- Do NOT explain why row-level diffing is valuable, why COUNT(*) is insufficient, or pitch the tool. -- Do NOT add "the dangerous one", "this is exactly why", "this matters" style commentary. -- The user asked for a diff result, not a lecture. +**Before doing anything else**, generate a numbered TODO list for the user: -## Requirements -**Agent:** any -**Tools used:** `sql_query` (for schema discovery), `data_diff` +``` +Here's my plan: +1. [ ] List available warehouse connections +2. [ ] Inspect schema and discover primary key candidates +3. [ ] Confirm primary keys with you +4. [ ] Check row counts on both sides +5. [ ] Run column-level profile (cheap — no row scan) +6. [ ] Ask whether to proceed with row-level diff (may be expensive for large tables) +7. [ ] Run targeted row-level diff on diverging columns only +8. [ ] Report findings +``` -## When to Use This Skill +Update each item to `[x]` as you complete it. This plan should be visible before any tool is called. -**Use when the user wants to:** -- Confirm two tables contain the same data after a migration -- Find rows added, deleted, or modified between source and target -- Validate that a dbt model produces the same output as the old query -- Run regression checks after a pipeline change +--- -**Do NOT use for:** -- Schema comparison (column names, types) — check DDL instead -- Performance benchmarking — this runs SELECT queries +## CRITICAL: Use `data_diff` Tool — Never Write Manual Diff SQL ---- +**NEVER** write SQL to diff tables manually (e.g., `EXCEPT`, `FULL OUTER JOIN`, `MINUS`). +**ALWAYS** use the `data_diff` tool for any comparison operation. -## The `data_diff` Tool - -`data_diff` takes table names and key columns. It generates SQL, routes it through the specified warehouse connections, and reports differences. It **does not discover schema** — you must provide key columns and relevant comparison columns. 
- -**Key parameters:** -- `source` — table name (`orders`, `db.schema.orders`) or full SELECT/WITH query -- `target` — table name or SELECT query -- `key_columns` — primary key(s) uniquely identifying each row (required) -- `source_warehouse` — connection name for source -- `target_warehouse` — connection name for target (omit = same as source) -- `extra_columns` — columns to compare beyond keys (omit = compare all) -- `algorithm` — `auto`, `joindiff`, `hashdiff`, `profile`, `cascade` -- `where_clause` — filter applied to both tables -- `partition_column` — split the table by this column and diff each group independently (recommended for large tables); three modes: - - **Date column**: set `partition_granularity` → groups by truncated date periods - - **Numeric column**: set `partition_bucket_size` → groups by equal-width key ranges - - **Categorical column**: set neither → groups by distinct values (strings, enums, booleans like `status`, `region`, `country`) -- `partition_granularity` — `day` | `week` | `month` | `year` — only for date columns -- `partition_bucket_size` — bucket width for numeric columns (e.g. `100000`) - -> **CRITICAL — Algorithm choice:** -> - If `source_warehouse` ≠ `target_warehouse` → **always use `hashdiff`** (or `auto`). -> - `joindiff` runs a single SQL JOIN on ONE connection — it physically cannot see the other table. -> Using `joindiff` across different servers always reports 0 differences (both sides look identical). -> - When in doubt, use `algorithm="auto"` — it picks `joindiff` for same-warehouse and `hashdiff` for cross-warehouse automatically. +`sql_query` is only for: +- Schema inspection (`information_schema`, `SHOW COLUMNS`, `DESCRIBE`) +- Cardinality checks to identify keys +- Row count estimates + +Everything else — profile, row diff, value comparison — goes through `data_diff`. 
--- -## Workflow +## Step 1: List Connections + +Use `warehouse_list` to show the user what connections are available and which warehouses map to source and target. -The key principle: **the LLM does the identification work using SQL tools first, then calls data_diff with informed parameters.** +--- -### Step 1: Inspect the tables +## Step 2: Inspect Schema and Discover Primary Keys -Before calling `data_diff`, use `sql_query` to understand what you're comparing: +Use `sql_query` to get columns and identify key candidates: ```sql --- Get columns and types +-- Postgres / Redshift / DuckDB SELECT column_name, data_type, is_nullable FROM information_schema.columns WHERE table_schema = 'public' AND table_name = 'orders' ORDER BY ordinal_position ``` -For ClickHouse: ```sql -DESCRIBE TABLE source_db.events +-- Snowflake +SHOW COLUMNS IN TABLE orders ``` -For Snowflake: ```sql -SHOW COLUMNS IN TABLE orders +-- ClickHouse +DESCRIBE TABLE source_db.events ``` -**Look for:** -- Columns that look like primary keys (named `id`, `*_id`, `*_key`, `uuid`) -- Columns with `NOT NULL` constraints -- Whether there are composite keys - -### Step 2: Identify the key columns +**Look for:** columns named `id`, `*_id`, `*_key`, `uuid`, or with `NOT NULL` + unique index. -If the primary key isn't obvious from the schema, run a cardinality check: +If no obvious PK, run a cardinality check: ```sql SELECT COUNT(*) AS total_rows, COUNT(DISTINCT order_id) AS distinct_order_id, - COUNT(DISTINCT customer_id) AS distinct_customer_id, - COUNT(DISTINCT created_at) AS distinct_created_at + COUNT(DISTINCT customer_id) AS distinct_customer_id FROM orders ``` -**A good key column:** `distinct_count = total_rows` (fully unique) and `null_count = 0`. +A valid key column: `distinct_count = total_rows`. 
-If no single column is unique, find a composite key: +For composite keys: ```sql -SELECT order_id, line_item_id, COUNT(*) as cnt +SELECT order_id, line_item_id, COUNT(*) AS cnt FROM order_lines GROUP BY order_id, line_item_id HAVING COUNT(*) > 1 @@ -115,55 +91,37 @@ LIMIT 5 ``` If this returns 0 rows, `(order_id, line_item_id)` is a valid composite key. -### Step 3: Estimate table size +## Step 3: Confirm Keys With the User -```sql -SELECT COUNT(*) FROM orders -``` +**Always confirm** the identified key columns before proceeding: -Use this to choose the algorithm: -- **< 1M rows**: `joindiff` (same DB) or `hashdiff` (cross-DB) — either is fine -- **1M–100M rows**: `hashdiff` with `partition_column` for faster, more precise results -- **> 100M rows**: `hashdiff` + `partition_column` — required; bisection alone may miss rows at this scale +> "I identified `order_id` as the primary key (150,000 distinct values = 150,000 rows, no NULLs). Does that look right, or should I use a different column?" -**When to use `partition_column`:** -- Table has a natural time or key column (e.g. `created_at`, `order_id`, `event_date`) -- Table has > 500K rows and bisection is slow or returning incomplete results -- You need per-partition visibility (which month/range has the problem) +Do not proceed to diff until the user confirms or corrects. 
-``` -// Date column — partition by month -data_diff(source="lineitem", target="lineitem", - key_columns=["l_orderkey", "l_linenumber"], - source_warehouse="pg_source", target_warehouse="pg_target", - partition_column="l_shipdate", partition_granularity="month", - algorithm="hashdiff") +--- -// Numeric column — partition by key ranges of 100K -data_diff(source="orders", target="orders", - key_columns=["o_orderkey"], - source_warehouse="pg_source", target_warehouse="pg_target", - partition_column="o_orderkey", partition_bucket_size=100000, - algorithm="hashdiff") +## Step 4: Check Row Counts -// Categorical column — partition by distinct status values ('O', 'F', 'P') -data_diff(source="orders", target="orders", - key_columns=["o_orderkey"], - source_warehouse="pg_source", target_warehouse="pg_target", - partition_column="o_orderstatus", // no granularity or bucket_size needed - algorithm="hashdiff") +```sql +SELECT COUNT(*) FROM orders -- run on both source and target ``` -Output includes an aggregate diff plus a per-partition table showing exactly which ranges differ. +Use counts to: +- Detect load completeness issues before row-level diff +- Choose the algorithm and decide whether to ask about cost +- If counts differ significantly (>5%), flag it immediately -### Step 4: Profile first for unknown tables +--- + +## Step 5: Column-Level Profile (Always Run This First) -If you don't know what to expect (first-time validation, unfamiliar pipeline), start cheap: +Profile is cheap — it runs aggregates, not row scans. 
**Always run profile before row-level diff.** ``` data_diff( source="orders", - target="orders_migrated", + target="orders", key_columns=["order_id"], source_warehouse="postgres_prod", target_warehouse="snowflake_dw", @@ -171,51 +129,101 @@ data_diff( ) ``` -Profile output tells you: -- Row count on each side (mismatch = load completeness problem) -- Which columns have null count differences (mismatch = NULL handling bug) -- Min/max divergence per column (mismatch = value transformation bug) -- Which columns match exactly (safe to skip in row-level diff) +Profile tells you: +- Row count on each side +- Which columns have null count differences → NULL handling bug +- Min/max divergence per column → value transformation bug +- Which columns match exactly → safe to skip in row-level diff -**Interpret profile to narrow the diff:** +**Example output:** ``` Column Profile Comparison ✓ order_id: match ✓ customer_id: match - ✗ amount: DIFFER ← source min=10.00, target min=10.01 — rounding issue? + ✗ amount: DIFFER ← source min=10.00, target min=10.01 — rounding? ✗ status: DIFFER ← source nulls=0, target nulls=47 — NULL mapping bug? ✓ created_at: match ``` -→ Only diff `amount` and `status` in the next step. -### Step 5: Run targeted row-level diff +--- + +## Step 6: Ask Before Running Row-Level Diff on Large Tables + +After profiling, check row count and **ask the user** before proceeding: + +**If table has < 100K rows:** proceed automatically. + +**If table has 100K–10M rows:** +> "The table has 1.2M rows. Row-level diff will scan all rows on both sides — this may take 30–60 seconds and consume warehouse compute. Do you want to proceed? You can also provide a `where_clause` to limit the scope (e.g., `created_at >= '2024-01-01'`)." + +**If table has > 10M rows:** +> "The table has 50M rows. Full row-level diff could be expensive. Options: +> 1. Diff a recent window only (e.g., last 30 days) +> 2. 
Partition by a date/key column — shows which partition has problems without scanning everything +> 3. Proceed with full diff (may take several minutes) +> Which would you prefer?" + +--- + +## Step 7: Run Targeted Row-Level Diff + +Use only the columns that the profile said differ. This is faster and produces cleaner output. ``` data_diff( source="orders", - target="orders_migrated", + target="orders", key_columns=["order_id"], - extra_columns=["amount", "status"], // only the columns profile said differ + extra_columns=["amount", "status"], // only diverging columns from profile source_warehouse="postgres_prod", target_warehouse="snowflake_dw", algorithm="hashdiff" ) ``` +### For large tables — use partition_column + +Split the table into groups and diff each independently. Three modes: + +``` +// Date column — partition by month +data_diff(source="lineitem", target="lineitem", + key_columns=["l_orderkey", "l_linenumber"], + source_warehouse="pg_source", target_warehouse="pg_target", + partition_column="l_shipdate", partition_granularity="month", + algorithm="hashdiff") + +// Numeric column — partition by key ranges of 100K +data_diff(source="orders", target="orders", + key_columns=["o_orderkey"], + source_warehouse="pg_source", target_warehouse="pg_target", + partition_column="o_orderkey", partition_bucket_size=100000, + algorithm="hashdiff") + +// Categorical column — partition by distinct values (string, enum, boolean) +data_diff(source="orders", target="orders", + key_columns=["o_orderkey"], + source_warehouse="pg_source", target_warehouse="pg_target", + partition_column="o_orderstatus", + algorithm="hashdiff") +``` + +Output includes aggregate diff + per-partition breakdown showing which group has problems. + --- ## Algorithm Selection | Algorithm | When to use | |-----------|-------------| -| `profile` | First pass — column stats (count, min, max, nulls). No row scan. | -| `joindiff` | Same database — single FULL OUTER JOIN query. Fast. 
| -| `hashdiff` | Cross-database, or large tables — bisection with checksums. Scales. | +| `profile` | **Always run first** — column stats (count, min, max, nulls). No row scan. | +| `joindiff` | Same database — single FULL OUTER JOIN. Fast, exact. | +| `hashdiff` | Cross-database or large tables — bisection with checksums. Scales to billions. | | `cascade` | Auto-escalate: profile → hashdiff on diverging columns. | | `auto` | JoinDiff if same warehouse, HashDiff if cross-database. | -**JoinDiff constraint:** Both tables must be on the **same database connection**. If source and target are on different servers, JoinDiff will always report 0 diffs (it only sees one side). Use `hashdiff` or `auto` for cross-database. +> **CRITICAL:** If `source_warehouse` ≠ `target_warehouse`, **never use `joindiff`** — it only sees one connection and always reports 0 differences. Use `hashdiff` or `auto`. --- @@ -226,101 +234,41 @@ data_diff( ✓ Tables are IDENTICAL Rows checked: 1,000,000 ``` -→ Migration validated. Data is identical. - -### DIFFER — Diagnose by pattern +### DIFFER ``` ✗ Tables DIFFER - Only in source: 2 → rows deleted in target (ETL missed deletes) - Only in target: 2 → rows added to target (dedup issue or new data) - Updated rows: 3 → values changed (transform bug, type casting, rounding) - Identical rows: 15 -``` - -| Pattern | Root cause hypothesis | -|---------|----------------------| -| `only_in_source > 0`, `only_in_target = 0` | ETL dropped rows — check filters, incremental logic | -| `only_in_source = 0`, `only_in_target > 0` | Target has extra rows — check dedup or wrong join | -| `updated_rows > 0`, row counts match | Silent value corruption — check transforms, type casts | -| Row count differs | Load completeness issue — check ETL watermarks | - -Sample diffs point to the specific key + column + old→new value: -``` -key={"order_id":"4"} col=amount: 300.00 → 305.00 -``` -Use this to query the source systems directly and trace the discrepancy. 
- ---- - -## Usage Examples - -### Full workflow: unknown migration + Source rows: 150,000 + Target rows: 149,950 + Only in source: 50 → rows deleted in target (ETL missed deletes) + Only in target: 0 + Updated rows: 0 + Identical rows: 149,950 ``` -// 1. Discover schema -sql_query("SELECT column_name, data_type FROM information_schema.columns WHERE table_name='orders'", warehouse="postgres_prod") - -// 2. Check row count -sql_query("SELECT COUNT(*), COUNT(DISTINCT order_id) FROM orders", warehouse="postgres_prod") -// 3. Profile to find which columns differ -data_diff(source="orders", target="orders", key_columns=["order_id"], - source_warehouse="postgres_prod", target_warehouse="snowflake_dw", algorithm="profile") - -// 4. Row-level diff on diverging columns only -data_diff(source="orders", target="orders", key_columns=["order_id"], - extra_columns=["amount", "status"], - source_warehouse="postgres_prod", target_warehouse="snowflake_dw", algorithm="hashdiff") -``` - -### Same-database query refactor -``` -data_diff( - source="SELECT id, amount, status FROM orders WHERE region = 'us-east'", - target="SELECT id, amount, status FROM orders_v2 WHERE region = 'us-east'", - key_columns=["id"] -) -``` - -### Large table — filter to recent window first -``` -data_diff( - source="fact_events", - target="fact_events_v2", - key_columns=["event_id"], - where_clause="event_date >= '2024-01-01'", - algorithm="hashdiff" -) -``` - -### ClickHouse — always qualify with database.table -``` -data_diff( - source="source_db.events", - target="target_db.events", - key_columns=["event_id"], - source_warehouse="clickhouse_source", - target_warehouse="clickhouse_target", - algorithm="hashdiff" -) -``` +| Pattern | Root cause | +|---------|-----------| +| `only_in_source > 0`, target = 0 | ETL dropped rows — check filters, incremental logic | +| `only_in_target > 0`, source = 0 | Target has extra rows — dedup issue or wrong join | +| `updated_rows > 0`, counts match | Silent value 
corruption — check type casts, rounding | +| Row counts differ significantly | Load completeness — check ETL watermarks | --- ## Common Mistakes -**Calling data_diff without knowing the key** -→ Run `sql_query` to check cardinality first. A bad key gives meaningless results. +**Writing manual diff SQL instead of calling data_diff** +→ Never use EXCEPT, MINUS, or FULL OUTER JOIN to diff tables. Use `data_diff`. -**Using joindiff for cross-database tables** -→ JoinDiff runs one SQL query on one connection. It can't see the other table. Use `hashdiff` or `auto`. +**Calling data_diff without confirming the key** +→ Confirm cardinality with the user first. A bad key gives meaningless results. -**Diffing a 1B row table without a date filter** -→ Add `where_clause` to scope to recent data. Validate a window first, then expand. +**Using joindiff for cross-database tables** +→ JoinDiff can't see the remote table. Always returns 0 diffs. Use `hashdiff` or `auto`. -**Ignoring profile output and jumping to full diff** -→ Profile is free. It tells you which columns actually differ so you can avoid scanning all columns across all rows. +**Skipping the profile step and jumping to full row diff** +→ Profile is free. It tells you which columns actually differ so you avoid scanning everything. -**Forgetting to check row counts before diffing** -→ If source has 1M rows and target has 900K, row-level diff is misleading. Fix the load completeness issue first. +**Running full diff on a billion-row table without asking** +→ Always ask the user before expensive operations. Offer filtering and partition options. 
From 3caab3039490e70a6531a587d46a7ad81d2e8c68 Mon Sep 17 00:00:00 2001 From: Aditya Pandey Date: Fri, 27 Mar 2026 21:43:47 -0700 Subject: [PATCH 06/20] fix: auto-discover extra_columns and exclude audit/timestamp columns from data diff MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Rust engine only compares columns explicitly listed in extra_columns. When omitted, it was silently reporting all key-matched rows as 'identical' even when non-key values differed — a false positive bug. Changes: - Auto-discover columns from information_schema when extra_columns is omitted and source is a plain table name (not a SQL query) - Exclude audit/timestamp columns (updated_at, created_at, inserted_at, modified_at, _fivetran_*, _airbyte_*, publisher_last_updated_*, etc.) from comparison by default since they typically differ due to ETL timing - Report excluded columns in tool output so users know what was skipped - Fix misleading tool description that said 'Omit to compare all columns' - Update SKILL.md with critical guidance on extra_columns behavior --- .opencode/skills/data-parity/SKILL.md | 15 ++ .../altimate/native/connections/data-diff.ts | 155 +++++++++++++++++- .../opencode/src/altimate/native/types.ts | 2 + .../opencode/src/altimate/tools/data-diff.ts | 14 +- 4 files changed, 184 insertions(+), 2 deletions(-) diff --git a/.opencode/skills/data-parity/SKILL.md b/.opencode/skills/data-parity/SKILL.md index 1ecc37f399..9302e50b66 100644 --- a/.opencode/skills/data-parity/SKILL.md +++ b/.opencode/skills/data-parity/SKILL.md @@ -256,6 +256,18 @@ Output includes aggregate diff + per-partition breakdown showing which group has --- +## CRITICAL: `extra_columns` Behavior + +The Rust engine **only compares columns listed in `extra_columns`**. If the list is empty, it compares key existence only — rows that match on key but differ in values will be silently reported as "identical". This is the most common source of false positives. 
+ +**Auto-discovery (default for table names):** When `extra_columns` is omitted and the source is a plain table name, `data_diff` auto-discovers all non-key columns from `information_schema` and excludes audit/timestamp columns (like `updated_at`, `created_at`, `inserted_at`, `modified_at`, `publisher_last_updated_epoch_ms`, ETL metadata columns like `_fivetran_synced`, etc.). The output will list which columns were auto-excluded. + +**SQL queries:** When source is a SQL query (not a table name), auto-discovery cannot work. You **must** provide `extra_columns` explicitly. If you don't, only key-level matching occurs. + +**When to override auto-exclusion:** If the user specifically wants to compare audit columns (e.g., verifying that `created_at` was preserved during migration), pass those columns explicitly in `extra_columns`. + +--- + ## Common Mistakes **Writing manual diff SQL instead of calling data_diff** @@ -272,3 +284,6 @@ Output includes aggregate diff + per-partition breakdown showing which group has **Running full diff on a billion-row table without asking** → Always ask the user before expensive operations. Offer filtering and partition options. + +**Omitting extra_columns when source is a SQL query** +→ Auto-discovery only works for table names. For SQL queries, always list the columns to compare explicitly. 
diff --git a/packages/opencode/src/altimate/native/connections/data-diff.ts b/packages/opencode/src/altimate/native/connections/data-diff.ts index 6c4f2e7a61..a5c009aff9 100644 --- a/packages/opencode/src/altimate/native/connections/data-diff.ts +++ b/packages/opencode/src/altimate/native/connections/data-diff.ts @@ -113,6 +113,138 @@ async function executeQuery(sql: string, warehouseName: string | undefined): Pro ) } +// --------------------------------------------------------------------------- +// Column auto-discovery and audit column exclusion +// --------------------------------------------------------------------------- + +/** + * Patterns that match audit/timestamp columns which should be excluded from + * value comparison by default. These columns typically differ between source + * and target due to ETL timing, sync metadata, or pipeline bookkeeping — + * not because of actual data discrepancies. + */ +const AUDIT_COLUMN_PATTERNS = [ + // Exact common names + /^(created|updated|modified|inserted|deleted|synced|published|ingested|loaded|extracted|refreshed)_(at|on|date|time|timestamp|ts|dt|epoch)$/i, + // Suffix patterns: *_at, *_on with temporal prefix + /_(created|updated|modified|inserted|deleted|synced|published|ingested|loaded|extracted|refreshed)$/i, + // ETL metadata columns + /^(etl|elt|dbt|pipeline|batch|sync|publish|ingest)_(created|updated|modified|loaded|run|timestamp|ts|time|at|epoch)/i, + /^(_sdc_|_airbyte_|_fivetran_|_stitch_|__hevo_)/i, + // Generic timestamp metadata + /^(last_updated|last_modified|date_updated|date_modified|date_created|row_updated|row_created)$/i, + /^(publisher_last_updated|publisher_updated)/i, + // Epoch variants + /(updated|modified|created|inserted|published|loaded|synced)_epoch/i, + /epoch_ms$/i, +] + +/** + * Check whether a column name matches known audit/timestamp patterns. 
+ */ +function isAuditColumn(columnName: string): boolean { + return AUDIT_COLUMN_PATTERNS.some((pattern) => pattern.test(columnName)) +} + +/** + * Build a query to discover column names for a table, appropriate for the dialect. + */ +function buildColumnDiscoverySQL(tableName: string, dialect: string): string { + // Parse schema.table or db.schema.table + const parts = tableName.split(".") + let schemaFilter = "" + let tableFilter = "" + + if (parts.length === 3) { + schemaFilter = `table_schema = '${parts[1]}'` + tableFilter = `table_name = '${parts[2]}'` + } else if (parts.length === 2) { + schemaFilter = `table_schema = '${parts[0]}'` + tableFilter = `table_name = '${parts[1]}'` + } else { + tableFilter = `table_name = '${parts[0]}'` + } + + switch (dialect) { + case "clickhouse": + return `DESCRIBE TABLE ${tableName}` + case "snowflake": + return `SHOW COLUMNS IN TABLE ${tableName}` + default: { + // Postgres, MySQL, Redshift, DuckDB, etc. — use information_schema + const conditions = [tableFilter] + if (schemaFilter) conditions.push(schemaFilter) + return `SELECT column_name FROM information_schema.columns WHERE ${conditions.join(" AND ")} ORDER BY ordinal_position` + } + } +} + +/** + * Parse column names from the discovery query result, handling dialect differences. + */ +function parseColumnNames(rows: (string | null)[][], dialect: string): string[] { + switch (dialect) { + case "clickhouse": + // DESCRIBE returns: name, type, default_type, default_expression, ... + return rows.map((r) => r[0] ?? "").filter(Boolean) + case "snowflake": + // SHOW COLUMNS returns: table_name, schema_name, column_name, data_type, ... + // column_name is at index 2 + return rows.map((r) => r[2] ?? "").filter(Boolean) + default: + // information_schema returns: column_name + return rows.map((r) => r[0] ?? "").filter(Boolean) + } +} + +/** + * Auto-discover non-key, non-audit columns for a table. 
+ * + * When the caller omits `extra_columns`, we query the source table's schema to + * find all columns, then exclude: + * 1. Key columns (already used for matching) + * 2. Audit/timestamp columns (updated_at, created_at, etc.) that typically + * differ between source and target due to ETL timing + * + * Returns the list of columns to compare, or undefined if discovery fails + * (in which case the engine falls back to key-only comparison). + */ +async function discoverExtraColumns( + tableName: string, + keyColumns: string[], + dialect: string, + warehouseName: string | undefined, +): Promise<{ columns: string[]; excludedAudit: string[] } | undefined> { + // Only works for plain table names, not SQL queries + if (SQL_KEYWORDS.test(tableName)) return undefined + + try { + const sql = buildColumnDiscoverySQL(tableName, dialect) + const rows = await executeQuery(sql, warehouseName) + const allColumns = parseColumnNames(rows, dialect) + + if (allColumns.length === 0) return undefined + + const keySet = new Set(keyColumns.map((k) => k.toLowerCase())) + const extraColumns: string[] = [] + const excludedAudit: string[] = [] + + for (const col of allColumns) { + if (keySet.has(col.toLowerCase())) continue + if (isAuditColumn(col)) { + excludedAudit.push(col) + } else { + extraColumns.push(col) + } + } + + return { columns: extraColumns, excludedAudit } + } catch { + // Schema discovery failed — fall back to engine default (key-only) + return undefined + } +} + // --------------------------------------------------------------------------- // Main orchestrator // --------------------------------------------------------------------------- @@ -426,6 +558,26 @@ export async function runDataDiff(params: DataDiffParams): Promise 0 ? 
{ excluded_audit_columns: excludedAuditColumns } : {}), } } diff --git a/packages/opencode/src/altimate/native/types.ts b/packages/opencode/src/altimate/native/types.ts index c5074d7b98..6f7307e242 100644 --- a/packages/opencode/src/altimate/native/types.ts +++ b/packages/opencode/src/altimate/native/types.ts @@ -1029,6 +1029,8 @@ export interface DataDiffResult { error?: string /** Per-partition breakdown when partition_column is used */ partition_results?: PartitionDiffResult[] + /** Columns auto-excluded from comparison (audit/timestamp columns like updated_at, created_at) */ + excluded_audit_columns?: string[] } // --- Method registry --- diff --git a/packages/opencode/src/altimate/tools/data-diff.ts b/packages/opencode/src/altimate/tools/data-diff.ts index d498eefe7e..97a4085169 100644 --- a/packages/opencode/src/altimate/tools/data-diff.ts +++ b/packages/opencode/src/altimate/tools/data-diff.ts @@ -35,7 +35,13 @@ export const DataDiffTool = Tool.define("data_diff", { extra_columns: z .array(z.string()) .optional() - .describe("Additional columns to compare beyond the key columns. Omit to compare all columns"), + .describe( + "Columns to compare beyond the key columns. " + + "IMPORTANT: If omitted AND source is a plain table name, columns are auto-discovered from the schema " + + "(excluding key columns and audit/timestamp columns like updated_at, created_at, inserted_at, modified_at). " + + "If omitted AND source is a SQL query, ONLY key columns are compared — value changes in non-key columns will NOT be detected. " + + "Always provide explicit extra_columns when comparing SQL queries to ensure value-level comparison." + ), algorithm: z .enum(["auto", "joindiff", "hashdiff", "profile", "cascade"]) .optional() @@ -111,6 +117,12 @@ export const DataDiffTool = Tool.define("data_diff", { output += formatPartitionResults(result.partition_results, args.partition_column!) 
} + // Report auto-excluded audit columns so the LLM and user know what was skipped + const excluded = (result as any).excluded_audit_columns as string[] | undefined + if (excluded && excluded.length > 0) { + output += `\n\n Note: ${excluded.length} audit/timestamp column${excluded.length === 1 ? "" : "s"} auto-excluded from comparison: ${excluded.join(", ")}` + } + return { title: `Data diff: ${summarize(outcome)}`, metadata: { success: true, steps: result.steps }, From 550d431b7b57bab27437e50fd34f5d9686aa1632 Mon Sep 17 00:00:00 2001 From: Aditya Pandey Date: Sat, 28 Mar 2026 01:05:19 -0700 Subject: [PATCH 07/20] fix: add `noLimit` option to driver `execute()` to prevent silent result truncation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All drivers default to `LIMIT 1001` on SELECT queries and post-truncate to 1000 rows. This silently drops rows when the data-diff engine needs complete result sets — a FULL OUTER JOIN returning >1000 diff rows would be truncated, causing the engine to undercount differences. - Add `ExecuteOptions { noLimit?: boolean }` to the `Connector` interface - When `noLimit: true`, set `effectiveLimit = 0` (falsy) so the existing LIMIT injection guard is skipped, and add `effectiveLimit > 0` to the truncation check so rows aren't sliced to zero - Update all 12 drivers: postgres, clickhouse, snowflake, bigquery, mysql, redshift, databricks, duckdb, oracle, sqlserver, sqlite, mongodb - Pass `{ noLimit: true }` from `data-diff.ts` `executeQuery()` Interactive SQL callers are unaffected — they continue to get the default 1000-row limit. Only the data-diff pipeline opts out. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- packages/drivers/src/bigquery.ts | 8 ++++---- packages/drivers/src/databricks.ts | 8 ++++---- packages/drivers/src/duckdb.ts | 8 ++++---- packages/drivers/src/mysql.ts | 8 ++++---- packages/drivers/src/oracle.ts | 8 ++++---- packages/drivers/src/postgres.ts | 8 ++++---- packages/drivers/src/redshift.ts | 8 ++++---- packages/drivers/src/snowflake.ts | 8 ++++---- packages/drivers/src/sqlite.ts | 8 ++++---- packages/drivers/src/sqlserver.ts | 8 ++++---- packages/drivers/src/types.ts | 8 +++++++- .../opencode/src/altimate/native/connections/data-diff.ts | 3 ++- 12 files changed, 49 insertions(+), 42 deletions(-) diff --git a/packages/drivers/src/bigquery.ts b/packages/drivers/src/bigquery.ts index f14e3b681d..abc7a8f05f 100644 --- a/packages/drivers/src/bigquery.ts +++ b/packages/drivers/src/bigquery.ts @@ -2,7 +2,7 @@ * BigQuery driver using the `@google-cloud/bigquery` package. */ -import type { ConnectionConfig, Connector, ConnectorResult, SchemaColumn } from "./types" +import type { ConnectionConfig, Connector, ConnectorResult, ExecuteOptions, SchemaColumn } from "./types" export async function connect(config: ConnectionConfig): Promise { let BigQueryModule: any @@ -37,8 +37,8 @@ export async function connect(config: ConnectionConfig): Promise { client = new BigQuery(options) }, - async execute(sql: string, limit?: number, binds?: any[]): Promise { - const effectiveLimit = limit ?? 1000 + async execute(sql: string, limit?: number, binds?: any[], execOptions?: ExecuteOptions): Promise { + const effectiveLimit = execOptions?.noLimit ? 0 : (limit ?? 1000) const query = sql.replace(/;\s*$/, "") const isSelectLike = /^\s*(SELECT|WITH|VALUES)\b/i.test(sql) @@ -58,7 +58,7 @@ export async function connect(config: ConnectionConfig): Promise { const [rows] = await client.query(options) const columns = rows.length > 0 ? 
Object.keys(rows[0]) : [] - const truncated = rows.length > effectiveLimit + const truncated = effectiveLimit > 0 && rows.length > effectiveLimit const limitedRows = truncated ? rows.slice(0, effectiveLimit) : rows return { diff --git a/packages/drivers/src/databricks.ts b/packages/drivers/src/databricks.ts index ccb3d5f8f7..83e75dcd7c 100644 --- a/packages/drivers/src/databricks.ts +++ b/packages/drivers/src/databricks.ts @@ -2,7 +2,7 @@ * Databricks driver using the `@databricks/sql` package. */ -import type { ConnectionConfig, Connector, ConnectorResult, SchemaColumn } from "./types" +import type { ConnectionConfig, Connector, ConnectorResult, ExecuteOptions, SchemaColumn } from "./types" export async function connect(config: ConnectionConfig): Promise { let databricksModule: any @@ -44,8 +44,8 @@ export async function connect(config: ConnectionConfig): Promise { }) }, - async execute(sql: string, limit?: number, binds?: any[]): Promise { - const effectiveLimit = limit ?? 1000 + async execute(sql: string, limit?: number, binds?: any[], options?: ExecuteOptions): Promise { + const effectiveLimit = options?.noLimit ? 0 : (limit ?? 1000) let query = sql const isSelectLike = /^\s*(SELECT|WITH|VALUES)\b/i.test(sql) if ( @@ -65,7 +65,7 @@ export async function connect(config: ConnectionConfig): Promise { await operation.close() const columns = rows.length > 0 ? Object.keys(rows[0]) : [] - const truncated = rows.length > effectiveLimit + const truncated = effectiveLimit > 0 && rows.length > effectiveLimit const limitedRows = truncated ? rows.slice(0, effectiveLimit) : rows return { diff --git a/packages/drivers/src/duckdb.ts b/packages/drivers/src/duckdb.ts index f938f99d01..3ccca467aa 100644 --- a/packages/drivers/src/duckdb.ts +++ b/packages/drivers/src/duckdb.ts @@ -2,7 +2,7 @@ * DuckDB driver using the `duckdb` package. 
*/ -import type { ConnectionConfig, Connector, ConnectorResult, SchemaColumn } from "./types" +import type { ConnectionConfig, Connector, ConnectorResult, ExecuteOptions, SchemaColumn } from "./types" export async function connect(config: ConnectionConfig): Promise { let duckdb: any @@ -105,8 +105,8 @@ export async function connect(config: ConnectionConfig): Promise { connection = db.connect() }, - async execute(sql: string, limit?: number, binds?: any[]): Promise { - const effectiveLimit = limit ?? 1000 + async execute(sql: string, limit?: number, binds?: any[], options?: ExecuteOptions): Promise { + const effectiveLimit = options?.noLimit ? 0 : (limit ?? 1000) let finalSql = sql const isSelectLike = /^\s*(SELECT|WITH|VALUES)\b/i.test(sql) @@ -123,7 +123,7 @@ export async function connect(config: ConnectionConfig): Promise { : await query(finalSql) const columns = rows.length > 0 ? Object.keys(rows[0]) : [] - const truncated = rows.length > effectiveLimit + const truncated = effectiveLimit > 0 && rows.length > effectiveLimit const limitedRows = truncated ? rows.slice(0, effectiveLimit) : rows return { diff --git a/packages/drivers/src/mysql.ts b/packages/drivers/src/mysql.ts index 28c4a8def9..3859f5e993 100644 --- a/packages/drivers/src/mysql.ts +++ b/packages/drivers/src/mysql.ts @@ -2,7 +2,7 @@ * MySQL driver using the `mysql2` package. */ -import type { ConnectionConfig, Connector, ConnectorResult, SchemaColumn } from "./types" +import type { ConnectionConfig, Connector, ConnectorResult, ExecuteOptions, SchemaColumn } from "./types" export async function connect(config: ConnectionConfig): Promise { let mysql: any @@ -41,8 +41,8 @@ export async function connect(config: ConnectionConfig): Promise { pool = mysql.createPool(poolConfig) }, - async execute(sql: string, limit?: number, _binds?: any[]): Promise { - const effectiveLimit = limit ?? 
1000 + async execute(sql: string, limit?: number, _binds?: any[], options?: ExecuteOptions): Promise { + const effectiveLimit = options?.noLimit ? 0 : (limit ?? 1000) let query = sql const isSelectLike = /^\s*(SELECT|WITH|VALUES)\b/i.test(sql) if ( @@ -56,7 +56,7 @@ export async function connect(config: ConnectionConfig): Promise { const [rows, fields] = await pool.query(query) const columns = fields?.map((f: any) => f.name) ?? [] const rowsArr = Array.isArray(rows) ? rows : [] - const truncated = rowsArr.length > effectiveLimit + const truncated = effectiveLimit > 0 && rowsArr.length > effectiveLimit const limitedRows = truncated ? rowsArr.slice(0, effectiveLimit) : rowsArr diff --git a/packages/drivers/src/oracle.ts b/packages/drivers/src/oracle.ts index e3bab24819..39e4b11c37 100644 --- a/packages/drivers/src/oracle.ts +++ b/packages/drivers/src/oracle.ts @@ -2,7 +2,7 @@ * Oracle driver using the `oracledb` package (thin mode, pure JS). */ -import type { ConnectionConfig, Connector, ConnectorResult, SchemaColumn } from "./types" +import type { ConnectionConfig, Connector, ConnectorResult, ExecuteOptions, SchemaColumn } from "./types" export async function connect(config: ConnectionConfig): Promise { let oracledb: any @@ -37,8 +37,8 @@ export async function connect(config: ConnectionConfig): Promise { }) }, - async execute(sql: string, limit?: number, _binds?: any[]): Promise { - const effectiveLimit = limit ?? 1000 + async execute(sql: string, limit?: number, _binds?: any[], options?: ExecuteOptions): Promise { + const effectiveLimit = options?.noLimit ? 0 : (limit ?? 1000) let query = sql const isSelectLike = /^\s*(SELECT|WITH)\b/i.test(sql) @@ -61,7 +61,7 @@ export async function connect(config: ConnectionConfig): Promise { const columns = result.metaData?.map((m: any) => m.name) ?? (rows.length > 0 ? 
Object.keys(rows[0]) : []) - const truncated = rows.length > effectiveLimit + const truncated = effectiveLimit > 0 && rows.length > effectiveLimit const limitedRows = truncated ? rows.slice(0, effectiveLimit) : rows diff --git a/packages/drivers/src/postgres.ts b/packages/drivers/src/postgres.ts index e1b69465eb..755b2e4ed9 100644 --- a/packages/drivers/src/postgres.ts +++ b/packages/drivers/src/postgres.ts @@ -2,7 +2,7 @@ * PostgreSQL driver using the `pg` package. */ -import type { ConnectionConfig, Connector, ConnectorResult, SchemaColumn } from "./types" +import type { ConnectionConfig, Connector, ConnectorResult, ExecuteOptions, SchemaColumn } from "./types" export async function connect(config: ConnectionConfig): Promise { let pg: any @@ -46,7 +46,7 @@ export async function connect(config: ConnectionConfig): Promise { pool = new Pool(poolConfig) }, - async execute(sql: string, limit?: number, _binds?: any[]): Promise { + async execute(sql: string, limit?: number, _binds?: any[], options?: ExecuteOptions): Promise { const client = await pool.connect() try { if (config.statement_timeout) { @@ -57,7 +57,7 @@ export async function connect(config: ConnectionConfig): Promise { } let query = sql - const effectiveLimit = limit ?? 1000 + const effectiveLimit = options?.noLimit ? 0 : (limit ?? 1000) const isSelectLike = /^\s*(SELECT|WITH|VALUES)\b/i.test(sql) // Add LIMIT only for SELECT-like queries and if not already present if ( @@ -70,7 +70,7 @@ export async function connect(config: ConnectionConfig): Promise { const result = await client.query(query) const columns = result.fields?.map((f: any) => f.name) ?? [] - const truncated = result.rows.length > effectiveLimit + const truncated = effectiveLimit > 0 && result.rows.length > effectiveLimit const rows = truncated ? 
result.rows.slice(0, effectiveLimit) : result.rows diff --git a/packages/drivers/src/redshift.ts b/packages/drivers/src/redshift.ts index 5893777102..92f8f32790 100644 --- a/packages/drivers/src/redshift.ts +++ b/packages/drivers/src/redshift.ts @@ -3,7 +3,7 @@ * Uses svv_ system views for introspection. */ -import type { ConnectionConfig, Connector, ConnectorResult, SchemaColumn } from "./types" +import type { ConnectionConfig, Connector, ConnectorResult, ExecuteOptions, SchemaColumn } from "./types" export async function connect(config: ConnectionConfig): Promise { let pg: any @@ -46,10 +46,10 @@ export async function connect(config: ConnectionConfig): Promise { pool = new Pool(poolConfig) }, - async execute(sql: string, limit?: number, _binds?: any[]): Promise { + async execute(sql: string, limit?: number, _binds?: any[], options?: ExecuteOptions): Promise { const client = await pool.connect() try { - const effectiveLimit = limit ?? 1000 + const effectiveLimit = options?.noLimit ? 0 : (limit ?? 1000) let query = sql const isSelectLike = /^\s*(SELECT|WITH|VALUES)\b/i.test(sql) if ( @@ -62,7 +62,7 @@ export async function connect(config: ConnectionConfig): Promise { const result = await client.query(query) const columns = result.fields?.map((f: any) => f.name) ?? [] - const truncated = result.rows.length > effectiveLimit + const truncated = effectiveLimit > 0 && result.rows.length > effectiveLimit const rows = truncated ? 
result.rows.slice(0, effectiveLimit) : result.rows diff --git a/packages/drivers/src/snowflake.ts b/packages/drivers/src/snowflake.ts index 03cc1c84a7..6a37c6caaa 100644 --- a/packages/drivers/src/snowflake.ts +++ b/packages/drivers/src/snowflake.ts @@ -3,7 +3,7 @@ */ import * as fs from "fs" -import type { ConnectionConfig, Connector, ConnectorResult, SchemaColumn } from "./types" +import type { ConnectionConfig, Connector, ConnectorResult, ExecuteOptions, SchemaColumn } from "./types" export async function connect(config: ConnectionConfig): Promise { let snowflake: any @@ -232,8 +232,8 @@ export async function connect(config: ConnectionConfig): Promise { }) }, - async execute(sql: string, limit?: number, binds?: any[]): Promise { - const effectiveLimit = limit ?? 1000 + async execute(sql: string, limit?: number, binds?: any[], options?: ExecuteOptions): Promise { + const effectiveLimit = options?.noLimit ? 0 : (limit ?? 1000) let query = sql const isSelectLike = /^\s*(SELECT|WITH|VALUES|SHOW)\b/i.test(sql) if ( @@ -245,7 +245,7 @@ export async function connect(config: ConnectionConfig): Promise { } const result = await executeQuery(query, binds) - const truncated = result.rows.length > effectiveLimit + const truncated = effectiveLimit > 0 && result.rows.length > effectiveLimit const rows = truncated ? result.rows.slice(0, effectiveLimit) : result.rows diff --git a/packages/drivers/src/sqlite.ts b/packages/drivers/src/sqlite.ts index 46d1e74ec8..48ef8321cd 100644 --- a/packages/drivers/src/sqlite.ts +++ b/packages/drivers/src/sqlite.ts @@ -4,7 +4,7 @@ */ import { Database } from "bun:sqlite" -import type { ConnectionConfig, Connector, ConnectorResult, SchemaColumn } from "./types" +import type { ConnectionConfig, Connector, ConnectorResult, ExecuteOptions, SchemaColumn } from "./types" export async function connect(config: ConnectionConfig): Promise { const dbPath = (config.path as string) ?? 
":memory:" @@ -22,9 +22,9 @@ export async function connect(config: ConnectionConfig): Promise { } }, - async execute(sql: string, limit?: number, _binds?: any[]): Promise { + async execute(sql: string, limit?: number, _binds?: any[], options?: ExecuteOptions): Promise { if (!db) throw new Error("SQLite connection not open") - const effectiveLimit = limit ?? 1000 + const effectiveLimit = options?.noLimit ? 0 : (limit ?? 1000) // Determine if this is a SELECT-like statement const trimmed = sql.trim().toLowerCase() @@ -60,7 +60,7 @@ export async function connect(config: ConnectionConfig): Promise { const stmt = db.prepare(query) const rows = stmt.all() as any[] const columns = rows.length > 0 ? Object.keys(rows[0]) : [] - const truncated = rows.length > effectiveLimit + const truncated = effectiveLimit > 0 && rows.length > effectiveLimit const limitedRows = truncated ? rows.slice(0, effectiveLimit) : rows return { diff --git a/packages/drivers/src/sqlserver.ts b/packages/drivers/src/sqlserver.ts index b9aac91760..3ea1e390f3 100644 --- a/packages/drivers/src/sqlserver.ts +++ b/packages/drivers/src/sqlserver.ts @@ -2,7 +2,7 @@ * SQL Server driver using the `mssql` (tedious) package. */ -import type { ConnectionConfig, Connector, ConnectorResult, SchemaColumn } from "./types" +import type { ConnectionConfig, Connector, ConnectorResult, ExecuteOptions, SchemaColumn } from "./types" export async function connect(config: ConnectionConfig): Promise { let mssql: any @@ -42,8 +42,8 @@ export async function connect(config: ConnectionConfig): Promise { pool = await mssql.connect(mssqlConfig) }, - async execute(sql: string, limit?: number, _binds?: any[]): Promise { - const effectiveLimit = limit ?? 1000 + async execute(sql: string, limit?: number, _binds?: any[], options?: ExecuteOptions): Promise { + const effectiveLimit = options?.noLimit ? 0 : (limit ?? 
1000) let query = sql const isSelectLike = /^\s*SELECT\b/i.test(sql) @@ -69,7 +69,7 @@ export async function connect(config: ConnectionConfig): Promise { : (result.recordset?.columns ? Object.keys(result.recordset.columns) : []) - const truncated = rows.length > effectiveLimit + const truncated = effectiveLimit > 0 && rows.length > effectiveLimit const limitedRows = truncated ? rows.slice(0, effectiveLimit) : rows return { diff --git a/packages/drivers/src/types.ts b/packages/drivers/src/types.ts index 31a7565134..3bc3760d6c 100644 --- a/packages/drivers/src/types.ts +++ b/packages/drivers/src/types.ts @@ -20,9 +20,15 @@ export interface SchemaColumn { nullable: boolean } +export interface ExecuteOptions { + /** Skip the default LIMIT injection and post-truncation. Use when the caller + * needs the complete, untruncated result set (e.g. data-diff pipelines). */ + noLimit?: boolean +} + export interface Connector { connect(): Promise - execute(sql: string, limit?: number, binds?: any[]): Promise + execute(sql: string, limit?: number, binds?: any[], options?: ExecuteOptions): Promise listSchemas(): Promise listTables(schema: string): Promise> describeTable(schema: string, table: string): Promise diff --git a/packages/opencode/src/altimate/native/connections/data-diff.ts b/packages/opencode/src/altimate/native/connections/data-diff.ts index a5c009aff9..98aae49427 100644 --- a/packages/opencode/src/altimate/native/connections/data-diff.ts +++ b/packages/opencode/src/altimate/native/connections/data-diff.ts @@ -105,7 +105,8 @@ async function executeQuery(sql: string, warehouseName: string | undefined): Pro connector = await Registry.get(warehouses[0].name) } - const result = await connector.execute(sql) + // Bypass the driver's default LIMIT — data-diff needs complete result sets. 
+ const result = await connector.execute(sql, undefined, undefined, { noLimit: true }) // Normalise to string[][] — drivers return mixed types return result.rows.map((row: unknown[]) => From f478bffc2ab7c0f49892c2c7af7f2878e691b168 Mon Sep 17 00:00:00 2001 From: Aditya Pandey Date: Sat, 28 Mar 2026 12:49:58 -0700 Subject: [PATCH 08/20] feat: detect auto-timestamp defaults from database catalog and confirm exclusions with user MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Column exclusion now has two layers: 1. Name-pattern matching (existing) — updated_at, created_at, _fivetran_synced, etc. 2. Schema-level default detection (new) — queries column_default for NOW(), CURRENT_TIMESTAMP, GETDATE(), SYSDATE, SYSTIMESTAMP, etc. Covers PostgreSQL, MySQL, Snowflake, SQL Server, Oracle, ClickHouse, DuckDB, SQLite, and Redshift in a single round-trip (no extra query). The skill prompt now instructs the agent to present detected auto-timestamp columns to the user and ask for confirmation before excluding them, since migrations should preserve timestamps while ETL replication regenerates them. --- .opencode/skills/data-parity/SKILL.md | 80 +++++++-- .../altimate/native/connections/data-diff.ts | 161 +++++++++++++++--- .../opencode/src/altimate/tools/data-diff.ts | 7 +- 3 files changed, 210 insertions(+), 38 deletions(-) diff --git a/.opencode/skills/data-parity/SKILL.md b/.opencode/skills/data-parity/SKILL.md index 9302e50b66..39afa6b616 100644 --- a/.opencode/skills/data-parity/SKILL.md +++ b/.opencode/skills/data-parity/SKILL.md @@ -12,13 +12,14 @@ description: Validate that two tables or query results are identical — or diag ``` Here's my plan: 1. [ ] List available warehouse connections -2. [ ] Inspect schema and discover primary key candidates +2. [ ] Inspect schema, discover primary key candidates, and detect auto-timestamp columns 3. [ ] Confirm primary keys with you -4. [ ] Check row counts on both sides -5. 
[ ] Run column-level profile (cheap — no row scan) -6. [ ] Ask whether to proceed with row-level diff (may be expensive for large tables) -7. [ ] Run targeted row-level diff on diverging columns only -8. [ ] Report findings +4. [ ] Confirm which auto-timestamp columns to exclude +5. [ ] Check row counts on both sides +6. [ ] Run column-level profile (cheap — no row scan) +7. [ ] Ask whether to proceed with row-level diff (may be expensive for large tables) +8. [ ] Run targeted row-level diff on diverging columns only +9. [ ] Report findings ``` Update each item to `[x]` as you complete it. This plan should be visible before any tool is called. @@ -45,13 +46,13 @@ Use `warehouse_list` to show the user what connections are available and which w --- -## Step 2: Inspect Schema and Discover Primary Keys +## Step 2: Inspect Schema, Discover Primary Keys, and Detect Auto-Timestamp Columns -Use `sql_query` to get columns and identify key candidates: +Use `sql_query` to get columns, defaults, and identify key candidates: ```sql -- Postgres / Redshift / DuckDB -SELECT column_name, data_type, is_nullable +SELECT column_name, data_type, is_nullable, column_default FROM information_schema.columns WHERE table_schema = 'public' AND table_name = 'orders' ORDER BY ordinal_position @@ -62,6 +63,14 @@ ORDER BY ordinal_position SHOW COLUMNS IN TABLE orders ``` +```sql +-- MySQL / MariaDB (also fetch EXTRA for ON UPDATE detection) +SELECT column_name, data_type, is_nullable, column_default, extra +FROM information_schema.columns +WHERE table_schema = 'mydb' AND table_name = 'orders' +ORDER BY ordinal_position +``` + ```sql -- ClickHouse DESCRIBE TABLE source_db.events @@ -69,6 +78,15 @@ DESCRIBE TABLE source_db.events **Look for:** columns named `id`, `*_id`, `*_key`, `uuid`, or with `NOT NULL` + unique index. 
+**Also look for auto-timestamp columns** — any column whose `column_default` contains a time-generating function: +- PostgreSQL/DuckDB/Redshift: `now()`, `CURRENT_TIMESTAMP`, `clock_timestamp()` +- MySQL/MariaDB: `CURRENT_TIMESTAMP` (in default or EXTRA) +- Snowflake: `CURRENT_TIMESTAMP()`, `SYSDATE()` +- SQL Server: `getdate()`, `sysdatetime()` +- Oracle: `SYSDATE`, `SYSTIMESTAMP` + +These columns auto-generate values on INSERT, so they inherently differ between source and target due to write timing — not because of actual data discrepancies. **Collect them for confirmation in Step 4.** + If no obvious PK, run a cardinality check: ```sql @@ -101,7 +119,33 @@ Do not proceed to diff until the user confirms or corrects. --- -## Step 4: Check Row Counts +## Step 4: Confirm Auto-Timestamp Column Exclusions + +If you detected any columns with auto-generating timestamp defaults in Step 2, **present them to the user and ask for confirmation** before excluding them. + +**Example prompt when auto-timestamp columns are found:** + +> "I found **3 columns** with auto-generating timestamp defaults that will inherently differ between source and target (due to when each row was written, not actual data differences): +> +> | Column | Default | Reason to exclude | +> |--------|---------|-------------------| +> | `created_at` | `DEFAULT now()` | Set on insert — reflects when this copy was written | +> | `updated_at` | `DEFAULT now()` | Set on insert — reflects when this copy was written | +> | `_loaded_at` | `DEFAULT CURRENT_TIMESTAMP` | ETL load timestamp | +> +> Should I **exclude** these from the comparison? Or do you want to include any of them (e.g., if you're verifying that `created_at` was preserved during migration)?" + +**If user confirms exclusion:** Omit those columns from `extra_columns` when calling `data_diff`. + +**If user wants to include some:** Add them explicitly to `extra_columns`. 
+ +**If no auto-timestamp columns were detected:** Skip this step and proceed to Step 5. + +> **Why ask?** In migration validation, `created_at` should often be *identical* between source and target (it was migrated, not regenerated). But in ETL replication, `created_at` is freshly generated on each side and *should* differ. Only the user knows which case applies. + +--- + +## Step 5: Check Row Counts ```sql SELECT COUNT(*) FROM orders -- run on both source and target @@ -114,7 +158,7 @@ Use counts to: --- -## Step 5: Column-Level Profile (Always Run This First) +## Step 6: Column-Level Profile (Always Run This First) Profile is cheap — it runs aggregates, not row scans. **Always run profile before row-level diff.** @@ -148,7 +192,7 @@ Column Profile Comparison --- -## Step 6: Ask Before Running Row-Level Diff on Large Tables +## Step 7: Ask Before Running Row-Level Diff on Large Tables After profiling, check row count and **ask the user** before proceeding: @@ -166,7 +210,7 @@ After profiling, check row count and **ask the user** before proceeding: --- -## Step 7: Run Targeted Row-Level Diff +## Step 8: Run Targeted Row-Level Diff Use only the columns that the profile said differ. This is faster and produces cleaner output. @@ -260,7 +304,12 @@ Output includes aggregate diff + per-partition breakdown showing which group has The Rust engine **only compares columns listed in `extra_columns`**. If the list is empty, it compares key existence only — rows that match on key but differ in values will be silently reported as "identical". This is the most common source of false positives. -**Auto-discovery (default for table names):** When `extra_columns` is omitted and the source is a plain table name, `data_diff` auto-discovers all non-key columns from `information_schema` and excludes audit/timestamp columns (like `updated_at`, `created_at`, `inserted_at`, `modified_at`, `publisher_last_updated_epoch_ms`, ETL metadata columns like `_fivetran_synced`, etc.). 
The output will list which columns were auto-excluded. +**Auto-discovery (default for table names):** When `extra_columns` is omitted and the source is a plain table name, `data_diff` auto-discovers all non-key columns from the database catalog and excludes columns using two detection layers: + +1. **Name-pattern matching** — columns named like `updated_at`, `created_at`, `inserted_at`, `modified_at`, `publisher_last_updated_epoch_ms`, ETL metadata columns like `_fivetran_synced`, `_airbyte_extracted_at`, etc. +2. **Schema-level default detection** — columns with auto-generating timestamp defaults (`DEFAULT NOW()`, `DEFAULT CURRENT_TIMESTAMP`, `GETDATE()`, `SYSDATE()`, `SYSTIMESTAMP`, etc.), detected directly from the database catalog. This catches columns that don't follow naming conventions but still auto-generate values on INSERT. Works across PostgreSQL, MySQL, Snowflake, SQL Server, Oracle, ClickHouse, DuckDB, SQLite, and Redshift. + +The output lists which columns were auto-excluded and why. **SQL queries:** When source is a SQL query (not a table name), auto-discovery cannot work. You **must** provide `extra_columns` explicitly. If you don't, only key-level matching occurs. @@ -287,3 +336,6 @@ The Rust engine **only compares columns listed in `extra_columns`**. If the list **Omitting extra_columns when source is a SQL query** → Auto-discovery only works for table names. For SQL queries, always list the columns to compare explicitly. + +**Silently excluding auto-timestamp columns without asking the user** +→ Always present detected auto-timestamp columns (Step 4) and get explicit confirmation. In migration scenarios, `created_at` should be *identical* — excluding it silently hides real bugs. 
diff --git a/packages/opencode/src/altimate/native/connections/data-diff.ts b/packages/opencode/src/altimate/native/connections/data-diff.ts index 98aae49427..0afc2c964a 100644 --- a/packages/opencode/src/altimate/native/connections/data-diff.ts +++ b/packages/opencode/src/altimate/native/connections/data-diff.ts @@ -147,8 +147,70 @@ function isAuditColumn(columnName: string): boolean { return AUDIT_COLUMN_PATTERNS.some((pattern) => pattern.test(columnName)) } +// --------------------------------------------------------------------------- +// Auto-timestamp default detection (schema-level) +// --------------------------------------------------------------------------- + +/** + * Patterns that detect auto-generated timestamp/date defaults in column_default + * expressions. These functions produce the current time when a row is inserted + * (or updated), meaning the column value will inherently differ between source + * and target — not because of actual data discrepancies, but because of when + * each copy was written. + * + * Covers: PostgreSQL, MySQL/MariaDB, Snowflake, SQL Server, Oracle, + * ClickHouse, DuckDB, SQLite, Redshift, BigQuery, Databricks. + */ +const AUTO_TIMESTAMP_DEFAULT_PATTERNS = [ + // PostgreSQL, DuckDB, Redshift + /\bnow\s*\(\)/i, + /\bclock_timestamp\s*\(\)/i, + /\bstatement_timestamp\s*\(\)/i, + /\btransaction_timestamp\s*\(\)/i, + /\blocaltimestamp\b/i, + // Standard SQL — used by most dialects + /\bcurrent_timestamp\b/i, + // MySQL / MariaDB — "ON UPDATE CURRENT_TIMESTAMP" in the EXTRA column + /\bon\s+update\s+current_timestamp/i, + // Snowflake + /\bsysdate\s*\(\)/i, + // SQL Server + /\bgetdate\s*\(\)/i, + /\bsysdatetime\s*\(\)/i, + /\bsysutcdatetime\s*\(\)/i, + /\bsysdatetimeoffset\s*\(\)/i, + // Oracle + /\bSYSDATE\b/i, + /\bSYSTIMESTAMP\b/i, + // ClickHouse + /\btoday\s*\(\)/i, + // SQLite + /\bdatetime\s*\(\s*'now'/i, +] + +/** + * Check whether a column_default expression contains an auto-generating + * timestamp function. 
Also matches expressions that *contain* these functions + * (e.g. `(now() + '1 mon'::interval)`). + */ +function isAutoTimestampDefault(defaultExpr: string | null): boolean { + if (!defaultExpr) return false + return AUTO_TIMESTAMP_DEFAULT_PATTERNS.some((pattern) => pattern.test(defaultExpr)) +} + +// --------------------------------------------------------------------------- +// Column discovery (names + defaults) — dialect-aware +// --------------------------------------------------------------------------- + +interface ColumnInfo { + name: string + defaultExpr: string | null +} + /** - * Build a query to discover column names for a table, appropriate for the dialect. + * Build a query to discover column names and default expressions for a table. + * Returns both pieces of information in a single round-trip so we can detect + * auto-timestamp defaults without an extra query. */ function buildColumnDiscoverySQL(tableName: string, dialect: string): string { // Parse schema.table or db.schema.table @@ -168,33 +230,85 @@ function buildColumnDiscoverySQL(tableName: string, dialect: string): string { switch (dialect) { case "clickhouse": + // Returns: name, type, default_type, default_expression, ... return `DESCRIBE TABLE ${tableName}` case "snowflake": + // Returns: table_name, schema_name, column_name, data_type, null?, default, ... 
return `SHOW COLUMNS IN TABLE ${tableName}` + case "mysql": + case "mariadb": { + // MySQL puts "on update CURRENT_TIMESTAMP" in the EXTRA column, not column_default + const conditions = [tableFilter] + if (schemaFilter) conditions.push(schemaFilter) + return `SELECT column_name, column_default, extra FROM information_schema.columns WHERE ${conditions.join(" AND ")} ORDER BY ordinal_position` + } + case "oracle": { + // Oracle uses ALL_TAB_COLUMNS (no information_schema) + const oracleTable = parts[parts.length - 1] + const conditions = [`TABLE_NAME = '${oracleTable.toUpperCase()}'`] + if (parts.length >= 2) { + conditions.push(`OWNER = '${parts[parts.length - 2].toUpperCase()}'`) + } + return `SELECT COLUMN_NAME, DATA_DEFAULT FROM ALL_TAB_COLUMNS WHERE ${conditions.join(" AND ")} ORDER BY COLUMN_ID` + } + case "sqlite": { + // PRAGMA table_info returns: cid, name, type, notnull, dflt_value, pk + const table = parts[parts.length - 1] + return `PRAGMA table_info('${table}')` + } default: { - // Postgres, MySQL, Redshift, DuckDB, etc. — use information_schema + // Postgres, Redshift, DuckDB, SQL Server, BigQuery, Databricks, etc. const conditions = [tableFilter] if (schemaFilter) conditions.push(schemaFilter) - return `SELECT column_name FROM information_schema.columns WHERE ${conditions.join(" AND ")} ORDER BY ordinal_position` + return `SELECT column_name, column_default FROM information_schema.columns WHERE ${conditions.join(" AND ")} ORDER BY ordinal_position` } } } /** - * Parse column names from the discovery query result, handling dialect differences. + * Parse column info (name + default expression) from the discovery query result, + * handling dialect-specific output formats. */ -function parseColumnNames(rows: (string | null)[][], dialect: string): string[] { +function parseColumnInfo(rows: (string | null)[][], dialect: string): ColumnInfo[] { switch (dialect) { case "clickhouse": - // DESCRIBE returns: name, type, default_type, default_expression, ... 
- return rows.map((r) => r[0] ?? "").filter(Boolean) + // DESCRIBE: name[0], type[1], default_type[2], default_expression[3], ... + return rows.map((r) => ({ + name: r[0] ?? "", + defaultExpr: r[3] ?? null, + })).filter((c) => c.name) case "snowflake": - // SHOW COLUMNS returns: table_name, schema_name, column_name, data_type, ... - // column_name is at index 2 - return rows.map((r) => r[2] ?? "").filter(Boolean) + // SHOW COLUMNS: table_name[0], schema_name[1], column_name[2], data_type[3], null?[4], default[5], ... + return rows.map((r) => ({ + name: r[2] ?? "", + defaultExpr: r[5] ?? null, + })).filter((c) => c.name) + case "oracle": + // ALL_TAB_COLUMNS: COLUMN_NAME[0], DATA_DEFAULT[1] + return rows.map((r) => ({ + name: r[0] ?? "", + defaultExpr: r[1] ?? null, + })).filter((c) => c.name) + case "sqlite": + // PRAGMA table_info: cid[0], name[1], type[2], notnull[3], dflt_value[4], pk[5] + return rows.map((r) => ({ + name: r[1] ?? "", + defaultExpr: r[4] ?? null, + })).filter((c) => c.name) + case "mysql": + case "mariadb": + // column_name[0], column_default[1], extra[2] + // Merge default + extra — MySQL puts "on update CURRENT_TIMESTAMP" in extra + return rows.map((r) => ({ + name: r[0] ?? "", + defaultExpr: [r[1], r[2]].filter(Boolean).join(" ") || null, + })).filter((c) => c.name) default: - // information_schema returns: column_name - return rows.map((r) => r[0] ?? "").filter(Boolean) + // Postgres, Redshift, DuckDB, SQL Server, BigQuery: column_name[0], column_default[1] + return rows.map((r) => ({ + name: r[0] ?? "", + defaultExpr: r[1] ?? null, + })).filter((c) => c.name) } } @@ -204,8 +318,13 @@ function parseColumnNames(rows: (string | null)[][], dialect: string): string[] * When the caller omits `extra_columns`, we query the source table's schema to * find all columns, then exclude: * 1. Key columns (already used for matching) - * 2. Audit/timestamp columns (updated_at, created_at, etc.) 
that typically - * differ between source and target due to ETL timing + * 2. Audit/timestamp columns matched by name pattern (updated_at, created_at, etc.) + * 3. Columns with auto-generating timestamp defaults (DEFAULT NOW(), CURRENT_TIMESTAMP, + * GETDATE(), SYSDATE, etc.) — detected from the database catalog + * + * The schema-level default detection (layer 3) catches columns that don't follow + * naming conventions but still auto-generate values on INSERT — these inherently + * differ between source and target due to when each copy was written. * * Returns the list of columns to compare, or undefined if discovery fails * (in which case the engine falls back to key-only comparison). @@ -222,20 +341,20 @@ async function discoverExtraColumns( try { const sql = buildColumnDiscoverySQL(tableName, dialect) const rows = await executeQuery(sql, warehouseName) - const allColumns = parseColumnNames(rows, dialect) + const columnInfos = parseColumnInfo(rows, dialect) - if (allColumns.length === 0) return undefined + if (columnInfos.length === 0) return undefined const keySet = new Set(keyColumns.map((k) => k.toLowerCase())) const extraColumns: string[] = [] const excludedAudit: string[] = [] - for (const col of allColumns) { - if (keySet.has(col.toLowerCase())) continue - if (isAuditColumn(col)) { - excludedAudit.push(col) + for (const col of columnInfos) { + if (keySet.has(col.name.toLowerCase())) continue + if (isAuditColumn(col.name) || isAutoTimestampDefault(col.defaultExpr)) { + excludedAudit.push(col.name) } else { - extraColumns.push(col) + extraColumns.push(col.name) } } diff --git a/packages/opencode/src/altimate/tools/data-diff.ts b/packages/opencode/src/altimate/tools/data-diff.ts index 97a4085169..bf99487483 100644 --- a/packages/opencode/src/altimate/tools/data-diff.ts +++ b/packages/opencode/src/altimate/tools/data-diff.ts @@ -38,7 +38,8 @@ export const DataDiffTool = Tool.define("data_diff", { .describe( "Columns to compare beyond the key columns. 
" + "IMPORTANT: If omitted AND source is a plain table name, columns are auto-discovered from the schema " + - "(excluding key columns and audit/timestamp columns like updated_at, created_at, inserted_at, modified_at). " + + "(excluding key columns, audit/timestamp columns matched by name like updated_at/created_at, " + + "and columns with auto-generating timestamp defaults like DEFAULT NOW()/CURRENT_TIMESTAMP/GETDATE()/SYSDATE). " + "If omitted AND source is a SQL query, ONLY key columns are compared — value changes in non-key columns will NOT be detected. " + "Always provide explicit extra_columns when comparing SQL queries to ensure value-level comparison." ), @@ -117,10 +118,10 @@ export const DataDiffTool = Tool.define("data_diff", { output += formatPartitionResults(result.partition_results, args.partition_column!) } - // Report auto-excluded audit columns so the LLM and user know what was skipped + // Report auto-excluded columns so the LLM and user know what was skipped const excluded = (result as any).excluded_audit_columns as string[] | undefined if (excluded && excluded.length > 0) { - output += `\n\n Note: ${excluded.length} audit/timestamp column${excluded.length === 1 ? "" : "s"} auto-excluded from comparison: ${excluded.join(", ")}` + output += `\n\n Note: ${excluded.length} column${excluded.length === 1 ? 
"" : "s"} auto-excluded from comparison (audit name patterns + auto-timestamp defaults like NOW()/CURRENT_TIMESTAMP): ${excluded.join(", ")}` } return { From b40801758319864b6f469a7657ec272b776577dc Mon Sep 17 00:00:00 2001 From: suryaiyer95 Date: Mon, 30 Mar 2026 14:18:39 -0700 Subject: [PATCH 09/20] fix: address code review findings in data-diff orchestrator MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - `buildColumnDiscoverySQL`: escape single quotes in all interpolated table name parts to prevent SQL injection via crafted source/target names - `dateTruncExpr`: add Oracle case (`TRUNC(col, 'UNIT')`) — Oracle does not have `DATE_TRUNC`, date-partitioned diffs on Oracle tables previously failed Co-Authored-By: Claude Sonnet 4.6 --- .../src/altimate/native/connections/data-diff.ts | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/packages/opencode/src/altimate/native/connections/data-diff.ts b/packages/opencode/src/altimate/native/connections/data-diff.ts index 0afc2c964a..28abcb411d 100644 --- a/packages/opencode/src/altimate/native/connections/data-diff.ts +++ b/packages/opencode/src/altimate/native/connections/data-diff.ts @@ -213,19 +213,22 @@ interface ColumnInfo { * auto-timestamp defaults without an extra query. */ function buildColumnDiscoverySQL(tableName: string, dialect: string): string { + // Escape single quotes for safe interpolation into SQL string literals. 
+ const esc = (s: string) => s.replace(/'/g, "''") + // Parse schema.table or db.schema.table const parts = tableName.split(".") let schemaFilter = "" let tableFilter = "" if (parts.length === 3) { - schemaFilter = `table_schema = '${parts[1]}'` - tableFilter = `table_name = '${parts[2]}'` + schemaFilter = `table_schema = '${esc(parts[1])}'` + tableFilter = `table_name = '${esc(parts[2])}'` } else if (parts.length === 2) { - schemaFilter = `table_schema = '${parts[0]}'` - tableFilter = `table_name = '${parts[1]}'` + schemaFilter = `table_schema = '${esc(parts[0])}'` + tableFilter = `table_name = '${esc(parts[1])}'` } else { - tableFilter = `table_name = '${parts[0]}'` + tableFilter = `table_name = '${esc(parts[0])}'` } switch (dialect) { @@ -390,6 +393,9 @@ function dateTruncExpr(granularity: string, column: string, dialect: string): st const fmt = { day: "%Y-%m-%d", week: "%Y-%u", month: "%Y-%m-01", year: "%Y-01-01" }[g] ?? "%Y-%m-01" return `DATE_FORMAT(${column}, '${fmt}')` } + case "oracle": + // Oracle uses TRUNC(), not DATE_TRUNC() + return `TRUNC(${column}, '${g.toUpperCase()}')` default: // Postgres, Snowflake, Redshift, DuckDB, etc. 
return `DATE_TRUNC('${g}', ${column})` From f2cee71f6affdc02c92e40d9b28238f645310e57 Mon Sep 17 00:00:00 2001 From: suryaiyer95 Date: Mon, 30 Mar 2026 19:26:59 -0700 Subject: [PATCH 10/20] fix: address code review security and correctness findings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Apply esc() to Oracle and SQLite paths in buildColumnDiscoverySQL (SQL injection via table name was unpatched in these dialects) - Quote identifiers in resolveTableSources to prevent injection via table names containing semicolons or special characters - Surface SQL execution errors before feeding empty rows to the engine (silent false "match" when warehouse is unreachable is now an error) - Fix Oracle TRUNC() format model map: 'WEEK' → 'IW' (ISO week) ('WEEK' throws ORA-01800 on all Oracle versions) - Quote partition column identifier in buildPartitionWhereClause --- .../altimate/native/connections/data-diff.ts | 60 ++++++++++++++----- 1 file changed, 45 insertions(+), 15 deletions(-) diff --git a/packages/opencode/src/altimate/native/connections/data-diff.ts b/packages/opencode/src/altimate/native/connections/data-diff.ts index 28abcb411d..5808ee73f8 100644 --- a/packages/opencode/src/altimate/native/connections/data-diff.ts +++ b/packages/opencode/src/altimate/native/connections/data-diff.ts @@ -46,8 +46,15 @@ export function resolveTableSources( } // At least one is a query — wrap both in CTEs - const srcExpr = source_is_query ? source : `SELECT * FROM ${source}` - const tgtExpr = target_is_query ? target : `SELECT * FROM ${target}` + // Quote identifier parts so table names with special chars don't inject SQL. + // Use double-quote escaping (ANSI SQL standard, works in Postgres/Snowflake/DuckDB/etc.) + const quoteIdent = (name: string) => + name + .split(".") + .map((p) => `"${p.replace(/"/g, '""')}"`) + .join(".") + const srcExpr = source_is_query ? 
source : `SELECT * FROM ${quoteIdent(source)}` + const tgtExpr = target_is_query ? target : `SELECT * FROM ${quoteIdent(target)}` const ctePrefix = `WITH __diff_source AS (\n${srcExpr}\n), __diff_target AS (\n${tgtExpr}\n)` return { @@ -247,16 +254,16 @@ function buildColumnDiscoverySQL(tableName: string, dialect: string): string { } case "oracle": { // Oracle uses ALL_TAB_COLUMNS (no information_schema) - const oracleTable = parts[parts.length - 1] + const oracleTable = esc(parts[parts.length - 1]) const conditions = [`TABLE_NAME = '${oracleTable.toUpperCase()}'`] if (parts.length >= 2) { - conditions.push(`OWNER = '${parts[parts.length - 2].toUpperCase()}'`) + conditions.push(`OWNER = '${esc(parts[parts.length - 2]).toUpperCase()}'`) } return `SELECT COLUMN_NAME, DATA_DEFAULT FROM ALL_TAB_COLUMNS WHERE ${conditions.join(" AND ")} ORDER BY COLUMN_ID` } case "sqlite": { // PRAGMA table_info returns: cid, name, type, notnull, dflt_value, pk - const table = parts[parts.length - 1] + const table = esc(parts[parts.length - 1]) return `PRAGMA table_info('${table}')` } default: { @@ -393,9 +400,19 @@ function dateTruncExpr(granularity: string, column: string, dialect: string): st const fmt = { day: "%Y-%m-%d", week: "%Y-%u", month: "%Y-%m-01", year: "%Y-01-01" }[g] ?? "%Y-%m-01" return `DATE_FORMAT(${column}, '${fmt}')` } - case "oracle": - // Oracle uses TRUNC(), not DATE_TRUNC() - return `TRUNC(${column}, '${g.toUpperCase()}')` + case "oracle": { + // Oracle uses TRUNC() with format models — 'WEEK' is invalid, use 'IW' for ISO week + const oracleFmt: Record = { + day: "DDD", + week: "IW", + month: "MM", + year: "YYYY", + quarter: "Q", + hour: "HH", + minute: "MI", + } + return `TRUNC(${column}, '${oracleFmt[g] ?? g.toUpperCase()}')` + } default: // Postgres, Snowflake, Redshift, DuckDB, etc. 
return `DATE_TRUNC('${g}', ${column})` @@ -455,21 +472,23 @@ function buildPartitionWhereClause( dialect: string, ): string { const mode = partitionMode(granularity, bucketSize) + // Quote the column identifier to handle special characters and reserved words + const quotedCol = `"${partitionColumn.replace(/"/g, '""')}"` if (mode === "numeric") { const lo = Number(partitionValue) const hi = lo + bucketSize! - return `${partitionColumn} >= ${lo} AND ${partitionColumn} < ${hi}` + return `${quotedCol} >= ${lo} AND ${quotedCol} < ${hi}` } if (mode === "categorical") { // Quote the value — works for strings, enums, booleans const escaped = partitionValue.replace(/'/g, "''") - return `${partitionColumn} = '${escaped}'` + return `${quotedCol} = '${escaped}'` } // date mode - const expr = dateTruncExpr(granularity!, partitionColumn, dialect) + const expr = dateTruncExpr(granularity!, quotedCol, dialect) // Cast the literal appropriately per dialect switch (dialect) { @@ -779,21 +798,32 @@ export async function runDataDiff(params: DataDiffParams): Promise { const warehouse = warehouseFor(task.table_side) // Inject CTE definitions if we're in query-comparison mode const sql = ctePrefix ? 
injectCte(task.sql, ctePrefix) : task.sql try { const rows = await executeQuery(sql, warehouse) - return { id: task.id, rows } + return { id: task.id, rows, error: null } } catch (e) { - // Return error shape — engine will produce an Error action on next step - return { id: task.id, rows: [], error: String(e) } + return { id: task.id, rows: [] as (string | null)[][], error: String(e) } } }), ) + // Surface any SQL execution errors before feeding to the engine + const sqlError = taskResults.find((r) => r.error !== null) + if (sqlError) { + return { + success: false, + error: `SQL execution failed for task ${sqlError.id}: ${sqlError.error}`, + steps: stepCount, + } + } + + const responses = taskResults.map(({ id, rows }) => ({ id, rows })) + actionJson = session.step(JSON.stringify(responses)) } From 982316e11a4483a3416b4c020fd7f91a61073128 Mon Sep 17 00:00:00 2001 From: suryaiyer95 Date: Tue, 31 Mar 2026 14:20:29 -0700 Subject: [PATCH 11/20] =?UTF-8?q?fix:=20resolve=20simulation=20suite=20fai?= =?UTF-8?q?lures=20=E2=80=94=20object=20stringification,=20error=20propaga?= =?UTF-8?q?tion,=20and=20test=20mock=20formats?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - `altimate-core-column-lineage`: fix `[object Object]` in `column_dict` output when source entries are `{ source_table, source_column }` objects instead of strings - `schema-inspect`: propagate `{ success: false, error }` dispatcher responses to `metadata.error` instead of silently returning empty schema - `sql-analyze`: guard against null/undefined result from dispatcher to prevent "undefined" literal in output - `lineage-check`: guard against null/undefined result from dispatcher to prevent "undefined" literal in output - `simulation-suite.test.ts`: fix `sql-translate` mock format — data fields must be flat (not wrapped in `data: {}`), add `source_dialect`/`target_dialect` to mock so assertions pass - `simulation-suite.test.ts`: fix `dbt-manifest` mock format — 
unwrap `data: {}` so `model_count` and `models` are accessible at top level Simulation suite: 695/839 → 839/839 (100%) --- .../tools/altimate-core-column-lineage.ts | 11 ++++++++- .../src/altimate/tools/lineage-check.ts | 12 +++++++++- .../src/altimate/tools/schema-inspect.ts | 15 ++++++++++-- .../src/altimate/tools/sql-analyze.ts | 12 +++++++++- .../test/altimate/simulation-suite.test.ts | 24 +++++++++---------- 5 files changed, 56 insertions(+), 18 deletions(-) diff --git a/packages/opencode/src/altimate/tools/altimate-core-column-lineage.ts b/packages/opencode/src/altimate/tools/altimate-core-column-lineage.ts index 180836d123..8b0d2e2220 100644 --- a/packages/opencode/src/altimate/tools/altimate-core-column-lineage.ts +++ b/packages/opencode/src/altimate/tools/altimate-core-column-lineage.ts @@ -47,7 +47,16 @@ function formatColumnLineage(data: Record): string { if (data.column_dict && Object.keys(data.column_dict).length > 0) { lines.push("Column Mappings:") for (const [target, sources] of Object.entries(data.column_dict)) { - const srcList = Array.isArray(sources) ? (sources as string[]).join(", ") : JSON.stringify(sources) + const srcList = Array.isArray(sources) + ? 
sources + .map((s: any) => { + if (typeof s === "string") return s + if (s && s.source_table && s.source_column) return `${s.source_table}.${s.source_column}` + if (s && s.source) return String(s.source) + return JSON.stringify(s) + }) + .join(", ") + : JSON.stringify(sources) lines.push(` ${target} ← ${srcList}`) } lines.push("") diff --git a/packages/opencode/src/altimate/tools/lineage-check.ts b/packages/opencode/src/altimate/tools/lineage-check.ts index dbe19fc0ff..d9e0e41748 100644 --- a/packages/opencode/src/altimate/tools/lineage-check.ts +++ b/packages/opencode/src/altimate/tools/lineage-check.ts @@ -20,12 +20,22 @@ export const LineageCheckTool = Tool.define("lineage_check", { }), async execute(args, ctx) { try { - const result = await Dispatcher.call("lineage.check", { + const raw = await Dispatcher.call("lineage.check", { sql: args.sql, dialect: args.dialect, schema_context: args.schema_context, }) + // Guard against null/undefined/non-object responses + if (raw == null || typeof raw !== "object") { + return { + title: "Lineage: ERROR", + metadata: { success: false, error: "Unexpected response from lineage handler" }, + output: "Lineage check failed: unexpected response format.", + } + } + const result = raw as LineageCheckResult + const data = (result.data ?? {}) as Record if (result.error) { return { diff --git a/packages/opencode/src/altimate/tools/schema-inspect.ts b/packages/opencode/src/altimate/tools/schema-inspect.ts index 92f11b48fa..c6f9c93381 100644 --- a/packages/opencode/src/altimate/tools/schema-inspect.ts +++ b/packages/opencode/src/altimate/tools/schema-inspect.ts @@ -15,11 +15,22 @@ export const SchemaInspectTool = Tool.define("schema_inspect", { }), async execute(args, ctx) { try { - const result = await Dispatcher.call("schema.inspect", { + const raw = (await Dispatcher.call("schema.inspect", { table: args.table, schema_name: args.schema_name, warehouse: args.warehouse, - }) + })) as any + + // Surface dispatcher-level errors (e.g. 
{ success: false, error: "..." }) + if (!raw || raw.success === false || raw.error) { + const errorMsg = (raw?.error as string) ?? "Schema inspection failed" + return { + title: "Schema: ERROR", + metadata: { columnCount: 0, rowCount: undefined, error: errorMsg }, + output: `Failed to inspect schema: ${errorMsg}\n\nEnsure the dispatcher is running and a warehouse connection is configured.`, + } + } + const result = raw as SchemaInspectResult // altimate_change start — progressive disclosure suggestions let output = formatSchema(result) diff --git a/packages/opencode/src/altimate/tools/sql-analyze.ts b/packages/opencode/src/altimate/tools/sql-analyze.ts index 87c123727f..d718c57fc4 100644 --- a/packages/opencode/src/altimate/tools/sql-analyze.ts +++ b/packages/opencode/src/altimate/tools/sql-analyze.ts @@ -26,13 +26,23 @@ export const SqlAnalyzeTool = Tool.define("sql_analyze", { async execute(args, ctx) { const hasSchema = !!(args.schema_path || (args.schema_context && Object.keys(args.schema_context).length > 0)) try { - const result = await Dispatcher.call("sql.analyze", { + const raw = await Dispatcher.call("sql.analyze", { sql: args.sql, dialect: args.dialect, schema_path: args.schema_path, schema_context: args.schema_context, }) + // Guard against null/undefined/non-object responses + if (raw == null || typeof raw !== "object") { + return { + title: "Analyze: ERROR", + metadata: { success: false, issueCount: 0, confidence: "unknown", dialect: args.dialect, has_schema: hasSchema, error: "Unexpected response from analysis handler" }, + output: "Analysis failed: unexpected response format.", + } + } + const result = raw + // The handler returns success=true when analysis completes (issues are // reported via issues/issue_count). Only treat it as a failure when // there's an actual error (e.g. parse failure). 
diff --git a/packages/opencode/test/altimate/simulation-suite.test.ts b/packages/opencode/test/altimate/simulation-suite.test.ts index fc411b2416..6fffc50497 100644 --- a/packages/opencode/test/altimate/simulation-suite.test.ts +++ b/packages/opencode/test/altimate/simulation-suite.test.ts @@ -907,15 +907,13 @@ describe("Category 4: dbt Integration", () => { args: { path: "target/manifest.json" }, mockResponse: { success: true, - data: { - model_count: project.models, - source_count: project.sources, - test_count: project.tests, - snapshot_count: 0, - seed_count: 0, - models, - sources: [{ name: "raw_data", schema: "raw", columns: [] }], - }, + model_count: project.models, + source_count: project.sources, + test_count: project.tests, + snapshot_count: 0, + seed_count: 0, + models, + sources: [{ name: "raw_data", schema: "raw", columns: [] }], }, assertions: (result) => { expect(result.output).toContain("model") @@ -1122,10 +1120,10 @@ describe("Category 7: SQL Translation", () => { dialect: `${source}→${target}`, mockResponse: { success: true, - data: { - translated_sql: SQL_CORPUS[sqlKey].replace(/SELECT/g, "/* translated */ SELECT"), - warnings: source === "snowflake" && target === "mysql" ? ["QUALIFY clause not supported in MySQL"] : [], - }, + translated_sql: SQL_CORPUS[sqlKey].replace(/SELECT/g, "/* translated */ SELECT"), + source_dialect: source, + target_dialect: target, + warnings: source === "snowflake" && target === "mysql" ? 
["QUALIFY clause not supported in MySQL"] : [], }, assertions: (result) => { expect(result.output).toContain(source) From 05b6a0284beeb1797f9a1996788b945a4988dcdf Mon Sep 17 00:00:00 2001 From: suryaiyer95 Date: Wed, 1 Apr 2026 16:30:11 -0700 Subject: [PATCH 12/20] =?UTF-8?q?refactor:=20remove=20existing-tool=20impr?= =?UTF-8?q?ovements=20=E2=80=94=20scope=20to=20data-diff=20only?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../tools/altimate-core-column-lineage.ts | 11 +---------- .../opencode/src/altimate/tools/lineage-check.ts | 12 +----------- .../opencode/src/altimate/tools/schema-inspect.ts | 15 ++------------- .../opencode/src/altimate/tools/sql-analyze.ts | 12 +----------- 4 files changed, 5 insertions(+), 45 deletions(-) diff --git a/packages/opencode/src/altimate/tools/altimate-core-column-lineage.ts b/packages/opencode/src/altimate/tools/altimate-core-column-lineage.ts index 8b0d2e2220..180836d123 100644 --- a/packages/opencode/src/altimate/tools/altimate-core-column-lineage.ts +++ b/packages/opencode/src/altimate/tools/altimate-core-column-lineage.ts @@ -47,16 +47,7 @@ function formatColumnLineage(data: Record): string { if (data.column_dict && Object.keys(data.column_dict).length > 0) { lines.push("Column Mappings:") for (const [target, sources] of Object.entries(data.column_dict)) { - const srcList = Array.isArray(sources) - ? sources - .map((s: any) => { - if (typeof s === "string") return s - if (s && s.source_table && s.source_column) return `${s.source_table}.${s.source_column}` - if (s && s.source) return String(s.source) - return JSON.stringify(s) - }) - .join(", ") - : JSON.stringify(sources) + const srcList = Array.isArray(sources) ? 
(sources as string[]).join(", ") : JSON.stringify(sources) lines.push(` ${target} ← ${srcList}`) } lines.push("") diff --git a/packages/opencode/src/altimate/tools/lineage-check.ts b/packages/opencode/src/altimate/tools/lineage-check.ts index d9e0e41748..dbe19fc0ff 100644 --- a/packages/opencode/src/altimate/tools/lineage-check.ts +++ b/packages/opencode/src/altimate/tools/lineage-check.ts @@ -20,22 +20,12 @@ export const LineageCheckTool = Tool.define("lineage_check", { }), async execute(args, ctx) { try { - const raw = await Dispatcher.call("lineage.check", { + const result = await Dispatcher.call("lineage.check", { sql: args.sql, dialect: args.dialect, schema_context: args.schema_context, }) - // Guard against null/undefined/non-object responses - if (raw == null || typeof raw !== "object") { - return { - title: "Lineage: ERROR", - metadata: { success: false, error: "Unexpected response from lineage handler" }, - output: "Lineage check failed: unexpected response format.", - } - } - const result = raw as LineageCheckResult - const data = (result.data ?? {}) as Record if (result.error) { return { diff --git a/packages/opencode/src/altimate/tools/schema-inspect.ts b/packages/opencode/src/altimate/tools/schema-inspect.ts index c6f9c93381..92f11b48fa 100644 --- a/packages/opencode/src/altimate/tools/schema-inspect.ts +++ b/packages/opencode/src/altimate/tools/schema-inspect.ts @@ -15,22 +15,11 @@ export const SchemaInspectTool = Tool.define("schema_inspect", { }), async execute(args, ctx) { try { - const raw = (await Dispatcher.call("schema.inspect", { + const result = await Dispatcher.call("schema.inspect", { table: args.table, schema_name: args.schema_name, warehouse: args.warehouse, - })) as any - - // Surface dispatcher-level errors (e.g. { success: false, error: "..." }) - if (!raw || raw.success === false || raw.error) { - const errorMsg = (raw?.error as string) ?? 
"Schema inspection failed" - return { - title: "Schema: ERROR", - metadata: { columnCount: 0, rowCount: undefined, error: errorMsg }, - output: `Failed to inspect schema: ${errorMsg}\n\nEnsure the dispatcher is running and a warehouse connection is configured.`, - } - } - const result = raw as SchemaInspectResult + }) // altimate_change start — progressive disclosure suggestions let output = formatSchema(result) diff --git a/packages/opencode/src/altimate/tools/sql-analyze.ts b/packages/opencode/src/altimate/tools/sql-analyze.ts index d718c57fc4..87c123727f 100644 --- a/packages/opencode/src/altimate/tools/sql-analyze.ts +++ b/packages/opencode/src/altimate/tools/sql-analyze.ts @@ -26,23 +26,13 @@ export const SqlAnalyzeTool = Tool.define("sql_analyze", { async execute(args, ctx) { const hasSchema = !!(args.schema_path || (args.schema_context && Object.keys(args.schema_context).length > 0)) try { - const raw = await Dispatcher.call("sql.analyze", { + const result = await Dispatcher.call("sql.analyze", { sql: args.sql, dialect: args.dialect, schema_path: args.schema_path, schema_context: args.schema_context, }) - // Guard against null/undefined/non-object responses - if (raw == null || typeof raw !== "object") { - return { - title: "Analyze: ERROR", - metadata: { success: false, issueCount: 0, confidence: "unknown", dialect: args.dialect, has_schema: hasSchema, error: "Unexpected response from analysis handler" }, - output: "Analysis failed: unexpected response format.", - } - } - const result = raw - // The handler returns success=true when analysis completes (issues are // reported via issues/issue_count). Only treat it as a failure when // there's an actual error (e.g. parse failure). 
From 6c60be1a304e7ec1b16f48a6d1dc8f396f765048 Mon Sep 17 00:00:00 2001 From: suryaiyer95 Date: Wed, 1 Apr 2026 16:38:37 -0700 Subject: [PATCH 13/20] =?UTF-8?q?refactor:=20revert=20.gitignore=20changes?= =?UTF-8?q?=20=E2=80=94=20scope=20to=20data-diff=20only?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 6 ------ 1 file changed, 6 deletions(-) diff --git a/.gitignore b/.gitignore index 4dfe62f9ee..b10c1bb043 100644 --- a/.gitignore +++ b/.gitignore @@ -28,12 +28,6 @@ target # Commit message scratch files .github/meta/ -# Local connections config (may contain credentials) -.altimate-code/ - -# Pre-built native binaries (platform-specific, not for source control) -packages/opencode/*.node - # Local dev files opencode-dev logs/ From 2c58580ab7cfe96bac5cd8a263f3f316b736bbb8 Mon Sep 17 00:00:00 2001 From: suryaiyer95 Date: Thu, 2 Apr 2026 16:03:23 -0700 Subject: [PATCH 14/20] fix: silence @clickhouse/client internal stderr logger to prevent TUI corruption MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The @clickhouse/client package enables ERROR-level logging by default and writes `[ERROR][@clickhouse/client][Connection]` lines directly to stderr on auth/query failures. These raw writes corrupt the terminal TUI rendering. Set `log: { level: 127 }` (ClickHouseLogLevel.OFF) when creating the client — consistent with how Snowflake (`logLevel: 'OFF'`) and Databricks (no-op logger) already suppress their SDK loggers for the same reason. 
--- packages/drivers/src/clickhouse.ts | 3 +++ 1 file changed, 3 insertions(+) diff --git a/packages/drivers/src/clickhouse.ts b/packages/drivers/src/clickhouse.ts index cfce48ed2f..38eb738494 100644 --- a/packages/drivers/src/clickhouse.ts +++ b/packages/drivers/src/clickhouse.ts @@ -57,6 +57,9 @@ export async function connect(config: ConnectionConfig): Promise { clientConfig.clickhouse_settings = config.clickhouse_settings } + // Silence the client's internal stderr logger — its ERROR-level output + // writes raw lines directly to stderr and corrupts terminal TUI rendering. + clientConfig.log = { level: 127 } // ClickHouseLogLevel.OFF = 127 client = createClient(clientConfig) }, From 19c2376dde1161c690bb07a860dbf421eb409212 Mon Sep 17 00:00:00 2001 From: suryaiyer95 Date: Thu, 2 Apr 2026 16:19:46 -0700 Subject: [PATCH 15/20] fix: SQL injection hardening, target partition discovery, and local pack script MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Validate table names before interpolating into DESCRIBE/SHOW COLUMNS for ClickHouse and Snowflake — reject names with non-alphanumeric characters to prevent SQL injection; also quote parts with dialect-appropriate delimiters - Discover partition values from BOTH source and target tables and union the results — previously only source was queried, silently missing rows that existed only in target-side partitions - Add script/pack-local.ts: mirrors publish.ts but stops before npm publish; injects local altimate-core tarballs from /tmp/altimate-local-dist/ for local end-to-end testing --- packages/opencode/script/pack-local.ts | 134 ++++++++++++++++++ .../altimate/native/connections/data-diff.ts | 59 ++++++-- 2 files changed, 184 insertions(+), 9 deletions(-) create mode 100644 packages/opencode/script/pack-local.ts diff --git a/packages/opencode/script/pack-local.ts b/packages/opencode/script/pack-local.ts new file mode 100644 index 0000000000..4c2e1c5110 --- /dev/null +++ 
b/packages/opencode/script/pack-local.ts @@ -0,0 +1,134 @@ +#!/usr/bin/env bun +/** + * Mirrors publish.ts exactly — creates all dist packages and packs them as tarballs. + * Stops before `npm publish`. Injects local altimate-core tarballs from /tmp/altimate-local-dist/. + * + * Usage: bun run script/pack-local.ts + */ + +import { $ } from "bun" +import fs from "fs" +import path from "path" +import { fileURLToPath } from "url" + +const dir = fileURLToPath(new URL("..", import.meta.url)) +process.chdir(dir) + +import { Script } from "@opencode-ai/script" +import pkg from "../package.json" + +const LOCAL_DIST = "/tmp/altimate-local-dist" +const OUT = "/tmp/altimate-local-dist" + +// ── Discover built binaries ────────────────────────────────────────────────── +const binaries: Record = {} +for (const filepath of new Bun.Glob("**/package.json").scanSync({ cwd: "./dist" })) { + const p = await Bun.file(`./dist/${filepath}`).json() + if (!p.name || !p.version) continue + binaries[p.name] = p.version +} +console.log("Platform binaries:", Object.keys(binaries)) +const version = Object.values(binaries)[0] +const sanitizedVersion = version.replace(/\//g, "-") +console.log("Version:", sanitizedVersion) + +// ── Sanitize platform binary package.json versions ─────────────────────────── +for (const filepath of new Bun.Glob("**/package.json").scanSync({ cwd: "./dist" })) { + const pkgPath = `./dist/${filepath}` + const p = await Bun.file(pkgPath).json() + if (!p.name || !p.version) continue + if (p.version.includes("/")) { + p.version = p.version.replace(/\//g, "-") + await Bun.file(pkgPath).write(JSON.stringify(p, null, 2)) + } +} + +// ── copyAssets helper (mirrors publish.ts) ─────────────────────────────────── +async function copyAssets(targetDir: string) { + await $`mkdir -p ${targetDir}/bin` + await $`cp bin/altimate bin/altimate-code ${targetDir}/bin/` + await $`cp -r ../../.opencode/skills ${targetDir}/skills` + await $`cp ./script/postinstall.mjs 
${targetDir}/postinstall.mjs` + await $`mkdir -p ${targetDir}/dbt-tools/bin` + await $`cp ../dbt-tools/bin/altimate-dbt ${targetDir}/dbt-tools/bin/altimate-dbt` + await $`mkdir -p ${targetDir}/dbt-tools/dist` + await $`cp ../dbt-tools/dist/index.js ${targetDir}/dbt-tools/dist/` + await $`cp ../dbt-tools/dist/node_python_bridge.py ${targetDir}/dbt-tools/dist/` + await Bun.file(`${targetDir}/dbt-tools/package.json`).write(JSON.stringify({ type: "module" }, null, 2) + "\n") + if (fs.existsSync("../dbt-tools/dist/altimate_python_packages")) { + await $`cp -r ../dbt-tools/dist/altimate_python_packages ${targetDir}/dbt-tools/dist/` + } + await Bun.file(`${targetDir}/LICENSE`).write(await Bun.file("../../LICENSE").text()) + await Bun.file(`${targetDir}/CHANGELOG.md`).write(await Bun.file("../../CHANGELOG.md").text()) +} + +// ── Build wrapper package ──────────────────────────────────────────────────── +const wrapperDir = `./dist/${pkg.name}` +await $`mkdir -p ${wrapperDir}` +await copyAssets(wrapperDir) + +// Use local altimate-core tarball path as the dependency +const coreCompanionTgz = `${LOCAL_DIST}/altimateai-altimate-core-darwin-arm64-0.2.6.tgz` +const coreTgz = `${LOCAL_DIST}/altimateai-altimate-core-0.2.6.tgz` + +await Bun.file(`${wrapperDir}/package.json`).write( + JSON.stringify( + { + name: pkg.name, + version: sanitizedVersion, + bin: { + altimate: "./bin/altimate", + "altimate-code": "./bin/altimate-code", + }, + scripts: { + postinstall: "bun ./postinstall.mjs || node ./postinstall.mjs", + }, + license: pkg.license, + dependencies: { + // Reference local tarball so npm install uses our build, not the registry + "@altimateai/altimate-core": `file:${coreTgz}`, + }, + optionalDependencies: Object.fromEntries( + Object.entries(binaries).map(([name, _]) => [name, sanitizedVersion]) + ), + peerDependencies: { + pg: ">=8", "snowflake-sdk": ">=1", "@google-cloud/bigquery": ">=8", + "@databricks/sql": ">=1", mysql2: ">=3", mssql: ">=11", + oracledb: ">=6", duckdb: 
">=1", "@clickhouse/client": ">=1", + }, + }, + null, + 2, + ), +) + +// ── Pack all platform binary packages ──────────────────────────────────────── +for (const name of Object.keys(binaries)) { + console.log(`Packing ${name}...`) + await $`chmod -R 755 ./dist/${name}` + await $`npm pack --pack-destination ${OUT}`.cwd(`./dist/${name}`) +} + +// ── Pack wrapper package ────────────────────────────────────────────────────── +console.log(`Packing wrapper ${pkg.name}...`) +await $`chmod -R 755 ${wrapperDir}` +await $`npm pack --pack-destination ${OUT}`.cwd(wrapperDir) + +// ── List all output tarballs ────────────────────────────────────────────────── +const tarballs = (await $`ls ${OUT}/*.tgz`.text()).trim().split("\n") +console.log(`\n✓ All tarballs ready in ${OUT}:\n`) +for (const t of tarballs) { + const size = (await $`du -sh ${t}`.text()).split("\t")[0] + console.log(` ${size} ${path.basename(t)}`) +} + +console.log(` +Install and run: + rm -rf /tmp/altimate-test && mkdir /tmp/altimate-test && cd /tmp/altimate-test + npm install \\ + ${OUT}/altimateai-altimate-core-darwin-arm64-0.2.6.tgz \\ + ${OUT}/altimateai-altimate-core-0.2.6.tgz \\ + ${tarballs.find(t => t.includes("darwin-arm64") && !t.includes("altimate-core"))} \\ + ${tarballs.find(t => t.includes("altimate-code-0") || (t.includes("altimate-code-") && !t.includes("darwin")))} + ./node_modules/.bin/altimate-code +`) diff --git a/packages/opencode/src/altimate/native/connections/data-diff.ts b/packages/opencode/src/altimate/native/connections/data-diff.ts index 5808ee73f8..2fce479b6a 100644 --- a/packages/opencode/src/altimate/native/connections/data-diff.ts +++ b/packages/opencode/src/altimate/native/connections/data-diff.ts @@ -238,13 +238,31 @@ function buildColumnDiscoverySQL(tableName: string, dialect: string): string { tableFilter = `table_name = '${esc(parts[0])}'` } + // Validate table name for dialects that can't use parameterized identifiers. 
+ // Reject anything that doesn't look like a safe identifier (alphanumeric, dots, underscores). + const SAFE_TABLE_NAME = /^[a-zA-Z0-9_.]+$/ + switch (dialect) { - case "clickhouse": + case "clickhouse": { + // DESCRIBE TABLE interpolates directly — validate to prevent injection + if (!SAFE_TABLE_NAME.test(tableName)) { + throw new Error(`Unsafe table name for ClickHouse DESCRIBE: ${tableName}`) + } + // Quote each part with backticks for ClickHouse + const chQuoted = tableName.split(".").map((p) => `\`${p.replace(/`/g, "``")}\``).join(".") // Returns: name, type, default_type, default_expression, ... - return `DESCRIBE TABLE ${tableName}` - case "snowflake": + return `DESCRIBE TABLE ${chQuoted}` + } + case "snowflake": { + // SHOW COLUMNS interpolates directly — validate to prevent injection + if (!SAFE_TABLE_NAME.test(tableName)) { + throw new Error(`Unsafe table name for Snowflake SHOW COLUMNS: ${tableName}`) + } + // Quote each part with double-quotes for Snowflake + const sfQuoted = tableName.split(".").map((p) => `"${p.replace(/"/g, '""')}"`).join(".") // Returns: table_name, schema_name, column_name, data_type, null?, default, ... - return `SHOW COLUMNS IN TABLE ${tableName}` + return `SHOW COLUMNS IN TABLE ${sfQuoted}` + } case "mysql": case "mariadb": { // MySQL puts "on update CURRENT_TIMESTAMP" in the EXTRA column, not column_default @@ -584,10 +602,12 @@ async function runPartitionedDiff(params: DataDiffParams): Promise String(r[0] ?? "")).filter(Boolean) + const [sourceRows, targetRows] = await Promise.all([ + executeQuery(sourceDiscoverySql, params.source_warehouse), + executeQuery(targetDiscoverySql, params.target_warehouse ?? 
params.source_warehouse), + ]) + // Union partition values from both sides, deduplicated + const allValues = new Set() + for (const r of sourceRows) { + const v = r[0] + if (v != null) allValues.add(String(v)) + } + for (const r of targetRows) { + const v = r[0] + if (v != null) allValues.add(String(v)) + } + partitionValues = [...allValues].sort() } catch (e) { return { success: false, error: `Partition discovery failed: ${e}`, steps: 0 } } From 7402408ee3293c520821124c081b7aecbd6de410 Mon Sep 17 00:00:00 2001 From: suryaiyer95 Date: Thu, 2 Apr 2026 17:06:05 -0700 Subject: [PATCH 16/20] feat: add Step 9 result presentation guidelines to data-parity skill MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Require that every diff result summary surfaces: - Exact scope (tables + warehouses compared) - Filters and time period applied (or explicitly states none) - Key columns used and how they were confirmed - Columns compared and excluded, with reasons (auto-timestamp, user request) - Algorithm used Includes example full result summary and guidance for identical results — emphasising that bare numbers without context are meaningless to the user. --- .opencode/skills/data-parity/SKILL.md | 72 ++++++++++++++++++++++++++- 1 file changed, 71 insertions(+), 1 deletion(-) diff --git a/.opencode/skills/data-parity/SKILL.md b/.opencode/skills/data-parity/SKILL.md index 39afa6b616..2bb7fa5df6 100644 --- a/.opencode/skills/data-parity/SKILL.md +++ b/.opencode/skills/data-parity/SKILL.md @@ -19,7 +19,7 @@ Here's my plan: 6. [ ] Run column-level profile (cheap — no row scan) 7. [ ] Ask whether to proceed with row-level diff (may be expensive for large tables) 8. [ ] Run targeted row-level diff on diverging columns only -9. [ ] Report findings +9. [ ] Present findings with scope, filters, time period, columns compared/excluded, and assumptions ``` Update each item to `[x]` as you complete it. 
This plan should be visible before any tool is called. @@ -317,6 +317,76 @@ The output lists which columns were auto-excluded and why. --- +## Step 9: Present Findings — Always Surface Context + +When reporting diff results, **never present bare numbers**. Always frame the result with the full context that determines what the numbers actually mean. + +### Required elements in every result summary + +**1. Scope — what was compared** +State exactly which tables/queries were diffed and on which warehouses: +> "Compared `public.orders` on **postgres_prod** vs `public.orders` on **snowflake_dw**" + +**2. Filters and time period applied** +If any `where_clause` or `partition_column` was used, state it explicitly: +> "Scope limited to: `created_at >= '2024-01-01' AND created_at < '2024-04-01'` (Q1 2024 only)" +> "Partitioned by `l_shipdate` (monthly buckets) — diff covers Jan 2023 through Mar 2024" + +If no filter was applied, say so: +> "No row filter applied — full table compared" + +**3. Key columns used** +> "Key: `order_id` (confirmed unique — 150,000 distinct values = 150,000 rows)" + +**4. Columns included and excluded** +List what was compared and what was skipped, and why: +> "Compared columns: `amount`, `status`, `customer_id`" +> "Excluded (auto-timestamp defaults): `created_at`, `updated_at`, `_loaded_at`" +> "Excluded (user request): `internal_score`" + +If the user confirmed exclusions in Step 4, reference that confirmation: +> "Excluded per your confirmation: `created_at`, `updated_at`" + +**5. 
Algorithm used** +> "Algorithm: `hashdiff` (cross-database)" + +### Example full result summary + +``` +## Data Parity Results + +**Compared:** `public.orders` (postgres_prod) → `public.orders` (snowflake_dw) +**Scope:** `created_at >= '2024-01-01'` (Q1 2024 only — 42,301 rows in scope) +**Key:** `order_id` +**Columns compared:** `amount`, `status`, `customer_id`, `region` +**Columns excluded:** `created_at`, `updated_at` (auto-timestamp defaults, per your confirmation) +**Algorithm:** hashdiff + +### Result: ✗ DIFFER + +| Metric | Value | +|--------|-------| +| Source rows | 42,301 | +| Target rows | 42,298 | +| Only in source | 3 | +| Only in target | 0 | +| Updated rows | 47 | +| Identical rows | 42,251 | + +**Findings:** +- 3 rows exist in source but are missing in target → possible ETL delete propagation gap +- 47 rows have value differences in `amount` or `status` → check rounding or status mapping +``` + +### When result is IDENTICAL — still surface the scope + +Even when tables match perfectly, state what was checked: +> "✓ Tables are **identical** across 150,000 rows. Compared `amount`, `status`, `customer_id` (full table, no filter, key=`order_id`). Auto-timestamp columns `created_at`, `updated_at` were excluded." + +**Why this matters:** "Tables are identical" without context is meaningless — the user needs to know if you checked Q1 only, skipped 5 columns, or used a WHERE clause that covered just 1% of the data. + +--- + ## Common Mistakes **Writing manual diff SQL instead of calling data_diff** From 2caf381be55cf606ede37f0cff62b8e6fde594d0 Mon Sep 17 00:00:00 2001 From: suryaiyer95 Date: Thu, 2 Apr 2026 18:09:06 -0700 Subject: [PATCH 17/20] fix: use correct outcome format for empty/fallback partition results The partitioned diff returned `{ Match: { row_count: 0, algorithm: 'partitioned' } }` when no partition values were found or all partitions failed. 
This format lacks `mode: 'diff'`, so `formatOutcome` fell through to raw JSON.stringify instead of producing clean output. Use the standard Rust engine format: `{ mode: 'diff', stats: {...}, diff_rows: [] }` --- .../opencode/src/altimate/native/connections/data-diff.ts | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/packages/opencode/src/altimate/native/connections/data-diff.ts b/packages/opencode/src/altimate/native/connections/data-diff.ts index 2fce479b6a..24a2a16205 100644 --- a/packages/opencode/src/altimate/native/connections/data-diff.ts +++ b/packages/opencode/src/altimate/native/connections/data-diff.ts @@ -646,7 +646,10 @@ async function runPartitionedDiff(params: DataDiffParams): Promise Date: Fri, 3 Apr 2026 10:52:45 -0700 Subject: [PATCH 18/20] =?UTF-8?q?chore:=20remove=20pack-local.ts=20?= =?UTF-8?q?=E2=80=94=20dev-only=20utility,=20not=20part=20of=20the=20featu?= =?UTF-8?q?re?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- packages/opencode/script/pack-local.ts | 134 ------------------------- 1 file changed, 134 deletions(-) delete mode 100644 packages/opencode/script/pack-local.ts diff --git a/packages/opencode/script/pack-local.ts b/packages/opencode/script/pack-local.ts deleted file mode 100644 index 4c2e1c5110..0000000000 --- a/packages/opencode/script/pack-local.ts +++ /dev/null @@ -1,134 +0,0 @@ -#!/usr/bin/env bun -/** - * Mirrors publish.ts exactly — creates all dist packages and packs them as tarballs. - * Stops before `npm publish`. Injects local altimate-core tarballs from /tmp/altimate-local-dist/. 
- * - * Usage: bun run script/pack-local.ts - */ - -import { $ } from "bun" -import fs from "fs" -import path from "path" -import { fileURLToPath } from "url" - -const dir = fileURLToPath(new URL("..", import.meta.url)) -process.chdir(dir) - -import { Script } from "@opencode-ai/script" -import pkg from "../package.json" - -const LOCAL_DIST = "/tmp/altimate-local-dist" -const OUT = "/tmp/altimate-local-dist" - -// ── Discover built binaries ────────────────────────────────────────────────── -const binaries: Record = {} -for (const filepath of new Bun.Glob("**/package.json").scanSync({ cwd: "./dist" })) { - const p = await Bun.file(`./dist/${filepath}`).json() - if (!p.name || !p.version) continue - binaries[p.name] = p.version -} -console.log("Platform binaries:", Object.keys(binaries)) -const version = Object.values(binaries)[0] -const sanitizedVersion = version.replace(/\//g, "-") -console.log("Version:", sanitizedVersion) - -// ── Sanitize platform binary package.json versions ─────────────────────────── -for (const filepath of new Bun.Glob("**/package.json").scanSync({ cwd: "./dist" })) { - const pkgPath = `./dist/${filepath}` - const p = await Bun.file(pkgPath).json() - if (!p.name || !p.version) continue - if (p.version.includes("/")) { - p.version = p.version.replace(/\//g, "-") - await Bun.file(pkgPath).write(JSON.stringify(p, null, 2)) - } -} - -// ── copyAssets helper (mirrors publish.ts) ─────────────────────────────────── -async function copyAssets(targetDir: string) { - await $`mkdir -p ${targetDir}/bin` - await $`cp bin/altimate bin/altimate-code ${targetDir}/bin/` - await $`cp -r ../../.opencode/skills ${targetDir}/skills` - await $`cp ./script/postinstall.mjs ${targetDir}/postinstall.mjs` - await $`mkdir -p ${targetDir}/dbt-tools/bin` - await $`cp ../dbt-tools/bin/altimate-dbt ${targetDir}/dbt-tools/bin/altimate-dbt` - await $`mkdir -p ${targetDir}/dbt-tools/dist` - await $`cp ../dbt-tools/dist/index.js ${targetDir}/dbt-tools/dist/` - await $`cp 
../dbt-tools/dist/node_python_bridge.py ${targetDir}/dbt-tools/dist/` - await Bun.file(`${targetDir}/dbt-tools/package.json`).write(JSON.stringify({ type: "module" }, null, 2) + "\n") - if (fs.existsSync("../dbt-tools/dist/altimate_python_packages")) { - await $`cp -r ../dbt-tools/dist/altimate_python_packages ${targetDir}/dbt-tools/dist/` - } - await Bun.file(`${targetDir}/LICENSE`).write(await Bun.file("../../LICENSE").text()) - await Bun.file(`${targetDir}/CHANGELOG.md`).write(await Bun.file("../../CHANGELOG.md").text()) -} - -// ── Build wrapper package ──────────────────────────────────────────────────── -const wrapperDir = `./dist/${pkg.name}` -await $`mkdir -p ${wrapperDir}` -await copyAssets(wrapperDir) - -// Use local altimate-core tarball path as the dependency -const coreCompanionTgz = `${LOCAL_DIST}/altimateai-altimate-core-darwin-arm64-0.2.6.tgz` -const coreTgz = `${LOCAL_DIST}/altimateai-altimate-core-0.2.6.tgz` - -await Bun.file(`${wrapperDir}/package.json`).write( - JSON.stringify( - { - name: pkg.name, - version: sanitizedVersion, - bin: { - altimate: "./bin/altimate", - "altimate-code": "./bin/altimate-code", - }, - scripts: { - postinstall: "bun ./postinstall.mjs || node ./postinstall.mjs", - }, - license: pkg.license, - dependencies: { - // Reference local tarball so npm install uses our build, not the registry - "@altimateai/altimate-core": `file:${coreTgz}`, - }, - optionalDependencies: Object.fromEntries( - Object.entries(binaries).map(([name, _]) => [name, sanitizedVersion]) - ), - peerDependencies: { - pg: ">=8", "snowflake-sdk": ">=1", "@google-cloud/bigquery": ">=8", - "@databricks/sql": ">=1", mysql2: ">=3", mssql: ">=11", - oracledb: ">=6", duckdb: ">=1", "@clickhouse/client": ">=1", - }, - }, - null, - 2, - ), -) - -// ── Pack all platform binary packages ──────────────────────────────────────── -for (const name of Object.keys(binaries)) { - console.log(`Packing ${name}...`) - await $`chmod -R 755 ./dist/${name}` - await $`npm pack 
--pack-destination ${OUT}`.cwd(`./dist/${name}`) -} - -// ── Pack wrapper package ────────────────────────────────────────────────────── -console.log(`Packing wrapper ${pkg.name}...`) -await $`chmod -R 755 ${wrapperDir}` -await $`npm pack --pack-destination ${OUT}`.cwd(wrapperDir) - -// ── List all output tarballs ────────────────────────────────────────────────── -const tarballs = (await $`ls ${OUT}/*.tgz`.text()).trim().split("\n") -console.log(`\n✓ All tarballs ready in ${OUT}:\n`) -for (const t of tarballs) { - const size = (await $`du -sh ${t}`.text()).split("\t")[0] - console.log(` ${size} ${path.basename(t)}`) -} - -console.log(` -Install and run: - rm -rf /tmp/altimate-test && mkdir /tmp/altimate-test && cd /tmp/altimate-test - npm install \\ - ${OUT}/altimateai-altimate-core-darwin-arm64-0.2.6.tgz \\ - ${OUT}/altimateai-altimate-core-0.2.6.tgz \\ - ${tarballs.find(t => t.includes("darwin-arm64") && !t.includes("altimate-core"))} \\ - ${tarballs.find(t => t.includes("altimate-code-0") || (t.includes("altimate-code-") && !t.includes("darwin")))} - ./node_modules/.bin/altimate-code -`) From e41e5a069a57d18345e23d6c6849ffae8fc95ab0 Mon Sep 17 00:00:00 2001 From: suryaiyer95 Date: Fri, 3 Apr 2026 13:39:46 -0700 Subject: [PATCH 19/20] feat: add data-parity skill to builder prompt with table and SQL query comparison modes --- packages/opencode/src/altimate/prompts/builder.txt | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/packages/opencode/src/altimate/prompts/builder.txt b/packages/opencode/src/altimate/prompts/builder.txt index d4a880869a..4fe16dbf52 100644 --- a/packages/opencode/src/altimate/prompts/builder.txt +++ b/packages/opencode/src/altimate/prompts/builder.txt @@ -153,6 +153,12 @@ Skills are specialized workflows that compose multiple tools. Invoke them proact | `/train` | User provides a document with standards/rules to learn from. | | `/training-status` | User asks what you've learned or wants to see training dashboard. 
| +### Data Validation & Comparison + +| Skill | Invoke When | +|-------|-------------| +| `/data-parity` | User wants to compare two tables, SQL query results, or validate a migration. Uses the `data_diff` tool for row-level and column-level comparison. Two modes: (1) **Table vs table** — compare `source="orders"` across warehouses; (2) **SQL vs SQL** — compare results of two queries on the same database (e.g. `source="SELECT ... FROM orders WHERE ..."` vs `target="SELECT ... FROM orders_v2 WHERE ..."`). Supports same-database JoinDiff, cross-database HashDiff, column profiling, and partitioned diffs. Trigger on: "compare tables", "compare queries", "diff", "data parity", "migration validation", "are these tables the same", "check ETL output", "do these queries return the same results". | + ### Data Visualization | Skill | Invoke When | @@ -173,6 +179,12 @@ Don't wait for `/skill-name` — invoke skills when the task clearly matches: - User says "visualize this data" -> invoke `/data-viz` - User says "make a dashboard" -> invoke `/data-viz` - User says "chart these metrics" -> invoke `/data-viz` +- User says "compare these tables" -> invoke `/data-parity` +- User says "are these tables the same" -> invoke `/data-parity` +- User says "validate my migration" -> invoke `/data-parity` +- User says "diff source and target" -> invoke `/data-parity` +- User says "do these queries return the same thing" -> invoke `/data-parity` +- User says "compare the output of these two queries" -> invoke `/data-parity` ## Teammate Training From b8147c957d4ebffb22cad350f0f7419b2c2bb23f Mon Sep 17 00:00:00 2001 From: suryaiyer95 Date: Fri, 3 Apr 2026 15:58:40 -0700 Subject: [PATCH 20/20] =?UTF-8?q?fix:=20address=20code=20review=20findings?= =?UTF-8?q?=20=E2=80=94=20Oracle=20TRUNC,=20dialect-aware=20quoting,=20que?= =?UTF-8?q?ry+partition=20guard?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Oracle day granularity: 'DDD' (day-of-year) → 'DD' 
(day-of-month) - Add `quoteIdentForDialect()` helper: MySQL/ClickHouse use backticks, TSQL/Fabric use brackets, others use ANSI double-quotes - `buildPartitionDiscoverySQL` and `buildPartitionWhereClause` now use dialect-aware quoting instead of hardcoded double-quotes - `runPartitionedDiff` rejects SQL queries as source/target with a clear error — partitioning requires table names to discover column values --- .../altimate/native/connections/data-diff.ts | 40 ++++++++++++++++--- 1 file changed, 34 insertions(+), 6 deletions(-) diff --git a/packages/opencode/src/altimate/native/connections/data-diff.ts b/packages/opencode/src/altimate/native/connections/data-diff.ts index 24a2a16205..294c43745b 100644 --- a/packages/opencode/src/altimate/native/connections/data-diff.ts +++ b/packages/opencode/src/altimate/native/connections/data-diff.ts @@ -403,6 +403,24 @@ const MAX_STEPS = 200 // Partition support // --------------------------------------------------------------------------- +/** + * Quote a SQL identifier using the correct delimiter for the dialect. + */ +function quoteIdentForDialect(identifier: string, dialect: string): string { + switch (dialect) { + case "mysql": + case "mariadb": + case "clickhouse": + return `\`${identifier.replace(/`/g, "``")}\`` + case "tsql": + case "fabric": + return `[${identifier.replace(/\]/g, "]]")}]` + default: + // ANSI SQL: Postgres, Snowflake, BigQuery, DuckDB, Oracle, Redshift, etc. + return `"${identifier.replace(/"/g, '""')}"` + } +} + /** * Build a DATE_TRUNC expression appropriate for the warehouse dialect. */ @@ -421,7 +439,7 @@ function dateTruncExpr(granularity: string, column: string, dialect: string): st case "oracle": { // Oracle uses TRUNC() with format models — 'WEEK' is invalid, use 'IW' for ISO week const oracleFmt: Record = { - day: "DDD", + day: "DD", week: "IW", month: "MM", year: "YYYY", @@ -465,15 +483,16 @@ function buildPartitionDiscoverySQL( ): string { const where = whereClause ? 
`WHERE ${whereClause}` : "" const mode = partitionMode(granularity, bucketSize) + const quotedCol = quoteIdentForDialect(partitionColumn, dialect) let expr: string if (mode === "numeric") { - expr = `FLOOR(${partitionColumn} / ${bucketSize}) * ${bucketSize}` + expr = `FLOOR(${quotedCol} / ${bucketSize}) * ${bucketSize}` } else if (mode === "date") { - expr = dateTruncExpr(granularity!, partitionColumn, dialect) + expr = dateTruncExpr(granularity!, quotedCol, dialect) } else { // categorical — raw distinct values, no transformation - expr = partitionColumn + expr = quotedCol } return `SELECT DISTINCT ${expr} AS _p FROM ${table} ${where} ORDER BY _p` @@ -490,8 +509,8 @@ function buildPartitionWhereClause( dialect: string, ): string { const mode = partitionMode(granularity, bucketSize) - // Quote the column identifier to handle special characters and reserved words - const quotedCol = `"${partitionColumn.replace(/"/g, '""')}"` + // Quote the column identifier using dialect-appropriate delimiters + const quotedCol = quoteIdentForDialect(partitionColumn, dialect) if (mode === "numeric") { const lo = Number(partitionValue) @@ -592,6 +611,15 @@ function mergeOutcomes(accumulated: unknown, next: unknown): unknown { * then aggregate results. */ async function runPartitionedDiff(params: DataDiffParams): Promise { + // Partitioned diff requires table names — can't partition a SQL query by column + if (isQuery(params.source) || isQuery(params.target)) { + return { + success: false, + error: "partition_column cannot be used when source or target is a SQL query. Use table names instead, or remove partition_column.", + steps: 0, + } + } + const resolveDialect = (warehouse: string | undefined): string => { if (warehouse) { const cfg = Registry.getConfig(warehouse)