Commit 3bc1b43

feat: modular domain-specific system prompts with multi-signal fingerprint detection
Compose agent prompts from environment-specific domain modules instead of the monolithic `builder.txt`/`analyst.txt`. When `experimental.modular_prompts` is enabled, the system prompt includes only the instructions relevant to the detected databases and tools.

Fingerprint expansion (4 signal sources):
- File detection: `dbt_project.yml`, `profiles.yml`, `.sqlfluff`, etc.
- Connection registry: configured warehouse types
- Global dbt profiles: adapter types from `~/.dbt/profiles.yml`
- Environment variables: `PGHOST`, `SNOWFLAKE_ACCOUNT`, `MONGODB_URI`, etc.

Domain prompt modules:
- `dbt.txt` / `dbt-analyst.txt` — dbt workflows (agent-specific variants)
- `sql.txt` / `sql-analyst.txt` — SQL pre-execution protocol (agent-specific)
- `snowflake.txt` — Snowflake FinOps and governance tools
- `mongodb.txt` — MongoDB MQL operations, BSON types, aggregation patterns
- `training.txt` — teammate training (always included)
- `builder-base.txt` / `analyst-base.txt` — universal agent identity

Composition and safety:
- `compose.ts` selects domain modules by agent type and detected tags
- `tags.ts` handles normalization (`postgresql` -> `postgres`) and implication expansion (`dbt` -> `sql`; MongoDB does NOT imply `sql`)
- MQL write-command classification in `sql-classify.ts` prevents MongoDB mutation commands from bypassing analyst read-only restrictions
- Returns `undefined` when disabled, preserving user-defined custom prompts
- Cached at session start — no recomputation per step
- Config override: `experimental.domains` replaces auto-detection
- Fallback: no tags detected -> `sql` + `dbt` (current behavior)

Testing: 34 tests (19 tag utilities + 15 composition/integration)

RFC: `wiki/modular-system-prompts.md`
1 parent 99270e5 commit 3bc1b43

17 files changed

Lines changed: 1649 additions & 7 deletions
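Both the feature flag and the override hang off the `experimental` config block. As a minimal sketch, enabling the feature and pinning domains might look like the following — the exact config schema is an assumption inferred from the `cfg.experimental?.modular_prompts` and `config.experimental?.domains` reads in `compose.ts` below, not taken from this commit:

```ts
// Hypothetical config fragment — shape inferred from compose.ts, not from this diff.
const config = {
  experimental: {
    modular_prompts: true,          // default off; composeAgentPrompt returns undefined when unset
    domains: ["snowflake", "dbt"],  // optional: replaces fingerprint auto-detection entirely
  },
}
```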

packages/opencode/src/altimate/fingerprint/index.ts

Lines changed: 84 additions & 5 deletions
```diff
@@ -2,7 +2,9 @@ import { Filesystem } from "../../util/filesystem"
 import { Glob } from "../../util/glob"
 import { Log } from "../../util/log"
 import { Tracer } from "../observability/tracing"
+import { normalizeTag } from "../prompts/tags"
 import path from "path"
+import os from "os"

 const log = Log.create({ service: "fingerprint" })

@@ -39,12 +41,15 @@ export namespace Fingerprint {

     const dirs = root && root !== cwd ? [cwd, root] : [cwd]

-    await Promise.all(
-      dirs.map((dir) => detectDir(dir, tags)),
-    )
+    await Promise.all([
+      ...dirs.map((dir) => detectDir(dir, tags)),
+      detectConnections(tags),
+      detectDbtProfiles(tags),
+      detectEnvVars(tags),
+    ])

-    // Deduplicate
-    const unique = [...new Set(tags)]
+    // Deduplicate and normalize
+    const unique = [...new Set(tags.map(normalizeTag))]

     const result: Result = {
       tags: unique,
@@ -139,4 +144,78 @@ export namespace Fingerprint {
       tags.push("databricks")
     }
   }
+
+  /** Signal 2: Detect warehouse types from the connection registry. */
+  async function detectConnections(tags: string[]): Promise<void> {
+    try {
+      const { list } = await import("../native/connections/registry")
+      const { warehouses } = list()
+      for (const w of warehouses) {
+        const t = w.type?.toLowerCase()
+        if (t) tags.push(t)
+      }
+    } catch (e) {
+      log.debug("connection registry not available for fingerprint", { error: e })
+    }
+  }
+
+  /**
+   * Signal 3: Detect warehouse adapter types from ~/.dbt/profiles.yml.
+   * Only infers adapter types (snowflake, postgres, etc.), NOT the "dbt" tag.
+   * The "dbt" tag is only added by detectDir when dbt_project.yml exists
+   * in the project directory — global profiles are machine-wide, not project evidence.
+   */
+  async function detectDbtProfiles(tags: string[]): Promise<void> {
+    try {
+      const profilesPath = path.join(os.homedir(), ".dbt", "profiles.yml")
+      const exists = await Filesystem.exists(profilesPath)
+      if (!exists) return
+
+      const { parseDbtProfiles } = await import("../native/connections/dbt-profiles")
+      const connections = await parseDbtProfiles(profilesPath)
+      for (const conn of connections) {
+        if (conn.type) {
+          tags.push(conn.type.toLowerCase())
+        }
+      }
+    } catch (e) {
+      log.debug("dbt profiles detection failed", { error: e })
+    }
+  }
+
+  /** Signal 4: Detect warehouse types from well-known environment variables. */
+  async function detectEnvVars(tags: string[]): Promise<void> {
+    const checks: [string[], string][] = [
+      [["SNOWFLAKE_ACCOUNT"], "snowflake"],
+      [["PGHOST", "PGDATABASE"], "postgres"],
+      [["DATABRICKS_HOST", "DATABRICKS_SERVER_HOSTNAME"], "databricks"],
+      [["BIGQUERY_PROJECT", "GCP_PROJECT"], "bigquery"],
+      [["MYSQL_HOST", "MYSQL_DATABASE"], "mysql"],
+      [["ORACLE_HOST", "ORACLE_SID"], "oracle"],
+      [["MONGODB_URI", "MONGO_URI"], "mongodb"],
+      [["REDSHIFT_HOST"], "redshift"],
+      [["MSSQL_HOST", "SQLSERVER_HOST"], "sqlserver"],
+    ]
+    for (const [vars, tag] of checks) {
+      if (vars.some((v) => process.env[v])) {
+        tags.push(tag)
+      }
+    }
+
+    // DATABASE_URL scheme parsing
+    const dbUrl = process.env.DATABASE_URL
+    if (dbUrl) {
+      const scheme = dbUrl.split("://")[0]?.toLowerCase()
+      const schemeMap: Record<string, string> = {
+        postgres: "postgres",
+        postgresql: "postgres",
+        mysql: "mysql",
+        mongodb: "mongodb",
+        "mongodb+srv": "mongodb",
+      }
+      if (scheme && schemeMap[scheme]) {
+        tags.push(schemeMap[scheme])
+      }
+    }
+  }
 }
```
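As a quick illustration of the env-var signal, here is a hypothetical test of the `DATABASE_URL` scheme mapping — not one of the commit's actual 34 tests, and it inlines the map because `detectEnvVars` is module-private above (assumes Bun's test runner):

```ts
import { expect, test } from "bun:test"

// Hypothetical sketch: the scheme map is copied from detectEnvVars for illustration.
const schemeMap: Record<string, string> = {
  postgres: "postgres",
  postgresql: "postgres",
  mysql: "mysql",
  mongodb: "mongodb",
  "mongodb+srv": "mongodb",
}

test("mongodb+srv:// URLs resolve to the mongodb tag", () => {
  const url = "mongodb+srv://user:pass@cluster0.example.net/db"
  const scheme = url.split("://")[0]?.toLowerCase()
  expect(scheme && schemeMap[scheme]).toBe("mongodb")
})
```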
packages/opencode/src/altimate/prompts/analyst-base.txt

Lines changed: 42 additions & 0 deletions

```
You are altimate-code in analyst mode — a read-only data exploration agent.

You CANNOT modify any files or execute destructive SQL. You can only:
- Execute SELECT queries (enforced by AltimateCore read-only mode) via `sql_execute`
- Validate and lint SQL via `altimate_core_validate`
- Analyze SQL for anti-patterns and optimization opportunities via `sql_analyze`
- Inspect database schemas via `schema_inspect`
- Check column-level lineage via `lineage_check`
- List and test warehouse connections via `warehouse_list` and `warehouse_test`
- Read files, search with grep/glob

When exploring data:
- Start with `warehouse_list` to see available connections
- Use `schema_inspect` to understand table structures
- Use `sql_analyze` to check queries for anti-patterns before executing
- Use LIMIT clauses to avoid large result sets
- Run `lineage_check` to trace column-level data flow through transformations
- Explain your findings clearly with context

You are designed for safe, read-only data exploration. Any attempt to modify data or files will be blocked by your permission rules.

## Cost-Conscious Exploration Protocol

You're the analyst's research assistant. Research should be fast AND cheap.

**For every exploratory query:**

1. **Start small**: Use LIMIT for initial exploration. Don't scan entire tables to answer "what does this column look like?" — LIMIT 100 is enough.

2. **Optimize iteratively**: If a query has anti-patterns, suggest the optimized version:
   - Replace SELECT * with explicit columns
   - Add partition/cluster key filters
   - Use approximate functions (APPROX_COUNT_DISTINCT vs COUNT(DISTINCT))
   - Sample instead of full scan for exploratory analysis

3. **Track session cost**: Keep a mental running total of estimated cost for the session. If cumulative cost exceeds what feels reasonable for exploration, mention it: "We've scanned ~X GB across these queries. Want me to find more efficient approaches?"

Remember: your users are hired to generate insights, not warehouse bills. Every unnecessary byte scanned is money their company doesn't need to spend. Vendor-built tools have no incentive to minimize warehouse spend. You do — that's your competitive advantage.

## Skills Available (read-only — these produce analysis, not file changes)
- /data-viz — Build interactive data visualizations, dashboards, charts, and analytics views from query results

Note: Skills that write files require the builder agent.
```
packages/opencode/src/altimate/prompts/builder-base.txt

Lines changed: 62 additions & 0 deletions

```
You are altimate-code in builder mode — a data engineering agent.

## Principles

1. **Understand before writing** — Read existing code, schemas, and actual data before writing any SQL. Never write blind.
2. **Follow conventions** — Match the project's naming patterns, layer structure, and style. Read 2-3 similar files first.
3. **Validate the output** — A task isn't done until the output data looks right. Check row counts, sample values, and column names.
4. **Fix everything** — After finishing your changes, run a full project build if applicable. If ANY model or query fails — even ones you didn't touch — fix it. Leave the project fully green.

You have full read/write access to the project. You can:
- Create and modify data models, SQL files, and YAML configs
- Execute queries against connected warehouses via `sql_execute`
- Validate SQL syntax and schema references via `altimate_core_validate`
- Analyze SQL for anti-patterns and performance issues via `sql_analyze`
- Inspect database schemas via `schema_inspect`
- Search schemas by natural language via `schema_search`
- Check column-level lineage via `lineage_check` or `dbt_lineage`
- Auto-fix SQL errors via `altimate_core_fix` (schema-based) or `sql_fix` (error-driven)
- List and test warehouse connections via `warehouse_list` and `warehouse_test`
- Use all standard file tools (read, write, edit, bash, grep, glob)

When unsure about a tool's parameters, call `tool_lookup` with the tool name.

## Workflow

1. **Explore**: Read existing models, schemas, and sample data before writing anything.
2. **Write**: Create models following project conventions. Validate each piece of work.
3. **Verify**: Check row counts and sample data. Work isn't done until the output data looks right.

## Self-Review Before Completion

Before declaring any task complete, review your own work:

1. **Re-read what you wrote**: Read back the SQL/model/config you created or modified. Check for:
   - Hardcoded values that should be parameters
   - Missing edge cases (NULLs, empty strings, zero-division)
   - Naming convention violations (check the project's existing patterns)
   - Unnecessary complexity (could a CTE be a subquery? could a join be avoided?)

2. **Validate the output**: Run `altimate_core_validate` and `sql_analyze` on any SQL you wrote.

3. **Check lineage impact**: If you modified a model, run `lineage_check` to verify you didn't break downstream dependencies.

Only after self-review passes should you present the result to the user.

## Skills — When to Invoke

Skills are specialized workflows that compose multiple tools. Invoke them proactively when the task matches — don't wait for the user to ask.

### Learning Skills

| Skill | Invoke When |
|-------|-------------|
| `/teach` | User shows an example file and says "learn this pattern" or "do it like this". |
| `/train` | User provides a document with standards/rules to learn from. |
| `/training-status` | User asks what you've learned or wants to see training dashboard. |

### Data Visualization

| Skill | Invoke When |
|-------|-------------|
| `/data-viz` | User wants to visualize data, build dashboards, create charts, plot graphs, tell a data story, or build analytics views. Trigger on: "visualize", "dashboard", "chart", "plot", "KPI cards", "data story", "show me the data". |
```
packages/opencode/src/altimate/prompts/compose.ts

Lines changed: 126 additions & 0 deletions

```ts
/**
 * Domain prompt composition — selects domain-specific prompt modules
 * based on environment fingerprint tags.
 *
 * When `experimental.modular_prompts` is enabled, the agent prompt is
 * composed from a thin base + domain modules instead of the monolithic
 * builder.txt / analyst.txt.
 */

import { Fingerprint } from "../fingerprint"
import { Config } from "../../config/config"
import { Log } from "../../util/log"
import { Tracer } from "../observability/tracing"
import { normalizeTag, expandTags } from "./tags"

import PROMPT_BUILDER_BASE from "./builder-base.txt"
import PROMPT_ANALYST_BASE from "./analyst-base.txt"

import DOMAIN_DBT from "./domain/dbt.txt"
import DOMAIN_DBT_ANALYST from "./domain/dbt-analyst.txt"
import DOMAIN_SQL from "./domain/sql.txt"
import DOMAIN_SQL_ANALYST from "./domain/sql-analyst.txt"
import DOMAIN_SNOWFLAKE from "./domain/snowflake.txt"
import DOMAIN_MONGODB from "./domain/mongodb.txt"
import DOMAIN_TRAINING from "./domain/training.txt"

const log = Log.create({ service: "domain-prompts" })

/** Explicit domain ordering — do not rely on Object.keys() insertion order. */
const DOMAIN_ORDER = ["dbt", "sql", "snowflake", "mongodb"] as const

/** Map from fingerprint tag to domain prompt content, keyed by agent type. */
const TAG_TO_DOMAIN: Record<string, { builder: string; analyst: string }> = {
  dbt: { builder: DOMAIN_DBT, analyst: DOMAIN_DBT_ANALYST },
  sql: { builder: DOMAIN_SQL, analyst: DOMAIN_SQL_ANALYST },
  snowflake: { builder: DOMAIN_SNOWFLAKE, analyst: DOMAIN_SNOWFLAKE },
  mongodb: { builder: DOMAIN_MONGODB, analyst: DOMAIN_MONGODB },
}

/** Resolve the final tag set from fingerprint + config override. */
export async function resolveTags(cfg?: { experimental?: { domains?: string[] } }): Promise<string[]> {
  const config = cfg ?? await Config.get()

  // Signal 6: User config override — replaces auto-detection entirely
  const configDomains = config.experimental?.domains
  if (configDomains && configDomains.length > 0) {
    return expandTags(configDomains.map(normalizeTag))
  }

  // Auto-detection from fingerprint (signals 1-4 are collected there)
  // Tags are already normalized at fingerprint detection time — no re-normalization needed
  const fp = Fingerprint.get()
  return expandTags(fp?.tags ?? [])
}

/**
 * Compose the full agent prompt for a given agent type.
 *
 * When `experimental.modular_prompts` is enabled:
 *   base prompt + agent-specific domain modules + training
 *
 * When disabled (default):
 *   returns undefined — the caller preserves the existing agent prompt
 */
export async function composeAgentPrompt(agentName: string): Promise<string | undefined> {
  const cfg = await Config.get()

  // Feature flag — default off. Return undefined to preserve existing agent prompt.
  if (!cfg.experimental?.modular_prompts) {
    return undefined
  }

  const startTime = Date.now()
  const tags = await resolveTags(cfg)

  // Select base prompt
  const base = agentName === "analyst" ? PROMPT_ANALYST_BASE : PROMPT_BUILDER_BASE
  const agentKey = agentName === "analyst" ? "analyst" : "builder"

  // Collect matching domain prompts (deduplicated, explicit stable order)
  const seen = new Set<string>()
  const domains: string[] = []

  for (const key of DOMAIN_ORDER) {
    if (tags.includes(key) && !seen.has(key)) {
      domains.push(TAG_TO_DOMAIN[key][agentKey])
      seen.add(key)
    }
  }

  // Fallback: if no domains matched, include sql + dbt (preserves current behavior)
  let fallbackUsed = false
  if (domains.length === 0) {
    domains.push(TAG_TO_DOMAIN["sql"][agentKey], TAG_TO_DOMAIN["dbt"][agentKey])
    seen.add("sql")
    seen.add("dbt")
    fallbackUsed = true
  }

  // Always include training
  domains.push(DOMAIN_TRAINING)
  seen.add("training")

  const result = [base, ...domains].join("\n\n")

  log.info("composed", {
    agent: agentName,
    tags: tags.join(","),
    domains: [...seen].join(","),
    fallback: fallbackUsed,
  })

  Tracer.active?.logSpan({
    name: "domain-prompt-composition",
    startTime,
    endTime: Date.now(),
    input: { agent: agentName, detectedTags: tags },
    output: {
      domainsIncluded: [...seen],
      fallbackUsed,
      totalChars: result.length,
    },
  })

  return result
}
```
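`tags.ts`, imported at the top of `compose.ts`, is among the changed files not shown in this excerpt. Going only by the commit message — alias normalization such as `postgresql` -> `postgres`, plus implication expansion where `dbt` implies `sql` but `mongodb` deliberately does not — a minimal sketch could look like this (any alias or implication beyond those two named rules is an assumption):

```ts
// Sketch of prompts/tags.ts — reconstructed from the commit message, not the actual diff.

// Alias normalization: collapse vendor spellings onto canonical tags.
const ALIASES: Record<string, string> = {
  postgresql: "postgres", // named in the commit message
  mongo: "mongodb",       // assumption — plausible alias, not confirmed
}

export function normalizeTag(tag: string): string {
  const t = tag.toLowerCase().trim()
  return ALIASES[t] ?? t
}

// Implication expansion: dbt work implies the SQL module is relevant.
// MongoDB intentionally does NOT imply "sql" — MQL is not SQL.
const IMPLIES: Record<string, string[]> = {
  dbt: ["sql"],
  snowflake: ["sql"], // assumption — SQL warehouses plausibly imply sql
  postgres: ["sql"],  // assumption
}

export function expandTags(tags: string[]): string[] {
  const out = new Set(tags)
  for (const tag of tags) {
    for (const implied of IMPLIES[tag] ?? []) out.add(implied)
  }
  return [...out]
}
```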
packages/opencode/src/altimate/prompts/domain/dbt-analyst.txt

Lines changed: 25 additions & 0 deletions

````
## dbt Context (Read-Only)

This project uses dbt (data build tool). You can explore dbt models and understand the data pipeline, but you CANNOT build, modify, or create models. Use the builder agent for write operations.

### Querying dbt Data

Use `altimate-dbt execute` to query the database:
```
altimate-dbt execute --query "SELECT * FROM ..." --limit 100
altimate-dbt columns --model <name>  # Inspect model columns
altimate-dbt info                    # Project metadata
```

### Understanding the Project

- Staging models live in `staging/`, intermediate in `intermediate/`, marts in `marts/`
- Check `schema.yml` files for column descriptions and test definitions
- Run `lineage_check` to trace column-level data flow through transformations
- Use `/dbt-analyze` to understand downstream impact of changes

### dbt Analysis Skills (read-only)

| Skill | Invoke When |
|-------|-------------|
| `/dbt-analyze` | User wants to understand impact — downstream consumers, breaking changes, blast radius. Uses `dbt_lineage` for column-level analysis. |
````
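
The commit message also notes MQL write-command classification in `sql-classify.ts`, which is not part of this excerpt. A rough sketch of the idea — the function name and the exact command list are assumptions, not the commit's actual code:

```ts
// Sketch only — reconstructed from the commit message, not from this diff.
// MQL commands that mutate data or schema are classified as writes so the
// analyst agent's read-only enforcement covers MongoDB as well as SQL.
const MQL_WRITE_COMMANDS = new Set([
  "insert", "insertone", "insertmany",
  "update", "updateone", "updatemany", "replaceone",
  "delete", "deleteone", "deletemany", "remove",
  "findandmodify", "findoneandupdate", "findoneandreplace", "findoneanddelete",
  "drop", "dropdatabase", "createindex", "dropindex", "renamecollection",
])

export function isMqlWrite(operation: string): boolean {
  return MQL_WRITE_COMMANDS.has(operation.toLowerCase())
}
```

The point of a dedicated classifier: a SQL-keyword check would treat something like `db.users.deleteMany(...)` as a read, since it contains no SQL write keyword; recognizing MQL mutation commands explicitly closes that gap.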
