diff --git a/SKILL.md b/SKILL.md index 7838996b7..07a500f18 100644 --- a/SKILL.md +++ b/SKILL.md @@ -90,6 +90,21 @@ fi echo "VENDORED_GSTACK: $_VENDORED" # Detect spawned session (OpenClaw or other orchestrator) [ -n "$OPENCLAW_SESSION" ] && echo "SPAWNED_SESSION: true" || true +# Multi-LLM orchestration (llm-cli-gateway) +_LLM_GW="unavailable" +_LLM_GW_CLAUDE="no" +_LLM_GW_CODEX="no" +_LLM_GW_GEMINI="no" +if command -v llm-cli-gateway >/dev/null 2>&1; then + _LLM_GW="available" + command -v claude >/dev/null 2>&1 && _LLM_GW_CLAUDE="yes" + command -v codex >/dev/null 2>&1 && _LLM_GW_CODEX="yes" + command -v gemini >/dev/null 2>&1 && _LLM_GW_GEMINI="yes" +fi +echo "LLM_GATEWAY: $_LLM_GW" +[ "$_LLM_GW" = "available" ] && echo "LLM_GW_CLAUDE: $_LLM_GW_CLAUDE" +[ "$_LLM_GW" = "available" ] && echo "LLM_GW_CODEX: $_LLM_GW_CODEX" +[ "$_LLM_GW" = "available" ] && echo "LLM_GW_GEMINI: $_LLM_GW_GEMINI" ``` If `PROACTIVE` is `"false"`, do not proactively suggest gstack skills AND do not diff --git a/contrib/add-tool/README.md b/contrib/add-tool/README.md new file mode 100644 index 000000000..6ef733238 --- /dev/null +++ b/contrib/add-tool/README.md @@ -0,0 +1,44 @@ +# Adding an External Tool to gstack + +This directory contains integrations for external development tools that +enhance gstack's workflow skills with specialized capabilities. + +## Structure + +Each tool integration lives in its own directory: + + contrib/add-tool/<tool-name>/ + ├── README.md # What the tool does and how the integration works + ├── tools.json # Routing contract: which gstack skills use which tools + ├── detection.sh # Bash fragment appended to preamble for detection + ├── install.sh # Idempotent install script + └── uninstall.sh # Clean removal script + +## How it works + +1. **Detection**: A bash block in the preamble checks if the tool binary + exists and outputs status variables (available/unavailable, version, etc.) + +2. 
**Resolver**: A TypeScript resolver reads `tools.json` and generates + conditional markdown blocks for each skill template. The block is skipped + entirely when the tool is not detected. + +3. **Template**: Skills that benefit from the tool include `{{TOOL_CONTEXT}}` + in their SKILL.md.tmpl, placed after `{{LLM_GATEWAY_CONTEXT}}` where present; + otherwise after `{{LEARNINGS_SEARCH}}`. + +## Requirements for a tool integration + +- Tool MUST be optional — gstack works without it +- Detection MUST be fast (< 50ms) — it runs on every skill invocation +- Resolver output MUST be concise — avoid prompt bloat +- Install script MUST be idempotent +- Uninstall script MUST leave gstack in a clean state +- tools.json MUST include min_version for compatibility gating + +## Existing integrations + +- [llm-gateway](llm-gateway/) — Multi-LLM orchestration via MCP (Gemini + Codex + Claude, + async parallel reviews, session continuity) +- [sqry](sqry/) — AST-based semantic code search via MCP (callers/callees tracing, + cycle detection, complexity metrics, structural call-path tracing) diff --git a/contrib/add-tool/llm-gateway/README.md b/contrib/add-tool/llm-gateway/README.md new file mode 100644 index 000000000..bcbc8f933 --- /dev/null +++ b/contrib/add-tool/llm-gateway/README.md @@ -0,0 +1,41 @@ +# llm-cli-gateway Integration for gstack + +[llm-cli-gateway](https://github.com/verivus-oss/llm-cli-gateway) provides +unified multi-LLM orchestration via 23 MCP tools. This integration adds Gemini +as a third review voice, async parallel orchestration, and session continuity +to gstack skills. + +## Install + + bash contrib/add-tool/llm-gateway/install.sh [claude|codex|all] + +## What it does + +When llm-cli-gateway is installed and configured as an MCP server, gstack +skills gain a "Multi-LLM Orchestration" section with contextual tool +recommendations. 
For example: + +- `/review` gets async parallel Gemini + Codex reviews with cross-model synthesis +- `/investigate` gets Gemini hypothesis validation alongside Codex second opinion +- `/plan-eng-review` gets multi-model architecture feedback +- `/ship` gets parallel pre-ship reviews from all available models + +See `tools.json` for the complete routing table. + +## Relationship to existing multi-LLM + +gstack already invokes Codex via shell subprocess (`codex exec`). This +integration does NOT replace that — it adds complementary capabilities: + +| Existing | Gateway adds | +|----------|-------------| +| Codex via `codex exec` (Bash) | Codex via `mcp__llm-cli-gw__codex_request` (MCP, structured) | +| Claude subagent (Agent tool) | Gemini as third voice (new) | +| Sequential blocking calls | Async parallel orchestration (new) | +| Stateless invocations | Session continuity (new) | + +## Uninstall + + bash contrib/add-tool/llm-gateway/uninstall.sh + +This removes the gstack integration. llm-cli-gateway itself remains installed. 
diff --git a/contrib/add-tool/llm-gateway/detection.sh b/contrib/add-tool/llm-gateway/detection.sh new file mode 100644 index 000000000..e2d0ea43c --- /dev/null +++ b/contrib/add-tool/llm-gateway/detection.sh @@ -0,0 +1,16 @@ +# Multi-LLM orchestration (llm-cli-gateway) +# Reference fragment — inlined by preamble.ts resolver +_LLM_GW="unavailable" +_LLM_GW_CLAUDE="no" +_LLM_GW_CODEX="no" +_LLM_GW_GEMINI="no" +if command -v llm-cli-gateway >/dev/null 2>&1; then + _LLM_GW="available" + command -v claude >/dev/null 2>&1 && _LLM_GW_CLAUDE="yes" + command -v codex >/dev/null 2>&1 && _LLM_GW_CODEX="yes" + command -v gemini >/dev/null 2>&1 && _LLM_GW_GEMINI="yes" +fi +echo "LLM_GATEWAY: $_LLM_GW" +[ "$_LLM_GW" = "available" ] && echo "LLM_GW_CLAUDE: $_LLM_GW_CLAUDE" +[ "$_LLM_GW" = "available" ] && echo "LLM_GW_CODEX: $_LLM_GW_CODEX" +[ "$_LLM_GW" = "available" ] && echo "LLM_GW_GEMINI: $_LLM_GW_GEMINI" diff --git a/contrib/add-tool/llm-gateway/install.sh b/contrib/add-tool/llm-gateway/install.sh new file mode 100755 index 000000000..0490c05e3 --- /dev/null +++ b/contrib/add-tool/llm-gateway/install.sh @@ -0,0 +1,124 @@ +#!/usr/bin/env bash +# Install llm-cli-gateway as a gstack multi-LLM orchestration add-in. +# Idempotent — safe to run multiple times. +set -e + +AGENT="${1:-claude}" +MIN_VERSION="1.1.0" + +echo "=== llm-cli-gateway integration for gstack ===" +echo "" + +# 1. Check for llm-cli-gateway +if ! command -v llm-cli-gateway >/dev/null 2>&1; then + echo "llm-cli-gateway not found on PATH." + echo "" + echo "Install via npm:" + echo " npm install -g llm-cli-gateway" + echo "" + echo "Or clone and build:" + echo " git clone https://github.com/verivus-oss/llm-cli-gateway.git" + echo " cd llm-cli-gateway && npm install && npm run build && npm link" + echo "" + echo "Then re-run this script." + exit 1 +fi + +# 2. 
Check version +GW_VERSION=$(llm-cli-gateway --version 2>/dev/null || echo "0.0.0") +echo "Found llm-cli-gateway $GW_VERSION" + +version_lt() { + # Portable semver comparison (no sort -V, works on macOS + Linux) + local IFS=. + local i a=($1) b=($2) + for ((i=0; i<3; i++)); do + local ai=${a[i]:-0} bi=${b[i]:-0} + if [ "$ai" -lt "$bi" ] 2>/dev/null; then return 0; fi + if [ "$ai" -gt "$bi" ] 2>/dev/null; then return 1; fi + done + return 1 # equal +} + +if version_lt "$GW_VERSION" "$MIN_VERSION"; then + echo "llm-cli-gateway $MIN_VERSION+ required. Please upgrade:" + echo " npm install -g llm-cli-gateway@latest" + exit 1 +fi + +# 3. Report CLI availability +echo "" +echo "CLI availability:" +command -v claude >/dev/null 2>&1 && echo " claude: yes" || echo " claude: no (optional — install for Claude orchestration)" +command -v codex >/dev/null 2>&1 && echo " codex: yes" || echo " codex: no (optional — install for Codex orchestration)" +command -v gemini >/dev/null 2>&1 && echo " gemini: yes" || echo " gemini: no (optional — install for Gemini orchestration)" + +# 4. Configure MCP for the target agent +echo "" +echo "Configuring MCP server for $AGENT..." + +configure_claude() { + local settings="$HOME/.claude/settings.json" + if [ ! -f "$settings" ]; then + mkdir -p "$HOME/.claude" + echo '{}' > "$settings" + fi + # Add llm-cli-gw MCP server if not present + node -e " + const fs = require('fs'); + const s = JSON.parse(fs.readFileSync('$settings', 'utf-8')); + if (!s.mcpServers) s.mcpServers = {}; + if (!s.mcpServers['llm-cli-gw']) { + s.mcpServers['llm-cli-gw'] = { + command: 'llm-cli-gateway', + args: [], + env: {} + }; + fs.writeFileSync('$settings', JSON.stringify(s, null, 2)); + console.log('Added llm-cli-gw MCP server to ' + '$settings'); + } else { + console.log('llm-cli-gw MCP server already configured in ' + '$settings'); + } + " +} + +configure_codex() { + local config="$HOME/.codex/config.toml" + if [ ! 
-f "$config" ]; then + mkdir -p "$HOME/.codex" + echo "" > "$config" + fi + if ! grep -q 'llm-cli-gw' "$config" 2>/dev/null; then + cat >> "$config" << 'TOML' + +[[mcp_servers]] +name = "llm-cli-gw" +command = "llm-cli-gateway" +args = [] +TOML + echo "Added llm-cli-gw MCP server to $config" + else + echo "llm-cli-gw MCP server already configured in $config" + fi +} + +case "$AGENT" in + claude) configure_claude ;; + codex) configure_codex ;; + all) configure_claude; configure_codex ;; + *) echo "Warning: Auto-configuration not supported for $AGENT. Configure MCP manually." ;; +esac + +# 5. Regenerate gstack skills (picks up {{LLM_GATEWAY_CONTEXT}} resolver) +GSTACK_DIR="${GSTACK_ROOT:-$HOME/.claude/skills/gstack}" +if [ -f "$GSTACK_DIR/package.json" ]; then + echo "" + echo "Regenerating gstack skill docs..." + (cd "$GSTACK_DIR" && bun run gen:skill-docs --host all 2>/dev/null) || { + echo "Warning: Could not regenerate skill docs. Run manually:" + echo " cd $GSTACK_DIR && bun run gen:skill-docs --host all" + } +fi + +echo "" +echo "Done. llm-cli-gateway multi-LLM orchestration is now available in gstack skills." 
diff --git a/contrib/add-tool/llm-gateway/tools.json b/contrib/add-tool/llm-gateway/tools.json new file mode 100644 index 000000000..0d1dde738 --- /dev/null +++ b/contrib/add-tool/llm-gateway/tools.json @@ -0,0 +1,66 @@ +{ + "tool": "llm-cli-gateway", + "homepage": "https://github.com/verivus-oss/llm-cli-gateway", + "mcp_server_name": "llm-cli-gw", + "detection": { + "binary": "llm-cli-gateway", + "min_version": "1.1.0" + }, + "integrations": { + "review": { + "phase": "multi-llm-review", + "context": "multi-LLM code review", + "tools": [ + { "tool": "gemini_request_async", "when": "dispatch Gemini review in parallel with Codex", "requires_cli": "gemini" }, + { "tool": "codex_request_async", "when": "dispatch Codex review in parallel with Gemini", "requires_cli": "codex" }, + { "tool": "llm_job_status", "when": "poll async job completion" }, + { "tool": "llm_job_result", "when": "collect finished review results" }, + { "tool": "session_create", "when": "establish review session for follow-up clarification" } + ] + }, + "investigate": { + "phase": "hypothesis-validation", + "context": "cross-model root cause validation", + "tools": [ + { "tool": "gemini_request", "when": "validate root cause hypothesis with a fresh perspective", "requires_cli": "gemini" }, + { "tool": "codex_request", "when": "get Codex second opinion on the suspected root cause", "requires_cli": "codex" }, + { "tool": "session_create", "when": "establish investigation session for iterative hypothesis testing" } + ] + }, + "plan-eng-review": { + "phase": "architecture-review", + "context": "multi-LLM architecture review", + "tools": [ + { "tool": "gemini_request", "when": "get Gemini perspective on architecture decisions and trade-offs", "requires_cli": "gemini" }, + { "tool": "codex_request", "when": "get Codex cold-read of the architecture plan", "requires_cli": "codex" }, + { "tool": "session_create", "when": "establish review session for follow-up questions" } + ] + }, + "plan-ceo-review": { + 
"phase": "strategic-review", + "context": "multi-LLM strategic assessment", + "tools": [ + { "tool": "gemini_request", "when": "get Gemini perspective on scope and strategic direction", "requires_cli": "gemini" }, + { "tool": "codex_request", "when": "get Codex assessment of feasibility and technical risk", "requires_cli": "codex" } + ] + }, + "ship": { + "phase": "pre-ship-review", + "context": "multi-LLM pre-ship verification", + "tools": [ + { "tool": "gemini_request_async", "when": "dispatch Gemini pre-ship review in parallel", "requires_cli": "gemini" }, + { "tool": "codex_request_async", "when": "dispatch Codex pre-ship review in parallel", "requires_cli": "codex" }, + { "tool": "llm_job_status", "when": "poll async job completion" }, + { "tool": "llm_job_result", "when": "collect pre-ship review results" } + ] + }, + "retro": { + "phase": "multi-perspective-analysis", + "context": "multi-LLM retrospective analysis", + "tools": [ + { "tool": "gemini_request", "when": "get Gemini perspective on patterns, trends, and blind spots", "requires_cli": "gemini" }, + { "tool": "codex_request", "when": "get Codex analysis of code quality trends", "requires_cli": "codex" } + ] + } + } +} diff --git a/contrib/add-tool/llm-gateway/uninstall.sh b/contrib/add-tool/llm-gateway/uninstall.sh new file mode 100755 index 000000000..8e8bd36b7 --- /dev/null +++ b/contrib/add-tool/llm-gateway/uninstall.sh @@ -0,0 +1,64 @@ +#!/usr/bin/env bash +# Remove llm-cli-gateway integration from gstack. +# Does NOT uninstall llm-cli-gateway itself — only removes the gstack integration. +set -e + +echo "=== Removing llm-cli-gateway integration from gstack ===" + +# 1. 
Remove MCP config entries (best-effort) +if command -v node >/dev/null 2>&1; then + node -e " + const fs = require('fs'); + const settings = process.env.HOME + '/.claude/settings.json'; + try { + const s = JSON.parse(fs.readFileSync(settings, 'utf-8')); + if (s.mcpServers && s.mcpServers['llm-cli-gw']) { + delete s.mcpServers['llm-cli-gw']; + fs.writeFileSync(settings, JSON.stringify(s, null, 2)); + console.log('Removed llm-cli-gw MCP server from Claude settings'); + } + } catch(e) {} + " 2>/dev/null || true +fi + +# 2. Remove from Codex config (best-effort, uses node for portability — no sed -i) +CODEX_CONFIG="$HOME/.codex/config.toml" +if [ -f "$CODEX_CONFIG" ] && grep -q 'llm-cli-gw' "$CODEX_CONFIG" 2>/dev/null; then + if command -v node >/dev/null 2>&1; then + node -e " + const fs = require('fs'); + const config = '$CODEX_CONFIG'; + try { + const lines = fs.readFileSync(config, 'utf-8').split('\n'); + const out = []; + let skip = false; + for (let i = 0; i < lines.length; i++) { + if (lines[i].trim() === '[[mcp_servers]]') { + // Look ahead: is this the llm-cli-gw block? + const block = lines.slice(i, i + 5).join('\n'); + if (block.includes('llm-cli-gw')) { skip = true; continue; } + } + if (skip) { + if (lines[i].trim() === '' || lines[i].startsWith('[[')) { skip = false; } + else { continue; } + } + if (!skip) out.push(lines[i]); + } + fs.writeFileSync(config, out.join('\n')); + console.log('Removed llm-cli-gw MCP server from Codex config'); + } catch(e) {} + " 2>/dev/null || true + else + echo "Warning: node not available. Manually remove llm-cli-gw from $CODEX_CONFIG" + fi +fi + +# 3. Regenerate gstack skills ({{LLM_GATEWAY_CONTEXT}} emits nothing without gateway) +GSTACK_DIR="${GSTACK_ROOT:-$HOME/.claude/skills/gstack}" +if [ -f "$GSTACK_DIR/package.json" ]; then + echo "Regenerating gstack skill docs..." + (cd "$GSTACK_DIR" && bun run gen:skill-docs --host all 2>/dev/null) || true +fi + +echo "Done. llm-cli-gateway integration removed. 
The gateway itself is still installed." +echo "To fully uninstall: npm uninstall -g llm-cli-gateway" diff --git a/investigate/SKILL.md.tmpl b/investigate/SKILL.md.tmpl index 3004300e2..f05def461 100644 --- a/investigate/SKILL.md.tmpl +++ b/investigate/SKILL.md.tmpl @@ -63,6 +63,8 @@ Gather context before forming any hypothesis. {{LEARNINGS_SEARCH}} +{{LLM_GATEWAY_CONTEXT}} + Output: **"Root cause hypothesis: ..."** — a specific, testable claim about what is wrong and why. --- diff --git a/plan-ceo-review/SKILL.md.tmpl b/plan-ceo-review/SKILL.md.tmpl index 225cd05da..1650e8160 100644 --- a/plan-ceo-review/SKILL.md.tmpl +++ b/plan-ceo-review/SKILL.md.tmpl @@ -595,6 +595,8 @@ If this plan has significant UI scope, recommend: "Consider running /plan-design {{CODEX_PLAN_REVIEW}} +{{LLM_GATEWAY_CONTEXT}} + ### Outside Voice Integration Rule Outside voice findings are INFORMATIONAL until the user explicitly approves each one. diff --git a/plan-eng-review/SKILL.md.tmpl b/plan-eng-review/SKILL.md.tmpl index 36c9d59e8..b358a7928 100644 --- a/plan-eng-review/SKILL.md.tmpl +++ b/plan-eng-review/SKILL.md.tmpl @@ -163,6 +163,8 @@ Evaluate: {{CODEX_PLAN_REVIEW}} +{{LLM_GATEWAY_CONTEXT}} + ### Outside Voice Integration Rule Outside voice findings are INFORMATIONAL until the user explicitly approves each one. diff --git a/retro/SKILL.md.tmpl b/retro/SKILL.md.tmpl index d89cb7175..0b85546c4 100644 --- a/retro/SKILL.md.tmpl +++ b/retro/SKILL.md.tmpl @@ -60,6 +60,8 @@ Usage: /retro [window | compare | global] {{LEARNINGS_SEARCH}} +{{LLM_GATEWAY_CONTEXT}} + ### Step 1: Gather Raw Data First, fetch origin and identify the current user: diff --git a/review/SKILL.md.tmpl b/review/SKILL.md.tmpl index 9ccb1ec23..082acee01 100644 --- a/review/SKILL.md.tmpl +++ b/review/SKILL.md.tmpl @@ -221,6 +221,8 @@ If no documentation files exist, skip this step silently. 
{{ADVERSARIAL_STEP}} +{{LLM_GATEWAY_CONTEXT}} + ## Step 5.8: Persist Eng Review result After all review passes complete, persist the final `/review` outcome so `/ship` can diff --git a/scripts/resolvers/index.ts b/scripts/resolvers/index.ts index 072b1a3da..ae1d7d364 100644 --- a/scripts/resolvers/index.ts +++ b/scripts/resolvers/index.ts @@ -18,6 +18,7 @@ import { generateConfidenceCalibration } from './confidence'; import { generateInvokeSkill } from './composition'; import { generateReviewArmy } from './review-army'; import { generateDxFramework } from './dx'; +import { generateLlmGatewayContext } from './llm-gateway'; export const RESOLVERS: Record<string, ResolverFn> = { SLUG_EVAL: generateSlugEval, @@ -62,4 +63,5 @@ export const RESOLVERS: Record<string, ResolverFn> = { REVIEW_ARMY: generateReviewArmy, CROSS_REVIEW_DEDUP: generateCrossReviewDedup, DX_FRAMEWORK: generateDxFramework, + LLM_GATEWAY_CONTEXT: generateLlmGatewayContext, }; diff --git a/scripts/resolvers/llm-gateway.ts b/scripts/resolvers/llm-gateway.ts new file mode 100644 index 000000000..25c6ba368 --- /dev/null +++ b/scripts/resolvers/llm-gateway.ts @@ -0,0 +1,84 @@ +import type { TemplateContext, ResolverFn } from './types'; +import { readFileSync } from 'fs'; +import { resolve } from 'path'; + +interface ToolMapping { + tool: string; + when: string; + requires_cli?: string; +} + +interface SkillIntegration { + phase: string; + context: string; + tools: ToolMapping[]; +} + +interface ToolsConfig { + tool: string; + mcp_server_name: string; + detection: { binary: string; min_version: string }; + integrations: Record<string, SkillIntegration>; +} + +let cachedConfig: ToolsConfig | null = null; + +function loadToolsConfig(): ToolsConfig { + if (cachedConfig) return cachedConfig; + const configPath = resolve(import.meta.dir, '../../contrib/add-tool/llm-gateway/tools.json'); + cachedConfig = JSON.parse(readFileSync(configPath, 'utf-8')); + return cachedConfig!; +} + +export const generateLlmGatewayContext: ResolverFn = (ctx: TemplateContext): string => { + let 
config: ToolsConfig; + try { + config = loadToolsConfig(); + } catch { + return ''; + } + + const integration = config.integrations[ctx.skillName]; + if (!integration) return ''; + + const prefix = `mcp__${config.mcp_server_name}__`; + + // Filter out tools for the current host's CLI (prevent self-invocation) + const tools = integration.tools.filter(t => { + if (t.requires_cli && ctx.host === t.requires_cli) return false; + return true; + }); + + if (tools.length === 0) return ''; + + const toolList = tools + .map(t => { + const cliNote = t.requires_cli + ? ` (requires \`LLM_GW_${t.requires_cli.toUpperCase()}: yes\`)` + : ''; + return `- \`${prefix}${t.tool}\` — ${t.when}${cliNote}`; + }) + .join('\n'); + + return `## Multi-LLM Orchestration (llm-cli-gateway) + +If preamble shows \`LLM_GATEWAY: unavailable\`: skip this section entirely. + +If preamble shows \`LLM_GATEWAY: available\`: + +1. **CLI availability:** Only use tools for CLIs shown as \`yes\` in the preamble. + Skip tool recommendations for unavailable CLIs. + +2. **Async for parallel work:** Use \`_async\` variants + \`llm_job_status\`/\`llm_job_result\` + when running multiple LLM requests in parallel. Use sync variants for single sequential calls. + +3. **Session continuity:** Use \`session_create\` to establish a session for multi-turn + workflows. Pass \`sessionId\` to subsequent requests in the same skill invocation. + +**During ${integration.context}**, use these gateway MCP tools: + +${toolList} + +Collect results from all models before synthesizing. Always show which models contributed +and flag where models agree vs. 
diverge.`; +}; diff --git a/scripts/resolvers/preamble.ts b/scripts/resolvers/preamble.ts index bacbc0f00..990b208c0 100644 --- a/scripts/resolvers/preamble.ts +++ b/scripts/resolvers/preamble.ts @@ -99,6 +99,21 @@ fi echo "VENDORED_GSTACK: $_VENDORED" # Detect spawned session (OpenClaw or other orchestrator) [ -n "$OPENCLAW_SESSION" ] && echo "SPAWNED_SESSION: true" || true +# Multi-LLM orchestration (llm-cli-gateway) +_LLM_GW="unavailable" +_LLM_GW_CLAUDE="no" +_LLM_GW_CODEX="no" +_LLM_GW_GEMINI="no" +if command -v llm-cli-gateway >/dev/null 2>&1; then + _LLM_GW="available" + command -v claude >/dev/null 2>&1 && _LLM_GW_CLAUDE="yes" + command -v codex >/dev/null 2>&1 && _LLM_GW_CODEX="yes" + command -v gemini >/dev/null 2>&1 && _LLM_GW_GEMINI="yes" +fi +echo "LLM_GATEWAY: $_LLM_GW" +[ "$_LLM_GW" = "available" ] && echo "LLM_GW_CLAUDE: $_LLM_GW_CLAUDE" +[ "$_LLM_GW" = "available" ] && echo "LLM_GW_CODEX: $_LLM_GW_CODEX" +[ "$_LLM_GW" = "available" ] && echo "LLM_GW_GEMINI: $_LLM_GW_GEMINI" \`\`\``; } diff --git a/ship/SKILL.md.tmpl b/ship/SKILL.md.tmpl index 76e4873d6..bcfc383f4 100644 --- a/ship/SKILL.md.tmpl +++ b/ship/SKILL.md.tmpl @@ -343,6 +343,8 @@ For each classified comment: {{ADVERSARIAL_STEP}} +{{LLM_GATEWAY_CONTEXT}} + {{LEARNINGS_LOG}} ## Step 4: Version bump (auto-decide) diff --git a/test/fixtures/golden/claude-ship-SKILL.md b/test/fixtures/golden/claude-ship-SKILL.md index 34cfaa7b2..17a6692ae 100644 --- a/test/fixtures/golden/claude-ship-SKILL.md +++ b/test/fixtures/golden/claude-ship-SKILL.md @@ -86,8 +86,31 @@ fi _ROUTING_DECLINED=$(~/.claude/skills/gstack/bin/gstack-config get routing_declined 2>/dev/null || echo "false") echo "HAS_ROUTING: $_HAS_ROUTING" echo "ROUTING_DECLINED: $_ROUTING_DECLINED" +# Vendoring deprecation: detect if CWD has a vendored gstack copy +_VENDORED="no" +if [ -d ".claude/skills/gstack" ] && [ ! 
-L ".claude/skills/gstack" ]; then + if [ -f ".claude/skills/gstack/VERSION" ] || [ -d ".claude/skills/gstack/.git" ]; then + _VENDORED="yes" + fi +fi +echo "VENDORED_GSTACK: $_VENDORED" # Detect spawned session (OpenClaw or other orchestrator) [ -n "$OPENCLAW_SESSION" ] && echo "SPAWNED_SESSION: true" || true +# Multi-LLM orchestration (llm-cli-gateway) +_LLM_GW="unavailable" +_LLM_GW_CLAUDE="no" +_LLM_GW_CODEX="no" +_LLM_GW_GEMINI="no" +if command -v llm-cli-gateway >/dev/null 2>&1; then + _LLM_GW="available" + command -v claude >/dev/null 2>&1 && _LLM_GW_CLAUDE="yes" + command -v codex >/dev/null 2>&1 && _LLM_GW_CODEX="yes" + command -v gemini >/dev/null 2>&1 && _LLM_GW_GEMINI="yes" +fi +echo "LLM_GATEWAY: $_LLM_GW" +[ "$_LLM_GW" = "available" ] && echo "LLM_GW_CLAUDE: $_LLM_GW_CLAUDE" +[ "$_LLM_GW" = "available" ] && echo "LLM_GW_CODEX: $_LLM_GW_CODEX" +[ "$_LLM_GW" = "available" ] && echo "LLM_GW_GEMINI: $_LLM_GW_GEMINI" ``` If `PROACTIVE` is `"false"`, do not proactively suggest gstack skills AND do not @@ -214,6 +237,38 @@ Say "No problem. You can add routing rules later by running `gstack-config set r This only happens once per project. If `HAS_ROUTING` is `yes` or `ROUTING_DECLINED` is `true`, skip this entirely. +If `VENDORED_GSTACK` is `yes`: This project has a vendored copy of gstack at +`.claude/skills/gstack/`. Vendoring is deprecated. We will not keep vendored copies +up to date, so this project's gstack will fall behind. + +Use AskUserQuestion (one-time per project, check for `~/.gstack/.vendoring-warned-$SLUG` marker): + +> This project has gstack vendored in `.claude/skills/gstack/`. Vendoring is deprecated. +> We won't keep this copy up to date, so you'll fall behind on new features and fixes. +> +> Want to migrate to team mode? It takes about 30 seconds. + +Options: +- A) Yes, migrate to team mode now +- B) No, I'll handle it myself + +If A: +1. Run `git rm -r .claude/skills/gstack/` +2. Run `echo '.claude/skills/gstack/' >> .gitignore` +3. 
Run `~/.claude/skills/gstack/bin/gstack-team-init required` (or `optional`) +4. Run `git add .claude/ .gitignore CLAUDE.md && git commit -m "chore: migrate gstack from vendored to team mode"` +5. Tell the user: "Done. Each developer now runs: `cd ~/.claude/skills/gstack && ./setup --team`" + +If B: say "OK, you're on your own to keep the vendored copy up to date." + +Always run (regardless of choice): +```bash +eval "$(~/.claude/skills/gstack/bin/gstack-slug 2>/dev/null)" 2>/dev/null || true +touch ~/.gstack/.vendoring-warned-${SLUG:-unknown} +``` + +This only happens once per project. If the marker file exists, skip entirely. + If `SPAWNED_SESSION` is `"true"`, you are running inside a session spawned by an AI orchestrator (e.g., OpenClaw). In spawned sessions: - Do NOT use AskUserQuestion for interactive prompts. Auto-choose the recommended option. @@ -580,6 +635,16 @@ You are running the `/ship` workflow. This is a **non-interactive, fully automat - Auto-fixable review findings (dead code, N+1, stale comments — fixed automatically) - Test coverage gaps within target threshold (auto-generate and commit, or flag in PR body) +**Re-run behavior (idempotency):** +Re-running `/ship` means "run the whole checklist again." Every verification step +(tests, coverage audit, plan completion, pre-landing review, adversarial review, +VERSION/CHANGELOG check, TODOS, document-release) runs on every invocation. +Only *actions* are idempotent: +- Step 4: If VERSION already bumped, skip the bump but still read the version +- Step 7: If already pushed, skip the push command +- Step 8: If PR exists, update the body instead of creating a new PR +Never skip a verification step because a prior `/ship` run already performed it. + --- ## Step 1: Pre-flight @@ -1658,7 +1723,244 @@ Present Codex output under a `CODEX (design):` header, merged with the checklist Include any design findings alongside the code review findings. They follow the same Fix-First flow below. -4. 
**Classify each finding as AUTO-FIX or ASK** per the Fix-First Heuristic in +## Step 3.55: Review Army — Specialist Dispatch + +### Detect stack and scope + +```bash +source <(~/.claude/skills/gstack/bin/gstack-diff-scope 2>/dev/null) || true +# Detect stack for specialist context +STACK="" +[ -f Gemfile ] && STACK="${STACK}ruby " +[ -f package.json ] && STACK="${STACK}node " +[ -f requirements.txt ] || [ -f pyproject.toml ] && STACK="${STACK}python " +[ -f go.mod ] && STACK="${STACK}go " +[ -f Cargo.toml ] && STACK="${STACK}rust " +echo "STACK: ${STACK:-unknown}" +DIFF_INS=$(git diff origin/ --stat | tail -1 | grep -oE '[0-9]+ insertion' | grep -oE '[0-9]+' || echo "0") +DIFF_DEL=$(git diff origin/ --stat | tail -1 | grep -oE '[0-9]+ deletion' | grep -oE '[0-9]+' || echo "0") +DIFF_LINES=$((DIFF_INS + DIFF_DEL)) +echo "DIFF_LINES: $DIFF_LINES" +# Detect test framework for specialist test stub generation +TEST_FW="" +{ [ -f jest.config.ts ] || [ -f jest.config.js ]; } && TEST_FW="jest" +[ -f vitest.config.ts ] && TEST_FW="vitest" +{ [ -f spec/spec_helper.rb ] || [ -f .rspec ]; } && TEST_FW="rspec" +{ [ -f pytest.ini ] || [ -f conftest.py ]; } && TEST_FW="pytest" +[ -f go.mod ] && TEST_FW="go-test" +echo "TEST_FW: ${TEST_FW:-unknown}" +``` + +### Read specialist hit rates (adaptive gating) + +```bash +~/.claude/skills/gstack/bin/gstack-specialist-stats 2>/dev/null || true +``` + +### Select specialists + +Based on the scope signals above, select which specialists to dispatch. + +**Always-on (dispatch on every review with 50+ changed lines):** +1. **Testing** — read `~/.claude/skills/gstack/review/specialists/testing.md` +2. **Maintainability** — read `~/.claude/skills/gstack/review/specialists/maintainability.md` + +**If DIFF_LINES < 50:** Skip all specialists. Print: "Small diff ($DIFF_LINES lines) — specialists skipped." Continue to the Fix-First flow (item 4). + +**Conditional (dispatch if the matching scope signal is true):** +3. 
**Security** — if SCOPE_AUTH=true, OR if SCOPE_BACKEND=true AND DIFF_LINES > 100. Read `~/.claude/skills/gstack/review/specialists/security.md` +4. **Performance** — if SCOPE_BACKEND=true OR SCOPE_FRONTEND=true. Read `~/.claude/skills/gstack/review/specialists/performance.md` +5. **Data Migration** — if SCOPE_MIGRATIONS=true. Read `~/.claude/skills/gstack/review/specialists/data-migration.md` +6. **API Contract** — if SCOPE_API=true. Read `~/.claude/skills/gstack/review/specialists/api-contract.md` +7. **Design** — if SCOPE_FRONTEND=true. Use the existing design review checklist at `~/.claude/skills/gstack/review/design-checklist.md` + +### Adaptive gating + +After scope-based selection, apply adaptive gating based on specialist hit rates: + +For each conditional specialist that passed scope gating, check the `gstack-specialist-stats` output above: +- If tagged `[GATE_CANDIDATE]` (0 findings in 10+ dispatches): skip it. Print: "[specialist] auto-gated (0 findings in N reviews)." +- If tagged `[NEVER_GATE]`: always dispatch regardless of hit rate. Security and data-migration are insurance policy specialists — they should run even when silent. + +**Force flags:** If the user's prompt includes `--security`, `--performance`, `--testing`, `--maintainability`, `--data-migration`, `--api-contract`, `--design`, or `--all-specialists`, force-include that specialist regardless of gating. + +Note which specialists were selected, gated, and skipped. Print the selection: +"Dispatching N specialists: [names]. Skipped: [names] (scope not detected). Gated: [names] (0 findings in N+ reviews)." + +--- + +### Dispatch specialists in parallel + +For each selected specialist, launch an independent subagent via the Agent tool. +**Launch ALL selected specialists in a single message** (multiple Agent tool calls) +so they run in parallel. Each subagent has fresh context — no prior review bias. + +**Each specialist subagent prompt:** + +Construct the prompt for each specialist. 
The prompt includes: + +1. The specialist's checklist content (you already read the file above) +2. Stack context: "This is a {STACK} project." +3. Past learnings for this domain (if any exist): + +```bash +~/.claude/skills/gstack/bin/gstack-learnings-search --type pitfall --query "{specialist domain}" --limit 5 2>/dev/null || true +``` + +If learnings are found, include them: "Past learnings for this domain: {learnings}" + +4. Instructions: + +"You are a specialist code reviewer. Read the checklist below, then run +`git diff origin/` to get the full diff. Apply the checklist against the diff. + +For each finding, output a JSON object on its own line: +{\"severity\":\"CRITICAL|INFORMATIONAL\",\"confidence\":N,\"path\":\"file\",\"line\":N,\"category\":\"category\",\"summary\":\"description\",\"fix\":\"recommended fix\",\"fingerprint\":\"path:line:category\",\"specialist\":\"name\"} + +Required fields: severity, confidence, path, category, summary, specialist. +Optional: line, fix, fingerprint, evidence, test_stub. + +If you can write a test that would catch this issue, include it in the `test_stub` field. +Use the detected test framework ({TEST_FW}). Write a minimal skeleton — describe/it/test +blocks with clear intent. Skip test_stub for architectural or design-only findings. + +If no findings: output `NO FINDINGS` and nothing else. +Do not output anything else — no preamble, no summary, no commentary. + +Stack context: {STACK} +Past learnings: {learnings or 'none'} + +CHECKLIST: +{checklist content}" + +**Subagent configuration:** +- Use `subagent_type: "general-purpose"` +- Do NOT use `run_in_background` — all specialists must complete before merge +- If any specialist subagent fails or times out, log the failure and continue with results from successful specialists. Specialists are additive — partial results are better than no results. + +--- + +### Step 3.56: Collect and merge findings + +After all specialist subagents complete, collect their outputs. 
+ +**Parse findings:** +For each specialist's output: +1. If output is "NO FINDINGS" — skip, this specialist found nothing +2. Otherwise, parse each line as a JSON object. Skip lines that are not valid JSON. +3. Collect all parsed findings into a single list, tagged with their specialist name. + +**Fingerprint and deduplicate:** +For each finding, compute its fingerprint: +- If `fingerprint` field is present, use it +- Otherwise: `{path}:{line}:{category}` (if line is present) or `{path}:{category}` + +Group findings by fingerprint. For findings sharing the same fingerprint: +- Keep the finding with the highest confidence score +- Tag it: "MULTI-SPECIALIST CONFIRMED ({specialist1} + {specialist2})" +- Boost confidence by +1 (cap at 10) +- Note the confirming specialists in the output + +**Apply confidence gates:** +- Confidence 7+: show normally in the findings output +- Confidence 5-6: show with caveat "Medium confidence — verify this is actually an issue" +- Confidence 3-4: move to appendix (suppress from main findings) +- Confidence 1-2: suppress entirely + +**Compute PR Quality Score:** +After merging, compute the quality score: +`quality_score = max(0, 10 - (critical_count * 2 + informational_count * 0.5))` +Cap at 10. Log this in the review result at the end. + +**Output merged findings:** +Present the merged findings in the same format as the current review: + +``` +SPECIALIST REVIEW: N findings (X critical, Y informational) from Z specialists + +[For each finding, in order: CRITICAL first, then INFORMATIONAL, sorted by confidence descending] +[SEVERITY] (confidence: N/10, specialist: name) path:line — summary + Fix: recommended fix + [If MULTI-SPECIALIST CONFIRMED: show confirmation note] + +PR Quality Score: X/10 +``` + +These findings flow into the Fix-First flow (item 4) alongside the checklist pass (Step 3.5). +The Fix-First heuristic applies identically — specialist findings follow the same AUTO-FIX vs ASK classification. 
+ +**Compile per-specialist stats:** +After merging findings, compile a `specialists` object for the review-log persist. +For each specialist (testing, maintainability, security, performance, data-migration, api-contract, design, red-team): +- If dispatched: `{"dispatched": true, "findings": N, "critical": N, "informational": N}` +- If skipped by scope: `{"dispatched": false, "reason": "scope"}` +- If skipped by gating: `{"dispatched": false, "reason": "gated"}` +- If not applicable (e.g., red-team not activated): omit from the object + +Include the Design specialist even though it uses `design-checklist.md` instead of the specialist schema files. +Remember these stats — you will need them for the review-log entry in Step 5.8. + +--- + +### Red Team dispatch (conditional) + +**Activation:** Only if DIFF_LINES > 200 OR any specialist produced a CRITICAL finding. + +If activated, dispatch one more subagent via the Agent tool (foreground, not background). + +The Red Team subagent receives: +1. The red-team checklist from `~/.claude/skills/gstack/review/specialists/red-team.md` +2. The merged specialist findings from Step 3.56 (so it knows what was already caught) +3. The git diff command + +Prompt: "You are a red team reviewer. The code has already been reviewed by N specialists +who found the following issues: {merged findings summary}. Your job is to find what they +MISSED. Read the checklist, run `git diff origin/`, and look for gaps. +Output findings as JSON objects (same schema as the specialists). Focus on cross-cutting +concerns, integration boundary issues, and failure modes that specialist checklists +don't cover." + +If the Red Team finds additional issues, merge them into the findings list before +the Fix-First flow (item 4). Red Team findings are tagged with `"specialist":"red-team"`. + +If the Red Team returns NO FINDINGS, note: "Red Team review: no additional issues found." +If the Red Team subagent fails or times out, skip silently and continue. 
+
+### Step 3.57: Cross-review finding dedup
+
+Before classifying findings, check if any were previously skipped by the user in a prior review on this branch.
+
+```bash
+~/.claude/skills/gstack/bin/gstack-review-read
+```
+
+Parse the output: only lines BEFORE `---CONFIG---` are JSONL entries (the output also contains `---CONFIG---` and `---HEAD---` footer sections that are not JSONL — ignore those).
+
+For each JSONL entry that has a `findings` array:
+1. Collect all fingerprints where `action: "skipped"`
+2. Note the `commit` field from that entry
+
+If skipped fingerprints exist, get the list of files changed since that review (diff from the noted `commit` to HEAD):
+
+```bash
+git diff --name-only <commit> HEAD
+```
+
+For each current finding (from both the checklist pass (Step 3.5) and specialist review (Step 3.55-3.56)), check:
+- Does its fingerprint match a previously skipped finding?
+- Is the finding's file path NOT in the changed-files set?
+
+If both conditions are true: suppress the finding. It was intentionally skipped and the relevant code hasn't changed.
+
+Print: "Suppressed N findings from prior reviews (previously skipped by user)"
+
+**Only suppress `skipped` findings — never `fixed` or `auto-fixed`** (those might regress and should be re-checked).
+
+If no prior reviews exist or none have a `findings` array, skip this step silently.
+
+Output a summary header: `Pre-Landing Review: N issues (X critical, Y informational)`
+
+4. **Classify each finding from both the checklist pass and specialist review (Step 3.55-3.56) as AUTO-FIX or ASK** per the Fix-First Heuristic in
 checklist.md. Critical findings lean toward ASK; informational lean toward AUTO-FIX.

 5. **Auto-fix all AUTO-FIX items.** Apply each fix. Output one line per fix:
@@ -1680,10 +1982,13 @@ Present Codex output under a `CODEX (design):` header, merged with the checklist

 9. 
Persist the review result to the review log: ```bash -~/.claude/skills/gstack/bin/gstack-review-log '{"skill":"review","timestamp":"TIMESTAMP","status":"STATUS","issues_found":N,"critical":N,"informational":N,"commit":"'"$(git rev-parse --short HEAD)"'","via":"ship"}' +~/.claude/skills/gstack/bin/gstack-review-log '{"skill":"review","timestamp":"TIMESTAMP","status":"STATUS","issues_found":N,"critical":N,"informational":N,"quality_score":SCORE,"specialists":SPECIALISTS_JSON,"findings":FINDINGS_JSON,"commit":"'"$(git rev-parse --short HEAD)"'","via":"ship"}' ``` Substitute TIMESTAMP (ISO 8601), STATUS ("clean" if no issues, "issues_found" otherwise), and N values from the summary counts above. The `via:"ship"` distinguishes from standalone `/review` runs. +- `quality_score` = the PR Quality Score computed in Step 3.56 (e.g., 7.5). If specialists were skipped (small diff), use `10.0` +- `specialists` = the per-specialist stats object compiled in Step 3.56. Each specialist that was considered gets an entry: `{"dispatched":true/false,"findings":N,"critical":N,"informational":N}` if dispatched, or `{"dispatched":false,"reason":"scope|gated"}` if skipped. Example: `{"testing":{"dispatched":true,"findings":2,"critical":0,"informational":2},"security":{"dispatched":false,"reason":"scope"}}` +- `findings` = array of per-finding records. For each finding (from checklist pass and specialists), include: `{"fingerprint":"path:line:category","severity":"CRITICAL|INFORMATIONAL","action":"ACTION"}`. ACTION is `"auto-fixed"`, `"fixed"` (user approved), or `"skipped"` (user chose Skip). Save the review output — it goes into the PR body in Step 8. @@ -1853,6 +2158,31 @@ High-confidence findings (agreed on by multiple sources) should be prioritized f --- +## Multi-LLM Orchestration (llm-cli-gateway) + +If preamble shows `LLM_GATEWAY: unavailable`: skip this section entirely. + +If preamble shows `LLM_GATEWAY: available`: + +1. 
**CLI availability:** Only use tools for CLIs shown as `yes` in the preamble. + Skip tool recommendations for unavailable CLIs. + +2. **Async for parallel work:** Use `_async` variants + `llm_job_status`/`llm_job_result` + when running multiple LLM requests in parallel. Use sync variants for single sequential calls. + +3. **Session continuity:** Use `session_create` to establish a session for multi-turn + workflows. Pass `sessionId` to subsequent requests in the same skill invocation. + +**During multi-LLM pre-ship verification**, use these gateway MCP tools: + +- `mcp__llm-cli-gw__gemini_request_async` — dispatch Gemini pre-ship review in parallel (requires `LLM_GW_GEMINI: yes`) +- `mcp__llm-cli-gw__codex_request_async` — dispatch Codex pre-ship review in parallel (requires `LLM_GW_CODEX: yes`) +- `mcp__llm-cli-gw__llm_job_status` — poll async job completion +- `mcp__llm-cli-gw__llm_job_result` — collect pre-ship review results + +Collect results from all models before synthesizing. Always show which models contributed +and flag where models agree vs. diverge. + ## Capture Learnings If you discovered a non-obvious pattern, pitfall, or architectural insight during @@ -1889,7 +2219,7 @@ echo "BASE: $BASE_VERSION HEAD: $CURRENT_VERSION" if [ "$CURRENT_VERSION" != "$BASE_VERSION" ]; then echo "ALREADY_BUMPED"; fi ``` -If output shows `ALREADY_BUMPED`, VERSION was already bumped on this branch (prior `/ship` run). Skip the rest of Step 4 and use the current VERSION. Otherwise proceed with the bump. +If output shows `ALREADY_BUMPED`, VERSION was already bumped on this branch (prior `/ship` run). Skip the bump action (do not modify VERSION), but read the current VERSION value — it is needed for CHANGELOG and PR body. Continue to the next step. Otherwise proceed with the bump. 1. 
Read the current `VERSION` file (4-digit format: `MAJOR.MINOR.PATCH.MICRO`) @@ -2080,7 +2410,7 @@ echo "LOCAL: $LOCAL REMOTE: $REMOTE" [ "$LOCAL" = "$REMOTE" ] && echo "ALREADY_PUSHED" || echo "PUSH_NEEDED" ``` -If `ALREADY_PUSHED`, skip the push. Otherwise push with upstream tracking: +If `ALREADY_PUSHED`, skip the push but continue to Step 8. Otherwise push with upstream tracking: ```bash git push -u origin @@ -2102,7 +2432,7 @@ gh pr view --json url,number,state -q 'if .state == "OPEN" then "PR #\(.number): glab mr view -F json 2>/dev/null | jq -r 'if .state == "opened" then "MR_EXISTS" else "NO_MR" end' 2>/dev/null || echo "NO_MR" ``` -If an **open** PR/MR already exists: **update** the PR body with the latest test results, coverage, and review findings using `gh pr edit --body "..."` (GitHub) or `glab mr update -d "..."` (GitLab). Print the existing URL and continue to Step 8.5. +If an **open** PR/MR already exists: **update** the PR body using `gh pr edit --body "..."` (GitHub) or `glab mr update -d "..."` (GitLab). Always regenerate the PR body from scratch using this run's fresh results (test output, coverage audit, review findings, adversarial review, TODOS summary). Never reuse stale PR body content from a prior run. Print the existing URL and continue to Step 8.5. If no PR/MR exists: create a pull request (GitHub) or merge request (GitLab) using the platform detected in Step 0. @@ -2207,6 +2537,8 @@ execute its full workflow: This step is automatic. Do not ask the user for confirmation. The goal is zero-friction doc updates — the user runs `/ship` and documentation stays current without a separate command. +If Step 8.5 created a docs commit, re-edit the PR/MR body to include the latest commit SHA in the summary. This ensures the PR body reflects the truly final state after document-release. 
+ --- ## Step 8.75: Persist ship metrics diff --git a/test/fixtures/golden/codex-ship-SKILL.md b/test/fixtures/golden/codex-ship-SKILL.md index ec0116f06..61fb9c04c 100644 --- a/test/fixtures/golden/codex-ship-SKILL.md +++ b/test/fixtures/golden/codex-ship-SKILL.md @@ -80,8 +80,31 @@ fi _ROUTING_DECLINED=$($GSTACK_BIN/gstack-config get routing_declined 2>/dev/null || echo "false") echo "HAS_ROUTING: $_HAS_ROUTING" echo "ROUTING_DECLINED: $_ROUTING_DECLINED" +# Vendoring deprecation: detect if CWD has a vendored gstack copy +_VENDORED="no" +if [ -d ".agents/skills/gstack" ] && [ ! -L ".agents/skills/gstack" ]; then + if [ -f ".agents/skills/gstack/VERSION" ] || [ -d ".agents/skills/gstack/.git" ]; then + _VENDORED="yes" + fi +fi +echo "VENDORED_GSTACK: $_VENDORED" # Detect spawned session (OpenClaw or other orchestrator) [ -n "$OPENCLAW_SESSION" ] && echo "SPAWNED_SESSION: true" || true +# Multi-LLM orchestration (llm-cli-gateway) +_LLM_GW="unavailable" +_LLM_GW_CLAUDE="no" +_LLM_GW_CODEX="no" +_LLM_GW_GEMINI="no" +if command -v llm-cli-gateway >/dev/null 2>&1; then + _LLM_GW="available" + command -v claude >/dev/null 2>&1 && _LLM_GW_CLAUDE="yes" + command -v codex >/dev/null 2>&1 && _LLM_GW_CODEX="yes" + command -v gemini >/dev/null 2>&1 && _LLM_GW_GEMINI="yes" +fi +echo "LLM_GATEWAY: $_LLM_GW" +[ "$_LLM_GW" = "available" ] && echo "LLM_GW_CLAUDE: $_LLM_GW_CLAUDE" +[ "$_LLM_GW" = "available" ] && echo "LLM_GW_CODEX: $_LLM_GW_CODEX" +[ "$_LLM_GW" = "available" ] && echo "LLM_GW_GEMINI: $_LLM_GW_GEMINI" ``` If `PROACTIVE` is `"false"`, do not proactively suggest gstack skills AND do not @@ -208,6 +231,38 @@ Say "No problem. You can add routing rules later by running `gstack-config set r This only happens once per project. If `HAS_ROUTING` is `yes` or `ROUTING_DECLINED` is `true`, skip this entirely. +If `VENDORED_GSTACK` is `yes`: This project has a vendored copy of gstack at +`.agents/skills/gstack/`. Vendoring is deprecated. 
We will not keep vendored copies +up to date, so this project's gstack will fall behind. + +Use AskUserQuestion (one-time per project, check for `~/.gstack/.vendoring-warned-$SLUG` marker): + +> This project has gstack vendored in `.agents/skills/gstack/`. Vendoring is deprecated. +> We won't keep this copy up to date, so you'll fall behind on new features and fixes. +> +> Want to migrate to team mode? It takes about 30 seconds. + +Options: +- A) Yes, migrate to team mode now +- B) No, I'll handle it myself + +If A: +1. Run `git rm -r .agents/skills/gstack/` +2. Run `echo '.agents/skills/gstack/' >> .gitignore` +3. Run `$GSTACK_BIN/gstack-team-init required` (or `optional`) +4. Run `git add .claude/ .gitignore CLAUDE.md && git commit -m "chore: migrate gstack from vendored to team mode"` +5. Tell the user: "Done. Each developer now runs: `cd $GSTACK_ROOT && ./setup --team`" + +If B: say "OK, you're on your own to keep the vendored copy up to date." + +Always run (regardless of choice): +```bash +eval "$($GSTACK_BIN/gstack-slug 2>/dev/null)" 2>/dev/null || true +touch ~/.gstack/.vendoring-warned-${SLUG:-unknown} +``` + +This only happens once per project. If the marker file exists, skip entirely. + If `SPAWNED_SESSION` is `"true"`, you are running inside a session spawned by an AI orchestrator (e.g., OpenClaw). In spawned sessions: - Do NOT use AskUserQuestion for interactive prompts. Auto-choose the recommended option. @@ -574,6 +629,16 @@ You are running the `/ship` workflow. This is a **non-interactive, fully automat - Auto-fixable review findings (dead code, N+1, stale comments — fixed automatically) - Test coverage gaps within target threshold (auto-generate and commit, or flag in PR body) +**Re-run behavior (idempotency):** +Re-running `/ship` means "run the whole checklist again." 
Every verification step
+(tests, coverage audit, plan completion, pre-landing review, adversarial review,
+VERSION/CHANGELOG check, TODOS, document-release) runs on every invocation.
+Only *actions* are idempotent:
+- Step 4: If VERSION already bumped, skip the bump but still read the version
+- Step 7: If already pushed, skip the push command
+- Step 8: If PR exists, update the body instead of creating a new PR
+Never skip a verification step because a prior `/ship` run already performed it.
+
 ---

 ## Step 1: Pre-flight
@@ -1602,7 +1667,43 @@ Substitute: TIMESTAMP = ISO 8601 datetime, STATUS = "clean" if 0 findings or "is
 Include any design findings alongside the code review findings. They follow the
 same Fix-First flow below.

-4. **Classify each finding as AUTO-FIX or ASK** per the Fix-First Heuristic in
+
+
+### Step 3.57: Cross-review finding dedup
+
+Before classifying findings, check if any were previously skipped by the user in a prior review on this branch.
+
+```bash
+$GSTACK_ROOT/bin/gstack-review-read
+```
+
+Parse the output: only lines BEFORE `---CONFIG---` are JSONL entries (the output also contains `---CONFIG---` and `---HEAD---` footer sections that are not JSONL — ignore those).
+
+For each JSONL entry that has a `findings` array:
+1. Collect all fingerprints where `action: "skipped"`
+2. Note the `commit` field from that entry
+
+If skipped fingerprints exist, get the list of files changed since that review (diff from the noted `commit` to HEAD):
+
+```bash
+git diff --name-only <commit> HEAD
+```
+
+For each current finding (from both the checklist pass (Step 3.5) and specialist review (Step 3.55-3.56)), check:
+- Does its fingerprint match a previously skipped finding?
+- Is the finding's file path NOT in the changed-files set?
+
+If both conditions are true: suppress the finding. It was intentionally skipped and the relevant code hasn't changed. 
+ +Print: "Suppressed N findings from prior reviews (previously skipped by user)" + +**Only suppress `skipped` findings — never `fixed` or `auto-fixed`** (those might regress and should be re-checked). + +If no prior reviews exist or none have a `findings` array, skip this step silently. + +Output a summary header: `Pre-Landing Review: N issues (X critical, Y informational)` + +4. **Classify each finding from both the checklist pass and specialist review (Step 3.55-3.56) as AUTO-FIX or ASK** per the Fix-First Heuristic in checklist.md. Critical findings lean toward ASK; informational lean toward AUTO-FIX. 5. **Auto-fix all AUTO-FIX items.** Apply each fix. Output one line per fix: @@ -1624,10 +1725,13 @@ Substitute: TIMESTAMP = ISO 8601 datetime, STATUS = "clean" if 0 findings or "is 9. Persist the review result to the review log: ```bash -$GSTACK_ROOT/bin/gstack-review-log '{"skill":"review","timestamp":"TIMESTAMP","status":"STATUS","issues_found":N,"critical":N,"informational":N,"commit":"'"$(git rev-parse --short HEAD)"'","via":"ship"}' +$GSTACK_ROOT/bin/gstack-review-log '{"skill":"review","timestamp":"TIMESTAMP","status":"STATUS","issues_found":N,"critical":N,"informational":N,"quality_score":SCORE,"specialists":SPECIALISTS_JSON,"findings":FINDINGS_JSON,"commit":"'"$(git rev-parse --short HEAD)"'","via":"ship"}' ``` Substitute TIMESTAMP (ISO 8601), STATUS ("clean" if no issues, "issues_found" otherwise), and N values from the summary counts above. The `via:"ship"` distinguishes from standalone `/review` runs. +- `quality_score` = the PR Quality Score computed in Step 3.56 (e.g., 7.5). If specialists were skipped (small diff), use `10.0` +- `specialists` = the per-specialist stats object compiled in Step 3.56. Each specialist that was considered gets an entry: `{"dispatched":true/false,"findings":N,"critical":N,"informational":N}` if dispatched, or `{"dispatched":false,"reason":"scope|gated"}` if skipped. 
Example: `{"testing":{"dispatched":true,"findings":2,"critical":0,"informational":2},"security":{"dispatched":false,"reason":"scope"}}` +- `findings` = array of per-finding records. For each finding (from checklist pass and specialists), include: `{"fingerprint":"path:line:category","severity":"CRITICAL|INFORMATIONAL","action":"ACTION"}`. ACTION is `"auto-fixed"`, `"fixed"` (user approved), or `"skipped"` (user chose Skip). Save the review output — it goes into the PR body in Step 8. @@ -1674,6 +1778,30 @@ For each classified comment: +## Multi-LLM Orchestration (llm-cli-gateway) + +If preamble shows `LLM_GATEWAY: unavailable`: skip this section entirely. + +If preamble shows `LLM_GATEWAY: available`: + +1. **CLI availability:** Only use tools for CLIs shown as `yes` in the preamble. + Skip tool recommendations for unavailable CLIs. + +2. **Async for parallel work:** Use `_async` variants + `llm_job_status`/`llm_job_result` + when running multiple LLM requests in parallel. Use sync variants for single sequential calls. + +3. **Session continuity:** Use `session_create` to establish a session for multi-turn + workflows. Pass `sessionId` to subsequent requests in the same skill invocation. + +**During multi-LLM pre-ship verification**, use these gateway MCP tools: + +- `mcp__llm-cli-gw__gemini_request_async` — dispatch Gemini pre-ship review in parallel (requires `LLM_GW_GEMINI: yes`) +- `mcp__llm-cli-gw__llm_job_status` — poll async job completion +- `mcp__llm-cli-gw__llm_job_result` — collect pre-ship review results + +Collect results from all models before synthesizing. Always show which models contributed +and flag where models agree vs. diverge. 
+ ## Capture Learnings If you discovered a non-obvious pattern, pitfall, or architectural insight during @@ -1710,7 +1838,7 @@ echo "BASE: $BASE_VERSION HEAD: $CURRENT_VERSION" if [ "$CURRENT_VERSION" != "$BASE_VERSION" ]; then echo "ALREADY_BUMPED"; fi ``` -If output shows `ALREADY_BUMPED`, VERSION was already bumped on this branch (prior `/ship` run). Skip the rest of Step 4 and use the current VERSION. Otherwise proceed with the bump. +If output shows `ALREADY_BUMPED`, VERSION was already bumped on this branch (prior `/ship` run). Skip the bump action (do not modify VERSION), but read the current VERSION value — it is needed for CHANGELOG and PR body. Continue to the next step. Otherwise proceed with the bump. 1. Read the current `VERSION` file (4-digit format: `MAJOR.MINOR.PATCH.MICRO`) @@ -1901,7 +2029,7 @@ echo "LOCAL: $LOCAL REMOTE: $REMOTE" [ "$LOCAL" = "$REMOTE" ] && echo "ALREADY_PUSHED" || echo "PUSH_NEEDED" ``` -If `ALREADY_PUSHED`, skip the push. Otherwise push with upstream tracking: +If `ALREADY_PUSHED`, skip the push but continue to Step 8. Otherwise push with upstream tracking: ```bash git push -u origin @@ -1923,7 +2051,7 @@ gh pr view --json url,number,state -q 'if .state == "OPEN" then "PR #\(.number): glab mr view -F json 2>/dev/null | jq -r 'if .state == "opened" then "MR_EXISTS" else "NO_MR" end' 2>/dev/null || echo "NO_MR" ``` -If an **open** PR/MR already exists: **update** the PR body with the latest test results, coverage, and review findings using `gh pr edit --body "..."` (GitHub) or `glab mr update -d "..."` (GitLab). Print the existing URL and continue to Step 8.5. +If an **open** PR/MR already exists: **update** the PR body using `gh pr edit --body "..."` (GitHub) or `glab mr update -d "..."` (GitLab). Always regenerate the PR body from scratch using this run's fresh results (test output, coverage audit, review findings, adversarial review, TODOS summary). Never reuse stale PR body content from a prior run. 
Print the existing URL and continue to Step 8.5. If no PR/MR exists: create a pull request (GitHub) or merge request (GitLab) using the platform detected in Step 0. @@ -2028,6 +2156,8 @@ execute its full workflow: This step is automatic. Do not ask the user for confirmation. The goal is zero-friction doc updates — the user runs `/ship` and documentation stays current without a separate command. +If Step 8.5 created a docs commit, re-edit the PR/MR body to include the latest commit SHA in the summary. This ensures the PR body reflects the truly final state after document-release. + --- ## Step 8.75: Persist ship metrics diff --git a/test/fixtures/golden/factory-ship-SKILL.md b/test/fixtures/golden/factory-ship-SKILL.md index 95f051118..0c9637263 100644 --- a/test/fixtures/golden/factory-ship-SKILL.md +++ b/test/fixtures/golden/factory-ship-SKILL.md @@ -82,8 +82,31 @@ fi _ROUTING_DECLINED=$($GSTACK_BIN/gstack-config get routing_declined 2>/dev/null || echo "false") echo "HAS_ROUTING: $_HAS_ROUTING" echo "ROUTING_DECLINED: $_ROUTING_DECLINED" +# Vendoring deprecation: detect if CWD has a vendored gstack copy +_VENDORED="no" +if [ -d ".factory/skills/gstack" ] && [ ! 
-L ".factory/skills/gstack" ]; then + if [ -f ".factory/skills/gstack/VERSION" ] || [ -d ".factory/skills/gstack/.git" ]; then + _VENDORED="yes" + fi +fi +echo "VENDORED_GSTACK: $_VENDORED" # Detect spawned session (OpenClaw or other orchestrator) [ -n "$OPENCLAW_SESSION" ] && echo "SPAWNED_SESSION: true" || true +# Multi-LLM orchestration (llm-cli-gateway) +_LLM_GW="unavailable" +_LLM_GW_CLAUDE="no" +_LLM_GW_CODEX="no" +_LLM_GW_GEMINI="no" +if command -v llm-cli-gateway >/dev/null 2>&1; then + _LLM_GW="available" + command -v claude >/dev/null 2>&1 && _LLM_GW_CLAUDE="yes" + command -v codex >/dev/null 2>&1 && _LLM_GW_CODEX="yes" + command -v gemini >/dev/null 2>&1 && _LLM_GW_GEMINI="yes" +fi +echo "LLM_GATEWAY: $_LLM_GW" +[ "$_LLM_GW" = "available" ] && echo "LLM_GW_CLAUDE: $_LLM_GW_CLAUDE" +[ "$_LLM_GW" = "available" ] && echo "LLM_GW_CODEX: $_LLM_GW_CODEX" +[ "$_LLM_GW" = "available" ] && echo "LLM_GW_GEMINI: $_LLM_GW_GEMINI" ``` If `PROACTIVE` is `"false"`, do not proactively suggest gstack skills AND do not @@ -210,6 +233,38 @@ Say "No problem. You can add routing rules later by running `gstack-config set r This only happens once per project. If `HAS_ROUTING` is `yes` or `ROUTING_DECLINED` is `true`, skip this entirely. +If `VENDORED_GSTACK` is `yes`: This project has a vendored copy of gstack at +`.factory/skills/gstack/`. Vendoring is deprecated. We will not keep vendored copies +up to date, so this project's gstack will fall behind. + +Use AskUserQuestion (one-time per project, check for `~/.gstack/.vendoring-warned-$SLUG` marker): + +> This project has gstack vendored in `.factory/skills/gstack/`. Vendoring is deprecated. +> We won't keep this copy up to date, so you'll fall behind on new features and fixes. +> +> Want to migrate to team mode? It takes about 30 seconds. + +Options: +- A) Yes, migrate to team mode now +- B) No, I'll handle it myself + +If A: +1. Run `git rm -r .factory/skills/gstack/` +2. 
Run `echo '.factory/skills/gstack/' >> .gitignore` +3. Run `$GSTACK_BIN/gstack-team-init required` (or `optional`) +4. Run `git add .claude/ .gitignore CLAUDE.md && git commit -m "chore: migrate gstack from vendored to team mode"` +5. Tell the user: "Done. Each developer now runs: `cd $GSTACK_ROOT && ./setup --team`" + +If B: say "OK, you're on your own to keep the vendored copy up to date." + +Always run (regardless of choice): +```bash +eval "$($GSTACK_BIN/gstack-slug 2>/dev/null)" 2>/dev/null || true +touch ~/.gstack/.vendoring-warned-${SLUG:-unknown} +``` + +This only happens once per project. If the marker file exists, skip entirely. + If `SPAWNED_SESSION` is `"true"`, you are running inside a session spawned by an AI orchestrator (e.g., OpenClaw). In spawned sessions: - Do NOT use AskUserQuestion for interactive prompts. Auto-choose the recommended option. @@ -576,6 +631,16 @@ You are running the `/ship` workflow. This is a **non-interactive, fully automat - Auto-fixable review findings (dead code, N+1, stale comments — fixed automatically) - Test coverage gaps within target threshold (auto-generate and commit, or flag in PR body) +**Re-run behavior (idempotency):** +Re-running `/ship` means "run the whole checklist again." Every verification step +(tests, coverage audit, plan completion, pre-landing review, adversarial review, +VERSION/CHANGELOG check, TODOS, document-release) runs on every invocation. +Only *actions* are idempotent: +- Step 4: If VERSION already bumped, skip the bump but still read the version +- Step 7: If already pushed, skip the push command +- Step 8: If PR exists, update the body instead of creating a new PR +Never skip a verification step because a prior `/ship` run already performed it. + --- ## Step 1: Pre-flight @@ -1654,7 +1719,244 @@ Present Codex output under a `CODEX (design):` header, merged with the checklist Include any design findings alongside the code review findings. They follow the same Fix-First flow below. -4. 
**Classify each finding as AUTO-FIX or ASK** per the Fix-First Heuristic in +## Step 3.55: Review Army — Specialist Dispatch + +### Detect stack and scope + +```bash +source <($GSTACK_BIN/gstack-diff-scope 2>/dev/null) || true +# Detect stack for specialist context +STACK="" +[ -f Gemfile ] && STACK="${STACK}ruby " +[ -f package.json ] && STACK="${STACK}node " +[ -f requirements.txt ] || [ -f pyproject.toml ] && STACK="${STACK}python " +[ -f go.mod ] && STACK="${STACK}go " +[ -f Cargo.toml ] && STACK="${STACK}rust " +echo "STACK: ${STACK:-unknown}" +DIFF_INS=$(git diff origin/ --stat | tail -1 | grep -oE '[0-9]+ insertion' | grep -oE '[0-9]+' || echo "0") +DIFF_DEL=$(git diff origin/ --stat | tail -1 | grep -oE '[0-9]+ deletion' | grep -oE '[0-9]+' || echo "0") +DIFF_LINES=$((DIFF_INS + DIFF_DEL)) +echo "DIFF_LINES: $DIFF_LINES" +# Detect test framework for specialist test stub generation +TEST_FW="" +{ [ -f jest.config.ts ] || [ -f jest.config.js ]; } && TEST_FW="jest" +[ -f vitest.config.ts ] && TEST_FW="vitest" +{ [ -f spec/spec_helper.rb ] || [ -f .rspec ]; } && TEST_FW="rspec" +{ [ -f pytest.ini ] || [ -f conftest.py ]; } && TEST_FW="pytest" +[ -f go.mod ] && TEST_FW="go-test" +echo "TEST_FW: ${TEST_FW:-unknown}" +``` + +### Read specialist hit rates (adaptive gating) + +```bash +$GSTACK_BIN/gstack-specialist-stats 2>/dev/null || true +``` + +### Select specialists + +Based on the scope signals above, select which specialists to dispatch. + +**Always-on (dispatch on every review with 50+ changed lines):** +1. **Testing** — read `$GSTACK_ROOT/review/specialists/testing.md` +2. **Maintainability** — read `$GSTACK_ROOT/review/specialists/maintainability.md` + +**If DIFF_LINES < 50:** Skip all specialists. Print: "Small diff ($DIFF_LINES lines) — specialists skipped." Continue to the Fix-First flow (item 4). + +**Conditional (dispatch if the matching scope signal is true):** +3. **Security** — if SCOPE_AUTH=true, OR if SCOPE_BACKEND=true AND DIFF_LINES > 100. 
Read `$GSTACK_ROOT/review/specialists/security.md` +4. **Performance** — if SCOPE_BACKEND=true OR SCOPE_FRONTEND=true. Read `$GSTACK_ROOT/review/specialists/performance.md` +5. **Data Migration** — if SCOPE_MIGRATIONS=true. Read `$GSTACK_ROOT/review/specialists/data-migration.md` +6. **API Contract** — if SCOPE_API=true. Read `$GSTACK_ROOT/review/specialists/api-contract.md` +7. **Design** — if SCOPE_FRONTEND=true. Use the existing design review checklist at `$GSTACK_ROOT/review/design-checklist.md` + +### Adaptive gating + +After scope-based selection, apply adaptive gating based on specialist hit rates: + +For each conditional specialist that passed scope gating, check the `gstack-specialist-stats` output above: +- If tagged `[GATE_CANDIDATE]` (0 findings in 10+ dispatches): skip it. Print: "[specialist] auto-gated (0 findings in N reviews)." +- If tagged `[NEVER_GATE]`: always dispatch regardless of hit rate. Security and data-migration are insurance policy specialists — they should run even when silent. + +**Force flags:** If the user's prompt includes `--security`, `--performance`, `--testing`, `--maintainability`, `--data-migration`, `--api-contract`, `--design`, or `--all-specialists`, force-include that specialist regardless of gating. + +Note which specialists were selected, gated, and skipped. Print the selection: +"Dispatching N specialists: [names]. Skipped: [names] (scope not detected). Gated: [names] (0 findings in N+ reviews)." + +--- + +### Dispatch specialists in parallel + +For each selected specialist, launch an independent subagent via the Agent tool. +**Launch ALL selected specialists in a single message** (multiple Agent tool calls) +so they run in parallel. Each subagent has fresh context — no prior review bias. + +**Each specialist subagent prompt:** + +Construct the prompt for each specialist. The prompt includes: + +1. The specialist's checklist content (you already read the file above) +2. Stack context: "This is a {STACK} project." +3. 
Past learnings for this domain (if any exist): + +```bash +$GSTACK_BIN/gstack-learnings-search --type pitfall --query "{specialist domain}" --limit 5 2>/dev/null || true +``` + +If learnings are found, include them: "Past learnings for this domain: {learnings}" + +4. Instructions: + +"You are a specialist code reviewer. Read the checklist below, then run +`git diff origin/` to get the full diff. Apply the checklist against the diff. + +For each finding, output a JSON object on its own line: +{\"severity\":\"CRITICAL|INFORMATIONAL\",\"confidence\":N,\"path\":\"file\",\"line\":N,\"category\":\"category\",\"summary\":\"description\",\"fix\":\"recommended fix\",\"fingerprint\":\"path:line:category\",\"specialist\":\"name\"} + +Required fields: severity, confidence, path, category, summary, specialist. +Optional: line, fix, fingerprint, evidence, test_stub. + +If you can write a test that would catch this issue, include it in the `test_stub` field. +Use the detected test framework ({TEST_FW}). Write a minimal skeleton — describe/it/test +blocks with clear intent. Skip test_stub for architectural or design-only findings. + +If no findings: output `NO FINDINGS` and nothing else. +Do not output anything else — no preamble, no summary, no commentary. + +Stack context: {STACK} +Past learnings: {learnings or 'none'} + +CHECKLIST: +{checklist content}" + +**Subagent configuration:** +- Use `subagent_type: "general-purpose"` +- Do NOT use `run_in_background` — all specialists must complete before merge +- If any specialist subagent fails or times out, log the failure and continue with results from successful specialists. Specialists are additive — partial results are better than no results. + +--- + +### Step 3.56: Collect and merge findings + +After all specialist subagents complete, collect their outputs. + +**Parse findings:** +For each specialist's output: +1. If output is "NO FINDINGS" — skip, this specialist found nothing +2. Otherwise, parse each line as a JSON object. 
Skip lines that are not valid JSON. +3. Collect all parsed findings into a single list, tagged with their specialist name. + +**Fingerprint and deduplicate:** +For each finding, compute its fingerprint: +- If `fingerprint` field is present, use it +- Otherwise: `{path}:{line}:{category}` (if line is present) or `{path}:{category}` + +Group findings by fingerprint. For findings sharing the same fingerprint: +- Keep the finding with the highest confidence score +- Tag it: "MULTI-SPECIALIST CONFIRMED ({specialist1} + {specialist2})" +- Boost confidence by +1 (cap at 10) +- Note the confirming specialists in the output + +**Apply confidence gates:** +- Confidence 7+: show normally in the findings output +- Confidence 5-6: show with caveat "Medium confidence — verify this is actually an issue" +- Confidence 3-4: move to appendix (suppress from main findings) +- Confidence 1-2: suppress entirely + +**Compute PR Quality Score:** +After merging, compute the quality score: +`quality_score = max(0, 10 - (critical_count * 2 + informational_count * 0.5))` +Cap at 10. Log this in the review result at the end. + +**Output merged findings:** +Present the merged findings in the same format as the current review: + +``` +SPECIALIST REVIEW: N findings (X critical, Y informational) from Z specialists + +[For each finding, in order: CRITICAL first, then INFORMATIONAL, sorted by confidence descending] +[SEVERITY] (confidence: N/10, specialist: name) path:line — summary + Fix: recommended fix + [If MULTI-SPECIALIST CONFIRMED: show confirmation note] + +PR Quality Score: X/10 +``` + +These findings flow into the Fix-First flow (item 4) alongside the checklist pass (Step 3.5). +The Fix-First heuristic applies identically — specialist findings follow the same AUTO-FIX vs ASK classification. + +**Compile per-specialist stats:** +After merging findings, compile a `specialists` object for the review-log persist. 
+For each specialist (testing, maintainability, security, performance, data-migration, api-contract, design, red-team): +- If dispatched: `{"dispatched": true, "findings": N, "critical": N, "informational": N}` +- If skipped by scope: `{"dispatched": false, "reason": "scope"}` +- If skipped by gating: `{"dispatched": false, "reason": "gated"}` +- If not applicable (e.g., red-team not activated): omit from the object + +Include the Design specialist even though it uses `design-checklist.md` instead of the specialist schema files. +Remember these stats — you will need them for the review-log entry in Step 5.8. + +--- + +### Red Team dispatch (conditional) + +**Activation:** Only if DIFF_LINES > 200 OR any specialist produced a CRITICAL finding. + +If activated, dispatch one more subagent via the Agent tool (foreground, not background). + +The Red Team subagent receives: +1. The red-team checklist from `$GSTACK_ROOT/review/specialists/red-team.md` +2. The merged specialist findings from Step 3.56 (so it knows what was already caught) +3. The git diff command + +Prompt: "You are a red team reviewer. The code has already been reviewed by N specialists +who found the following issues: {merged findings summary}. Your job is to find what they +MISSED. Read the checklist, run `git diff origin/`, and look for gaps. +Output findings as JSON objects (same schema as the specialists). Focus on cross-cutting +concerns, integration boundary issues, and failure modes that specialist checklists +don't cover." + +If the Red Team finds additional issues, merge them into the findings list before +the Fix-First flow (item 4). Red Team findings are tagged with `"specialist":"red-team"`. + +If the Red Team returns NO FINDINGS, note: "Red Team review: no additional issues found." +If the Red Team subagent fails or times out, skip silently and continue. 
+ +### Step 3.57: Cross-review finding dedup + +Before classifying findings, check if any were previously skipped by the user in a prior review on this branch. + +```bash +$GSTACK_ROOT/bin/gstack-review-read +``` + +Parse the output: only lines BEFORE `---CONFIG---` are JSONL entries (the output also contains `---CONFIG---` and `---HEAD---` footer sections that are not JSONL — ignore those). + +For each JSONL entry that has a `findings` array: +1. Collect all fingerprints where `action: "skipped"` +2. Note the `commit` field from that entry + +If skipped fingerprints exist, get the list of files changed since that review: + +```bash +git diff --name-only HEAD +``` + +For each current finding (from both the checklist pass (Step 3.5) and specialist review (Step 3.55-3.56)), check: +- Does its fingerprint match a previously skipped finding? +- Is the finding's file path NOT in the changed-files set? + +If both conditions are true: suppress the finding. It was intentionally skipped and the relevant code hasn't changed. + +Print: "Suppressed N findings from prior reviews (previously skipped by user)" + +**Only suppress `skipped` findings — never `fixed` or `auto-fixed`** (those might regress and should be re-checked). + +If no prior reviews exist or none have a `findings` array, skip this step silently. + +Output a summary header: `Pre-Landing Review: N issues (X critical, Y informational)` + +4. **Classify each finding from both the checklist pass and specialist review (Step 3.55-3.56) as AUTO-FIX or ASK** per the Fix-First Heuristic in checklist.md. Critical findings lean toward ASK; informational lean toward AUTO-FIX. 5. **Auto-fix all AUTO-FIX items.** Apply each fix. Output one line per fix: @@ -1676,10 +1978,13 @@ Present Codex output under a `CODEX (design):` header, merged with the checklist 9. 
Persist the review result to the review log: ```bash -$GSTACK_ROOT/bin/gstack-review-log '{"skill":"review","timestamp":"TIMESTAMP","status":"STATUS","issues_found":N,"critical":N,"informational":N,"commit":"'"$(git rev-parse --short HEAD)"'","via":"ship"}' +$GSTACK_ROOT/bin/gstack-review-log '{"skill":"review","timestamp":"TIMESTAMP","status":"STATUS","issues_found":N,"critical":N,"informational":N,"quality_score":SCORE,"specialists":SPECIALISTS_JSON,"findings":FINDINGS_JSON,"commit":"'"$(git rev-parse --short HEAD)"'","via":"ship"}' ``` Substitute TIMESTAMP (ISO 8601), STATUS ("clean" if no issues, "issues_found" otherwise), and N values from the summary counts above. The `via:"ship"` distinguishes from standalone `/review` runs. +- `quality_score` = the PR Quality Score computed in Step 3.56 (e.g., 7.5). If specialists were skipped (small diff), use `10.0` +- `specialists` = the per-specialist stats object compiled in Step 3.56. Each specialist that was considered gets an entry: `{"dispatched":true/false,"findings":N,"critical":N,"informational":N}` if dispatched, or `{"dispatched":false,"reason":"scope|gated"}` if skipped. Example: `{"testing":{"dispatched":true,"findings":2,"critical":0,"informational":2},"security":{"dispatched":false,"reason":"scope"}}` +- `findings` = array of per-finding records. For each finding (from checklist pass and specialists), include: `{"fingerprint":"path:line:category","severity":"CRITICAL|INFORMATIONAL","action":"ACTION"}`. ACTION is `"auto-fixed"`, `"fixed"` (user approved), or `"skipped"` (user chose Skip). Save the review output — it goes into the PR body in Step 8. @@ -1849,6 +2154,31 @@ High-confidence findings (agreed on by multiple sources) should be prioritized f --- +## Multi-LLM Orchestration (llm-cli-gateway) + +If preamble shows `LLM_GATEWAY: unavailable`: skip this section entirely. + +If preamble shows `LLM_GATEWAY: available`: + +1. **CLI availability:** Only use tools for CLIs shown as `yes` in the preamble. 
+ Skip tool recommendations for unavailable CLIs. + +2. **Async for parallel work:** Use `_async` variants + `llm_job_status`/`llm_job_result` + when running multiple LLM requests in parallel. Use sync variants for single sequential calls. + +3. **Session continuity:** Use `session_create` to establish a session for multi-turn + workflows. Pass `sessionId` to subsequent requests in the same skill invocation. + +**During multi-LLM pre-ship verification**, use these gateway MCP tools: + +- `mcp__llm-cli-gw__gemini_request_async` — dispatch Gemini pre-ship review in parallel (requires `LLM_GW_GEMINI: yes`) +- `mcp__llm-cli-gw__codex_request_async` — dispatch Codex pre-ship review in parallel (requires `LLM_GW_CODEX: yes`) +- `mcp__llm-cli-gw__llm_job_status` — poll async job completion +- `mcp__llm-cli-gw__llm_job_result` — collect pre-ship review results + +Collect results from all models before synthesizing. Always show which models contributed +and flag where models agree vs. diverge. + ## Capture Learnings If you discovered a non-obvious pattern, pitfall, or architectural insight during @@ -1885,7 +2215,7 @@ echo "BASE: $BASE_VERSION HEAD: $CURRENT_VERSION" if [ "$CURRENT_VERSION" != "$BASE_VERSION" ]; then echo "ALREADY_BUMPED"; fi ``` -If output shows `ALREADY_BUMPED`, VERSION was already bumped on this branch (prior `/ship` run). Skip the rest of Step 4 and use the current VERSION. Otherwise proceed with the bump. +If output shows `ALREADY_BUMPED`, VERSION was already bumped on this branch (prior `/ship` run). Skip the bump action (do not modify VERSION), but read the current VERSION value — it is needed for CHANGELOG and PR body. Continue to the next step. Otherwise proceed with the bump. 1. Read the current `VERSION` file (4-digit format: `MAJOR.MINOR.PATCH.MICRO`) @@ -2076,7 +2406,7 @@ echo "LOCAL: $LOCAL REMOTE: $REMOTE" [ "$LOCAL" = "$REMOTE" ] && echo "ALREADY_PUSHED" || echo "PUSH_NEEDED" ``` -If `ALREADY_PUSHED`, skip the push. 
Otherwise push with upstream tracking: +If `ALREADY_PUSHED`, skip the push but continue to Step 8. Otherwise push with upstream tracking: ```bash git push -u origin @@ -2098,7 +2428,7 @@ gh pr view --json url,number,state -q 'if .state == "OPEN" then "PR #\(.number): glab mr view -F json 2>/dev/null | jq -r 'if .state == "opened" then "MR_EXISTS" else "NO_MR" end' 2>/dev/null || echo "NO_MR" ``` -If an **open** PR/MR already exists: **update** the PR body with the latest test results, coverage, and review findings using `gh pr edit --body "..."` (GitHub) or `glab mr update -d "..."` (GitLab). Print the existing URL and continue to Step 8.5. +If an **open** PR/MR already exists: **update** the PR body using `gh pr edit --body "..."` (GitHub) or `glab mr update -d "..."` (GitLab). Always regenerate the PR body from scratch using this run's fresh results (test output, coverage audit, review findings, adversarial review, TODOS summary). Never reuse stale PR body content from a prior run. Print the existing URL and continue to Step 8.5. If no PR/MR exists: create a pull request (GitHub) or merge request (GitLab) using the platform detected in Step 0. @@ -2203,6 +2533,8 @@ execute its full workflow: This step is automatic. Do not ask the user for confirmation. The goal is zero-friction doc updates — the user runs `/ship` and documentation stays current without a separate command. +If Step 8.5 created a docs commit, re-edit the PR/MR body to include the latest commit SHA in the summary. This ensures the PR body reflects the truly final state after document-release. 
+ --- ## Step 8.75: Persist ship metrics diff --git a/test/llm-gateway-resolver.test.ts b/test/llm-gateway-resolver.test.ts new file mode 100644 index 000000000..dfa68d92a --- /dev/null +++ b/test/llm-gateway-resolver.test.ts @@ -0,0 +1,210 @@ +import { describe, test, expect } from 'bun:test'; +import { generateLlmGatewayContext } from '../scripts/resolvers/llm-gateway'; +import type { TemplateContext, HostPaths } from '../scripts/resolvers/types'; +import * as fs from 'fs'; +import * as path from 'path'; + +const ROOT = path.resolve(import.meta.dir, '..'); + +// Known llm-cli-gateway MCP tool names (from src/index.ts tool registrations) +const KNOWN_GATEWAY_TOOLS = [ + 'claude_request', 'claude_request_async', + 'codex_request', 'codex_request_async', + 'gemini_request', 'gemini_request_async', + 'llm_job_status', 'llm_job_result', 'llm_job_cancel', + 'session_create', 'session_list', 'session_get', 'session_set_active', 'session_delete', 'session_clear_all', + 'list_models', 'approval_list', 'llm_process_health', +]; + +const VALID_CLI_VALUES = ['claude', 'codex', 'gemini']; + +const claudePaths: HostPaths = { + skillRoot: '~/.claude/skills/gstack', + localSkillRoot: '.claude/skills/gstack', + binDir: '~/.claude/skills/gstack/bin', + browseDir: '~/.claude/skills/gstack/browse/dist', + designDir: '~/.claude/skills/gstack/design/dist', +}; + +function makeCtx(skillName: string, host: string = 'claude'): TemplateContext { + return { + skillName, + tmplPath: path.join(ROOT, skillName, 'SKILL.md.tmpl'), + host: host as any, + paths: claudePaths, + preambleTier: 4, + }; +} + +// Load tools.json for schema validation +const toolsJsonPath = path.join(ROOT, 'contrib/add-tool/llm-gateway/tools.json'); +const toolsConfig = JSON.parse(fs.readFileSync(toolsJsonPath, 'utf-8')); + +describe('tools.json schema validation', () => { + test('has valid top-level structure', () => { + expect(toolsConfig.tool).toBe('llm-cli-gateway'); + 
expect(toolsConfig.mcp_server_name).toBe('llm-cli-gw'); + expect(toolsConfig.detection).toBeDefined(); + expect(toolsConfig.detection.binary).toBe('llm-cli-gateway'); + expect(toolsConfig.detection.min_version).toBe('1.1.0'); + expect(toolsConfig.integrations).toBeDefined(); + }); + + test('mcp_server_name is not "llm" (collides with simonw llm tool)', () => { + expect(toolsConfig.mcp_server_name).not.toBe('llm'); + }); + + const integrationNames = Object.keys(toolsConfig.integrations); + + test('has 6 skill integrations', () => { + expect(integrationNames).toEqual([ + 'review', 'investigate', 'plan-eng-review', 'plan-ceo-review', 'ship', 'retro', + ]); + }); + + for (const [skillName, integration] of Object.entries(toolsConfig.integrations) as [string, any][]) { + describe(`integration: ${skillName}`, () => { + test('has required fields', () => { + expect(integration.phase).toBeTruthy(); + expect(integration.context).toBeTruthy(); + expect(Array.isArray(integration.tools)).toBe(true); + expect(integration.tools.length).toBeGreaterThan(0); + }); + + for (const tool of integration.tools) { + test(`tool "${tool.tool}" is a known llm-cli-gateway MCP tool`, () => { + expect(KNOWN_GATEWAY_TOOLS).toContain(tool.tool); + }); + + test(`tool "${tool.tool}" has a when description`, () => { + expect(tool.when).toBeTruthy(); + expect(tool.when.length).toBeGreaterThan(10); + }); + + if (tool.requires_cli) { + test(`tool "${tool.tool}" requires_cli is a valid CLI name`, () => { + expect(VALID_CLI_VALUES).toContain(tool.requires_cli); + }); + } + } + }); + } +}); + +describe('LLM_GATEWAY_CONTEXT resolver', () => { + const integratedSkills = Object.keys(toolsConfig.integrations); + + for (const skillName of integratedSkills) { + test(`${skillName}: returns non-empty output`, () => { + const result = generateLlmGatewayContext(makeCtx(skillName)); + expect(result.length).toBeGreaterThan(0); + }); + + test(`${skillName}: contains mcp__llm-cli-gw__ prefix`, () => { + const result = 
generateLlmGatewayContext(makeCtx(skillName)); + expect(result).toContain('mcp__llm-cli-gw__'); + }); + + test(`${skillName}: contains CLI availability gating`, () => { + const result = generateLlmGatewayContext(makeCtx(skillName)); + expect(result).toContain('LLM_GATEWAY: unavailable'); + expect(result).toContain('LLM_GATEWAY: available'); + }); + + test(`${skillName}: contains cross-model synthesis instruction`, () => { + const result = generateLlmGatewayContext(makeCtx(skillName)); + expect(result).toContain('which models contributed'); + expect(result).toContain('agree vs. diverge'); + }); + + test(`${skillName}: uses context from tools.json`, () => { + const result = generateLlmGatewayContext(makeCtx(skillName)); + const expectedContext = toolsConfig.integrations[skillName].context; + expect(result).toContain(expectedContext); + }); + } + + test('returns empty string for unknown skills', () => { + expect(generateLlmGatewayContext(makeCtx('browse'))).toBe(''); + expect(generateLlmGatewayContext(makeCtx('qa'))).toBe(''); + expect(generateLlmGatewayContext(makeCtx('design-review'))).toBe(''); + expect(generateLlmGatewayContext(makeCtx('nonexistent-skill'))).toBe(''); + }); + + test('codex host suppresses codex-specific tools', () => { + const result = generateLlmGatewayContext(makeCtx('review', 'codex')); + expect(result).not.toContain('codex_request_async'); + expect(result).toContain('gemini_request_async'); + }); + + test('codex host still produces output when gemini tools exist', () => { + const result = generateLlmGatewayContext(makeCtx('review', 'codex')); + expect(result.length).toBeGreaterThan(0); + expect(result).toContain('Multi-LLM Orchestration'); + }); + + test('host filtering is generalized — any host suppresses its own CLI tools', () => { + // If gemini were a host, it should suppress gemini tools + const geminiResult = generateLlmGatewayContext(makeCtx('review', 'gemini')); + expect(geminiResult).not.toContain('gemini_request_async'); + 
expect(geminiResult).toContain('codex_request_async'); + }); + + test('async tools in tool list only for review and ship', () => { + for (const skillName of integratedSkills) { + const tools = toolsConfig.integrations[skillName].tools as any[]; + const hasAsyncTools = tools.some((t: any) => t.tool.endsWith('_async')); + if (skillName === 'review' || skillName === 'ship') { + expect(hasAsyncTools).toBe(true); + } else { + expect(hasAsyncTools).toBe(false); + } + } + }); +}); + +describe('generated SKILL.md files contain gateway content', () => { + const integratedSkills = ['review', 'investigate', 'plan-eng-review', 'plan-ceo-review', 'ship', 'retro']; + + for (const skill of integratedSkills) { + test(`${skill}/SKILL.md contains Multi-LLM Orchestration section`, () => { + const content = fs.readFileSync(path.join(ROOT, skill, 'SKILL.md'), 'utf-8'); + expect(content).toContain('## Multi-LLM Orchestration (llm-cli-gateway)'); + expect(content).toContain('mcp__llm-cli-gw__'); + }); + + test(`${skill}/SKILL.md has no unresolved {{LLM_GATEWAY_CONTEXT}} placeholder`, () => { + const content = fs.readFileSync(path.join(ROOT, skill, 'SKILL.md'), 'utf-8'); + expect(content).not.toContain('{{LLM_GATEWAY_CONTEXT}}'); + }); + } + + test('non-integrated skills have no gateway content', () => { + const nonIntegrated = ['browse', 'qa', 'design-review', 'office-hours', 'codex']; + for (const skill of nonIntegrated) { + const skillPath = path.join(ROOT, skill, 'SKILL.md'); + if (fs.existsSync(skillPath)) { + const content = fs.readFileSync(skillPath, 'utf-8'); + expect(content).not.toContain('Multi-LLM Orchestration'); + expect(content).not.toContain('mcp__llm-cli-gw__'); + } + } + }); +}); + +describe('preamble detection block', () => { + test('preamble.ts contains llm-cli-gateway detection', () => { + const preamble = fs.readFileSync(path.join(ROOT, 'scripts/resolvers/preamble.ts'), 'utf-8'); + expect(preamble).toContain('llm-cli-gateway'); + 
expect(preamble).toContain('LLM_GATEWAY'); + expect(preamble).toContain('LLM_GW_CLAUDE'); + expect(preamble).toContain('LLM_GW_CODEX'); + expect(preamble).toContain('LLM_GW_GEMINI'); + }); + + test('generated SKILL.md preamble contains gateway detection output', () => { + const content = fs.readFileSync(path.join(ROOT, 'review/SKILL.md'), 'utf-8'); + expect(content).toContain('LLM_GATEWAY:'); + expect(content).toContain('LLM_GW_CLAUDE:'); + }); +});