diff --git a/AGENTS.md b/AGENTS.md index 79553db..02fb3d5 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -202,5 +202,5 @@ go run ./cmd/portolan scan --help For additional context about technologies to be used, project structure, shell commands, and other important information, read the current plan: -`docs/specs/083-tool-acquisition-guidance/plan.md` +`docs/specs/084-external-tool-evaluation-profiles/plan.md` diff --git a/docs/adapter-contracts/external-tool-evaluation-profiles.md b/docs/adapter-contracts/external-tool-evaluation-profiles.md new file mode 100644 index 0000000..0b56f0a --- /dev/null +++ b/docs/adapter-contracts/external-tool-evaluation-profiles.md @@ -0,0 +1,140 @@ +# External Tool Evaluation Profiles + +Last refreshed: 2026-06-05 + +These profiles record adoption decisions for external code-understanding tools. +They are planning and guidance artifacts, not Portolan evidence graph facts. + +## Decision Summary + +| Project | Role | Fit | Recommended Portolan Action | +| --- | --- | --- | --- | +| `defendend/Claude-ast-index-search` | `producer_candidate` | Strongest current symbol/reference producer candidate. | Plan future import of explicitly supplied local ast-index output in spec 085; do not install or run it by default. | +| `colbymchenry/codegraph` | `producer_candidate` | Useful but lower-fit optional candidate because default workflows can write target-local state and include install/watch/MCP behavior. | Keep as optional evaluation profile; require approval and output-boundary review before any use. | +| `Lum1104/Understand-Anything` | `ux_pattern_source` | Useful navigation and teaching patterns, but LLM-authored graph output is not verified Portolan evidence. | Borrow UX patterns only through spec 086; reject graph claims as evidence unless a future deterministic local output path is proven. | + +Profile role labels are product-facing guidance only. Machine-readable producer +evidence continues to use the existing producer-family `Decision` and +`SupportState` records until a future spec changes that contract. + +## Role Mapping Boundary + +| Profile Role | Producer-Family Decision | Producer-Family Support State | Required Follow-Up | +| --- | --- | --- | --- | +| `producer_candidate` | `not_assessed` until local evaluation exists | `candidate_only` | Keep as guidance; open a future importer/evaluation spec before support is claimed. | +| `ux_pattern_source` | `not_assessed` for evidence production | `candidate_only` or `rejected` for evidence use | Route useful interaction ideas to a UX/navigation spec; do not emit producer evidence. | +| `ready_for_import_planning` | `narrowed` only after a local output contract is reviewed | `narrowed` | Open or update an importer spec and validate explicit local output. | +| `blocked` | `blocked` | `blocked` | Record blocker and required approval or redesign before any execution/import work. | +| `rejected` | `rejected` | `rejected` | Record why it cannot satisfy Portolan evidence boundaries; do not recommend as producer evidence. | + +## Common Rules + +- Profiles do not create graph facts. +- Profiles do not promote symbol/reference/call/runtime evidence to + `source-visible`, `metadata-visible`, or `runtime-visible`. +- Candidate tools remain `not_assessed` until explicit local output exists and + a Portolan importer or producer-evaluation record covers it. +- Network download, tool install, target mutation, hook install, MCP install, + watcher startup, daemon startup, and global configuration writes require + explicit human or spec-level approval. +- LLM-authored summaries, dashboards, and architecture graphs are claims until a + future spec proves deterministic, locally supplied, evidence-bounded output. +- Refresh public metadata before depending on a profile if more than one day has + passed since `Last refreshed`. + +## Profile: ast-index + +- Project identity: `defendend/Claude-ast-index-search` +- URL: https://github.com/defendend/Claude-ast-index-search +- License: MIT observed by GitHub API on 2026-06-05. +- Maintenance snapshot: default branch `main`, pushed at + 2026-06-03T14:46:59Z, 415 stars, 32 forks. +- Role: `producer_candidate`. +- Relevant output surfaces: local index/search CLI, SQLite/cache-style index, + JSON CLI surfaces, name/string-based reference and caller-style outputs. +- Local execution posture: may create local index/cache output; optional + watcher, hook, or MCP installation commands require approval. +- Target mutation risk: unknown until exact command and output path are + reviewed; treat any target-local cache/write as approval-gated. +- Network/install risk: package acquisition or repository clone requires + approval. +- Daemon/watch risk: watcher or MCP behavior is not approved by this slice. +- Privacy risk: index/cache/search output may include source paths, symbols, + snippets, or customer-sensitive identifiers. +- Integration cost: medium; future importer spec must validate exact output + format and map name/string references without treating them as complete call + graphs. +- Approval boundary: do not install, run, watch, hook, or start MCP behavior + without explicit approval and a selected output directory. +- Recommended Portolan action: strongest current candidate for future + symbol/reference import planning in `docs/specs/085-ast-index-producer-import/`. +- Evidence limitations: no ast-index output is supplied in this slice; symbol, + reference, caller, and call-graph evidence remain `not_assessed`. + +## Profile: CodeGraph + +- Project identity: `colbymchenry/codegraph` +- URL: https://github.com/colbymchenry/codegraph +- License: MIT observed by GitHub API on 2026-06-05. +- Maintenance snapshot: default branch `main`, pushed at + 2026-06-05T04:54:11Z, 41920 stars, 2584 forks. +- Role: `producer_candidate`. +- Relevant output surfaces: local code knowledge graph, agent-facing query + surfaces, installable CLI/runtime, possible MCP/watch workflows. +- Local execution posture: useful local graph workflow, but default usage can + write `.codegraph/` in the target and may involve install, watch, or MCP + behavior. +- Target mutation risk: target-local `.codegraph/` writes are incompatible with + Portolan defaults unless explicitly approved or redirected outside the target. +- Network/install risk: install scripts or package downloads require approval. +- Daemon/watch risk: watch/MCP behavior is not approved by this slice. +- Privacy risk: graph/index output may include source paths, symbols, labels, + prompts, or summaries that should not be committed as fixtures. +- Integration cost: medium to high; future work must prove deterministic output + shape and evidence-state mapping before import. +- Approval boundary: do not install, run, write `.codegraph/`, watch, or start + MCP behavior without explicit approval and an output isolation plan. +- Recommended Portolan action: keep as lower-fit optional producer candidate; + prefer ast-index for near-term symbol/reference import planning. +- Evidence limitations: no CodeGraph output is supplied in this slice; graph, + relationship, and architecture claims remain `not_assessed`. + +## Profile: Understand-Anything + +- Project identity: `Lum1104/Understand-Anything` +- URL: https://github.com/Lum1104/Understand-Anything +- License: MIT observed by GitHub API on 2026-06-05. +- Maintenance snapshot: default branch `main`, pushed at + 2026-06-04T05:46:14Z, 52578 stars, 4315 forks. +- Role: `ux_pattern_source`. +- Relevant output surfaces: interactive knowledge graph, search/tour patterns, + agent/plugin guidance, LLM-authored graph and architecture explanations. +- Local execution posture: plugin/agent workflow may install assets, write + generated graph outputs, and depend on LLM-authored processing. +- Target mutation risk: generated graph or plugin state must be reviewed before + any target-local writes are allowed. +- Network/install risk: marketplace or package installation requires approval. +- Daemon/watch risk: any background or plugin runtime is not approved by this + slice. +- Privacy risk: generated knowledge graphs and summaries may contain source + snippets, file paths, prompts, or architecture claims. +- Integration cost: high for evidence; lower for UX pattern inspiration. +- Approval boundary: do not install, run, or treat generated graph output as + evidence without a future deterministic-output spec. +- Recommended Portolan action: borrow navigation UX ideas only in + `docs/specs/086-evidence-navigation-ux-patterns/`. +- Evidence limitations: LLM-authored graph content is not Portolan evidence; + all symbol/reference/architecture claims remain `not_assessed`. + +## Refresh Procedure + +1. Re-check project identity, license, default branch, and latest pushed date. +2. Review install, output, watcher, hook, MCP, daemon, and target-write behavior. +3. Update only the affected profile section and `Last refreshed` date. +4. If a tool moves from guidance to import planning, create or update a future + SpecKit slice and map the decision to producer-family `Decision` and + `SupportState` records. +5. Use `schema/producer-family.schema.json` and the producer-family precedent in + `docs/specs/053-language-agnostic-producers/` before emitting or changing any + machine-readable producer-family record. +6. Keep unrelated profiles unchanged unless their metadata was refreshed too. diff --git a/docs/product-backlog.md b/docs/product-backlog.md index 91006bc..9b564e2 100644 --- a/docs/product-backlog.md +++ b/docs/product-backlog.md @@ -165,7 +165,7 @@ fixtures are preflight evidence only. | P6-081 | `docs/specs/081-maven-sharded-producer-plan/` | Context packs emit repository-sharded Maven/CycloneDX next actions for multi-repo JVM landscapes so agents do not treat one sample `pom.xml` as a landscape rollout plan. | Merged via PR #59; local baseline, fresh Bigtop context smoke, Cursor Composer 2.5 stress, three assessed non-GPT review lanes, GitHub checks, explicit user merge approval, squash merge `a89a965`, and remote branch cleanup verified. Maven execution, dependency evidence, JVM relationship claims, and GitHub review approval remain `not_assessed` | | P6-082 | `docs/specs/082-syft-sharded-sbom-plan/` | Context packs emit repository-sharded Syft/CycloneDX SBOM next actions for multi-repo landscapes so component/dependency evidence can be acquired incrementally without full-root SBOM scans. | Merged via PR #60; local baseline, fresh Bigtop context smoke, Cursor Composer 2.5 stress, three assessed non-GPT review lanes, GitHub checks, explicit user merge approval, squash merge `9390379`, and remote branch cleanup verified. Syft execution, component inventory, dependency evidence, and GitHub review approval remain `not_assessed` | | P6-083 | `docs/specs/083-tool-acquisition-guidance/` | Context packs make tool acquisition guidance explicit and stack-agnostic: agents can pull in the right local producer tools without treating Portolan as a PHP/JVM/Gradle adapter stack. | Merged via PR #61; local baseline, fresh Bigtop context smoke, Cursor Composer 2.5 stress, integrated PR #57-#61 stack-agnostic stress, three assessed non-GPT review lanes, GitHub checks, explicit user merge approval, squash merge `847e84e`, post-merge Bigtop context smoke, and remote branch cleanup verified. Native producer execution, tool install/acquisition, component inventory, dependency relationships, duplication metrics, runtime topology, and GitHub review approval remain `not_assessed` | -| P6-084 | `docs/specs/084-external-tool-evaluation-profiles/` | Portolan keeps dated evaluation profiles for CodeGraph, Understand-Anything, and ast-index so external tool adoption stays explicit, evidence-honest, and refreshable. | Draft spec; backlog-only. No plan/tasks yet. External tool metadata, profile implementation, and generated guidance remain `not_assessed` until planning and verification refresh the 2026-06-04 review snapshot | +| P6-084 | `docs/specs/084-external-tool-evaluation-profiles/` | Portolan keeps dated evaluation profiles for CodeGraph, Understand-Anything, and ast-index so external tool adoption stays explicit, evidence-honest, and refreshable. | Implemented locally; baseline, context smoke, and story review cycles verified. PR review, GitHub checks, real external tool execution/output, and merge approval remain `not_assessed` | | P6-085 | `docs/specs/085-ast-index-producer-import/` | Explicit local ast-index outputs can be imported as bounded symbol/reference/module producer evidence without Portolan installing, executing, watching, or mutating targets. | Draft spec; backlog-only. No plan/tasks yet. ast-index execution, real output acquisition, importer implementation, CodeGraph import, and call-graph parity remain `not_assessed` | | P6-086 | `docs/specs/086-evidence-navigation-ux-patterns/` | Useful navigation ideas from Understand-Anything, CodeGraph, and ast-index are adopted as evidence-backed guide patterns without accepting LLM-authored graphs or live dashboards as truth. | Draft spec; backlog-only. No plan/tasks yet. UX implementation, dashboard/MCP surfaces, LLM workflows, and agent acceptance impact remain `not_assessed` | diff --git a/docs/specs/084-external-tool-evaluation-profiles/contracts/external-tool-profile.md b/docs/specs/084-external-tool-evaluation-profiles/contracts/external-tool-profile.md new file mode 100644 index 0000000..e5cbe80 --- /dev/null +++ b/docs/specs/084-external-tool-evaluation-profiles/contracts/external-tool-profile.md @@ -0,0 +1,41 @@ +# Contract: External Tool Evaluation Profile + +External tool profiles are committed planning artifacts, not graph evidence. +Their product-facing role labels are companion guidance only; they do not +replace existing machine-readable producer-family `Decision` or `SupportState` +records. + +## Required Sections + +- Project identity. +- Last refreshed date. +- Role classification. +- Observed license and maintenance snapshot. +- Useful output surfaces. +- Local execution posture. +- Target mutation, network/install, daemon/watch, and privacy risks. +- Approval boundary. +- Recommended Portolan action. +- Evidence limitations. + +## Evidence Rules + +- A profile may recommend a local producer candidate. +- A profile must not create graph facts. +- A profile must not create a parallel machine-readable producer decision + scheme; future importer specs must map profile decisions to the existing + producer-family record contract. +- Role mapping must follow + `docs/adapter-contracts/external-tool-evaluation-profiles.md` and + `schema/producer-family.schema.json`. +- A profile must not promote symbol/reference/call/runtime evidence to + `source-visible`, `metadata-visible`, or `runtime-visible`. +- A profile must preserve stale upstream metadata as stale until refreshed. + +## Safety Rules + +- Network download, tool install, target mutation, hook install, MCP install, + watcher startup, daemon startup, and global configuration writes require + explicit human or spec-level approval. +- LLM-authored summaries and dashboards are not evidence unless a future spec + proves deterministic, locally supplied, evidence-bounded output. diff --git a/docs/specs/084-external-tool-evaluation-profiles/data-model.md b/docs/specs/084-external-tool-evaluation-profiles/data-model.md new file mode 100644 index 0000000..e90fe6e --- /dev/null +++ b/docs/specs/084-external-tool-evaluation-profiles/data-model.md @@ -0,0 +1,59 @@ +# Data Model: External Tool Evaluation Profiles + +## Tool Evaluation Profile + +Fields: + +- `project_identity`: canonical repository owner/name and URL. +- `last_refreshed`: date when public metadata was checked. +- `license`: observed SPDX license from the refresh snapshot. +- `role`: one of `producer_candidate`, `ux_pattern_source`, + `ready_for_import_planning`, `blocked`, or `rejected`. +- `fit`: short product-fit rationale. +- `output_surfaces`: deterministic local outputs, indexes, graph files, + dashboards, MCP surfaces, or LLM-authored summaries identified by review. +- `local_execution_posture`: whether use is read-only, writes caches, writes in + target, installs tools, starts watchers, or starts daemon/MCP behavior. +- `risks`: target mutation, network/install, daemon/watch, privacy, schema + stability, and evidence-state risks. +- `approval_boundary`: action that requires human or future-spec approval. +- `recommended_portolan_action`: current adoption decision and next step. +- `evidence_limitations`: what remains `not_assessed`, `unknown`, or + `cannot_verify`. + +Validation rules: + +- Every profile must include `last_refreshed`. +- The profile `role` is product-facing guidance only. Machine-readable producer + evidence must continue to use existing producer-family decision/support-state + records until a future spec changes that contract. +- Every execution, install, watcher, daemon, hook, MCP, or target-mutation path + must be approval-gated. +- Profiles must not state that candidate output is observed Portolan evidence + until a local output/import path exists. + +## Candidate Role + +Allowed states: + +- `producer_candidate`: may be a useful local producer after approval and output + review. +- `ux_pattern_source`: useful design/reference input, not a verified evidence + producer. +- `ready_for_import_planning`: deterministic output is promising enough for a + future importer spec. +- `blocked`: current behavior blocks safe adoption without redesign or + approval. +- `rejected`: incompatible with Portolan's evidence boundary. + +## Approval Boundary + +An approval boundary records the exact class of action that Portolan does not +take by default: + +- network download or package installation; +- target repository mutation or cache write; +- global agent/editor configuration write; +- hook installation; +- watcher, daemon, server, or MCP startup; +- LLM-authored graph or summary use as evidence. diff --git a/docs/specs/084-external-tool-evaluation-profiles/plan.md b/docs/specs/084-external-tool-evaluation-profiles/plan.md new file mode 100644 index 0000000..a03878d --- /dev/null +++ b/docs/specs/084-external-tool-evaluation-profiles/plan.md @@ -0,0 +1,146 @@ +# Implementation Plan: External Tool Evaluation Profiles + +**Branch**: `codex/084-external-tool-evaluation-profiles` + +**Date**: 2026-06-05 + +**Spec**: `docs/specs/084-external-tool-evaluation-profiles/spec.md` + +**Input**: Feature specification from +`docs/specs/084-external-tool-evaluation-profiles/spec.md` + +## Summary + +Create dated external-tool evaluation profiles for CodeGraph, +Understand-Anything, and ast-index so agents can distinguish producer +candidates, UX inspirations, adoption blockers, and evidence limitations without +treating candidate tools as observed Portolan evidence. The implementation is a +documentation and context-guidance slice: no external tool execution, no network +calls in product code, no target mutation, no daemon behavior, no schema +changes, and no new dependencies. + +## Decision Gate + +- **Simpler/Faster**: Keep the slice as profile documentation plus bounded + context guidance. Rejected broader implementation because importer work, + schema changes, or producer execution would solve a different problem. +- **Blocking Edge Cases**: Candidate tools can require package installation, + write caches inside target repositories, install hooks/MCP surfaces, start + watchers/daemons, contact networks, or emit LLM-authored summaries. Those + risks require approval boundaries and stale-profile labeling. +- **Existing Open Source**: The reviewed OSS projects are the subject of this + slice. Portolan will compose deterministic local outputs later if a future + importer spec proves fit; it will not reimplement their scanners here. + +## Technical Context + +**Language/Version**: Go for existing context-pack tests; markdown for profile +artifacts. + +**Primary Dependencies**: Standard library only. No new dependency. + +**Storage**: Committed documentation and generated context-pack artifacts. + +**Testing**: Focused `internal/contextprep` test, docs/profile checks, full +baseline. + +**Target Platform**: Local CLI repository workflow. + +**Project Type**: Go CLI plus documentation. + +**Performance Goals**: No new runtime path. Context-pack additions must remain +bounded text output. + +**Constraints**: Local-first/read-only defaults. No installs, network access in +product code, producer execution, daemon behavior, credentials, target +mutation, or graph evidence promotion. + +**Scale/Scope**: Three external projects and one context-guidance surface. + +## Constitution Check + +*GATE: Passed before Phase 0 research. Re-checked after Phase 1 design.* + +- I. Local-First And Read-Only By Default: Passed. Product code will not contact + networks, start daemons, install tools, mutate targets, or collect + credentials. +- II. Evidence State Honesty: Passed. Candidate profiles are planning guidance, + not graph facts or observed evidence. +- III. Complement, Do Not Replace: Passed. The slice evaluates OSS tool fit and + defers importer/execution work to future specs. +- IV. SpecKit Before Implementation: Passed after this plan, research, + data-model, contract, quickstart, and tasks are created. +- V. Test-First For Behavior: Passed. Any context-pack behavior change gets a + focused test before implementation. + +## Project Structure + +### Documentation (this feature) + +```text +docs/specs/084-external-tool-evaluation-profiles/ +├── plan.md # This file (/speckit-plan command output) +├── research.md # Phase 0 output (/speckit-plan command) +├── data-model.md # Phase 1 output (/speckit-plan command) +├── quickstart.md # Phase 1 output (/speckit-plan command) +├── contracts/ # Phase 1 output (/speckit-plan command) +└── tasks.md # Phase 2 output (/speckit-tasks command - NOT created by /speckit-plan) +``` + +### Source Code (repository root) +```text +docs/adapter-contracts/ +└── external-tool-evaluation-profiles.md + +internal/contextprep/ +├── contextprep.go +└── contextprep_test.go + +docs/specs/084-external-tool-evaluation-profiles/ +├── plan.md +├── research.md +├── data-model.md +├── quickstart.md +├── contracts/ +│ └── external-tool-profile.md +├── reviews/ +└── tasks.md +``` + +**Structure Decision**: Keep candidate profiles in `docs/adapter-contracts/` +beside existing adapter profiles and keep implementation evidence under the +spec-local `reviews/` directory. Use `internal/contextprep` only if generated +context needs a bounded pointer to the profile artifact. + +The profile `role` vocabulary is intentionally product-facing markdown +guidance. It is not a replacement for the existing producer-family +`Decision`/`SupportState` machine-readable records. Future importer or +navigation specs, including P6-085 and P6-086, must map any adopted profile +decision back to the existing producer-family contract before generated evidence +records depend on it. + +## Complexity Tracking + +> **Fill ONLY if Constitution Check has violations that must be justified** + +No constitution violations. + +## Verification + +```bash +go test -count=1 ./internal/contextprep +go test ./... +go vet ./... +jq empty schema/*.json +git diff --check +``` + +For generated context behavior, also run: + +```bash +go run ./cmd/portolan context prepare --root . --out /tmp/portolan-084-context --force +jq empty /tmp/portolan-084-context/oss-plan.json +``` + +Refresh public GitHub metadata before PR readiness if more than one day has +passed since the profile `last_refreshed` date. diff --git a/docs/specs/084-external-tool-evaluation-profiles/quickstart.md b/docs/specs/084-external-tool-evaluation-profiles/quickstart.md new file mode 100644 index 0000000..dec3cf4 --- /dev/null +++ b/docs/specs/084-external-tool-evaluation-profiles/quickstart.md @@ -0,0 +1,41 @@ +# Quickstart: External Tool Evaluation Profiles + +This feature is documentation and context guidance first. It does not execute +external tools. + +## Verify Profile Guidance + +1. Read `docs/adapter-contracts/external-tool-evaluation-profiles.md`. +2. Confirm the profile table includes `colbymchenry/codegraph`, + `Lum1104/Understand-Anything`, and + `defendend/Claude-ast-index-search`. +3. Confirm every profile has role, license, last refresh date, output surfaces, + risk boundaries, approval boundaries, and recommended Portolan action. +4. Confirm ast-index is the strongest current symbol/reference producer + candidate, CodeGraph is lower-fit optional, and Understand-Anything is UX + inspiration rather than verified evidence. +5. Refresh GitHub metadata before marking the PR ready if more than one day has + passed since the profile `last_refreshed` date. + +## Verify Context Pack Behavior + +```bash +go test -count=1 ./internal/contextprep +go run ./cmd/portolan context prepare --root . --out /tmp/portolan-084-context --force +jq empty /tmp/portolan-084-context/oss-plan.json +``` + +Expected result: + +- context guidance may mention external tool profiles as candidate guidance; +- graph facts and evidence states are not promoted; +- no external tool is installed, executed, watched, or started. + +## Baseline + +```bash +go test ./... +go vet ./... +jq empty schema/*.json +git diff --check +``` diff --git a/docs/specs/084-external-tool-evaluation-profiles/research.md b/docs/specs/084-external-tool-evaluation-profiles/research.md new file mode 100644 index 0000000..1cf68da --- /dev/null +++ b/docs/specs/084-external-tool-evaluation-profiles/research.md @@ -0,0 +1,86 @@ +# Research: External Tool Evaluation Profiles + +## Decision: Keep Profiles As Documentation And Context Guidance + +Portolan will keep external tool evaluation profiles as dated markdown records +and bounded context-pack guidance. Candidate profiles do not become graph facts, +do not change evidence states, and do not introduce importer contracts. +The human-readable profile `role` field is a companion decision label, not a +replacement for existing machine-readable `producer-evaluation` records. Future +importer or navigation specs must map candidate decisions back to the existing +producer-family `Decision` and `SupportState` fields before generated context +or evidence records depend on them. + +Rationale: The feature is about adoption decisions and risk boundaries, not +executing or normalizing new tools. A documentation-first implementation is the +smallest reversible step and matches the backlog-only spec. + +Alternatives considered: + +- Add a JSON schema and machine-readable profile registry. Rejected for this + slice because no importer or CLI consumer needs a stable profile format yet. +- Reuse only the existing producer-family `Decision` and `SupportState` enums + in markdown. Rejected for this slice because maintainers need product-facing + labels such as `ux_pattern_source`; accepted only with an explicit mapping + boundary so the labels do not become a parallel machine-readable scheme. +- Add native producer execution wrappers. Rejected because the constitution + defaults to import/export composition and requires explicit approval before + installs, network access, daemon behavior, or target mutation. +- Add graph nodes for candidate tools. Rejected because a candidate is not local + evidence until a deterministic output is supplied and imported. + +## Decision: Use Dated GitHub Metadata Snapshots, Not Live Claims + +Profiles will include a `Last refreshed` date and a short metadata snapshot for +license and repository health. The snapshot is evidence for planning only and +must be refreshed before implementation depends on current upstream behavior. + +Rationale: The reviewed projects are moving quickly. A profile that looks +current without a refresh date invites stale adoption claims. + +Snapshot checked on 2026-06-05 with GitHub API: + +| Project | License | Default Branch | Pushed At | Stars | Forks | +| --- | --- | --- | --- | ---: | ---: | +| `colbymchenry/codegraph` | MIT | `main` | 2026-06-05T04:54:11Z | 41920 | 2584 | +| `Lum1104/Understand-Anything` | MIT | `main` | 2026-06-04T05:46:14Z | 52578 | 4315 | +| `defendend/Claude-ast-index-search` | MIT | `main` | 2026-06-03T14:46:59Z | 415 | 32 | + +Alternatives considered: + +- Treat the 2026-06-04 external review snapshot as sufficient. Rejected because + current license and maintenance facts are cheap to refresh and drift-prone. +- Query GitHub during Portolan context generation. Rejected because Portolan + stays local-first and read-only by default; no network calls in product code. + +## Decision: Classify Ast-Index Strongest, CodeGraph Lower-Fit, Understand-Anything UX-Only + +`Claude-ast-index-search` will be classified as the strongest current +symbol/reference producer candidate because it exposes local CLI/index/search +surfaces. `codegraph` will be classified as a lower-fit optional producer +candidate because its default workflow includes target-local `.codegraph/` +writes and broader install/watch/MCP behavior. `Understand-Anything` will be +classified as UX pattern source only unless a future spec proves deterministic, +locally supplied, evidence-bounded output. + +Rationale: Portolan can reuse deterministic local outputs, but it cannot treat +LLM-authored graph summaries, dashboards, or agent instructions as evidence. + +Alternatives considered: + +- Treat all three tools as producer candidates. Rejected because their default + outputs and authorship models are materially different. +- Reject all three until importers exist. Rejected because maintainers still + need an honest candidate ledger and next-action guidance. + +## Decision: No New Dependency + +This slice uses existing Go tests and markdown artifacts only. + +Rationale: A dependency would add maintenance and license review without +improving the user-visible profile decision. + +Alternatives considered: + +- Use a metadata parser or scanner library. Rejected because no structured + parsing is needed beyond committed markdown and existing context output. diff --git a/docs/specs/084-external-tool-evaluation-profiles/reviews/planning-review-disposition-2026-06-05.md b/docs/specs/084-external-tool-evaluation-profiles/reviews/planning-review-disposition-2026-06-05.md new file mode 100644 index 0000000..b6ebe00 --- /dev/null +++ b/docs/specs/084-external-tool-evaluation-profiles/reviews/planning-review-disposition-2026-06-05.md @@ -0,0 +1,61 @@ +# Planning Review Disposition: Spec 084 + +Date: 2026-06-05 + +Scope: `spec.md`, `plan.md`, `tasks.md`, SpecKit readiness, product boundary, +constitution alignment, and evidence-state semantics before implementation. + +## Review Lanes + +| Lane | Raw Output | Status | Verdict | +| --- | --- | --- | --- | +| `kimi-for-coding/k2p6` via `opencode run` | `raw-planning-kimi-k2p6-2026-06-05.md` | assessed | PASS with minor hygiene findings | +| `zai-coding-plan/glm-5.1` via `opencode run` | `raw-planning-zai-glm51-2026-06-05.md` | assessed | PASS with minor findings | +| `opencode-go/minimax-m3` via `opencode run` | `raw-planning-minimax-m3-2026-06-05.md` | assessed | CHANGES_REQUESTED before closeout | + +Note: The MiniMax lane observed its own raw-output file while it was still being +written and incorrectly reported that lane as 0 bytes. The final raw file +contains an assessed review, so that self-observation is rejected as stale +harness timing evidence. + +## Accepted Findings And Fixes + +- Branch metadata drift: fixed by changing `spec.md` to + `codex/084-external-tool-evaluation-profiles`. +- Spec status/backlog drift: fixed by marking `spec.md` and + `docs/product-backlog.md` ready for implementation while leaving + implementation and PR verification `not_assessed`. +- Docs-heavy PR quality lenses: fixed by clarifying T022 so docs-only lenses are + `not_applicable` with diff evidence, while bounded Go changes still receive + applicable code-quality review. +- Parallel role vocabulary risk: fixed by documenting that profile `role` is a + product-facing companion label only and does not replace existing + producer-family `Decision`/`SupportState` records. +- Spec assumption vs contextprep change: fixed by stating that a bounded + `internal/contextprep` pointer is allowed and is not an importer, schema + change, or graph evidence fact. +- Cross-spec coordination: fixed by linking future backlog-only slices 085 and + 086 from the spec. +- Snapshot freshness: fixed by requiring metadata refresh before PR readiness if + more than one day has passed since `last_refreshed`. + +## Rejected Findings + +- MiniMax F1 said MiniMax itself did not count because its raw file was empty. + Rejected: the model read the file before shell redirection flushed the final + output. The final file is non-empty and contains a substantive review. +- GLM F8 said the GLM raw file was empty. Rejected for the same stale + self-observation pattern; the final file contains an assessed review. + +## Readiness Decision + +Plan/tasks readiness: verified. + +Implementation may proceed after T001-T005 are marked complete in `tasks.md`. + +## Not Assessed + +- Implementation behavior for `docs/adapter-contracts/` and + `internal/contextprep`. +- GitHub checks, PR state, and merge readiness. +- Real CodeGraph, Understand-Anything, or ast-index execution/output. diff --git a/docs/specs/084-external-tool-evaluation-profiles/reviews/pr62-readiness-closeout-2026-06-05.md b/docs/specs/084-external-tool-evaluation-profiles/reviews/pr62-readiness-closeout-2026-06-05.md new file mode 100644 index 0000000..5cc92b3 --- /dev/null +++ b/docs/specs/084-external-tool-evaluation-profiles/reviews/pr62-readiness-closeout-2026-06-05.md @@ -0,0 +1,55 @@ +# PR #62 Readiness Closeout + +Date: 2026-06-05 + +PR: https://github.com/fcon-tech/portolan/pull/62 + +## Implementation State + +- Spec 084 implementation is locally complete. +- `docs/product-backlog.md`, `spec.md`, and `tasks.md` are aligned to local + implementation status. +- `tasks.md` T001 through T020 are complete. T021 through T023 are completed in + this PR review closeout after commit amend and push. + +## Local Verification + +```bash +go test -count=1 ./internal/contextprep +go test ./... +go vet ./... +jq empty schema/*.json +git diff --check +go run ./cmd/portolan context prepare --root . --out /tmp/portolan-084-context --force +jq empty /tmp/portolan-084-context/oss-plan.json +``` + +Result: verified locally. + +## Review Evidence + +- Planning review: Kimi, GLM, MiniMax opencode lanes recorded. +- US1 review: Kimi opencode lane PASS. +- US2 review: GLM opencode lane with repo-grounded sub-review PASS after stale + profile fix. +- US3 review: MiniMax opencode CHANGES_REQUESTED, Kimi opencode rerun + `not_assessed` due certificate failure, MiMo opencode replacement PASS. +- PR review: Kimi opencode PASS after accepted CleanCode fix. + +## GitHub State + +- PR draft/ready state: draft at creation; ready-for-review after final check + refresh. +- GitHub checks: verified successful on head + `a8b5860048b501077e2436908f4d753facc47bda` before this closeout update: + Baseline SUCCESS; CodeQL Analyze actions/go/python SUCCESS; aggregate CodeQL + SUCCESS. +- GitHub review approval: `not_assessed`. + +## Merge Readiness + +- ready-for-review PR: ready after final amended push/check refresh. +- ready-to-merge PR: no. +- merge_approval: `not_assessed`. + +Stop reason: PR review complete locally; merge requires explicit user command. diff --git a/docs/specs/084-external-tool-evaluation-profiles/reviews/pr62-review-disposition-2026-06-05.md b/docs/specs/084-external-tool-evaluation-profiles/reviews/pr62-review-disposition-2026-06-05.md new file mode 100644 index 0000000..32440d0 --- /dev/null +++ b/docs/specs/084-external-tool-evaluation-profiles/reviews/pr62-review-disposition-2026-06-05.md @@ -0,0 +1,51 @@ +# PR #62 Review Disposition + +Date: 2026-06-05 + +PR: https://github.com/fcon-tech/portolan/pull/62 + +## Review Lanes + +| Lane | Raw Output | Status | Verdict | +| --- | --- | --- | --- | +| `kimi-for-coding/k2p6` via `opencode run` | `raw-pr62-review-kimi-k2p6-2026-06-05.md` | assessed | PASS with one info finding | + +## Lens Results + +| Lens | Result | Evidence | +| --- | --- | --- | +| spec drift | verified: pass | `spec.md`, `plan.md`, `tasks.md`, backlog, and branch metadata agree. | +| constitution drift | verified: pass | No network, daemon, install, target mutation, schema change, or evidence promotion. | +| product drift | verified: pass | Scope remains profile guidance plus bounded context text. | +| CRAP < 5 | verified: pass for code, `not_applicable` for docs | Bounded `contextprep.go` string append has no branching; docs files do not have code metrics. | +| MI > 70 | verified: pass for code, `not_applicable` for docs | Bounded renderer/test change only; docs files do not have code metrics. | +| CleanArch hex | verified: pass for code, `not_applicable` for docs | Change stays in context rendering surface; no domain boundary change. | +| CleanCode | fixed | Reviewer flagged case-sensitive evidence-index negative assertion; fixed with case-insensitive check. | +| SOLID | verified: pass | No new interfaces or responsibilities. | +| DRY | verified: pass | No duplicate code abstraction introduced. | +| YAGNI | verified: pass | No new dependency, schema, importer, or execution wrapper. | + +## Accepted Findings And Fixes + +- CleanCode info finding: `contextprep_test.go` checked only lowercase + `codegraph`. Fixed by lowercasing the generated evidence index and forbidden + names before comparison. + +## Verification After Fix + +```bash +go test -count=1 ./internal/contextprep +go test ./... +go vet ./... +jq empty schema/*.json +git diff --check +``` + +All passed. + +## Not Assessed + +- GitHub checks after amended push. +- GitHub review approval. +- Real external tool execution/output. +- Merge approval. diff --git a/docs/specs/084-external-tool-evaluation-profiles/reviews/raw-planning-kimi-k2p6-2026-06-05.md b/docs/specs/084-external-tool-evaluation-profiles/reviews/raw-planning-kimi-k2p6-2026-06-05.md new file mode 100644 index 0000000..b0e2704 --- /dev/null +++ b/docs/specs/084-external-tool-evaluation-profiles/reviews/raw-planning-kimi-k2p6-2026-06-05.md @@ -0,0 +1,27 @@ +## Spec 084 Readiness Review + +**Verdict:** PASS (minor hygiene findings; implementation may proceed) + +### Findings + +| Severity | Finding | Evidence | Recommendation | +|---|---|---|---| +| Minor | Spec header still claims "Draft; backlog-only" despite `plan.md` and `tasks.md` existing and satisfying the stated prerequisite | spec.md:7-8 | Update spec status to "Ready for implementation" during T001 or as a Phase 1 completion step; T020 covers final sync | +| Minor | Plan requires Phase 0/1 artifacts (`research.md`, `data-model.md`, `quickstart.md`, `contracts/external-tool-profile.md`) in addition to the constitution minimum set | plan.md:78-88, constitution.md:30-34 | Acceptable for this documentation slice, but verify lightweight artifacts are created before T019 to avoid empty-file drift | +| Minor | T022 mandates code-quality review lenses (CRAP, MI, CleanArch, CleanCode, SOLID, DRY, YAGNI) for a documentation-only slice; per delivery workflow these should be rated `not_applicable` when diff is docs/config-only | tasks.md:T022 | Add explicit note in T022 that docs-only changes rate code/architecture lenses as `not_applicable` with diff evidence | +| Minor | Phase 1 planning-review lanes (T005) are not yet recorded in the repository | tasks.md:T005 | Complete T005 before starting Phase 2 implementation, or treat this review as one assessed lane and record disposition | + +### Assessment by Area + +- **Requirements drift:** No drift detected. FR-001–FR-010 are concrete, traceable to tasks, and bounded to documentation/guidance. +- **Constitution drift:** No drift detected. Plan.md constitution check explicitly passes all five principles; local-first/read-only defaults are preserved. +- **Product drift:** No drift detected. Scope is explicitly documentation and context-guidance only; no importer, execution, schema change, or network behavior is introduced. +- **Evidence-state honesty:** PASS. FR-004 and SC-002 explicitly prohibit treating candidate profiles as observed evidence. T006 and T014 add test coverage for this boundary. +- **Privacy/security boundaries:** PASS. FR-003, FR-005, and edge cases cover target mutation, daemon/watch, credential, and sensitive-payload risks. Approval gates are required before any execution. +- **Task testability:** PASS. Each phase names concrete files, independent tests, and verification commands (`go test`, `jq empty`, `git diff --check`). + +### not_assessed + +- T005 review-lane execution evidence (pending Phase 1 completion) +- GitHub checks / PR state (pending Phase 6) +- External tool upstream health/metadata freshness post-2026-06-04 (acknowledged in spec.md:167-168; to be refreshed during implementation) diff --git a/docs/specs/084-external-tool-evaluation-profiles/reviews/raw-planning-minimax-m3-2026-06-05.md b/docs/specs/084-external-tool-evaluation-profiles/reviews/raw-planning-minimax-m3-2026-06-05.md new file mode 100644 index 0000000..1ae8309 --- /dev/null +++ b/docs/specs/084-external-tool-evaluation-profiles/reviews/raw-planning-minimax-m3-2026-06-05.md @@ -0,0 +1,55 @@ +# Spec 084 Plan/Tasks Readiness Review + +**Reviewer**: repo-grounded local review (no edits) +**Date**: 2026-06-05 +**Scope**: `spec.md`, `plan.md`, `tasks.md`, `research.md`, `data-model.md`, `quickstart.md`, `contracts/external-tool-profile.md`, `reviews/`, `AGENTS.md` diff, backlog row P6-084, existing `internal/contextprep`, `internal/producerfamily`, `docs/adapter-contracts/*`, `docs/product-boundary.md`, `docs/product-claims.md`, `docs/review-harness-benchmark.md`. + +## Findings + +| # | Severity | Area | Finding | Evidence | Recommendation | Disposition | +|---|---|---|---|---|---|---| +| F1 | **Major** | SpecKit pipeline drift | T005 requires three assessed non-GPT planning lanes. Only two produced real review output: `raw-planning-kimi-k2p6-2026-06-05.md` (PASS, 4 minor findings) and `raw-planning-zai-glm51-2026-06-05.md` (PASS, 10 minor findings). The `raw-planning-minimax-m3-2026-06-05.md` lane is an **empty 0-byte file** — that is harness failure / `not_assessed` per AGENTS.md and `docs/review-harness-benchmark.md:130-138`. Coverage is 2/3. | `docs/specs/084-external-tool-evaluation-profiles/reviews/raw-planning-minimax-m3-2026-06-05.md:1` (0 bytes), `tasks.md:T005`, `AGENTS.md` "empty/hung/malformed… do not count toward the three" | Run an explicit enabled non-GPT replacement lane (e.g., `openrouter/deepseek/deepseek-v4-flash`, `openrouter/~moonshotai/kimi-latest`, or `openrouter/xiaomi/mimo-v2.5-pro` from the roster) and record its disposition before Phase 2. | Open | +| F2 | **Major** | Product/scheme drift (vocab reuse) | Spec 084 introduces a parallel "role" classification vocabulary (`producer_candidate`, `ux_pattern_source`, `ready_for_import_planning`, `blocked`, `rejected`) in `data-model.md:11-44` and `contracts/external-tool-profile.md:7-16`, but the existing `internal/producerfamily` package already defines a `Decision` enum (`accepted`, `narrowed`, `rejected`, `blocked`, `not_assessed`) plus `SupportState` (`candidate_only`, `accepted`, `narrowed`, `rejected`, `blocked`) consumed from `producer-evaluations.jsonl` via `detectProducerEvaluations` (`contextprep.go:546-596`). Spec 053 ratified the JSONL evaluation record as the canonical machine-readable adoption-decision surface (`internal/testfixtures/language-agnostic-producers/producer-records.jsonl` already uses `producer-evaluation` with `Decision=Narrowed`, etc.). Spec 084's role vocabulary is a parallel scheme with no explicit mapping or decision to remain markdown-only. | `internal/producerfamily/producerfamily.go:118-170`, `internal/contextprep/contextprep.go:546-596`, `internal/testfixtures/language-agnostic-producers/producer-records.jsonl:1-3`, `docs/specs/084-external-tool-evaluation-profiles/data-model.md:10-44` | Either (a) state in `plan.md` and `research.md` that the profile document is intentionally a human-readable companion to, and not a replacement for, the producer-evaluation JSONL record, and add a follow-up note that future importer specs (P6-085, P6-086) will emit `producer-evaluation` records per candidate; or (b) drop the parallel `role` enum and reuse the existing one. Either path needs a sentence in the spec, plan, and contract. | Open | +| F3 | **Major** | Backlog / surface inconsistency | `product-backlog.md:168` (P6-084) still reads `"Draft spec; backlog-only. No plan/tasks yet."` while `spec.md:7-8` claims `"Ready for implementation. `plan.md` and `tasks.md` exist…"`. Spec/backlog/task ledgers must agree before T020 closeout. | `docs/product-backlog.md:168` vs `docs/specs/084-external-tool-evaluation-profiles/spec.md:7-8` | Update the backlog row to "Plan + tasks exist; planning review in progress; not yet ready for merge" in the same commit that updates `spec.md` status. Cover during T020. | Open (covered by T020) | +| F4 | Minor | Requirements drift | FR-007-009 are name-bound to three specific repos; if any of those move, the FRs require edits. Plan and tasks T008-T010 mirror that binding. | `spec.md:122-133`, `tasks.md:T008-T010` | Accept as a maintenance cost, but record in the contract that FR-007-009 are bound to current repo identities and a rename/transfer triggers a profile refresh. | Accept | +| F5 | Minor | Constitution drift | None. The plan's constitution check passes all five principles; the slice adds no installs, network, daemon, mutation, credentials, schema, or graph evidence promotion. | `plan.md:60-74`, `.specify/memory/constitution.md:6-42` | Accept. | Accept | +| F6 | Minor | Product drift | Scope is bounded to documentation + bounded context guidance, matching `docs/product-boundary.md:25-39` and `docs/product-claims.md` "Portolan composes existing OSS tool outputs" row. | `plan.md:13-20`, `product-boundary.md:25-40`, `product-claims.md:55-65` | Accept. | Accept | +| F7 | Minor | Evidence-state honesty | FR-004 and SC-002 explicitly prohibit promoting candidate profiles to observed evidence. T006 + T014 add focused test coverage. Existing `contextprep` already keeps candidate tools at `not_assessed` / `unknown` in `oss-plan.json` and `evidence-index.jsonl`. | `spec.md:114-116,150-152`, `tasks.md:T006,T014`, `contextprep.go:1437-1462` | Accept. | Accept | +| F8 | Minor | Privacy / safety boundaries | FR-003, FR-005, and edge cases cover target mutation, daemon/watch, credential, and LLM-authored summary risks with explicit approval gates. | `spec.md:111-119,90-99`, `contracts/external-tool-profile.md:26-32` | Accept. | Accept | +| F9 | Minor | Task testability | Each phase names concrete files, independent tests, and verification commands. Test-first discipline holds for behavior changes (T006 → T013 → T014). | `tasks.md:T006,T012,T015,T019-T023`, `plan.md:123-136` | Accept. | Accept | +| F10 | Minor | Spec assumption vs plan behavior | Spec assumption says "documentation and guidance first; it does not add an importer, execute external tools, install dependencies, or change schemas." But T013 changes `internal/contextprep/contextprep.go` (a Go code change) and T014 updates tests. The plan correctly notes the bounded exception in "Project Structure" and the constitution check, but the spec's framing is slightly misleading. | `spec.md:165-166` vs `tasks.md:T013-T014`, `plan.md:108-114` | Add one clarifying sentence to `spec.md` "Assumptions" that "Bounded context-pack guidance may add a small pointer in `internal/contextprep`; this is not an importer, schema change, or graph evidence fact." | Fix in same commit as status update | +| F11 | Minor | T022 review lens mandate vs docs-heavy diff | T022 mandates CRAP / Maintainability Index / CleanArch-hex / CleanCode / SOLID / DRY / YAGNI for the PR. The slice is dominated by new `docs/adapter-contracts/external-tool-evaluation-profiles.md` (docs), bounded T013 in `internal/contextprep` (Go), and T006/T014 in `internal/contextprep/contextprep_test.go` (test). Per `AGENTS.md` "For docs/config-only PRs, record code metrics and code-architecture lenses as `not_applicable` with diff evidence." Since the diff does touch Go, lens `not_applicable` is only partial. | `tasks.md:T022`, `AGENTS.md` PR review section, `internal/contextprep/contextprep.go` (T013 touches ~10-30 lines) | Reframe T022 to: "Run quality lenses; for the new `docs/adapter-contracts/external-tool-evaluation-profiles.md` and the bounded `internal/contextprep` change, classify each lens against the touched file set; record CRAP/MI/CleanArch-hex as `not_applicable` for the docs file with diff evidence, and apply the lenses to the small `internal/contextprep` change." | Fix wording in T022 | +| F12 | Minor | Branch metadata agreement | `spec.md:3`, `plan.md:3`, the actual git branch (`git status --short --branch` → `codex/084-external-tool-evaluation-profiles`), and the `AGENTS.md` plan pointer (now updated) all agree on `codex/084-external-tool-evaluation-profiles`. The original spec.md branch field `codex/084-tool-adoption-specs` is now corrected. | `spec.md:3`, `plan.md:3`, `AGENTS.md` SPECKIT block, `git status` | Accept; spec.md branch rename is already in the working tree. | Accept (in working tree) | +| F13 | Minor | Cross-spec coordination | Spec 084's classifications (ast-index strongest, CodeGraph lower-fit, Understand-Anything UX-only) directly feed P6-085 (ast-index producer import) and P6-086 (evidence navigation UX patterns), which are still backlog-only drafts. No cross-reference is recorded in spec 084. | `docs/product-backlog.md:168-170` | Add a one-line "Related future slices" note at the bottom of `spec.md` pointing to P6-085 and P6-086 with the planning note that they remain backlog-only. | Fix in same commit | +| F14 | Minor | Snapshot freshness callout | `research.md:32-38` is dated 2026-06-05 and the spec assumption says upstream may change after 2026-06-04. The `last_refreshed` field in `data-model.md:7-9` handles this at profile-record level, but the spec/plan do not name when the next mandatory refresh happens (e.g., before PR ready or before any future importer spec). | `research.md:32-38`, `spec.md:165-168`, `data-model.md:7-9` | Add a line in `plan.md` "Verification" or `quickstart.md` "Verify Profile Guidance" naming the refresh step as required before PR ready. | Fix in same commit | +| F15 | Minor | Quickstart command sequence | `quickstart.md:20-23` says to run `go run ./cmd/portolan context prepare --root . --out /tmp/portolan-084-context --force` and `jq empty /tmp/portolan-084-context/oss-plan.json`. The existing `contextprep` test pattern uses `out := filepath.Join(root, ".portolan", "context")` and T019 is a baseline run; the quickstart's `--out /tmp/...` is fine for humans but the test (T006) will likely use a `t.TempDir()`-style path. The quickstart is correct as a human smoke, just confirm it does not need to also assert the contract profile file exists. | `quickstart.md:18-24`, `tasks.md:T006` | Accept; quickstart is human-oriented. T006 owns the assertion contract. | Accept | +| F16 | Minor | T005 "planning review" naming | T005 is the disposition for "three assessed non-GPT planning review lanes." The current `raw-planning-*.md` files are raw model output, not the disposition file. T005 needs to produce `reviews/planning-review-disposition-2026-06-05.md` summarizing the three lanes. F1 must be fixed before that disposition can be written. | `tasks.md:T005` | Bundle F1 fix + T005 disposition into the same Phase 1 step. | Bundle with F1 | + +## Cross-cutting assessment + +- **Requirements drift**: Mostly clean. FR-001-FR-010 are concrete, bounded to documentation/guidance, and traceable to T006-T018. Only F4 (name-binding) and F10 (assumption vs Go change) flagged. +- **Constitution drift**: None. Plan's gate passes all five principles; slice adds no install/network/daemon/mutation/credentials/schema/evidence promotion. +- **Product drift**: Bounded and aligned with `product-boundary.md` and `product-claims.md`. F2 is the one real scheme-reuse concern. +- **Evidence-state honesty**: Held. Existing `contextprep` already routes candidate tools through `not_assessed` and `unknown` paths; T006 + T014 add test coverage. +- **Privacy / safety boundaries**: Held. Approval-gated execution/install/MCP/watcher/daemon/mutation requirements are explicit and the plan refuses to add a native producer wrapper. +- **Task testability**: Held. Each phase names files, independent tests, and verification commands; baseline commands in `AGENTS.md` are reused. + +## Verdict + +**CHANGES_REQUESTED** (3 major + 11 minor / acceptable) + +Blocking items before T020 closeout: + +1. **F1**: Run a third assessed non-GPT planning review lane (M3 produced an empty 0-byte file and does not count) and write `reviews/planning-review-disposition-2026-06-05.md` summarizing three assessed lanes. Until then, T005 is not complete and AGENTS.md "empty… do not count toward the three" applies. +2. **F2**: Decide between (a) explicit companion-document framing with a future-spec note that P6-085 / P6-086 will emit `producer-evaluation` records for the three candidates, or (b) reuse the existing `Decision` / `SupportState` enums and drop the parallel `role` vocabulary. Document the decision in `plan.md` and `contracts/external-tool-profile.md`. +3. **F3**: Reconcile backlog row P6-084 with `spec.md` status (do this in T020, but lock the wording now). + +Minor items F4-F16 can be folded into the same Phase 1 commit and the T020 closeout. + +## not_assessed + +- M3 lane review output (empty file → `not_assessed` per AGENTS.md and `docs/review-harness-benchmark.md:130-138`). +- T006 / T013 / T014 implementation behavior (no code changed in this review; tasks are unchecked). +- GitHub checks / PR state / merge readiness (pending Phase 6). +- Post-2026-06-05 external tool upstream metadata drift (handled by `last_refreshed` field; covered by F14). +- Real `ast-index` / `codegraph` / `understand-anything` output acquisition — explicitly out of scope per `spec.md:165-166` and P6-085 / P6-086 backlog-only status. diff --git a/docs/specs/084-external-tool-evaluation-profiles/reviews/raw-planning-zai-glm51-2026-06-05.md b/docs/specs/084-external-tool-evaluation-profiles/reviews/raw-planning-zai-glm51-2026-06-05.md new file mode 100644 index 0000000..20d6b5e --- /dev/null +++ b/docs/specs/084-external-tool-evaluation-profiles/reviews/raw-planning-zai-glm51-2026-06-05.md @@ -0,0 +1,35 @@ +Let me gather additional context before assessing readiness. +Now I have enough context to deliver the assessment. Here is the review: + +--- + +## Spec 084 Plan/Tasks Readiness Review + +**Reviewer**: zai-coding-plan/glm-5.1 +**Date**: 2026-06-05 +**Scope**: spec.md, plan.md, tasks.md, research.md, data-model.md, quickstart.md, contracts/external-tool-profile.md, existing codebase + +**Verdict**: **PASS** (minor findings; implementation may proceed after T005 planning review lanes are completed) + +### Findings + +| # | Severity | Area | Finding | Evidence | Recommendation | +|---|---|---|---|---|---| +| F1 | Minor | Requirements drift | No drift detected. FR-001 through FR-010 are concrete, bounded to documentation/guidance, and traceable to tasks. | spec.md:105-136, tasks.md:T006-T018 | Accept. | +| F2 | Minor | Constitution drift | No drift detected. Plan constitution check explicitly passes all five principles. No installs, network, daemon, mutation, or schema changes. | plan.md:60-74, constitution.md:6-18 | Accept. | +| F3 | Minor | Product drift | No drift detected. Scope is explicitly documentation and context-guidance only. No importer, execution, schema change, or network behavior is introduced. Matches product-boundary.md and product-claims.md. | plan.md:13-20, product-boundary.md:25-40 | Accept. | +| F4 | Minor | Evidence-state honesty | PASS. FR-004/SC-002 prohibit treating candidate profiles as observed evidence. T006 and T014 add test coverage. Existing `contextprep.go` already uses `not_assessed` for producer recommendations and candidates. | spec.md:114-116, tasks.md:T006,T014, contextprep.go:1724-1738 | Accept. | +| F5 | Minor | Privacy/security boundaries | PASS. FR-003, FR-005, and edge cases cover target mutation, daemon/watch, credential, and sensitive-payload risks. Approval gates required before execution. | spec.md:118-119,90-99, contracts/external-tool-profile.md:26-32 | Accept. | +| F6 | Minor | Task testability | PASS. Each phase names concrete files, independent tests, and verification commands. | tasks.md:T006,T012,T015,T019, plan.md:123-136 | Accept. | +| F7 | Minor | Spec status lag | spec.md:7-8 says "Ready for implementation" but the backlog (product-backlog.md:168) still says "Draft spec; backlog-only. No plan/tasks yet." which contradicts existing plan.md and tasks.md. | spec.md:7, product-backlog.md:168 | Update backlog status during T020; T020 covers final sync. | +| F8 | Minor | Raw planning review lane empty | `raw-planning-zai-glm51-2026-06-05.md` contains only a context-gathering message, not an assessed review. One assessed lane exists (kimi-k2p6). T005 requires three assessed non-GPT planning review lanes. | reviews/raw-planning-zai-glm51-2026-06-05.md:1, tasks.md:T005 | Complete T005 with three assessed lanes before Phase 2. The current review can count as one lane. | +| F9 | Minor | T022 code-quality lenses for docs-only slice | T022 mandates CRAP, MI, CleanArch, CleanCode, SOLID, DRY, YAGNI for a documentation-only diff. Per AGENTS.md delivery rules, these should be `not_applicable` with diff evidence. | tasks.md:T022, AGENTS.md delivery rules | Add explicit note in T022 or record as `not_applicable` during execution. Already covered by AGENTS.md guidance but worth noting for the implementer. | +| F10 | Minor | Research metadata snapshot date alignment | research.md snapshot is dated 2026-06-05, matching plan.md date. Spec assumption says upstream may change after 2026-06-04. Acceptable but implementation must note if upstream changed between snapshot and profile creation. | research.md:32-38, spec.md:167-168 | Accept; profile `last_refreshed` field handles this. | + +### not_assessed + +- T005 planning-review-lane execution evidence (pending: 1 assessed lane exists, need 2 more) +- GitHub checks / PR state (pending Phase 6) +- External tool upstream health/metadata freshness post-2026-06-05 snapshot (acknowledged in spec; `last_refreshed` field mitigates) +- `internal/contextprep` code changes needed for T013/T014 (no implementation exists yet; tests drive the design) +- Real ast-index, CodeGraph, or Understand-Anything output acquisition (out of scope for this slice per spec.md:165-166) diff --git a/docs/specs/084-external-tool-evaluation-profiles/reviews/raw-pr62-review-kimi-k2p6-2026-06-05.md b/docs/specs/084-external-tool-evaluation-profiles/reviews/raw-pr62-review-kimi-k2p6-2026-06-05.md new file mode 100644 index 0000000..5906c91 --- /dev/null +++ b/docs/specs/084-external-tool-evaluation-profiles/reviews/raw-pr62-review-kimi-k2p6-2026-06-05.md @@ -0,0 +1,25 @@ +## PR #62 Review Verdict: **PASS** (minor findings, no blockers) + +All baseline checks pass (`go test ./...`, `go vet ./...`, `jq empty schema/*.json`, `git diff --check`). Smoke test confirms generated `answer-contract.md` contains stale-profile guidance and `evidence-index.jsonl` excludes candidate names. + +### Findings Table + +| # | Lens | Severity | Evidence | Recommendation | +|---|------|----------|----------|----------------| +| 1 | Spec drift | None | `spec.md`, `plan.md`, `tasks.md`, `product-backlog.md:168` agree on branch name (`codex/084-external-tool-evaluation-profiles`), status ("Implemented locally; PR review pending"), and scope (docs + bounded context guidance). Planning disposition exists with 3 assessed lanes. | No action required | +| 2 | Constitution drift | None | No network/daemon/mutation/install/schema/evidence promotion. FR-004/SC-002 explicitly prohibit promoting profiles to observed evidence. | No action required | +| 3 | Product drift | None | Scope matches `product-boundary.md:25-40`; no importer, execution, or schema change introduced. | No action required | +| 4 | CRAP | **not_applicable** (docs) | `docs/adapter-contracts/external-tool-evaluation-profiles.md`, review files, `spec.md`, `tasks.md` are markdown documentation per AGENTS.md delivery rules. | Record as `not_applicable` with diff evidence | +| 5 | MI | **not_applicable** (docs) | Same docs-only file set as #4. | Record as `not_applicable` with diff evidence | +| 6 | CleanArch hex | **not_applicable** (docs) | Same docs-only file set as #4. | Record as `not_applicable` with diff evidence | +| 7 | CRAP (code) | Info | `contextprep.go` change: +1 `fmt.Fprintf` line. Cyclomatic complexity = 1, test coverage = 100%. Estimated CRAP ≈ 1.0. | No action required | +| 8 | MI (code) | Info | Bounded string append in renderer, well-tested, no branching. Estimated MI > 90. | No action required | +| 9 | CleanArch hex (code) | None | Change confined to `renderAnswerContract` presentation layer; no domain/boundary violations. | No action required | +| 10 | CleanCode | Info | `contextprep_test.go:220` checks lowercase `"codegraph"`; renderer uses TitleCase `"CodeGraph"`. Test would miss TitleCase leakage in evidence index. | Use `strings.ToLower()` or add capitalized variant to forbidden list | +| 11 | SOLID | None | `renderAnswerContract` single responsibility held; extended via text append, not logic modification. No interface/dependency changes. | No action required | +| 12 | DRY | None | Profile paragraph appears once in renderer, asserted once in test. No cross-file duplication. | No action required | +| 13 | YAGNI | None | Implementation is minimal: one paragraph, one test, no extra abstractions or features. | No action required | + +### not_assessed +- GitHub checks / CI status (no PR created yet from this review) +- Real ast-index / CodeGraph / Understand-Anything execution or output acquisition (explicitly out of scope per `spec.md:165-166`) diff --git a/docs/specs/084-external-tool-evaluation-profiles/reviews/raw-us1-kimi-k2p6-2026-06-05.md b/docs/specs/084-external-tool-evaluation-profiles/reviews/raw-us1-kimi-k2p6-2026-06-05.md new file mode 100644 index 0000000..41644eb --- /dev/null +++ b/docs/specs/084-external-tool-evaluation-profiles/reviews/raw-us1-kimi-k2p6-2026-06-05.md @@ -0,0 +1,25 @@ +## Review Findings for User Story 1 + +**Verdict: PASS** + +### Assessed Findings + +| # | Severity | Evidence | Recommendation | +|---|----------|----------|----------------| +| 1 | `info` | `docs/adapter-contracts/external-tool-evaluation-profiles.md` contains all three required projects in the Decision Summary table (lines 8–14) and detailed profiles (lines 35–117). | No change required. | +| 2 | `info` | ast-index is explicitly labeled "Strongest current symbol/reference producer candidate" (line 12) and recorded as the "strongest current candidate for future symbol/reference import planning" (lines 59–60). | No change required. | +| 3 | `info` | CodeGraph is labeled "lower-fit optional candidate" (line 13) with explicit rationale about `.codegraph/` target writes and watch/MCP behavior (lines 74–88). | No change required. | +| 4 | `info` | Understand-Anything is classified as `ux_pattern_source` (line 14, line 99) with guidance to "borrow navigation UX ideas only" (lines 114–115) and reject LLM-authored graph as evidence (lines 116–117). | No change required. | +| 5 | `info` | All risk surfaces (target mutation, watcher/daemon, hooks, MCP install, network install, LLM-authored claims) are framed as approval-gated blockers in Common Rules (lines 27–31) and per-profile "Approval boundary" fields. | No change required. | +| 6 | `info` | Each profile includes: project identity, license, maintenance snapshot with date, output surfaces, local execution posture, target mutation risk, network/install risk, daemon/watch risk, privacy risk, integration cost, recommended action, and evidence limitations. | No change required. | + +### not_assessed + +| Item | Reason | +|------|--------| +| Stale-profile refresh enforcement | FR-010 guidance is present; automated enforcement is outside User Story 1 scope. | +| Machine-readable producer evidence states | Spec explicitly states these remain product-facing guidance only (lines 16–18); no schema change was requested. | + +### Summary + +The profile document satisfies all User Story 1 acceptance scenarios and functional requirements (FR-001 through FR-010). No overclaiming is present; evidence limitations and approval boundaries are consistently and clearly stated. diff --git a/docs/specs/084-external-tool-evaluation-profiles/reviews/raw-us2-zai-glm51-2026-06-05.md b/docs/specs/084-external-tool-evaluation-profiles/reviews/raw-us2-zai-glm51-2026-06-05.md new file mode 100644 index 0000000..50e9ee5 --- /dev/null +++ b/docs/specs/084-external-tool-evaluation-profiles/reviews/raw-us2-zai-glm51-2026-06-05.md @@ -0,0 +1,64 @@ +Let me first read the spec and understand User Story 2, then check review tooling availability. +Now let me read the rest of contextprep.go (the rendering functions) and check the external tool profiles doc: +Now let me check tooling preflight and build the review context: +Good - all three models and the explore agent are available. Now let me build the review context: +Now let me wait for all three lanes and collect results: +All three lanes completed. Let me collect their results: +All three lanes completed. Let me verify the key findings locally: +All evidence collected. Here is the synthesized review: + +--- + +## Mode: REVIEW + +**review status**: partial +**scope**: local uncommitted diff + smoke-tested generated context pack (`/tmp/portolan-084-context/`) +**lanes**: +- requirements / zai-coding-plan/glm-5.1: **findings** (3 findings, 0 blocking) +- code / kimi-for-coding/k2p6: **pass** +- security / minimax/MiniMax-M2.7: **pass** + +--- + +### Findings + +**F-001 — medium: FR-010 stale-profile risk not surfaced in generated `answer-contract.md`** +- **Requirement**: FR-010: "Generated guidance MUST make stale-profile risk visible and require a fresh upstream/local verification step before implementation depends on a candidate's current behavior." +- **Evidence**: The profile doc (`docs/adapter-contracts/external-tool-evaluation-profiles.md:32-33`) contains a one-day refresh rule, but the generated `answer-contract.md` paragraph at `contextprep.go:2434` delegates to the profile doc without surfacing staleness risk. A generated context pack (`/tmp/portolan-084-context/answer-contract.md`) confirmed: the word "stale" appears only for artifact boundaries, never for tool profile metadata staleness. The test (`contextprep_test.go:205-215`) does not assert stale-risk language in `answer-contract.md`. +- **Recommendation**: Add a one-line stale-risk reminder to the answer-contract paragraph (e.g., "Profile metadata may be stale; refresh upstream verification before depending on current tool behavior.") and add a test assertion for it. + +**F-002 — low: Case-sensitive evidence-index forbidden check** +- **Evidence**: `contextprep_test.go:219` forbids `"codegraph"` (lowercase) in `evidence-index.jsonl`. The answer-contract text uses `"CodeGraph"` (capitalized). If any future code path wrote the capitalized form, the test would miss it. Verified locally: `"CodeGraph"` does not appear in the evidence index. Practical risk is currently zero because `contextprep.go:2434` writes tool names only to `answer-contract.md`, not to the evidence index. +- **Recommendation**: Change `"codegraph"` to `"CodeGraph"` in the forbidden list or use `strings.ToLower` before checking. + +**F-003 — low: T014 test scope narrower than task description** +- **Evidence**: T014 says "assert ... no external tool execution command is emitted." The test (`contextprep_test.go:218-223`) only checks `evidence-index.jsonl` exclusion. It does not inspect `oss-plan.json` or other generated files for execution commands. The implementation is correct (no execution commands are added), but the test does not fully cover the T014 acceptance claim. +- **Recommendation**: Add a negative assertion on `oss-plan.json` or narrow T014 description to match actual test scope. + +--- + +### Verified Requirements + +| Requirement | Status | Evidence | +|---|---|---| +| US2 AS1 (no graph facts from candidates) | **pass** | `contextprep.go:2434` explicitly forbids promotion; test verifies `evidence-index.jsonl` excludes tool names | +| US2 AS2 (stale-profile risk) | **partial** | Profile doc has stale rules; generated `answer-contract.md` does not surface them (F-001) | +| FR-004 (no evidence promotion) | **pass** | Guidance says "Do not promote candidate profiles to source-visible, metadata-visible, or runtime-visible evidence" | +| FR-005 (approval boundaries) | **pass** | Profile doc records approval gates for all three candidates | +| FR-010 (stale risk visible) | **gap** | Delegated to profile doc only; not in generated guidance (F-001) | +| SC-002 (no graph fact from candidate) | **pass** | evidence-index.jsonl exclusion confirmed by test and local smoke | +| Evidence-state honesty | **pass** | Text positions profiles as guidance, not evidence | +| Privacy/security | **pass** | No secrets, network access, credentials, or command execution in the diff | + +### Evidence gaps +- CI/CD checks: `not_assessed` +- PR review approval: `not_assessed` +- `oss-plan.json` negative assertion for external tool commands: `not_assessed` + +### Conflicts +- Lane 1 (requirements) found 3 findings; Lane 2 (code) and Lane 3 (security) found no defects. No disagreement — the requirements lane applied a stricter FR-010 lens that the code and security lanes did not specifically test against. + +--- + +**Verdict**: **CHANGES_REQUESTED** — F-001 is a medium-severity gap against FR-010. The generated `answer-contract.md` should surface stale-profile risk rather than delegating it entirely to the profile document. F-002 and F-003 are low and can be addressed alongside F-001. +- merge_approval: `not_assessed` diff --git a/docs/specs/084-external-tool-evaluation-profiles/reviews/raw-us3-minimax-m3-2026-06-05.md b/docs/specs/084-external-tool-evaluation-profiles/reviews/raw-us3-minimax-m3-2026-06-05.md new file mode 100644 index 0000000..1f46dd3 --- /dev/null +++ b/docs/specs/084-external-tool-evaluation-profiles/reviews/raw-us3-minimax-m3-2026-06-05.md @@ -0,0 +1,27 @@ +# US3 Review: External Tool Evaluation Profiles — spec 084 + +**Scope**: T016-T018 (refresh procedure, role/producer-family mapping boundary, schema non-change, maintainability). +**Verdict**: **CHANGES_REQUESTED**. + +## Findings + +| # | Severity | Evidence | Recommendation | +|---|----------|----------|----------------| +| F-1 | blocker | `docs/specs/084-external-tool-evaluation-profiles/reviews/us3-review-disposition-2026-06-05.md` does not exist; the only US3 raw lane file `raw-us3-minimax-m3-2026-06-05.md` is 0 bytes. T018 task is unchecked. | Create the disposition file with three assessed non-GPT lanes (or record explicit `not_assessed` per lane with replacement rationale) before claiming US3 is reviewed. Per `AGENTS.md` and the planning-review-disposition precedent, an empty/hung/minimax self-observed lane does not count as assessed evidence. | +| F-2 | high | `tasks.md:40-42` shows T016, T017, T018 as `[ ]`, but T016's deliverable (Refresh Procedure at `docs/adapter-contracts/external-tool-evaluation-profiles.md:119-127`) is present in the working tree and T017 is verifiable (`jq empty` passes for all 7 schema files; no schema file appears in the diff set). | Reconcile the ledger: mark T016 done only after the F-1 disposition is in place, record the T017 command output verbatim, then mark complete. AGENTS.md requires backlog/spec/tasks/disposition/implementation consistency. | +| F-3 | high | The US3 independent test in `spec.md:78-80` says a profile can transition `producer_candidate` -> `blocked` / `rejected` / `ready_for_import_planning` "without changing graph schemas." The Refresh Procedure step 4 (`docs/adapter-contracts/external-tool-evaluation-profiles.md:122-126`) only describes the `guidance -> import planning` direction and only says "map the decision to producer-family `Decision` and `SupportState` records." The other three transitions (to `blocked`, `rejected`, and from `producer_candidate` to `ux_pattern_source`) have no documented producer-family destination. | Make the role -> `Decision`/`SupportState` mapping explicit for all five role states (at minimum: source state, target state, required producer-family record change, whether the importer/evaluation slice is opened or closed). Either expand the Refresh Procedure or add a small mapping table to `contracts/external-tool-profile.md`. | +| F-4 | medium | `docs/product-backlog.md:168` says P6-084 is "Ready for implementation," but `spec.md:7-8` still says "Ready for implementation; plan.md and tasks.md exist; planning review is in progress before implementation," and T016-T018 are unchecked. Three SpecKit surfaces disagree on the same spec. | Pick one state (either "ready-for-implementation, US3 review in progress" or "implemented, PR review pending") and align backlog, spec status, and task ledger to it. | +| F-5 | medium | Maintainability: the Refresh Procedure is 5 lines, which is good, but it has no cross-reference to the producer-family schema or to `contracts/external-tool-profile.md`. A maintainer refreshing a profile is told to "map to producer-family Decision and SupportState records" without pointing at the field set, schema path, or an importer-slice template. | Add a single sentence pointing at `schema/producer-family.schema.json` and the slice template under `docs/specs/053-language-agnostic-producers/` (or whichever spec owns the importer opening). | +| F-6 | low | `docs/adapter-contracts/external-tool-evaluation-profiles.md:33` requires a manual one-day refresh check on `Last refreshed`. The contract has no machine-checkable freshness signal beyond the date string. | Optionally add a focused test that asserts a `last_refreshed` is present in every profile and a CLI/lint command that warns when any profile is older than 1 day. Not blocking for US3. | +| F-7 | info | F-001 from the US2 disposition (`raw-us2-zai-glm51-2026-06-05.md:25-28`) — stale-profile risk not surfaced in the generated `answer-contract.md` — also bears on US3's "incremental refresh" story. The bounded `contextprep.go` paragraph added for US2 (`contextprep.go:2434`) does not mention profile staleness. | Consider closing the US2 finding inside the US3 cycle, since refresh is the US3 theme. Low priority relative to F-1/F-2/F-3. | + +## Not Assessed + +- **End-to-end role transition test**: a profile has not actually been moved between role states in this slice; the independent test in `spec.md:78-80` is asserted by documentation, not by exercised behavior. +- **Producer-family schema stability across future importer slices (085/086)**: depends on specs that are explicitly backlog-only. +- **GitHub checks, PR state, CI, and merge readiness**: no PR exists yet. +- **CRAP, MI, CleanArch-hex lenses for US3**: US3 is docs-only (Refresh Procedure text). Per `AGENTS.md` these lenses are `not_applicable` for docs-only files; the bounded `contextprep.go` change is a single non-executable string literal and was already covered by the US2 code lens. YAGNI / DRY / CleanCode / SOLID: the new paragraph does not introduce a parallel decision scheme and does not duplicate the producer-family record contract, so no defect under those lenses. + +## Bottom Line + +US3's deliverable text (`docs/adapter-contracts/external-tool-evaluation-profiles.md:119-127`) is present and the schema non-change claim is verifiable, but the review disposition required by T018 and the project governance is missing, the task ledger is stale, and the role/producer-family mapping is not made explicit for all four non-import transitions. Resolve F-1, F-2, and F-3 before marking US3 ready for PR. diff --git a/docs/specs/084-external-tool-evaluation-profiles/reviews/raw-us3-rerun-kimi-k2p6-2026-06-05.md b/docs/specs/084-external-tool-evaluation-profiles/reviews/raw-us3-rerun-kimi-k2p6-2026-06-05.md new file mode 100644 index 0000000..faa7957 --- /dev/null +++ b/docs/specs/084-external-tool-evaluation-profiles/reviews/raw-us3-rerun-kimi-k2p6-2026-06-05.md @@ -0,0 +1,12 @@ +# Failed US3 Rerun Lane: Kimi K2.6 + +Date: 2026-06-05 + +Harness: `opencode run --pure --model kimi-for-coding/k2p6` + +Status: `not_assessed` + +Observed failure: `unknown certificate verification error` + +Disposition: This lane is not counted as assessed review evidence. It was +replaced by `raw-us3-rerun-opencode-mimo-2026-06-05.md`. diff --git a/docs/specs/084-external-tool-evaluation-profiles/reviews/raw-us3-rerun-mimo-2026-06-05.md b/docs/specs/084-external-tool-evaluation-profiles/reviews/raw-us3-rerun-mimo-2026-06-05.md new file mode 100644 index 0000000..ea537e0 --- /dev/null +++ b/docs/specs/084-external-tool-evaluation-profiles/reviews/raw-us3-rerun-mimo-2026-06-05.md @@ -0,0 +1,13 @@ +## Findings Table + +| Area | Severity | Evidence | Recommendation | +|------|----------|----------|----------------| +| Role mapping boundary covers all five labels | ✅ PASS | Role Mapping Boundary table includes `producer_candidate`, `ux_pattern_source`, `ready_for_import_planning`, `blocked`, `rejected` with explicit Decision/SupportState mappings | No action | +| Producer-family schema reference exists | ✅ PASS | Refresh Procedure step 5: "Use `schema/producer-family.schema.json` and the producer-family precedent in `docs/specs/053-language-agnostic-producers/`"; Contract Evidence Rules also reference `schema/producer-family.schema.json` | No action | +| Stale profile guidance in generated answer contract | ✅ PASS | Contextprep snippet contains: `"treat the profile as stale if its \`Last refreshed\` date is older than the current review window"` | No action | +| Stale profile guidance asserted in test | ✅ PASS | `TestRunWritesExternalToolProfileGuidanceWithoutPromotingEvidence` asserts `"treat the profile as stale if its \`Last refreshed\` date is older than the current review window"` is present in `answer-contract.md` | No action | +| No schema changes required | ✅ PASS | Role mapping table and contract reference existing `Decision`/`SupportState` records from producer-family schema; no new schema fields introduced | No action | + +## Verdict + +**PASS** — All five checks satisfied. Role mapping boundary is complete, producer-family schema reference is present in both the profile document and contract, stale-profile guidance appears in both the contextprep snippet and the test assertion, and no schema modifications were needed. diff --git a/docs/specs/084-external-tool-evaluation-profiles/reviews/raw-us3-rerun-opencode-mimo-2026-06-05.md b/docs/specs/084-external-tool-evaluation-profiles/reviews/raw-us3-rerun-opencode-mimo-2026-06-05.md new file mode 100644 index 0000000..ab56f9d --- /dev/null +++ b/docs/specs/084-external-tool-evaluation-profiles/reviews/raw-us3-rerun-opencode-mimo-2026-06-05.md @@ -0,0 +1,30 @@ +Now I have the full picture. Here's my review: + +--- + +## US3 Re-Review: External Tool Evaluation Profiles — spec 084 + +**Scope**: Role mapping boundary, producer-family schema reference, stale profile guidance, schema non-change. + +### Findings + +| # | Area | Severity | Evidence | Recommendation | +|---|------|----------|----------|----------------| +| 1 | Role mapping boundary covers all five labels | ✅ PASS | `docs/adapter-contracts/external-tool-evaluation-profiles.md:22-28` — Role Mapping Boundary table maps `producer_candidate`, `ux_pattern_source`, `ready_for_import_planning`, `blocked`, `rejected` to explicit `Decision`/`SupportState` pairs | No action | +| 2 | Producer-family schema reference exists | ✅ PASS | Refresh Procedure step 5 (`:137-139`) references `schema/producer-family.schema.json` and `docs/specs/053-language-agnostic-producers/`; Contract Evidence Rules (`contracts/external-tool-profile.md:28-30`) also reference it | No action | +| 3 | Stale profile guidance in generated answer contract | ✅ PASS | `internal/contextprep/contextprep.go:2434` renders: *"treat the profile as stale if its `Last refreshed` date is older than the current review window"* | No action | +| 4 | Stale profile guidance asserted in test | ✅ PASS | `internal/contextprep/contextprep_test.go:208` asserts the stale-profile string is present in `answer-contract.md` | No action | +| 5 | No schema changes | ✅ PASS | `schema/producer-family.schema.json` last modified in commit `de251ce` (spec 053); no spec-084 diff touches any schema file | No action | + +### MiniMax F-3 Status + +MiniMax F-3 requested explicit role→Decision/SupportState mapping for all five states. The Role Mapping Boundary table at `docs/adapter-contracts/external-tool-evaluation-profiles.md:22-28` now provides that mapping. **F-3 is resolved.** + +### Not Assessed + +- **Kimi certificate rerun**: failed; not counted as assessed evidence per AGENTS.md rules. +- **GitHub checks / PR state**: no PR exists yet. + +### Verdict + +**PASS** — All five checks satisfied. The MiMo rerun assessment is accurate and the MiniMax F-3 finding is resolved by the Role Mapping Boundary table. diff --git a/docs/specs/084-external-tool-evaluation-profiles/reviews/requirements-product-vision-drift-2026-06-05.md b/docs/specs/084-external-tool-evaluation-profiles/reviews/requirements-product-vision-drift-2026-06-05.md new file mode 100644 index 0000000..1966585 --- /dev/null +++ b/docs/specs/084-external-tool-evaluation-profiles/reviews/requirements-product-vision-drift-2026-06-05.md @@ -0,0 +1,28 @@ +# Requirements And Product Vision Drift Review: Spec 084 + +Date: 2026-06-05 + +## Scope + +- Backlog row: `docs/product-backlog.md` P6-084. +- SpecKit artifacts: `spec.md`, `plan.md`, `research.md`, `data-model.md`, + `quickstart.md`, `contracts/external-tool-profile.md`, and `tasks.md`. +- Product boundary: `docs/product-boundary.md`, `docs/product-claims.md`, and + `.specify/memory/constitution.md`. + +## Findings + +| Area | Result | Evidence | Disposition | +| --- | --- | --- | --- | +| Requirements drift | verified: no blocking drift | FR-001 through FR-010 map to T006 through T018, with final verification in T019 through T023. | Accept | +| Product drift | verified: no blocking drift | The slice is documentation and bounded context guidance only; no external tool execution, installer, daemon, schema change, or graph evidence promotion is allowed. | Accept | +| Constitution drift | verified: no blocking drift | Local-first/read-only, evidence honesty, OSS composition, SpecKit-before-implementation, and test-first-for-behavior gates are explicitly covered in `plan.md`. | Accept | +| Status drift | verified: fixed | `spec.md`, `docs/product-backlog.md`, and `tasks.md` now agree that plan/tasks exist and implementation is not yet verified. | Fixed | +| Profile vocabulary drift | verified: fixed | `research.md`, `data-model.md`, `contracts/external-tool-profile.md`, and `plan.md` now state that profile roles are product-facing guidance and do not replace producer-family `Decision`/`SupportState` records. | Fixed | + +## Decision + +Spec 084 is ready for implementation. Implementation must stay limited to the +profile artifact and bounded context-pack guidance. Real CodeGraph, +Understand-Anything, and ast-index execution/output remains `not_assessed` and +out of scope. diff --git a/docs/specs/084-external-tool-evaluation-profiles/reviews/us1-review-disposition-2026-06-05.md b/docs/specs/084-external-tool-evaluation-profiles/reviews/us1-review-disposition-2026-06-05.md new file mode 100644 index 0000000..7ec32c9 --- /dev/null +++ b/docs/specs/084-external-tool-evaluation-profiles/reviews/us1-review-disposition-2026-06-05.md @@ -0,0 +1,22 @@ +# US1 Review Disposition: External Tool Profiles + +Date: 2026-06-05 + +## Verification + +- `go test -count=1 ./internal/contextprep`: passed. +- Profile text check for all three projects, approval boundaries, refresh + procedure, and `not_assessed`: passed. +- Opencode review lane: `kimi-for-coding/k2p6`, raw output + `raw-us1-kimi-k2p6-2026-06-05.md`, verdict PASS. + +## Decision + +US1 accepted. The profile document lists CodeGraph, Understand-Anything, and +ast-index with distinct roles, fit, metadata snapshot, output surfaces, +approval boundaries, recommended actions, and evidence limitations. + +## Not Assessed + +- Real external tool execution or output acquisition. +- GitHub PR checks. diff --git a/docs/specs/084-external-tool-evaluation-profiles/reviews/us2-review-disposition-2026-06-05.md b/docs/specs/084-external-tool-evaluation-profiles/reviews/us2-review-disposition-2026-06-05.md new file mode 100644 index 0000000..b70fd81 --- /dev/null +++ b/docs/specs/084-external-tool-evaluation-profiles/reviews/us2-review-disposition-2026-06-05.md @@ -0,0 +1,30 @@ +# US2 Review Disposition: Candidate State Separation + +Date: 2026-06-05 + +## Verification + +- `go test -count=1 ./internal/contextprep`: passed. +- Local context smoke: + `go run ./cmd/portolan context prepare --root . --out /tmp/portolan-084-context --force`: passed. +- `jq empty /tmp/portolan-084-context/oss-plan.json`: passed. +- `answer-contract.md` contains external profile guidance and stale-profile + language: verified. +- `evidence-index.jsonl` does not contain CodeGraph, Understand-Anything, or + Claude-ast-index-search profile candidates as evidence records: verified. + +## Review + +- Opencode GLM lane with repo-grounded sub-review, raw output + `raw-us2-zai-glm51-2026-06-05.md`: PASS, no defects found. + +## Decision + +US2 accepted. Context guidance can mention profile candidates, but the +candidates do not become evidence-index facts and do not promote evidence +states. + +## Not Assessed + +- Real external tool output import. +- GitHub PR checks. diff --git a/docs/specs/084-external-tool-evaluation-profiles/reviews/us3-review-disposition-2026-06-05.md b/docs/specs/084-external-tool-evaluation-profiles/reviews/us3-review-disposition-2026-06-05.md new file mode 100644 index 0000000..9a7b873 --- /dev/null +++ b/docs/specs/084-external-tool-evaluation-profiles/reviews/us3-review-disposition-2026-06-05.md @@ -0,0 +1,42 @@ +# US3 Review Disposition: Incremental Profile Updates + +Date: 2026-06-05 + +## Verification + +- `jq empty schema/*.json`: passed. +- `go test -count=1 ./internal/contextprep`: passed. +- Local context smoke verified stale-profile guidance in generated + `answer-contract.md`: passed. + +## Review Lanes + +| Lane | Raw Output | Status | Verdict | +| --- | --- | --- | --- | +| `opencode-go/minimax-m3` via `opencode run` | `raw-us3-minimax-m3-2026-06-05.md` | assessed | CHANGES_REQUESTED | +| `kimi-for-coding/k2p6` via `opencode run` | `raw-us3-rerun-kimi-k2p6-2026-06-05.md` | not_assessed | failed with certificate verification error | +| `openrouter/xiaomi/mimo-v2.5-pro` via `opencode run` | `raw-us3-rerun-opencode-mimo-2026-06-05.md` | assessed replacement | PASS | +| `openrouter/xiaomi/mimo-v2.5-pro` via `pi` | `raw-us3-rerun-mimo-2026-06-05.md` | not_primary | sanity check only; not counted for the requested opencode review surface | + +## Accepted Findings And Fixes + +- MiniMax F-3 required explicit mapping for all profile role states. Fixed with + the Role Mapping Boundary table covering `producer_candidate`, + `ux_pattern_source`, `ready_for_import_planning`, `blocked`, and `rejected`. +- MiniMax F-5 required a producer-family schema reference. Fixed with references + to `schema/producer-family.schema.json` and + `docs/specs/053-language-agnostic-producers/`. +- MiniMax F-7 suggested stale-profile guidance in generated context. Fixed in + `internal/contextprep` and asserted in `contextprep_test.go`. + +## Decision + +US3 accepted after fixes and opencode replacement review. Maintainers can +refresh or narrow one profile while preserving schema stability and existing +producer-family decision/support-state boundaries. + +## Not Assessed + +- Future specs 085 and 086. +- Real external tool execution or output acquisition. +- GitHub PR checks. diff --git a/docs/specs/084-external-tool-evaluation-profiles/spec.md b/docs/specs/084-external-tool-evaluation-profiles/spec.md index 563ce50..0e8e575 100644 --- a/docs/specs/084-external-tool-evaluation-profiles/spec.md +++ b/docs/specs/084-external-tool-evaluation-profiles/spec.md @@ -1,11 +1,10 @@ # Feature Specification: External Tool Evaluation Profiles -**Feature Branch**: `codex/084-tool-adoption-specs` +**Feature Branch**: `codex/084-external-tool-evaluation-profiles` **Created**: 2026-06-04 -**Status**: Draft; backlog-only. Requires `plan.md` and `tasks.md` before -implementation. +**Status**: Implemented locally; PR review pending. **Input**: User asked to turn the external review of `colbymchenry/codegraph`, `Lum1104/Understand-Anything`, and `defendend/Claude-ast-index-search` into @@ -164,6 +163,17 @@ without changing graph schemas. - This slice is documentation and guidance first; it does not add an importer, execute external tools, install dependencies, or change schemas. +- Bounded context-pack guidance may add a small pointer in + `internal/contextprep`; this is not an importer, schema change, or graph + evidence fact. - External project metadata must be refreshed during implementation planning because upstream repositories can change after the 2026-06-04 review snapshot. - `not_assessed` remains a verification/status label, not a graph evidence state. + +## Related Future Slices + +- `docs/specs/085-ast-index-producer-import/` remains backlog-only and owns any + future import of explicitly supplied ast-index output. +- `docs/specs/086-evidence-navigation-ux-patterns/` remains backlog-only and + owns any future adoption of navigation UX patterns from CodeGraph, + Understand-Anything, or ast-index. diff --git a/docs/specs/084-external-tool-evaluation-profiles/tasks.md b/docs/specs/084-external-tool-evaluation-profiles/tasks.md new file mode 100644 index 0000000..a744678 --- /dev/null +++ b/docs/specs/084-external-tool-evaluation-profiles/tasks.md @@ -0,0 +1,70 @@ +# Tasks: External Tool Evaluation Profiles + +**Input**: `docs/specs/084-external-tool-evaluation-profiles/` + +## Phase 1: Setup And Planning + +- [x] T001 Verify dedicated branch/worktree and branch metadata in `docs/specs/084-external-tool-evaluation-profiles/spec.md`. +- [x] T002 Create concrete planning artifacts in `docs/specs/084-external-tool-evaluation-profiles/plan.md`, `research.md`, `data-model.md`, `quickstart.md`, and `contracts/external-tool-profile.md`. +- [x] T003 Update the SpecKit plan pointer in `AGENTS.md`. +- [x] T004 Record requirements, constitution, and product drift review in `docs/specs/084-external-tool-evaluation-profiles/reviews/requirements-product-vision-drift-2026-06-05.md`. +- [x] T005 Run three assessed non-GPT planning review lanes for `spec.md`, `plan.md`, and `tasks.md`, then record disposition in `docs/specs/084-external-tool-evaluation-profiles/reviews/planning-review-disposition-2026-06-05.md`. + +## Phase 2: Foundational Profile Contract + +- [x] T006 Add focused failing context/profile test in `internal/contextprep/contextprep_test.go` proving external tool profiles are guidance only and do not promote evidence. +- [x] T007 Create `docs/adapter-contracts/external-tool-evaluation-profiles.md` with required profile structure, candidate roles, approval boundaries, and stale-profile rules. + +## Phase 3: User Story 1 - Compare External Tools Without Overclaiming + +**Independent Test**: `docs/adapter-contracts/external-tool-evaluation-profiles.md` lists CodeGraph, Understand-Anything, and ast-index with role, fit, license, maintenance snapshot date, output surfaces, approval boundaries, and evidence limitations. + +- [x] T008 [US1] Add CodeGraph profile section to `docs/adapter-contracts/external-tool-evaluation-profiles.md`. +- [x] T009 [US1] Add Understand-Anything profile section to `docs/adapter-contracts/external-tool-evaluation-profiles.md`. +- [x] T010 [US1] Add ast-index profile section to `docs/adapter-contracts/external-tool-evaluation-profiles.md`. +- [x] T011 [US1] Update profile summary guidance in `docs/adapter-contracts/external-tool-evaluation-profiles.md` so ast-index is strongest candidate, CodeGraph is lower-fit optional, and Understand-Anything is UX-only. +- [x] T012 [US1] Run focused profile text checks and `go test -count=1 ./internal/contextprep`, then record slice review disposition in `docs/specs/084-external-tool-evaluation-profiles/reviews/us1-review-disposition-2026-06-05.md`. + +## Phase 4: User Story 2 - Preserve Tool Candidate State Separately From Evidence + +**Independent Test**: Generated context guidance can mention profile candidates while graph facts and evidence families remain unobserved until local output is supplied. + +- [x] T013 [US2] Add external profile pointer to context-pack guidance in `internal/contextprep/contextprep.go`. +- [x] T014 [US2] Ensure `internal/contextprep/contextprep_test.go` asserts candidate profiles remain `not_assessed` guidance and no external tool execution command is emitted. +- [x] T015 [US2] Run `go test -count=1 ./internal/contextprep` and a local context prepare smoke, then record slice review disposition in `docs/specs/084-external-tool-evaluation-profiles/reviews/us2-review-disposition-2026-06-05.md`. + +## Phase 5: User Story 3 - Update Adoption Decisions Incrementally + +**Independent Test**: A maintainer can refresh or change one profile classification without changing graph schemas or unrelated candidate profiles. + +- [x] T016 [US3] Add refresh/change procedure to `docs/adapter-contracts/external-tool-evaluation-profiles.md`. +- [x] T017 [US3] Verify no schema changes are needed with `jq empty schema/*.json`. +- [x] T018 [US3] Record incremental-update review disposition in `docs/specs/084-external-tool-evaluation-profiles/reviews/us3-review-disposition-2026-06-05.md`. + +## Phase 6: Final Verification And PR + +- [x] T019 Run full baseline: `go test ./...`, `go vet ./...`, `jq empty schema/*.json`, and `git diff --check`. +- [x] T020 Update `docs/product-backlog.md`, `docs/specs/084-external-tool-evaluation-profiles/spec.md`, and this task ledger to implementation-ready or implemented state as appropriate. +- [x] T021 Commit, push, create PR, and run PR review cycle under `docs/specs/084-external-tool-evaluation-profiles/reviews/`. +- [x] T022 Run PR-quality review lenses for spec drift, constitution drift, product drift, CRAP, MI, CleanArch hex, CleanCode, SOLID, DRY, and YAGNI, then record disposition; classify each lens against the touched file set, record CRAP/MI/CleanArch as `not_applicable` for docs-only files with diff evidence, and apply code-quality lenses to the bounded `internal/contextprep` change. +- [x] T023 Prepare PR readiness closeout with local verification, review evidence, GitHub checks, merge readiness, and `merge_approval: not_assessed`. + +## Dependencies + +- Phase 1 must complete before implementation. +- Phase 2 blocks all user stories. +- User Story 1 is the MVP and should complete before User Story 2. +- User Story 2 must complete before User Story 3. +- Phase 6 runs after all user stories. + +## Parallel Opportunities + +- T008, T009, and T010 can be drafted in parallel after T007. +- Review artifact drafting can proceed in parallel with local verification once command evidence is available. + +## Implementation Strategy + +Deliver User Story 1 first as the minimum useful artifact. Then add context-pack +guidance for User Story 2 and the refresh procedure for User Story 3. Keep every +slice bounded to existing docs and `internal/contextprep`; do not add external +tool execution, dependencies, schema changes, or graph evidence facts. diff --git a/internal/contextprep/contextprep.go b/internal/contextprep/contextprep.go index 5999dff..9d807c2 100644 --- a/internal/contextprep/contextprep.go +++ b/internal/contextprep/contextprep.go @@ -2431,6 +2431,7 @@ func renderAnswerContract(root, out string) string { fmt.Fprintf(&b, "Relationship answers must name both relationship kind and evidence type. For relationship claims, including \"what talks to what?\", look first at `evidence-index.jsonl`, `tool-registry.json`, `gaps.jsonl`, then map-bundle `summary.json`, `graph-index.json`, and `findings.jsonl`. `evidence-index.jsonl` may include build/deploy relationship candidates such as build manifests, distribution manifests, RPM specs, and deployment manifests; those are source-visible places to inspect, not parsed service topology. Native map relationship extraction is limited to Go imports and go.mod manifests; JVM, PHP, Scala, and other non-Go coupling stays `not_assessed` unless supplied through local producer output. `source-visible` and `metadata-visible` records do not prove runtime communication; runtime topology is `not_assessed` unless runtime-visible local observations were supplied and inspected. Dependency and symbol records from local producer outputs do not mean Portolan has native PHP, JVM, Scala, or other language semantics; they are producer evidence. Missing relationship surfaces remain `unknown`, `cannot_verify`, or `not_assessed`; `claim-only` remains a claim, not observed evidence.\n\n") fmt.Fprintf(&b, "## Producer Family Recommendations\n\n") fmt.Fprintf(&b, "Producer recommendations are options, not observed evidence. Treat `producer-recommendation` records in `evidence-index.jsonl` as a safe next-action surface for missing local producer families; they do not prove the candidate tool is installed, supported, or appropriate for this landscape. Candidate tools marked `candidate_only` remain `not_assessed` until local output or a local evaluation record exists. Portolan does not synthesize producer evaluations from recommendation records; if no `producer-evaluation` record is present, candidate evaluation remains `not_assessed`. Check both `verification_state` and `support_state`: `verification_state` describes local evidence for the candidate, while `support_state` describes whether Portolan can present it as supported. Do not propose a Portolan-owned PHP/JVM/Scala adapter as the default answer to language coverage gaps; ask for or evaluate local dependency, symbol-index, API/catalog, deployment/model, static finding, duplication, config, or runtime-observation producer evidence instead.\n\n") + fmt.Fprintf(&b, "External tool evaluation profiles are adoption guidance, not evidence. Read `docs/adapter-contracts/external-tool-evaluation-profiles.md` before recommending CodeGraph, Understand-Anything, or ast-index, and treat the profile as stale if its `Last refreshed` date is older than the current review window. Current guidance: ast-index is the strongest current symbol/reference producer candidate, CodeGraph is lower-fit optional, and Understand-Anything is UX inspiration. Do not promote candidate profiles to source-visible, metadata-visible, or runtime-visible evidence; local output and a Portolan import/evaluation contract are required first.\n\n") fmt.Fprintf(&b, "Tool acquisition guidance is stack-agnostic. Candidate tools are local producer options, not Portolan-owned language adapters. Use `oss-plan.json` acquisition fields to decide whether a tool is installed, missing, or requires local evaluation, and to name approval, network, cache, mutation, and output-path risks before asking the operator to pull it in. Do not propose a Portolan-owned PHP/JVM/Scala/Gradle adapter as the default answer to language coverage gaps; ask for local producer output from an appropriate existing tool family and keep evidence `not_assessed` until that output is present and re-ingested.\n\n") fmt.Fprintf(&b, "Build-system and SBOM producer output can be the right first evidence path when dependency/build manifests are visible, but the choice is landscape-specific and approval-gated. Java, Scala, Maven, Gradle, PHP, Composer, and other ecosystem relationships remain `not_assessed` until local producer output exists and is inspected. `oss-plan.json` may list concrete options such as Maven, Gradle, Composer, Syft, or CycloneDX-compatible producers when matching local surfaces are visible, but those options do not imply Portolan executed the tool or owns that stack. Keep visible manifests as native producer-evidence acquisition questions, not requests for Portolan-owned language or build-system adapters.\n\n") fmt.Fprintf(&b, "## Producer Run Records\n\n") diff --git a/internal/contextprep/contextprep_test.go b/internal/contextprep/contextprep_test.go index 91497cf..c03d255 100644 --- a/internal/contextprep/contextprep_test.go +++ b/internal/contextprep/contextprep_test.go @@ -190,6 +190,40 @@ func TestRunWritesStackAgnosticToolAcquisitionGuidance(t *testing.T) { } } +func TestRunWritesExternalToolProfileGuidanceWithoutPromotingEvidence(t *testing.T) { + root := t.TempDir() + repo := filepath.Join(root, "repos", "service") + mustMkdirContextprep(t, filepath.Join(repo, ".git")) + mustWriteContextprep(t, filepath.Join(repo, "main.go"), "package main\n") + + out := filepath.Join(root, ".portolan", "context") + if _, err := Run(Options{RootPath: root, OutputPath: out, Profile: "agent"}); err != nil { + t.Fatalf("Run returned error: %v", err) + } + + contract := mustReadContextprep(t, filepath.Join(out, "answer-contract.md")) + for _, want := range []string{ + "External tool evaluation profiles are adoption guidance, not evidence", + "docs/adapter-contracts/external-tool-evaluation-profiles.md", + "treat the profile as stale if its `Last refreshed` date is older than the current review window", + "ast-index is the strongest current symbol/reference producer candidate", + "CodeGraph is lower-fit optional", + "Understand-Anything is UX inspiration", + "Do not promote candidate profiles to source-visible, metadata-visible, or runtime-visible evidence", + } { + if !strings.Contains(contract, want) { + t.Fatalf("answer-contract.md missing %q:\n%s", want, contract) + } + } + + evidenceIndex := strings.ToLower(mustReadContextprep(t, filepath.Join(out, "evidence-index.jsonl"))) + for _, forbidden := range []string{"codegraph", "Understand-Anything", "Claude-ast-index-search"} { + if strings.Contains(evidenceIndex, strings.ToLower(forbidden)) { + t.Fatalf("evidence-index.jsonl contains profile candidate %q as evidence:\n%s", forbidden, evidenceIndex) + } + } +} + func TestResolveBuildToolExecutablePrefersExecutableWrapper(t *testing.T) { repo := t.TempDir() bin := filepath.Join(t.TempDir(), "bin")