From 290a0eaffb953f209952e5e40b294c95f8c66bb5 Mon Sep 17 00:00:00 2001
From: Codex
Date: Wed, 20 May 2026 18:10:03 -0400
Subject: [PATCH] =?UTF-8?q?feat(scripts):=20audit-bd-memories=20=E2=80=94?=
 =?UTF-8?q?=20near-duplicate=20+=20retired-surface=20scanner=20(soc-lgq4?=
 =?UTF-8?q?=20#memory-audit)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

154+ bd memories accumulating. This script computes pairwise Jaccard on
content tokens (default threshold 0.65) and scans for retired-surface
keywords. Emits markdown report at .agents/audits/bd-memories-<date>.md;
does not auto-delete (operator runs bd forget selectively).

Flags: --threshold --out --stdout --retired --no-retired --no-dups --json.
Pure shell + awk; bd stubbed in tests for determinism.

A --threshold that is not a decimal in 0..1 is a usage error (exit 2), and
a failing `bd memories` maps to the documented exit 3.

Smoke against real memories surfaced 3 near-dup pairs + 23 retired-surface
mentions.

Closes-scenario: soc-lgq4#memory-audit
Bounded-context: BC1-Corpus
Evidence: scripts/audit-bd-memories.sh
Evidence: tests/scripts/audit-bd-memories.bats
---
Reviewer notes (ignored by git am):

* This patch was rebuilt from a copy whose newlines were collapsed and
  whose <...> spans were stripped. The pair-loop header in step 2, the
  stub_bd heredoc body and the write_corpus_basic opener are
  reconstructions; compare them with the original commit before applying.
* The author address on the From: header was lost and has not been
  invented; the blob ids on the index lines no longer match the content.
* Changes on top of the reconstruction: --threshold validation, exit 3 when
  `bd memories` itself fails, and a single grep pass (drained by sed, not
  head) in the retired-surface scan so pipefail cannot trip on SIGPIPE.

 scripts/audit-bd-memories.sh         | 229 +++++++++++++++++++++++++++
 tests/scripts/audit-bd-memories.bats | 188 ++++++++++++++++++++++
 2 files changed, 417 insertions(+)
 create mode 100755 scripts/audit-bd-memories.sh
 create mode 100644 tests/scripts/audit-bd-memories.bats

diff --git a/scripts/audit-bd-memories.sh b/scripts/audit-bd-memories.sh
new file mode 100755
index 000000000..59f88a2c5
--- /dev/null
+++ b/scripts/audit-bd-memories.sh
@@ -0,0 +1,229 @@
+#!/usr/bin/env bash
+# audit-bd-memories.sh — surface duplicate / stale-surface bd memories.
+#
+# 156+ bd memories as of 2026-05-20. Without curation, recall quality
+# degrades (same lesson stored 3 ways; old lessons referencing retired
+# surfaces like Ollama / shepherd-cron). This script does NOT delete
+# anything; it produces a markdown report at
+# `.agents/audits/bd-memories-<date>.md` with three sections:
+#
+#   NEAR-DUPLICATES  memory pairs with content jaccard >= threshold
+#   RETIRED-SURFACE  memories whose body mentions terms in
+#                    the retired-surfaces list
+#   SUMMARY          total / candidates-for-review counts
+#
+# Operator reviews and selectively runs `bd forget <key>`.
+#
+# Flags:
+#   --threshold <0..1>  Jaccard similarity floor for near-duplicates
+#                       (default: 0.65)
+#   --out <path>        Output markdown path
+#                       (default: .agents/audits/bd-memories-<date>.md)
+#   --stdout            Emit markdown to stdout (skip file write)
+#   --retired <csv>     Override retired-surface keywords list
+#   --no-retired        Skip retired-surface section
+#   --no-dups           Skip near-duplicate section
+#   --json              Machine-readable summary (skips markdown)
+#
+# Exit codes:
+#   0 — audit completed (whether candidates were found or not)
+#   2 — usage error (unknown flag, invalid --threshold)
+#   3 — bd unavailable, bd failed, or bd returned no memories
+
+set -euo pipefail
+
+THRESHOLD="0.65"
+OUT_PATH=""
+TO_STDOUT=0
+JSON=0
+INCLUDE_DUPS=1
+INCLUDE_RETIRED=1
+RETIRED_DEFAULT="ollama,shepherd-cron,openclaw,gemma,morai-codex,d:\\\\dream,dreamworker"
+RETIRED_LIST="$RETIRED_DEFAULT"
+
+usage() {
+  sed -n '2,/^$/p' "$0" | sed 's/^# \{0,1\}//'
+  exit "${1:-0}"
+}
+
+while [ $# -gt 0 ]; do
+  case "$1" in
+    --threshold) shift; THRESHOLD="${1:-0.65}" ;;
+    --out) shift; OUT_PATH="${1:-}" ;;
+    --stdout) TO_STDOUT=1 ;;
+    --retired) shift; RETIRED_LIST="${1:-}" ;;
+    --no-retired) INCLUDE_RETIRED=0 ;;
+    --no-dups) INCLUDE_DUPS=0 ;;
+    --json) JSON=1 ;;
+    -h|--help) usage 0 ;;
+    *) echo "audit-bd-memories: unknown arg: $1" >&2; usage 2 ;;
+  esac
+  shift || true
+done
+
+# The threshold is compared numerically by awk and interpolated unquoted into
+# the --json output, so reject anything that is not a plain decimal in 0..1.
+threshold_re='^(0(\.[0-9]+)?|1(\.0+)?)$'
+if ! [[ "$THRESHOLD" =~ $threshold_re ]]; then
+  echo "audit-bd-memories: --threshold must be a number in 0..1 (got: $THRESHOLD)" >&2
+  exit 2
+fi
+
+if ! command -v bd >/dev/null 2>&1; then
+  echo "audit-bd-memories: bd CLI not available" >&2
+  exit 3
+fi
+
+DATE_STR="$(date -u +%Y-%m-%d)"
+if [ -z "$OUT_PATH" ]; then
+  OUT_PATH=".agents/audits/bd-memories-$DATE_STR.md"
+fi
+
+# Step 1: parse `bd memories` into a TSV of "key\tcontent". The format is:
+#   Memories (N):      ← header line
+#                      ← blank
+#   <key>              ← 2-space indent
+#     <content>...     ← 4-space indent (may be truncated)
+#                      ← blank between memories
+TMP_TSV="$(mktemp)"
+# The tokens dir is `$TMP_TSV.tokens`, so use -rf to clean the whole sibling set.
+trap 'rm -rf "$TMP_TSV" "$TMP_TSV".*' EXIT
+
+# Run the pipeline inside `if` so a failing `bd memories` (surfaced by
+# pipefail) maps to the documented exit code 3 instead of bd's raw status.
+if ! bd memories 2>/dev/null | awk '
+  /^Memories \(/ { next }
+  /^  [^ ]/ {
+    if (key) { print key "\t" content }
+    sub(/^  /, ""); key=$0; content=""; next
+  }
+  /^    / {
+    sub(/^    /, "")
+    content = (content == "" ? $0 : content " " $0)
+    next
+  }
+  /^$/ { next }
+  END { if (key) { print key "\t" content } }
+' > "$TMP_TSV"; then
+  echo "audit-bd-memories: bd memories failed" >&2
+  exit 3
+fi
+
+count=$(wc -l < "$TMP_TSV" | tr -d ' ')
+if [ "$count" -eq 0 ]; then
+  echo "audit-bd-memories: no memories found" >&2
+  exit 3
+fi
+
+# Step 2: near-duplicate detection via Jaccard on word-token sets.
+# We compute one token-set file per memory under $TMP_TSV.tokens/,
+# then walk pairs.
+mkdir -p "$TMP_TSV.tokens"
+i=0
+keys_file="$TMP_TSV.keys"
+: > "$keys_file"
+while IFS=$'\t' read -r key content; do
+  i=$((i + 1))
+  printf '%s\n' "$key" >> "$keys_file"
+  printf '%s\n' "$content" | tr 'A-Z' 'a-z' | tr -c 'a-z0-9' '\n' \
+    | awk 'length($0) >= 3' | sort -u > "$TMP_TSV.tokens/$i"
+done < "$TMP_TSV"
+
+# Helper: jaccard A B → prints decimal 0..1 (0 when both empty)
+jaccard() {
+  local a="$1" b="$2" union inter
+  inter="$(comm -12 "$a" "$b" 2>/dev/null | wc -l | tr -d ' ')"
+  union="$(cat "$a" "$b" | sort -u | wc -l | tr -d ' ')"
+  if [ "$union" -eq 0 ]; then
+    echo "0"
+  else
+    awk -v i="$inter" -v u="$union" 'BEGIN { printf "%.3f", i/u }'
+  fi
+}
+
+# Collect (key_a, key_b, score) for pairs above threshold.
+DUPS_FILE="$TMP_TSV.dups"
+: > "$DUPS_FILE"
+if [ "$INCLUDE_DUPS" -eq 1 ] && [ "$count" -gt 1 ]; then
+  for ((a=1; a<count; a++)); do
+    key_a="$(sed -n "${a}p" "$keys_file")"
+    for ((b=a+1; b<=count; b++)); do
+      score="$(jaccard "$TMP_TSV.tokens/$a" "$TMP_TSV.tokens/$b")"
+      # bash arithmetic is integer-only, so let awk do the float comparison.
+      if awk -v s="$score" -v t="$THRESHOLD" 'BEGIN { exit !(s+0 >= t+0) }'; then
+        key_b="$(sed -n "${b}p" "$keys_file")"
+        printf '%s\t%s\t%s\n' "$score" "$key_a" "$key_b" >> "$DUPS_FILE"
+      fi
+    done
+  done
+  # Sort highest-score first.
+  sort -r -o "$DUPS_FILE" "$DUPS_FILE"
+fi
+
+dup_count="$(wc -l < "$DUPS_FILE" | tr -d ' ')"
+
+# Step 3: retired-surface scan.
+RETIRED_FILE="$TMP_TSV.retired"
+: > "$RETIRED_FILE"
+if [ "$INCLUDE_RETIRED" -eq 1 ] && [ -n "$RETIRED_LIST" ]; then
+  # Convert csv to alternation regex.
+  pattern="$(printf '%s' "$RETIRED_LIST" | tr ',' '|')"
+  while IFS=$'\t' read -r key content; do
+    # Single grep pass. sed (unlike `head -1`) reads the whole stream, so grep
+    # can never die of SIGPIPE and trip pipefail; `|| true` covers "no match".
+    hit="$(printf '%s\n' "$content" | grep -ioE -e "$pattern" | sed -n '1p' || true)"
+    if [ -n "$hit" ]; then
+      printf '%s\t%s\n' "$key" "$hit" >> "$RETIRED_FILE"
+    fi
+  done < "$TMP_TSV"
+fi
+retired_count="$(wc -l < "$RETIRED_FILE" | tr -d ' ')"
+
+# Step 4: emit output.
+if [ "$JSON" -eq 1 ]; then
+  printf '{"total":%d,"near_duplicates":%d,"retired_candidates":%d,"threshold":%s}\n' \
+    "$count" "$dup_count" "$retired_count" "$THRESHOLD"
+  exit 0
+fi
+
+emit_markdown() {
+  printf '# bd memories audit — %s\n\n' "$DATE_STR"
+  printf '*Inspected %d memories. Jaccard threshold: %s.*\n\n' "$count" "$THRESHOLD"
+  printf '## Summary\n\n'
+  printf -- '- Total memories: **%d**\n' "$count"
+  printf -- '- Near-duplicate pairs (>= %s jaccard): **%d**\n' "$THRESHOLD" "$dup_count"
+  printf -- '- Retired-surface candidates: **%d**\n' "$retired_count"
+
+  if [ "$INCLUDE_DUPS" -eq 1 ]; then
+    printf '\n## Near-duplicates\n\n'
+    if [ "$dup_count" -eq 0 ]; then
+      printf '*(none)*\n'
+    else
+      printf '| Score | Key A | Key B |\n'
+      printf '|---|---|---|\n'
+      awk -F'\t' '{ printf "| %s | `%s` | `%s` |\n", $1, $2, $3 }' "$DUPS_FILE"
+    fi
+  fi
+
+  if [ "$INCLUDE_RETIRED" -eq 1 ]; then
+    printf '\n## Retired-surface candidates\n\n'
+    if [ "$retired_count" -eq 0 ]; then
+      printf '*(none)*\n'
+    else
+      printf '*Pattern: %s*\n\n' "$RETIRED_LIST"
+      printf '| Key | Matched term |\n'
+      printf '|---|---|\n'
+      awk -F'\t' '{ printf "| `%s` | %s |\n", $1, $2 }' "$RETIRED_FILE"
+    fi
+  fi
+  printf '\n---\n*Generated by `scripts/audit-bd-memories.sh`. Operator reviews and selectively runs `bd forget <key>`.*\n'
+}
+
+if [ "$TO_STDOUT" -eq 1 ]; then
+  emit_markdown
+else
+  mkdir -p "$(dirname "$OUT_PATH")"
+  emit_markdown > "$OUT_PATH"
+  echo "audit-bd-memories: wrote $OUT_PATH"
+  echo "audit-bd-memories: $count memories scanned, $dup_count near-dup pair(s), $retired_count retired-surface match(es)"
+fi
diff --git a/tests/scripts/audit-bd-memories.bats b/tests/scripts/audit-bd-memories.bats
new file mode 100644
index 000000000..e2d6e6b43
--- /dev/null
+++ b/tests/scripts/audit-bd-memories.bats
@@ -0,0 +1,188 @@
+#!/usr/bin/env bats
+# Regression tests for scripts/audit-bd-memories.sh (soc-lgq4).
+#
+# The script shells out to `bd memories`. We stub that binary via PATH
+# so tests get deterministic input without hitting the real dolt store.
+
+setup() {
+  REPO_ROOT="$(git rev-parse --show-toplevel)"
+  SCRIPT="$REPO_ROOT/scripts/audit-bd-memories.sh"
+  TMP="$(mktemp -d)"
+  ORIG_DIR="$PWD"
+  ORIG_PATH="$PATH"
+  mkdir -p "$TMP/bin"
+}
+
+teardown() {
+  cd "$ORIG_DIR" 2>/dev/null || true
+  export PATH="$ORIG_PATH"
+  rm -rf "$TMP"
+}
+
+# stub_bd <file> — write a bd shim that emits the file.
+stub_bd() {
+  local out_file="$1"
+  cat >"$TMP/bin/bd" <<EOF
+#!/usr/bin/env bash
+# Test stub: replays canned output for the memories subcommand.
+if [ "\$1" = "memories" ]; then
+  cat "$out_file"
+fi
+exit 0
+EOF
+  chmod +x "$TMP/bin/bd"
+  export PATH="$TMP/bin:$PATH"
+}
+
+# write_corpus_basic — identical pair, one unrelated entry, one retired mention.
+write_corpus_basic() {
+  cat >"$TMP/mems.txt" <<'EOF'
+Memories (4):
+
+  alpha-one
+    The quick brown fox jumps over the lazy dog repeatedly.
+
+  alpha-two
+    The quick brown fox jumps over the lazy dog repeatedly.
+
+  beta-distinct
+    Completely unrelated content about systemd timers and journald.
+
+  retired-mention
+    Old lesson about ollama gemma morai-codex pipelines that no longer apply.
+EOF
+}
+
+run_audit() {
+  cd "$TMP"
+  run "$SCRIPT" "$@"
+}
+
+@test "exits 3 when no bd memories present" {
+  cat >"$TMP/mems.txt" <<'EOF'
+Memories (0):
+
+EOF
+  stub_bd "$TMP/mems.txt"
+  run_audit --json
+  [ "$status" -eq 3 ]
+}
+
+@test "--json reports counts on a 4-memory corpus" {
+  write_corpus_basic
+  stub_bd "$TMP/mems.txt"
+  run_audit --json
+  [ "$status" -eq 0 ]
+  echo "$output" | jq -e '.total == 4' >/dev/null
+  # alpha-one and alpha-two are byte-identical → 1.0 jaccard, definitely above threshold.
+  echo "$output" | jq -e '.near_duplicates >= 1' >/dev/null
+  # retired-mention contains "ollama" → matches default pattern.
+  echo "$output" | jq -e '.retired_candidates >= 1' >/dev/null
+}
+
+@test "default markdown output lands under .agents/audits/" {
+  write_corpus_basic
+  stub_bd "$TMP/mems.txt"
+  # We need .agents/ to be writable; the script creates the audit dir.
+  run_audit
+  [ "$status" -eq 0 ]
+  files=$(ls "$TMP/.agents/audits/bd-memories-"*.md 2>/dev/null | wc -l | tr -d ' ')
+  [ "$files" -eq 1 ]
+}
+
+@test "--stdout emits markdown instead of writing a file" {
+  write_corpus_basic
+  stub_bd "$TMP/mems.txt"
+  run_audit --stdout
+  [ "$status" -eq 0 ]
+  [[ "$output" == *"# bd memories audit"* ]]
+  [[ "$output" == *"## Near-duplicates"* ]]
+  [[ "$output" == *"## Retired-surface candidates"* ]]
+  # No file should have been written.
+  ! ls "$TMP/.agents/audits/bd-memories-"*.md 2>/dev/null
+}
+
+@test "near-duplicates table includes the duplicate keys" {
+  write_corpus_basic
+  stub_bd "$TMP/mems.txt"
+  run_audit --stdout
+  [ "$status" -eq 0 ]
+  [[ "$output" == *"alpha-one"* ]]
+  [[ "$output" == *"alpha-two"* ]]
+}
+
+@test "--threshold 0.99 raises the bar; identical pairs still pass, near misses don't" {
+  cat >"$TMP/mems.txt" <<'EOF'
+Memories (2):
+
+  a-mostly-same
+    apple banana cherry date elderberry fig grape
+
+  b-mostly-same
+    apple banana cherry date elderberry fig pear
+EOF
+  stub_bd "$TMP/mems.txt"
+  run_audit --threshold 0.99 --json
+  [ "$status" -eq 0 ]
+  # 6 of 8 unique words shared = 0.75 jaccard → below 0.99.
+  echo "$output" | jq -e '.near_duplicates == 0' >/dev/null
+}
+
+@test "--no-dups suppresses near-duplicate scanning entirely" {
+  write_corpus_basic
+  stub_bd "$TMP/mems.txt"
+  run_audit --stdout --no-dups
+  [ "$status" -eq 0 ]
+  [[ "$output" != *"## Near-duplicates"* ]]
+}
+
+@test "--no-retired suppresses retired-surface section" {
+  write_corpus_basic
+  stub_bd "$TMP/mems.txt"
+  run_audit --stdout --no-retired
+  [ "$status" -eq 0 ]
+  [[ "$output" != *"## Retired-surface candidates"* ]]
+}
+
+@test "--retired overrides default retired-keyword list" {
+  cat >"$TMP/mems.txt" <<'EOF'
+Memories (2):
+
+  a-clean
+    nothing notable about this one
+
+  b-special
+    this memory mentions cobalt-strike very loudly
+EOF
+  stub_bd "$TMP/mems.txt"
+  run_audit --stdout --retired "cobalt-strike"
+  [ "$status" -eq 0 ]
+  [[ "$output" == *"b-special"* ]]
+  [[ "$output" == *"cobalt-strike"* ]]
+}
+
+@test "unknown flag exits 2 with usage error" {
+  stub_bd "$TMP/mems.txt"
+  run_audit --weasel
+  [ "$status" -eq 2 ]
+  [[ "$output" == *"unknown"* ]]
+}
+
+@test "--threshold rejects a non-numeric value with exit 2" {
+  stub_bd "$TMP/mems.txt"
+  run_audit --threshold banana --json
+  [ "$status" -eq 2 ]
+  [[ "$output" == *"--threshold"* ]]
+}
+
+@test "missing bd binary exits 3" {
+  # Don't stub bd; ensure it's not on the test PATH while keeping coreutils.
+  mkdir -p "$TMP/coreutils-only"
+  for cmd in bash sh sed awk grep tr sort comm cat mkdir mv rm cp ls wc dirname basename head tail printf jq mktemp; do
+    full="$(command -v "$cmd" 2>/dev/null || true)"
+    [ -n "$full" ] && ln -sf "$full" "$TMP/coreutils-only/$cmd"
+  done
+  export PATH="$TMP/coreutils-only"
+  run_audit --json
+  [ "$status" -eq 3 ]
+}