boshu2 · boshu2 · May 20, 2026 · May 20, 2026
@@ -0,0 +1,214 @@
+#!/usr/bin/env bash
+# audit-bd-memories.sh — surface duplicate / stale-surface bd memories.
+#
+# 156+ bd memories as of 2026-05-20. Without curation, recall quality
+# degrades (same lesson stored 3 ways; old lessons referencing retired
+# surfaces like Ollama / shepherd-cron). This script does NOT delete
+# anything; it produces a markdown report at
+# `.agents/audits/bd-memories-<YYYY-MM-DD>.md` with three sections:
+#
+#   NEAR-DUPLICATES         memory pairs with content jaccard >= threshold
+#   RETIRED-SURFACE         memories whose body mentions terms in
+#                            the retired-surfaces list
+#   SUMMARY                 total / candidates-for-review counts
+#
+# Operator reviews and selectively runs `bd forget <key>`.
+#
+# Flags:
+#   --threshold <0..1>   Jaccard similarity floor for near-duplicates
+#                         (default: 0.65)
+#   --out <path>         Output markdown path
+#                         (default: .agents/audits/bd-memories-<date>.md)
+#   --stdout             Emit markdown to stdout (skip file write)
+#   --retired <csv>      Override retired-surface keywords list
+#   --no-retired         Skip retired-surface section
+#   --no-dups            Skip near-duplicate section
+#   --json               Machine-readable summary (skips markdown)
+#
+# Exit codes:
+#   0 — audit completed (whether candidates were found or not)
+#   2 — usage error
+#   3 — bd unavailable or returned no memories
+
+set -euo pipefail
+
+THRESHOLD="0.65"
+OUT_PATH=""
+TO_STDOUT=0
+JSON=0
+INCLUDE_DUPS=1
+INCLUDE_RETIRED=1
+RETIRED_DEFAULT="ollama,shepherd-cron,openclaw,gemma,morai-codex,d:\\\\dream,dreamworker"
+RETIRED_LIST="$RETIRED_DEFAULT"
+
+# usage [exit-code] — print this file's header comment as help, then exit.
+# The help text is the span from line 2 of the script through the first
+# blank line, with the leading "# " stripped from each line.
+# Arguments: $1 - exit status to use (default 0)
+# Outputs:   help on stdout when the status is 0 (-h/--help); on stderr
+#            otherwise, so usage errors never end up in stdout that a
+#            caller may be parsing.
+usage() {
+  local code="${1:-0}"
+  if [ "$code" -eq 0 ]; then
+    sed -n '2,/^$/p' "$0" | sed 's/^# \{0,1\}//'
+  else
+    sed -n '2,/^$/p' "$0" | sed 's/^# \{0,1\}//' >&2
+  fi
+  exit "$code"
+}
+
+while [ $# -gt 0 ]; do
+  case "$1" in
+    --threshold) shift; THRESHOLD="${1:-0.65}" ;;
+    --out) shift; OUT_PATH="${1:-}" ;;
+    --stdout) TO_STDOUT=1 ;;
+    --retired) shift; RETIRED_LIST="${1:-}" ;;
+    --no-retired) INCLUDE_RETIRED=0 ;;
+    --no-dups) INCLUDE_DUPS=0 ;;
+    --json) JSON=1 ;;
+    -h|--help) usage 0 ;;
+    *) echo "audit-bd-memories: unknown arg: $1" >&2; usage 2 ;;
+  esac
+  shift || true
+done
+
+if ! command -v bd >/dev/null 2>&1; then
+  echo "audit-bd-memories: bd CLI not available" >&2
+  exit 3
+fi
+
+DATE_STR="$(date -u +%Y-%m-%d)"
+if [ -z "$OUT_PATH" ]; then
+  OUT_PATH=".agents/audits/bd-memories-$DATE_STR.md"
+fi
+
+# Step 1: parse `bd memories` into a TSV of "key\tcontent". The format is:
+#   Memories (N):                  ← header line
+#                                  ← blank
+#     <key>                        ← 2-space indent
+#       <content snippet>...       ← 4-space indent (may be truncated)
+#                                  ← blank between memories
+TMP_TSV="$(mktemp)"
+# Every scratch file is a sibling named "$TMP_TSV.<suffix>" (tokens dir, raw
+# dump, ...), so -rf on the glob cleans the whole set.
+trap 'rm -rf "$TMP_TSV" "$TMP_TSV".*' EXIT
+
+# Capture bd's output first, so that a failing `bd memories` is reported with
+# the documented exit code 3 instead of aborting silently through pipefail.
+raw_file="$TMP_TSV.raw"
+if ! bd memories > "$raw_file" 2>/dev/null; then
+  echo "audit-bd-memories: bd memories failed" >&2
+  exit 3
+fi
+
+# Multi-line content is joined with single spaces. The "have" flag (rather
+# than the truthiness of key) decides whether a record is pending, so a key
+# that awk would treat as false, such as 0, is still emitted.
+awk '
+  /^Memories \(/ { next }
+  /^  [^ ]/ {
+    if (have) { print key "\t" content }
+    sub(/^  /, ""); key = $0; content = ""; have = 1; next
+  }
+  /^    / {
+    sub(/^    /, "")
+    content = (content == "" ? $0 : content " " $0)
+    next
+  }
+  /^$/ { next }
+  END { if (have) { print key "\t" content } }
+' "$raw_file" > "$TMP_TSV"
+
+count=$(wc -l < "$TMP_TSV" | tr -d ' ')
+if [ "$count" -eq 0 ]; then
+  echo "audit-bd-memories: no memories found" >&2
+  exit 3
+fi
+
+# Step 2: near-duplicate detection via Jaccard on word-token sets.
+# We compute one token-set file per memory under $TMP_TSV.tokens/<n>,
+# then walk pairs.
+mkdir -p "$TMP_TSV.tokens"
+i=0
+keys_file="$TMP_TSV.keys"
+: > "$keys_file"
+while IFS=$'\t' read -r key content; do
+  i=$((i + 1))
+  printf '%s\n' "$key" >> "$keys_file"
+  printf '%s\n' "$content" | tr 'A-Z' 'a-z' | tr -c 'a-z0-9' '\n' \
+    | awk 'length($0) >= 3' | sort -u > "$TMP_TSV.tokens/$i"
+done < "$TMP_TSV"
+
+# jaccard A B — similarity of two token files.
+# Arguments: $1, $2 - paths to files holding one token per line; comm
+#            needs them sorted.
+# Outputs:   |A ∩ B| / |A ∪ B| with three decimals on stdout, or "0" when
+#            the union is empty (both files empty).
+jaccard() {
+  local left="$1" right="$2"
+  local shared total
+  shared="$(comm -12 "$left" "$right" 2>/dev/null | wc -l | tr -d ' ')"
+  total="$(sort -u "$left" "$right" | wc -l | tr -d ' ')"
+  if [ "$total" -eq 0 ]; then
+    echo "0"
+    return
+  fi
+  awk -v i="$shared" -v u="$total" 'BEGIN { printf "%.3f", i/u }'
+}
+
+# Collect (key_a, key_b, score) for pairs above threshold.
+DUPS_FILE="$TMP_TSV.dups"
+: > "$DUPS_FILE"
+if [ "$INCLUDE_DUPS" -eq 1 ] && [ "$count" -gt 1 ]; then
+  for ((a=1; a<count; a++)); do
+    key_a="$(sed -n "${a}p" "$keys_file")"
+    for ((b=a+1; b<=count; b++)); do
+      score="$(jaccard "$TMP_TSV.tokens/$a" "$TMP_TSV.tokens/$b")"
+      # awk for compare so we can compare decimals robustly
+      if awk -v s="$score" -v t="$THRESHOLD" 'BEGIN { exit !(s+0 >= t+0) }'; then
+        key_b="$(sed -n "${b}p" "$keys_file")"
+        printf '%s\t%s\t%s\n' "$score" "$key_a" "$key_b" >> "$DUPS_FILE"
+      fi
+    done
+  done
+  # Sort highest-score first.
+  sort -r -o "$DUPS_FILE" "$DUPS_FILE"
+fi
+
+dup_count="$(wc -l < "$DUPS_FILE" | tr -d ' ')"
+
+# Step 3: retired-surface scan.
+RETIRED_FILE="$TMP_TSV.retired"
+: > "$RETIRED_FILE"
+if [ "$INCLUDE_RETIRED" -eq 1 ] && [ -n "$RETIRED_LIST" ]; then
+  # Convert csv to alternation regex.
+  pattern="$(printf '%s' "$RETIRED_LIST" | tr ',' '|')"
+  while IFS=$'\t' read -r key content; do
+    if printf '%s' "$content" | grep -iqE "$pattern"; then
+      hit="$(printf '%s' "$content" | grep -ioE "$pattern" | head -1)"
+      printf '%s\t%s\n' "$key" "$hit" >> "$RETIRED_FILE"
+    fi
+  done < "$TMP_TSV"
+fi
+retired_count="$(wc -l < "$RETIRED_FILE" | tr -d ' ')"
+
+# Step 4: emit output.
+# --json prints a one-line summary object and skips the markdown report.
+if [ "$JSON" -eq 1 ]; then
+  # THRESHOLD is user input. Print it verbatim only when it already is a
+  # valid JSON number; otherwise (".5", "1.", "abc", ...) print the numeric
+  # value awk derives from it, which is what the pair comparison used.
+  json_number_re='^(0|[1-9][0-9]*)(\.[0-9]+)?$'
+  if [[ "$THRESHOLD" =~ $json_number_re ]]; then
+    json_threshold="$THRESHOLD"
+  else
+    json_threshold="$(awk -v t="$THRESHOLD" 'BEGIN { printf "%s", t + 0 }')"
+  fi
+  printf '{"total":%d,"near_duplicates":%d,"retired_candidates":%d,"threshold":%s}\n' \
+    "$count" "$dup_count" "$retired_count" "$json_threshold"
+  exit 0
+fi
+
+emit_markdown() {
+  printf '# bd memories audit — %s\n\n' "$DATE_STR"
+  printf '*Inspected %d memories. Jaccard threshold: %s.*\n\n' "$count" "$THRESHOLD"
+  printf '## Summary\n\n'
+  printf -- '- Total memories: **%d**\n' "$count"
+  printf -- '- Near-duplicate pairs (>= %s jaccard): **%d**\n' "$THRESHOLD" "$dup_count"
+  printf -- '- Retired-surface candidates: **%d**\n' "$retired_count"
+
+  if [ "$INCLUDE_DUPS" -eq 1 ]; then
+    printf '\n## Near-duplicates\n\n'
+    if [ "$dup_count" -eq 0 ]; then
+      printf '*(none)*\n'
+    else
+      printf '| Score | Key A | Key B |\n'
+      printf '|---|---|---|\n'
+      awk -F'\t' '{ printf "| %s | `%s` | `%s` |\n", $1, $2, $3 }' "$DUPS_FILE"
+    fi
+  fi
+
+  if [ "$INCLUDE_RETIRED" -eq 1 ]; then
+    printf '\n## Retired-surface candidates\n\n'
+    if [ "$retired_count" -eq 0 ]; then
+      printf '*(none)*\n'
+    else
+      printf '*Pattern: %s*\n\n' "$RETIRED_LIST"
+      printf '| Key | Matched term |\n'
+      printf '|---|---|\n'
+      awk -F'\t' '{ printf "| `%s` | %s |\n", $1, $2 }' "$RETIRED_FILE"
+    fi
+  fi
+  printf '\n---\n*Generated by `scripts/audit-bd-memories.sh`. Operator reviews and selectively runs `bd forget <key>`.*\n'
+}
+
+if [ "$TO_STDOUT" -eq 1 ]; then
+  emit_markdown
+else
+  mkdir -p "$(dirname "$OUT_PATH")"
+  emit_markdown > "$OUT_PATH"
+  echo "audit-bd-memories: wrote $OUT_PATH"
+  echo "audit-bd-memories: $count memories scanned, $dup_count near-dup pair(s), $retired_count retired-surface match(es)"
+fi
@@ -0,0 +1,181 @@
+#!/usr/bin/env bats
+# Regression tests for scripts/audit-bd-memories.sh (soc-lgq4).
+#
+# The script shells out to `bd memories`. We stub that binary via PATH
+# so tests get deterministic input without hitting the real dolt store.
+
+# Per-test fixture: remember the caller's directory and PATH, locate the
+# script under test, and create a scratch dir with a bin/ for the bd stub.
+setup() {
+  ORIG_DIR="$PWD"
+  ORIG_PATH="$PATH"
+  REPO_ROOT="$(git rev-parse --show-toplevel)"
+  SCRIPT="$REPO_ROOT/scripts/audit-bd-memories.sh"
+  TMP="$(mktemp -d)"
+  mkdir -p "$TMP/bin"
+}
+
+teardown() {
+  cd "$ORIG_DIR" 2>/dev/null || true
+  export PATH="$ORIG_PATH"
+  rm -rf "$TMP"
+}
+
+# stub_bd <memories-output-file> — install a fake `bd` in $TMP/bin and put
+# that directory first on PATH. The shim prints the given file for
+# `bd memories` and exits 0 for every invocation.
+stub_bd() {
+  local out_file="$1"
+  local shim="$TMP/bin/bd"
+  {
+    printf '%s\n' '#!/usr/bin/env bash'
+    printf '%s\n' 'if [ "$1" = "memories" ]; then'
+    printf '  cat "%s"\n' "$out_file"
+    printf '%s\n' '  exit 0' 'fi' 'exit 0'
+  } >"$shim"
+  chmod +x "$shim"
+  export PATH="$TMP/bin:$ORIG_PATH"
+}
+
+# Common synthetic memory corpus, written to $TMP/mems.txt in the layout the
+# script parses: a "Memories (N):" header, keys at 2-space indent, content
+# at 4-space indent. alpha-one and alpha-two carry identical content,
+# beta-distinct is unrelated, and retired-mention names terms from the
+# script's default retired list (ollama, gemma, morai-codex).
+write_corpus_basic() {
+  cat >"$TMP/mems.txt" <<'EOF'
+Memories (4):
+
+  alpha-one
+    The quick brown fox jumps over the lazy dog repeatedly.
+
+  alpha-two
+    The quick brown fox jumps over the lazy dog repeatedly.
+
+  beta-distinct
+    Completely unrelated content about systemd timers and journald.
+
+  retired-mention
+    Old lesson about ollama gemma morai-codex pipelines that no longer apply.
+EOF
+}
+
+# run_audit [flags...] — invoke the script under test through bats `run`
+# with $TMP as the working directory, so the script's relative default
+# output path (.agents/audits/...) lands inside the scratch dir.
+run_audit() {
+  cd "$TMP"
+  run "$SCRIPT" "$@"
+}
+
+@test "exits 3 when no bd memories present" {
+  # Header line plus one blank line, with no key/content entries at all.
+  printf 'Memories (0):\n\n' >"$TMP/mems.txt"
+  stub_bd "$TMP/mems.txt"
+  run_audit --json
+  [ "$status" -eq 3 ]
+}
+
+@test "--json reports counts on a 4-memory corpus" {
+  write_corpus_basic
+  stub_bd "$TMP/mems.txt"
+  run_audit --json
+  [ "$status" -eq 0 ]
+  # All four entries of the corpus are counted.
+  jq -e '.total == 4' <<<"$output" >/dev/null
+  # alpha-one and alpha-two are byte-identical, i.e. 1.0 jaccard, which is
+  # above the default threshold.
+  jq -e '.near_duplicates >= 1' <<<"$output" >/dev/null
+  # retired-mention contains "ollama", which the default pattern matches.
+  jq -e '.retired_candidates >= 1' <<<"$output" >/dev/null
+}
+
+@test "default markdown output lands under .agents/audits/" {
+  write_corpus_basic
+  stub_bd "$TMP/mems.txt"
+  # We need .agents/ to be writable; the script creates the audit dir.
+  run_audit
+  [ "$status" -eq 0 ]
+  files=$(ls "$TMP/.agents/audits/bd-memories-"*.md 2>/dev/null | wc -l | tr -d ' ')
+  [ "$files" -eq 1 ]
+}
+
+@test "--stdout emits markdown instead of writing a file" {
+  write_corpus_basic
+  stub_bd "$TMP/mems.txt"
+  run_audit --stdout
+  [ "$status" -eq 0 ]
+  [[ "$output" == *"# bd memories audit"* ]]
+  [[ "$output" == *"## Near-duplicates"* ]]
+  [[ "$output" == *"## Retired-surface candidates"* ]]
+  # No file should have been written.
+  ! ls "$TMP/.agents/audits/bd-memories-"*.md 2>/dev/null
+}
+
+@test "near-duplicates table includes the duplicate keys" {
+  write_corpus_basic
+  stub_bd "$TMP/mems.txt"
+  run_audit --stdout
+  [ "$status" -eq 0 ]
+  [[ "$output" == *"alpha-one"* ]]
+  [[ "$output" == *"alpha-two"* ]]
+}
+
+@test "--threshold 0.99 raises the bar; identical pairs still pass, near misses don't" {
+  # a and b differ in one word; c repeats a exactly, so both halves of the
+  # test title are exercised.
+  cat >"$TMP/mems.txt" <<'EOF'
+Memories (3):
+
+  a-mostly-same
+    apple banana cherry date elderberry fig grape
+
+  b-mostly-same
+    apple banana cherry date elderberry fig pear
+
+  c-same-as-a
+    apple banana cherry date elderberry fig grape
+EOF
+  stub_bd "$TMP/mems.txt"
+  run_audit --threshold 0.99 --json
+  [ "$status" -eq 0 ]
+  # a/b and b/c: 6 of 8 unique words shared = 0.75 jaccard → below 0.99.
+  # a/c: identical token sets = 1.0 jaccard → the only pair reported.
+  echo "$output" | jq -e '.near_duplicates == 1' >/dev/null
+}
+
+@test "--no-dups suppresses near-duplicate scanning entirely" {
+  write_corpus_basic
+  stub_bd "$TMP/mems.txt"
+  run_audit --stdout --no-dups
+  [ "$status" -eq 0 ]
+  # Stripping through the heading leaves $output unchanged only when the
+  # heading is absent.
+  [ "${output#*"## Near-duplicates"}" = "$output" ]
+}
+
+@test "--no-retired suppresses retired-surface section" {
+  write_corpus_basic
+  stub_bd "$TMP/mems.txt"
+  run_audit --stdout --no-retired
+  [ "$status" -eq 0 ]
+  # Stripping through the heading leaves $output unchanged only when the
+  # heading is absent.
+  [ "${output#*"## Retired-surface candidates"}" = "$output" ]
+}
+
+@test "--retired <csv> overrides default retired-keyword list" {
+  cat >"$TMP/mems.txt" <<'EOF'
+Memories (2):
+
+  a-clean
+    nothing notable about this one
+
+  b-special
+    this memory mentions cobalt-strike very loudly
+EOF
+  stub_bd "$TMP/mems.txt"
+  run_audit --stdout --retired "cobalt-strike"
+  [ "$status" -eq 0 ]
+  [[ "$output" == *"b-special"* ]]
+  [[ "$output" == *"cobalt-strike"* ]]
+}
+
+@test "unknown flag exits 2 with usage error" {
+  stub_bd "$TMP/mems.txt"
+  run_audit --weasel
+  [ "$status" -eq 2 ]
+  # The diagnostic names the problem ("unknown arg: ...").
+  grep -qF 'unknown' <<<"$output"
+}
+
+@test "missing bd binary exits 3" {
+  # Don't stub bd. Build a PATH holding only symlinks to the external tools
+  # listed below, so bd cannot be found while coreutils still can.
+  mkdir -p "$TMP/coreutils-only"
+  local cmd full
+  for cmd in bash sh sed awk grep tr sort comm cat mkdir mv rm cp ls wc dirname basename head tail printf jq mktemp; do
+    # type -P resolves on-disk executables only. command -v answers with a
+    # bare name for builtins such as printf, which would produce a symlink
+    # that points at itself.
+    full="$(type -P "$cmd" || true)"
+    if [ -n "$full" ]; then
+      ln -sf "$full" "$TMP/coreutils-only/$cmd"
+    fi
+  done
+  export PATH="$TMP/coreutils-only"
+  run_audit --json
+  [ "$status" -eq 3 ]
+}