From 058dc02e23732633da55455d923a6097d24707b6 Mon Sep 17 00:00:00 2001 From: engineer Date: Thu, 4 Jun 2026 00:25:42 +0200 Subject: [PATCH] Harden ClawHub stats sync (SOL-5) Make the weekly lib/clawhub.ts stats refresh resilient to a flaky source: - Retries: bounded exponential backoff + jitter for network errors, timeouts, 429, and 5xx (was 429-only). 4xx stays non-retryable. - Per-request 15s timeout via AbortController so a hung connection cannot stall the job. - Structured JSON-line logging (ts/level/event/fields) for CI observability. - Graceful last-good fallback: if the source looks unavailable (nothing updated, or >50% of slugs fail), lib/clawhub.ts is left untouched and the run exits non-zero so the weekly job flags the outage instead of committing half-stale numbers. Document the weekly sync task in docs/clawhub-sync.md. Co-Authored-By: Claude Opus 4.8 (1M context) Co-authored-by: multica-agent --- docs/clawhub-sync.md | 69 ++++++++++++++++ scripts/refresh-clawhub-stats.mjs | 126 ++++++++++++++++++++++++------ 2 files changed, 172 insertions(+), 23 deletions(-) create mode 100644 docs/clawhub-sync.md diff --git a/docs/clawhub-sync.md b/docs/clawhub-sync.md new file mode 100644 index 0000000..00c1fbb --- /dev/null +++ b/docs/clawhub-sync.md @@ -0,0 +1,69 @@ +# ClawHub stats sync + +Keeps the install/star numbers on the live ClawHub listings in +[`lib/clawhub.ts`](../lib/clawhub.ts) current, so the catalog stops drifting +from reality. + +- **Script:** [`scripts/refresh-clawhub-stats.mjs`](../scripts/refresh-clawhub-stats.mjs) +- **Schedule:** [`.github/workflows/refresh-clawhub-stats.yml`](../.github/workflows/refresh-clawhub-stats.yml) + — weekly, **Mondays 06:00 UTC** (≈08:00 Paris), plus manual **Run workflow**. + +## What it does + +For each `openclaw skills install <slug>` already present in `lib/clawhub.ts`, +it fetches the public ClawHub read API (`/api/v1/skills/<slug>`) and rewrites +that entry's `stats: { installs, stars, fetchedAt }` line in place. 
+ +**Stats only.** It never adds, removes, or re-ranks skills — membership stays +human-gated (see `drafts/clawhub-candidates`). Slugs are the unit of work; the +file's structure is untouched. + +API reuse follows ClawHub's "Public catalog reuse" policy: weekly cadence, +`Retry-After` honored, canonical listings linked back via `docsUrl`. + +## Resilience + +The sync runs unattended, so a flaky source must not corrupt the catalog. + +- **Timeout** — every request is aborted after 15s so a hung connection can't + stall the job. +- **Retries** — transient failures (network errors, timeouts, `429`, `5xx`) + retry up to 3 times with exponential backoff + jitter (~0.5s → 1s → 2s). + A numeric `Retry-After` on any retryable response (usually `429`) replaces the backoff delay. `4xx` (other than 429) are treated as + permanent and not retried. +- **Structured logging** — one JSON object per line (`{ ts, level, event, … }`). + `info` → stdout, `warn`/`error` → stderr. Grep CI logs by `event` + (`fetch_retry`, `fetch_failed`, `skip_slug`, `source_unavailable`, `sync_done`). +- **Graceful last-good fallback** — a single failing slug keeps its previous + values and the run continues. But if the source looks **unavailable** + (nothing updated, or >50% of slugs fail), the script leaves `lib/clawhub.ts` + **completely untouched** and exits non-zero. The weekly job then goes red to + surface the outage, and the committed catalog stays at its last-good state + instead of being overwritten with half-stale numbers. + +Because the script exits non-zero on a source outage, the workflow's +"Commit if changed" step is skipped — no commit, last-good preserved. + +## Running it manually + +```bash +# Dry run — fetch + report, write nothing: +node scripts/refresh-clawhub-stats.mjs --dry + +# Real run — rewrite lib/clawhub.ts in place: +node scripts/refresh-clawhub-stats.mjs +``` + +Requires Node 20+ (global `fetch`). After a real run, review the diff on +`lib/clawhub.ts` before committing. 
+ +## Tuning + +Constants at the top of the script: + +| Constant | Default | Meaning | +| ----------------- | ------- | ------------------------------------------------ | +| `TIMEOUT_MS` | 15000 | Per-request abort timeout (ms). | +| `MAX_ATTEMPTS` | 4 | Initial try + 3 retries. | +| `RETRY_BASE_MS` | 500 | Backoff base; doubles each attempt. | +| `FAIL_THRESHOLD` | 0.5 | Failure fraction that trips the last-good guard. | diff --git a/scripts/refresh-clawhub-stats.mjs b/scripts/refresh-clawhub-stats.mjs index 4687eba..b67f46b 100644 --- a/scripts/refresh-clawhub-stats.mjs +++ b/scripts/refresh-clawhub-stats.mjs @@ -9,6 +9,14 @@ // ClawHub public read API is permitted per their "Public catalog reuse" policy: // we cache (weekly), honor 429/Retry-After, and link back to canonical listings. // +// Resilience (see docs/clawhub-sync.md): +// - Per-request timeout + bounded retries with exponential backoff + jitter +// for transient failures (network errors, timeouts, 429, 5xx). +// - Structured JSON-line logs (one event per line) for CI observability. +// - Graceful last-good fallback: if the source looks unavailable (most/all +// fetches fail), lib/clawhub.ts is left UNTOUCHED and the run exits non-zero +// so the weekly job surfaces the outage instead of writing a degraded file. 
+// // Local dry run: node scripts/refresh-clawhub-stats.mjs --dry import { readFileSync, writeFileSync } from "node:fs" @@ -18,34 +26,82 @@ const DRY = process.argv.includes("--dry") const API = "https://clawhub.ai/api/v1/skills" const today = new Date().toISOString().slice(0, 10) // YYYY-MM-DD (UTC) +const TIMEOUT_MS = 15000 // per-request abort +const MAX_ATTEMPTS = 4 // initial try + 3 retries +const RETRY_BASE_MS = 500 // backoff base: 0.5s, 1s, 2s (+ jitter) +const FAIL_THRESHOLD = 0.5 // >50% slugs failing => treat source as unavailable + const sleep = (ms) => new Promise((r) => setTimeout(r, ms)) const escapeRe = (s) => s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&") -async function fetchStats(slug, attempt = 0) { - const res = await fetch(`${API}/${encodeURIComponent(slug)}`, { - headers: { accept: "application/json" }, - }) - if (res.status === 429 && attempt < 3) { - const wait = Number(res.headers.get("retry-after")) || 5 - await sleep((wait + Math.random()) * 1000) - return fetchStats(slug, attempt + 1) +// Structured logging: one JSON object per line. info -> stdout, warn/error -> stderr. +function log(level, event, fields = {}) { + const line = JSON.stringify({ ts: new Date().toISOString(), level, event, ...fields }) + if (level === "error" || level === "warn") console.error(line) + else console.log(line) +} + +async function fetchOnce(slug) { + const ctrl = new AbortController() + const timer = setTimeout(() => ctrl.abort(), TIMEOUT_MS) + try { + return await fetch(`${API}/${encodeURIComponent(slug)}`, { + headers: { accept: "application/json" }, + signal: ctrl.signal, + }) + } finally { + clearTimeout(timer) + } +} + +async function fetchStats(slug) { + for (let attempt = 1; ; attempt++) { + let res, netErr + try { + res = await fetchOnce(slug) + } catch (e) { + netErr = e // network failure or timeout abort + } + + if (!netErr && res.ok) { + const json = await res.json() + const s = json?.skill?.stats ?? json?.stats ?? 
{} + const installs = Number.isFinite(s.installsAllTime) ? s.installsAllTime : null + const stars = Number.isFinite(s.stars) ? s.stars : null + return { installs, stars } + } + + const status = res?.status + const retryable = !!netErr || status === 429 || (status >= 500 && status <= 599) + if (!retryable || attempt >= MAX_ATTEMPTS) { + throw new Error(netErr ? netErr.message : `HTTP ${status}`) + } + + // Honor Retry-After when present (429s), else exponential backoff + jitter. + const retryAfter = Number(res?.headers.get("retry-after")) + const wait = + Number.isFinite(retryAfter) && retryAfter > 0 + ? retryAfter * 1000 + : RETRY_BASE_MS * 2 ** (attempt - 1) + log("warn", "fetch_retry", { + slug, + attempt, + status: status ?? null, + error: netErr?.message ?? null, + waitMs: Math.round(wait), + }) + await sleep(wait + Math.random() * 250) } - if (!res.ok) throw new Error(`${slug}: HTTP ${res.status}`) - const json = await res.json() - const s = json?.skill?.stats ?? json?.stats ?? {} - const installs = Number.isFinite(s.installsAllTime) ? s.installsAllTime : null - const stars = Number.isFinite(s.stars) ? s.stars : null - return { installs, stars } } async function main() { let text = readFileSync(FILE, "utf8") const slugs = [...text.matchAll(/openclaw skills install ([^"]+)"/g)].map((m) => m[1]) if (slugs.length === 0) { - console.error("No ClawHub slugs found in lib/clawhub.ts — aborting.") + log("error", "no_slugs", { file: "lib/clawhub.ts" }) process.exit(1) } - console.log(`Refreshing ${slugs.length} ClawHub skills…`) + log("info", "sync_start", { slugs: slugs.length, dry: DRY }) let updated = 0 const failed = [] @@ -53,7 +109,8 @@ async function main() { try { const { installs, stars } = await fetchStats(slug) if (installs == null || stars == null) { - failed.push(`${slug} (missing stats)`) + failed.push(slug) + log("warn", "skip_slug", { slug, reason: "missing_stats" }) continue } // Replace this entry's stats line. 
Non-greedy hop from the unique install @@ -62,28 +119,51 @@ async function main() { `(openclaw skills install ${escapeRe(slug)}"[\\s\\S]*?stats: \\{ installs: )\\d+(, stars: )\\d+(, fetchedAt: ")[^"]*(" \\},)` ) if (!re.test(text)) { - failed.push(`${slug} (stats line not matched)`) + failed.push(slug) + log("warn", "skip_slug", { slug, reason: "stats_line_unmatched" }) continue } text = text.replace(re, `$1${installs}$2${stars}$3${today}$4`) updated++ } catch (e) { - failed.push(`${slug} (${e.message})`) + failed.push(slug) + log("warn", "fetch_failed", { slug, error: e.message }) } await sleep(150) // be polite } - if (failed.length) console.warn(`Skipped ${failed.length}: ${failed.join(", ")}`) - console.log(`Updated ${updated}/${slugs.length} entries, fetchedAt=${today}.`) + // Graceful fallback: when the source looks unavailable (nothing updated, or + // failures cross the threshold), keep last-good lib/clawhub.ts untouched + // rather than committing a half-stale file. Exit non-zero so CI flags it. + const failRate = failed.length / slugs.length + if (updated === 0 || failRate > FAIL_THRESHOLD) { + log("error", "source_unavailable", { + updated, + failed: failed.length, + total: slugs.length, + failRate: Number(failRate.toFixed(2)), + action: "kept_last_good", + }) + process.exit(1) + } + + if (failed.length) log("warn", "partial_skips", { count: failed.length, slugs: failed }) + log("info", "sync_done", { + updated, + failed: failed.length, + total: slugs.length, + fetchedAt: today, + }) if (DRY) { - console.log("(dry run — not writing)") + log("info", "dry_run", { wrote: false }) return } writeFileSync(FILE, text) + log("info", "wrote_file", { file: "lib/clawhub.ts" }) } main().catch((e) => { - console.error(e) + log("error", "fatal", { error: e?.message ?? String(e) }) process.exit(1) })