From e7a95a7381166c26fa861975ce42b1f3d7547597 Mon Sep 17 00:00:00 2001
From: vansin <smartflowaiteam@gmail.com>
Date: Sun, 28 Jun 2026 09:12:33 +0800
Subject: [PATCH] =?UTF-8?q?fix(#266=20Bucket=20B):=20reset=20server=20+=20?=
 =?UTF-8?q?DB=20between=20e2e=20suites=20=E2=80=94=20recover=20V3=20Networ?=
 =?UTF-8?q?ks=20visibility?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Before this change, test-all.sh started one bun server up-front and ran all
4 suites against it. Two suites (V3 Networks, Config Priority) also try to
`bun run src/index.ts &` themselves; that background bind hits EADDRINUSE
(harmless in the bg case — the parent's server still serves), but the
state-contamination from Base E2E (137 tests' worth of users/networks/
tasks in commhub.db) breaks downstream suites:

  - V3 Networks: completes 22 tests but the captured Results line interacts
    with the shared-DB pollution in ways that make run_suite's regex
    match the wrong line → reported as "0 ran (suite crashed)" even
    though the suite exits 0.
  - Config Priority: depends on `anet node create` with hand-written
    config.json; under the polluted shared DB this can't reach its
    Results line.

This PR replaces the single up-front server start with a `reset_server`
helper called before every suite. Each helper invocation:
  1. pkills the previous bun server
  2. waits for :9200 to actually free up (pkill is async — binding too
     soon re-triggers EADDRINUSE)
  3. removes /root/.commhub and /root/.anet (fresh DB + fresh client state)
  4. starts a new bun server in the background
  5. polls /health until it responds successfully (max 15s)

Local verification on the patched image, full test-all.sh run:

  before                          after
  ───────────────────────         ───────────────────────
  Base E2E:    90 / 45            Base E2E:    90 / 45    (unchanged)
  V3 Auth:     25 / 0             V3 Auth:     25 / 0     (unchanged)
  V3 Networks: 0 ran ⚠            V3 Networks: 19 / 3    ✅ visible
  Config Pri:  0 ran ⚠            Config Pri:  0 ran ⚠    (separate, see below)
  TOTAL:      115 / 47            TOTAL:      134 / 49

Net: +19 passes and +2 real failures surfaced (V3 Networks alpha/beta
task — likely the same root cause as Base E2E A3 in the #266 audit).

Config Priority still reports "0 ran" — that's a separate issue:
`anet node create` now needs a real login first, but Config Priority
intentionally writes a fake `token:"global-tok"` to verify the config
priority resolution. That's a test-design problem (not orchestration),
and needs a small refactor to do `anet login` first then layer the
priority overrides on top. Tracked separately in #266 (Bucket C).

Refs: #266 (Docker E2E audit), #265+#269 (CI baseline restored)
---
 tests/test-all.sh | 36 +++++++++++++++++++++++++++++++++---
 1 file changed, 33 insertions(+), 3 deletions(-)

diff --git a/tests/test-all.sh b/tests/test-all.sh
index 20eadee2..d7d658c0 100755
--- a/tests/test-all.sh
+++ b/tests/test-all.sh
@@ -38,13 +38,43 @@ run_suite() {
   echo ""
 }
 
-# Start server once
-cd /app/server && bun run src/index.ts &>/dev/null &
-sleep 3
+# Each suite gets a fresh server + fresh DB. Without this, Base E2E leaves
+# 137 tests' worth of state in commhub.db, and the later suites run against
+# that polluted shared server: V3 Networks and Config Priority were both
+# reported as "0 ran". See #266 round-1 audit.
+#
+# reset_server: kill any running bun server, wipe /root/.commhub and
+# /root/.anet, then start a new server and wait for it. Returns 0 once
+# /health answers, or 1 (after a ::warning:: on stderr) if it has not within
+# 15s. Suites that also start their own server (V3 Networks, Config Priority)
+# still get a harmless EADDRINUSE on that bind and use this fresh server.
+reset_server() {
+  local i
+  pkill -f 'bun.*src/index.ts' 2>/dev/null || true
+  # pkill is async: wait up to 7.5s for :9200 to free. After ~5s escalate to
+  # SIGKILL so a hung old server can never answer the /health poll below.
+  for i in {1..30}; do
+    (exec 3<>/dev/tcp/127.0.0.1/9200) 2>/dev/null || break  # refused = free
+    if (( i == 20 )); then pkill -9 -f 'bun.*src/index.ts' 2>/dev/null || true; fi
+    sleep 0.25
+  done
+  rm -rf /root/.commhub /root/.anet 2>/dev/null || true
+  cd /app/server && bun run src/index.ts &>/dev/null &
+  for i in {1..30}; do
+    curl -sf http://127.0.0.1:9200/health > /dev/null && return 0
+    sleep 0.5
+  done
+  echo "::warning::reset_server: server did not respond to /health within 15s" >&2
+  return 1
+}
 
+reset_server
 run_suite "Base E2E (137)" "/app/test.sh 2>&1"
+reset_server
 run_suite "V3 Auth (25)" "/app/test-auth.sh 2>&1"
+reset_server
 run_suite "V3 Networks (22)" "/app/test-networks.sh 2>&1"
+reset_server
 run_suite "Config Priority (16)" "/app/test-config.sh 2>&1"
 
 echo ""