Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 29 additions & 8 deletions keeperhub-scheduler/block-dispatcher/chain-monitor.ts
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,19 @@ export class ChainMonitor {
// not falsify liveness. This was previously `lastBlockReceivedAt` and was
// updated before dedup — that was the root cause of the silent 7-hour
// outage where block-triggered workflows stopped without recovering.
//
// KEEP-570: also no longer reset by subscribeToBlocks on reconnect. A
// monitor that keeps reconnecting but never gets a real block is exactly
// the stuck state the reconciler needs to detect. The monitorBootAt field
// covers the cold-start warmup window so a freshly-booted monitor isn't
// reaped before its first block.
private lastBlockAdvanceAt: number | null = null;
// Wall-clock ms of when this monitor instance started running. Used by
// isAlive() as the staleness baseline before the first real block arrives.
// Replaces the previous behaviour of seeding lastBlockAdvanceAt on every
// subscribe, which broke the reconciler's ability to detect monitors
// stuck across many reconnect cycles.
private monitorBootAt: number | null = null;
private blocksReceived = 0;
private blocksMatched = 0;
private lastHeartbeat = Date.now();
Expand Down Expand Up @@ -207,6 +219,7 @@ export class ChainMonitor {
}
this.isRunning = true;
this.silentReconnects = 0;
this.monitorBootAt = Date.now();
metrics.setSilentReconnectsCurrent(this.chainName, 0);
metrics.setWorkflowsTracked(this.chainName, this.workflows.length);

Expand Down Expand Up @@ -280,10 +293,18 @@ export class ChainMonitor {
// reconciler tears it down and starts a fresh monitor. The signal is
// height-advance — not callback-fire — because a stuck upstream can
// replay the same block(N) forever without us advancing.
//
// KEEP-570: staleness is measured from the last *real* height advance.
// Before the first block ever arrives the baseline is monitorBootAt, so
// a freshly-booted monitor isn't reaped during cold-start warmup. Once
// the first block lands, lastBlockAdvanceAt takes over. This replaces
// the previous logic where subscribeToBlocks reset lastBlockAdvanceAt
// on every reconnect — that reset masked monitors stuck across many
// silent-reconnect cycles, exactly the prod failure mode.
const stalenessBaseline = this.lastBlockAdvanceAt ?? this.monitorBootAt;
if (
this.lastBlockAdvanceAt !== null &&
Date.now() - this.lastBlockAdvanceAt >
liveness("MONITOR_RECREATE_TIMEOUT_MS")
stalenessBaseline !== null &&
Date.now() - stalenessBaseline > liveness("MONITOR_RECREATE_TIMEOUT_MS")
) {
return false;
}
Expand Down Expand Up @@ -488,13 +509,13 @@ export class ChainMonitor {
});

this.hasActiveSubscription = true;
// Seed the staleness clock so a freshly-subscribed monitor isn't reaped
// by isAlive() before the first block arrives. Real height advances
// refresh this in processBlockRange().
// KEEP-570: do NOT seed lastBlockAdvanceAt here. Resetting it on every
// subscribe falsified isAlive() across reconnects, so monitors that
// re-subscribed every BLOCK_ADVANCE_TIMEOUT_MS without ever receiving
// a real block looked permanently alive to the reconciler. The cold-
// start warmup case is now handled by monitorBootAt in isAlive().
const subscribedAt = Date.now();
this.lastBlockAdvanceAt = subscribedAt;
metrics.setHasActiveSubscription(this.chainName, true);
metrics.setLastBlockAdvanceAt(this.chainName, subscribedAt);
metrics.setSubscribedAt(this.chainName, subscribedAt);
// resetNoBlockTimer is called from start/reconnect, but seed it here too
// for the same reason — the watchdog needs a baseline from the moment we
Expand Down
48 changes: 46 additions & 2 deletions keeperhub-scheduler/tests/unit/chain-monitor.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -512,6 +512,16 @@ describe("ChainMonitor", () => {
});

it("stays alive when blocks are arriving regularly", async () => {
// Pin the windows so the 5-minute gap between blocks does not trigger
// the in-monitor reconnect cycle. The previous version of this test
// relied on subscribeToBlocks resetting lastBlockAdvanceAt to mask the
// reconnect activity, which masked the very bug (KEEP-570) that broke
// the reconciler's view of stuck monitors. The contract this test
// intends to assert is: each real height advance refreshes the
// staleness clock; verify it directly.
vi.stubEnv("BLOCK_ADVANCE_TIMEOUT_MS", String(60 * 60_000));
vi.stubEnv("MONITOR_RECREATE_TIMEOUT_MS", String(10 * 60_000));

const monitor = new ChainMonitor({
chain: makeChain(),
workflows: [makeWorkflow({ blockInterval: 1 })],
Expand Down Expand Up @@ -539,10 +549,44 @@ describe("ChainMonitor", () => {
});

await monitor.start();
// Without seeding lastBlockAdvanceAt at subscribe time, isAlive()
// would compute a stale gap immediately. Sanity check the seed.
// monitorBootAt covers the cold-start warmup window: isAlive() returns
// true even before the first real block arrives, until staleness
// measured from boot exceeds MONITOR_RECREATE_TIMEOUT_MS.
expect(monitor.isAlive()).toBe(true);
});

it("reaps a monitor stuck across silent reconnects with no real blocks", async () => {
// KEEP-570 regression: previously, subscribeToBlocks reset
// lastBlockAdvanceAt on every reconnect, so a monitor that kept
// re-subscribing but never received a real block looked alive forever
// to the reconciler. After this fix the staleness clock is measured
// from the last real height advance (or monitorBootAt as the cold-
// start fallback), so persistent silence across reconnects becomes
// visible to BlockMonitorService.isAlive().
//
// Pin a short BLOCK_ADVANCE_TIMEOUT_MS to drive multiple silent
// reconnects within the test window, and a short
// MONITOR_RECREATE_TIMEOUT_MS so the reaper threshold is reachable.
vi.stubEnv("BLOCK_ADVANCE_TIMEOUT_MS", String(60_000));
vi.stubEnv("MONITOR_RECREATE_TIMEOUT_MS", String(120_000));

const monitor = new ChainMonitor({
chain: makeChain(),
workflows: [makeWorkflow()],
});

await monitor.start();
expect(monitor.isAlive()).toBe(true);

// Three silent windows of 60s each plus the reconnects in between.
// No real blocks emitted; the monitor's in-process reconnect cycle
// keeps re-subscribing but the upstream stays silent.
await vi.advanceTimersByTimeAsync(3 * 60_000);

// Past the 120s staleness threshold, the reaper sees the monitor as
// not alive even though it is happily resubscribing.
expect(monitor.isAlive()).toBe(false);
});
});

// -------------------------------------------------------------------------
Expand Down
Loading