From b4d21188c718c0efcd198bf611825ce1a81a1263 Mon Sep 17 00:00:00 2001 From: David McFadzean Date: Sat, 21 Mar 2026 13:30:08 -0400 Subject: [PATCH] Add Docker healthchecks for core services --- docker-compose.drawbridge.yml | 21 +++- docker-compose.yml | 108 +++++++++++++++--- .../hyperswarm/src/hyperswarm-mediator.ts | 14 +++ 3 files changed, 121 insertions(+), 22 deletions(-) diff --git a/docker-compose.drawbridge.yml b/docker-compose.drawbridge.yml index 4d89f8bc..a4ceb7c9 100644 --- a/docker-compose.drawbridge.yml +++ b/docker-compose.drawbridge.yml @@ -32,11 +32,17 @@ services: - ./data/drawbridge:/data/drawbridge - ./data/cln-mainnet:/data/lightning:ro - tor-hostname:/data/tor:ro + healthcheck: + test: ["CMD", "node", "-e", "const http=require('http');const req=http.get('http://127.0.0.1:4222/api/v1/ready',res=>{let body='';res.on('data',chunk=>body+=chunk);res.on('end',()=>process.exit(res.statusCode===200&&(body.trim()==='true'||body.includes('\"ready\":true'))?0:1));});req.on('error',()=>process.exit(1));"] + interval: 15s + timeout: 5s + retries: 6 + start_period: 25s depends_on: gatekeeper: - condition: service_started + condition: service_healthy redis: - condition: service_started + condition: service_healthy drawbridge-client: build: @@ -51,8 +57,15 @@ services: user: "${ARCHON_UID}:${ARCHON_GID}" ports: - "${ARCHON_DRAWBRIDGE_CLIENT_PORT:-4223}:4223" + healthcheck: + test: ["CMD", "node", "-e", "const http=require('http');const req=http.get('http://127.0.0.1:4223/',res=>process.exit(res.statusCode===200?0:1));req.on('error',()=>process.exit(1));"] + interval: 15s + timeout: 5s + retries: 6 + start_period: 20s depends_on: - - drawbridge + drawbridge: + condition: service_healthy tor: image: goldy/tor-hidden-service:latest @@ -77,7 +90,7 @@ services: wait depends_on: drawbridge: - condition: service_started + condition: service_healthy volumes: tor-hostname: diff --git a/docker-compose.yml b/docker-compose.yml index 2d10dce2..9be80a38 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -12,6 +12,12 @@ services: - ./data/mongodb:/data/db ports: - 127.0.0.1:27017:27017 + healthcheck: + test: ["CMD", "mongosh", "--quiet", "--eval", "db.adminCommand({ ping: 1 })"] + interval: 10s + timeout: 5s + retries: 6 + start_period: 15s redis: image: redis:8.0.4-alpine @@ -21,6 +27,12 @@ services: - ./data/redis:/data ports: - 127.0.0.1:6379:6379 + healthcheck: + test: ["CMD", "redis-cli", "ping"] + interval: 10s + timeout: 5s + retries: 6 + start_period: 10s ipfs: image: ipfs/kubo:v0.40.1 @@ -59,10 +71,19 @@ services: user: "${ARCHON_UID}:${ARCHON_GID}" ports: - ${ARCHON_GATEKEEPER_PORT}:4224 + healthcheck: + test: ["CMD", "node", "-e", "const http=require('http');const req=http.get('http://127.0.0.1:4224/api/v1/ready',res=>{let body='';res.on('data',chunk=>body+=chunk);res.on('end',()=>process.exit(res.statusCode===200&&(body.trim()==='true'||body.includes('\"ready\":true'))?0:1));});req.on('error',()=>process.exit(1));"] + interval: 10s + timeout: 5s + retries: 6 + start_period: 20s depends_on: - - mongodb - - redis - - ipfs + mongodb: + condition: service_healthy + redis: + condition: service_healthy + ipfs: + condition: service_healthy keymaster: build: @@ -90,10 +111,19 @@ services: user: "${ARCHON_UID}:${ARCHON_GID}" ports: - ${ARCHON_KEYMASTER_PORT}:4226 + healthcheck: + test: ["CMD", "node", "-e", "const http=require('http');const req=http.get('http://127.0.0.1:4226/api/v1/ready',res=>{let body='';res.on('data',chunk=>body+=chunk);res.on('end',()=>process.exit(res.statusCode===200&&body.includes('\"ready\":true')?0:1));});req.on('error',()=>process.exit(1));"] + interval: 10s + timeout: 5s + retries: 6 + start_period: 20s depends_on: - - gatekeeper - - redis - - mongodb + gatekeeper: + condition: service_healthy + redis: + condition: service_healthy + mongodb: + condition: service_healthy hyperswarm-mediator: build: @@ -115,10 +145,19 @@ services: user: "${ARCHON_UID}:${ARCHON_GID}" ports: - "127.0.0.1:4232:4232" + healthcheck: + test: ["CMD", "node", "-e", "const http=require('http');const req=http.get('http://127.0.0.1:4232/ready',res=>{let body='';res.on('data',chunk=>body+=chunk);res.on('end',()=>process.exit(res.statusCode===200&&body.includes('\"ready\":true')?0:1));});req.on('error',()=>process.exit(1));"] + interval: 15s + timeout: 5s + retries: 6 + start_period: 25s depends_on: - - gatekeeper - - keymaster - - ipfs + gatekeeper: + condition: service_healthy + keymaster: + condition: service_healthy + ipfs: + condition: service_healthy cli: build: @@ -136,9 +175,12 @@ services: - ./share:/app/share user: "${ARCHON_UID}:${ARCHON_GID}" depends_on: - - gatekeeper - - keymaster - - ipfs + gatekeeper: + condition: service_healthy + keymaster: + condition: service_healthy + ipfs: + condition: service_healthy explorer: build: @@ -154,8 +196,15 @@ services: - VITE_OPERATION_NETWORKS=hyperswarm,local,BTC:signet,BTC:testnet4 ports: - "4000:4000" + healthcheck: + test: ["CMD", "node", "-e", "const http=require('http');const req=http.get('http://127.0.0.1:4000/version',res=>process.exit(res.statusCode===200?0:1));req.on('error',()=>process.exit(1));"] + interval: 15s + timeout: 5s + retries: 6 + start_period: 20s depends_on: - - gatekeeper + gatekeeper: + condition: service_healthy gatekeeper-client: build: @@ -170,8 +219,15 @@ services: user: "${ARCHON_UID}:${ARCHON_GID}" ports: - "${ARCHON_GATEKEEPER_CLIENT_PORT:-4225}:4225" + healthcheck: + test: ["CMD", "node", "-e", "const http=require('http');const req=http.get('http://127.0.0.1:4225/',res=>process.exit(res.statusCode===200?0:1));req.on('error',()=>process.exit(1));"] + interval: 15s + timeout: 5s + retries: 6 + start_period: 20s depends_on: - - gatekeeper + gatekeeper: + condition: service_healthy keymaster-client: build: @@ -186,8 +242,15 @@ services: user: "${ARCHON_UID}:${ARCHON_GID}" ports: - "${ARCHON_KEYMASTER_CLIENT_PORT:-4227}:4227" + healthcheck: + test: ["CMD", "node", "-e", "const http=require('http');const req=http.get('http://127.0.0.1:4227/',res=>process.exit(res.statusCode===200?0:1));req.on('error',()=>process.exit(1));"] + interval: 15s + timeout: 5s + retries: 6 + start_period: 20s depends_on: - - keymaster + keymaster: + condition: service_healthy react-wallet: build: @@ -204,8 +267,15 @@ services: user: "${ARCHON_UID}:${ARCHON_GID}" ports: - "${ARCHON_REACT_WALLET_PORT:-4228}:${ARCHON_REACT_WALLET_PORT:-4228}" + healthcheck: + test: ["CMD", "node", "-e", "const http=require('http');const req=http.get('http://127.0.0.1:4228/',res=>process.exit(res.statusCode===200?0:1));req.on('error',()=>process.exit(1));"] + interval: 15s + timeout: 5s + retries: 6 + start_period: 20s depends_on: - - gatekeeper + gatekeeper: + condition: service_healthy # Observability Stack prometheus: @@ -221,8 +291,10 @@ services: - "--storage.tsdb.path=/prometheus" - "--storage.tsdb.retention.time=15d" depends_on: - - gatekeeper - - keymaster + gatekeeper: + condition: service_healthy + keymaster: + condition: service_healthy grafana: image: grafana/grafana:10.4.0 diff --git a/services/mediators/hyperswarm/src/hyperswarm-mediator.ts b/services/mediators/hyperswarm/src/hyperswarm-mediator.ts index bbab4fdd..54bdf047 100644 --- a/services/mediators/hyperswarm/src/hyperswarm-mediator.ts +++ b/services/mediators/hyperswarm/src/hyperswarm-mediator.ts @@ -187,6 +187,20 @@ function updateGauges(): void { function startMetricsServer(): void { const app = express(); + app.get('/ready', async (_req, res) => { + try { + const [gatekeeperReady, keymasterReady, ipfsReady] = await Promise.all([ + gatekeeper.isReady(), + keymaster.isReady(), + ipfs.isReady(), + ]); + + res.json({ ready: gatekeeperReady && keymasterReady && ipfsReady && Boolean(nodeInfo) }); + } catch { + res.json({ ready: false }); + } + }); + app.get('/version', (_req, res) => { res.json({ version: serviceVersion, commit: serviceCommit }); });