diff --git a/.gitignore b/.gitignore index 6b7e5b5..0af74a6 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,7 @@ dist .DS_Store .env .env.* +!.env.example *.log package-lock.json diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md new file mode 100644 index 0000000..c3f3afb --- /dev/null +++ b/ARCHITECTURE.md @@ -0,0 +1,560 @@ +# Architecture Diagram + +## System Overview + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ Client Application │ +└────────────────────────────┬────────────────────────────────────┘ + │ + │ HTTP Request + ▼ +┌─────────────────────────────────────────────────────────────────┐ +│ Express HTTP Server │ +│ (src/index.ts) │ +└────────────────────────────┬────────────────────────────────────┘ + │ + │ Route to Controller + ▼ +┌─────────────────────────────────────────────────────────────────┐ +│ Deposit Controller │ +│ (src/controllers/depositController.ts) │ +│ │ +│ • Request validation │ +│ • Error mapping (CircuitBreakerOpenError → 502) │ +│ • Response formatting │ +└────────────────────────────┬────────────────────────────────────┘ + │ + │ Call Service + ▼ +┌─────────────────────────────────────────────────────────────────┐ +│ Transaction Builder Service │ +│ (src/services/transactionBuilder.ts) │ +│ │ +│ • buildVaultDepositTransaction() │ +│ • loadAccount() │ +│ • fetchBaseFee() │ +└────────────────────────────┬────────────────────────────────────┘ + │ + │ Wrapped with Resilience + ▼ +┌─────────────────────────────────────────────────────────────────┐ +│ Circuit Breaker │ +│ (src/lib/circuitBreaker.ts) │ +│ │ +│ ┌──────────┐ ┌──────────┐ ┌──────────────┐ │ +│ │ CLOSED │─────►│ OPEN │─────►│ HALF_OPEN │ │ +│ │ (Normal) │ │(Fast-Fail)│ │ (Testing) │ │ +│ └────┬─────┘ └──────────┘ └──────┬───────┘ │ +│ │ │ │ +│ └───────────────────────────────────────┘ │ +│ │ +│ • State management │ +│ • Failure counting │ +│ • Cooldown timing │ +└────────────────────────────┬────────────────────────────────────┘ + │ + │ If CLOSED 
or HALF_OPEN + ▼ +┌─────────────────────────────────────────────────────────────────┐ +│ Retry Mechanism │ +│ (src/lib/retry.ts) │ +│ │ +│ Attempt 1: Immediate │ +│ Attempt 2: ~1000ms (exponential backoff) │ +│ Attempt 3: ~2000ms (with jitter) │ +│ │ +│ • Exponential backoff │ +│ • Jitter to prevent thundering herd │ +│ • Configurable max attempts │ +└────────────────────────────┬────────────────────────────────────┘ + │ + │ Network Call + ▼ +┌─────────────────────────────────────────────────────────────────┐ +│ Stellar Horizon API │ +│ (horizon-testnet.stellar.org) │ +│ │ +│ • loadAccount(publicKey) │ +│ • feeStats() │ +│ • Transaction submission │ +└─────────────────────────────────────────────────────────────────┘ +``` + +## Request Flow + +### Successful Request + +``` +Client + │ + │ POST /api/deposits/build + ▼ +Controller (validate request) + │ + │ Valid + ▼ +Transaction Builder + │ + │ buildVaultDepositTransaction() + ▼ +Circuit Breaker (CLOSED) + │ + │ Allow + ▼ +Retry Mechanism + │ + │ Attempt 1 + ▼ +Horizon API + │ + │ 200 OK + ▼ +Return Account Data + │ + ▼ +Build Transaction + │ + ▼ +Return XDR + │ + ▼ +Controller (format response) + │ + │ 200 OK + ▼ +Client +``` + +### Transient Failure with Retry + +``` +Client + │ + │ POST /api/deposits/build + ▼ +Controller + │ + ▼ +Transaction Builder + │ + ▼ +Circuit Breaker (CLOSED) + │ + ▼ +Retry Mechanism + │ + │ Attempt 1 + ▼ +Horizon API + │ + │ Network Timeout ❌ + ▼ +Retry Mechanism + │ + │ Wait ~1000ms (backoff) + │ Attempt 2 + ▼ +Horizon API + │ + │ 200 OK ✅ + ▼ +Return Account Data + │ + ▼ +Build Transaction + │ + ▼ +Return XDR + │ + ▼ +Controller (200 OK) + │ + ▼ +Client +``` + +### Circuit Breaker Trip + +``` +Client + │ + │ POST /api/deposits/build (Request 1) + ▼ +Circuit Breaker (CLOSED) + │ + │ consecutiveFailures: 0 + ▼ +Retry → Horizon API ❌ (All attempts fail) + │ + │ consecutiveFailures: 1 + ▼ +Controller (502 Bad Gateway) + │ + ▼ +Client + +───────────────────────────── + +Client + │ + │ POST 
/api/deposits/build (Request 2-4) + ▼ +Circuit Breaker (CLOSED) + │ + │ consecutiveFailures: 1-3 + ▼ +Retry → Horizon API ❌ (All attempts fail) + │ + │ consecutiveFailures: 2-4 + ▼ +Controller (502 Bad Gateway) + │ + ▼ +Client + +───────────────────────────── + +Client + │ + │ POST /api/deposits/build (Request 5) + ▼ +Circuit Breaker (CLOSED) + │ + │ consecutiveFailures: 4 + ▼ +Retry → Horizon API ❌ (All attempts fail) + │ + │ consecutiveFailures: 5 ≥ threshold (5) + │ STATE TRANSITION: CLOSED → OPEN 🔴 + ▼ +Controller (502 Bad Gateway) + │ + ▼ +Client + +───────────────────────────── + +Client + │ + │ POST /api/deposits/build (Request 6+) + ▼ +Circuit Breaker (OPEN) + │ + │ Fast-fail immediately ⚡ + │ No network call made + ▼ +CircuitBreakerOpenError + │ + ▼ +Controller (502 Bad Gateway) + │ + ▼ +Client +``` + +### Circuit Breaker Recovery + +``` +Circuit Breaker (OPEN) + │ + │ Wait cooldown period (30s) + │ + │ STATE TRANSITION: OPEN → HALF_OPEN 🟡 + ▼ +Client + │ + │ POST /api/deposits/build (Probe request) + ▼ +Circuit Breaker (HALF_OPEN) + │ + │ Allow single probe + ▼ +Retry → Horizon API + │ + │ 200 OK ✅ + │ + │ STATE TRANSITION: HALF_OPEN → CLOSED 🟢 + ▼ +Return Success + │ + ▼ +Controller (200 OK) + │ + ▼ +Client + +───────────────────────────── + +Circuit Breaker (CLOSED) + │ + │ Normal operation resumed + │ consecutiveFailures: 0 + ▼ +All subsequent requests succeed +``` + +## Component Responsibilities + +### Controller Layer (src/controllers/) + +**Responsibilities:** +- HTTP request/response handling +- Request validation +- Error mapping to HTTP status codes +- Response formatting + +**Does NOT:** +- Business logic +- Direct Horizon calls +- Retry logic +- State management + +### Service Layer (src/services/) + +**Responsibilities:** +- Business logic +- Transaction building +- Account loading +- Fee fetching + +**Does NOT:** +- HTTP concerns +- Error status code mapping +- Request validation + +### Resilience Layer (src/lib/) + +**Responsibilities:** +- 
Retry with exponential backoff +- Circuit breaker state management +- Failure counting +- Cooldown timing + +**Does NOT:** +- Business logic +- HTTP concerns +- Stellar-specific logic + +## Error Flow + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ Error Types │ +└─────────────────────────────────────────────────────────────────┘ + +Network Error (Horizon) + │ + ▼ +Retry Mechanism + │ + ├─► Success after retry → Return result + │ + └─► All retries fail + │ + ▼ + RetryExhaustedError + │ + ▼ + Circuit Breaker (increment failures) + │ + ├─► Below threshold → Propagate error + │ + └─► At threshold → Transition to OPEN + │ + ▼ + CircuitBreakerOpenError (future requests) + │ + ▼ + Controller (map to BadGatewayError) + │ + ▼ + HTTP 502 Response + │ + ▼ + Client +``` + +## State Diagram + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ Circuit Breaker State Machine │ +└─────────────────────────────────────────────────────────────────┘ + + ┌──────────────────┐ + │ CLOSED │ + │ (Normal Op) │ + │ │ + │ • Allow requests │ + │ • Count failures │ + │ • Reset on success│ + └────────┬─────────┘ + │ + │ consecutiveFailures ≥ threshold + │ + ▼ + ┌──────────────────┐ + │ OPEN │ + │ (Fast-Fail) │ + │ │ + │ • Reject requests│ + │ • No network calls│ + │ • Start cooldown │ + └────────┬─────────┘ + │ + │ cooldown elapsed + │ + ▼ + ┌──────────────────┐ + │ HALF_OPEN │ + │ (Testing) │ + │ │ + │ • Allow 1 probe │ + │ • Test recovery │ + └────────┬─────────┘ + │ + ┌────────┴────────┐ + │ │ + Success Failure + │ │ + ▼ ▼ + ┌─────────┐ ┌─────────┐ + │ CLOSED │ │ OPEN │ + └─────────┘ └─────────┘ +``` + +## Data Flow + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ Configuration Flow │ +└─────────────────────────────────────────────────────────────────┘ + +Environment Variables (.env) + │ + ├─► HORIZON_URL + ├─► STELLAR_BASE_FEE + ├─► CIRCUIT_BREAKER_THRESHOLD + ├─► CIRCUIT_BREAKER_COOLDOWN_MS + ├─► 
RETRY_MAX_ATTEMPTS + └─► RETRY_BASE_DELAY_MS + │ + ▼ +Transaction Builder Config + │ + ├─► Circuit Breaker Instance + │ │ + │ └─► failureThreshold + │ cooldownMs + │ + └─► Retry Config + │ + └─► maxAttempts + baseDelayMs +``` + +## Monitoring Flow + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ Metrics Collection │ +└─────────────────────────────────────────────────────────────────┘ + +Circuit Breaker + │ + ├─► state (CLOSED/OPEN/HALF_OPEN) + ├─► consecutiveFailures + ├─► consecutiveSuccesses + ├─► totalFailures + ├─► totalSuccesses + ├─► lastFailureTime + └─► lastStateChange + │ + ▼ +GET /api/deposits/health + │ + ▼ +JSON Response + │ + ▼ +Monitoring System + │ + ├─► Alert on state=OPEN + ├─► Track failure rate + └─► Dashboard visualization +``` + +## Deployment Architecture + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ Production Deployment │ +└─────────────────────────────────────────────────────────────────┘ + +Load Balancer + │ + ├─► Instance 1 (Circuit Breaker A) + │ │ + │ └─► Horizon Testnet + │ + ├─► Instance 2 (Circuit Breaker B) + │ │ + │ └─► Horizon Testnet + │ + └─► Instance 3 (Circuit Breaker C) + │ + └─► Horizon Testnet + +Note: Each instance has its own circuit breaker state. +For shared state, consider Redis or distributed circuit breaker. 
+``` + +## Testing Architecture + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ Test Layers │ +└─────────────────────────────────────────────────────────────────┘ + +Unit Tests (lib/) + │ + ├─► retry.test.ts + │ │ + │ ├─► Mock operations + │ ├─► Fake timers + │ └─► Test backoff timing + │ + └─► circuitBreaker.test.ts + │ + ├─► Mock operations + ├─► Test state transitions + └─► Test thresholds + +Integration Tests (services/) + │ + └─► transactionBuilder.test.ts + │ + ├─► Mock Stellar SDK + ├─► Test retry integration + └─► Test circuit breaker integration + +HTTP Tests (controllers/) + │ + └─► depositController.test.ts + │ + ├─► Mock transaction builder + ├─► Test error mapping + └─► Test HTTP responses +``` + +## Summary + +The architecture implements a layered approach with clear separation of concerns: + +1. **HTTP Layer** - Request/response handling +2. **Business Layer** - Transaction building logic +3. **Resilience Layer** - Retry and circuit breaker +4. **Network Layer** - Stellar Horizon API + +Each layer has a single responsibility and communicates through well-defined interfaces, making the system maintainable, testable, and resilient to failures. diff --git a/DEPLOYMENT_CHECKLIST.md b/DEPLOYMENT_CHECKLIST.md new file mode 100644 index 0000000..89d0622 --- /dev/null +++ b/DEPLOYMENT_CHECKLIST.md @@ -0,0 +1,393 @@ +# Deployment Checklist + +Use this checklist to ensure the circuit breaker and retry implementation is properly deployed and configured. 
+ +## Pre-Deployment + +### Code Review + +- [ ] All tests pass: `npm test` +- [ ] Test coverage ≥ 90%: `npm test -- --coverage` +- [ ] TypeScript compiles without errors: `npm run typecheck` +- [ ] Linting passes: `npm run lint` +- [ ] No console.log statements in production code +- [ ] All TODOs resolved or documented +- [ ] Code reviewed by at least one other developer + +### Dependencies + +- [ ] `stellar-sdk` added to package.json +- [ ] All dependencies installed: `npm install` +- [ ] No security vulnerabilities: `npm audit` +- [ ] Lock file committed: `package-lock.json` + +### Configuration + +- [ ] `.env.example` file created with all variables +- [ ] `.env` file NOT committed to git +- [ ] `.gitignore` includes `.env` and `coverage/` +- [ ] Environment variables documented in README + +### Documentation + +- [ ] README.md updated with new features +- [ ] RESILIENCE.md created and reviewed +- [ ] ARCHITECTURE.md created +- [ ] QUICKSTART.md created +- [ ] API endpoints documented +- [ ] Configuration parameters documented + +## Deployment + +### Environment Setup + +- [ ] Node.js 18+ installed on target environment +- [ ] Environment variables configured +- [ ] Horizon URL verified and accessible +- [ ] Network connectivity to Stellar Horizon tested + +### Configuration Values + +#### Development Environment + +- [ ] `HORIZON_URL=https://horizon-testnet.stellar.org` +- [ ] `CIRCUIT_BREAKER_THRESHOLD=3` (fast feedback) +- [ ] `CIRCUIT_BREAKER_COOLDOWN_MS=10000` (10s) +- [ ] `RETRY_MAX_ATTEMPTS=2` +- [ ] `RETRY_BASE_DELAY_MS=500` + +#### Staging Environment + +- [ ] `HORIZON_URL=https://horizon-testnet.stellar.org` +- [ ] `CIRCUIT_BREAKER_THRESHOLD=5` +- [ ] `CIRCUIT_BREAKER_COOLDOWN_MS=30000` (30s) +- [ ] `RETRY_MAX_ATTEMPTS=3` +- [ ] `RETRY_BASE_DELAY_MS=1000` + +#### Production Environment + +- [ ] `HORIZON_URL=https://horizon.stellar.org` (or custom) +- [ ] `STELLAR_NETWORK=Public Global Stellar Network ; September 2015` +- [ ] 
`CIRCUIT_BREAKER_THRESHOLD=10` (conservative) +- [ ] `CIRCUIT_BREAKER_COOLDOWN_MS=60000` (60s) +- [ ] `RETRY_MAX_ATTEMPTS=5` +- [ ] `RETRY_BASE_DELAY_MS=2000` + +### Build and Deploy + +- [ ] Build succeeds: `npm run build` +- [ ] Build artifacts in `dist/` directory +- [ ] Start script works: `npm start` +- [ ] Server starts without errors +- [ ] Health endpoint responds: `GET /api/health` + +## Post-Deployment Verification + +### Functional Testing + +#### Health Check + +```bash +curl http://your-server:3000/api/health +``` + +- [ ] Returns 200 OK +- [ ] Response: `{"status":"ok","service":"callora-backend"}` + +#### Circuit Breaker Health + +```bash +curl http://your-server:3000/api/deposits/health +``` + +- [ ] Returns 200 OK +- [ ] Response includes circuit breaker state +- [ ] Initial state is `CLOSED` +- [ ] Metrics are initialized + +#### Deposit Transaction (Success Case) + +```bash +curl -X POST http://your-server:3000/api/deposits/build \ + -H "Content-Type: application/json" \ + -d '{ + "sourcePublicKey": "VALID_SOURCE_KEY", + "vaultPublicKey": "VALID_VAULT_KEY", + "amount": "100" + }' +``` + +- [ ] Returns 200 OK with valid keys +- [ ] Response includes `transactionXdr` +- [ ] XDR is valid base64 string + +#### Validation (Error Cases) + +```bash +# Missing fields +curl -X POST http://your-server:3000/api/deposits/build \ + -H "Content-Type: application/json" \ + -d '{}' +``` + +- [ ] Returns 400 Bad Request +- [ ] Error message describes missing fields + +```bash +# Invalid amount +curl -X POST http://your-server:3000/api/deposits/build \ + -H "Content-Type: application/json" \ + -d '{ + "sourcePublicKey": "VALID_KEY", + "vaultPublicKey": "VALID_KEY", + "amount": "-50" + }' +``` + +- [ ] Returns 400 Bad Request +- [ ] Error message describes invalid amount + +### Resilience Testing + +#### Test Retry Mechanism + +1. Configure short retry delays for testing +2. Temporarily use invalid Horizon URL +3. 
Make request and observe logs + +- [ ] Retry attempts logged +- [ ] Exponential backoff delays observed +- [ ] Eventually returns 502 after exhausting retries + +#### Test Circuit Breaker Trip + +1. Configure low threshold (e.g., 2) +2. Use invalid Horizon URL +3. Make multiple requests + +- [ ] First request fails with retry exhaustion +- [ ] Second request fails with retry exhaustion +- [ ] Third request fast-fails with circuit breaker open +- [ ] Health endpoint shows state=OPEN +- [ ] No network calls made after circuit opens + +#### Test Circuit Breaker Recovery + +1. After circuit opens, restore valid Horizon URL +2. Wait for cooldown period +3. Make new request + +- [ ] Circuit transitions to HALF_OPEN +- [ ] Probe request succeeds +- [ ] Circuit transitions to CLOSED +- [ ] Subsequent requests succeed normally + +### Performance Testing + +#### Latency + +- [ ] Successful requests complete in < 2s +- [ ] Failed requests with retry complete in < 10s +- [ ] Fast-fail requests (circuit open) complete in < 100ms + +#### Throughput + +- [ ] Server handles expected request rate +- [ ] No memory leaks under sustained load +- [ ] Circuit breaker doesn't trip under normal load + +### Monitoring Setup + +#### Metrics Collection + +- [ ] Circuit breaker state monitored +- [ ] Failure rate tracked +- [ ] Consecutive failures tracked +- [ ] Response times logged + +#### Alerting + +- [ ] Alert configured for circuit state=OPEN +- [ ] Alert configured for high failure rate (>10%) +- [ ] Alert configured for high consecutive failures (>50% threshold) +- [ ] Alert configured for sustained high latency + +#### Dashboards + +- [ ] Circuit breaker state visualization +- [ ] Request success/failure rate graph +- [ ] Response time histogram +- [ ] Retry attempt distribution + +### Logging + +- [ ] Application logs to appropriate destination +- [ ] Log level configured (INFO for production) +- [ ] Circuit breaker state transitions logged +- [ ] Retry attempts logged +- [ ] 
Errors logged with stack traces +- [ ] No sensitive data in logs + +## Rollback Plan + +### Preparation + +- [ ] Previous version tagged in git +- [ ] Rollback procedure documented +- [ ] Database migrations (if any) are reversible +- [ ] Configuration backup available + +### Rollback Triggers + +Rollback if: + +- [ ] Circuit breaker stuck in OPEN state +- [ ] Excessive false positives +- [ ] Performance degradation +- [ ] Increased error rates +- [ ] Memory leaks detected + +### Rollback Steps + +1. [ ] Stop current deployment +2. [ ] Deploy previous version +3. [ ] Restore previous configuration +4. [ ] Verify health endpoints +5. [ ] Monitor for stability +6. [ ] Document rollback reason + +## Post-Deployment Monitoring + +### First 24 Hours + +- [ ] Monitor circuit breaker state every hour +- [ ] Check failure rates +- [ ] Review error logs +- [ ] Verify no memory leaks +- [ ] Confirm expected throughput + +### First Week + +- [ ] Daily review of metrics +- [ ] Analyze retry patterns +- [ ] Tune thresholds if needed +- [ ] Document any issues +- [ ] Collect feedback from users + +### Ongoing + +- [ ] Weekly metrics review +- [ ] Monthly configuration review +- [ ] Quarterly load testing +- [ ] Update documentation as needed + +## Troubleshooting + +### Circuit Breaker Stuck Open + +**Symptoms:** +- Health endpoint shows state=OPEN +- All requests return 502 +- Cooldown period has elapsed + +**Actions:** +- [ ] Check Horizon URL is correct +- [ ] Verify network connectivity to Horizon +- [ ] Review Horizon service status +- [ ] Check for DNS issues +- [ ] Restart service if necessary + +### Excessive Retries + +**Symptoms:** +- High latency on requests +- Many retry attempts in logs +- Circuit breaker not tripping + +**Actions:** +- [ ] Reduce `RETRY_MAX_ATTEMPTS` +- [ ] Lower `CIRCUIT_BREAKER_THRESHOLD` +- [ ] Investigate root cause of failures +- [ ] Check Horizon service health + +### False Positives + +**Symptoms:** +- Circuit opens during normal operation 
+- Transient failures trip circuit +- Frequent state transitions + +**Actions:** +- [ ] Increase `CIRCUIT_BREAKER_THRESHOLD` +- [ ] Increase `RETRY_MAX_ATTEMPTS` +- [ ] Review failure patterns +- [ ] Adjust retry delays + +## Sign-Off + +### Development Team + +- [ ] Lead Developer: _________________ Date: _______ +- [ ] Backend Engineer: ________________ Date: _______ +- [ ] QA Engineer: ____________________ Date: _______ + +### Operations Team + +- [ ] DevOps Engineer: _________________ Date: _______ +- [ ] SRE: ____________________________ Date: _______ + +### Product Team + +- [ ] Product Manager: _________________ Date: _______ +- [ ] Technical Lead: __________________ Date: _______ + +## Notes + +Use this section to document any deployment-specific notes, issues encountered, or deviations from the standard process: + +``` +Date: ___________ +Notes: + + + + +``` + +--- + +## Quick Reference + +### Useful Commands + +```bash +# Check health +curl http://localhost:3000/api/health + +# Check circuit breaker +curl http://localhost:3000/api/deposits/health + +# View logs +tail -f logs/app.log + +# Check process +ps aux | grep node + +# Restart service +npm run build && npm start +``` + +### Configuration Quick Reference + +| Environment | Threshold | Cooldown | Retries | +|-------------|-----------|----------|---------| +| Development | 3 | 10s | 2 | +| Staging | 5 | 30s | 3 | +| Production | 10 | 60s | 5 | + +### Support Contacts + +- Development Team: dev-team@example.com +- Operations Team: ops-team@example.com +- On-Call: oncall@example.com +- Escalation: escalation@example.com diff --git a/QUICKSTART.md b/QUICKSTART.md new file mode 100644 index 0000000..f2d01ad --- /dev/null +++ b/QUICKSTART.md @@ -0,0 +1,265 @@ +# Quick Start Guide + +Get the Callora backend running in 5 minutes. 
+ +## Prerequisites + +- Node.js 18+ installed +- npm or yarn package manager +- (Optional) Stellar account for testing + +## Installation + +```bash +# Clone the repository +git clone <repository-url> +cd callora-backend + +# Install dependencies +npm install + +# Copy environment template +cp .env.example .env +``` + +## Running the Server + +### Development Mode + +```bash +npm run dev +``` + +Server starts at http://localhost:3000 + +### Production Mode + +```bash +npm run build +npm start +``` + +## Testing the API + +### Health Check + +```bash +curl http://localhost:3000/api/health +``` + +Expected response: +```json +{ + "status": "ok", + "service": "callora-backend" +} +``` + +### Circuit Breaker Health + +```bash +curl http://localhost:3000/api/deposits/health +``` + +Expected response: +```json +{ + "success": true, + "circuitBreaker": { + "state": "CLOSED", + "consecutiveFailures": 0, + "totalSuccesses": 0 + } +} +``` + +### Build Deposit Transaction + +```bash +curl -X POST http://localhost:3000/api/deposits/build \ + -H "Content-Type: application/json" \ + -d '{ + "sourcePublicKey": "GABC123...", + "vaultPublicKey": "GDEF456...", + "amount": "100" + }' +``` + +**Note:** Use valid Stellar public keys. 
You can generate test keys at: +https://laboratory.stellar.org/#account-creator?network=test + +## Running Tests + +```bash +# Run all tests +npm test + +# Run with coverage +npm test -- --coverage + +# Run specific test file +npm test -- retry.test.ts +``` + +## Common Configuration + +### Use Stellar Testnet (Default) + +```bash +# .env +HORIZON_URL=https://horizon-testnet.stellar.org +STELLAR_NETWORK=Test SDF Network ; September 2015 +``` + +### Use Stellar Mainnet + +```bash +# .env +HORIZON_URL=https://horizon.stellar.org +STELLAR_NETWORK=Public Global Stellar Network ; September 2015 +``` + +### Fast Development Settings + +For faster feedback during development: + +```bash +# .env +CIRCUIT_BREAKER_THRESHOLD=2 +CIRCUIT_BREAKER_COOLDOWN_MS=5000 +RETRY_MAX_ATTEMPTS=2 +RETRY_BASE_DELAY_MS=500 +``` + +### Conservative Production Settings + +For production deployment: + +```bash +# .env +CIRCUIT_BREAKER_THRESHOLD=10 +CIRCUIT_BREAKER_COOLDOWN_MS=60000 +RETRY_MAX_ATTEMPTS=5 +RETRY_BASE_DELAY_MS=2000 +``` + +## Testing Circuit Breaker + +### Trigger Circuit Breaker Open + +1. Configure low threshold: + ```bash + export CIRCUIT_BREAKER_THRESHOLD=2 + export RETRY_MAX_ATTEMPTS=1 + ``` + +2. Use invalid Horizon URL: + ```bash + export HORIZON_URL=http://invalid-horizon.example.com + ``` + +3. Restart server: + ```bash + npm run dev + ``` + +4. 
Make multiple requests: + ```bash + # First request (fails) + curl -X POST http://localhost:3000/api/deposits/build \ + -H "Content-Type: application/json" \ + -d '{"sourcePublicKey":"GABC","vaultPublicKey":"GDEF","amount":"100"}' + + # Second request (fails, trips circuit) + curl -X POST http://localhost:3000/api/deposits/build \ + -H "Content-Type: application/json" \ + -d '{"sourcePublicKey":"GABC","vaultPublicKey":"GDEF","amount":"100"}' + + # Third request (fast-fails with 502) + curl -X POST http://localhost:3000/api/deposits/build \ + -H "Content-Type: application/json" \ + -d '{"sourcePublicKey":"GABC","vaultPublicKey":"GDEF","amount":"100"}' + ``` + +5. Check circuit breaker state: + ```bash + curl http://localhost:3000/api/deposits/health + ``` + + Expected response: + ```json + { + "circuitBreaker": { + "state": "OPEN", + "consecutiveFailures": 2 + } + } + ``` + +## Next Steps + +- Read [RESILIENCE.md](./RESILIENCE.md) for detailed resilience patterns documentation +- Review [README.md](./README.md) for complete API documentation +- Explore test files for usage examples +- Configure environment variables for your deployment + +## Troubleshooting + +### Port Already in Use + +```bash +# Change port in .env +PORT=3001 +``` + +Or kill the process using port 3000: + +```bash +# Windows +netstat -ano | findstr :3000 +taskkill /PID <PID> /F + +# Linux/Mac +lsof -ti:3000 | xargs kill -9 +``` + +### Module Not Found Errors + +```bash +# Clean install +rm -rf node_modules package-lock.json +npm install +``` + +### TypeScript Errors + +```bash +# Check types without building +npm run typecheck + +# Clean build +rm -rf dist +npm run build +``` + +### Test Failures + +```bash +# Clear Jest cache +npm test -- --clearCache + +# Run tests in verbose mode +npm test -- --verbose +``` + +## Support + +For issues or questions: +1. Check existing documentation +2. Review test files for examples +3. Open an issue on GitHub +4. 
Contact the development team + +## License + +[Your License Here] diff --git a/README.md b/README.md index e06e284..3347d66 100644 --- a/README.md +++ b/README.md @@ -12,6 +12,8 @@ API gateway, usage metering, and billing services for the Callora API marketplac - **Node.js** + **TypeScript** - **Express** for HTTP API +- **Stellar SDK** for Horizon integration +- **Circuit Breaker & Retry Patterns** for resilience - Planned: Horizon listener, PostgreSQL, billing engine ## What's included @@ -83,6 +85,17 @@ The request requires developer auth via `Authorization: Bearer ...` or `x-user-i - The in-memory store factories are still available for unit tests and isolated local scenarios. - Apply `migrations/001_create_usage_events.sql`, `migrations/002_create_settlements.sql`, `migrations/003_create_revenue_ledger.sql`, and `migrations/005_add_persistent_store_columns.sql` before starting the API against PostgreSQL. +## Resilience Features + +The backend implements production-grade resilience patterns for Stellar Horizon network calls: + +- ✅ **Bounded Retry with Exponential Backoff** - Automatically retries transient failures +- ✅ **Circuit Breaker Pattern** - Fast-fails during outages to prevent resource exhaustion +- ✅ **Graceful Degradation** - Maps upstream failures to appropriate HTTP status codes (502) +- ✅ **Health Monitoring** - Exposes circuit breaker metrics for observability + +See [RESILIENCE.md](./RESILIENCE.md) for detailed documentation. + ## Local setup 1. **Prerequisites:** Node.js 18+ @@ -91,6 +104,18 @@ The request requires developer auth via `Authorization: Bearer ...` or `x-user-i ```bash cd callora-backend npm install + ``` + +3. **Configure environment (optional):** + + ```bash + cp .env.example .env + # Edit .env with your configuration + ``` + +4. 
**Run in development mode:** + + ```bash npm run dev ``` @@ -151,7 +176,65 @@ callora-backend/ |-- tsconfig.json ``` -## Environment +## Environment Variables + +| Variable | Description | Default | +|----------|-------------|---------| +| `PORT` | HTTP port | `3000` | +| `HORIZON_URL` | Stellar Horizon endpoint | `https://horizon-testnet.stellar.org` | +| `STELLAR_BASE_FEE` | Transaction base fee (stroops) | `100` | +| `STELLAR_TRANSACTION_TIMEOUT` | Transaction timeout (seconds) | `30` | +| `CIRCUIT_BREAKER_THRESHOLD` | Failures before opening circuit | `5` | +| `CIRCUIT_BREAKER_COOLDOWN_MS` | Cooldown period (ms) | `30000` | +| `RETRY_MAX_ATTEMPTS` | Maximum retry attempts | `3` | +| `RETRY_BASE_DELAY_MS` | Initial retry delay (ms) | `1000` | + +See `.env.example` for complete configuration options. + +## Testing + +Run the test suite: + +```bash +npm test +``` + +Run with coverage: + +```bash +npm test -- --coverage +``` + +The test suite includes: +- Unit tests for retry mechanism +- Unit tests for circuit breaker +- Integration tests for transaction builder +- HTTP integration tests for controllers +- Mock Horizon responses for various scenarios + +**Target Coverage:** 90%+ line coverage + +## Troubleshooting + +### Circuit Breaker Stuck Open + +If the circuit breaker remains open: + +1. Check `/api/deposits/health` to see current state +2. Verify `HORIZON_URL` is correct and accessible +3. Wait for cooldown period to elapse +4. Restart service to reset circuit breaker + +### High Latency + +If experiencing high latency: + +1. Reduce `RETRY_MAX_ATTEMPTS` +2. Lower `CIRCUIT_BREAKER_THRESHOLD` to fail faster +3. Check Horizon service status +4. Review logs for retry patterns + +See [RESILIENCE.md](./RESILIENCE.md) for detailed troubleshooting guide. Copy `.env.example` to `.env` and fill in your values before running locally: @@ -282,4 +365,6 @@ Notes: - When omitted, the route defaults `network` to `testnet`. 
- Invalid values are rejected consistently with a `400` validation response. -This repo is part of [Callora](https://github.com/your-org/callora). Frontend: `callora-frontend`. Contracts: `callora-contracts`. +This repo is part of [Callora](https://github.com/your-org/callora): +- Frontend: `callora-frontend` +- Contracts: `callora-contracts` diff --git a/RESILIENCE.md b/RESILIENCE.md new file mode 100644 index 0000000..a094aa1 --- /dev/null +++ b/RESILIENCE.md @@ -0,0 +1,454 @@ +# Resilience Patterns Documentation + +This document describes the circuit breaker and retry mechanisms implemented for Stellar Horizon network calls. + +## Overview + +The Callora backend implements two key resilience patterns to handle transient failures and prevent cascading failures when interacting with the Stellar Horizon network: + +1. **Bounded Retry with Exponential Backoff** - Automatically retries failed operations with increasing delays +2. **Circuit Breaker Pattern** - Prevents resource exhaustion by fast-failing when services are unavailable + +## Architecture + +### Circuit Breaker State Machine + +The circuit breaker operates in three states: + +``` +┌─────────┐ +│ CLOSED │ ◄─────────────────────────┐ +│ (Normal)│ │ +└────┬────┘ │ + │ │ + │ Failures ≥ Threshold │ Success in HALF_OPEN + │ │ + ▼ │ +┌─────────┐ ┌────┴────────┐ +│ OPEN │──────────────────────►│ HALF_OPEN │ +│(Failing)│ After Cooldown │ (Testing) │ +└─────────┘ └─────────────┘ + │ │ + │ │ + └─────────────────────────────────┘ + Failure in HALF_OPEN +``` + +#### State Descriptions + +**CLOSED (Normal Operation)** +- All requests pass through to Horizon +- Failures increment a counter; successes reset it +- Transitions to OPEN when consecutive failures reach the threshold + +**OPEN (Fast-Fail Mode)** +- All requests immediately fail with `CircuitBreakerOpenError` +- No requests are sent to Horizon (protects downstream services) +- After cooldown period, transitions to HALF_OPEN + +**HALF_OPEN (Recovery Testing)** +- 
Allows a single probe request through +- Success → transition back to CLOSED +- Failure → return to OPEN and reset cooldown timer + +### Retry Mechanism + +The retry mechanism implements exponential backoff with jitter: + +**Formula:** `delay = min(baseDelay × 2^attempt, maxDelay) × (1 ± jitter)` + +**Example with defaults:** +- Attempt 1: Immediate +- Attempt 2: ~1000ms (1s ± 30%) +- Attempt 3: ~2000ms (2s ± 30%) + +**Benefits:** +- Exponential backoff reduces load on failing services +- Jitter prevents thundering herd problem +- Bounded delays prevent indefinite waiting + +## Configuration + +### Environment Variables + +| Variable | Description | Default | +|----------|-------------|---------| +| `HORIZON_URL` | Stellar Horizon endpoint | `https://horizon-testnet.stellar.org` | +| `STELLAR_BASE_FEE` | Transaction base fee (stroops) | `100` | +| `STELLAR_TRANSACTION_TIMEOUT` | Transaction timeout (seconds) | `30` | +| `CIRCUIT_BREAKER_THRESHOLD` | Failures before opening circuit | `5` | +| `CIRCUIT_BREAKER_COOLDOWN_MS` | Cooldown period (milliseconds) | `30000` (30s) | +| `RETRY_MAX_ATTEMPTS` | Maximum retry attempts | `3` | +| `RETRY_BASE_DELAY_MS` | Initial retry delay (milliseconds) | `1000` (1s) | + +### Example Configuration + +**Development (Fast Recovery):** +```bash +CIRCUIT_BREAKER_THRESHOLD=3 +CIRCUIT_BREAKER_COOLDOWN_MS=10000 +RETRY_MAX_ATTEMPTS=2 +RETRY_BASE_DELAY_MS=500 +``` + +**Production (Conservative):** +```bash +CIRCUIT_BREAKER_THRESHOLD=10 +CIRCUIT_BREAKER_COOLDOWN_MS=60000 +RETRY_MAX_ATTEMPTS=5 +RETRY_BASE_DELAY_MS=2000 +``` + +## API Endpoints + +### POST /api/deposits/build + +Build a vault deposit transaction with resilience patterns. 
+ +**Request:** +```json +{ + "sourcePublicKey": "GSOURCE123...", + "vaultPublicKey": "GVAULT456...", + "amount": "100.5" +} +``` + +**Success Response (200):** +```json +{ + "success": true, + "transactionXdr": "AAAAA...ZZZZZ" +} +``` + +**Error Responses:** + +**400 Bad Request** - Invalid input +```json +{ + "success": false, + "error": "Invalid request body. Required fields: sourcePublicKey, vaultPublicKey, amount" +} +``` + +**502 Bad Gateway** - Circuit breaker open or retries exhausted +```json +{ + "success": false, + "error": "Stellar Horizon service is currently unavailable. Circuit breaker is open. Please try again later." +} +``` + +**500 Internal Server Error** - Unexpected error +```json +{ + "success": false, + "error": "Internal server error" +} +``` + +### GET /api/deposits/health + +Get circuit breaker health metrics. + +**Response (200):** +```json +{ + "success": true, + "circuitBreaker": { + "state": "CLOSED", + "consecutiveFailures": 0, + "consecutiveSuccesses": 5, + "totalFailures": 2, + "totalSuccesses": 10, + "lastFailureTime": null, + "lastStateChange": 1234567890 + } +} +``` + +## Error Handling + +### Error Types + +**CircuitBreakerOpenError** +- Thrown when circuit breaker is in OPEN state +- Mapped to HTTP 502 Bad Gateway +- Indicates upstream service is unavailable + +**RetryExhaustedError** +- Thrown when all retry attempts fail +- Mapped to HTTP 502 Bad Gateway +- Contains attempt count and last error + +**BadRequestError** +- Thrown for invalid client input +- Mapped to HTTP 400 Bad Request +- Validation errors + +### Error Flow + +``` +Horizon Call + │ + ├─► Success ──────────────────────► Return Result + │ + └─► Failure + │ + ├─► Retry (with backoff) + │ │ + │ ├─► Success ─────────────► Return Result + │ │ + │ └─► Max Retries ─────────► RetryExhaustedError → 502 + │ + └─► Circuit Breaker Check + │ + ├─► CLOSED ──────────────► Continue + │ + ├─► HALF_OPEN ───────────► Allow Probe + │ + └─► OPEN ────────────────► 
CircuitBreakerOpenError → 502 +``` + +## Monitoring + +### Key Metrics to Monitor + +1. **Circuit Breaker State** + - Alert when state transitions to OPEN + - Track time spent in each state + +2. **Failure Rates** + - `totalFailures / (totalFailures + totalSuccesses)` + - Alert on sustained high failure rates + +3. **Consecutive Failures** + - Early warning before circuit opens + - Alert at 50% of threshold + +4. **Retry Attempts** + - Track average retries per request + - High retry counts indicate instability + +### Health Check Integration + +Poll `/api/deposits/health` to monitor circuit breaker state: + +```bash +curl http://localhost:3000/api/deposits/health +``` + +**Healthy Response:** +```json +{ + "circuitBreaker": { + "state": "CLOSED", + "consecutiveFailures": 0 + } +} +``` + +**Degraded Response:** +```json +{ + "circuitBreaker": { + "state": "OPEN", + "consecutiveFailures": 5, + "lastFailureTime": 1234567890 + } +} +``` + +## Testing + +### Running Tests + +```bash +# Run all tests +npm test + +# Run with coverage +npm test -- --coverage + +# Run specific test suite +npm test -- retry.test.ts +npm test -- circuitBreaker.test.ts +npm test -- transactionBuilder.test.ts +npm test -- depositController.test.ts +``` + +### Test Coverage + +The implementation includes comprehensive tests covering: + +- ✅ Successful operations on first attempt +- ✅ Transient failures with successful retry +- ✅ Persistent failures exhausting retries +- ✅ Circuit breaker state transitions +- ✅ Fast-fail behavior when circuit is open +- ✅ Recovery after cooldown period +- ✅ HTTP error mapping (400, 502, 500) +- ✅ Request validation +- ✅ Concurrent operations + +**Target Coverage:** 90%+ line coverage + +### Manual Testing + +**Test Circuit Breaker Trip:** + +1. Configure low threshold: + ```bash + export CIRCUIT_BREAKER_THRESHOLD=2 + export RETRY_MAX_ATTEMPTS=1 + ``` + +2. 
Make requests with invalid Horizon URL: + ```bash + export HORIZON_URL=http://invalid-horizon.example.com + ``` + +3. Send multiple requests: + ```bash + curl -X POST http://localhost:3000/api/deposits/build \ + -H "Content-Type: application/json" \ + -d '{ + "sourcePublicKey": "GSOURCE...", + "vaultPublicKey": "GVAULT...", + "amount": "100" + }' + ``` + +4. Observe circuit breaker open after threshold failures + +5. Check health endpoint: + ```bash + curl http://localhost:3000/api/deposits/health + ``` + +## Best Practices + +### When to Adjust Configuration + +**Increase Threshold** when: +- Experiencing frequent false positives +- Network is inherently unstable but recovers quickly +- Cost of circuit opening is high + +**Decrease Threshold** when: +- Failures cascade to other services +- Recovery time is long +- Want faster failure detection + +**Increase Cooldown** when: +- Service takes long to recover +- Want to reduce probe frequency +- Avoiding premature recovery attempts + +**Decrease Cooldown** when: +- Service recovers quickly +- Want faster recovery +- Acceptable to probe more frequently + +### Production Recommendations + +1. **Start Conservative** + - Higher thresholds (8-10 failures) + - Longer cooldowns (60s) + - More retry attempts (4-5) + +2. **Monitor and Tune** + - Collect metrics for 1-2 weeks + - Analyze failure patterns + - Adjust based on actual behavior + +3. **Alert Configuration** + - Alert on circuit OPEN state + - Alert on sustained high failure rates + - Alert on retry exhaustion + +4. **Graceful Degradation** + - Cache recent successful responses + - Provide fallback values when possible + - Clear user communication during outages + +## Troubleshooting + +### Circuit Breaker Stuck Open + +**Symptoms:** Circuit remains OPEN despite service recovery + +**Solutions:** +1. Check whether the cooldown period has elapsed (the circuit only moves to HALF_OPEN on the first request made after the cooldown) +2. Verify Horizon URL is correct +3. Test Horizon connectivity directly +4. Review logs for underlying errors +5. 
Manually reset if necessary (restart service) + +### Excessive Retries + +**Symptoms:** High latency, many retry attempts + +**Solutions:** +1. Reduce `RETRY_MAX_ATTEMPTS` +2. Increase `RETRY_BASE_DELAY_MS` +3. Lower `CIRCUIT_BREAKER_THRESHOLD` to fail faster +4. Investigate root cause of failures + +### False Positives + +**Symptoms:** Circuit opens during normal operation + +**Solutions:** +1. Increase `CIRCUIT_BREAKER_THRESHOLD` +2. Review failure patterns (are they truly transient?) +3. Improve retry logic for specific error types +4. Consider separate circuits for different operations + +## Implementation Details + +### File Structure + +``` +src/ +├── lib/ +│ ├── errors.ts # Custom error classes +│ ├── retry.ts # Retry mechanism +│ ├── retry.test.ts # Retry tests +│ ├── circuitBreaker.ts # Circuit breaker implementation +│ └── circuitBreaker.test.ts # Circuit breaker tests +├── services/ +│ ├── transactionBuilder.ts # Stellar transaction builder +│ └── transactionBuilder.test.ts # Transaction builder tests +├── controllers/ +│ ├── depositController.ts # Deposit API controller +│ └── depositController.test.ts # Controller tests +└── index.ts # Express app with routes +``` + +### Key Functions + +**`withRetry(operation, config)`** +- Wraps async operations with retry logic +- Returns result or throws `RetryExhaustedError` + +**`CircuitBreaker.execute(operation)`** +- Wraps operations with circuit breaker +- Manages state transitions +- Throws `CircuitBreakerOpenError` when open + +**`StellarTransactionBuilder.loadAccount(publicKey)`** +- Loads account from Horizon with resilience +- Combines retry + circuit breaker + +**`buildDepositTransaction(req, res, next)`** +- Express controller for deposit endpoint +- Maps errors to appropriate HTTP status codes + +## References + +- [Circuit Breaker Pattern - Martin Fowler](https://martinfowler.com/bliki/CircuitBreaker.html) +- [Exponential Backoff - AWS Architecture 
Blog](https://aws.amazon.com/blogs/architecture/exponential-backoff-and-jitter/) +- [Stellar Horizon API](https://developers.stellar.org/api/horizon) +- [Resilience Patterns - Microsoft Azure](https://docs.microsoft.com/en-us/azure/architecture/patterns/category/resiliency) diff --git a/src/lib/circuitBreaker.test.ts b/src/lib/circuitBreaker.test.ts new file mode 100644 index 0000000..fbf435e --- /dev/null +++ b/src/lib/circuitBreaker.test.ts @@ -0,0 +1,304 @@ +/** + * Unit tests for circuit breaker pattern implementation. + */ + +import { CircuitBreaker, CircuitBreakerState } from './circuitBreaker.js'; +import { CircuitBreakerOpenError } from './errors.js'; + +describe('Circuit Breaker', () => { + beforeEach(() => { + jest.clearAllMocks(); + }); + + describe('State transitions', () => { + it('should start in CLOSED state', () => { + const breaker = new CircuitBreaker(); + expect(breaker.getState()).toBe(CircuitBreakerState.CLOSED); + }); + + it('should transition to OPEN after threshold failures', async () => { + const breaker = new CircuitBreaker({ failureThreshold: 3 }); + const operation = jest.fn().mockRejectedValue(new Error('Failure')); + + // Execute failures up to threshold + for (let i = 0; i < 3; i++) { + await expect(breaker.execute(operation)).rejects.toThrow('Failure'); + } + + expect(breaker.getState()).toBe(CircuitBreakerState.OPEN); + expect(operation).toHaveBeenCalledTimes(3); + }); + + it('should fast-fail when OPEN', async () => { + const breaker = new CircuitBreaker({ failureThreshold: 2, cooldownMs: 5000 }); + const operation = jest.fn().mockRejectedValue(new Error('Failure')); + + // Trip the breaker + await breaker.execute(operation).catch(() => {}); + await breaker.execute(operation).catch(() => {}); + + expect(breaker.getState()).toBe(CircuitBreakerState.OPEN); + + // Should fast-fail without calling operation + await expect(breaker.execute(operation)).rejects.toThrow(CircuitBreakerOpenError); + expect(operation).toHaveBeenCalledTimes(2); 
// Not called again + }); + + it('should transition to HALF_OPEN after cooldown', async () => { + jest.useFakeTimers(); + + const breaker = new CircuitBreaker({ failureThreshold: 2, cooldownMs: 5000 }); + const operation = jest.fn().mockRejectedValue(new Error('Failure')); + + // Trip the breaker + await breaker.execute(operation).catch(() => {}); + await breaker.execute(operation).catch(() => {}); + + expect(breaker.getState()).toBe(CircuitBreakerState.OPEN); + + // Advance time past cooldown + jest.advanceTimersByTime(5000); + + // Next execution should transition to HALF_OPEN + const successOp = jest.fn().mockResolvedValue('success'); + await breaker.execute(successOp); + + expect(breaker.getState()).toBe(CircuitBreakerState.CLOSED); + + jest.useRealTimers(); + }); + + it('should transition back to CLOSED on success in HALF_OPEN', async () => { + jest.useFakeTimers(); + + const breaker = new CircuitBreaker({ failureThreshold: 2, cooldownMs: 1000 }); + const failOp = jest.fn().mockRejectedValue(new Error('Failure')); + + // Trip the breaker + await breaker.execute(failOp).catch(() => {}); + await breaker.execute(failOp).catch(() => {}); + + expect(breaker.getState()).toBe(CircuitBreakerState.OPEN); + + // Wait for cooldown + jest.advanceTimersByTime(1000); + + // Successful probe should close the circuit + const successOp = jest.fn().mockResolvedValue('success'); + await breaker.execute(successOp); + + expect(breaker.getState()).toBe(CircuitBreakerState.CLOSED); + + jest.useRealTimers(); + }); + + it('should return to OPEN on failure in HALF_OPEN', async () => { + jest.useFakeTimers(); + + const breaker = new CircuitBreaker({ failureThreshold: 2, cooldownMs: 1000 }); + const failOp = jest.fn().mockRejectedValue(new Error('Failure')); + + // Trip the breaker + await breaker.execute(failOp).catch(() => {}); + await breaker.execute(failOp).catch(() => {}); + + expect(breaker.getState()).toBe(CircuitBreakerState.OPEN); + + // Wait for cooldown + 
jest.advanceTimersByTime(1000); + + // Failed probe should return to OPEN + await breaker.execute(failOp).catch(() => {}); + + expect(breaker.getState()).toBe(CircuitBreakerState.OPEN); + + jest.useRealTimers(); + }); + + it('should reset consecutive failures on success in CLOSED', async () => { + const breaker = new CircuitBreaker({ failureThreshold: 3 }); + const failOp = jest.fn().mockRejectedValue(new Error('Failure')); + const successOp = jest.fn().mockResolvedValue('success'); + + // Two failures + await breaker.execute(failOp).catch(() => {}); + await breaker.execute(failOp).catch(() => {}); + + expect(breaker.getState()).toBe(CircuitBreakerState.CLOSED); + + // Success resets counter + await breaker.execute(successOp); + + // Two more failures shouldn't trip (counter was reset) + await breaker.execute(failOp).catch(() => {}); + await breaker.execute(failOp).catch(() => {}); + + expect(breaker.getState()).toBe(CircuitBreakerState.CLOSED); + }); + }); + + describe('Metrics', () => { + it('should track success and failure counts', async () => { + const breaker = new CircuitBreaker(); + const successOp = jest.fn().mockResolvedValue('success'); + const failOp = jest.fn().mockRejectedValue(new Error('Failure')); + + await breaker.execute(successOp); + await breaker.execute(successOp); + await breaker.execute(failOp).catch(() => {}); + + const metrics = breaker.getMetrics(); + + expect(metrics.totalSuccesses).toBe(2); + expect(metrics.totalFailures).toBe(1); + expect(metrics.consecutiveSuccesses).toBe(0); + expect(metrics.consecutiveFailures).toBe(1); + }); + + it('should track last failure time', async () => { + const breaker = new CircuitBreaker(); + const failOp = jest.fn().mockRejectedValue(new Error('Failure')); + + const beforeTime = Date.now(); + await breaker.execute(failOp).catch(() => {}); + const afterTime = Date.now(); + + const metrics = breaker.getMetrics(); + + expect(metrics.lastFailureTime).toBeGreaterThanOrEqual(beforeTime); + 
expect(metrics.lastFailureTime).toBeLessThanOrEqual(afterTime); + }); + + it('should track state changes', async () => { + const breaker = new CircuitBreaker({ failureThreshold: 2 }); + const failOp = jest.fn().mockRejectedValue(new Error('Failure')); + + const initialMetrics = breaker.getMetrics(); + const initialStateChange = initialMetrics.lastStateChange; + + // Trip the breaker + await breaker.execute(failOp).catch(() => {}); + await breaker.execute(failOp).catch(() => {}); + + const finalMetrics = breaker.getMetrics(); + + expect(finalMetrics.state).toBe(CircuitBreakerState.OPEN); + // Construction and the trip can share one millisecond, so equality is valid + expect(finalMetrics.lastStateChange).toBeGreaterThanOrEqual(initialStateChange); + }); + }); + + describe('Configuration', () => { + it('should use custom failure threshold', async () => { + const breaker = new CircuitBreaker({ failureThreshold: 5 }); + const failOp = jest.fn().mockRejectedValue(new Error('Failure')); + + // 4 failures shouldn't trip + for (let i = 0; i < 4; i++) { + await breaker.execute(failOp).catch(() => {}); + } + + expect(breaker.getState()).toBe(CircuitBreakerState.CLOSED); + + // 5th failure should trip + await breaker.execute(failOp).catch(() => {}); + expect(breaker.getState()).toBe(CircuitBreakerState.OPEN); + }); + + it('should use custom cooldown period', async () => { + jest.useFakeTimers(); + + const breaker = new CircuitBreaker({ failureThreshold: 1, cooldownMs: 10000 }); + const failOp = jest.fn().mockRejectedValue(new Error('Failure')); + + // Trip the breaker + await breaker.execute(failOp).catch(() => {}); + expect(breaker.getState()).toBe(CircuitBreakerState.OPEN); + + // Advance time less than cooldown + jest.advanceTimersByTime(5000); + + // Should still be open + await expect(breaker.execute(failOp)).rejects.toThrow(CircuitBreakerOpenError); + + // Advance past cooldown + jest.advanceTimersByTime(5000); + + // Should allow probe + const successOp = jest.fn().mockResolvedValue('success'); + await breaker.execute(successOp); + + 
expect(breaker.getState()).toBe(CircuitBreakerState.CLOSED); + + jest.useRealTimers(); + }); + + it('should use custom success threshold in HALF_OPEN', async () => { + jest.useFakeTimers(); + + const breaker = new CircuitBreaker({ + failureThreshold: 2, + cooldownMs: 1000, + successThreshold: 2, + }); + const failOp = jest.fn().mockRejectedValue(new Error('Failure')); + const successOp = jest.fn().mockResolvedValue('success'); + + // Trip the breaker + await breaker.execute(failOp).catch(() => {}); + await breaker.execute(failOp).catch(() => {}); + + // Wait for cooldown + jest.advanceTimersByTime(1000); + + // First success shouldn't close + await breaker.execute(successOp); + expect(breaker.getState()).toBe(CircuitBreakerState.HALF_OPEN); + + // Second success should close + await breaker.execute(successOp); + expect(breaker.getState()).toBe(CircuitBreakerState.CLOSED); + + jest.useRealTimers(); + }); + }); + + describe('Reset functionality', () => { + it('should reset to CLOSED state', async () => { + const breaker = new CircuitBreaker({ failureThreshold: 2 }); + const failOp = jest.fn().mockRejectedValue(new Error('Failure')); + + // Trip the breaker + await breaker.execute(failOp).catch(() => {}); + await breaker.execute(failOp).catch(() => {}); + + expect(breaker.getState()).toBe(CircuitBreakerState.OPEN); + + // Reset + breaker.reset(); + + expect(breaker.getState()).toBe(CircuitBreakerState.CLOSED); + + // Should accept operations again + const successOp = jest.fn().mockResolvedValue('success'); + await expect(breaker.execute(successOp)).resolves.toBe('success'); + }); + }); + + describe('Concurrent operations', () => { + it('should handle concurrent operations correctly', async () => { + const breaker = new CircuitBreaker({ failureThreshold: 3 }); + const successOp = jest.fn().mockResolvedValue('success'); + + // Execute multiple operations concurrently + const promises = Array(10) + .fill(null) + .map(() => breaker.execute(successOp)); + + const results = 
await Promise.all(promises); + + expect(results).toHaveLength(10); + expect(results.every((r) => r === 'success')).toBe(true); + expect(breaker.getState()).toBe(CircuitBreakerState.CLOSED); + }); + }); +}); diff --git a/src/lib/circuitBreaker.ts b/src/lib/circuitBreaker.ts new file mode 100644 index 0000000..5c29e22 --- /dev/null +++ b/src/lib/circuitBreaker.ts @@ -0,0 +1,188 @@ +/** + * Circuit Breaker pattern implementation for protecting against cascading failures. + * + * States: + * - CLOSED: Normal operation, requests pass through + * - OPEN: Fast-fail mode, requests immediately rejected + * - HALF_OPEN: Testing recovery, single probe request allowed + * + * Configuration: + * - failureThreshold: Consecutive failures before opening (default: 5) + * - cooldownMs: Time in OPEN state before attempting recovery (default: 30000) + * - successThreshold: Consecutive successes in HALF_OPEN to close (default: 1) + */ + +import { CircuitBreakerOpenError } from './errors.js'; + +export enum CircuitBreakerState { + CLOSED = 'CLOSED', + OPEN = 'OPEN', + HALF_OPEN = 'HALF_OPEN', +} + +export interface CircuitBreakerConfig { + failureThreshold?: number; + cooldownMs?: number; + successThreshold?: number; +} + +export interface CircuitBreakerMetrics { + state: CircuitBreakerState; + consecutiveFailures: number; + consecutiveSuccesses: number; + totalFailures: number; + totalSuccesses: number; + lastFailureTime: number | null; + lastStateChange: number; +} + +const DEFAULT_CONFIG: Required = { + failureThreshold: 5, + cooldownMs: 30000, + successThreshold: 1, +}; + +/** + * Circuit Breaker implementation with automatic state management. 
/**
 * Circuit breaker that wraps async operations and fails fast while a
 * dependency is unhealthy.
 *
 * - CLOSED: operations run; the breaker opens once consecutive failures reach
 *   `failureThreshold`.
 * - OPEN: `execute` rejects with `CircuitBreakerOpenError` until `cooldownMs`
 *   has passed since the last recorded failure.
 * - HALF_OPEN: operations run again as probes; `successThreshold` consecutive
 *   successes close the breaker, and any failure re-opens it.
 *
 * NOTE(review): HALF_OPEN does not cap in-flight probes. Every caller that
 * arrives after the cooldown reaches the operation until a result changes the
 * state.
 */
export class CircuitBreaker {
  private state: CircuitBreakerState = CircuitBreakerState.CLOSED;
  private consecutiveFailures = 0;
  private consecutiveSuccesses = 0;
  private totalFailures = 0;
  private totalSuccesses = 0;
  private lastFailureTime: number | null = null;
  private lastStateChange: number = Date.now();
  private readonly config: Required<CircuitBreakerConfig>;

  /**
   * @param config - Optional overrides. A field that is omitted or explicitly
   *   `undefined` falls back to the module default.
   */
  constructor(config: CircuitBreakerConfig = {}) {
    // Resolve each field with `??` instead of spreading `config` over the
    // defaults: a spread copies explicit `undefined` values, which would make
    // every threshold comparison false and leave the breaker unable to open.
    this.config = {
      failureThreshold: config.failureThreshold ?? DEFAULT_CONFIG.failureThreshold,
      cooldownMs: config.cooldownMs ?? DEFAULT_CONFIG.cooldownMs,
      successThreshold: config.successThreshold ?? DEFAULT_CONFIG.successThreshold,
    };
  }

  /**
   * Execute an operation through the circuit breaker.
   *
   * @param operation - Async function to execute
   * @returns Result of the operation
   * @throws CircuitBreakerOpenError if the circuit is open and still cooling down
   * @throws Whatever `operation` rejects with (after the failure is recorded)
   */
  async execute<T>(operation: () => Promise<T>): Promise<T> {
    if (this.state === CircuitBreakerState.OPEN) {
      // The cooldown is measured from the most recent recorded failure.
      const timeSinceFailure = Date.now() - (this.lastFailureTime ?? 0);
      if (timeSinceFailure < this.config.cooldownMs) {
        throw new CircuitBreakerOpenError(
          `Circuit breaker is open. Cooldown remaining: ${
            this.config.cooldownMs - timeSinceFailure
          }ms`
        );
      }
      // Cooldown elapsed: let this call through as a recovery probe.
      this.transitionTo(CircuitBreakerState.HALF_OPEN);
    }

    try {
      const result = await operation();
      this.onSuccess();
      return result;
    } catch (error) {
      this.onFailure();
      throw error;
    }
  }

  /**
   * Record a success; closes the breaker once enough probes succeed in HALF_OPEN.
   */
  private onSuccess(): void {
    this.totalSuccesses++;
    this.consecutiveFailures = 0;
    this.consecutiveSuccesses++;

    if (
      this.state === CircuitBreakerState.HALF_OPEN &&
      this.consecutiveSuccesses >= this.config.successThreshold
    ) {
      this.transitionTo(CircuitBreakerState.CLOSED);
      this.consecutiveSuccesses = 0;
    }
  }

  /**
   * Record a failure; opens the breaker from HALF_OPEN immediately, or from
   * CLOSED once the consecutive-failure threshold is reached.
   */
  private onFailure(): void {
    this.totalFailures++;
    this.consecutiveSuccesses = 0;
    this.consecutiveFailures++;
    this.lastFailureTime = Date.now();

    if (this.state === CircuitBreakerState.HALF_OPEN) {
      // Any failed probe sends the breaker straight back to OPEN.
      this.transitionTo(CircuitBreakerState.OPEN);
    } else if (
      this.state === CircuitBreakerState.CLOSED &&
      this.consecutiveFailures >= this.config.failureThreshold
    ) {
      this.transitionTo(CircuitBreakerState.OPEN);
    }
  }

  /**
   * Switch state, stamp the change time and log the transition.
   */
  private transitionTo(newState: CircuitBreakerState): void {
    const oldState = this.state;
    this.state = newState;
    this.lastStateChange = Date.now();

    console.log(
      `Circuit breaker state transition: ${oldState} → ${newState} ` +
        `(failures: ${this.consecutiveFailures}, successes: ${this.consecutiveSuccesses})`
    );

    // A closed breaker starts counting failures from zero again.
    if (newState === CircuitBreakerState.CLOSED) {
      this.consecutiveFailures = 0;
    }
  }

  /**
   * Get a snapshot of the current counters, timestamps and state.
   */
  getMetrics(): CircuitBreakerMetrics {
    return {
      state: this.state,
      consecutiveFailures: this.consecutiveFailures,
      consecutiveSuccesses: this.consecutiveSuccesses,
      totalFailures: this.totalFailures,
      totalSuccesses: this.totalSuccesses,
      lastFailureTime: this.lastFailureTime,
      lastStateChange: this.lastStateChange,
    };
  }

  /**
   * Get current state.
   */
  getState(): CircuitBreakerState {
    return this.state;
  }

  /**
   * Force reset the circuit breaker to CLOSED state.
   * Use with caution - primarily for testing or manual intervention.
   * Lifetime totals and the last failure time are left as they are.
   */
  reset(): void {
    this.state = CircuitBreakerState.CLOSED;
    this.consecutiveFailures = 0;
    this.consecutiveSuccesses = 0;
    this.lastStateChange = Date.now();
    console.log('Circuit breaker manually reset to CLOSED state');
  }
}
/**
 * Create a circuit breaker with the supplied settings.
 *
 * @param config - Optional overrides, passed unchanged to the `CircuitBreaker` constructor
 * @returns A new `CircuitBreaker` instance
 */
export function createCircuitBreaker(config: CircuitBreakerConfig = {}): CircuitBreaker {
  return new CircuitBreaker(config);
}

// ---------------------------------------------------------------------------
// src/lib/errors.ts
// ---------------------------------------------------------------------------

/**
 * Custom error classes for resilience patterns and HTTP error mapping.
 *
 * Each constructor restores the prototype from `new.target` rather than from
 * its own class, so subclasses of these errors still pass `instanceof` checks.
 */

/**
 * Thrown when the circuit breaker is in OPEN state and requests are being rejected.
 */
export class CircuitBreakerOpenError extends Error {
  constructor(message: string = 'Circuit breaker is open') {
    super(message);
    this.name = 'CircuitBreakerOpenError';
    Object.setPrototypeOf(this, new.target.prototype);
  }
}

/**
 * Thrown when all retry attempts have been exhausted.
 * Exposes how many attempts were made and the error from the final attempt.
 */
export class RetryExhaustedError extends Error {
  public readonly attempts: number;
  public readonly lastError: Error;

  constructor(attempts: number, lastError: Error) {
    super(`Retry exhausted after ${attempts} attempts: ${lastError.message}`);
    this.name = 'RetryExhaustedError';
    this.attempts = attempts;
    this.lastError = lastError;
    Object.setPrototypeOf(this, new.target.prototype);
  }
}

/**
 * HTTP 502 Bad Gateway error for upstream service failures.
 */
export class BadGatewayError extends Error {
  public readonly statusCode: number = 502;

  constructor(message: string = 'Bad Gateway') {
    super(message);
    this.name = 'BadGatewayError';
    Object.setPrototypeOf(this, new.target.prototype);
  }
}
+ */ +export class BadRequestError extends Error { + public readonly statusCode: number = 400; + + constructor(message: string = 'Bad Request') { + super(message); + this.name = 'BadRequestError'; + Object.setPrototypeOf(this, BadRequestError.prototype); + } +} diff --git a/src/lib/retry.test.ts b/src/lib/retry.test.ts new file mode 100644 index 0000000..dfe32d7 --- /dev/null +++ b/src/lib/retry.test.ts @@ -0,0 +1,191 @@ +/** + * Unit tests for retry mechanism with exponential backoff. + */ + +import { withRetry, createRetryWrapper } from './retry.js'; +import { RetryExhaustedError } from './errors.js'; + +describe('Retry Mechanism', () => { + beforeEach(() => { + jest.clearAllMocks(); + jest.useFakeTimers(); + }); + + afterEach(() => { + jest.useRealTimers(); + }); + + describe('withRetry', () => { + it('should return result on first successful attempt', async () => { + const operation = jest.fn().mockResolvedValue('success'); + + const promise = withRetry(operation); + await jest.runAllTimersAsync(); + const result = await promise; + + expect(result).toBe('success'); + expect(operation).toHaveBeenCalledTimes(1); + }); + + it('should retry and succeed after transient failure', async () => { + const operation = jest + .fn() + .mockRejectedValueOnce(new Error('Transient failure')) + .mockResolvedValueOnce('success'); + + const promise = withRetry(operation, { maxAttempts: 3 }); + + // Fast-forward through retry delays + await jest.runAllTimersAsync(); + const result = await promise; + + expect(result).toBe('success'); + expect(operation).toHaveBeenCalledTimes(2); + }); + + it('should throw RetryExhaustedError after max attempts', async () => { + const error = new Error('Persistent failure'); + const operation = jest.fn().mockRejectedValue(error); + + const promise = withRetry(operation, { maxAttempts: 3 }); + await jest.runAllTimersAsync(); + + await expect(promise).rejects.toThrow(RetryExhaustedError); + await expect(promise).rejects.toMatchObject({ + attempts: 3, 
+ lastError: error, + }); + expect(operation).toHaveBeenCalledTimes(3); + }); + + it('should apply exponential backoff delays', async () => { + const operation = jest.fn().mockRejectedValue(new Error('Failure')); + const baseDelayMs = 1000; + + const promise = withRetry(operation, { + maxAttempts: 3, + baseDelayMs, + jitterFactor: 0, // No jitter for predictable testing + }); + + // First attempt fails immediately + await jest.advanceTimersByTimeAsync(0); + expect(operation).toHaveBeenCalledTimes(1); + + // Second attempt after ~1000ms (2^0 * 1000) + await jest.advanceTimersByTimeAsync(1000); + expect(operation).toHaveBeenCalledTimes(2); + + // Third attempt after ~2000ms (2^1 * 1000) + await jest.advanceTimersByTimeAsync(2000); + expect(operation).toHaveBeenCalledTimes(3); + + await expect(promise).rejects.toThrow(RetryExhaustedError); + }); + + it('should cap delay at maxDelayMs', async () => { + const operation = jest.fn().mockRejectedValue(new Error('Failure')); + + const promise = withRetry(operation, { + maxAttempts: 4, + baseDelayMs: 1000, + maxDelayMs: 2000, + jitterFactor: 0, + }); + + await jest.advanceTimersByTimeAsync(0); + expect(operation).toHaveBeenCalledTimes(1); + + // Second attempt: 1000ms + await jest.advanceTimersByTimeAsync(1000); + expect(operation).toHaveBeenCalledTimes(2); + + // Third attempt: capped at 2000ms (not 2000ms) + await jest.advanceTimersByTimeAsync(2000); + expect(operation).toHaveBeenCalledTimes(3); + + // Fourth attempt: still capped at 2000ms (not 4000ms) + await jest.advanceTimersByTimeAsync(2000); + expect(operation).toHaveBeenCalledTimes(4); + + await expect(promise).rejects.toThrow(RetryExhaustedError); + }); + + it('should handle non-Error rejections', async () => { + const operation = jest.fn().mockRejectedValue('string error'); + + const promise = withRetry(operation, { maxAttempts: 2 }); + await jest.runAllTimersAsync(); + + await expect(promise).rejects.toThrow(RetryExhaustedError); + const error = await 
promise.catch((e) => e); + expect(error.lastError.message).toBe('string error'); + }); + + it('should use default config when not provided', async () => { + const operation = jest.fn().mockRejectedValue(new Error('Failure')); + + const promise = withRetry(operation); + await jest.runAllTimersAsync(); + + await expect(promise).rejects.toThrow(RetryExhaustedError); + expect(operation).toHaveBeenCalledTimes(3); // Default maxAttempts + }); + }); + + describe('createRetryWrapper', () => { + it('should create a wrapper with pre-configured settings', async () => { + const retryWrapper = createRetryWrapper({ maxAttempts: 2, baseDelayMs: 500 }); + const operation = jest.fn().mockRejectedValue(new Error('Failure')); + + const promise = retryWrapper(operation); + await jest.runAllTimersAsync(); + + await expect(promise).rejects.toThrow(RetryExhaustedError); + expect(operation).toHaveBeenCalledTimes(2); + }); + + it('should allow multiple operations with same config', async () => { + const retryWrapper = createRetryWrapper({ maxAttempts: 2 }); + + const op1 = jest.fn().mockResolvedValue('result1'); + const op2 = jest.fn().mockResolvedValue('result2'); + + const promise1 = retryWrapper(op1); + const promise2 = retryWrapper(op2); + + await jest.runAllTimersAsync(); + + await expect(promise1).resolves.toBe('result1'); + await expect(promise2).resolves.toBe('result2'); + }); + }); + + describe('Jitter behavior', () => { + it('should apply jitter to delay calculations', async () => { + const operation = jest.fn().mockRejectedValue(new Error('Failure')); + const delays: number[] = []; + + // Mock setTimeout to capture actual delays + const originalSetTimeout = global.setTimeout; + jest.spyOn(global, 'setTimeout').mockImplementation(((callback: any, ms: number) => { + delays.push(ms); + return originalSetTimeout(callback, 0); + }) as any); + + const promise = withRetry(operation, { + maxAttempts: 3, + baseDelayMs: 1000, + jitterFactor: 0.3, + }); + + await jest.runAllTimersAsync(); 
+ await promise.catch(() => {}); // Ignore error + + // Verify delays are within jitter range + expect(delays.length).toBe(2); // Two retries + expect(delays[0]).toBeGreaterThanOrEqual(700); // 1000 * (1 - 0.3) + expect(delays[0]).toBeLessThanOrEqual(1300); // 1000 * (1 + 0.3) + }); + }); +});