From c8a86bab11f84ff6fcc657cde036adea3897cb07 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 7 May 2026 14:43:56 +0000 Subject: [PATCH 1/5] Add enterprise workflows and deployment manifests Agent-Logs-Url: https://github.com/Algodons/agentos/sessions/42083e98-582c-4e4c-a645-3e712f0bed81 Co-authored-by: SMSDAO <144380926+SMSDAO@users.noreply.github.com> --- .dockerignore | 8 +++ .github/workflows/ci.yml | 55 ++++++++++----- .github/workflows/deploy.yml | 44 ++++++++++++ .github/workflows/release.yml | 114 ++++++++++++++++++++++++++++++++ .github/workflows/security.yml | 82 +++++++++++++++++++++++ Dockerfile | 25 +++++++ deploy/k8s/base/deployment.yaml | 55 +++++++++++++++ deploy/k8s/base/hpa.yaml | 19 ++++++ deploy/k8s/base/namespace.yaml | 4 ++ deploy/k8s/base/service.yaml | 12 ++++ 10 files changed, 400 insertions(+), 18 deletions(-) create mode 100644 .dockerignore create mode 100644 .github/workflows/deploy.yml create mode 100644 .github/workflows/release.yml create mode 100644 .github/workflows/security.yml create mode 100644 Dockerfile create mode 100644 deploy/k8s/base/deployment.yaml create mode 100644 deploy/k8s/base/hpa.yaml create mode 100644 deploy/k8s/base/namespace.yaml create mode 100644 deploy/k8s/base/service.yaml diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..47fb8cd --- /dev/null +++ b/.dockerignore @@ -0,0 +1,8 @@ +node_modules +.next +coverage +.git +.github +npm-debug.log +.env +.env.* diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index df07943..bad0dae 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -9,24 +9,33 @@ on: permissions: contents: read +concurrency: + group: ci-${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + jobs: - test: - name: Type-check, Lint & Test + validate: + name: Validate (Node ${{ matrix.node-version }}) runs-on: ubuntu-latest - permissions: - contents: read + 
strategy: + fail-fast: false + matrix: + node-version: [20, 22] steps: - name: Checkout uses: actions/checkout@v4 + with: + persist-credentials: false + fetch-depth: 1 - name: Setup Node.js uses: actions/setup-node@v4 with: - node-version: '20' - cache: 'npm' + node-version: ${{ matrix.node-version }} + cache: npm - - name: Install dependencies + - name: Install dependencies (deterministic) run: npm ci - name: Type-check @@ -35,28 +44,38 @@ jobs: - name: Lint run: npm run lint - - name: Test + - name: Unit and integration tests run: npm test - build: - name: Build - runs-on: ubuntu-latest - needs: test - permissions: - contents: read + - name: Build + run: npm run build + coverage: + name: Coverage + runs-on: ubuntu-latest + needs: validate steps: - name: Checkout uses: actions/checkout@v4 + with: + persist-credentials: false + fetch-depth: 1 - name: Setup Node.js uses: actions/setup-node@v4 with: - node-version: '20' - cache: 'npm' + node-version: 20 + cache: npm - name: Install dependencies run: npm ci - - name: Build - run: npm run build + - name: Coverage tests + run: npm run test:coverage + + - name: Upload coverage artifact + uses: actions/upload-artifact@v4 + with: + name: coverage-report + path: coverage/ + if-no-files-found: error diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml new file mode 100644 index 0000000..5f36338 --- /dev/null +++ b/.github/workflows/deploy.yml @@ -0,0 +1,50 @@ +name: Deploy + +on: + workflow_dispatch: + inputs: + image-tag: + description: 'Container image tag to deploy' + required: true + type: string + +permissions: + contents: read + +jobs: + deploy-staging: + name: Deploy to staging + runs-on: ubuntu-latest + environment: staging + + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + persist-credentials: false + fetch-depth: 1 + + - name: Deploy placeholder command + # Pass the user-supplied input through env so it is never expanded into the shell script (script-injection hardening). + env: + IMAGE_TAG: ${{ inputs.image-tag }} + run: echo "Deploying image tag ${IMAGE_TAG} to staging" + + deploy-production: + name: Deploy to production 
(approval required) + runs-on: ubuntu-latest + needs: deploy-staging + environment: production + + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + persist-credentials: false + fetch-depth: 1 + + - name: Deploy placeholder command + # Pass the user-supplied input through env so it is never expanded into the shell script (script-injection hardening). + env: + IMAGE_TAG: ${{ inputs.image-tag }} + run: echo "Deploying image tag ${IMAGE_TAG} to production" diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 0000000..64c5b52 --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,114 @@ +name: Release + +on: + push: + tags: + - 'v*.*.*' + +permissions: + contents: read + +jobs: + verify: + name: Verify release + runs-on: ubuntu-latest + permissions: + contents: read + + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + persist-credentials: false + + - name: Setup Node.js + uses: actions/setup-node@v4 + with: + node-version: 20 + cache: npm + + - name: Install dependencies (deterministic) + run: npm ci + + - name: Type-check + run: npm run type-check + + - name: Lint + run: npm run lint + + - name: Test + run: npm test + + - name: Build + run: npm run build + + package: + name: Package artifacts + runs-on: ubuntu-latest + needs: verify + permissions: + contents: read + + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + persist-credentials: false + + - name: Setup Node.js + uses: actions/setup-node@v4 + with: + node-version: 20 + cache: npm + + - name: Install dependencies + run: npm ci + + - name: Build package + run: | + TAG="${GITHUB_REF_NAME}" + export SOURCE_DATE_EPOCH="$(git log -1 --format=%ct)" + npm pack + sha256sum *.tgz > checksums.txt + { + echo "# Release ${TAG}" + echo + echo "## Commit" + echo "- $(git rev-parse HEAD)" + echo + echo "## Checksums" + cat checksums.txt + } > release-notes.md + + - name: Upload release artifacts + uses: actions/upload-artifact@v4 + with: + name: release-artifacts + path: | + *.tgz + checksums.txt + release-notes.md + if-no-files-found: error + + 
publish: + name: Publish GitHub release + runs-on: ubuntu-latest + needs: package + permissions: + contents: write + + steps: + - name: Download artifacts + uses: actions/download-artifact@v4 + with: + name: release-artifacts + + - name: Create GitHub Release + uses: softprops/action-gh-release@v2 + with: + body_path: release-notes.md + files: | + *.tgz + checksums.txt diff --git a/.github/workflows/security.yml b/.github/workflows/security.yml new file mode 100644 index 0000000..b016941 --- /dev/null +++ b/.github/workflows/security.yml @@ -0,0 +1,82 @@ +name: Security + +on: + pull_request: + branches: [main] + push: + branches: [main] + schedule: + - cron: '0 3 * * *' + +permissions: + contents: read + +jobs: + codeql: + name: CodeQL + runs-on: ubuntu-latest + permissions: + actions: read + contents: read + security-events: write + + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + persist-credentials: false + fetch-depth: 1 + + - name: Initialize CodeQL + uses: github/codeql-action/init@v3 + with: + languages: javascript-typescript + + - name: Autobuild + uses: github/codeql-action/autobuild@v3 + + - name: Analyze + uses: github/codeql-action/analyze@v3 + + dependency-review: + if: github.event_name == 'pull_request' + name: Dependency Review + runs-on: ubuntu-latest + permissions: + contents: read + pull-requests: read + + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + persist-credentials: false + fetch-depth: 1 + + - name: Dependency review + uses: actions/dependency-review-action@v4 + + npm-audit: + name: npm audit + runs-on: ubuntu-latest + permissions: + contents: read + + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + persist-credentials: false + fetch-depth: 1 + + - name: Setup Node.js + uses: actions/setup-node@v4 + with: + node-version: 20 + cache: npm + + - name: Install dependencies + run: npm ci + + - name: High severity dependency audit + run: npm audit --audit-level=high diff --git a/Dockerfile 
b/Dockerfile new file mode 100644 index 0000000..c84d7a2 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,27 @@ +FROM node:20-bookworm-slim AS deps +WORKDIR /app +COPY package*.json ./ +RUN npm ci + +FROM node:20-bookworm-slim AS builder +WORKDIR /app +COPY --from=deps /app/node_modules ./node_modules +COPY . . +RUN npm run build + +FROM node:20-bookworm-slim AS runner +WORKDIR /app +ENV NODE_ENV=production +ENV NEXT_TELEMETRY_DISABLED=1 + +COPY package*.json ./ +RUN npm ci --omit=dev + +COPY --from=builder /app/.next ./.next +COPY --from=builder /app/public ./public +COPY --from=builder /app/next.config.mjs ./next.config.mjs + +EXPOSE 3000 +# Drop root at runtime: the official node image ships an unprivileged "node" user (uid/gid 1000). +USER node +CMD ["npm", "run", "start"] diff --git a/deploy/k8s/base/deployment.yaml b/deploy/k8s/base/deployment.yaml new file mode 100644 index 0000000..e469d39 --- /dev/null +++ b/deploy/k8s/base/deployment.yaml @@ -0,0 +1,58 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: agentos + namespace: agentos + labels: + app: agentos +spec: + replicas: 3 + revisionHistoryLimit: 5 + selector: + matchLabels: + app: agentos + template: + metadata: + labels: + app: agentos + spec: + securityContext: + runAsNonRoot: true + # Numeric uid/gid of the image's "node" user; runAsNonRoot cannot be verified against a root or non-numeric image user. + runAsUser: 1000 + runAsGroup: 1000 + seccompProfile: + type: RuntimeDefault + containers: + - name: agentos + image: ghcr.io/algodons/agentos:latest + imagePullPolicy: Always + ports: + - containerPort: 3000 + env: + - name: NODE_ENV + value: production + resources: + requests: + cpu: 250m + memory: 256Mi + limits: + cpu: 1 + memory: 1Gi + livenessProbe: + httpGet: + path: /api/status + port: 3000 + initialDelaySeconds: 20 + periodSeconds: 10 + readinessProbe: + httpGet: + path: /api/status + port: 3000 + initialDelaySeconds: 10 + periodSeconds: 5 + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + capabilities: + drop: ["ALL"] diff --git a/deploy/k8s/base/hpa.yaml b/deploy/k8s/base/hpa.yaml new file mode 100644 index 0000000..8b3e7c2 --- /dev/null +++ b/deploy/k8s/base/hpa.yaml @@ -0,0 +1,19 @@ +apiVersion: 
autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: agentos + namespace: agentos +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: agentos + minReplicas: 3 + maxReplicas: 15 + metrics: + - type: Resource + resource: + name: cpu + target: + type: Utilization + averageUtilization: 65 diff --git a/deploy/k8s/base/namespace.yaml b/deploy/k8s/base/namespace.yaml new file mode 100644 index 0000000..f444fa3 --- /dev/null +++ b/deploy/k8s/base/namespace.yaml @@ -0,0 +1,4 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: agentos diff --git a/deploy/k8s/base/service.yaml b/deploy/k8s/base/service.yaml new file mode 100644 index 0000000..7fac14c --- /dev/null +++ b/deploy/k8s/base/service.yaml @@ -0,0 +1,12 @@ +apiVersion: v1 +kind: Service +metadata: + name: agentos + namespace: agentos +spec: + selector: + app: agentos + ports: + - protocol: TCP + port: 80 + targetPort: 3000 From 9818068f99f33dcc8d0e1dd9b9311aeea468c96a Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 7 May 2026 14:46:26 +0000 Subject: [PATCH 2/5] Harden runtime determinism and add operations playbooks Agent-Logs-Url: https://github.com/Algodons/agentos/sessions/42083e98-582c-4e4c-a645-3e712f0bed81 Co-authored-by: SMSDAO <144380926+SMSDAO@users.noreply.github.com> --- README.md | 130 ++++++++++++++----------- app/api/status/route.ts | 8 +- core/wasm/wasmRunner.ts | 41 +++++--- core/wasm/workerPool.ts | 35 ++++++- docs/deployment/docker.md | 28 ++++++ docs/deployment/kubernetes.md | 25 +++++ docs/deployment/vercel.md | 19 ++++ docs/operations/disaster-recovery.md | 31 ++++++ docs/operations/monitoring-alerting.md | 34 +++++++ docs/operations/scaling-playbook.md | 19 ++++ tests/wasm.determinism.test.ts | 63 ++++++++++++ tests/workerPool.load.test.ts | 65 +++++++++++++ 12 files changed, 422 insertions(+), 76 deletions(-) create mode 100644 docs/deployment/docker.md create mode 100644 
docs/deployment/kubernetes.md create mode 100644 docs/deployment/vercel.md create mode 100644 docs/operations/disaster-recovery.md create mode 100644 docs/operations/monitoring-alerting.md create mode 100644 docs/operations/scaling-playbook.md create mode 100644 tests/wasm.determinism.test.ts create mode 100644 tests/workerPool.load.test.ts diff --git a/README.md b/README.md index 38339b9..a43e816 100644 --- a/README.md +++ b/README.md @@ -3,6 +3,7 @@ > **A production-grade WASM-powered multi-agent system that optimizes prompts autonomously, executes across multiple AI models, scores best outputs, and evolves via feedback loops.** [![CI](https://github.com/Algodons/agentos/actions/workflows/ci.yml/badge.svg)](https://github.com/Algodons/agentos/actions/workflows/ci.yml) +[![Security](https://github.com/Algodons/agentos/actions/workflows/security.yml/badge.svg)](https://github.com/Algodons/agentos/actions/workflows/security.yml) --- @@ -21,7 +22,7 @@ Open [http://localhost:3000/dashboard](http://localhost:3000/dashboard) to see t AgentOS is an autonomous prompt optimization operating system โ€” a self-evolving AI execution layer that: -- ๐Ÿค– **Optimizes prompts** through a 7-agent swarm pipeline +- ๐Ÿค– **Optimizes prompts** through a 7-agent swarm pipeline - โšก **Executes across multiple AI models** (OpenAI, Claude, Gemini, Llama) - ๐Ÿ“Š **Scores and selects** the best outputs via weighted evaluation - ๐Ÿ” **Evolves prompts** via feedback loops (up to 10 iterations) @@ -29,7 +30,7 @@ AgentOS is an autonomous prompt optimization operating system โ€” a self-evolvin - ๐Ÿง  **Stores and versions** high-performing prompts - ๐ŸŒ **WASM execution layer** for parallel, deterministic agent tasks -> **Backward compatibility:** `import { PromptOS } from '@/core'` still works โ€” `PromptOS` is aliased to `AgentOS`. +> **Backward compatibility:** `import { PromptOS } from '@/core'` still works โ€” `PromptOS` is aliased to `SwarmOrchestrator`. 
--- @@ -38,13 +39,6 @@ AgentOS is an autonomous prompt optimization operating system โ€” a self-evolvin ``` /core โ”œโ”€โ”€ agents/ # 7 stateless typed agents -โ”‚ โ”œโ”€โ”€ InputAgent -โ”‚ โ”œโ”€โ”€ PromptArchitect -โ”‚ โ”œโ”€โ”€ ExecutionAgent -โ”‚ โ”œโ”€โ”€ EvaluationAgent -โ”‚ โ”œโ”€โ”€ OptimizationAgent -โ”‚ โ”œโ”€โ”€ SecurityAgent -โ”‚ โ””โ”€โ”€ DeploymentAgent โ”œโ”€โ”€ swarm/ # SwarmOrchestrator โ€” pipeline coordinator โ”œโ”€โ”€ wasm/ # WasmRunner + WorkerPool โ”œโ”€โ”€ models/ # ModelRouter (OpenAI / Claude / Gemini / Llama) @@ -69,39 +63,80 @@ score = (accuracy ร— 0.40) + (completeness ร— 0.25) + (determinism ร— 0.15) --- -## ๐ŸŒ API - -### `POST /api/optimize` +## ๐Ÿงช Testing -**Request:** -```json -{ "input": "Summarize the history of the internet." } +```bash +npm test # run all tests +npm run test:coverage # with coverage report ``` -**Response:** -```json -{ - "optimizedPrompt": "...", - "score": 0.91, - "model": "mock", - "versionId": "uuid", - "iterations": 3, - "scoreBreakdown": { "accuracy": 0.85, "completeness": 1.0 }, - "threatDetected": false -} -``` +Coverage now includes: + +- Unit tests for all swarm agents +- End-to-end swarm orchestration tests +- Determinism tests for WASM execution +- Worker-pool load and queue saturation tests + +--- + +## ๐Ÿ” Enterprise CI/CD and Security + +### CI/CD Workflows + +- **CI (`.github/workflows/ci.yml`)** + - Deterministic `npm ci` + - Node.js matrix validation (20, 22) + - Type-check, lint, tests, build + - Coverage artifact upload +- **Release (`.github/workflows/release.yml`)** + - Triggered by semantic tags (`v*.*.*`) + - Full pre-release validation + - Reproducible package artifact + checksums + - Automated GitHub release creation +- **Deploy (`.github/workflows/deploy.yml`)** + - Human-triggered deployment path + - Staging then production gating via environments + +### Security Governance + +- **Security workflow (`.github/workflows/security.yml`)** + - CodeQL analysis + - Dependency review for 
pull requests + - Scheduled and branch-based `npm audit` +- Prompt injection and XSS sanitization built into core runtime +- No embedded secrets in codebase --- -## ๐ŸŽจ Dashboard +## ๐Ÿšข Deployment Playbooks + +- Docker: [`docs/deployment/docker.md`](docs/deployment/docker.md) +- Kubernetes: [`docs/deployment/kubernetes.md`](docs/deployment/kubernetes.md) +- Vercel: [`docs/deployment/vercel.md`](docs/deployment/vercel.md) -Route: `/dashboard` +Kubernetes base manifests are provided in `deploy/k8s/base/` with secure defaults: -- **Score Index** โ€” real-time score with sparkline -- **Agent Swarm Panel** โ€” live agent status (idle / running / failed) -- **Prompt Timeline** โ€” version history with scores -- **Model Comparison** โ€” GPT vs Claude vs Gemini vs Llama -- **Live Logs** โ€” terminal-style streaming log view +- Non-root execution +- Read-only root filesystem +- Capability drop (`ALL`) +- Liveness/readiness probes +- HorizontalPodAutoscaler + +--- + +## ๐Ÿ“ˆ Observability, Scaling, and Recovery + +- Monitoring and alerting: [`docs/operations/monitoring-alerting.md`](docs/operations/monitoring-alerting.md) +- Scaling strategy: [`docs/operations/scaling-playbook.md`](docs/operations/scaling-playbook.md) +- Disaster recovery: [`docs/operations/disaster-recovery.md`](docs/operations/disaster-recovery.md) + +`GET /api/status` now returns health and operational metadata: + +- status +- timestamp +- uptime +- model and score snapshot +- version count --- @@ -117,29 +152,7 @@ LLAMA_API_KEY= MARKETPLACE_ENABLED=true ``` -The system works **without any API keys** โ€” all models fall back to a deterministic mock provider. 
- ---- - -## ๐Ÿงช Testing - -```bash -npm test # run all tests -npm run test:coverage # with coverage report -``` - -- `tests/swarm.test.ts` โ€” end-to-end swarm pipeline -- `tests/agents.test.ts` โ€” unit tests for all 7 agents -- `tests/scoring.test.ts` โ€” scoring engine formula tests - ---- - -## ๐Ÿ›ก๏ธ Security - -- **PromptSanitizer** โ€” strips jailbreak and injection patterns before execution -- **InjectionDetector** โ€” detects `instruction_override`, `jailbreak`, `xss_injection`, `code_injection`, `system_prompt_injection` -- No secrets hardcoded anywhere -- Server-side validation on all API inputs +The system works **without any API keys** โ€” models can run through deterministic mock behavior. --- @@ -151,7 +164,8 @@ npm run test:coverage # with coverage report | `npm run build` | Production build | | `npm run type-check` | TypeScript strict check | | `npm run lint` | ESLint | -| `npm test` | Jest tests | +| `npm test` | Jest suite | +| `npm run test:coverage` | Jest coverage | --- diff --git a/app/api/status/route.ts b/app/api/status/route.ts index 06f1265..8f80a7e 100644 --- a/app/api/status/route.ts +++ b/app/api/status/route.ts @@ -1,17 +1,21 @@ import { NextResponse } from 'next/server'; import { MemorySystem } from '@/core/memory/MemorySystem'; +const processStartTime = Date.now(); + /** * GET /api/status * - * Returns the current AgentOS system status for the polling fallback in lib/socket.ts. - * Reports the score and model from the most recently deployed prompt version. + * Returns the current AgentOS status and operational metrics. */ export function GET(): NextResponse { const memory = MemorySystem.getInstance(); const best = memory.getBestVersion(); return NextResponse.json({ + status: 'ok', + timestamp: new Date().toISOString(), + uptimeMs: Date.now() - processStartTime, score: best?.score ?? 0, model: best?.model ?? 
'โ€”', versionCount: memory.getLongTermHistory().length, diff --git a/core/wasm/wasmRunner.ts b/core/wasm/wasmRunner.ts index 2b16ecd..1302fe3 100644 --- a/core/wasm/wasmRunner.ts +++ b/core/wasm/wasmRunner.ts @@ -12,43 +12,62 @@ export interface WasmTaskResult { durationMs: number; } +export interface WasmRunnerOptions { + now?: () => number; + defaultTimeoutMs?: number; + maxTimeoutMs?: number; +} + /** - * WasmRunner โ€” provides deterministic, parallel task execution via a simulated WASM sandbox. - * - * In production this would compile and run Wasm modules via the WebAssembly API. - * The interface is kept identical so swapping in a real WASM engine requires no - * changes to the surrounding code. + * WasmRunner โ€” deterministic task execution wrapper for WASM-compatible workloads. */ export class WasmRunner { + private readonly now: () => number; + private readonly defaultTimeoutMs: number; + private readonly maxTimeoutMs: number; + + constructor(options: WasmRunnerOptions = {}) { + this.now = options.now ?? Date.now; + this.defaultTimeoutMs = options.defaultTimeoutMs ?? 10_000; + this.maxTimeoutMs = options.maxTimeoutMs ?? 60_000; + } + /** - * Executes a single task with optional timeout. + * Executes a single task with validated timeout bounds. */ async run(task: WasmTask): Promise> { - const start = Date.now(); - const timeoutMs = task.timeoutMs ?? 10_000; + const start = this.now(); + const timeoutMs = this.resolveTimeout(task.timeoutMs); try { const result = await Promise.race([ task.execute(task.payload), this.timeout(timeoutMs, task.id), ]); - return { id: task.id, result, durationMs: Date.now() - start }; + return { id: task.id, result, durationMs: this.now() - start }; } catch (err) { return { id: task.id, error: err instanceof Error ? err.message : String(err), - durationMs: Date.now() - start, + durationMs: this.now() - start, }; } } /** - * Executes multiple tasks in parallel. + * Executes multiple tasks in deterministic input order. 
*/ async runAll(tasks: WasmTask[]): Promise[]> { return Promise.all(tasks.map((task) => this.run(task))); } + private resolveTimeout(taskTimeoutMs?: number): number { + if (typeof taskTimeoutMs !== 'number' || !Number.isFinite(taskTimeoutMs) || taskTimeoutMs <= 0) { + return this.defaultTimeoutMs; + } + return Math.min(taskTimeoutMs, this.maxTimeoutMs); + } + private timeout(ms: number, taskId: string): Promise { return new Promise((_, reject) => setTimeout(() => reject(new Error(`WasmRunner: task ${taskId} timed out after ${ms}ms`)), ms), diff --git a/core/wasm/workerPool.ts b/core/wasm/workerPool.ts index a8554db..714b365 100644 --- a/core/wasm/workerPool.ts +++ b/core/wasm/workerPool.ts @@ -1,16 +1,31 @@ import { WasmRunner, WasmTask, WasmTaskResult } from './wasmRunner'; +export interface WorkerPoolOptions { + concurrency?: number; + maxQueueSize?: number; +} + /** * WorkerPool โ€” manages a pool of WasmRunner instances for parallel task execution. - * Implements a task queue with configurable concurrency and round-robin distribution. */ export class WorkerPool { private readonly runners: WasmRunner[]; private readonly queue: Array<() => void> = []; + private readonly concurrency: number; + private readonly maxQueueSize: number; private activeCount = 0; private roundRobinIndex = 0; - constructor(private readonly concurrency: number = 4) { + constructor(options: WorkerPoolOptions | number = 4) { + const normalized = typeof options === 'number' ? { concurrency: options } : options; + const concurrency = normalized.concurrency ?? 4; + + if (!Number.isInteger(concurrency) || concurrency < 1) { + throw new Error('WorkerPool: concurrency must be a positive integer'); + } + + this.concurrency = concurrency; + this.maxQueueSize = normalized.maxQueueSize ?? concurrency * 100; this.runners = Array.from({ length: concurrency }, () => new WasmRunner()); } @@ -18,7 +33,7 @@ export class WorkerPool { * Submits a task to the pool. Queues it if all runners are busy. 
*/ submit(task: WasmTask): Promise> { - return new Promise((resolve) => { + return new Promise((resolve, reject) => { const run = () => { this.activeCount++; const runnerIndex = this.roundRobinIndex % this.runners.length; @@ -33,9 +48,15 @@ export class WorkerPool { if (this.activeCount < this.concurrency) { run(); - } else { - this.queue.push(run); + return; } + + if (this.queue.length >= this.maxQueueSize) { + reject(new Error(`WorkerPool: queue capacity ${this.maxQueueSize} exceeded`)); + return; + } + + this.queue.push(run); }); } @@ -56,4 +77,8 @@ export class WorkerPool { get active(): number { return this.activeCount; } + + get capacity(): number { + return this.concurrency; + } } diff --git a/docs/deployment/docker.md b/docs/deployment/docker.md new file mode 100644 index 0000000..eaeee2a --- /dev/null +++ b/docs/deployment/docker.md @@ -0,0 +1,28 @@ +# Docker Deployment Playbook + +## Build + +```bash +docker build -t ghcr.io/algodons/agentos: . +``` + +## Run + +```bash +docker run --rm -p 3000:3000 --env-file .env.local ghcr.io/algodons/agentos: +``` + +## Security Hardening Controls + +- Use immutable tags (`vX.Y.Z` + commit SHA) and never deploy `latest` to production. +- Run containers with read-only filesystems and non-root users in orchestration layers. +- Inject secrets through runtime secret stores; never bake into images. +- Validate image digests against release checksum artifacts before rollout. + +## Human Approval Checkpoint + +Before production rollout, release manager must verify: + +1. CI workflow green on release tag. +2. Security workflow green (CodeQL + dependency audit). +3. Signed-off deployment ticket with rollback owner assigned. 
diff --git a/docs/deployment/kubernetes.md b/docs/deployment/kubernetes.md new file mode 100644 index 0000000..385382e --- /dev/null +++ b/docs/deployment/kubernetes.md @@ -0,0 +1,25 @@ +# Kubernetes Deployment Playbook + +## Apply Base Manifests + +```bash +kubectl apply -f deploy/k8s/base/namespace.yaml +kubectl apply -f deploy/k8s/base/deployment.yaml +kubectl apply -f deploy/k8s/base/service.yaml +kubectl apply -f deploy/k8s/base/hpa.yaml +``` + +## Production Controls + +- Keep `replicas >= 3` for zone-level resilience. +- Use HPA CPU target at 65% and max replicas of 15 by default. +- Require readiness and liveness probes on `/api/status`. +- Enforce Pod Security (`runAsNonRoot`, dropped capabilities, `RuntimeDefault` seccomp). + +## Human Approval Checkpoint + +Deployment to production namespace requires: + +1. Approved change request. +2. On-call SRE acknowledgement. +3. Verified rollback image digest staged and tested. diff --git a/docs/deployment/vercel.md b/docs/deployment/vercel.md new file mode 100644 index 0000000..a63fbf7 --- /dev/null +++ b/docs/deployment/vercel.md @@ -0,0 +1,19 @@ +# Vercel Deployment Playbook + +## Project Setup + +1. Import repository into Vercel. +2. Set framework preset to Next.js. +3. Configure production environment variables using Vercel encrypted secrets. + +## Release Strategy + +- Deploy preview for every PR. +- Promote to production only from protected `main` branch. +- Pin deployment to release tags for auditable rollouts. + +## Human Approval Checkpoint + +- Require reviewer approval on PR. +- Require workflow `CI` and `Security` to pass before merge. +- Require explicit production promotion by release manager. diff --git a/docs/operations/disaster-recovery.md b/docs/operations/disaster-recovery.md new file mode 100644 index 0000000..6323fab --- /dev/null +++ b/docs/operations/disaster-recovery.md @@ -0,0 +1,31 @@ +# Disaster Recovery Plan + +## Objectives + +- **RTO**: 30 minutes for core API restoration. 
+- **RPO**: 15 minutes for persistent operational metadata. + +## Recovery Phases + +1. **Detect and declare incident** + - Trigger incident commander role. + - Freeze releases and non-essential deployments. +2. **Stabilize** + - Shift traffic to healthy region/environment. + - Roll back to last known-good release tag. +3. **Restore** + - Rehydrate state/configuration from backups and IaC definitions. + - Re-run smoke checks and determinism tests. +4. **Validate and communicate** + - Confirm SLO recovery. + - Communicate status to stakeholders. + +## Backup and Restore Requirements + +- Daily encrypted backups of deployment configuration and secrets metadata references. +- Immutable release artifacts retained for at least 90 days. +- Quarterly restoration drills with documented outcomes. + +## Human Approval Checkpoint + +Incident commander and security lead must jointly approve return to normal release operations. diff --git a/docs/operations/monitoring-alerting.md b/docs/operations/monitoring-alerting.md new file mode 100644 index 0000000..96c312c --- /dev/null +++ b/docs/operations/monitoring-alerting.md @@ -0,0 +1,34 @@ +# Monitoring Dashboards and Alerting Rules + +## Golden Signals Dashboard + +Track these panels at 1m, 5m, and 1h windows: + +- Request rate (`/api/optimize`, `/api/status`) +- Error rate (5xx responses) +- P95/P99 latency per endpoint +- Container CPU and memory +- Worker queue depth and saturation +- Deployment and release version distribution + +## Recommended Alerts + +### Critical +- API availability below 99.5% over 5 minutes. +- P99 latency above 3 seconds over 10 minutes. +- Error rate above 2% over 5 minutes. + +### High +- Pod restart count > 3 within 10 minutes. +- HPA at max replicas for > 15 minutes. +- Security workflow failure on default branch. + +### Medium +- CI failure rate > 20% on default branch over 24 hours. +- Dependency audit reports high/critical vulnerabilities. 
+ +## On-call Response Expectations + +- Acknowledge critical alerts within 5 minutes. +- Begin mitigation within 10 minutes. +- Publish incident timeline and root-cause analysis within 24 hours. diff --git a/docs/operations/scaling-playbook.md b/docs/operations/scaling-playbook.md new file mode 100644 index 0000000..3985a2c --- /dev/null +++ b/docs/operations/scaling-playbook.md @@ -0,0 +1,19 @@ +# Scaling Playbook + +## Horizontal Scaling Strategy + +- Scale web tier via Kubernetes HPA between 3 and 15 replicas. +- Keep worker pool concurrency tied to available vCPU: `concurrency = vCPU * 2`. +- Limit queue growth with explicit `maxQueueSize` guardrails. + +## Capacity Planning + +- Baseline with load tests at 1x, 2x, and 4x expected peak RPS. +- Reserve 30% headroom above 95th percentile peak. +- Validate release candidate under peak load before production promotion. + +## Failure Containment + +- Use bounded retries with exponential backoff. +- Fail fast on queue saturation and shed excess load. +- Keep deployments canary-first before global rollout. diff --git a/tests/wasm.determinism.test.ts b/tests/wasm.determinism.test.ts new file mode 100644 index 0000000..f243c86 --- /dev/null +++ b/tests/wasm.determinism.test.ts @@ -0,0 +1,63 @@ +import { WasmRunner } from '../core/wasm/wasmRunner'; + +describe('WasmRunner determinism', () => { + it('preserves deterministic output for identical task input', async () => { + const fixedNow = [1_000, 1_010, 2_000, 2_010]; + let index = 0; + const runner = new WasmRunner({ now: () => fixedNow[index++] ?? 
2_010 }); + + const task = { + id: 'deterministic-task', + payload: { value: 7 }, + execute: async ({ value }: { value: number }) => value * 3, + }; + + const first = await runner.run(task); + const second = await runner.run(task); + + expect(first.result).toBe(21); + expect(second.result).toBe(21); + expect(first.durationMs).toBe(10); + expect(second.durationMs).toBe(10); + }); + + it('returns runAll results in input order even with mixed latencies', async () => { + const runner = new WasmRunner(); + const tasks = [ + { + id: 'slow', + payload: 1, + execute: async (value: number) => { + await new Promise((resolve) => setTimeout(resolve, 30)); + return value; + }, + }, + { + id: 'fast', + payload: 2, + execute: async (value: number) => value, + }, + ]; + + const results = await runner.runAll(tasks); + + expect(results.map((r) => r.id)).toEqual(['slow', 'fast']); + expect(results.map((r) => r.result)).toEqual([1, 2]); + }); + + it('applies timeout validation bounds', async () => { + const runner = new WasmRunner({ defaultTimeoutMs: 20, maxTimeoutMs: 40 }); + + const result = await runner.run({ + id: 'timeout-bounded', + payload: null, + timeoutMs: 1_000, + execute: async () => { + await new Promise((resolve) => setTimeout(resolve, 50)); + return 'done'; + }, + }); + + expect(result.error).toContain('timed out after 40ms'); + }); +}); diff --git a/tests/workerPool.load.test.ts b/tests/workerPool.load.test.ts new file mode 100644 index 0000000..5672f58 --- /dev/null +++ b/tests/workerPool.load.test.ts @@ -0,0 +1,65 @@ +import { WorkerPool } from '../core/wasm/workerPool'; + +describe('WorkerPool load and scaling', () => { + it('enforces configured concurrency while processing load', async () => { + const pool = new WorkerPool({ concurrency: 3, maxQueueSize: 200 }); + let inFlight = 0; + let maxObserved = 0; + + const tasks = Array.from({ length: 50 }, (_, i) => + pool.submit({ + id: `task-${i}`, + payload: i, + execute: async (value: number) => { + inFlight++; + 
maxObserved = Math.max(maxObserved, inFlight); + await new Promise((resolve) => setTimeout(resolve, 5)); + inFlight--; + return value * 2; + }, + }), + ); + + const results = await Promise.all(tasks); + + expect(results).toHaveLength(50); + expect(results.every((r) => typeof r.result === 'number')).toBe(true); + expect(maxObserved).toBeLessThanOrEqual(3); + expect(pool.capacity).toBe(3); + }); + + it('rejects submissions beyond queue capacity', async () => { + const pool = new WorkerPool({ concurrency: 1, maxQueueSize: 1 }); + + const first = pool.submit({ + id: 'first', + payload: null, + execute: async () => { + await new Promise((resolve) => setTimeout(resolve, 30)); + return 'first'; + }, + }); + + const second = pool.submit({ + id: 'second', + payload: null, + execute: async () => 'second', + }); + + const third = pool.submit({ + id: 'third', + payload: null, + execute: async () => 'third', + }); + + await expect(first).resolves.toHaveProperty('id', 'first'); + await expect(second).resolves.toHaveProperty('id', 'second'); + await expect(third).rejects.toThrow('queue capacity 1 exceeded'); + }); + + it('rejects invalid concurrency configuration', () => { + expect(() => new WorkerPool({ concurrency: 0 })).toThrow( + 'WorkerPool: concurrency must be a positive integer', + ); + }); +}); From ce63d6991a326ec671c01ba3ad91114a21f004c6 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 7 May 2026 14:48:29 +0000 Subject: [PATCH 3/5] Fix timeout cleanup and queue-capacity test stability Agent-Logs-Url: https://github.com/Algodons/agentos/sessions/42083e98-582c-4e4c-a645-3e712f0bed81 Co-authored-by: SMSDAO <144380926+SMSDAO@users.noreply.github.com> --- core/wasm/wasmRunner.ts | 24 ++++++++++++++---------- tests/workerPool.load.test.ts | 13 +++++++------ 2 files changed, 21 insertions(+), 16 deletions(-) diff --git a/core/wasm/wasmRunner.ts b/core/wasm/wasmRunner.ts index 1302fe3..f105fc2 100644 --- 
a/core/wasm/wasmRunner.ts +++ b/core/wasm/wasmRunner.ts @@ -39,11 +39,17 @@ export class WasmRunner { const start = this.now(); const timeoutMs = this.resolveTimeout(task.timeoutMs); + let timeoutId: NodeJS.Timeout | null = null; + try { - const result = await Promise.race([ - task.execute(task.payload), - this.timeout(timeoutMs, task.id), - ]); + const timeoutPromise = new Promise((_, reject) => { + timeoutId = setTimeout( + () => reject(new Error(`WasmRunner: task ${task.id} timed out after ${timeoutMs}ms`)), + timeoutMs, + ); + }); + + const result = await Promise.race([task.execute(task.payload), timeoutPromise]); return { id: task.id, result, durationMs: this.now() - start }; } catch (err) { return { @@ -51,6 +57,10 @@ export class WasmRunner { error: err instanceof Error ? err.message : String(err), durationMs: this.now() - start, }; + } finally { + if (timeoutId) { + clearTimeout(timeoutId); + } } } @@ -67,10 +77,4 @@ export class WasmRunner { } return Math.min(taskTimeoutMs, this.maxTimeoutMs); } - - private timeout(ms: number, taskId: string): Promise { - return new Promise((_, reject) => - setTimeout(() => reject(new Error(`WasmRunner: task ${taskId} timed out after ${ms}ms`)), ms), - ); - } } diff --git a/tests/workerPool.load.test.ts b/tests/workerPool.load.test.ts index 5672f58..d5f6314 100644 --- a/tests/workerPool.load.test.ts +++ b/tests/workerPool.load.test.ts @@ -46,15 +46,16 @@ describe('WorkerPool load and scaling', () => { execute: async () => 'second', }); - const third = pool.submit({ - id: 'third', - payload: null, - execute: async () => 'third', - }); + await expect( + pool.submit({ + id: 'third', + payload: null, + execute: async () => 'third', + }), + ).rejects.toThrow('queue capacity 1 exceeded'); await expect(first).resolves.toHaveProperty('id', 'first'); await expect(second).resolves.toHaveProperty('id', 'second'); - await expect(third).rejects.toThrow('queue capacity 1 exceeded'); }); it('rejects invalid concurrency configuration', () 
=> { From cc8b233cab69616aadcc6bc81b8481eba13fe2f2 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 7 May 2026 14:50:06 +0000 Subject: [PATCH 4/5] Use immutable image tag in Kubernetes deployment Agent-Logs-Url: https://github.com/Algodons/agentos/sessions/42083e98-582c-4e4c-a645-3e712f0bed81 Co-authored-by: SMSDAO <144380926+SMSDAO@users.noreply.github.com> --- deploy/k8s/base/deployment.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deploy/k8s/base/deployment.yaml b/deploy/k8s/base/deployment.yaml index e469d39..6176dcf 100644 --- a/deploy/k8s/base/deployment.yaml +++ b/deploy/k8s/base/deployment.yaml @@ -22,7 +22,7 @@ spec: type: RuntimeDefault containers: - name: agentos - image: ghcr.io/algodons/agentos:latest + image: ghcr.io/algodons/agentos:v1.0.0-sha.ce63d69 imagePullPolicy: IfNotPresent ports: - containerPort: 3000 From 054d23822d4ee68507c5565a64bb79de161130f9 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 7 May 2026 14:55:20 +0000 Subject: [PATCH 5/5] Patch download-artifact action to v4.1.3 Agent-Logs-Url: https://github.com/Algodons/agentos/sessions/42083e98-582c-4e4c-a645-3e712f0bed81 Co-authored-by: SMSDAO <144380926+SMSDAO@users.noreply.github.com> --- .github/workflows/release.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 64c5b52..4d51001 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -101,7 +101,7 @@ jobs: steps: - name: Download artifacts - uses: actions/download-artifact@v4 + uses: actions/download-artifact@v4.1.3 with: name: release-artifacts