From c8a86bab11f84ff6fcc657cde036adea3897cb07 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Thu, 7 May 2026 14:43:56 +0000
Subject: [PATCH 1/5] Add enterprise workflows and deployment manifests

Agent-Logs-Url: https://github.com/Algodons/agentos/sessions/42083e98-582c-4e4c-a645-3e712f0bed81

Co-authored-by: SMSDAO <144380926+SMSDAO@users.noreply.github.com>
---
 .dockerignore                   |   8 +++
 .github/workflows/ci.yml        |  55 ++++++++++-----
 .github/workflows/deploy.yml    |  44 ++++++++++++
 .github/workflows/release.yml   | 114 ++++++++++++++++++++++++++++++++
 .github/workflows/security.yml  |  82 +++++++++++++++++++++++
 Dockerfile                      |  25 +++++++
 deploy/k8s/base/deployment.yaml |  55 +++++++++++++++
 deploy/k8s/base/hpa.yaml        |  19 ++++++
 deploy/k8s/base/namespace.yaml  |   4 ++
 deploy/k8s/base/service.yaml    |  12 ++++
 10 files changed, 400 insertions(+), 18 deletions(-)
 create mode 100644 .dockerignore
 create mode 100644 .github/workflows/deploy.yml
 create mode 100644 .github/workflows/release.yml
 create mode 100644 .github/workflows/security.yml
 create mode 100644 Dockerfile
 create mode 100644 deploy/k8s/base/deployment.yaml
 create mode 100644 deploy/k8s/base/hpa.yaml
 create mode 100644 deploy/k8s/base/namespace.yaml
 create mode 100644 deploy/k8s/base/service.yaml

diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 0000000..47fb8cd
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,8 @@
+node_modules
+.next
+coverage
+.git
+.github
+npm-debug.log
+.env
+.env.*
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index df07943..bad0dae 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -9,24 +9,33 @@ on:
 permissions:
   contents: read
 
+concurrency:
+  group: ci-${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
 jobs:
-  test:
-    name: Type-check, Lint & Test
+  validate:
+    name: Validate (Node ${{ matrix.node-version }})
     runs-on: ubuntu-latest
-    permissions:
-      contents: read
+    strategy:
+      fail-fast: false
+      matrix:
+        node-version: [20, 22]
 
     steps:
       - name: Checkout
         uses: actions/checkout@v4
+        with:
+          persist-credentials: false
+          fetch-depth: 1
 
       - name: Setup Node.js
         uses: actions/setup-node@v4
         with:
-          node-version: '20'
-          cache: 'npm'
+          node-version: ${{ matrix.node-version }}
+          cache: npm
 
-      - name: Install dependencies
+      - name: Install dependencies (deterministic)
         run: npm ci
 
       - name: Type-check
@@ -35,28 +44,38 @@ jobs:
       - name: Lint
         run: npm run lint
 
-      - name: Test
+      - name: Unit and integration tests
         run: npm test
 
-  build:
-    name: Build
-    runs-on: ubuntu-latest
-    needs: test
-    permissions:
-      contents: read
+      - name: Build
+        run: npm run build
 
+  coverage:
+    name: Coverage
+    runs-on: ubuntu-latest
+    needs: validate
     steps:
       - name: Checkout
         uses: actions/checkout@v4
+        with:
+          persist-credentials: false
+          fetch-depth: 1
 
       - name: Setup Node.js
         uses: actions/setup-node@v4
         with:
-          node-version: '20'
-          cache: 'npm'
+          node-version: 20
+          cache: npm
 
       - name: Install dependencies
         run: npm ci
 
-      - name: Build
-        run: npm run build
+      - name: Coverage tests
+        run: npm run test:coverage
+
+      - name: Upload coverage artifact
+        uses: actions/upload-artifact@v4
+        with:
+          name: coverage-report
+          path: coverage/
+          if-no-files-found: error
diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml
new file mode 100644
index 0000000..5f36338
--- /dev/null
+++ b/.github/workflows/deploy.yml
@@ -0,0 +1,54 @@
+# Manually triggered deployment of a given image tag: staging first, then
+# production (the production job only starts after staging succeeds).
+name: Deploy
+
+on:
+  workflow_dispatch:
+    inputs:
+      image-tag:
+        description: 'Container image tag to deploy'
+        required: true
+        type: string
+
+permissions:
+  contents: read
+
+jobs:
+  deploy-staging:
+    name: Deploy to staging
+    runs-on: ubuntu-latest
+    environment: staging
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          persist-credentials: false
+          fetch-depth: 1
+
+      - name: Deploy placeholder command
+        # Pass the user-supplied tag through the environment instead of
+        # interpolating it into the script, so it cannot inject shell code.
+        env:
+          IMAGE_TAG: ${{ inputs.image-tag }}
+        run: echo "Deploying image tag ${IMAGE_TAG} to staging"
+
+  deploy-production:
+    name: Deploy to production (approval required)
+    runs-on: ubuntu-latest
+    needs: deploy-staging
+    environment: production
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          persist-credentials: false
+          fetch-depth: 1
+
+      - name: Deploy placeholder command
+        # Pass the user-supplied tag through the environment instead of
+        # interpolating it into the script, so it cannot inject shell code.
+        env:
+          IMAGE_TAG: ${{ inputs.image-tag }}
+        run: echo "Deploying image tag ${IMAGE_TAG} to production"
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
new file mode 100644
index 0000000..64c5b52
--- /dev/null
+++ b/.github/workflows/release.yml
@@ -0,0 +1,114 @@
+name: Release
+
+on:
+  push:
+    tags:
+      - 'v*.*.*'
+
+permissions:
+  contents: read
+
+jobs:
+  verify:
+    name: Verify release
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+          persist-credentials: false
+
+      - name: Setup Node.js
+        uses: actions/setup-node@v4
+        with:
+          node-version: 20
+          cache: npm
+
+      - name: Install dependencies (deterministic)
+        run: npm ci
+
+      - name: Type-check
+        run: npm run type-check
+
+      - name: Lint
+        run: npm run lint
+
+      - name: Test
+        run: npm test
+
+      - name: Build
+        run: npm run build
+
+  package:
+    name: Package artifacts
+    runs-on: ubuntu-latest
+    needs: verify
+    permissions:
+      contents: read
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+          persist-credentials: false
+
+      - name: Setup Node.js
+        uses: actions/setup-node@v4
+        with:
+          node-version: 20
+          cache: npm
+
+      - name: Install dependencies
+        run: npm ci
+
+      - name: Build package
+        run: |
+          TAG="${GITHUB_REF_NAME}"
+          export SOURCE_DATE_EPOCH="$(git log -1 --format=%ct)"
+          npm pack
+          sha256sum *.tgz > checksums.txt
+          {
+            echo "# Release ${TAG}"
+            echo
+            echo "## Commit"
+            echo "- $(git rev-parse HEAD)"
+            echo
+            echo "## Checksums"
+            cat checksums.txt
+          } > release-notes.md
+
+      - name: Upload release artifacts
+        uses: actions/upload-artifact@v4
+        with:
+          name: release-artifacts
+          path: |
+            *.tgz
+            checksums.txt
+            release-notes.md
+          if-no-files-found: error
+
+  publish:
+    name: Publish GitHub release
+    runs-on: ubuntu-latest
+    needs: package
+    permissions:
+      contents: write
+
+    steps:
+      - name: Download artifacts
+        uses: actions/download-artifact@v4
+        with:
+          name: release-artifacts
+
+      - name: Create GitHub Release
+        uses: softprops/action-gh-release@v2
+        with:
+          body_path: release-notes.md
+          files: |
+            *.tgz
+            checksums.txt
diff --git a/.github/workflows/security.yml b/.github/workflows/security.yml
new file mode 100644
index 0000000..b016941
--- /dev/null
+++ b/.github/workflows/security.yml
@@ -0,0 +1,82 @@
+name: Security
+
+on:
+  pull_request:
+    branches: [main]
+  push:
+    branches: [main]
+  schedule:
+    - cron: '0 3 * * *'
+
+permissions:
+  contents: read
+
+jobs:
+  codeql:
+    name: CodeQL
+    runs-on: ubuntu-latest
+    permissions:
+      actions: read
+      contents: read
+      security-events: write
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          persist-credentials: false
+          fetch-depth: 1
+
+      - name: Initialize CodeQL
+        uses: github/codeql-action/init@v3
+        with:
+          languages: javascript-typescript
+
+      - name: Autobuild
+        uses: github/codeql-action/autobuild@v3
+
+      - name: Analyze
+        uses: github/codeql-action/analyze@v3
+
+  dependency-review:
+    if: github.event_name == 'pull_request'
+    name: Dependency Review
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      pull-requests: read
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          persist-credentials: false
+          fetch-depth: 1
+
+      - name: Dependency review
+        uses: actions/dependency-review-action@v4
+
+  npm-audit:
+    name: npm audit
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          persist-credentials: false
+          fetch-depth: 1
+
+      - name: Setup Node.js
+        uses: actions/setup-node@v4
+        with:
+          node-version: 20
+          cache: npm
+
+      - name: Install dependencies
+        run: npm ci
+
+      - name: High severity dependency audit
+        run: npm audit --audit-level=high
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..c84d7a2
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,30 @@
+# Multi-stage build: install dependencies, build the Next.js app, then
+# assemble a slim production runtime image.
+FROM node:20-bookworm-slim AS deps
+WORKDIR /app
+COPY package*.json ./
+RUN npm ci
+
+FROM node:20-bookworm-slim AS builder
+WORKDIR /app
+COPY --from=deps /app/node_modules ./node_modules
+COPY . .
+RUN npm run build
+
+FROM node:20-bookworm-slim AS runner
+WORKDIR /app
+ENV NODE_ENV=production
+ENV NEXT_TELEMETRY_DISABLED=1
+
+COPY package*.json ./
+RUN npm ci --omit=dev
+
+COPY --from=builder --chown=1000:1000 /app/.next ./.next
+COPY --from=builder /app/public ./public
+COPY --from=builder /app/next.config.mjs ./next.config.mjs
+
+# Run as the unprivileged "node" user shipped in the base image. The numeric
+# UID:GID lets Kubernetes verify runAsNonRoot; .next is chowned to stay writable.
+USER 1000:1000
+EXPOSE 3000
+CMD ["npm", "run", "start"]
diff --git a/deploy/k8s/base/deployment.yaml b/deploy/k8s/base/deployment.yaml
new file mode 100644
index 0000000..e469d39
--- /dev/null
+++ b/deploy/k8s/base/deployment.yaml
@@ -0,0 +1,61 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: agentos
+  namespace: agentos
+  labels:
+    app: agentos
+spec:
+  replicas: 3
+  revisionHistoryLimit: 5
+  selector:
+    matchLabels:
+      app: agentos
+  template:
+    metadata:
+      labels:
+        app: agentos
+    spec:
+      securityContext:
+        runAsNonRoot: true
+        # Explicit numeric IDs (the "node" user of the official Node.js image) so
+        # the kubelet can verify runAsNonRoot even if the image defaults to root.
+        runAsUser: 1000
+        runAsGroup: 1000
+        seccompProfile:
+          type: RuntimeDefault
+      containers:
+        - name: agentos
+          image: ghcr.io/algodons/agentos:latest
+          # A mutable tag must be re-pulled on every start; IfNotPresent would
+          # keep serving whatever stale ":latest" a node has cached.
+          imagePullPolicy: Always
+          ports:
+            - containerPort: 3000
+          env:
+            - name: NODE_ENV
+              value: production
+          resources:
+            requests:
+              cpu: 250m
+              memory: 256Mi
+            limits:
+              cpu: 1
+              memory: 1Gi
+          livenessProbe:
+            httpGet:
+              path: /api/status
+              port: 3000
+            initialDelaySeconds: 20
+            periodSeconds: 10
+          readinessProbe:
+            httpGet:
+              path: /api/status
+              port: 3000
+            initialDelaySeconds: 10
+            periodSeconds: 5
+          securityContext:
+            allowPrivilegeEscalation: false
+            readOnlyRootFilesystem: true
+            capabilities:
+              drop: ["ALL"]
diff --git a/deploy/k8s/base/hpa.yaml b/deploy/k8s/base/hpa.yaml
new file mode 100644
index 0000000..8b3e7c2
--- /dev/null
+++ b/deploy/k8s/base/hpa.yaml
@@ -0,0 +1,19 @@
+apiVersion: autoscaling/v2
+kind: HorizontalPodAutoscaler
+metadata:
+  name: agentos
+  namespace: agentos
+spec:
+  scaleTargetRef:
+    apiVersion: apps/v1
+    kind: Deployment
+    name: agentos
+  minReplicas: 3
+  maxReplicas: 15
+  metrics:
+    - type: Resource
+      resource:
+        name: cpu
+        target:
+          type: Utilization
+          averageUtilization: 65
diff --git a/deploy/k8s/base/namespace.yaml b/deploy/k8s/base/namespace.yaml
new file mode 100644
index 0000000..f444fa3
--- /dev/null
+++ b/deploy/k8s/base/namespace.yaml
@@ -0,0 +1,4 @@
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: agentos
diff --git a/deploy/k8s/base/service.yaml b/deploy/k8s/base/service.yaml
new file mode 100644
index 0000000..7fac14c
--- /dev/null
+++ b/deploy/k8s/base/service.yaml
@@ -0,0 +1,12 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: agentos
+  namespace: agentos
+spec:
+  selector:
+    app: agentos
+  ports:
+    - protocol: TCP
+      port: 80
+      targetPort: 3000

From 9818068f99f33dcc8d0e1dd9b9311aeea468c96a Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Thu, 7 May 2026 14:46:26 +0000
Subject: [PATCH 2/5] Harden runtime determinism and add operations playbooks

Agent-Logs-Url: https://github.com/Algodons/agentos/sessions/42083e98-582c-4e4c-a645-3e712f0bed81

Co-authored-by: SMSDAO <144380926+SMSDAO@users.noreply.github.com>
---
 README.md                              | 130 ++++++++++++++-----------
 app/api/status/route.ts                |   8 +-
 core/wasm/wasmRunner.ts                |  41 +++++---
 core/wasm/workerPool.ts                |  35 ++++++-
 docs/deployment/docker.md              |  28 ++++++
 docs/deployment/kubernetes.md          |  25 +++++
 docs/deployment/vercel.md              |  19 ++++
 docs/operations/disaster-recovery.md   |  31 ++++++
 docs/operations/monitoring-alerting.md |  34 +++++++
 docs/operations/scaling-playbook.md    |  19 ++++
 tests/wasm.determinism.test.ts         |  63 ++++++++++++
 tests/workerPool.load.test.ts          |  65 +++++++++++++
 12 files changed, 422 insertions(+), 76 deletions(-)
 create mode 100644 docs/deployment/docker.md
 create mode 100644 docs/deployment/kubernetes.md
 create mode 100644 docs/deployment/vercel.md
 create mode 100644 docs/operations/disaster-recovery.md
 create mode 100644 docs/operations/monitoring-alerting.md
 create mode 100644 docs/operations/scaling-playbook.md
 create mode 100644 tests/wasm.determinism.test.ts
 create mode 100644 tests/workerPool.load.test.ts

diff --git a/README.md b/README.md
index 38339b9..a43e816 100644
--- a/README.md
+++ b/README.md
@@ -3,6 +3,7 @@
 > **A production-grade WASM-powered multi-agent system that optimizes prompts autonomously, executes across multiple AI models, scores best outputs, and evolves via feedback loops.**
 
 [![CI](https://github.com/Algodons/agentos/actions/workflows/ci.yml/badge.svg)](https://github.com/Algodons/agentos/actions/workflows/ci.yml)
+[![Security](https://github.com/Algodons/agentos/actions/workflows/security.yml/badge.svg)](https://github.com/Algodons/agentos/actions/workflows/security.yml)
 
 ---
 
@@ -21,7 +22,7 @@ Open [http://localhost:3000/dashboard](http://localhost:3000/dashboard) to see t
 
 AgentOS is an autonomous prompt optimization operating system — a self-evolving AI execution layer that:
 
-- 🤖 **Optimizes prompts** through a 7-agent swarm pipeline
+- 🤖 **Optimizes prompts** through a 7-agent swarm pipeline
 - ⚡ **Executes across multiple AI models** (OpenAI, Claude, Gemini, Llama)
 - 📊 **Scores and selects** the best outputs via weighted evaluation
 - 🔁 **Evolves prompts** via feedback loops (up to 10 iterations)
@@ -29,7 +30,7 @@ AgentOS is an autonomous prompt optimization operating system — a self-evolvin
 - 🧠 **Stores and versions** high-performing prompts
 - 🌐 **WASM execution layer** for parallel, deterministic agent tasks
 
-> **Backward compatibility:** `import { PromptOS } from '@/core'` still works — `PromptOS` is aliased to `AgentOS`.
+> **Backward compatibility:** `import { PromptOS } from '@/core'` still works — `PromptOS` is aliased to `SwarmOrchestrator`.
 
 ---
 
@@ -38,13 +39,6 @@ AgentOS is an autonomous prompt optimization operating system — a self-evolvin
 ```
 /core
 ├── agents/          # 7 stateless typed agents
-│   ├── InputAgent
-│   ├── PromptArchitect
-│   ├── ExecutionAgent
-│   ├── EvaluationAgent
-│   ├── OptimizationAgent
-│   ├── SecurityAgent
-│   └── DeploymentAgent
 ├── swarm/           # SwarmOrchestrator — pipeline coordinator
 ├── wasm/            # WasmRunner + WorkerPool
 ├── models/          # ModelRouter (OpenAI / Claude / Gemini / Llama)
@@ -69,39 +63,80 @@ score = (accuracy × 0.40) + (completeness × 0.25) + (determinism × 0.15)
 
 ---
 
-## 🌐 API
-
-### `POST /api/optimize`
+## 🧪 Testing
 
-**Request:**
-```json
-{ "input": "Summarize the history of the internet." }
+```bash
+npm test               # run all tests
+npm run test:coverage  # with coverage report
 ```
 
-**Response:**
-```json
-{
-  "optimizedPrompt": "...",
-  "score": 0.91,
-  "model": "mock",
-  "versionId": "uuid",
-  "iterations": 3,
-  "scoreBreakdown": { "accuracy": 0.85, "completeness": 1.0 },
-  "threatDetected": false
-}
-```
+Coverage now includes:
+
+- Unit tests for all swarm agents
+- End-to-end swarm orchestration tests
+- Determinism tests for WASM execution
+- Worker-pool load and queue saturation tests
+
+---
+
+## 🔐 Enterprise CI/CD and Security
+
+### CI/CD Workflows
+
+- **CI (`.github/workflows/ci.yml`)**
+  - Deterministic `npm ci`
+  - Node.js matrix validation (20, 22)
+  - Type-check, lint, tests, build
+  - Coverage artifact upload
+- **Release (`.github/workflows/release.yml`)**
+  - Triggered by semantic tags (`v*.*.*`)
+  - Full pre-release validation
+  - Reproducible package artifact + checksums
+  - Automated GitHub release creation
+- **Deploy (`.github/workflows/deploy.yml`)**
+  - Human-triggered deployment path
+  - Staging then production gating via environments
+
+### Security Governance
+
+- **Security workflow (`.github/workflows/security.yml`)**
+  - CodeQL analysis
+  - Dependency review for pull requests
+  - Scheduled and branch-based `npm audit`
+- Prompt injection and XSS sanitization built into core runtime
+- No embedded secrets in codebase
 
 ---
 
-## 🎨 Dashboard
+## 🚢 Deployment Playbooks
+
+- Docker: [`docs/deployment/docker.md`](docs/deployment/docker.md)
+- Kubernetes: [`docs/deployment/kubernetes.md`](docs/deployment/kubernetes.md)
+- Vercel: [`docs/deployment/vercel.md`](docs/deployment/vercel.md)
 
-Route: `/dashboard`
+Kubernetes base manifests are provided in `deploy/k8s/base/` with secure defaults:
 
-- **Score Index** — real-time score with sparkline
-- **Agent Swarm Panel** — live agent status (idle / running / failed)
-- **Prompt Timeline** — version history with scores
-- **Model Comparison** — GPT vs Claude vs Gemini vs Llama
-- **Live Logs** — terminal-style streaming log view
+- Non-root execution
+- Read-only root filesystem
+- Capability drop (`ALL`)
+- Liveness/readiness probes
+- HorizontalPodAutoscaler
+
+---
+
+## 📈 Observability, Scaling, and Recovery
+
+- Monitoring and alerting: [`docs/operations/monitoring-alerting.md`](docs/operations/monitoring-alerting.md)
+- Scaling strategy: [`docs/operations/scaling-playbook.md`](docs/operations/scaling-playbook.md)
+- Disaster recovery: [`docs/operations/disaster-recovery.md`](docs/operations/disaster-recovery.md)
+
+`GET /api/status` now returns health and operational metadata:
+
+- status
+- timestamp
+- uptime
+- model and score snapshot
+- version count
 
 ---
 
@@ -117,29 +152,7 @@ LLAMA_API_KEY=
 MARKETPLACE_ENABLED=true
 ```
 
-The system works **without any API keys** — all models fall back to a deterministic mock provider.
-
----
-
-## 🧪 Testing
-
-```bash
-npm test              # run all tests
-npm run test:coverage # with coverage report
-```
-
-- `tests/swarm.test.ts` — end-to-end swarm pipeline
-- `tests/agents.test.ts` — unit tests for all 7 agents
-- `tests/scoring.test.ts` — scoring engine formula tests
-
----
-
-## 🛡️ Security
-
-- **PromptSanitizer** — strips jailbreak and injection patterns before execution
-- **InjectionDetector** — detects `instruction_override`, `jailbreak`, `xss_injection`, `code_injection`, `system_prompt_injection`
-- No secrets hardcoded anywhere
-- Server-side validation on all API inputs
+The system works **without any API keys** — models fall back to a deterministic mock provider.
 
 ---
 
@@ -151,7 +164,8 @@ npm run test:coverage # with coverage report
 | `npm run build` | Production build |
 | `npm run type-check` | TypeScript strict check |
 | `npm run lint` | ESLint |
-| `npm test` | Jest tests |
+| `npm test` | Jest suite |
+| `npm run test:coverage` | Jest coverage |
 
 ---
 
diff --git a/app/api/status/route.ts b/app/api/status/route.ts
index 06f1265..8f80a7e 100644
--- a/app/api/status/route.ts
+++ b/app/api/status/route.ts
@@ -1,17 +1,26 @@
 import { NextResponse } from 'next/server';
 import { MemorySystem } from '@/core/memory/MemorySystem';
 
+// Always evaluate this handler per request. Without this, Next.js versions that
+// statically cache GET route handlers could serve a build-time timestamp/uptime.
+export const dynamic = 'force-dynamic';
+
+// Captured once at module load; uptimeMs is measured relative to this instant.
+const processStartTime = Date.now();
+
 /**
  * GET /api/status
  *
- * Returns the current AgentOS system status for the polling fallback in lib/socket.ts.
- * Reports the score and model from the most recently deployed prompt version.
+ * Returns the current AgentOS status and operational metrics.
  */
 export function GET(): NextResponse {
   const memory = MemorySystem.getInstance();
   const best = memory.getBestVersion();
 
   return NextResponse.json({
+    status: 'ok',
+    timestamp: new Date().toISOString(),
+    uptimeMs: Date.now() - processStartTime,
     score: best?.score ?? 0,
     model: best?.model ?? '—',
     versionCount: memory.getLongTermHistory().length,
diff --git a/core/wasm/wasmRunner.ts b/core/wasm/wasmRunner.ts
index 2b16ecd..1302fe3 100644
--- a/core/wasm/wasmRunner.ts
+++ b/core/wasm/wasmRunner.ts
@@ -12,43 +12,62 @@ export interface WasmTaskResult<R> {
   durationMs: number;
 }
 
+export interface WasmRunnerOptions {
+  now?: () => number;
+  defaultTimeoutMs?: number;
+  maxTimeoutMs?: number;
+}
+
 /**
- * WasmRunner — provides deterministic, parallel task execution via a simulated WASM sandbox.
- *
- * In production this would compile and run Wasm modules via the WebAssembly API.
- * The interface is kept identical so swapping in a real WASM engine requires no
- * changes to the surrounding code.
+ * WasmRunner — deterministic task execution wrapper for WASM-compatible workloads.
  */
 export class WasmRunner {
+  private readonly now: () => number;
+  private readonly defaultTimeoutMs: number;
+  private readonly maxTimeoutMs: number;
+
+  constructor(options: WasmRunnerOptions = {}) {
+    this.now = options.now ?? Date.now;
+    this.defaultTimeoutMs = options.defaultTimeoutMs ?? 10_000;
+    this.maxTimeoutMs = options.maxTimeoutMs ?? 60_000;
+  }
+
   /**
-   * Executes a single task with optional timeout.
+   * Executes a single task with validated timeout bounds.
    */
   async run<T, R>(task: WasmTask<T, R>): Promise<WasmTaskResult<R>> {
-    const start = Date.now();
-    const timeoutMs = task.timeoutMs ?? 10_000;
+    const start = this.now();
+    const timeoutMs = this.resolveTimeout(task.timeoutMs);
 
     try {
       const result = await Promise.race([
         task.execute(task.payload),
         this.timeout<R>(timeoutMs, task.id),
       ]);
-      return { id: task.id, result, durationMs: Date.now() - start };
+      return { id: task.id, result, durationMs: this.now() - start };
     } catch (err) {
       return {
         id: task.id,
         error: err instanceof Error ? err.message : String(err),
-        durationMs: Date.now() - start,
+        durationMs: this.now() - start,
       };
     }
   }
 
   /**
-   * Executes multiple tasks in parallel.
+   * Executes multiple tasks in deterministic input order.
    */
   async runAll<T, R>(tasks: WasmTask<T, R>[]): Promise<WasmTaskResult<R>[]> {
     return Promise.all(tasks.map((task) => this.run(task)));
   }
 
+  private resolveTimeout(taskTimeoutMs?: number): number {
+    if (typeof taskTimeoutMs !== 'number' || !Number.isFinite(taskTimeoutMs) || taskTimeoutMs <= 0) {
+      return this.defaultTimeoutMs;
+    }
+    return Math.min(taskTimeoutMs, this.maxTimeoutMs);
+  }
+
   private timeout<R>(ms: number, taskId: string): Promise<R> {
     return new Promise((_, reject) =>
       setTimeout(() => reject(new Error(`WasmRunner: task ${taskId} timed out after ${ms}ms`)), ms),
diff --git a/core/wasm/workerPool.ts b/core/wasm/workerPool.ts
index a8554db..714b365 100644
--- a/core/wasm/workerPool.ts
+++ b/core/wasm/workerPool.ts
@@ -1,16 +1,31 @@
 import { WasmRunner, WasmTask, WasmTaskResult } from './wasmRunner';
 
+export interface WorkerPoolOptions {
+  concurrency?: number;
+  maxQueueSize?: number;
+}
+
 /**
  * WorkerPool — manages a pool of WasmRunner instances for parallel task execution.
- * Implements a task queue with configurable concurrency and round-robin distribution.
  */
 export class WorkerPool {
   private readonly runners: WasmRunner[];
   private readonly queue: Array<() => void> = [];
+  private readonly concurrency: number;
+  private readonly maxQueueSize: number;
   private activeCount = 0;
   private roundRobinIndex = 0;
 
-  constructor(private readonly concurrency: number = 4) {
+  constructor(options: WorkerPoolOptions | number = 4) {
+    const normalized = typeof options === 'number' ? { concurrency: options } : options;
+    const concurrency = normalized.concurrency ?? 4;
+
+    if (!Number.isInteger(concurrency) || concurrency < 1) {
+      throw new Error('WorkerPool: concurrency must be a positive integer');
+    }
+
+    this.concurrency = concurrency;
+    this.maxQueueSize = normalized.maxQueueSize ?? concurrency * 100;
     this.runners = Array.from({ length: concurrency }, () => new WasmRunner());
   }
 
@@ -18,7 +33,7 @@ export class WorkerPool {
    * Submits a task to the pool. Queues it if all runners are busy.
    */
   submit<T, R>(task: WasmTask<T, R>): Promise<WasmTaskResult<R>> {
-    return new Promise((resolve) => {
+    return new Promise((resolve, reject) => {
       const run = () => {
         this.activeCount++;
         const runnerIndex = this.roundRobinIndex % this.runners.length;
@@ -33,9 +48,15 @@ export class WorkerPool {
 
       if (this.activeCount < this.concurrency) {
         run();
-      } else {
-        this.queue.push(run);
+        return;
       }
+
+      if (this.queue.length >= this.maxQueueSize) {
+        reject(new Error(`WorkerPool: queue capacity ${this.maxQueueSize} exceeded`));
+        return;
+      }
+
+      this.queue.push(run);
     });
   }
 
@@ -56,4 +77,8 @@ export class WorkerPool {
   get active(): number {
     return this.activeCount;
   }
+
+  get capacity(): number {
+    return this.concurrency;
+  }
 }
diff --git a/docs/deployment/docker.md b/docs/deployment/docker.md
new file mode 100644
index 0000000..eaeee2a
--- /dev/null
+++ b/docs/deployment/docker.md
@@ -0,0 +1,28 @@
+# Docker Deployment Playbook
+
+## Build
+
+```bash
+docker build -t ghcr.io/algodons/agentos:<tag> .
+```
+
+## Run
+
+```bash
+docker run --rm -p 3000:3000 --env-file .env.local ghcr.io/algodons/agentos:<tag>
+```
+
+## Security Hardening Controls
+
+- Use immutable tags (`vX.Y.Z` + commit SHA) and never deploy `latest` to production.
+- Run containers with read-only filesystems and non-root users in orchestration layers.
+- Inject secrets through runtime secret stores; never bake into images.
+- Validate image digests against release checksum artifacts before rollout.
+
+## Human Approval Checkpoint
+
+Before production rollout, release manager must verify:
+
+1. CI workflow green on release tag.
+2. Security workflow green (CodeQL + dependency audit).
+3. Signed-off deployment ticket with rollback owner assigned.
diff --git a/docs/deployment/kubernetes.md b/docs/deployment/kubernetes.md
new file mode 100644
index 0000000..385382e
--- /dev/null
+++ b/docs/deployment/kubernetes.md
@@ -0,0 +1,25 @@
+# Kubernetes Deployment Playbook
+
+## Apply Base Manifests
+
+```bash
+kubectl apply -f deploy/k8s/base/namespace.yaml
+kubectl apply -f deploy/k8s/base/deployment.yaml
+kubectl apply -f deploy/k8s/base/service.yaml
+kubectl apply -f deploy/k8s/base/hpa.yaml
+```
+
+## Production Controls
+
+- Keep `replicas >= 3` for zone-level resilience.
+- Use HPA CPU target at 65% and max replicas of 15 by default.
+- Require readiness and liveness probes on `/api/status`.
+- Enforce Pod Security (`runAsNonRoot`, dropped capabilities, `RuntimeDefault` seccomp).
+
+## Human Approval Checkpoint
+
+Deployment to production namespace requires:
+
+1. Approved change request.
+2. On-call SRE acknowledgement.
+3. Verified rollback image digest staged and tested.
diff --git a/docs/deployment/vercel.md b/docs/deployment/vercel.md
new file mode 100644
index 0000000..a63fbf7
--- /dev/null
+++ b/docs/deployment/vercel.md
@@ -0,0 +1,19 @@
+# Vercel Deployment Playbook
+
+## Project Setup
+
+1. Import repository into Vercel.
+2. Set framework preset to Next.js.
+3. Configure production environment variables using Vercel encrypted secrets.
+
+## Release Strategy
+
+- Deploy preview for every PR.
+- Promote to production only from protected `main` branch.
+- Pin deployment to release tags for auditable rollouts.
+
+## Human Approval Checkpoint
+
+- Require reviewer approval on PR.
+- Require workflow `CI` and `Security` to pass before merge.
+- Require explicit production promotion by release manager.
diff --git a/docs/operations/disaster-recovery.md b/docs/operations/disaster-recovery.md
new file mode 100644
index 0000000..6323fab
--- /dev/null
+++ b/docs/operations/disaster-recovery.md
@@ -0,0 +1,31 @@
+# Disaster Recovery Plan
+
+## Objectives
+
+- **RTO**: 30 minutes for core API restoration.
+- **RPO**: 15 minutes for persistent operational metadata.
+
+## Recovery Phases
+
+1. **Detect and declare incident**
+   - Trigger incident commander role.
+   - Freeze releases and non-essential deployments.
+2. **Stabilize**
+   - Shift traffic to healthy region/environment.
+   - Roll back to last known-good release tag.
+3. **Restore**
+   - Rehydrate state/configuration from backups and IaC definitions.
+   - Re-run smoke checks and determinism tests.
+4. **Validate and communicate**
+   - Confirm SLO recovery.
+   - Communicate status to stakeholders.
+
+## Backup and Restore Requirements
+
+- Daily encrypted backups of deployment configuration and secrets metadata references.
+- Immutable release artifacts retained for at least 90 days.
+- Quarterly restoration drills with documented outcomes.
+
+## Human Approval Checkpoint
+
+Incident commander and security lead must jointly approve return to normal release operations.
diff --git a/docs/operations/monitoring-alerting.md b/docs/operations/monitoring-alerting.md
new file mode 100644
index 0000000..96c312c
--- /dev/null
+++ b/docs/operations/monitoring-alerting.md
@@ -0,0 +1,34 @@
+# Monitoring Dashboards and Alerting Rules
+
+## Golden Signals Dashboard
+
+Track these panels at 1m, 5m, and 1h windows:
+
+- Request rate (`/api/optimize`, `/api/status`)
+- Error rate (5xx responses)
+- P95/P99 latency per endpoint
+- Container CPU and memory
+- Worker queue depth and saturation
+- Deployment and release version distribution
+
+## Recommended Alerts
+
+### Critical
+- API availability below 99.5% over 5 minutes.
+- P99 latency above 3 seconds over 10 minutes.
+- Error rate above 2% over 5 minutes.
+
+### High
+- Pod restart count > 3 within 10 minutes.
+- HPA at max replicas for > 15 minutes.
+- Security workflow failure on default branch.
+
+### Medium
+- CI failure rate > 20% on default branch over 24 hours.
+- Dependency audit reports high/critical vulnerabilities.
+
+## On-call Response Expectations
+
+- Acknowledge critical alerts within 5 minutes.
+- Begin mitigation within 10 minutes.
+- Publish incident timeline and root-cause analysis within 24 hours.
diff --git a/docs/operations/scaling-playbook.md b/docs/operations/scaling-playbook.md
new file mode 100644
index 0000000..3985a2c
--- /dev/null
+++ b/docs/operations/scaling-playbook.md
@@ -0,0 +1,19 @@
+# Scaling Playbook
+
+## Horizontal Scaling Strategy
+
+- Scale web tier via Kubernetes HPA between 3 and 15 replicas.
+- Keep worker pool concurrency tied to available vCPU: `concurrency = vCPU * 2`.
+- Limit queue growth with explicit `maxQueueSize` guardrails.
+
+## Capacity Planning
+
+- Baseline with load tests at 1x, 2x, and 4x expected peak RPS.
+- Reserve 30% headroom above the 95th-percentile peak.
+- Validate release candidate under peak load before production promotion.
+
+## Failure Containment
+
+- Use bounded retries with exponential backoff.
+- Fail fast on queue saturation and shed excess load.
+- Roll out every deployment to a canary first, before the global rollout.
diff --git a/tests/wasm.determinism.test.ts b/tests/wasm.determinism.test.ts
new file mode 100644
index 0000000..f243c86
--- /dev/null
+++ b/tests/wasm.determinism.test.ts
@@ -0,0 +1,63 @@
+import { WasmRunner } from '../core/wasm/wasmRunner';
+
+describe('WasmRunner determinism', () => {
+  it('preserves deterministic output for identical task input', async () => {
+    const fixedNow = [1_000, 1_010, 2_000, 2_010];
+    let index = 0;
+    const runner = new WasmRunner({ now: () => fixedNow[index++] ?? 2_010 });
+
+    const task = {
+      id: 'deterministic-task',
+      payload: { value: 7 },
+      execute: async ({ value }: { value: number }) => value * 3,
+    };
+
+    const first = await runner.run(task);
+    const second = await runner.run(task);
+
+    expect(first.result).toBe(21);
+    expect(second.result).toBe(21);
+    expect(first.durationMs).toBe(10);
+    expect(second.durationMs).toBe(10);
+  });
+
+  it('returns runAll results in input order even with mixed latencies', async () => {
+    const runner = new WasmRunner();
+    const tasks = [
+      {
+        id: 'slow',
+        payload: 1,
+        execute: async (value: number) => {
+          await new Promise((resolve) => setTimeout(resolve, 30));
+          return value;
+        },
+      },
+      {
+        id: 'fast',
+        payload: 2,
+        execute: async (value: number) => value,
+      },
+    ];
+
+    const results = await runner.runAll(tasks);
+
+    expect(results.map((r) => r.id)).toEqual(['slow', 'fast']);
+    expect(results.map((r) => r.result)).toEqual([1, 2]);
+  });
+
+  it('applies timeout validation bounds', async () => {
+    const runner = new WasmRunner({ defaultTimeoutMs: 20, maxTimeoutMs: 40 });
+
+    const result = await runner.run({
+      id: 'timeout-bounded',
+      payload: null,
+      timeoutMs: 1_000,
+      execute: async () => {
+        await new Promise((resolve) => setTimeout(resolve, 50));
+        return 'done';
+      },
+    });
+
+    expect(result.error).toContain('timed out after 40ms');
+  });
+});
diff --git a/tests/workerPool.load.test.ts b/tests/workerPool.load.test.ts
new file mode 100644
index 0000000..5672f58
--- /dev/null
+++ b/tests/workerPool.load.test.ts
@@ -0,0 +1,65 @@
+import { WorkerPool } from '../core/wasm/workerPool';
+
+describe('WorkerPool load and scaling', () => {
+  it('enforces configured concurrency while processing load', async () => {
+    const pool = new WorkerPool({ concurrency: 3, maxQueueSize: 200 });
+    let inFlight = 0;
+    let maxObserved = 0;
+
+    const tasks = Array.from({ length: 50 }, (_, i) =>
+      pool.submit({
+        id: `task-${i}`,
+        payload: i,
+        execute: async (value: number) => {
+          inFlight++;
+          maxObserved = Math.max(maxObserved, inFlight);
+          await new Promise((resolve) => setTimeout(resolve, 5));
+          inFlight--;
+          return value * 2;
+        },
+      }),
+    );
+
+    const results = await Promise.all(tasks);
+
+    expect(results).toHaveLength(50);
+    expect(results.every((r) => typeof r.result === 'number')).toBe(true);
+    expect(maxObserved).toBeLessThanOrEqual(3);
+    expect(pool.capacity).toBe(3);
+  });
+
+  it('rejects submissions beyond queue capacity', async () => {
+    const pool = new WorkerPool({ concurrency: 1, maxQueueSize: 1 });
+
+    const first = pool.submit({
+      id: 'first',
+      payload: null,
+      execute: async () => {
+        await new Promise((resolve) => setTimeout(resolve, 30));
+        return 'first';
+      },
+    });
+
+    const second = pool.submit({
+      id: 'second',
+      payload: null,
+      execute: async () => 'second',
+    });
+
+    const third = pool.submit({
+      id: 'third',
+      payload: null,
+      execute: async () => 'third',
+    });
+
+    await expect(first).resolves.toHaveProperty('id', 'first');
+    await expect(second).resolves.toHaveProperty('id', 'second');
+    await expect(third).rejects.toThrow('queue capacity 1 exceeded');
+  });
+
+  it('rejects invalid concurrency configuration', () => {
+    expect(() => new WorkerPool({ concurrency: 0 })).toThrow(
+      'WorkerPool: concurrency must be a positive integer',
+    );
+  });
+});

From ce63d6991a326ec671c01ba3ad91114a21f004c6 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Thu, 7 May 2026 14:48:29 +0000
Subject: [PATCH 3/5] Fix timeout cleanup and queue-capacity test stability

Agent-Logs-Url: https://github.com/Algodons/agentos/sessions/42083e98-582c-4e4c-a645-3e712f0bed81

Co-authored-by: SMSDAO <144380926+SMSDAO@users.noreply.github.com>
---
 core/wasm/wasmRunner.ts       | 24 ++++++++++++++----------
 tests/workerPool.load.test.ts | 13 +++++++------
 2 files changed, 21 insertions(+), 16 deletions(-)

diff --git a/core/wasm/wasmRunner.ts b/core/wasm/wasmRunner.ts
index 1302fe3..f105fc2 100644
--- a/core/wasm/wasmRunner.ts
+++ b/core/wasm/wasmRunner.ts
@@ -39,11 +39,17 @@ export class WasmRunner {
     const start = this.now();
     const timeoutMs = this.resolveTimeout(task.timeoutMs);
 
+    let timeoutId: NodeJS.Timeout | null = null;
+
     try {
-      const result = await Promise.race([
-        task.execute(task.payload),
-        this.timeout<R>(timeoutMs, task.id),
-      ]);
+      const timeoutPromise = new Promise<R>((_, reject) => {
+        timeoutId = setTimeout(
+          () => reject(new Error(`WasmRunner: task ${task.id} timed out after ${timeoutMs}ms`)),
+          timeoutMs,
+        );
+      });
+
+      const result = await Promise.race([task.execute(task.payload), timeoutPromise]);
       return { id: task.id, result, durationMs: this.now() - start };
     } catch (err) {
       return {
@@ -51,6 +57,10 @@ export class WasmRunner {
         error: err instanceof Error ? err.message : String(err),
         durationMs: this.now() - start,
       };
+    } finally {
+      if (timeoutId) {
+        clearTimeout(timeoutId);
+      }
     }
   }
 
@@ -67,10 +77,4 @@ export class WasmRunner {
     }
     return Math.min(taskTimeoutMs, this.maxTimeoutMs);
   }
-
-  private timeout<R>(ms: number, taskId: string): Promise<R> {
-    return new Promise((_, reject) =>
-      setTimeout(() => reject(new Error(`WasmRunner: task ${taskId} timed out after ${ms}ms`)), ms),
-    );
-  }
 }
diff --git a/tests/workerPool.load.test.ts b/tests/workerPool.load.test.ts
index 5672f58..d5f6314 100644
--- a/tests/workerPool.load.test.ts
+++ b/tests/workerPool.load.test.ts
@@ -46,15 +46,16 @@ describe('WorkerPool load and scaling', () => {
       execute: async () => 'second',
     });
 
-    const third = pool.submit({
-      id: 'third',
-      payload: null,
-      execute: async () => 'third',
-    });
+    await expect(
+      pool.submit({
+        id: 'third',
+        payload: null,
+        execute: async () => 'third',
+      }),
+    ).rejects.toThrow('queue capacity 1 exceeded');
 
     await expect(first).resolves.toHaveProperty('id', 'first');
     await expect(second).resolves.toHaveProperty('id', 'second');
-    await expect(third).rejects.toThrow('queue capacity 1 exceeded');
   });
 
   it('rejects invalid concurrency configuration', () => {

From cc8b233cab69616aadcc6bc81b8481eba13fe2f2 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Thu, 7 May 2026 14:50:06 +0000
Subject: [PATCH 4/5] Use immutable image tag in Kubernetes deployment

Agent-Logs-Url: https://github.com/Algodons/agentos/sessions/42083e98-582c-4e4c-a645-3e712f0bed81

Co-authored-by: SMSDAO <144380926+SMSDAO@users.noreply.github.com>
---
 deploy/k8s/base/deployment.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/deploy/k8s/base/deployment.yaml b/deploy/k8s/base/deployment.yaml
index e469d39..6176dcf 100644
--- a/deploy/k8s/base/deployment.yaml
+++ b/deploy/k8s/base/deployment.yaml
@@ -22,7 +22,7 @@ spec:
           type: RuntimeDefault
       containers:
         - name: agentos
-          image: ghcr.io/algodons/agentos:latest
+          image: ghcr.io/algodons/agentos:v1.0.0-sha.ce63d69
           imagePullPolicy: IfNotPresent
           ports:
             - containerPort: 3000

From 054d23822d4ee68507c5565a64bb79de161130f9 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Thu, 7 May 2026 14:55:20 +0000
Subject: [PATCH 5/5] Patch download-artifact action to v4.1.3

Agent-Logs-Url: https://github.com/Algodons/agentos/sessions/42083e98-582c-4e4c-a645-3e712f0bed81

Co-authored-by: SMSDAO <144380926+SMSDAO@users.noreply.github.com>
---
 .github/workflows/release.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 64c5b52..4d51001 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -101,7 +101,7 @@ jobs:
 
     steps:
       - name: Download artifacts
-        uses: actions/download-artifact@v4
+        uses: actions/download-artifact@v4.1.3
         with:
           name: release-artifacts