From a1701f3689c6f58f98c93766b8d84259609b2100 Mon Sep 17 00:00:00 2001 From: gziv Date: Mon, 11 May 2026 11:22:03 +0300 Subject: [PATCH 01/11] feat: add rh-automation-governance-assessor evaluation submission --- .../CLAUDE.md | 66 ++ .../docs/.ai-index/cross-reference-graph.json | 64 + .../docs/.ai-index/semantic-index.json | 101 ++ .../docs/.ai-index/task-to-docs-mapping.json | 73 ++ .../docs/INDEX.md | 46 + .../docs/SOURCES.md | 200 ++++ .../docs/aap/README.md | 19 + .../docs/aap/execution-governance.md | 569 +++++++++ .../docs/aap/governance-readiness.md | 959 +++++++++++++++ .../docs/aap/job-troubleshooting.md | 396 +++++++ .../docs/references/README.md | 17 + .../docs/references/error-classification.md | 338 ++++++ .../instruction.md | 11 + .../mcps.json | 76 ++ .../metadata.yaml | 13 + .../skills/governance-assessor/SKILL.md | 154 +++ .../references/sample-full-assessment.md | 163 +++ .../references/sample-scoped-assessment.md | 73 ++ .../supportive/.mcp.json | 8 + .../supportive/mcp-servers/mock-aap-mcp.py | 1048 +++++++++++++++++ .../tests/llm_judge.py | 83 ++ .../tests/test_outputs.py | 25 + 22 files changed, 4502 insertions(+) create mode 100644 submissions/rh-automation-governance-assessor/CLAUDE.md create mode 100644 submissions/rh-automation-governance-assessor/docs/.ai-index/cross-reference-graph.json create mode 100644 submissions/rh-automation-governance-assessor/docs/.ai-index/semantic-index.json create mode 100644 submissions/rh-automation-governance-assessor/docs/.ai-index/task-to-docs-mapping.json create mode 100644 submissions/rh-automation-governance-assessor/docs/INDEX.md create mode 100644 submissions/rh-automation-governance-assessor/docs/SOURCES.md create mode 100644 submissions/rh-automation-governance-assessor/docs/aap/README.md create mode 100644 submissions/rh-automation-governance-assessor/docs/aap/execution-governance.md create mode 100644 submissions/rh-automation-governance-assessor/docs/aap/governance-readiness.md create mode 
100644 submissions/rh-automation-governance-assessor/docs/aap/job-troubleshooting.md create mode 100644 submissions/rh-automation-governance-assessor/docs/references/README.md create mode 100644 submissions/rh-automation-governance-assessor/docs/references/error-classification.md create mode 100644 submissions/rh-automation-governance-assessor/instruction.md create mode 100644 submissions/rh-automation-governance-assessor/mcps.json create mode 100644 submissions/rh-automation-governance-assessor/metadata.yaml create mode 100644 submissions/rh-automation-governance-assessor/skills/governance-assessor/SKILL.md create mode 100644 submissions/rh-automation-governance-assessor/skills/governance-assessor/references/sample-full-assessment.md create mode 100644 submissions/rh-automation-governance-assessor/skills/governance-assessor/references/sample-scoped-assessment.md create mode 100644 submissions/rh-automation-governance-assessor/supportive/.mcp.json create mode 100644 submissions/rh-automation-governance-assessor/supportive/mcp-servers/mock-aap-mcp.py create mode 100644 submissions/rh-automation-governance-assessor/tests/llm_judge.py create mode 100644 submissions/rh-automation-governance-assessor/tests/test_outputs.py diff --git a/submissions/rh-automation-governance-assessor/CLAUDE.md b/submissions/rh-automation-governance-assessor/CLAUDE.md new file mode 100644 index 0000000..2e93308 --- /dev/null +++ b/submissions/rh-automation-governance-assessor/CLAUDE.md @@ -0,0 +1,66 @@ +# rh-automation Plugin + +You are an Ansible Automation Platform (AAP) engineer assistant. You help users assess governance readiness, run governed job executions with risk controls, and perform forensic analysis of failed jobs using Red Hat documentation and AAP APIs. + +## Skill-First Rule + +ALWAYS use the appropriate skill for AAP governance, execution, and troubleshooting tasks. 
Do NOT call MCP tools (`aap-mcp-job-management`, `aap-mcp-inventory-management`, `aap-mcp-configuration`, `aap-mcp-security-compliance`, `aap-mcp-system-monitoring`, `aap-mcp-user-management`) directly — skills enforce validation, risk analysis, human approval, and correct sequencing. + +To invoke a skill, use the Skill tool with the skill name (e.g., `/governance-executor`, `/forensic-troubleshooter`). + +## Intent Routing + +Match the user's request to the correct skill: + +| When the user asks about... | Use skill | +|----------------------------|-----------| +| End-to-end AAP governance readiness audit, production readiness, full or scoped governance assessment (orchestrates readiness steps) | `/governance-assessor` | +| Governed job execution: launch job template, production deploy, risk gates, check mode, approval (orchestrates validation → risk → launch) | `/governance-executor` | +| Failed job, root cause, what went wrong, forensic analysis of job errors (orchestrates analysis → host facts → resolution advice) | `/forensic-troubleshooter` | +| Validate AAP MCP connectivity, test AAP connection, verify MCP servers before other work | `/aap-mcp-validator` | +| Governance readiness only (7 domains), audit credentials/RBAC/workflows/notifications without the full governance-assessor wrapper | `/governance-readiness-assessor` | +| Is this execution safe?, production target risk, scan extra_vars, execution scope before launch | `/execution-risk-analyzer` | +| Launch job after risk analysis, check mode / dry run first, phased rollout, rollback | `/governed-job-launcher` | +| Analyze failed job events, failure timeline, classify job error (not host facts or fixes yet) | `/job-failure-analyzer` | +| Host facts for failed hosts, disk/memory drift, correlate inventory with job failure | `/host-fact-inspector` | +| How to fix, Red Hat docs recommendation, remediation after failure analysis | `/resolution-advisor` | +| Session / workflow audit trail, execution summary 
report after governance or troubleshooting | `/execution-summary` | + +If the request doesn't clearly match one skill, ask the user to clarify. For **full platform governance assessment**, prefer `/governance-assessor`. For **governed execution**, prefer `/governance-executor` rather than running `/execution-risk-analyzer` and `/governed-job-launcher` manually unless the user scoped a single step. For **job failure deep-dive**, prefer `/forensic-troubleshooter` over piecing together analysis skills unless the user only wants one sub-step. + +## Skill Chaining + +Some workflows are orchestrated for you: + +- **Governance assessment**: `/governance-assessor` orchestrates validation and readiness assessment (including `/governance-readiness-assessor`) and typically ends with `/execution-summary`. +- **Governed execution**: `/governance-executor` orchestrates `/aap-mcp-validator`, `/execution-risk-analyzer`, `/governed-job-launcher`, and `/execution-summary`. +- **Forensic troubleshooting**: `/forensic-troubleshooter` orchestrates `/job-failure-analyzer`, `/host-fact-inspector`, `/resolution-advisor`, and `/execution-summary`. + +Typical standalone sequences: + +- **Pre-flight only**: `/aap-mcp-validator` before any AAP-dependent skill. +- **Manual execution path** (when not using orchestrator): `/aap-mcp-validator` → `/execution-risk-analyzer` → `/governed-job-launcher` → `/execution-summary`. +- **Manual troubleshooting path**: `/job-failure-analyzer` → `/host-fact-inspector` → `/resolution-advisor` → `/execution-summary`. + +After completing a skill, suggest relevant next-step skills (for example, after readiness assessment offer `/governance-executor` for controlled execution, or after a failed run offer `/forensic-troubleshooter`). + +## MCP Servers + +Six HTTP MCP servers are configured for this pack. Skills wrap these — do not call their tools directly. 
+ +- **aap-mcp-job-management** (Required for jobs and execution) — Job templates, launches, events, statuses, workflows, approvals. +- **aap-mcp-inventory-management** (Required for inventory-scoped work) — Inventories, hosts, groups, host facts (`ansible_facts`). +- **aap-mcp-configuration** (Required for full governance readiness) — Notification templates, execution environments, platform settings. +- **aap-mcp-security-compliance** (Required for full governance readiness) — Credentials, credential types, credential testing. +- **aap-mcp-system-monitoring** (Required for full governance readiness) — Instance groups, activity stream, mesh topology, platform status. +- **aap-mcp-user-management** (Required for full governance readiness) — Users, teams, organizations, roles, RBAC. + +Environment variables `AAP_MCP_SERVER` and `AAP_API_TOKEN` are defined in `mcps.json` using `${...}` placeholders only; never expose secret values in chat output. + +## Global Rules + +1. **Never expose credentials** — do not display API tokens, Bearer values, or raw contents of `AAP_API_TOKEN`. Only report whether required environment variables appear set. +2. **Confirm before execution and destructive impact** — follow each skill's human-in-the-loop steps: show plans, risk level, and obtain explicit approval before job launches that affect production or sensitive inventories. +3. **Never skip validation when the skill requires it** — use `/aap-mcp-validator` when prerequisites call for it; do not assume connectivity. +4. **Prefer orchestration skills for multi-step outcomes** — use `/governance-assessor`, `/governance-executor`, or `/forensic-troubleshooter` when the user wants an end-to-end outcome unless they explicitly request a single sub-task. +5. **Suggest next steps** — after completing a skill, suggest related skills the user might run next. 
diff --git a/submissions/rh-automation-governance-assessor/docs/.ai-index/cross-reference-graph.json b/submissions/rh-automation-governance-assessor/docs/.ai-index/cross-reference-graph.json new file mode 100644 index 0000000..5160a81 --- /dev/null +++ b/submissions/rh-automation-governance-assessor/docs/.ai-index/cross-reference-graph.json @@ -0,0 +1,64 @@ +{ + "version": "2.0", + "last_updated": "2026-02-26", + "nodes": [ + { + "id": "governance-readiness", + "path": "aap/governance-readiness.md", + "title": "AAP Governance Readiness Assessment" + }, + { + "id": "execution-governance", + "path": "aap/execution-governance.md", + "title": "Execution Governance" + }, + { + "id": "job-troubleshooting", + "path": "aap/job-troubleshooting.md", + "title": "Job Troubleshooting" + }, + { + "id": "error-classification", + "path": "references/error-classification.md", + "title": "Error Classification Taxonomy" + } + ], + "edges": [ + { + "from": "governance-readiness", + "to": "execution-governance", + "relationship": "precedes", + "description": "After assessing readiness, use execution governance for governed execution" + }, + { + "from": "execution-governance", + "to": "job-troubleshooting", + "relationship": "follows_on_failure", + "description": "If execution fails, use troubleshooting for forensic analysis" + }, + { + "from": "job-troubleshooting", + "to": "error-classification", + "relationship": "references", + "description": "Troubleshooting uses error classification for systematic error typing" + }, + { + "from": "execution-governance", + "to": "governance-readiness", + "relationship": "optional_precondition", + "description": "First production execution may trigger readiness assessment" + }, + { + "from": "error-classification", + "to": "governance-readiness", + "relationship": "identifies_gaps", + "description": "Platform errors may indicate governance gaps in readiness domains" + }, + { + "from": "governance-readiness", + "to": "job-troubleshooting", + 
"relationship": "cross_reference", + "description": "Platform configuration issues discovered in assessment may explain job failures" + } + ] +} diff --git a/submissions/rh-automation-governance-assessor/docs/.ai-index/semantic-index.json b/submissions/rh-automation-governance-assessor/docs/.ai-index/semantic-index.json new file mode 100644 index 0000000..aae9f4c --- /dev/null +++ b/submissions/rh-automation-governance-assessor/docs/.ai-index/semantic-index.json @@ -0,0 +1,101 @@ +{ + "version": "2.0", + "last_updated": "2026-02-26", + "documents": [ + { + "path": "aap/governance-readiness.md", + "title": "AAP Governance Readiness Assessment", + "category": "aap", + "token_estimate": 3500, + "semantic_keywords": [ + "platform readiness assessment", + "governance audit", + "AAP best practices check", + "RBAC compliance", + "credential hygiene", + "workflow governance", + "notification coverage", + "execution environment review", + "workload isolation", + "audit trail verification", + "production readiness", + "cross-domain correlation", + "compound risk analysis", + "adaptive depth queries", + "prioritized remediation", + "scale calibration" + ], + "use_cases": ["governance_readiness_assessment", "platform_audit", "pre_execution_check", "compound_risk_analysis"], + "mcp_servers_used": ["job-management", "inventory-management", "configuration", "security-compliance", "system-monitoring", "user-management"], + "red_hat_sources": 8, + "skills_that_read": ["governance-readiness-assessor"] + }, + { + "path": "aap/execution-governance.md", + "title": "Execution Governance", + "category": "aap", + "token_estimate": 2500, + "semantic_keywords": [ + "execute on production", + "check mode dry run", + "inventory risk classification", + "secret scanning extra_vars", + "rollback failed job", + "phased rollout", + "job template launch", + "diff mode", + "execution safety", + "production governance", + "job history analysis", + "template launch configuration", + "notification 
bindings", + "workflow coverage check", + "module analysis", + "adaptive risk elevation" + ], + "use_cases": ["governed_execution", "risk_analysis", "check_mode_execution", "rollback", "pre_execution_context"], + "mcp_servers_used": ["job-management", "inventory-management"], + "red_hat_sources": 5, + "skills_that_read": ["execution-risk-analyzer", "governed-job-launcher"] + }, + { + "path": "aap/job-troubleshooting.md", + "title": "Job Troubleshooting", + "category": "aap", + "token_estimate": 2000, + "semantic_keywords": [ + "job failed", + "why did the job fail", + "analyze failure", + "job events", + "host unreachable", + "module failure", + "error analysis", + "root cause", + "failure correlation" + ], + "use_cases": ["job_failure_analysis", "forensic_troubleshooting", "host_correlation"], + "mcp_servers_used": ["job-management", "inventory-management"], + "red_hat_sources": 3, + "skills_that_read": ["job-failure-analyzer", "host-fact-inspector"] + }, + { + "path": "references/error-classification.md", + "title": "Error Classification Taxonomy", + "category": "references", + "token_estimate": 1500, + "semantic_keywords": [ + "error classification", + "platform vs code error", + "resolution path", + "error taxonomy", + "failure type determination", + "troubleshooting decision tree" + ], + "use_cases": ["error_classification", "resolution_path_determination"], + "mcp_servers_used": [], + "red_hat_sources": 3, + "skills_that_read": ["resolution-advisor"] + } + ] +} diff --git a/submissions/rh-automation-governance-assessor/docs/.ai-index/task-to-docs-mapping.json b/submissions/rh-automation-governance-assessor/docs/.ai-index/task-to-docs-mapping.json new file mode 100644 index 0000000..85ffede --- /dev/null +++ b/submissions/rh-automation-governance-assessor/docs/.ai-index/task-to-docs-mapping.json @@ -0,0 +1,73 @@ +{ + "version": "2.0", + "last_updated": "2026-02-26", + "workflows": { + "governance_assessment": { + "description": "Assess AAP platform governance 
readiness across 7 domains", + "trigger_phrases": [ + "assess governance readiness", + "is my AAP ready for production", + "audit platform configuration", + "check governance", + "what should I fix before executing jobs" + ], + "documents": [ + { + "path": "aap/governance-readiness.md", + "role": "primary", + "reason": "7-domain assessment framework with Red Hat citations" + } + ], + "agent": "governance-assessor", + "skills": ["aap-mcp-validator", "governance-readiness-assessor", "execution-summary"] + }, + "governed_execution": { + "description": "Execute with risk analysis, check mode, and governance controls", + "trigger_phrases": [ + "execute on production", + "push to prod", + "launch job template", + "execute security patch", + "release to production" + ], + "documents": [ + { + "path": "aap/execution-governance.md", + "role": "primary", + "reason": "Risk classification, check mode, rollback, phased rollout" + }, + { + "path": "aap/governance-readiness.md", + "role": "optional", + "reason": "Optional pre-execution readiness check" + } + ], + "agent": "governance-executor", + "skills": ["aap-mcp-validator", "execution-risk-analyzer", "governed-job-launcher", "execution-summary"] + }, + "forensic_troubleshooting": { + "description": "Analyze failed jobs with event extraction, host correlation, and resolution advisory", + "trigger_phrases": [ + "job failed", + "why did the execution fail", + "analyze the failure", + "what went wrong", + "root cause analysis" + ], + "documents": [ + { + "path": "aap/job-troubleshooting.md", + "role": "primary", + "reason": "Event parsing, failure patterns, host correlation" + }, + { + "path": "references/error-classification.md", + "role": "secondary", + "reason": "Error taxonomy and resolution path mapping" + } + ], + "agent": "forensic-troubleshooter", + "skills": ["aap-mcp-validator", "job-failure-analyzer", "host-fact-inspector", "resolution-advisor", "execution-summary"] + } + } +} diff --git 
a/submissions/rh-automation-governance-assessor/docs/INDEX.md b/submissions/rh-automation-governance-assessor/docs/INDEX.md new file mode 100644 index 0000000..644d7bd --- /dev/null +++ b/submissions/rh-automation-governance-assessor/docs/INDEX.md @@ -0,0 +1,46 @@ +# Documentation Index + +Navigation guide for the rh-automation knowledge base. These documents are read by skills at runtime to provide Red Hat documentation-backed intelligence. + +## How Documents Are Used + +``` +User Request → Agent → Skill reads document → Skill queries MCP tools → Skill interprets with document knowledge → Output with Red Hat citations +``` + +## Document Map + +### AAP Category (`docs/aap/`) + +Platform governance, execution, and troubleshooting references for Ansible Automation Platform. + +| Document | Purpose | Skills That Read It | Red Hat Sources | +|----------|---------|-------------------|----------------| +| [governance-readiness.md](aap/governance-readiness.md) | 7-domain platform governance assessment | `governance-readiness-assessor` | 8 sources (Security Best Practices, Workflows, Notifications, RBAC, Instance Groups, Activity Stream, EE Guide, Hardening Guide) | +| [execution-governance.md](aap/execution-governance.md) | Risk classification, check mode, rollback, phased rollout | `execution-risk-analyzer`, `governed-job-launcher` | 5 sources (Job Templates, Security Best Practices, Workflows, Check Mode, Controller Best Practices) | +| [job-troubleshooting.md](aap/job-troubleshooting.md) | Event parsing, host correlation, failure patterns | `job-failure-analyzer`, `host-fact-inspector` | 3 sources (Troubleshooting Guide, Job Events, Administration Guide) | + +### References Category (`docs/references/`) + +Cross-cutting reference material used across multiple use cases. 
+ +| Document | Purpose | Skills That Read It | Red Hat Sources | +|----------|---------|-------------------|----------------| +| [error-classification.md](references/error-classification.md) | Error taxonomy, classification trees, resolution paths | `resolution-advisor` | 3 sources (Troubleshooting Guide, Ansible Module docs, Administration Guide) | + +## Task-to-Document Mapping + +| User Task | Primary Document | Secondary Document | +|-----------|-----------------|-------------------| +| "Assess governance readiness" | governance-readiness.md | -- | +| "Execute on production" | execution-governance.md | governance-readiness.md (optional pre-check) | +| "Analyze failed job" | job-troubleshooting.md | error-classification.md | +| "How to fix this error?" | error-classification.md | job-troubleshooting.md | + +## Semantic Indexing + +The `.ai-index/` directory contains pre-computed indexes for efficient document discovery: + +- `semantic-index.json` -- Document metadata with semantic keywords +- `task-to-docs-mapping.json` -- Pre-computed document sets for common workflows +- `cross-reference-graph.json` -- Document relationship graph diff --git a/submissions/rh-automation-governance-assessor/docs/SOURCES.md b/submissions/rh-automation-governance-assessor/docs/SOURCES.md new file mode 100644 index 0000000..0e064fc --- /dev/null +++ b/submissions/rh-automation-governance-assessor/docs/SOURCES.md @@ -0,0 +1,200 @@ +# Official Red Hat Sources + +All documentation in this collection is derived from or references official Red Hat and Ansible documentation. Content is used in accordance with Red Hat's documentation license (CC BY-SA 4.0). + +## Primary Sources + +### 1. Red Hat AAP 2.5 - Configuring Automation Execution: Security Best Practices (Ch. 15) + +**URL**: https://docs.redhat.com/en/documentation/red_hat_ansible_automation_platform/2.5/html/configuring_automation_execution/controller-security-best-practices + +**Sections Used**: +- Sec. 
15.1.2: Minimize administrative accounts +- Sec. 15.1.4: Remove user access to credentials +- Sec. 15.1.5: Enforce separation of duties +- Sec. 15.2.1: Use teams for role-based access +- Sec. 15.2.2: External authentication (LDAP, SAML, OAuth) + +**Referenced By**: governance-readiness.md (Domains 3, 4, Bonus), execution-governance.md (secret scanning) + +**Date Accessed**: 2026-02-20 + +--- + +### 2. Red Hat AAP 2.5 - Automation Controller User Guide: Workflows (Ch. 9) + +**URL**: https://docs.redhat.com/en/documentation/red_hat_ansible_automation_platform/2.5/html/using_automation_execution/controller-workflows + +**Sections Used**: +- Workflow job templates +- Sec. 9.4: Workflow RBAC +- Approval nodes + +**Referenced By**: governance-readiness.md (Domain 1), execution-governance.md + +**Date Accessed**: 2026-02-20 + +--- + +### 3. Red Hat AAP 2.5 - Automation Controller User Guide: Notifications (Ch. 25) + +**URL**: https://docs.redhat.com/en/documentation/red_hat_ansible_automation_platform/2.5/html/using_automation_execution/controller-notifications + +**Sections Used**: +- Notification templates +- Sec. 25.1: Notification inheritance hierarchy +- Notification types (Email, Slack, Webhook, PagerDuty) + +**Referenced By**: governance-readiness.md (Domain 2) + +**Date Accessed**: 2026-02-20 + +--- + +### 4. Red Hat AAP 2.5 - Automation Controller User Guide: RBAC (Ch. 4) + +**URL**: https://docs.redhat.com/en/documentation/red_hat_ansible_automation_platform/2.6/html/access_management_and_authentication/gw-managing-access + +**Sections Used**: +- Role-based access controls +- Role definitions +- Team assignments + +**Referenced By**: governance-readiness.md (Domain 3) + +**Date Accessed**: 2026-02-20 + +--- + +### 5. Red Hat AAP 2.5 - Configuring Automation Execution: Instance Groups (Ch. 
17) + +**URL**: https://docs.redhat.com/en/documentation/red_hat_ansible_automation_platform/2.6/html/using_automation_execution/controller-instance-groups + +**Sections Used**: +- Instance groups for workload isolation +- max_forks configuration +- Policy settings + +**Referenced By**: governance-readiness.md (Domain 6) + +**Date Accessed**: 2026-02-20 + +--- + +### 6. Red Hat AAP 2.5 - Automation Controller User Guide: Activity Stream + +**URL**: https://docs.redhat.com/en/documentation/red_hat_ansible_automation_platform/2.6/html/using_automation_execution/assembly-controller-activity-stream + +**Sections Used**: +- Activity stream audit logging +- Event filtering + +**Referenced By**: governance-readiness.md (Domain 7) + +**Date Accessed**: 2026-02-20 + +--- + +### 7. Red Hat AAP 2.6 - Creating and Using Execution Environments + +**URL**: https://docs.redhat.com/en/documentation/red_hat_ansible_automation_platform/2.6/html-single/creating_and_using_execution_environments/index + +**Sections Used**: +- Custom EE creation +- Dependency pinning +- ansible-builder + +**Referenced By**: governance-readiness.md (Domain 5), error-classification.md (EE issues) + +**Date Accessed**: 2026-02-20 + +--- + +### 8. Red Hat AAP 2.6 - Hardening Guide + +**URL**: https://docs.redhat.com/en/documentation/red_hat_ansible_automation_platform/2.6/html/hardening_and_compliance/index + +**Sections Used**: +- Platform hardening +- Credential rotation +- Audit requirements + +**Referenced By**: governance-readiness.md + +**Date Accessed**: 2026-02-20 + +--- + +### 9. 
Red Hat AAP 2.6 - Troubleshooting Guide: Troubleshoot Jobs + +**URL**: https://docs.redhat.com/en/documentation/red_hat_ansible_automation_platform/2.6/html/troubleshooting_ansible_automation_platform/troubleshoot-jobs + +**Sections Used**: +- Job failure analysis +- Common job errors +- Event interpretation + +**Referenced By**: job-troubleshooting.md, error-classification.md + +**Date Accessed**: 2026-02-20 + +--- + +### 10. Red Hat AAP 2.5 - Automation Controller User Guide: Job Templates (Ch. 9) + +**URL**: https://docs.redhat.com/en/documentation/red_hat_ansible_automation_platform/2.5/html/automation_controller_user_guide/controller-job-templates + +**Sections Used**: +- Job template configuration +- job_type (run/check) +- diff_mode, limit, extra_vars +- Job slicing +- Relaunch + +**Referenced By**: execution-governance.md, job-troubleshooting.md + +**Date Accessed**: 2026-02-20 + +--- + +### 11. Red Hat AAP 2.5 - Configuring Automation Execution: Controller Best Practices + +**URL**: https://docs.redhat.com/en/documentation/red_hat_ansible_automation_platform/2.5/html/configuring_automation_execution/controller-tips-and-tricks + +**Sections Used**: +- Inventory management +- Environment separation + +**Referenced By**: execution-governance.md (risk classification) + +**Date Accessed**: 2026-02-20 + +--- + +### 12. Ansible Playbook Guide: Check Mode + +**URL**: https://docs.ansible.com/ansible/latest/playbook_guide/playbooks_checkmode.html + +**Sections Used**: +- Check mode behavior +- diff mode +- Limitations (shell/command modules) + +**Referenced By**: execution-governance.md (check mode section) + +**Date Accessed**: 2026-02-20 + +--- + +### 13. 
Ansible Built-in Module Documentation + +**URL**: https://docs.ansible.com/ansible/latest/collections/ansible/builtin/index.html + +**Sections Used**: +- Module return values +- Error conditions +- Check mode behavior per module + +**Referenced By**: error-classification.md + +**Date Accessed**: 2026-02-20 diff --git a/submissions/rh-automation-governance-assessor/docs/aap/README.md b/submissions/rh-automation-governance-assessor/docs/aap/README.md new file mode 100644 index 0000000..b53902a --- /dev/null +++ b/submissions/rh-automation-governance-assessor/docs/aap/README.md @@ -0,0 +1,19 @@ +# AAP Documentation + +Platform governance, execution, and troubleshooting references for Ansible Automation Platform. + +## Documents + +| Document | Purpose | Use Case | +|----------|---------|----------| +| [governance-readiness.md](governance-readiness.md) | 7-domain platform governance assessment against Red Hat best practices | UC1: Governance Assessment | +| [execution-governance.md](execution-governance.md) | Risk classification, check mode, rollback, phased rollout | UC2: Governed Execution | +| [job-troubleshooting.md](job-troubleshooting.md) | Event parsing, host correlation, failure patterns | UC3: Forensic Troubleshooting | + +## How to Use + +These documents are read by skills at runtime. The skill reads the document FIRST, then queries MCP tools, then interprets results using the document's decision tables and Red Hat citations. 
+ +``` +Skill reads document → Queries MCP → Interprets with Red Hat knowledge → Reports with citations +``` diff --git a/submissions/rh-automation-governance-assessor/docs/aap/execution-governance.md b/submissions/rh-automation-governance-assessor/docs/aap/execution-governance.md new file mode 100644 index 0000000..21db9cf --- /dev/null +++ b/submissions/rh-automation-governance-assessor/docs/aap/execution-governance.md @@ -0,0 +1,569 @@ +--- +title: Execution Governance +category: aap +sources: + - title: "Red Hat AAP 2.5 - Job Templates" + url: https://docs.redhat.com/en/documentation/red_hat_ansible_automation_platform/2.5/html/automation_controller_user_guide/controller-job-templates + sections: "Ch. 9: Job template configuration, job_type (run/check), diff_mode, limit, extra_vars, job slicing" + date_accessed: 2026-02-20 + - title: "Red Hat AAP 2.5 - Security Best Practices" + url: https://docs.redhat.com/en/documentation/red_hat_ansible_automation_platform/2.5/html/configuring_automation_execution/controller-security-best-practices + sections: "Ch. 15: Sec. 15.1.4 Remove user access to credentials, Sec. 15.1.5 Enforce separation of duties" + date_accessed: 2026-02-20 + - title: "Red Hat AAP 2.5 - Workflows" + url: https://docs.redhat.com/en/documentation/red_hat_ansible_automation_platform/2.5/html/using_automation_execution/controller-workflows + sections: "Ch. 
9: Workflow RBAC, approval nodes, failure handling" + date_accessed: 2026-02-20 + - title: "Red Hat Ansible Best Practices - Check Mode" + url: https://docs.ansible.com/ansible/latest/playbook_guide/playbooks_checkmode.html + sections: "Check mode, diff mode, limitations with shell/command modules" + date_accessed: 2026-02-20 + - title: "Red Hat AAP Controller Best Practices" + url: https://docs.redhat.com/en/documentation/red_hat_ansible_automation_platform/2.5/html/configuring_automation_execution/controller-tips-and-tricks + sections: "Inventory management, environment separation" + date_accessed: 2026-02-20 +tags: [execution, governance, check-mode, risk-classification, rollback, phased-rollout, extra-vars, secret-scanning] +applies_to: [aap2.5, aap2.6] +semantic_keywords: + - "execute on production" + - "check mode dry run" + - "inventory risk classification" + - "secret scanning extra_vars" + - "rollback failed job" + - "phased rollout" + - "job template launch" + - "diff mode" + - "execution safety" + - "production governance" +use_cases: + - "governed_execution" + - "risk_analysis" + - "check_mode_execution" + - "rollback" +related_docs: + - "aap/governance-readiness.md" + - "aap/job-troubleshooting.md" + - "references/error-classification.md" +last_updated: 2026-02-26 +--- + +# Execution Governance + +This document teaches the agent how to execute governed jobs on Ansible Automation Platform. It covers inventory risk classification, pre-launch safety checks, check mode execution and interpretation, rollback patterns, and phased rollout strategies. Every governance control is rooted in Red Hat's official documentation. + +## Overview + +A governed execution follows a principle: **the higher the risk, the more governance controls apply**. Risk is determined by the target inventory, the scope of change, and the content of extra_vars. 
Governance controls range from simple confirmation (low risk) to mandatory check mode, approval gates, and phased rollout (critical risk). + +## When to Use This Document + +**Use when**: +- User asks to execute, launch, or run a job template +- User asks to push to production +- User asks about check mode or dry runs +- User asks about rollback after a failed execution + +**Do NOT use when**: +- User asks to assess platform governance readiness (use [governance-readiness.md](governance-readiness.md)) +- User asks to troubleshoot a failed job (use [job-troubleshooting.md](job-troubleshooting.md)) + +--- + +## Inventory Risk Classification + +### Red Hat Source + +> "It is best practice to use separate inventories for production and development environments." +> +> -- *Red Hat AAP 2.5, Configuring Automation Execution, Controller Best Practices* + +### Classification Approach + +Based on Red Hat's recommendation to separate production and development environments, this agent detects the environment from inventory metadata and applies proportional governance. This is the agent's contribution: translating Red Hat's principle into automated risk detection. + +**Risk signals** (checked in order): + +1. **Inventory name** (primary signal): Word-boundary matching against environment keywords +2. **Host count** (secondary signal): Large inventories carry higher blast radius +3. 
**Recent job history** (tertiary signal): Inventories with recent failures may need extra caution + +### MCP Pattern: Inventory Lookup + +```json +MCP Server: aap-mcp-inventory-management +Tool: inventories_list +Parameters: { "page_size": 100 } +``` + +To check host count for a specific inventory: + +```json +MCP Server: aap-mcp-inventory-management +Tool: hosts_list +Parameters: { "page_size": 1, "search": "" } +``` + +### Risk Decision Table + +| Inventory Name Pattern | Host Count | Risk Level | Governance Required | +|---|---|---|---| +| Contains `prod`, `production`, `live` | Any | **CRITICAL** | Check mode + approval + phased rollout recommended | +| Contains `stage`, `staging`, `uat`, `preprod` | Any | **HIGH** | Check mode + approval | +| Contains `test`, `qa` | Any | **MEDIUM** | Confirmation only | +| Contains `dev`, `development`, `sandbox`, `lab` | Any | **LOW** | Direct execution permitted | +| No matching pattern | > 50 hosts | **HIGH** | Check mode + approval (large blast radius) | +| No matching pattern | <= 50 hosts | **MEDIUM** | Confirmation only | + +**Transparency note**: The name-based risk patterns above are this agent's implementation of Red Hat's principle to "use separate inventories for production and development." The host count thresholds are the agent's contribution for unclassifiable inventories. + +--- + +## Extra Vars Safety Scanning + +### Red Hat Source + +> "Remove user access to credentials. Credentials can be configured at the organization, team, or user level." +> +> -- *Red Hat AAP 2.5, Configuring Automation Execution, Ch. 15, Sec. 15.1.4* + +### What This Means for Extra Vars + +Red Hat's credential management system stores secrets securely within AAP. When users pass secrets directly in `extra_vars` (as plain-text strings), they bypass this protection -- the secret appears in job logs, activity stream, and API responses. The agent detects this anti-pattern. 
+ +### Detection Patterns + +The agent scans `extra_vars` key names and values for indicators of plain-text secrets: + +**Key name patterns** (case-insensitive): `password`, `secret`, `token`, `api_key`, `apikey`, `private_key`, `ssh_key`, `access_key`, `auth` + +**Value patterns**: Strings that look like tokens (long alphanumeric strings, base64-encoded content, strings starting with common prefixes like `sk-`, `ghp_`, `Bearer`) + +### MCP Pattern: Job Template Launch Parameters + +Before launching, inspect the launch configuration: + +```json +MCP Server: aap-mcp-job-management +Tool: job_templates_launch_retrieve +Parameters: { "id": "" } +``` + +This returns the template's expected extra_vars, defaults, and required fields. + +### Decision Table + +| Finding | Severity | Action | +|---|---|---| +| Secret-like key name with literal value in extra_vars | **BLOCK** | Refuse to launch. Recommend using AAP credentials instead. | +| Secret-like key name with variable reference (`{{ }}`) | **PASS** | Variable references are acceptable (resolved at runtime) | +| No secret indicators | **PASS** | Proceed with launch | + +**Transparency note**: The secret detection patterns above are this agent's implementation of Red Hat's recommendation to "Remove user access to credentials" (Ch. 15, Sec. 15.1.4). The regex patterns are the agent's contribution; the principle is Red Hat's. + +--- + +## Pre-Execution Context Analysis + +Beyond static risk classification (inventory name, extra_vars), the agent SHOULD examine the job template's operational context -- its history, configuration, and governance bindings. These signals adapt the risk assessment to the specific scenario rather than relying solely on inventory name patterns. + +### Red Hat Sources + +> "You can set notifications on job start and job end, including job failure, for the following resources: job templates, workflow templates, organizations, and projects." 
+> +> -- *Red Hat AAP 2.5, Automation Controller User Guide, Ch. 25: Notifications* + +> "Workflows enable you to configure a sequence of disparate job templates and link them together in order to execute them as a single unit." +> +> -- *Red Hat AAP 2.5, Automation Controller User Guide, Ch. 9: Workflows* + +### Signal 1: Job History + +Check whether this job template has recent failures: + +```json +MCP Server: aap-mcp-job-management +Tool: jobs_list +Parameters: { "page_size": 5, "job_template": "", "order_by": "-finished" } +``` + +| Recent Jobs | Last Runs Status | Signal | Agent Action | +|---|---|---|---| +| > 0 | All successful | **CLEAR** | Proceed normally | +| > 0 | Most recent failed | **WARN** | "This template's last run failed. Investigate before re-executing." | +| > 0 | 2+ consecutive failures | **ELEVATED** | "This template has failed [N] consecutive times. Strongly recommend investigating root cause before retrying." | +| 0 | N/A (never run) | **INFO** | "This template has never been executed. First run -- extra caution recommended." | + +**Transparency note**: Job history analysis is the agent's proactive contribution. Red Hat does not prescribe checking history before launches, but the agent uses available MCP data to provide situational awareness that a vanilla agent would not. + +### Signal 2: Template Launch Configuration + +Check whether the template allows governance overrides at launch time: + +```json +MCP Server: aap-mcp-job-management +Tool: job_templates_launch_retrieve +Parameters: { "id": "" } +``` + +Examine the `ask_*_on_launch` fields: + +| Field | Value | Implication | +|---|---|---| +| `ask_job_type_on_launch` | `true` | Check mode override available -- governance can enforce dry-run | +| `ask_job_type_on_launch` | `false` | **Check mode not overridable at launch.** Agent cannot enforce dry-run without template modification. 
| +| `ask_limit_on_launch` | `true` | Phased rollout possible via `limit` parameter | +| `ask_limit_on_launch` | `false` | **Phased rollout not possible.** Agent cannot restrict hosts at launch. | +| `ask_variables_on_launch` | `true` | Extra vars can be overridden -- check for secret injection risk | +| `ask_diff_mode_on_launch` | `true` | Diff mode overridable -- governance can enable detailed change reporting | + +**Agent Action**: +- If `ask_job_type_on_launch` is `false` AND risk is CRITICAL/HIGH: Warn user that check mode cannot be enforced. Recommend modifying the template to enable "Prompt on launch" for job_type. +- If `ask_limit_on_launch` is `false` AND risk is CRITICAL: Warn that phased rollout is not possible with this template configuration. + +### Signal 3: Notification Bindings + +Check whether the template has failure notifications: + +```json +MCP Server: aap-mcp-job-management +Tool: job_templates_retrieve +Parameters: { "id": "" } +``` + +If the template shows no notification associations for error events: + +**Agent Action**: "Per Red Hat's Notifications documentation (Ch. 25), this job template has no failure notification configured. If this execution fails, no one will be automatically alerted. Consider adding failure notifications before production use." + +### Signal 4: Workflow Coverage + +Check whether this template is wrapped in a governed workflow: + +```json +MCP Server: aap-mcp-job-management +Tool: workflow_job_templates_list +Parameters: { "page_size": 100 } +``` + +If no workflows exist, or none reference this job template: + +**Agent Action**: "Per Red Hat's Workflow documentation (Ch. 9), this job template runs standalone -- not wrapped in a workflow. Workflows provide approval nodes, failure paths, and conditional logic. For production executions, consider wrapping this template in a workflow." 
+ +### Signal 5: Previous Run Module Analysis + +If job history exists, examine events from the most recent run to identify playbook characteristics that affect check mode coverage: + +```json +MCP Server: aap-mcp-job-management +Tool: jobs_job_events_list +Parameters: { "id": "", "page_size": 50 } +``` + +Scan event task names and module types. If events show `ansible.builtin.shell` or `ansible.builtin.command` usage: + +**Agent Action**: Elevate the check mode warning from generic to specific: "This playbook uses shell/command modules (found in previous run events). Check mode will SKIP these tasks -- they represent [X] of [Y] total tasks and will NOT be validated in the dry run." + +### Adaptive Risk Enhancement + +After collecting all signals, the agent adjusts the base risk assessment: + +| Base Risk | Signals Found | Adjusted Risk | Rationale | +|---|---|---|---| +| CRITICAL | Recent failures + no notifications | CRITICAL (confirmed) | Maximum governance -- investigate failures first | +| HIGH | Never run + check mode not overridable | **CRITICAL** (elevated) | First run on template that can't enforce dry-run | +| MEDIUM | All clear + good history | MEDIUM (confirmed) | Standard governance appropriate | +| LOW | Recent failures | **MEDIUM** (elevated) | Dev environment but template is failing -- extra caution | +| Any | No notifications + production target | Risk + **advisory** | Flag missing notifications as a governance gap | + +**Transparency note**: Risk elevation based on operational signals is the agent's proactive contribution. Red Hat's documentation establishes the governance principles; the agent applies them dynamically based on what it discovers about the specific execution scenario. + +--- + +## Check Mode Execution + +### Red Hat Source + +> "Check mode is a way to run Ansible without making any changes on remote systems. Check mode can be useful for testing playbooks." 
+> +> -- *Ansible Playbook Guide: Check Mode* + +> "The `job_type` field on a job template supports `run` and `check` modes." +> +> -- *Red Hat AAP 2.5, Automation Controller User Guide, Ch. 9: Job Templates* + +### MCP Pattern: Launch in Check Mode + +```json +MCP Server: aap-mcp-job-management +Tool: job_templates_launch_create +Parameters: { + "id": "", + "requestBody": { + "job_type": "check", + "diff_mode": true + } +} +``` + +**Key parameters**: +- `job_type`: `"check"` -- runs playbook in check mode (no changes applied) +- `diff_mode`: `true` -- shows what would change (file diffs, package lists) + +### Check Mode Limitations + +Per Ansible documentation, check mode has important limitations the agent must be aware of: + +| Module Category | Check Mode Behavior | Agent Guidance | +|---|---|---| +| `ansible.builtin.dnf` / `ansible.builtin.yum` | **Contacts repos, resolves dependencies, reports what would change** | Reliable for package operations | +| `ansible.builtin.service` / `ansible.builtin.systemd` | Reports what state changes would occur | Reliable | +| `ansible.builtin.copy` / `ansible.builtin.template` | Reports file changes with diff | Reliable | +| `ansible.builtin.shell` / `ansible.builtin.command` | **Skipped entirely** -- check mode cannot predict command output | Warn the user: "Tasks using shell/command modules were skipped in check mode and were NOT validated" | +| `ansible.builtin.raw` | Skipped in check mode | Same warning as shell/command | + +### Interpreting Check Mode Results + +After the check mode job completes, retrieve its events: + +```json +MCP Server: aap-mcp-job-management +Tool: jobs_job_events_list +Parameters: { "id": "", "page_size": 100 } +``` + +And the host summary: + +```json +MCP Server: aap-mcp-job-management +Tool: jobs_job_host_summaries_list +Parameters: { "id": "" } +``` + +**Interpretation guide**: + +| Host Summary Field | Meaning | Action | +|---|---|---| +| `failures` > 0 | Tasks would fail if executed | 
**STOP** -- do not proceed to full run. Report failures. | +| `dark` > 0 | Hosts unreachable | **STOP** -- connectivity issue. Investigate before proceeding. | +| `changed` > 0, `failures` = 0 | Changes would be applied successfully | Safe to proceed with approval | +| `ok` > 0, `changed` = 0 | No changes needed (idempotent) | Report: "Playbook is already in desired state" | +| `skipped` > 0 | Tasks were skipped (conditions not met or check mode limitation) | Warn about check mode limitations for skipped tasks | + +### Pitfalls + +- **Don't trust check mode blindly**: Shell/command tasks are skipped. If the playbook relies heavily on shell commands, check mode provides incomplete coverage. Warn the user. +- **Don't skip check mode for CRITICAL risk**: Even if the user says "urgent," CRITICAL-risk executions should always get a check mode pass per governance policy. +- **Don't forget diff_mode**: Always set `diff_mode: true` when running check mode. Without it, you see pass/fail but not *what* would change. + +--- + +## Full Execution + +### MCP Pattern: Launch with Full Run + +After check mode passes and user approves: + +```json +MCP Server: aap-mcp-job-management +Tool: job_templates_launch_create +Parameters: { + "id": "", + "requestBody": { + "job_type": "run", + "extra_vars": { ... }, + "limit": "" + } +} +``` + +### Monitoring Job Progress + +Poll job status until completion: + +```json +MCP Server: aap-mcp-job-management +Tool: jobs_retrieve +Parameters: { "id": "" } +``` + +**Status values**: `pending`, `waiting`, `running`, `successful`, `failed`, `error`, `canceled` + +### Post-Execution Summary + +After completion, retrieve the changed-only summary: + +```json +MCP Server: aap-mcp-job-management +Tool: jobs_job_host_summaries_list +Parameters: { "id": "" } +``` + +Report only hosts with `changed > 0` or `failures > 0` to keep the summary actionable. 
+ +--- + +## Rollback Patterns + +### Red Hat Source + +> "You can relaunch a job with the same parameters, or relaunch on only failed hosts." +> +> -- *Red Hat AAP 2.5, Automation Controller User Guide, Ch. 9: Job Templates (Relaunch)* + +### MCP Pattern: Relaunch on Failed Hosts + +```json +MCP Server: aap-mcp-job-management +Tool: jobs_relaunch_create +Parameters: { + "id": "", + "requestBody": { + "hosts": "failed", + "credential_passwords": {} + } +} +``` + +### Rollback Strategies + +| Strategy | When to Use | MCP Pattern | +|---|---|---| +| **Relaunch on failed hosts** | Partial failure; retry the same playbook on hosts that failed | `jobs_relaunch_create` with `hosts: "failed"` | +| **Rollback playbook** | Full rollback needed; separate playbook that undoes changes | Launch a different job template (the rollback template) | +| **Revert to previous job** | Re-run the last successful job with same parameters | `jobs_relaunch_create` on the previous successful job ID | + +### Pitfalls + +- **Don't relaunch blindly**: If check mode caught a failure, relaunching the same playbook on failed hosts will produce the same failure. Fix the root cause first. +- **Don't assume idempotent rollback**: Not all playbooks have rollback versions. If no rollback template exists, manual intervention may be required. + +--- + +## Phased Rollout + +### Red Hat Source + +> "The `limit` field can be used to restrict the hosts that the job template runs against." +> +> -- *Red Hat AAP 2.5, Automation Controller User Guide, Ch. 9: Job Templates* + +> "Job slicing enables you to distribute work across multiple Ansible controller nodes." +> +> -- *Red Hat AAP 2.5, Automation Controller User Guide, Ch. 
9: Job Templates (Job Slicing)* + +### Phased Rollout Pattern + +For CRITICAL-risk executions targeting many hosts, roll out in phases: + +**Phase 1**: Canary -- single host or small group + +```json +MCP Server: aap-mcp-job-management +Tool: job_templates_launch_create +Parameters: { + "id": "<job_template_id>", + "requestBody": { + "job_type": "run", + "limit": "<canary_host>" + } +} +``` + +**Phase 2**: Verify canary success, then expand to roughly 25% of hosts. Ansible `limit` patterns have no percentage syntax, so compute the slice bounds from the inventory host count and pass an explicit group slice + +```json +Parameters: { + "id": "<job_template_id>", + "requestBody": { + "job_type": "run", + "limit": "<group>[0:<N>]" + } +} +``` + +**Phase 3**: Full rollout after canary + 25% pass + +```json +Parameters: { + "id": "<job_template_id>", + "requestBody": { + "job_type": "run" + } +} +``` + +### Health Gate Between Phases + +Between each phase, check the job result: + +```json +MCP Server: aap-mcp-job-management +Tool: jobs_job_host_summaries_list +Parameters: { "id": "<job_id>" } +``` + +If `failures > 0` or `dark > 0` (unreachable) on any host, **STOP the rollout** and report. Do not proceed to the next phase. + +### Pitfalls + +- **Don't skip the canary**: Even if the user says "execute on all," CRITICAL-risk executions should validate on a canary first. +- **Don't use limit patterns without verifying**: The Ansible `limit` syntax supports patterns (`host1,host2`, `group[0:5]`, `~regex`). Verify the pattern resolves to expected hosts before launching. + +--- + +## Governance Workflow Summary + +The complete governed execution workflow: + +``` +1. IDENTIFY the job template and target inventory +2. CLASSIFY inventory risk (CRITICAL / HIGH / MEDIUM / LOW) +3. SCAN extra_vars for plain-text secrets +4. IF CRITICAL/HIGH risk: + a. LAUNCH in check mode with diff_mode=true + b. INTERPRET check mode results + c. WARN about shell/command limitations + d. PRESENT findings and ASK for approval +5. IF approved: + a. IF CRITICAL risk: Execute phased rollout (canary → 25% → full) + b. IF HIGH risk: Execute full run + c. Between phases: Verify host summaries (health gate) +6. REPORT changed-only summary +7.
IF failure: Offer rollback options +``` + +--- + +## Cross-References + +- **[governance-readiness.md](governance-readiness.md)** -- Assess platform readiness before first production execution +- **[job-troubleshooting.md](job-troubleshooting.md)** -- If execution fails, use forensic troubleshooting to determine root cause +- **[error-classification.md](../references/error-classification.md)** -- Classify execution errors and determine resolution paths + +--- + +## Official Red Hat Sources + +1. Red Hat AAP 2.5, Automation Controller User Guide -- Job Templates (Ch. 9). https://docs.redhat.com/en/documentation/red_hat_ansible_automation_platform/2.5/html/automation_controller_user_guide/controller-job-templates. Accessed 2026-02-20. Content used under CC BY-SA 4.0. + +2. Red Hat AAP 2.5, Configuring Automation Execution -- Security Best Practices (Ch. 15). https://docs.redhat.com/en/documentation/red_hat_ansible_automation_platform/2.5/html/configuring_automation_execution/controller-security-best-practices. Accessed 2026-02-20. Content used under CC BY-SA 4.0. + +3. Red Hat AAP 2.5, Automation Controller User Guide -- Workflows (Ch. 9). https://docs.redhat.com/en/documentation/red_hat_ansible_automation_platform/2.5/html/using_automation_execution/controller-workflows. Accessed 2026-02-20. Content used under CC BY-SA 4.0. + +4. Red Hat AAP 2.5, Configuring Automation Execution -- Controller Best Practices. https://docs.redhat.com/en/documentation/red_hat_ansible_automation_platform/2.5/html/configuring_automation_execution/controller-tips-and-tricks. Accessed 2026-02-20. Content used under CC BY-SA 4.0. + +5. Ansible Playbook Guide -- Check Mode. https://docs.ansible.com/ansible/latest/playbook_guide/playbooks_checkmode.html. Accessed 2026-02-20. 
+ +--- + +## Quick Reference + +| Governance Control | When Applied | MCP Tool | Key Parameter | +|---|---|---|---| +| Risk Classification | All executions | `inventories_list` | Inventory name + host count | +| Secret Scanning | All executions | `job_templates_launch_retrieve` | extra_vars inspection | +| Check Mode | CRITICAL + HIGH risk | `job_templates_launch_create` | `job_type: "check"`, `diff_mode: true` | +| Approval Gate | CRITICAL + HIGH risk | N/A (human-in-the-loop) | User confirmation | +| Phased Rollout | CRITICAL risk | `job_templates_launch_create` | `limit` parameter per phase | +| Health Gate | Between phases | `jobs_job_host_summaries_list` | `failures` = 0 to proceed | +| Rollback | On failure | `jobs_relaunch_create` | `hosts: "failed"` | +| Changed-Only Summary | Post-execution | `jobs_job_host_summaries_list` | Filter `changed > 0` | diff --git a/submissions/rh-automation-governance-assessor/docs/aap/governance-readiness.md b/submissions/rh-automation-governance-assessor/docs/aap/governance-readiness.md new file mode 100644 index 0000000..3abd44f --- /dev/null +++ b/submissions/rh-automation-governance-assessor/docs/aap/governance-readiness.md @@ -0,0 +1,959 @@ +--- +title: AAP Governance Readiness Assessment +category: aap +sources: + - title: "Red Hat AAP 2.5 - Security Best Practices" + url: https://docs.redhat.com/en/documentation/red_hat_ansible_automation_platform/2.5/html/configuring_automation_execution/controller-security-best-practices + sections: "Ch. 15: Sec. 15.1.2 Minimize administrative accounts, Sec. 15.1.4 Remove user access to credentials, Sec. 15.1.5 Enforce separation of duties, Sec. 15.2.1 Use teams for role-based access, Sec. 15.2.2 External authentication" + date_accessed: 2026-02-20 + - title: "Red Hat AAP 2.5 - Workflows" + url: https://docs.redhat.com/en/documentation/red_hat_ansible_automation_platform/2.5/html/using_automation_execution/controller-workflows + sections: "Ch. 9: Workflow job templates, Sec. 
9.4 Workflow RBAC, approval nodes" + date_accessed: 2026-02-20 + - title: "Red Hat AAP 2.5 - Notifications" + url: https://docs.redhat.com/en/documentation/red_hat_ansible_automation_platform/2.5/html/using_automation_execution/controller-notifications + sections: "Ch. 25: Notification templates, inheritance hierarchy, notification types" + date_accessed: 2026-02-20 + - title: "Red Hat AAP 2.6 - Instance Groups" + url: https://docs.redhat.com/en/documentation/red_hat_ansible_automation_platform/2.6/html/using_automation_execution/controller-instance-groups + sections: "Ch. 17: Instance groups, policies, max_forks, resource isolation" + date_accessed: 2026-02-20 + - title: "Red Hat AAP 2.6 - Activity Stream" + url: https://docs.redhat.com/en/documentation/red_hat_ansible_automation_platform/2.6/html/using_automation_execution/assembly-controller-activity-stream + sections: "Activity stream audit logging, event filtering" + date_accessed: 2026-02-20 + - title: "Red Hat AAP 2.6 - Execution Environments" + url: https://docs.redhat.com/en/documentation/red_hat_ansible_automation_platform/2.6/html/creating_and_consuming_execution_environments + sections: "Custom EE creation, dependency pinning, ansible-builder" + date_accessed: 2026-02-20 + - title: "Red Hat AAP 2.6 - Hardening Guide" + url: https://docs.redhat.com/en/documentation/red_hat_ansible_automation_platform/2.6/html/hardening_and_compliance/index + sections: "Platform hardening, credential rotation, audit requirements" + date_accessed: 2026-02-20 + - title: "Red Hat AAP 2.6 - RBAC" + url: https://docs.redhat.com/en/documentation/red_hat_ansible_automation_platform/2.6/html/access_management_and_authentication/gw-managing-access + sections: "Ch.
4: Role-based access controls, role definitions, team assignments" + date_accessed: 2026-02-20 +tags: [governance, readiness, assessment, rbac, credentials, workflows, notifications, execution-environments, instance-groups, audit, compliance] +applies_to: [aap2.5, aap2.6] +semantic_keywords: + - "platform readiness assessment" + - "governance audit" + - "AAP best practices check" + - "RBAC compliance" + - "credential hygiene" + - "workflow governance" + - "notification coverage" + - "execution environment review" + - "workload isolation" + - "audit trail verification" + - "production readiness" +use_cases: + - "governance_readiness_assessment" + - "platform_audit" + - "pre_execution_check" +related_docs: + - "aap/execution-governance.md" + - "aap/job-troubleshooting.md" + - "references/error-classification.md" +last_updated: 2026-02-26 +--- + +# AAP Governance Readiness Assessment + +This document teaches the agent how to audit an Ansible Automation Platform environment against Red Hat's official best practices across 7 governance domains, using MCP tools from all 6 AAP MCP servers. For each domain, the agent learns what "good" looks like according to Red Hat, how to check it programmatically, and how to report findings with full source attribution. + +## Overview + +A governance readiness assessment answers the question: **"Is this AAP instance configured according to Red Hat's published best practices for production use?"** + +The assessment covers 7 domains. Each domain maps to specific Red Hat documentation with direct quotes where available, or feature descriptions where Red Hat documents capability without explicit recommendation. The agent's role is to check the current AAP configuration against these standards and report findings transparently. + +## When to Use This Document + +**Use when**: +- User asks to assess AAP governance readiness +- User asks "Is my AAP ready for production?" 
+- User asks to audit platform configuration +- Before a first production execution (part of governance-executor workflow) +- User asks "What should I fix before executing jobs?" + +**Do NOT use when**: +- User wants to execute a specific job template (use [execution-governance.md](execution-governance.md)) +- User wants to troubleshoot a failed job (use [job-troubleshooting.md](job-troubleshooting.md)) +- User only wants to validate MCP connectivity (use aap-mcp-validator skill directly) + +--- + +## Domain 1: Workflow Governance + +### Red Hat Source + +> "Workflows enable you to configure a sequence of disparate job templates (or workflow templates) and link them together in order to execute them as a single unit." +> +> -- *Red Hat AAP 2.5, Automation Controller User Guide, Ch. 9: Workflows* + +> "You must have execute access to a job template to add it to a workflow job template." +> +> -- *Red Hat AAP 2.5, Automation Controller User Guide, Ch. 9, Sec. 9.4: Workflow RBAC* + +### What This Means in Practice + +Workflows are Red Hat's prescribed mechanism for multi-step automation with error handling. A workflow can include approval nodes (human gates), failure paths (rollback), and conditional logic. Running standalone job templates for production changes -- without wrapping them in workflows -- bypasses these governance controls. + +### MCP Assessment Pattern + +**Step 1**: Query workflow job templates. + +```json +MCP Server: aap-mcp-job-management +Tool: workflow_job_templates_list +Parameters: { "page_size": 100 } +``` + +**Step 2**: Query standalone job templates. + +```json +MCP Server: aap-mcp-job-management +Tool: job_templates_list +Parameters: { "page_size": 100 } +``` + +**Step 3**: Compare counts. Calculate the workflow coverage ratio: `workflow_count / (workflow_count + standalone_template_count)`. 
+ +### Decision Table + +| Workflow Count | Standalone Count | Ratio | Status | Recommendation | +|---|---|---|---|---| +| > 0 | Any | > 50% | **PASS** | Workflows are in active use | +| > 0 | Many | < 50% | **WARN** | Most templates run outside workflow governance | +| 0 | Any | 0% | **GAP** | No workflows configured; production changes lack approval gates and failure paths | + +### Gap Remediation + +Workflows cannot be created via the current MCP tools (no `workflow_job_templates_create` endpoint). Report this as a manual action item: + +**Recommendation**: "Per Red Hat's Workflow documentation (Ch. 9), create workflow job templates that wrap your production job templates. Include approval nodes before critical steps and failure-path nodes for rollback." + +### Pitfalls + +- **Don't count workflow existence as governance**: A workflow with a single node and no approval gates provides no additional governance over a standalone template. +- **Don't ignore workflow RBAC**: Per Red Hat (Ch. 9, Sec. 9.4), users need execute access to add templates to workflows. Check that workflow execute permissions are restricted appropriately. + +--- + +## Domain 2: Notification Coverage + +### Red Hat Source + +> "You can set notifications on job start and job end, including job failure, for the following resources: job templates, workflow templates, organizations, and projects." +> +> -- *Red Hat AAP 2.5, Automation Controller User Guide, Ch. 25: Notifications* + +> "Notification templates can be inherited from an organization or project level, so you do not need to configure them on every job template individually." +> +> -- *Red Hat AAP 2.5, Automation Controller User Guide, Ch. 25, Sec. 25.1: Notification Hierarchy* + +### What This Means in Practice + +Notifications ensure that job outcomes -- especially failures -- are communicated to stakeholders. Without notification templates, a failed production job could go unnoticed until its impact is discovered manually. 
Red Hat supports Email, Grafana, IRC, Mattermost, PagerDuty, Rocket.Chat, Slack, Twilio, and Webhook notification types. + +### MCP Assessment Pattern + +**Step 1**: Query notification templates. + +```json +MCP Server: aap-mcp-configuration +Tool: notification_templates_list +Parameters: { "page_size": 100 } +``` + +**Step 2**: Examine results. Check for: +- At least one notification template exists +- Templates cover failure events (look for `notification_type` and associated job template/workflow bindings in the response) + +### Decision Table + +| Templates Found | Types | Status | Recommendation | +|---|---|---|---| +| > 0 | Includes failure notifications | **PASS** | Notification coverage is configured | +| > 0 | Only success notifications | **WARN** | No failure notifications; failed jobs may go unnoticed | +| 0 | N/A | **GAP** | No notification templates configured | + +### Gap Remediation + +Notification templates CAN be created via MCP: + +```json +MCP Server: aap-mcp-configuration +Tool: notification_templates_create +Parameters: { + "name": "Production Job Failures", + "notification_type": "email", + "organization": 1, + "notification_configuration": { + "recipients": ["ops-team@example.com"], + "sender": "aap-notifications@example.com", + "host": "smtp.example.com", + "port": 587, + "use_tls": true + } +} +``` + +**Recommendation**: "Per Red Hat's Notification documentation (Ch. 25), configure notification templates at the organization level for inheritance. At minimum, set up failure notifications for all production job templates." + +### Pitfalls + +- **Don't assume inheritance is configured**: A notification template existing doesn't mean it's attached to job templates. The template must be bound to specific resources or inherited via organization/project. +- **Don't skip failure notifications**: Per Red Hat (Ch. 25), notifications can be set on both start and end events. Production governance requires at minimum failure notifications.
+ +--- + +## Domain 3: Access Control (RBAC) + +### Red Hat Source + +> "Use teams inside of organizations to assign permissions to groups of users rather than to users individually." +> +> -- *Red Hat AAP 2.5, Configuring Automation Execution, Ch. 15, Sec. 15.2.1* + +> "Delegate the minimum level of privileges required to run automation." +> +> -- *Red Hat AAP 2.5, Configuring Automation Execution, Ch. 15, Sec. 15.2.1* + +> "Minimize administrative accounts...restrict to the minimum set of users." +> +> -- *Red Hat AAP 2.5, Configuring Automation Execution, Ch. 15, Sec. 15.1.2* + +### What This Means in Practice + +Red Hat's RBAC guidance has three pillars: (1) team-based assignment over individual assignment, (2) least privilege, and (3) minimal admin accounts. Violations include: users with direct individual role assignments instead of team-based ones, excessive superuser accounts, and overly broad role definitions. + +### MCP Assessment Pattern + +**Step 1**: Query all users. + +```json +MCP Server: aap-mcp-user-management +Tool: users_list +Parameters: { "page_size": 100 } +``` + +**Step 2**: Count superuser accounts. From results, count users where `is_superuser` is `true`. + +**Step 3**: Query all teams. + +```json +MCP Server: aap-mcp-user-management +Tool: teams_list +Parameters: { "page_size": 100 } +``` + +**Step 4**: Query individual user role assignments. + +```json +MCP Server: aap-mcp-user-management +Tool: role_user_assignments_list +Parameters: { "page_size": 100 } +``` + +**Step 5**: Query team-based role assignments. + +```json +MCP Server: aap-mcp-user-management +Tool: role_team_assignments_list +Parameters: { "page_size": 100 } +``` + +**Step 6**: Compare individual vs team assignments. Calculate team assignment ratio: `team_assignments / (team_assignments + user_assignments)`. 
+ +### Decision Table + +| Superusers | Teams | Assignment Ratio | Status | Finding | +|---|---|---|---|---| +| 1 | > 0 | Team-heavy (> 50%) | **PASS** | RBAC follows Red Hat best practices | +| 1 | > 0 | User-heavy (≤ 50%) | **WARN** | Teams exist but most permissions are assigned individually | +| 1 | 0 | 0% (all individual) | **GAP** | No teams configured; violates "Use teams...to assign permissions to groups" | +| 2-3 | Any | Any | **WARN** | Multiple superuser accounts; violates "Minimize administrative accounts" | +| > 3 | Any | Any | **GAP** | Excessive superuser accounts; critical violation of least privilege | + +### Gap Remediation + +Teams CAN be created via MCP, and role assignments can be managed: + +```json +MCP Server: aap-mcp-user-management +Tool: teams_create +Parameters: { + "name": "automation-operators", + "organization": 1, + "description": "Operators who execute production job templates" +} +``` + +Role assignments can be created to grant team-level permissions: + +```json +MCP Server: aap-mcp-user-management +Tool: role_team_assignments_create +Parameters: { + "team": 3, + "role_definition": 14, + "object_id": "1" +} +``` + +**Recommendation**: "Per Red Hat's Security Best Practices (Ch. 15, Sec. 15.2.1), migrate individual user role assignments to team-based assignments. Create teams that map to operational roles (e.g., automation-operators, automation-admins, automation-auditors) and assign permissions to teams." + +### Pitfalls + +- **Don't count the admin account as a violation**: Every AAP instance has at least one superuser. The concern is *additional* superuser accounts beyond the minimum required. +- **Don't ignore service accounts**: Users created for CI/CD or API access may legitimately need individual assignments, but they should still follow least privilege. + +--- + +## Domain 4: Credential Security + +### Red Hat Source + +> "Remove user access to credentials. 
Credentials can be configured at the organization, team, or user level. Red Hat recommends that credentials be defined at the organization or team level." +> +> -- *Red Hat AAP 2.5, Configuring Automation Execution, Ch. 15, Sec. 15.1.4* + +> "Enforce separation of duties. Different credentials (SSH keys, cloud tokens) should be used for different pieces of automation. Do not share one credential across all job templates." +> +> -- *Red Hat AAP 2.5, Configuring Automation Execution, Ch. 15, Sec. 15.1.5* + +### What This Means in Practice + +Red Hat's credential guidance requires: (1) credentials managed at org/team level, not individual user level, (2) separate credentials per automation context (not one "master key" for everything), and (3) credential types that enforce structure. A single "Machine" credential used across all job templates violates separation of duties. + +### MCP Assessment Pattern + +**Step 1**: Query all credentials. + +```json +MCP Server: aap-mcp-security-compliance +Tool: credentials_list +Parameters: { "page_size": 100 } +``` + +**Step 2**: Query credential types. + +```json +MCP Server: aap-mcp-security-compliance +Tool: credential_types_list +Parameters: { "page_size": 100 } +``` + +**Step 3**: Analyze credential diversity. 
From the credentials list, check: +- How many unique credential types are in use +- Whether multiple credentials of the same type exist (separation of duties) +- Whether credentials have descriptive names suggesting scoped usage (e.g., "prod-ssh-key" vs "my-key") + +### Decision Table + +| Credential Count | Unique Types | Separation | Status | Finding | +|---|---|---|---|---| +| > 1 | > 1 | Scoped names | **PASS** | Credentials follow separation of duties | +| > 1 | 1 | All same type | **WARN** | Single credential type; limited separation of duties | +| 1 | 1 | Single cred | **GAP** | One credential for all automation; violates "Enforce separation of duties" | +| 0 | 0 | None | **GAP** | No credentials configured | + +### Gap Remediation + +Credentials CAN be created via MCP: + +```json +MCP Server: aap-mcp-security-compliance +Tool: credentials_create +Parameters: { + "name": "prod-machine-credential", + "credential_type": 1, + "organization": 1, + "inputs": { + "username": "ansible-svc", + "ssh_key_data": "{{ lookup('file', '/path/to/key') }}" + } +} +``` + +**Recommendation**: "Per Red Hat's Security Best Practices (Ch. 15, Sec. 15.1.5), create separate credentials for each environment (dev, staging, production) and each automation context (machine access, cloud provider, source control). Use credential types to enforce input structure." + +### Pitfalls + +- **Don't expose credential values**: The MCP `credentials_list` tool returns credential metadata, not secrets. Never attempt to retrieve or display actual credential values. +- **Don't confuse credential count with security**: Having 10 credentials doesn't mean they're properly scoped. Check names and types for evidence of separation. + +--- + +## Domain 5: Execution Environments + +### Red Hat Source + +Red Hat provides Execution Environments (EEs) as containerized runtime environments for automation jobs. EEs replace the legacy `virtualenv` approach. 
+ +> "Execution environments are container images that serve as Ansible control nodes. They provide a defined, consistent, and portable environment for executing automation." +> +> -- *Red Hat AAP 2.6, Creating and Consuming Execution Environments* + +Red Hat ships a default `Minimal execution environment` and an `Ansible Automation Platform execution environment`. Custom EEs allow pinning specific Ansible collections, Python packages, and system packages. + +### What This Means in Practice + +Using only the default EE for all jobs means every playbook runs in the same environment, with no dependency isolation. Per Red Hat's EE documentation, custom EEs let teams pin specific collection versions and Python dependencies, preventing "works on my machine" failures and ensuring reproducible automation. + +**Framing**: Red Hat provides custom EE capability for dependency isolation and reproducibility. Assessment: Are you utilizing it? + +### MCP Assessment Pattern + +**Step 1**: Query execution environments. + +```json +MCP Server: aap-mcp-configuration +Tool: execution_environments_list +Parameters: { "page_size": 100 } +``` + +**Step 2**: Analyze results. 
Check: +- Total count of EEs +- Whether any custom EEs exist (beyond the default Red Hat-provided ones) +- Image names: `quay.io/ansible/` or `registry.redhat.io/` prefixes indicate vendor-provided; custom registries indicate custom EEs + +### Decision Table + +| Total EEs | Custom EEs | Status | Finding | +|---|---|---|---| +| > default count | > 0 | **PASS** | Custom execution environments in use | +| Default only | 0 | **WARN** | Only default EEs; all jobs share the same runtime environment | + +### Gap Remediation + +Custom EEs CAN be registered via MCP: + +```json +MCP Server: aap-mcp-configuration +Tool: execution_environments_create +Parameters: { + "name": "production-ee", + "image": "registry.example.com/aap/production-ee:1.0", + "organization": 1, + "description": "Production EE with pinned collection versions" +} +``` + +**Recommendation**: "Per Red Hat's EE documentation (AAP 2.6), create custom execution environments using `ansible-builder` to pin specific Ansible collections and Python dependencies. This ensures reproducible, isolated automation runs." + +### Pitfalls + +- **Don't flag default EEs as a failure**: Default EEs are legitimate for development and simple automation. Custom EEs are a maturity indicator, not a hard requirement. +- **Don't overlook image tags**: An EE referencing `latest` tag loses the reproducibility benefit of custom EEs. + +--- + +## Domain 6: Workload Isolation + +### Red Hat Source + +Red Hat provides Instance Groups for workload isolation and resource management. + +> "Instance groups can be used to assign jobs to run on specific sets of instances, providing workload isolation and resource management." +> +> -- *Red Hat AAP 2.5, Configuring Automation Execution, Ch. 17: Instance Groups* + +Instance groups support `max_forks` settings to limit concurrent automation load, and policy settings to control instance membership. 
+ +### What This Means in Practice + +Without instance groups (beyond the default), all jobs compete for the same execution capacity. Production and development workloads share resources, meaning a runaway dev job can starve production automation. Instance groups enable isolation: "production jobs run on production instances, dev jobs run on dev instances." + +**Framing**: Red Hat provides instance groups for workload isolation. Assessment: Are you utilizing them to separate production and non-production workloads? + +### MCP Assessment Pattern + +**Step 1**: Query instance groups. + +```json +MCP Server: aap-mcp-system-monitoring +Tool: instance_groups_list +Parameters: { "page_size": 100 } +``` + +**Step 2**: Analyze results. Check: +- Total count of instance groups +- Whether groups beyond the default exist +- Group names suggesting environment separation (e.g., "production", "development") + +### Decision Table + +| Instance Groups | Beyond Default | Status | Finding | +|---|---|---|---| +| > 1 | Yes | **PASS** | Workload isolation configured | +| 1 (default only) | No | **WARN** | Single instance group; all workloads share resources | + +### Gap Remediation + +Instance groups CAN be created via MCP: + +```json +MCP Server: aap-mcp-system-monitoring +Tool: instance_groups_create +Parameters: { + "name": "production", + "max_forks": 50 +} +``` + +**Recommendation**: "Per Red Hat's Instance Groups documentation (Ch. 17), create separate instance groups for production and non-production workloads. Configure `max_forks` to prevent resource contention." + +### Pitfalls + +- **Don't over-isolate**: Creating too many instance groups with too few instances each can lead to resource underutilization. +- **Don't forget the controlplane group**: The `controlplane` instance group is system-managed and should not be modified. + +--- + +## Domain 7: Audit Trail + +### Red Hat Source + +Red Hat provides the Activity Stream as the platform's built-in audit trail. 
+ +> "The Activity Stream shows all changes and events in the automation controller, including who made changes and when." +> +> -- *Red Hat AAP 2.5, Automation Controller User Guide: Activity Stream* + +The Activity Stream captures user logins, resource creation/modification/deletion, job launches, and configuration changes. + +**Framing**: Red Hat provides the Activity Stream for audit and compliance. Assessment: Is it accessible and generating entries? + +### MCP Assessment Pattern + +**Step 1**: Query the activity stream. + +```json +MCP Server: aap-mcp-system-monitoring +Tool: activity_stream_list +Parameters: { "page_size": 10 } +``` + +**Step 2**: Analyze results. Check: +- Whether entries are being generated (non-empty response) +- Recency of entries (is the platform actively logging?) +- Diversity of event types (logins, job launches, configuration changes) + +### Decision Table + +| Entries | Recency | Status | Finding | +|---|---|---|---| +| > 0 | Recent (within 24h) | **PASS** | Audit trail is active | +| > 0 | Stale (> 7 days) | **WARN** | Audit entries exist but platform appears inactive | +| 0 | N/A | **GAP** | No audit trail entries found | + +### Gap Remediation + +The Activity Stream is automatic and cannot be configured via MCP. If no entries are found, the platform may be newly installed or there may be a system issue. + +**Recommendation**: "Per Red Hat's Activity Stream documentation, verify that the platform is generating audit entries. For compliance, consider integrating the Activity Stream with external SIEM systems via webhooks or API polling." + +### Pitfalls + +- **Don't assume completeness**: The Activity Stream captures controller-level events, not playbook-level actions on managed hosts. +- **Don't ignore retention**: Activity Stream entries may have retention limits. For long-term audit, export to external systems. 
+ +--- + +## Bonus Domain: External Authentication + +### Red Hat Source + +> "You can simplify login for your automation controller users by connecting to external account sources by LDAP, SAML 2.0, and certain OAuth providers." +> +> -- *Red Hat AAP 2.5, Configuring Automation Execution, Ch. 15, Sec. 15.2.2* + +### What This Means in Practice + +Local-only authentication means user lifecycle management is manual and disconnected from enterprise identity. External authentication (LDAP, SAML, OAuth) integrates AAP with the organization's identity provider, enabling centralized user management, MFA enforcement, and automatic deprovisioning. + +### MCP Assessment Pattern + +**Step 1**: Query authenticators. + +```json +MCP Server: aap-mcp-user-management +Tool: authenticators_list +Parameters: { "page_size": 100 } +``` + +**Step 2**: Analyze results. Check: +- Whether any external authenticators are configured (LDAP, SAML, OIDC) +- Whether configured authenticators are enabled + +### Decision Table + +| Authenticators | Enabled | Status | Finding | +|---|---|---|---| +| > 0 (external type) | Yes | **PASS** | External authentication configured | +| > 0 (external type) | No | **WARN** | External authenticator exists but is disabled | +| 0 (or local only) | N/A | **WARN** | Local authentication only; no enterprise identity integration | + +### Pitfalls + +- **Don't flag local auth as a hard failure**: Small environments or labs may legitimately use local authentication. This is a maturity indicator. +- **Don't expose authenticator configuration details**: The response may contain sensitive LDAP/SAML configuration. Report presence/absence only. 
+ +--- + +## Output Template: Governance Readiness Report + +The agent MUST produce output in this format so the audience can see Red Hat documentation citations for every finding: + +``` +## AAP Governance Readiness Report + +**Assessment Date**: [date] +**AAP Instance**: [server URL] +**Domains Assessed**: 7 + 1 bonus + +--- + +### Domain 1: Workflow Governance — [PASS/GAP/WARN] + +Per Red Hat's *Automation Controller User Guide* (Ch. 9: Workflows): +> "Workflows enable you to configure a sequence of disparate job templates and link them together." + +**Finding**: Found [X] workflow job templates and [Y] standalone job templates. Workflow coverage ratio: [Z]%. +**Recommendation**: [action with source citation] + +--- + +### Domain 2: Notification Coverage — [PASS/GAP/WARN] + +Per Red Hat's *Automation Controller User Guide* (Ch. 25: Notifications): +> "You can set notifications on job start and job end, including job failure." + +**Finding**: Found [X] notification templates. [Types configured]. +**Recommendation**: [action with source citation] + +--- + +### Domain 3: Access Control (RBAC) — [PASS/GAP/WARN] + +Per Red Hat's *Security Best Practices* (Ch. 15, Sec. 15.2.1): +> "Use teams inside of organizations to assign permissions to groups of users rather than to users individually." + +Per Red Hat's *Security Best Practices* (Ch. 15, Sec. 15.1.2): +> "Minimize administrative accounts...restrict to the minimum set of users." + +**Finding**: [X] users ([Y] superusers), [Z] teams, [A] individual assignments, [B] team assignments. +**Recommendation**: [action with source citation] + +--- + +### Domain 4: Credential Security — [PASS/GAP/WARN] + +Per Red Hat's *Security Best Practices* (Ch. 15, Sec. 15.1.4): +> "Remove user access to credentials...credentials should be defined at the organization or team level." + +Per Red Hat's *Security Best Practices* (Ch. 15, Sec. 15.1.5): +> "Enforce separation of duties...different credentials for each piece of automation." 
+ +**Finding**: [X] credentials, [Y] credential types, [separation assessment]. +**Recommendation**: [action with source citation] + +--- + +### Domain 5: Execution Environments — [PASS/GAP/WARN] + +Per Red Hat's *Creating and Consuming Execution Environments* (AAP 2.6): +> "Execution environments are container images that serve as Ansible control nodes." + +**Finding**: [X] execution environments ([Y] custom, [Z] default). +**Recommendation**: [action with source citation] + +--- + +### Domain 6: Workload Isolation — [PASS/GAP/WARN] + +Per Red Hat's *Configuring Automation Execution* (Ch. 17: Instance Groups): +> "Instance groups can be used to assign jobs to run on specific sets of instances." + +**Finding**: [X] instance groups. [Beyond default assessment]. +**Recommendation**: [action with source citation] + +--- + +### Domain 7: Audit Trail — [PASS/GAP/WARN] + +Per Red Hat's *Activity Stream* documentation: +> "The Activity Stream shows all changes and events in the automation controller." + +**Finding**: [X] activity stream entries. Most recent: [date]. +**Recommendation**: [action with source citation] + +--- + +### Bonus: External Authentication — [PASS/WARN] + +Per Red Hat's *Security Best Practices* (Ch. 15, Sec. 15.2.2): +> "Connecting to external account sources by LDAP, SAML 2.0, and certain OAuth providers." + +**Finding**: [X] external authenticators configured. [Enabled status]. 
+**Recommendation**: [action with source citation] + +--- + +### Summary + +| Domain | Status | Key Finding | +|---|---|---| +| Workflow Governance | [status] | [one-line finding] | +| Notification Coverage | [status] | [one-line finding] | +| Access Control (RBAC) | [status] | [one-line finding] | +| Credential Security | [status] | [one-line finding] | +| Execution Environments | [status] | [one-line finding] | +| Workload Isolation | [status] | [one-line finding] | +| Audit Trail | [status] | [one-line finding] | +| External Authentication | [status] | [one-line finding] | + +**Overall**: [X] PASS, [Y] WARN, [Z] GAP out of 8 domains assessed. +``` + +--- + +## Cross-Domain Correlation + +Individual domain assessments reveal single-dimension findings. Cross-domain correlation reveals **compound risks** where gaps in one domain amplify weaknesses in another. The agent MUST perform this analysis after completing all domain assessments. + +### Why Correlation Matters + +Red Hat's Security Best Practices (Ch. 15) are interconnected: RBAC enables team-based credential management, workflows enable approval gates, notifications ensure visibility. When multiple domains have gaps, the compound effect is worse than the sum of individual gaps. + +### Correlation Patterns + +After assessing all domains, check for these compound findings: + +#### Pattern 1: RBAC Gap + Credential Risk + +**Trigger**: Domain 3 (RBAC) is GAP or WARN *and* Domain 4 (Credentials) has any credentials. + +**Compound Finding**: Per Red Hat's Ch. 15, Sec. 15.1.4: "Credentials can be configured at the organization, team, or user level." Without teams or team-based assignments (Domain 3), credentials cannot be scoped to teams, so access tends to be granted to individual users -- contrary to Red Hat's recommendation that credentials be defined at the organization or team level. + +**Elevated Recommendation**: "Fix RBAC first -- creating teams unlocks team-scoped credential management, which addresses both domains simultaneously." 
+ +#### Pattern 2: No Workflows + No Notifications + +**Trigger**: Domain 1 (Workflows) is GAP *and* Domain 2 (Notifications) is GAP. + +**Compound Finding**: Jobs run as standalone templates (no approval gates, no failure paths) with no failure alerting. Per Ch. 9 and Ch. 25, this means a failed production job has no governance controls AND no visibility. + +**Elevated Recommendation**: "This is the highest-risk combination. Production failures will go unnoticed until manually discovered. Address both domains urgently." + +#### Pattern 3: Single Instance Group + Production/Dev Inventories + +**Trigger**: Domain 6 (Workload Isolation) is WARN *and* inventory data shows both production-pattern and dev-pattern inventories exist. + +**Additional MCP Query** (adaptive -- only when triggered): + +```json +MCP Server: aap-mcp-inventory-management +Tool: inventories_list +Parameters: { "page_size": 100 } +``` + +If inventories matching both `prod`/`production` and `dev`/`development` patterns exist: + +**Compound Finding**: Per Ch. 17, instance groups provide workload isolation. With a single group, a runaway development job can starve production automation capacity. + +**Elevated Recommendation**: "Create separate instance groups for production and non-production workloads to prevent resource contention." + +#### Pattern 4: Multiple Superusers + No External Auth + +**Trigger**: Domain 3 (RBAC) shows > 1 superuser *and* Bonus Domain (External Auth) is WARN (local only). + +**Compound Finding**: Per Ch. 15, Sec. 15.1.2 and Sec. 15.2.2, superuser accounts with local-only authentication lack MFA and centralized lifecycle management. Password compromise has maximum blast radius. + +**Elevated Recommendation**: "Configure external authentication (LDAP/SAML/OIDC) to enforce MFA on superuser accounts. This addresses both RBAC and authentication gaps." 
+ +### Correlation Output Template + +After the domain-by-domain report, include compound findings: + +``` +### Compound Risk Analysis + +[Only include this section if correlation patterns matched] + +⚠️ **[Pattern Name]**: +- Domains involved: [Domain X] ([status]) + [Domain Y] ([status]) +- Per Red Hat's [source]: "[relevant quote]" +- Combined impact: [what the compound risk means] +- Priority action: [what to fix first and why] +``` + +--- + +## Adaptive Depth Queries + +When a domain assessment reveals specific conditions, the agent SHOULD perform follow-up queries to deepen the finding rather than stopping at the surface-level check. This is how the assessment adapts to what it discovers. + +### Notification Depth: Check Actual Bindings + +**Trigger**: Domain 2 reports PASS (notification templates exist). + +**Rationale**: Templates existing doesn't mean they're attached to anything. + +**Follow-up Query**: + +```json +MCP Server: aap-mcp-job-management +Tool: job_templates_list +Parameters: { "page_size": 100 } +``` + +Examine each job template's response for notification association fields (`related.notification_templates_started`, `related.notification_templates_success`, `related.notification_templates_error`). If ALL job templates show empty notification bindings: + +**Revised Finding**: Downgrade Domain 2 to **WARN**: "Notification templates exist but are not bound to any job templates. Per Ch. 25: 'You can set notifications on job start and job end, including job failure' -- but only if templates are attached to resources." + +### Credential Depth: Check Separation of Duties + +**Trigger**: Domain 4 has multiple credentials (potential PASS). + +**Rationale**: Multiple credentials don't guarantee they're properly scoped. + +**Follow-up Query**: + +```json +MCP Server: aap-mcp-job-management +Tool: job_templates_list +Parameters: { "page_size": 100 } +``` + +Compare each job template's `credential` or `credentials` field. 
If one credential ID appears across templates targeting different inventories (e.g., both dev and prod): + +**Revised Finding**: Downgrade Domain 4 to **WARN**: "Credential '[name]' (ID: [id]) is shared across both development and production job templates. Per Ch. 15, Sec. 15.1.5: 'Enforce separation of duties...different credentials for different pieces of automation.'" + +### RBAC Depth: Check Role Breadth + +**Trigger**: Domain 3 reports PASS (teams exist, team-based assignments). + +**Rationale**: Team-based assignments don't guarantee least privilege. + +**Follow-up Query**: + +```json +MCP Server: aap-mcp-user-management +Tool: role_team_assignments_list +Parameters: { "page_size": 100 } +``` + +Cross-reference with role definitions. If any team has Admin-level access on organization-wide scope: + +**Revised Finding**: Downgrade Domain 3 to **WARN**: "Teams exist, but team '[name]' has Admin-level access across the organization. Per Ch. 15, Sec. 15.2.1: 'Delegate the minimum level of privileges required to run automation.'" + +### Scale Calibration + +**Trigger**: Always (after all domain assessments). + +**Rationale**: A 2-host lab and a 200-host enterprise have different severity thresholds. + +**Follow-up Queries**: + +```json +MCP Server: aap-mcp-inventory-management +Tool: inventories_list +Parameters: { "page_size": 100 } +``` + +```json +MCP Server: aap-mcp-inventory-management +Tool: hosts_list +Parameters: { "page_size": 1 } +``` + +Use total host count and inventory naming patterns to calibrate severity framing: + +| Scale Signal | Calibration | +|---|---| +| < 5 hosts, dev/lab inventory only | "Small lab/development environment. Governance gaps noted but severity calibrated to environment scale." | +| 5-50 hosts, mixed inventories | Standard severity | +| > 50 hosts or production-pattern inventories | "Enterprise environment with production workloads. Governance gaps carry elevated risk." 
| + +--- + +## Prioritized Remediation Ordering + +Instead of listing all gaps equally, order remediation by dependency chain. Some fixes unlock others: + +| Priority | Domain | Rationale | +|---|---|---| +| 1 | RBAC (Domain 3) | Prerequisite for team-scoped credentials and role-based access | +| 2 | Credential Security (Domain 4) | Depends on teams existing for proper scoping | +| 3 | Workflow Governance (Domain 1) | Enables approval gates and failure paths | +| 4 | Notification Coverage (Domain 2) | Most effective when attached to workflows/templates | +| 5 | Execution Environments (Domain 5) | Independent -- can fix in parallel | +| 6 | Workload Isolation (Domain 6) | Independent -- can fix in parallel | +| 7 | External Authentication (Bonus) | Independent but high-impact for security posture | +| 8 | Audit Trail (Domain 7) | Automatic -- no action unless missing | + +**Output Template**: + +After the domain-by-domain report, include: + +``` +### Recommended Fix Order + +Based on dependency analysis, address gaps in this order: + +1. **[First unfixed domain]** — [why this must be fixed first] + → Unlocks: [what fixing this enables] +2. **[Second unfixed domain]** — [why this comes next] + → Depends on: [prerequisite from step 1] +3. ... + +Domains that can be addressed in parallel: [list independent domains] +``` + +--- + +## Cross-References + +- **[execution-governance.md](execution-governance.md)** -- After assessing readiness, use this document for governed execution with risk classification and check mode +- **[job-troubleshooting.md](job-troubleshooting.md)** -- If jobs fail during execution, use this document for forensic troubleshooting with event parsing and host correlation +- **[error-classification.md](../references/error-classification.md)** -- Reference for systematic error classification and resolution path determination + +--- + +## Official Red Hat Sources + +1. Red Hat AAP 2.5, Configuring Automation Execution -- Security Best Practices (Ch. 
15). https://docs.redhat.com/en/documentation/red_hat_ansible_automation_platform/2.5/html/configuring_automation_execution/controller-security-best-practices. Accessed 2026-02-20. Content used under CC BY-SA 4.0. + +2. Red Hat AAP 2.5, Automation Controller User Guide -- Workflows (Ch. 9). https://docs.redhat.com/en/documentation/red_hat_ansible_automation_platform/2.5/html/using_automation_execution/controller-workflows. Accessed 2026-02-20. Content used under CC BY-SA 4.0. + +3. Red Hat AAP 2.5, Automation Controller User Guide -- Notifications (Ch. 25). https://docs.redhat.com/en/documentation/red_hat_ansible_automation_platform/2.5/html/using_automation_execution/controller-notifications. Accessed 2026-02-20. Content used under CC BY-SA 4.0. + +4. Red Hat AAP 2.5, Automation Controller User Guide -- RBAC (Ch. 4). https://docs.redhat.com/en/documentation/red_hat_ansible_automation_platform/2.6/html/access_management_and_authentication/gw-managing-access. Accessed 2026-02-20. Content used under CC BY-SA 4.0. + +5. Red Hat AAP 2.5, Configuring Automation Execution -- Instance Groups (Ch. 17). https://docs.redhat.com/en/documentation/red_hat_ansible_automation_platform/2.6/html/using_automation_execution/controller-instance-groups. Accessed 2026-02-20. Content used under CC BY-SA 4.0. + +6. Red Hat AAP 2.5, Automation Controller User Guide -- Activity Stream. https://docs.redhat.com/en/documentation/red_hat_ansible_automation_platform/2.6/html/using_automation_execution/assembly-controller-activity-stream. Accessed 2026-02-20. Content used under CC BY-SA 4.0. + +7. Red Hat AAP 2.6, Creating and Consuming Execution Environments. https://docs.redhat.com/en/documentation/red_hat_ansible_automation_platform/2.6/html/creating_and_consuming_execution_environments. Accessed 2026-02-20. Content used under CC BY-SA 4.0. + +8. Red Hat AAP 2.6, Hardening Guide. 
https://docs.redhat.com/en/documentation/red_hat_ansible_automation_platform/2.6/html/hardening_and_compliance/index. Accessed 2026-02-20. Content used under CC BY-SA 4.0. + +--- + +## Quick Reference + +| Domain | Red Hat Source | MCP Server | Key Tool | Status Thresholds | +|---|---|---|---|---| +| Workflow Governance | Ch. 9 Workflows | job-management | `workflow_job_templates_list` | PASS: >50% coverage, GAP: 0 workflows | +| Notification Coverage | Ch. 25 Notifications | configuration | `notification_templates_list` | PASS: failure notifs exist, GAP: 0 templates | +| Access Control (RBAC) | Ch. 15 Sec. 15.2.1 | user-management | `users_list`, `teams_list`, `role_user_assignments_list`, `role_team_assignments_list` | PASS: teams + <2 superusers, GAP: 0 teams | +| Credential Security | Ch. 15 Sec. 15.1.4-5 | security-compliance | `credentials_list`, `credential_types_list` | PASS: multiple scoped creds, GAP: 1 cred | +| Execution Environments | AAP 2.6 EE Guide | configuration | `execution_environments_list` | PASS: custom EEs, WARN: default only | +| Workload Isolation | Ch. 17 Instance Groups | system-monitoring | `instance_groups_list` | PASS: >1 group, WARN: default only | +| Audit Trail | Activity Stream docs | system-monitoring | `activity_stream_list` | PASS: recent entries, GAP: 0 entries | +| External Auth (bonus) | Ch. 15 Sec. 
15.2.2 | user-management | `authenticators_list` | PASS: external enabled, WARN: local only | diff --git a/submissions/rh-automation-governance-assessor/docs/aap/job-troubleshooting.md b/submissions/rh-automation-governance-assessor/docs/aap/job-troubleshooting.md new file mode 100644 index 0000000..3ce7636 --- /dev/null +++ b/submissions/rh-automation-governance-assessor/docs/aap/job-troubleshooting.md @@ -0,0 +1,396 @@ +--- +title: Job Troubleshooting +category: aap +sources: + - title: "Red Hat AAP 2.6 - Troubleshooting Guide (Jobs)" + url: https://docs.redhat.com/en/documentation/red_hat_ansible_automation_platform/2.6/html/troubleshooting_ansible_automation_platform/troubleshoot-jobs + sections: "Job failure analysis, common job errors, event interpretation" + date_accessed: 2026-02-20 + - title: "Red Hat AAP 2.5 - Job Events" + url: https://docs.redhat.com/en/documentation/red_hat_ansible_automation_platform/2.5/html/automation_controller_user_guide/controller-job-templates + sections: "Job events, host summaries, job stdout" + date_accessed: 2026-02-20 + - title: "Red Hat AAP 2.5 - Configuring Automation Execution" + url: https://docs.redhat.com/en/documentation/red_hat_ansible_automation_platform/2.5/html/configuring_automation_execution + sections: "Instance capacity, job scheduling, execution environment troubleshooting" + date_accessed: 2026-02-20 +tags: [troubleshooting, job-failure, events, host-facts, correlation, forensic-analysis, error-patterns] +applies_to: [aap2.5, aap2.6] +semantic_keywords: + - "job failed" + - "why did the job fail" + - "analyze failure" + - "job events" + - "host unreachable" + - "module failure" + - "error analysis" + - "root cause" + - "failure correlation" +use_cases: + - "job_failure_analysis" + - "forensic_troubleshooting" + - "host_correlation" +related_docs: + - "aap/execution-governance.md" + - "aap/governance-readiness.md" + - "references/error-classification.md" +last_updated: 2026-02-26 +--- + +# Job 
Troubleshooting + +This document teaches the agent how to perform forensic analysis of failed AAP jobs. It covers event extraction and parsing, failure pattern recognition, host fact correlation, and root cause determination. Every analysis technique maps to MCP tools with exact parameters and is backed by Red Hat's official troubleshooting guidance. + +## Overview + +When an AAP job fails, the raw information exists in three places: **job events** (what happened, step by step), **host summaries** (which hosts failed), and **host facts** (system state of failed hosts). The agent's value is correlating these three data sources to determine root cause and classify the failure type. + +## When to Use This Document + +**Use when**: +- User reports a failed job: "Job #X failed" +- User asks why an execution failed +- User asks to analyze job errors +- As part of the forensic-troubleshooter agent workflow + +**Do NOT use when**: +- User wants to execute a job (use [execution-governance.md](execution-governance.md)) +- User wants to assess platform readiness (use [governance-readiness.md](governance-readiness.md)) +- User needs error classification taxonomy (use [error-classification.md](../references/error-classification.md) as companion reference) + +--- + +## Step 1: Job Status Retrieval + +### Red Hat Source + +> "The job detail page shows the status, timing, and results of a job execution." +> +> -- *Red Hat AAP 2.5, Automation Controller User Guide, Ch. 
9* + +### MCP Pattern + +```json +MCP Server: aap-mcp-job-management +Tool: jobs_retrieve +Parameters: { "id": "" } +``` + +**Key fields to extract**: +- `status`: `"failed"`, `"error"`, `"canceled"` -- determines analysis path +- `failed`: boolean -- confirms failure +- `job_type`: `"run"` or `"check"` -- check mode failures need different interpretation +- `elapsed`: execution time in seconds +- `launch_type`: `"manual"`, `"schedule"`, `"workflow"` -- context for how the job was triggered + +### Status Interpretation + +| Status | Meaning | Analysis Path | +|---|---|---| +| `failed` | Playbook execution completed but one or more tasks failed | Analyze job events for `runner_on_failed` events | +| `error` | Platform error prevented job execution | Check instance capacity, EE availability, credential validity | +| `canceled` | Job was canceled by user or timeout | Check if timeout was configured; may indicate hung task | + +--- + +## Step 2: Failure Event Extraction + +### Red Hat Source + +> "Troubleshooting a failed job in automation controller involves examining the job's event stream to identify which task failed and on which host." +> +> -- *Red Hat AAP 2.6, Troubleshooting Ansible Automation Platform, Troubleshoot Jobs* + +### MCP Pattern + +```json +MCP Server: aap-mcp-job-management +Tool: jobs_job_events_list +Parameters: { + "id": "", + "page_size": 100 +} +``` + +### Event Filtering Strategy + +Job events contain the full execution trace. 
For failure analysis, focus on these event types: + +| Event Field (`event`) | Meaning | Priority | +|---|---|---| +| `runner_on_failed` | A task failed on a host | **PRIMARY** -- the actual failure | +| `runner_on_unreachable` | A host was unreachable | **PRIMARY** -- connectivity failure | +| `runner_on_skipped` | A task was skipped | SECONDARY -- may indicate conditional logic bypass | +| `playbook_on_stats` | Final play recap | SUMMARY -- aggregate success/failure counts | +| `runner_on_ok` | A task succeeded | CONTEXT -- useful for timeline reconstruction | + +### Extracting Failure Details + +From each `runner_on_failed` or `runner_on_unreachable` event, extract: + +- `host`: Which host failed +- `task`: Which task failed +- `event_data.res.msg`: The error message +- `event_data.res.module_name` or `event_data.task_action`: Which Ansible module was involved +- `event_data.res.rc`: Return code (for command/shell modules) +- `counter`: Event sequence number (for timeline) + +### Failure Timeline Reconstruction + +Sort events by `counter` to reconstruct the failure sequence: + +1. Identify the FIRST failure event (lowest counter among `runner_on_failed`/`runner_on_unreachable`) +2. Check if subsequent tasks were affected (cascade failures) +3. 
Note the task name and module of the first failure -- this is usually the root cause + +--- + +## Step 3: Host Summary Analysis + +### MCP Pattern + +```json +MCP Server: aap-mcp-job-management +Tool: jobs_job_host_summaries_list +Parameters: { "id": "" } +``` + +### Interpreting Host Summaries + +| Field | Meaning | Forensic Value | +|---|---|---| +| `ok` | Tasks that succeeded | Baseline -- how far the playbook got before failure | +| `changed` | Tasks that made changes | Shows what was already modified before failure (important for rollback) | +| `failures` | Tasks that failed | Core failure count | +| `dark` | Host unreachable events | Indicates connectivity/platform issue, not code issue | +| `skipped` | Tasks skipped | May indicate handler or conditional logic issues | +| `processed` | Total tasks processed | Indicates whether failure was early or late in execution | + +### Correlation Pattern + +| dark > 0 | failures > 0 | Classification | +|---|---|---| +| Yes | No | **Platform issue**: Host connectivity problem | +| No | Yes | **Code/Config issue**: Playbook task failure | +| Yes | Yes | **Mixed**: Some hosts unreachable, others had task failures | +| No | No | Investigate: job may have `error` status (platform-level failure) | + +--- + +## Step 4: Host Fact Correlation + +### Red Hat Source + +> "Host facts gathered by Ansible (via setup/gather_facts) can provide system state information useful for troubleshooting failures." 
+> +> -- *Red Hat AAP 2.5, Automation Controller User Guide* + +### MCP Pattern + +First, retrieve host details to get the host ID: + +```json +MCP Server: aap-mcp-inventory-management +Tool: hosts_list +Parameters: { "search": "" } +``` + +Then retrieve host variables (which may include cached facts): + +```json +MCP Server: aap-mcp-inventory-management +Tool: hosts_variable_data_retrieve +Parameters: { "id": "", "format": "json" } +``` + +### Host Fact Correlation Table + +When a failure is identified, correlate the error with host facts to determine if the host's system state contributed: + +| Error Pattern | Host Fact to Check | Likely Cause | +|---|---|---| +| "No space left on device" | `ansible_mounts[].size_available` | Disk full on target host | +| "Unable to start service" | `ansible_service_mgr` | Service manager mismatch (systemd vs sysvinit) | +| Package not found / install failure | `ansible_distribution`, `ansible_distribution_version` | Wrong OS version; package not in repos | +| "Permission denied" | `ansible_user_id`, `ansible_become` | Privilege escalation not configured | +| "Connection timed out" | `ansible_default_ipv4` | Network configuration issue | +| "No matching host" / "Name resolution failure" | `ansible_fqdn`, `ansible_hostname` | DNS resolution problem | +| "Module not found" | Host's Python version (`ansible_python_version`) | Missing Python dependency on host | +| Out of memory / OOM killed | `ansible_memtotal_mb`, `ansible_memfree_mb` | Insufficient memory | + +### Pitfalls + +- **Don't assume host facts are current**: Cached facts may be stale. If facts were gathered during a previous job run, they reflect the state at that time, not necessarily now. +- **Don't skip host correlation for "obvious" errors**: Even a simple "package not found" may have a root cause in the host's OS version or repo configuration. 
+ +--- + +## Step 5: Job Stdout (Supplementary) + +For detailed output when event data is insufficient: + +### MCP Pattern + +```json +MCP Server: aap-mcp-job-management +Tool: jobs_stdout_retrieve +Parameters: { "id": "", "format": "txt" } +``` + +**When to use**: When event data doesn't contain enough detail (e.g., `runner_on_failed` without a clear `msg`). The stdout contains the full Ansible output including verbose error messages. + +**Supported formats**: `ansi` (colored), `txt` (plain text), `json` (structured), `html` (rendered) + +--- + +## Failure Patterns Reference + +### Pattern 1: Host Unreachable + +**Red Hat Source**: *AAP 2.6 Troubleshooting Guide* -- "If the job shows hosts as 'dark' (unreachable), verify network connectivity and SSH configuration." + +**Event signature**: `event: "runner_on_unreachable"` + +**Common causes**: +- SSH port blocked by firewall +- Host is down or rebooting +- DNS resolution failure +- SSH key mismatch (credential issue) + +**Correlation**: Check `ansible_default_ipv4` from host facts to verify network configuration. + +### Pattern 2: Module Failure (Package Operations) + +**Event signature**: `event: "runner_on_failed"`, `task_action: "ansible.builtin.dnf"` or `"ansible.builtin.yum"` + +**Common causes**: +- Package not available in configured repos +- Repository connectivity issue +- Dependency conflict +- Disk space insufficient for package + +**Correlation**: Check `ansible_distribution` and `ansible_distribution_version` to verify OS compatibility. + +### Pattern 3: Privilege Escalation Timeout + +**Red Hat Source**: *AAP 2.6 Troubleshooting Guide* -- "Privilege escalation timeouts can occur when sudo requires a password or when the become method is misconfigured."
+ +**Event signature**: `event: "runner_on_failed"`, `msg` contains "Timeout" and "privilege escalation" + +**Common causes**: +- `become: true` without passwordless sudo configured +- sudo requiring TTY (`requiretty` in sudoers) +- Become method mismatch (sudo vs su vs pbrun) + +**Correlation**: Check `ansible_become` and `ansible_user_id` from host facts. + +### Pattern 4: Service Start Failure + +**Event signature**: `event: "runner_on_failed"`, `task_action: "ansible.builtin.service"` or `"ansible.builtin.systemd"` + +**Common causes**: +- Service configuration error (bad config file) +- Port already in use +- Dependency service not running +- SELinux context preventing service start + +**Correlation**: Check `ansible_service_mgr` to verify service manager compatibility. + +### Pattern 5: Template Rendering Error + +**Event signature**: `event: "runner_on_failed"`, `msg` contains "AnsibleUndefinedVariable" or "template error" + +**Common causes**: +- Variable not defined in inventory or extra_vars +- Jinja2 syntax error in template +- Variable scope issue (host vs group vs play) + +**Correlation**: Check `hosts_variable_data_retrieve` for the host to see what variables are defined. + +### Pattern 6: Execution Environment Issue + +**Event signature**: Job `status: "error"` (not `"failed"` -- platform-level), error mentions EE or container + +**Common causes**: +- EE image not accessible (registry auth failure) +- EE missing required collection +- EE Python version incompatible with playbook + +**Correlation**: Check `execution_environments_list` from configuration MCP server. + +--- + +## Root Cause Classification Output Template + +The agent MUST produce a structured root cause analysis: + +``` +## Job Failure Analysis: Job #[job_id] + +**Job Status**: [status] +**Elapsed Time**: [elapsed]s +**Launch Type**: [launch_type] + +### Failure Timeline + +1. [timestamp/counter] - Task "[task_name]" on host "[hostname]": [event_type] + Error: "[error_message]" +2. 
[subsequent events if cascade] + +### Host Summary + +| Host | OK | Changed | Failed | Unreachable | +|---|---|---|---|---| +| [host1] | [ok] | [changed] | [failures] | [dark] | + +### Root Cause Classification + +**Classification**: [Platform / Code / Configuration] Issue +**Evidence**: Per Red Hat's Troubleshooting Guide: "[relevant guidance]" + +**Error Pattern**: [Pattern name from Failure Patterns Reference] +**Root Cause**: [Specific determination] + +### Host Fact Correlation + +Consulted host facts for [hostname]: +- [relevant fact]: [value] → [correlation finding] + +### Recommended Resolution + +Per [Red Hat source]: [resolution recommendation] + +See [error-classification.md](../references/error-classification.md) for detailed resolution paths. +``` + +--- + +## Cross-References + +- **[execution-governance.md](execution-governance.md)** -- For rollback options after determining root cause +- **[governance-readiness.md](governance-readiness.md)** -- Platform configuration issues may indicate governance gaps +- **[error-classification.md](../references/error-classification.md)** -- Detailed error taxonomy and resolution path mapping + +--- + +## Official Red Hat Sources + +1. Red Hat AAP 2.6, Troubleshooting Ansible Automation Platform -- Troubleshoot Jobs. https://docs.redhat.com/en/documentation/red_hat_ansible_automation_platform/2.6/html/troubleshooting_ansible_automation_platform/troubleshoot-jobs. Accessed 2026-02-20. Content used under CC BY-SA 4.0. + +2. Red Hat AAP 2.5, Automation Controller User Guide -- Job Templates (Ch. 9). https://docs.redhat.com/en/documentation/red_hat_ansible_automation_platform/2.5/html/automation_controller_user_guide/controller-job-templates. Accessed 2026-02-20. Content used under CC BY-SA 4.0. + +3. Red Hat AAP 2.5, Configuring Automation Execution. https://docs.redhat.com/en/documentation/red_hat_ansible_automation_platform/2.5/html/configuring_automation_execution. Accessed 2026-02-20. 
Content used under CC BY-SA 4.0. + +--- + +## Quick Reference + +| Analysis Step | MCP Server | Tool | Key Parameters | +|---|---|---|---| +| Job status | job-management | `jobs_retrieve` | `id` | +| Failure events | job-management | `jobs_job_events_list` | `id`, `page_size: 100` | +| Host summaries | job-management | `jobs_job_host_summaries_list` | `id` | +| Full stdout | job-management | `jobs_stdout_retrieve` | `id`, `format: "txt"` | +| Host lookup | inventory-management | `hosts_list` | `search: "<hostname>"` | +| Host facts | inventory-management | `hosts_variable_data_retrieve` | `id`, `format: "json"` | +| EE check | configuration | `execution_environments_list` | `page_size: 100` | diff --git a/submissions/rh-automation-governance-assessor/docs/references/README.md b/submissions/rh-automation-governance-assessor/docs/references/README.md new file mode 100644 index 0000000..93f2bc1 --- /dev/null +++ b/submissions/rh-automation-governance-assessor/docs/references/README.md @@ -0,0 +1,17 @@ +# References Documentation + +Cross-cutting reference material used across multiple use cases. + +## Documents + +| Document | Purpose | Use Case | +|----------|---------|----------| +| [error-classification.md](error-classification.md) | Error taxonomy, classification decision trees, resolution path mapping | UC3: Forensic Troubleshooting | + +## How to Use + +Reference documents provide classification frameworks and taxonomies. They complement the AAP-specific documents by providing structured decision-making guidance.
+ +``` +AAP doc provides failure patterns → Reference doc classifies and maps to resolution paths +``` diff --git a/submissions/rh-automation-governance-assessor/docs/references/error-classification.md b/submissions/rh-automation-governance-assessor/docs/references/error-classification.md new file mode 100644 index 0000000..342fbf4 --- /dev/null +++ b/submissions/rh-automation-governance-assessor/docs/references/error-classification.md @@ -0,0 +1,338 @@ +--- +title: Error Classification Taxonomy +category: references +sources: + - title: "Red Hat AAP 2.6 - Troubleshooting Guide" + url: https://docs.redhat.com/en/documentation/red_hat_ansible_automation_platform/2.6/html/troubleshooting_ansible_automation_platform/troubleshoot-jobs + sections: "Job failure types, common error patterns, resolution approaches" + date_accessed: 2026-02-20 + - title: "Ansible Module Documentation" + url: https://docs.ansible.com/ansible/latest/collections/ansible/builtin/index.html + sections: "Module return values, error conditions, check mode behavior" + date_accessed: 2026-02-20 + - title: "Red Hat AAP 2.5 - Configuring Automation Execution" + url: https://docs.redhat.com/en/documentation/red_hat_ansible_automation_platform/2.5/html/configuring_automation_execution + sections: "Platform errors, capacity issues, execution environment troubleshooting" + date_accessed: 2026-02-20 +tags: [error-classification, taxonomy, platform-errors, code-errors, configuration-errors, resolution-paths] +applies_to: [aap2.5, aap2.6] +semantic_keywords: + - "error classification" + - "platform vs code error" + - "resolution path" + - "error taxonomy" + - "failure type determination" + - "troubleshooting decision tree" +use_cases: + - "error_classification" + - "resolution_path_determination" +related_docs: + - "aap/job-troubleshooting.md" + - "aap/execution-governance.md" +last_updated: 2026-02-26 +--- + +# Error Classification Taxonomy + +This document teaches the agent a systematic framework for 
classifying AAP job errors into three categories -- **Platform**, **Code**, and **Configuration** -- and mapping each to a resolution path. The classification determines who needs to act (platform admin, playbook developer, or ops engineer) and what Red Hat documentation to reference. + +## Overview + +Not all job failures are the same. A host connectivity issue requires platform investigation. A bad Jinja2 variable requires playbook code fixes. A privilege escalation timeout requires configuration changes. Classifying the error correctly is the first step to efficient resolution. + +## When to Use This Document + +**Use when**: +- After analyzing job events (companion to [job-troubleshooting.md](../aap/job-troubleshooting.md)) +- When the resolution-advisor skill needs to determine resolution paths +- When classifying errors for the execution summary report + +**Do NOT use when**: +- For initial event extraction (use [job-troubleshooting.md](../aap/job-troubleshooting.md) first) +- For execution decisions (use [execution-governance.md](../aap/execution-governance.md)) + +--- + +## Classification Decision Tree + +``` +Job Status? 
+├── "error" → PLATFORM ERROR (job never executed) +│ ├── EE not found → EE Configuration Issue +│ ├── Capacity exceeded → Instance Capacity Issue +│ └── Credential invalid → Credential Configuration Issue +│ +├── "failed" → Examine event types +│ ├── runner_on_unreachable → PLATFORM ERROR +│ │ ├── SSH timeout → Network/Firewall Issue +│ │ ├── DNS failure → DNS Configuration Issue +│ │ └── Auth rejected → SSH Key/Credential Issue +│ │ +│ ├── runner_on_failed → Examine module and message +│ │ ├── Module: dnf/yum/apt +│ │ │ ├── "No package matching" → CODE ERROR (wrong package name) +│ │ │ └── "Failed to download" → PLATFORM ERROR (repo access) +│ │ │ +│ │ ├── Module: service/systemd +│ │ │ ├── "Could not find" → CODE ERROR (wrong service name) +│ │ │ └── "Failed to start" → CONFIGURATION ERROR (service config) +│ │ │ +│ │ ├── Module: copy/template +│ │ │ ├── "AnsibleUndefinedVariable" → CODE ERROR (missing variable) +│ │ │ └── "Permission denied" → CONFIGURATION ERROR (file perms) +│ │ │ +│ │ ├── Module: shell/command +│ │ │ ├── rc != 0 → CODE ERROR (script failure) +│ │ │ └── "Timeout" → CONFIGURATION ERROR (command timeout) +│ │ │ +│ │ └── Message contains "privilege escalation" +│ │ └── CONFIGURATION ERROR (sudo/become) +│ │ +│ └── runner_on_skipped (all tasks) → CODE ERROR (conditional logic) +│ +└── "canceled" → Check timeout settings + ├── Timeout configured and hit → CONFIGURATION ERROR + └── Manual cancellation → Not an error +``` + +--- + +## Category 1: Platform Errors + +### Definition + +Errors caused by infrastructure, network, or AAP platform state -- not by the playbook code itself. Resolution requires platform admin action. + +### Red Hat Source + +> "If a job fails immediately or shows all hosts as unreachable, check the automation controller's capacity and the network connectivity to managed hosts." +> +> -- *Red Hat AAP 2.6, Troubleshooting Ansible Automation Platform* + +### Error Patterns + +#### 1a. 
Host Unreachable (SSH) + +| Field | Pattern | +|---|---| +| Event | `runner_on_unreachable` | +| Message | Contains "Connection timed out", "Connection refused", "No route to host" | +| Host Summary | `dark > 0` | + +**Resolution path**: Network/infrastructure team. Check firewall rules, SSH daemon status, host availability. + +**Red Hat reference**: AAP 2.6 Troubleshooting Guide -- "Verify network connectivity and SSH configuration." + +#### 1b. DNS Resolution Failure + +| Field | Pattern | +|---|---| +| Event | `runner_on_unreachable` | +| Message | Contains "Name or service not known", "Could not resolve hostname" | + +**Resolution path**: DNS/infrastructure team. Verify DNS records and resolution from controller nodes. + +#### 1c. Execution Environment Unavailable + +| Field | Pattern | +|---|---| +| Job Status | `error` (not `failed`) | +| Message | Contains "EE", "execution environment", "image pull", "container" | + +**Resolution path**: Platform admin. Verify EE image accessibility, registry authentication, and EE configuration in AAP. + +**Red Hat reference**: AAP 2.6, Creating and Consuming Execution Environments -- verify image registry access. + +#### 1d. Instance Capacity Exhaustion + +| Field | Pattern | +|---|---| +| Job Status | `error` or long `pending`/`waiting` | +| Message | Contains "capacity", "no available instances" | + +**Resolution path**: Platform admin. Scale instance groups or reduce concurrent job load. + +**Red Hat reference**: AAP 2.5, Instance Groups (Ch. 17) -- "Configure instance groups with appropriate capacity." + +--- + +## Category 2: Code Errors + +### Definition + +Errors caused by playbook logic, module usage, or variable definitions. Resolution requires playbook developer action. + +### Red Hat Source + +> "Module failures typically indicate an issue with the playbook task definition, such as an incorrect module parameter, missing variable, or logic error." 
+> +> -- *Red Hat AAP 2.6, Troubleshooting Ansible Automation Platform* + +### Error Patterns + +#### 2a. Undefined Variable + +| Field | Pattern | +|---|---| +| Event | `runner_on_failed` | +| Message | Contains "AnsibleUndefinedVariable", "'{{ variable }}' is undefined" | +| Module | `ansible.builtin.template`, `ansible.builtin.debug`, or any task using variables | + +**Resolution path**: Playbook developer. Define the variable in inventory vars, group vars, extra_vars, or role defaults. + +#### 2b. Wrong Package Name + +| Field | Pattern | +|---|---| +| Event | `runner_on_failed` | +| Module | `ansible.builtin.dnf`, `ansible.builtin.yum` | +| Message | Contains "No package matching", "No match for argument" | + +**Resolution path**: Playbook developer. Verify package name for the target OS distribution and version. + +**Host fact correlation**: Check `ansible_distribution` and `ansible_distribution_version` -- package names differ between RHEL 8 and RHEL 9. + +#### 2c. Syntax / Logic Error + +| Field | Pattern | +|---|---| +| Event | `runner_on_failed` | +| Message | Contains "Syntax Error", "template error", "unexpected token" | + +**Resolution path**: Playbook developer. Fix Jinja2 syntax, YAML formatting, or task logic. + +#### 2d. Script Failure (shell/command) + +| Field | Pattern | +|---|---| +| Event | `runner_on_failed` | +| Module | `ansible.builtin.shell`, `ansible.builtin.command` | +| `rc` | Non-zero return code | + +**Resolution path**: Playbook developer. Debug the shell script/command. Check `stdout` and `stderr` in the event data. + +#### 2e. Wrong Service Name + +| Field | Pattern | +|---|---| +| Event | `runner_on_failed` | +| Module | `ansible.builtin.service`, `ansible.builtin.systemd` | +| Message | Contains "Could not find the requested service" | + +**Resolution path**: Playbook developer. Verify service name is correct for the target OS. 
+ +--- + +## Category 3: Configuration Errors + +### Definition + +Errors caused by mismatches between the playbook's expectations and the target system's configuration. The playbook logic may be correct, but the environment isn't set up to support it. Resolution requires ops/config team action. + +### Error Patterns + +#### 3a. Privilege Escalation Failure + +| Field | Pattern | +|---|---| +| Event | `runner_on_failed` | +| Message | Contains "Missing sudo password", "privilege escalation", "Timeout" with "become" | + +**Resolution path**: Ops team. Configure passwordless sudo for the Ansible service account, or provide become credentials in AAP. + +**Red Hat reference**: AAP 2.6 Troubleshooting Guide -- "Privilege escalation timeouts can occur when sudo requires a password or when the become method is misconfigured." + +#### 3b. Credential Mismatch + +| Field | Pattern | +|---|---| +| Event | `runner_on_unreachable` or `runner_on_failed` | +| Message | Contains "Authentication failed", "Permission denied (publickey)" | + +**Resolution path**: Ops team. Update the AAP credential with the correct SSH key or password. Verify the credential is attached to the correct job template. + +**Red Hat reference**: AAP 2.5 Security Best Practices (Ch. 15, Sec. 15.1.4) -- "Credentials should be defined at the organization or team level." + +#### 3c. File Permission Denied + +| Field | Pattern | +|---|---| +| Event | `runner_on_failed` | +| Module | `ansible.builtin.copy`, `ansible.builtin.file`, `ansible.builtin.template` | +| Message | Contains "Permission denied" (NOT SSH-related) | + +**Resolution path**: Ops team. Fix file/directory permissions on the target host. + +#### 3d. Service Configuration Error + +| Field | Pattern | +|---|---| +| Event | `runner_on_failed` | +| Module | `ansible.builtin.service`, `ansible.builtin.systemd` | +| Message | Contains "Failed to start", "failed with result 'exit-code'" | + +**Resolution path**: Ops team. 
Check the service's configuration files, port conflicts, and dependency services on the target host. + +#### 3e. Missing Collection in EE + +| Field | Pattern | +|---|---| +| Event | `runner_on_failed` | +| Message | Contains "couldn't resolve module/action", "No module named" | + +**Resolution path**: Platform admin. Update the Execution Environment to include the required Ansible collection. + +**Red Hat reference**: AAP 2.6 Creating and Consuming Execution Environments -- "Custom EEs allow pinning specific Ansible collections." + +--- + +## Resolution Path Summary + +| Classification | Who Acts | Typical Fix | Red Hat Doc Reference | +|---|---|---|---| +| **Platform** - Host Unreachable | Network/Infra | Firewall, SSH, DNS | AAP 2.6 Troubleshooting Guide | +| **Platform** - EE Unavailable | Platform Admin | Registry access, EE config | AAP 2.6 EE Guide | +| **Platform** - Capacity | Platform Admin | Scale instances | AAP 2.5 Instance Groups (Ch. 17) | +| **Code** - Undefined Variable | Playbook Dev | Define variable | Ansible Variable Precedence docs | +| **Code** - Wrong Package | Playbook Dev | Fix package name | RHEL Package Management docs | +| **Code** - Syntax Error | Playbook Dev | Fix Jinja2/YAML | Ansible Playbook Guide | +| **Code** - Script Failure | Playbook Dev | Debug script | N/A (custom script) | +| **Config** - Privilege Escalation | Ops Team | Sudoers config | AAP 2.6 Troubleshooting Guide | +| **Config** - Credential Mismatch | Ops Team | Update credential | AAP 2.5 Security Best Practices | +| **Config** - Permissions | Ops Team | File permissions | RHEL System Administration | +| **Config** - Service Failure | Ops Team | Service config | systemd documentation | +| **Config** - Missing Collection | Platform Admin | Update EE | AAP 2.6 EE Guide | + +--- + +## Cross-References + +- **[job-troubleshooting.md](../aap/job-troubleshooting.md)** -- Use first for event extraction and host correlation before classifying errors +- 
**[execution-governance.md](../aap/execution-governance.md)** -- For rollback options after classification determines the error requires immediate remediation +- **[governance-readiness.md](../aap/governance-readiness.md)** -- Platform errors may indicate governance gaps (e.g., single instance group causing capacity issues) + +--- + +## Official Red Hat Sources + +1. Red Hat AAP 2.6, Troubleshooting Ansible Automation Platform -- Troubleshoot Jobs. https://docs.redhat.com/en/documentation/red_hat_ansible_automation_platform/2.6/html/troubleshooting_ansible_automation_platform/troubleshoot-jobs. Accessed 2026-02-20. Content used under CC BY-SA 4.0. + +2. Ansible Built-in Module Documentation. https://docs.ansible.com/ansible/latest/collections/ansible/builtin/index.html. Accessed 2026-02-20. + +3. Red Hat AAP 2.5, Configuring Automation Execution. https://docs.redhat.com/en/documentation/red_hat_ansible_automation_platform/2.5/html/configuring_automation_execution. Accessed 2026-02-20. Content used under CC BY-SA 4.0. 
+ +--- + +## Quick Reference + +| Error Indicator | Classification | Resolution Owner | +|---|---|---| +| `runner_on_unreachable` | Platform | Network/Infra | +| Job status `error` | Platform | Platform Admin | +| `AnsibleUndefinedVariable` | Code | Playbook Dev | +| `No package matching` | Code | Playbook Dev | +| `rc != 0` (shell/command) | Code | Playbook Dev | +| `privilege escalation` / `Timeout` | Configuration | Ops Team | +| `Permission denied` (file) | Configuration | Ops Team | +| `Failed to start` (service) | Configuration | Ops Team | +| `couldn't resolve module` | Configuration | Platform Admin (EE) | diff --git a/submissions/rh-automation-governance-assessor/instruction.md b/submissions/rh-automation-governance-assessor/instruction.md new file mode 100644 index 0000000..4f5ee07 --- /dev/null +++ b/submissions/rh-automation-governance-assessor/instruction.md @@ -0,0 +1,11 @@ +# governance-assessor Task + +You are a Red Hat Ansible Automation Platform (AAP) engineer. Complete the task described below using the tools and documentation available in your environment. + +## Requirements +- Use MCP tools to interact with the AAP environment +- Document your methodology, findings, and results in `/solution/report.md` +- If reference documentation or skills are available in this environment, consult them before beginning work +- Complete the entire analysis autonomously + +Write your complete analysis in `/solution/report.md`. 
diff --git a/submissions/rh-automation-governance-assessor/mcps.json b/submissions/rh-automation-governance-assessor/mcps.json new file mode 100644 index 0000000..6a92802 --- /dev/null +++ b/submissions/rh-automation-governance-assessor/mcps.json @@ -0,0 +1,76 @@ +{ + "mcpServers": { + "aap-mcp-job-management": { + "type": "http", + "url": "https://${AAP_MCP_SERVER}/job_management/mcp", + "headers": { + "Authorization": "Bearer ${AAP_API_TOKEN}" + }, + "env": { + "AAP_MCP_SERVER": "${AAP_MCP_SERVER}", + "AAP_API_TOKEN": "${AAP_API_TOKEN}" + }, + "description": "AAP job management: job templates, launches, events, status, workflows, and workflow approvals" + }, + "aap-mcp-inventory-management": { + "type": "http", + "url": "https://${AAP_MCP_SERVER}/inventory_management/mcp", + "headers": { + "Authorization": "Bearer ${AAP_API_TOKEN}" + }, + "env": { + "AAP_MCP_SERVER": "${AAP_MCP_SERVER}", + "AAP_API_TOKEN": "${AAP_API_TOKEN}" + }, + "description": "AAP inventory management: inventories, hosts, groups, and host facts (ansible_facts)" + }, + "aap-mcp-configuration": { + "type": "http", + "url": "https://${AAP_MCP_SERVER}/configuration/mcp", + "headers": { + "Authorization": "Bearer ${AAP_API_TOKEN}" + }, + "env": { + "AAP_MCP_SERVER": "${AAP_MCP_SERVER}", + "AAP_API_TOKEN": "${AAP_API_TOKEN}" + }, + "description": "AAP configuration: notification templates, execution environments, and platform settings" + }, + "aap-mcp-security-compliance": { + "type": "http", + "url": "https://${AAP_MCP_SERVER}/security_compliance/mcp", + "headers": { + "Authorization": "Bearer ${AAP_API_TOKEN}" + }, + "env": { + "AAP_MCP_SERVER": "${AAP_MCP_SERVER}", + "AAP_API_TOKEN": "${AAP_API_TOKEN}" + }, + "description": "AAP security and compliance: credentials, credential types, and credential testing" + }, + "aap-mcp-system-monitoring": { + "type": "http", + "url": "https://${AAP_MCP_SERVER}/system_monitoring/mcp", + "headers": { + "Authorization": "Bearer ${AAP_API_TOKEN}" + }, + "env": { 
+ "AAP_MCP_SERVER": "${AAP_MCP_SERVER}", + "AAP_API_TOKEN": "${AAP_API_TOKEN}" + }, + "description": "AAP system monitoring: instance groups, instances, activity stream, platform status, and mesh topology" + }, + "aap-mcp-user-management": { + "type": "http", + "url": "https://${AAP_MCP_SERVER}/user_management/mcp", + "headers": { + "Authorization": "Bearer ${AAP_API_TOKEN}" + }, + "env": { + "AAP_MCP_SERVER": "${AAP_MCP_SERVER}", + "AAP_API_TOKEN": "${AAP_API_TOKEN}" + }, + "description": "AAP user management: users, teams, organizations, role assignments, role definitions, and authenticators" + } + } +} diff --git a/submissions/rh-automation-governance-assessor/metadata.yaml b/submissions/rh-automation-governance-assessor/metadata.yaml new file mode 100644 index 0000000..e10be0c --- /dev/null +++ b/submissions/rh-automation-governance-assessor/metadata.yaml @@ -0,0 +1,13 @@ +name: rh-automation-governance-assessor +description: "rh-automation governance-assessor Skill Evaluation" +persona: rh-automation +version: "1.0.0" +generation_mode: manual +tags: + - rh-automation + - governance-assessor +cpus: 2 +memory_mb: 2048 +storage_mb: 10240 +experiment: + n_trials: 3 diff --git a/submissions/rh-automation-governance-assessor/skills/governance-assessor/SKILL.md b/submissions/rh-automation-governance-assessor/skills/governance-assessor/SKILL.md new file mode 100644 index 0000000..662a9df --- /dev/null +++ b/submissions/rh-automation-governance-assessor/skills/governance-assessor/SKILL.md @@ -0,0 +1,154 @@ +--- +name: governance-assessor +description: | + Orchestrates AAP governance readiness assessments -- full platform audit or scoped to specific domains. + + Assesses 7 governance domains + 1 bonus: + 1. Workflow Governance (approval gates, workflow coverage) + 2. Notification Coverage (failure alerting, notification bindings) + 3. Access Control / RBAC (teams, roles, least privilege) + 4. Credential Security (separation of duties, credential hygiene) + 5. 
Execution Environments (custom EEs, image provenance) + 6. Workload Isolation (instance groups, capacity separation) + 7. Audit Trail (activity stream, change tracking) + Bonus: External Authentication (LDAP, SAML, SSO) + + Use when: + - Full: "Is my AAP ready for production?", "Audit my platform governance" + - Scoped: "Assess my credentials setup", "Check my RBAC", "How are my notifications?" + - "What should I fix before executing jobs?" + - Any question about specific AAP governance domains above + + NOT for job execution (use governance-executor) or troubleshooting (use forensic-troubleshooter). +model: inherit +color: red +--- + +# Governance Assessor + +## Prerequisites + +**Required MCP Servers**: All 6 AAP MCP servers for full assessment; subset for scoped assessment (validated in Step 1) +**Required Skills**: `aap-mcp-validator`, `governance-readiness-assessor`, `execution-summary` + +## When to Use This Skill + +Use this skill when: +- User asks to assess or audit their AAP platform's governance readiness (full assessment) +- User asks about a specific governance area: credentials, RBAC, workflows, notifications, execution environments, instance groups, audit trail, or authentication (scoped assessment) +- User asks if their AAP is ready for production execution +- User asks what needs to be improved in their AAP setup +- Before a first production execution (optional pre-flight check) + +Do NOT use when: +- User wants to execute a job (use `governance-executor` skill) +- User wants to troubleshoot a failed job (use `forensic-troubleshooter` skill) +- User only wants MCP connectivity check (use `aap-mcp-validator` skill directly) + +## Workflow + +### 1. Validate MCP Connectivity + +**Invoke the aap-mcp-validator skill**: +- **Full assessment**: Validate all 6 AAP MCP servers +- **Scoped assessment**: Validate only the MCP servers needed for the requested domains +- If any required server fails: report and stop + +### 2. 
Run Governance Readiness Assessment + +**Invoke the governance-readiness-assessor skill** (the skill determines scope from the user's request): +- **Full assessment**: Queries all 6 MCP servers across 7 domains + 1 bonus domain +- **Scoped assessment**: Queries only the servers for requested domains, plus minimal queries for cross-domain correlation +- The skill reads governance-readiness.md +- **Adapts depth**: When initial assessment reveals PASS, performs follow-up queries to verify (e.g., notification templates exist but are they bound to anything? Credentials exist but are they shared across environments?) +- **Correlates across domains**: Identifies compound risks (e.g., no teams + credentials = user-scoped credentials; no workflows + no notifications = invisible failures) +- **Calibrates severity**: Checks inventory scale to frame findings appropriately (lab vs enterprise) +- Produces the structured Governance Readiness Report with Red Hat citations per domain, compound risk analysis, and prioritized fix order + +**Document Consultation** (performed by the skill): +The governance-readiness-assessor skill reads [governance-readiness.md](../../docs/aap/governance-readiness.md) and reports its consultation. + +### 3. Present Report and Offer Remediation + +Present the full report to the user including: +- Per-domain findings (with any depth-query adjustments) +- Compound Risk Analysis section (cross-domain correlations) +- Recommended Fix Order (prioritized by dependency chain) + +For any GAP or WARN findings, offer to remediate using MCP write tools where available, starting with the highest-priority gap per the dependency chain. + +**Human Confirmation** (REQUIRED): +Before creating or modifying any AAP resource: +- Display the planned change +- Ask: "Should I create/modify this resource to address the gap?" +- Wait for explicit user confirmation + +### 4. 
Generate Execution Summary + +**Invoke the execution-summary skill**: +- Generate audit trail showing documents consulted, MCP tools used, governance findings + +## Dependencies + +### Required Skills +- `aap-mcp-validator` - MCP server validation +- `governance-readiness-assessor` - 7-domain assessment +- `execution-summary` - Audit trail + +### Required MCP Servers +- All 6 AAP MCP servers + +### Required MCP Tools +- None; delegated to sub-skill (`governance-readiness-assessor`). + +### Related Skills +- `governance-executor` - Follow-up: governed execution after assessment passes +- `forensic-troubleshooter` - Follow-up: investigate failures found during assessment + +### Reference Documentation +- [governance-readiness.md](../../docs/aap/governance-readiness.md) - 7-domain assessment framework + +### Sample Reports +- [sample-full-assessment.md](references/sample-full-assessment.md) - Full 7+1 domain assessment with compound risk analysis +- [sample-scoped-assessment.md](references/sample-scoped-assessment.md) - Scoped assessment (Credentials + RBAC) with cross-domain correlation + +## Example Usage + +### Example 1: Full Platform Assessment + +**User**: "Is my AAP ready for production?" + +**Agent**: + +1. Validates all 6 MCP servers via aap-mcp-validator +2. Invokes governance-readiness-assessor (full scope: all 7+1 domains) + - Domain 1 (Workflows): GAP -- no workflow job templates found + - Domain 2 (Notifications): WARN -- templates exist but not bound to jobs + - Domain 3 (RBAC): GAP -- no teams, only user-level assignments + - Domain 4 (Credentials): PASS + - Domain 5 (EEs): PASS + - Domain 6 (Workload Isolation): PASS -- single instance group but lab scale + - Domain 7 (Audit Trail): PASS + - Bonus (Auth): WARN -- local-only authentication + - Compound: No workflows + no notifications = highest-risk combination +3. Presents full report with Red Hat citations per domain +4. Offers remediation: "Create a team for automation operators? 
(Fixes Domain 3)" +5. Invokes execution-summary with audit trail + +### Example 2: Scoped Assessment + +**User**: "Check my RBAC and credentials setup" + +**Agent**: + +1. Validates `aap-mcp-user-management` and `aap-mcp-security-compliance` +2. Invokes governance-readiness-assessor scoped to Domains 3 + 4 +3. Presents scoped report: RBAC findings + Credential findings + cross-correlation +4. Offers: "Would you like me to run the full 7-domain assessment?" +5. Invokes execution-summary + +## Critical: Human-in-the-Loop Requirements + +1. **Before any remediation actions**: Display planned change, wait for approval +2. **Never auto-create resources**: Always present the assessment first, let user decide what to fix +3. **Offer skip/abort options**: User may want to see the report without acting on it diff --git a/submissions/rh-automation-governance-assessor/skills/governance-assessor/references/sample-full-assessment.md b/submissions/rh-automation-governance-assessor/skills/governance-assessor/references/sample-full-assessment.md new file mode 100644 index 0000000..9f3396a --- /dev/null +++ b/submissions/rh-automation-governance-assessor/skills/governance-assessor/references/sample-full-assessment.md @@ -0,0 +1,163 @@ +# Sample Report: Full Governance Readiness Assessment + +This sample shows the expected output when a user requests a full platform governance audit. +The report follows the output template defined in [governance-readiness.md](../../../docs/aap/governance-readiness.md). 
+ +--- + +## AAP Governance Readiness Report + +**Assessment Date**: 2026-03-15 +**AAP Instance**: aap.example.com +**Domains Assessed**: 7 + 1 bonus +**Scale Calibration**: Enterprise (3 inventories, 87 hosts across production, staging, and development) + +**Documents Consulted**: +- [governance-readiness.md](docs/aap/governance-readiness.md) -- 7-domain assessment framework, Red Hat citations, decision tables + +--- + +### Domain 1: Workflow Governance — GAP + +Per Red Hat's *Automation Controller User Guide* (Ch. 9: Workflows): +> "Workflows enable you to configure a sequence of disparate job templates and link them together." + +**Finding**: Found 0 workflow job templates and 14 standalone job templates. Workflow coverage ratio: 0%. +**Status**: GAP -- no workflow job templates exist +**Recommendation**: Per Red Hat's *Workflows* (Ch. 9): Create workflow job templates to wrap production job templates with approval nodes and failure-handling paths. +**Source URL**: https://docs.redhat.com/en/documentation/red_hat_ansible_automation_platform/2.5/html/using_automation_execution/controller-workflows + +--- + +### Domain 2: Notification Coverage — WARN + +Per Red Hat's *Automation Controller User Guide* (Ch. 25: Notifications): +> "You can set notifications on job start and job end, including job failure." + +**Finding**: Found 2 notification templates (1 Email, 1 Slack). However, neither is bound to any job template -- notifications exist but are unused. +**Status**: WARN -- notification templates exist but are not bound to job templates (depth query downgrade from initial PASS) +**Recommendation**: Per Red Hat's *Notifications* (Ch. 25): Bind notification templates to production job templates, at minimum for failure events. 
+**Source URL**: https://docs.redhat.com/en/documentation/red_hat_ansible_automation_platform/2.5/html/using_automation_execution/controller-notifications + +--- + +### Domain 3: Access Control (RBAC) — GAP + +Per Red Hat's *Security Best Practices* (Ch. 15, Sec. 15.2.1): +> "Use teams inside of organizations to assign permissions to groups of users rather than to users individually." + +Per Red Hat's *Security Best Practices* (Ch. 15, Sec. 15.1.2): +> "Minimize administrative accounts...restrict to the minimum set of users." + +**Finding**: 6 users (2 superusers), 0 teams, 8 individual role assignments, 0 team role assignments. +**Status**: GAP -- no teams exist; all permissions assigned to individual users +**Recommendation**: Per Red Hat's *Security Best Practices* (Ch. 15, Sec. 15.2.1): Create teams (e.g., `automation-operators`, `automation-admins`) and migrate individual role assignments to team-based assignments. +**Source URL**: https://docs.redhat.com/en/documentation/red_hat_ansible_automation_platform/2.5/html/configuring_automation_execution/controller-security-best-practices + +--- + +### Domain 4: Credential Security — WARN + +Per Red Hat's *Security Best Practices* (Ch. 15, Sec. 15.1.4): +> "Remove user access to credentials...credentials should be defined at the organization or team level." + +Per Red Hat's *Security Best Practices* (Ch. 15, Sec. 15.1.5): +> "Enforce separation of duties...different credentials for each piece of automation." + +**Finding**: 3 credentials (2 Machine, 1 SCM), 2 credential types. Credential `ssh-prod` (ID: 5) is used across both staging and production job templates. +**Status**: WARN -- credentials exist but lack separation of duties across environments +**Recommendation**: Per Red Hat's *Security Best Practices* (Ch. 15, Sec. 15.1.5): Create separate credentials per environment (e.g., `ssh-prod`, `ssh-staging`, `ssh-dev`). 
+**Source URL**: https://docs.redhat.com/en/documentation/red_hat_ansible_automation_platform/2.5/html/configuring_automation_execution/controller-security-best-practices + +--- + +### Domain 5: Execution Environments — PASS + +Per Red Hat's *Creating and Consuming Execution Environments* (AAP 2.6): +> "Execution environments are container images that serve as Ansible control nodes." + +**Finding**: 3 execution environments (2 custom, 1 default). Custom EEs use pinned image tags. +**Status**: PASS -- custom execution environments configured with pinned versions +**Recommendation**: No action needed. Continue using pinned image tags and consider implementing image signing. +**Source URL**: https://docs.redhat.com/en/documentation/red_hat_ansible_automation_platform/2.6/html/creating_and_consuming_execution_environments + +--- + +### Domain 6: Workload Isolation — PASS + +Per Red Hat's *Configuring Automation Execution* (Ch. 17: Instance Groups): +> "Instance groups can be used to assign jobs to run on specific sets of instances." + +**Finding**: 3 instance groups (`default`, `production`, `development`). Production and development workloads are separated. +**Status**: PASS -- workload isolation implemented between environments +**Recommendation**: No action needed. Consider setting `max_forks` limits on instance groups for capacity management. +**Source URL**: https://docs.redhat.com/en/documentation/red_hat_ansible_automation_platform/2.6/html/using_automation_execution/controller-instance-groups + +--- + +### Domain 7: Audit Trail — PASS + +Per Red Hat's *Activity Stream* documentation: +> "The Activity Stream shows all changes and events in the automation controller." + +**Finding**: 247 activity stream entries. Most recent: 2026-03-15T14:22:00Z. Active logging confirmed. +**Status**: PASS -- activity stream is operational with recent entries +**Recommendation**: No action needed. 
Consider configuring external log aggregation for long-term retention per Red Hat's *Hardening Guide*. +**Source URL**: https://docs.redhat.com/en/documentation/red_hat_ansible_automation_platform/2.6/html/using_automation_execution/assembly-controller-activity-stream + +--- + +### Bonus: External Authentication — WARN + +Per Red Hat's *Security Best Practices* (Ch. 15, Sec. 15.2.2): +> "Connecting to external account sources by LDAP, SAML 2.0, and certain OAuth providers." + +**Finding**: 0 external authenticators configured. All 6 users authenticate locally. +**Status**: WARN -- no external authentication; local-only auth with 2 superusers +**Recommendation**: Per Red Hat's *Security Best Practices* (Ch. 15, Sec. 15.2.2): Configure LDAP or SAML authentication to enforce centralized account management and MFA. +**Source URL**: https://docs.redhat.com/en/documentation/red_hat_ansible_automation_platform/2.5/html/configuring_automation_execution/controller-security-best-practices + +--- + +### Compound Risk Analysis + +| Correlation | Domains | Finding | Elevated Recommendation | +|---|---|---|---| +| RBAC GAP + Credentials exist | 3 + 4 | Without teams, credentials are necessarily user-scoped | Fix RBAC first (create teams) to enable team-scoped credential management | +| No Workflows + Unbound Notifications | 1 + 2 | No governance controls AND no automated alerting on production failures | Highest-risk combination -- create at minimum a failure notification binding while workflows are built | +| Multiple superusers + local auth | 3 + Bonus | 2 superuser accounts without MFA have maximum blast radius | Configure external authentication to enforce MFA on superuser accounts | + +--- + +### Summary + +| Domain | Status | Key Finding | +|---|---|---| +| Workflow Governance | GAP | No workflow job templates; 14 standalone templates | +| Notification Coverage | WARN | 2 templates exist but not bound to any jobs | +| Access Control (RBAC) | GAP | No teams; all permissions 
individual | +| Credential Security | WARN | Credential shared across staging and production | +| Execution Environments | PASS | 2 custom EEs with pinned tags | +| Workload Isolation | PASS | Separate instance groups for prod/dev | +| Audit Trail | PASS | Active, 247 entries | +| External Authentication | WARN | Local-only, no MFA | + +**Overall**: 3 PASS, 3 WARN, 2 GAP out of 8 domains assessed. + +### Recommended Fix Order + +1. **RBAC (Domain 3)** -- Foundation for team-scoped credentials and least-privilege access +2. **Workflows (Domain 1)** -- Enables approval gates and failure-handling paths +3. **Notification Bindings (Domain 2)** -- Bind existing templates to production jobs +4. **Credential Separation (Domain 4)** -- After teams exist, create per-environment credentials +5. **External Authentication (Bonus)** -- Enforce MFA for superuser accounts + +### Sources Consulted + +- Red Hat AAP 2.5 - Security Best Practices: https://docs.redhat.com/en/documentation/red_hat_ansible_automation_platform/2.5/html/configuring_automation_execution/controller-security-best-practices +- Red Hat AAP 2.5 - Workflows: https://docs.redhat.com/en/documentation/red_hat_ansible_automation_platform/2.5/html/using_automation_execution/controller-workflows +- Red Hat AAP 2.5 - Notifications: https://docs.redhat.com/en/documentation/red_hat_ansible_automation_platform/2.5/html/using_automation_execution/controller-notifications +- Red Hat AAP 2.6 - Instance Groups: https://docs.redhat.com/en/documentation/red_hat_ansible_automation_platform/2.6/html/using_automation_execution/controller-instance-groups +- Red Hat AAP 2.6 - Activity Stream: https://docs.redhat.com/en/documentation/red_hat_ansible_automation_platform/2.6/html/using_automation_execution/assembly-controller-activity-stream +- Red Hat AAP 2.6 - Execution Environments: https://docs.redhat.com/en/documentation/red_hat_ansible_automation_platform/2.6/html/creating_and_consuming_execution_environments +- Red Hat AAP 2.6 -
RBAC: https://docs.redhat.com/en/documentation/red_hat_ansible_automation_platform/2.6/html/access_management_and_authentication/gw-managing-access diff --git a/submissions/rh-automation-governance-assessor/skills/governance-assessor/references/sample-scoped-assessment.md b/submissions/rh-automation-governance-assessor/skills/governance-assessor/references/sample-scoped-assessment.md new file mode 100644 index 0000000..4ee0db3 --- /dev/null +++ b/submissions/rh-automation-governance-assessor/skills/governance-assessor/references/sample-scoped-assessment.md @@ -0,0 +1,73 @@ +# Sample Report: Scoped Governance Assessment (Credentials + RBAC) + +This sample shows the expected output when a user requests a scoped assessment +targeting specific governance domains. The report follows the output template +defined in [governance-readiness.md](../../../docs/aap/governance-readiness.md). + +--- + +## AAP Governance Assessment: Credential Security + Access Control + +**Assessment Date**: 2026-03-15 +**AAP Instance**: aap.example.com +**Scope**: Domain 3 (Access Control / RBAC) + Domain 4 (Credential Security) +**Scale Calibration**: Small team (1 inventory, 5 hosts, development only) + +**Documents Consulted**: +- [governance-readiness.md](docs/aap/governance-readiness.md) -- Domains 3 and 4 assessment criteria, Red Hat citations + +--- + +### Domain 3: Access Control (RBAC) — PASS + +Per Red Hat's *Security Best Practices* (Ch. 15, Sec. 15.2.1): +> "Use teams inside of organizations to assign permissions to groups of users rather than to users individually." + +Per Red Hat's *Security Best Practices* (Ch. 15, Sec. 15.1.2): +> "Minimize administrative accounts...restrict to the minimum set of users." + +**Finding**: 3 users (1 superuser), 2 teams (`dev-operators`, `dev-admins`), 0 individual role assignments, 4 team role assignments. All access is team-based. 
+**Status**: PASS -- team-based access control implemented; single superuser is acceptable for development +**Recommendation**: No action needed for current scale. If promoting to production, review whether `dev-admins` team permissions follow least privilege. +**Source URL**: https://docs.redhat.com/en/documentation/red_hat_ansible_automation_platform/2.5/html/configuring_automation_execution/controller-security-best-practices + +--- + +### Domain 4: Credential Security — WARN + +Per Red Hat's *Security Best Practices* (Ch. 15, Sec. 15.1.4): +> "Remove user access to credentials...credentials should be defined at the organization or team level." + +Per Red Hat's *Security Best Practices* (Ch. 15, Sec. 15.1.5): +> "Enforce separation of duties...different credentials for each piece of automation." + +**Finding**: Found 2 credentials, both of type "Machine." Credential `ssh-key` (ID: 3) is used across both dev and staging job templates. +**Status**: WARN -- credentials exist but lack separation of duties +**Recommendation**: Per Red Hat's *Security Best Practices* (Ch. 15, Sec. 15.1.5): Create separate credentials per environment. For current dev-only scale this is low severity, but address before production promotion. +**Source URL**: https://docs.redhat.com/en/documentation/red_hat_ansible_automation_platform/2.5/html/configuring_automation_execution/controller-security-best-practices + +--- + +### Cross-Domain Correlation + +| Correlation | Domains | Finding | +|---|---|---| +| RBAC PASS + Credentials WARN | 3 + 4 | Teams exist (good), but shared credentials across environments reduce the benefit of team scoping. Once credentials are separated, assign them to specific teams for full isolation. | + +--- + +### Summary + +| Domain | Status | Key Finding | +|---|---|---| +| Access Control (RBAC) | PASS | Team-based access, single superuser | +| Credential Security | WARN | Shared credential across environments | + +**Assessed**: 1 PASS, 1 WARN out of 2 domains. 
+ +Would you like me to run the full 7-domain assessment for complete coverage? + +### Sources Consulted + +- Red Hat AAP 2.5 - Security Best Practices: https://docs.redhat.com/en/documentation/red_hat_ansible_automation_platform/2.5/html/configuring_automation_execution/controller-security-best-practices +- Red Hat AAP 2.6 - RBAC: https://docs.redhat.com/en/documentation/red_hat_ansible_automation_platform/2.6/html/access_management_and_authentication/gw-managing-access diff --git a/submissions/rh-automation-governance-assessor/supportive/.mcp.json b/submissions/rh-automation-governance-assessor/supportive/.mcp.json new file mode 100644 index 0000000..31883a1 --- /dev/null +++ b/submissions/rh-automation-governance-assessor/supportive/.mcp.json @@ -0,0 +1,8 @@ +{ + "mcpServers": { + "aap-mcp": { + "command": "python3", + "args": ["/workspace/supportive/mcp-servers/mock-aap-mcp.py"] + } + } +} diff --git a/submissions/rh-automation-governance-assessor/supportive/mcp-servers/mock-aap-mcp.py b/submissions/rh-automation-governance-assessor/supportive/mcp-servers/mock-aap-mcp.py new file mode 100644 index 0000000..d8ae4fd --- /dev/null +++ b/submissions/rh-automation-governance-assessor/supportive/mcp-servers/mock-aap-mcp.py @@ -0,0 +1,1048 @@ +#!/usr/bin/env python3 +""" +Mock AAP (Ansible Automation Platform) MCP Server + +Simulates the AAP MCP gateway for per-skill evaluation tasks.
def _ts(delta: timedelta) -> str:
    """Return the timestamp lying ``delta`` before ``REFERENCE_TIME``.

    Args:
        delta: How far back from the module's fixed reference instant to go.

    Returns:
        The ISO-8601 rendering of that instant with a literal ``"Z"``
        appended, so every mock record is dated relative to one fixed clock
        instead of the wall clock.
    """
    moment = REFERENCE_TIME - delta
    return f"{moment.isoformat()}Z"
"b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3", + "status": "successful", + "last_job_run": _ts(timedelta(days=1)), + "last_update_failed": False, + "created": _ts(timedelta(days=120)), + "modified": _ts(timedelta(days=1)), + }, + { + "id": 8, + "type": "project", + "name": "Fleet Reporting", + "description": "System inventory and health reporting playbooks", + "scm_type": "git", + "scm_url": "https://github.com/org/fleet-reports.git", + "scm_branch": "main", + "scm_revision": "c3d4e5f6a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4", + "status": "successful", + "last_job_run": _ts(timedelta(days=3)), + "last_update_failed": False, + "created": _ts(timedelta(days=180)), + "modified": _ts(timedelta(days=3)), + }, +] + +# --------------------------------------------------------------------------- +# Mock data: Inventories & Hosts +# --------------------------------------------------------------------------- + +MOCK_INVENTORIES = [ + { + "id": 1, + "type": "inventory", + "name": "Production Systems", + "description": "All production RHEL systems across data centers", + "total_hosts": 30, + "has_active_failures": False, + "hosts_with_active_failures": 0, + "total_groups": 5, + "groups_with_active_failures": 0, + "has_inventory_sources": True, + "organization": 1, + "created": _ts(timedelta(days=365)), + "modified": _ts(timedelta(days=1)), + }, + { + "id": 2, + "type": "inventory", + "name": "Staging Systems", + "description": "Pre-production staging environment", + "total_hosts": 15, + "has_active_failures": False, + "hosts_with_active_failures": 0, + "total_groups": 3, + "groups_with_active_failures": 0, + "has_inventory_sources": True, + "organization": 1, + "created": _ts(timedelta(days=300)), + "modified": _ts(timedelta(days=7)), + }, + { + "id": 3, + "type": "inventory", + "name": "All Managed Systems", + "description": "Complete fleet: production, staging, development, QA, legacy", + "total_hosts": 63, + "has_active_failures": True, + "hosts_with_active_failures": 2, + 
"total_groups": 8, + "groups_with_active_failures": 1, + "has_inventory_sources": True, + "organization": 1, + "created": _ts(timedelta(days=365)), + "modified": _ts(timedelta(hours=6)), + }, +] + + +def _generate_hosts(inventory_id: int) -> list[dict]: + """Generate realistic hosts for an inventory.""" + hosts: list[dict] = [] + if inventory_id == 1: + roles = ["web", "db", "app", "lb", "monitoring", "cache"] + for i, role in enumerate(roles): + for j in range(5 if role in ("web", "app") else 4 if role == "db" else 3 if role == "monitoring" else 2): + hosts.append({ + "id": len(hosts) + 1, + "type": "host", + "name": f"{role}-{j+1:02d}.prod.example.com", + "inventory": inventory_id, + "enabled": True, + "has_active_failures": False, + "variables": f'{{"rhel_version": "9.3", "environment": "production", "role": "{role}"}}', + }) + if len(hosts) >= 30: + break + if len(hosts) >= 30: + break + elif inventory_id == 2: + for i in range(15): + role = ["web", "db", "app"][i % 3] + hosts.append({ + "id": 100 + i, + "type": "host", + "name": f"{role}-{i+1:02d}.staging.example.com", + "inventory": inventory_id, + "enabled": True, + "has_active_failures": False, + "variables": f'{{"rhel_version": "9.3", "environment": "staging", "role": "{role}"}}', + }) + elif inventory_id == 3: + for i in range(30): + hosts.append({ + "id": 200 + i, + "type": "host", + "name": f"host-{i+1:02d}.prod.example.com", + "inventory": inventory_id, + "enabled": True, + "has_active_failures": i in (45, 58), + "variables": f'{{"rhel_version": "9.3", "environment": "production"}}', + }) + for i in range(15): + hosts.append({ + "id": 230 + i, + "type": "host", + "name": f"host-{i+1:02d}.staging.example.com", + "inventory": inventory_id, + "enabled": True, + "has_active_failures": False, + "variables": f'{{"rhel_version": "9.3", "environment": "staging"}}', + }) + for i in range(10): + hosts.append({ + "id": 245 + i, + "type": "host", + "name": f"dev-{i+1:02d}.dev.example.com", + "inventory": 
inventory_id, + "enabled": True, + "has_active_failures": False, + "variables": f'{{"rhel_version": "8.9", "environment": "development"}}', + }) + for i in range(5): + hosts.append({ + "id": 255 + i, + "type": "host", + "name": f"qa-{i+1:02d}.qa.example.com", + "inventory": inventory_id, + "enabled": True, + "has_active_failures": False, + "variables": f'{{"rhel_version": "9.2", "environment": "qa"}}', + }) + for i in range(3): + hosts.append({ + "id": 260 + i, + "type": "host", + "name": f"legacy-{i+1:02d}.corp.example.com", + "inventory": inventory_id, + "enabled": i < 2, + "has_active_failures": i == 2, + "variables": f'{{"rhel_version": "7.9", "environment": "legacy"}}', + }) + return hosts + + +# --------------------------------------------------------------------------- +# Mock data: Job Templates +# --------------------------------------------------------------------------- + +MOCK_JOB_TEMPLATES = [ + { + "id": 10, + "type": "job_template", + "name": "CVE Remediation - Kernel Update", + "description": "Kernel update with boom snapshot for rollback safety", + "inventory": 1, + "project": 6, + "playbook": "playbooks/remediation/cve-kernel-update.yml", + "become_enabled": True, + "ask_job_type_on_launch": True, + "ask_variables_on_launch": True, + "ask_limit_on_launch": True, + "ask_inventory_on_launch": True, + "job_type": "check", + "verbosity": 1, + "timeout": 3600, + "forks": 5, + "status": "successful", + "last_job_run": _ts(timedelta(hours=4)), + "summary_fields": { + "project": {"id": 6, "name": "Remediation Playbooks", "status": "successful"}, + "inventory": {"id": 1, "name": "Production Systems", "total_hosts": 30}, + "credentials": [ + {"id": 1, "name": "machine-credential", "kind": "ssh"}, + ], + "last_job": {"id": 1001, "status": "successful", "finished": _ts(timedelta(hours=4))}, + }, + "created": _ts(timedelta(days=60)), + "modified": _ts(timedelta(days=2)), + }, + { + "id": 11, + "type": "job_template", + "name": "CVE Remediation - Package 
Update", + "description": "General package update for CVE remediation with needs-restarting check", + "inventory": 1, + "project": 6, + "playbook": "playbooks/remediation/cve-package-update.yml", + "become_enabled": True, + "ask_job_type_on_launch": True, + "ask_variables_on_launch": True, + "ask_limit_on_launch": True, + "ask_inventory_on_launch": False, + "job_type": "check", + "verbosity": 1, + "timeout": 1800, + "forks": 10, + "status": "successful", + "last_job_run": _ts(timedelta(hours=12)), + "summary_fields": { + "project": {"id": 6, "name": "Remediation Playbooks", "status": "successful"}, + "inventory": {"id": 1, "name": "Production Systems", "total_hosts": 30}, + "credentials": [ + {"id": 1, "name": "machine-credential", "kind": "ssh"}, + ], + "last_job": {"id": 1005, "status": "successful", "finished": _ts(timedelta(hours=12))}, + }, + "created": _ts(timedelta(days=45)), + "modified": _ts(timedelta(days=5)), + }, + { + "id": 12, + "type": "job_template", + "name": "CVE Remediation - Generic", + "description": "Generic CVE remediation template for ad-hoc patches", + "inventory": 1, + "project": 6, + "playbook": "playbooks/remediation/cve-remediation.yml", + "become_enabled": True, + "ask_job_type_on_launch": True, + "ask_variables_on_launch": True, + "ask_limit_on_launch": True, + "ask_inventory_on_launch": True, + "job_type": "check", + "verbosity": 1, + "timeout": 3600, + "forks": 5, + "status": "never updated", + "last_job_run": None, + "summary_fields": { + "project": {"id": 6, "name": "Remediation Playbooks", "status": "successful"}, + "inventory": {"id": 1, "name": "Production Systems", "total_hosts": 30}, + "credentials": [ + {"id": 1, "name": "machine-credential", "kind": "ssh"}, + ], + }, + "created": _ts(timedelta(days=30)), + "modified": _ts(timedelta(days=30)), + }, + { + "id": 20, + "type": "job_template", + "name": "Compliance Check - STIG", + "description": "Run STIG compliance scan across fleet", + "inventory": 3, + "project": 7, + 
"playbook": "playbooks/compliance/check-all.yml", + "become_enabled": True, + "ask_job_type_on_launch": True, + "ask_variables_on_launch": False, + "ask_limit_on_launch": True, + "ask_inventory_on_launch": False, + "job_type": "run", + "verbosity": 0, + "timeout": 7200, + "forks": 20, + "status": "successful", + "last_job_run": _ts(timedelta(days=1)), + "summary_fields": { + "project": {"id": 7, "name": "Compliance Checks", "status": "successful"}, + "inventory": {"id": 3, "name": "All Managed Systems", "total_hosts": 63}, + "credentials": [ + {"id": 2, "name": "compliance-credential", "kind": "ssh"}, + ], + "last_job": {"id": 1010, "status": "successful", "finished": _ts(timedelta(days=1))}, + }, + "created": _ts(timedelta(days=180)), + "modified": _ts(timedelta(days=14)), + }, + { + "id": 25, + "type": "job_template", + "name": "Emergency Patching", + "description": "Emergency patch application — NO become enabled (misconfigured)", + "inventory": 1, + "project": 6, + "playbook": "playbooks/remediation/emergency-patch.yml", + "become_enabled": False, + "ask_job_type_on_launch": False, + "ask_variables_on_launch": False, + "ask_limit_on_launch": False, + "ask_inventory_on_launch": False, + "job_type": "run", + "verbosity": 0, + "timeout": 600, + "forks": 25, + "status": "failed", + "last_job_run": _ts(timedelta(days=7)), + "summary_fields": { + "project": {"id": 6, "name": "Remediation Playbooks", "status": "successful"}, + "inventory": {"id": 1, "name": "Production Systems", "total_hosts": 30}, + "credentials": [ + {"id": 1, "name": "machine-credential", "kind": "ssh"}, + ], + "last_job": {"id": 1020, "status": "failed", "finished": _ts(timedelta(days=7))}, + }, + "created": _ts(timedelta(days=200)), + "modified": _ts(timedelta(days=200)), + }, + { + "id": 30, + "type": "job_template", + "name": "Fleet Health Report", + "description": "Generate fleet health and inventory report", + "inventory": 3, + "project": 8, + "playbook": 
"playbooks/reporting/fleet-health.yml", + "become_enabled": False, + "ask_job_type_on_launch": False, + "ask_variables_on_launch": True, + "ask_limit_on_launch": False, + "ask_inventory_on_launch": False, + "job_type": "run", + "verbosity": 0, + "timeout": 1800, + "forks": 30, + "status": "successful", + "last_job_run": _ts(timedelta(hours=6)), + "summary_fields": { + "project": {"id": 8, "name": "Fleet Reporting", "status": "successful"}, + "inventory": {"id": 3, "name": "All Managed Systems", "total_hosts": 63}, + "credentials": [ + {"id": 1, "name": "machine-credential", "kind": "ssh"}, + ], + "last_job": {"id": 1025, "status": "successful", "finished": _ts(timedelta(hours=6))}, + }, + "created": _ts(timedelta(days=120)), + "modified": _ts(timedelta(days=14)), + }, +] + +# --------------------------------------------------------------------------- +# Mock data: Jobs (recent runs) +# --------------------------------------------------------------------------- + +PROD_HOSTS = [ + "web-01.prod.example.com", + "web-02.prod.example.com", + "db-01.prod.example.com", + "db-02.prod.example.com", + "app-01.prod.example.com", + "app-02.prod.example.com", +] + +MOCK_JOBS = [ + { + "id": 1001, + "type": "job", + "name": "CVE Remediation - Kernel Update", + "job_type": "check", + "status": "successful", + "failed": False, + "started": _ts(timedelta(hours=4, minutes=30)), + "finished": _ts(timedelta(hours=4)), + "elapsed": 1800.0, + "job_template": 10, + "inventory": 1, + "project": 6, + "playbook": "playbooks/remediation/cve-kernel-update.yml", + "limit": "web-01.prod.example.com,web-02.prod.example.com,db-01.prod.example.com", + "extra_vars": '{"target_cve": "CVE-2024-12345", "remediation_mode": "automated", "verify_after": true}', + "launch_type": "manual", + "summary_fields": { + "job_template": {"id": 10, "name": "CVE Remediation - Kernel Update"}, + }, + }, + { + "id": 1002, + "type": "job", + "name": "CVE Remediation - Kernel Update", + "job_type": "run", + "status": 
"successful", + "failed": False, + "started": _ts(timedelta(hours=3, minutes=45)), + "finished": _ts(timedelta(hours=3)), + "elapsed": 2700.0, + "job_template": 10, + "inventory": 1, + "project": 6, + "playbook": "playbooks/remediation/cve-kernel-update.yml", + "limit": "web-01.prod.example.com,web-02.prod.example.com,db-01.prod.example.com", + "extra_vars": '{"target_cve": "CVE-2024-12345", "remediation_mode": "automated", "verify_after": true}', + "launch_type": "manual", + "summary_fields": { + "job_template": {"id": 10, "name": "CVE Remediation - Kernel Update"}, + }, + }, + { + "id": 1005, + "type": "job", + "name": "CVE Remediation - Package Update", + "job_type": "run", + "status": "successful", + "failed": False, + "started": _ts(timedelta(hours=12, minutes=20)), + "finished": _ts(timedelta(hours=12)), + "elapsed": 1200.0, + "job_template": 11, + "inventory": 1, + "project": 6, + "playbook": "playbooks/remediation/cve-package-update.yml", + "limit": "", + "extra_vars": '{"target_cve": "CVE-2024-54321"}', + "launch_type": "manual", + "summary_fields": { + "job_template": {"id": 11, "name": "CVE Remediation - Package Update"}, + }, + }, + { + "id": 1010, + "type": "job", + "name": "Compliance Check - STIG", + "job_type": "run", + "status": "successful", + "failed": False, + "started": _ts(timedelta(days=1, hours=2)), + "finished": _ts(timedelta(days=1)), + "elapsed": 7200.0, + "job_template": 20, + "inventory": 3, + "project": 7, + "playbook": "playbooks/compliance/check-all.yml", + "limit": "", + "extra_vars": "{}", + "launch_type": "scheduled", + "summary_fields": { + "job_template": {"id": 20, "name": "Compliance Check - STIG"}, + }, + }, + { + "id": 1020, + "type": "job", + "name": "Emergency Patching", + "job_type": "run", + "status": "failed", + "failed": True, + "started": _ts(timedelta(days=7, hours=1)), + "finished": _ts(timedelta(days=7)), + "elapsed": 3600.0, + "job_template": 25, + "inventory": 1, + "project": 6, + "playbook": 
"playbooks/remediation/emergency-patch.yml", + "limit": "", + "extra_vars": "{}", + "launch_type": "manual", + "summary_fields": { + "job_template": {"id": 25, "name": "Emergency Patching"}, + }, + }, + { + "id": 1025, + "type": "job", + "name": "Fleet Health Report", + "job_type": "run", + "status": "successful", + "failed": False, + "started": _ts(timedelta(hours=6, minutes=30)), + "finished": _ts(timedelta(hours=6)), + "elapsed": 1800.0, + "job_template": 30, + "inventory": 3, + "project": 8, + "playbook": "playbooks/reporting/fleet-health.yml", + "limit": "", + "extra_vars": "{}", + "launch_type": "scheduled", + "summary_fields": { + "job_template": {"id": 30, "name": "Fleet Health Report"}, + }, + }, +] + +_next_job_id = 2000 + + +# --------------------------------------------------------------------------- +# Mock stdout generators +# --------------------------------------------------------------------------- + +def _generate_stdout(job: dict) -> str: + """Generate realistic Ansible playbook stdout for a job.""" + playbook_name = job.get("name", "Unknown") + job_type = job.get("job_type", "run") + status = job.get("status", "successful") + limit = job.get("limit", "") + hosts = limit.split(",") if limit else PROD_HOSTS[:3] + hosts = [h.strip() for h in hosts if h.strip()] + extra_vars = job.get("extra_vars", "{}") + mode = " (CHECK MODE)" if job_type == "check" else "" + + lines = [] + lines.append(f"PLAY [{playbook_name}] *****") + lines.append("") + + lines.append(f"TASK [Gathering Facts{mode}] *****") + for h in hosts: + lines.append(f"ok: [{h}]") + lines.append("") + + if "kernel" in playbook_name.lower(): + lines.append(f"TASK [Create boom snapshot for rollback{mode}] *****") + for h in hosts: + lines.append(f"changed: [{h}] => {{\"msg\": \"boom create --title pre-remediation-CVE-2024-12345\"}}") + lines.append("") + + lines.append(f"TASK [Check disk space for kernel update{mode}] *****") + for h in hosts: + lines.append(f"ok: [{h}] => {{\"msg\": \"Disk 
space OK: 45% used\"}}") + lines.append("") + + lines.append(f"TASK [Update kernel package{mode}] *****") + for h in hosts: + result = "changed" if status == "successful" else "fatal" + if result == "changed": + lines.append(f'changed: [{h}] => {{"msg": "kernel-5.14.0-362.24.1.el9_3 -> kernel-5.14.0-362.24.2.el9_3"}}') + else: + lines.append(f'fatal: [{h}]: FAILED! => {{"msg": "Permission denied", "rc": 1}}') + lines.append("") + + lines.append(f"TASK [Check if reboot is needed (needs-restarting -r){mode}] *****") + for h in hosts: + lines.append(f'changed: [{h}] => {{"rc": 1, "msg": "Reboot is required to fully utilize updates."}}') + lines.append("") + + elif "package" in playbook_name.lower(): + lines.append(f"TASK [Update target packages for CVE remediation{mode}] *****") + for h in hosts: + lines.append(f'changed: [{h}] => {{"msg": "httpd-2.4.53-7.el9 -> httpd-2.4.57-8.el9"}}') + lines.append("") + + lines.append(f"TASK [Restart affected services{mode}] *****") + for h in hosts: + lines.append(f"changed: [{h}]") + lines.append("") + + lines.append(f"TASK [Verify service health{mode}] *****") + for h in hosts: + lines.append(f'ok: [{h}] => {{"msg": "Service httpd is running"}}') + lines.append("") + + elif "emergency" in playbook_name.lower() and status == "failed": + lines.append(f"TASK [Apply emergency patch{mode}] *****") + for h in hosts: + lines.append(f'fatal: [{h}]: FAILED! 
=> {{"msg": "Missing sudo password (become_enabled not set)", "rc": 1}}') + lines.append("") + lines.append("NO MORE HOSTS LEFT *****") + lines.append("") + + else: + lines.append(f"TASK [Execute playbook tasks{mode}] *****") + for h in hosts: + lines.append(f"changed: [{h}]") + lines.append("") + + lines.append("PLAY RECAP *****") + for h in hosts: + if status == "successful": + ok_count = random.randint(3, 6) + changed_count = random.randint(1, 3) + lines.append(f"{h:<45} : ok={ok_count} changed={changed_count} unreachable=0 failed=0 skipped=0 rescued=0 ignored=0") + elif status == "failed": + lines.append(f"{h:<45} : ok=1 changed=0 unreachable=0 failed=1 skipped=0 rescued=0 ignored=0") + lines.append("") + + return "\n".join(lines) + + +def _generate_events(job: dict) -> list[dict]: + """Generate realistic Ansible task events for a job.""" + hosts = (job.get("limit", "").split(",") if job.get("limit") else PROD_HOSTS[:3]) + hosts = [h.strip() for h in hosts if h.strip()] + events: list[dict] = [] + eid = 1 + + task_names = ["Gathering Facts"] + if "kernel" in job.get("name", "").lower(): + task_names += [ + "Create boom snapshot for rollback", + "Check disk space for kernel update", + "Update kernel package", + "Check if reboot is needed (needs-restarting -r)", + ] + elif "package" in job.get("name", "").lower(): + task_names += [ + "Update target packages for CVE remediation", + "Restart affected services", + "Verify service health", + ] + else: + task_names += ["Execute playbook tasks"] + + for task_name in task_names: + for host in hosts: + is_failed = job.get("status") == "failed" and task_name != "Gathering Facts" + events.append({ + "id": eid, + "type": "job_event", + "event": "runner_on_ok" if not is_failed else "runner_on_failed", + "task": task_name, + "host": host, + "host_name": host, + "play": job.get("name", ""), + "changed": task_name != "Gathering Facts" and not is_failed, + "failed": is_failed, + "event_data": { + "task": task_name, + "host": 
host, + "res": { + "changed": task_name != "Gathering Facts" and not is_failed, + "msg": "Task completed" if not is_failed else "Permission denied", + }, + }, + "created": _ts(timedelta(hours=4, minutes=30 - eid)), + }) + eid += 1 + + return events + + +def _generate_host_summaries(job: dict) -> list[dict]: + """Generate per-host summaries for a job.""" + hosts = (job.get("limit", "").split(",") if job.get("limit") else PROD_HOSTS[:3]) + hosts = [h.strip() for h in hosts if h.strip()] + summaries: list[dict] = [] + + for i, host in enumerate(hosts): + is_failed = job.get("status") == "failed" + summaries.append({ + "id": i + 1, + "type": "job_host_summary", + "host": i + 1, + "host_name": host, + "ok": 1 if is_failed else random.randint(3, 6), + "changed": 0 if is_failed else random.randint(1, 3), + "dark": 0, + "failures": 1 if is_failed else 0, + "skipped": 0, + "processed": 1, + "failed": is_failed, + }) + + return summaries + + +# --------------------------------------------------------------------------- +# MCP Tools: Job Management +# --------------------------------------------------------------------------- + +@mcp.tool() +def job_templates_list( + page_size: int = 10, + search: Optional[str] = None, +) -> dict: + """List available job templates in AAP. + + Args: + page_size: Number of results per page (default 10, max 200). + search: Optional search string to filter templates by name. + """ + results = MOCK_JOB_TEMPLATES + if search: + s = search.lower() + results = [t for t in results if s in t["name"].lower() or s in t.get("description", "").lower()] + return { + "count": len(results), + "next": None, + "previous": None, + "results": results[:page_size], + } + + +@mcp.tool() +def job_templates_retrieve(id: str) -> dict: + """Retrieve detailed information about a specific job template. + + Args: + id: Job template ID (as string). 
+ """ + tid = int(id) + template = next((t for t in MOCK_JOB_TEMPLATES if t["id"] == tid), None) + if not template: + return {"detail": f"Not found. Job template {id} does not exist."} + return template + + +@mcp.tool() +def projects_list( + page_size: int = 50, + search: Optional[str] = None, +) -> dict: + """List available projects in AAP. + + Args: + page_size: Number of results per page. + search: Optional search string to filter projects by name. + """ + results = MOCK_PROJECTS + if search: + s = search.lower() + results = [p for p in results if s in p["name"].lower() or s in p.get("description", "").lower()] + return { + "count": len(results), + "next": None, + "previous": None, + "results": results[:page_size], + } + + +@mcp.tool() +def job_templates_launch_retrieve( + id: str, + requestBody: Optional[dict] = None, +) -> dict: + """Launch a job from a job template. + + Args: + id: Job template ID to launch. + requestBody: Optional launch parameters including job_type ('run' or 'check'), + extra_vars (dict), and limit (comma-separated host list). + """ + global _next_job_id + tid = int(id) + template = next((t for t in MOCK_JOB_TEMPLATES if t["id"] == tid), None) + if not template: + return {"detail": f"Not found. 
Job template {id} does not exist."} + + body = requestBody or {} + job_type = body.get("job_type", template.get("job_type", "run")) + + if not template.get("ask_job_type_on_launch") and job_type != template.get("job_type"): + return { + "error": f"Cannot override job_type: ask_job_type_on_launch is disabled on template {id}", + } + + job_id = _next_job_id + _next_job_id += 1 + + new_job = { + "id": job_id, + "type": "job", + "name": template["name"], + "job_type": job_type, + "status": "pending", + "failed": False, + "started": _ts(timedelta(seconds=0)), + "finished": None, + "elapsed": 0.0, + "job_template": tid, + "inventory": template["inventory"], + "project": template["project"], + "playbook": template["playbook"], + "limit": body.get("limit", ""), + "extra_vars": str(body.get("extra_vars", {})), + "launch_type": "manual", + "summary_fields": { + "job_template": {"id": tid, "name": template["name"]}, + }, + } + MOCK_JOBS.append(new_job) + + # Simulate job completion after launch + new_job["status"] = "successful" + new_job["finished"] = _ts(timedelta(seconds=-300)) + new_job["elapsed"] = 300.0 + + return { + "job": job_id, + "status": "pending", + "type": "job", + "url": f"/api/controller/v2/jobs/{job_id}/", + "related": { + "stdout": f"/api/controller/v2/jobs/{job_id}/stdout/", + "job_events": f"/api/controller/v2/jobs/{job_id}/job_events/", + "job_host_summaries": f"/api/controller/v2/jobs/{job_id}/job_host_summaries/", + }, + } + + +@mcp.tool() +def jobs_retrieve(id: int) -> dict: + """Get the status and details of a job run. + + Args: + id: Job ID to retrieve. + """ + job = next((j for j in MOCK_JOBS if j["id"] == id), None) + if not job: + return {"detail": f"Not found. Job {id} does not exist."} + return job + + +@mcp.tool() +def jobs_list(page_size: int = 10) -> dict: + """List recent job runs. + + Args: + page_size: Number of results to return. 
+ """ + results = sorted(MOCK_JOBS, key=lambda j: j.get("started", ""), reverse=True) + return { + "count": len(results), + "next": None, + "previous": None, + "results": results[:page_size], + } + + +@mcp.tool() +def jobs_stdout_retrieve(id: int, format: str = "txt") -> dict: + """Get the stdout (console output) from a job run. + + Args: + id: Job ID. + format: Output format ('txt' or 'json'). Default 'txt'. + """ + job = next((j for j in MOCK_JOBS if j["id"] == id), None) + if not job: + return {"detail": f"Not found. Job {id} does not exist."} + return { + "content": _generate_stdout(job), + "range": {"start": 0, "end": 1}, + } + + +@mcp.tool() +def jobs_job_events_list(id: int, page_size: int = 50) -> dict: + """Get task-level events for a job run. + + Args: + id: Job ID. + page_size: Number of events to return. + """ + job = next((j for j in MOCK_JOBS if j["id"] == id), None) + if not job: + return {"detail": f"Not found. Job {id} does not exist."} + events = _generate_events(job) + return { + "count": len(events), + "next": None, + "previous": None, + "results": events[:page_size], + } + + +@mcp.tool() +def jobs_job_host_summaries_list(id: int) -> dict: + """Get per-host execution summaries for a job run. + + Args: + id: Job ID. + """ + job = next((j for j in MOCK_JOBS if j["id"] == id), None) + if not job: + return {"detail": f"Not found. Job {id} does not exist."} + summaries = _generate_host_summaries(job) + return { + "count": len(summaries), + "next": None, + "previous": None, + "results": summaries, + } + + +@mcp.tool() +def jobs_relaunch_retrieve( + id: int, + hosts: str = "all", + job_type: str = "run", +) -> dict: + """Relaunch a previously completed or failed job. + + Args: + id: Original job ID to relaunch. + hosts: Which hosts to target ('all' or 'failed'). + job_type: Job type for relaunch ('run' or 'check'). 
+ """ + global _next_job_id + original = next((j for j in MOCK_JOBS if j["id"] == id), None) + if not original: + return {"detail": f"Not found. Job {id} does not exist."} + + new_id = _next_job_id + _next_job_id += 1 + + new_job = { + **original, + "id": new_id, + "job_type": job_type, + "status": "successful", + "failed": False, + "started": _ts(timedelta(seconds=0)), + "finished": _ts(timedelta(seconds=-300)), + "elapsed": 300.0, + "launch_type": "relaunch", + } + MOCK_JOBS.append(new_job) + + return { + "job": new_id, + "status": "pending", + "type": "job", + "url": f"/api/controller/v2/jobs/{new_id}/", + } + + +# --------------------------------------------------------------------------- +# MCP Tools: Inventory Management +# --------------------------------------------------------------------------- + +@mcp.tool() +def inventories_list( + page_size: int = 10, + search: Optional[str] = None, +) -> dict: + """List available inventories in AAP. + + Args: + page_size: Number of results per page. + search: Optional search string to filter inventories. + """ + results = MOCK_INVENTORIES + if search: + s = search.lower() + results = [inv for inv in results if s in inv["name"].lower() or s in inv.get("description", "").lower()] + return { + "count": len(results), + "next": None, + "previous": None, + "results": results[:page_size], + } + + +@mcp.tool() +def hosts_list( + inventory_id: Optional[int] = None, + page_size: int = 50, + search: Optional[str] = None, +) -> dict: + """List hosts in an inventory. + + Args: + inventory_id: Filter by inventory ID. If not provided, lists hosts from all inventories. + page_size: Number of results per page. + search: Optional search string to filter hosts by name. 
+ """ + inv_id = inventory_id or 1 + hosts = _generate_hosts(inv_id) + if search: + s = search.lower() + hosts = [h for h in hosts if s in h["name"].lower()] + return { + "count": len(hosts), + "next": None if len(hosts) <= page_size else f"/api/controller/v2/hosts/?page=2", + "previous": None, + "results": hosts[:page_size], + } + + +if __name__ == "__main__": + mcp.run() diff --git a/submissions/rh-automation-governance-assessor/tests/llm_judge.py b/submissions/rh-automation-governance-assessor/tests/llm_judge.py new file mode 100644 index 0000000..903b29a --- /dev/null +++ b/submissions/rh-automation-governance-assessor/tests/llm_judge.py @@ -0,0 +1,83 @@ +import json +import os +import sys +import time +from pathlib import Path + +try: + from anthropic import Anthropic +except ImportError: + print("ERROR: anthropic package not installed") + sys.exit(1) + +CRITERIA = [ + { + "id": "skill_knowledge", + "file": "/solution/report.md", + "question": "Does the report demonstrate specific knowledge from the skill documentation and MCP tool data, rather than generic advice?", + "reference": "A skilled report uses specific data, tool names, and concepts from the SKILL.md and MCP environment. An unskilled report gives generic advice.", + }, +] + +SYSTEM_PROMPT = ( + "You are an evaluator for a cloud operations benchmark. You will be given a " + "file produced by an AI agent, a yes/no question about its contents, and a " + "REFERENCE ANSWER.\n\nRules:\n" + "- Answer ONLY with a JSON object: {\"pass\": true} or {\"pass\": false}\n" + "- Base your answer strictly on what is written in the file content\n" + "- Do not infer or assume knowledge the agent did not demonstrate" +) + +def judge_criterion(client, model, criterion): + filepath = criterion["file"] + if not Path(filepath).exists(): + return {"id": criterion["id"], "pass": False, "reason": "file not found"} + content = Path(filepath).read_text() + if len(content) > 50000: + content = content[:50000] + "\n... 
(truncated)" + ref = criterion.get("reference", "") + ref_block = f"\n\n## Reference Answer\n{ref}" if ref else "" + for attempt in range(3): + try: + response = client.messages.create( + model=model, max_tokens=64, system=SYSTEM_PROMPT, + messages=[{"role": "user", "content": ( + f"## File: {filepath}\n\n```\n{content}\n```\n\n" + f"## Question\n{criterion['question']}{ref_block}" + )}], + ) + text = response.content[0].text.strip() + if "{" in text: + text = text[text.index("{"):text.rindex("}") + 1] + result = json.loads(text) + return {"id": criterion["id"], "pass": bool(result.get("pass", False))} + except Exception as e: + if attempt < 2: time.sleep(5 * (attempt + 1)) + else: return {"id": criterion["id"], "pass": False, "reason": str(e)} + +def main(): + api_key = os.getenv("ANTHROPIC_API_KEY") + base_url = os.getenv("ANTHROPIC_BASE_URL") + model = os.getenv("LLM_JUDGE_MODEL", "claude-haiku-4-5") + if not api_key: + print("ERROR: ANTHROPIC_API_KEY not set") + json.dump({"criteria": [], "passed": 0, "total": 0, "score": 0.0}, + open("/logs/verifier/llm_judge.json", "w"), indent=2) + return + kwargs = {"api_key": api_key} + if base_url: kwargs["base_url"] = base_url + client = Anthropic(**kwargs) + results = [] + for c in CRITERIA: + r = judge_criterion(client, model, c) + results.append(r) + print(f" {c['id']}: {'PASS' if r['pass'] else 'FAIL'}") + passed = sum(1 for r in results if r["pass"]) + total = len(results) + score = round(passed / total, 4) if total > 0 else 0.0 + print(f"=== LLM Judge: {passed}/{total} (score={score}) ===") + Path("/logs/verifier/llm_judge.json").write_text(json.dumps( + {"criteria": results, "passed": passed, "total": total, "score": score}, indent=2)) + +if __name__ == "__main__": + main() diff --git a/submissions/rh-automation-governance-assessor/tests/test_outputs.py b/submissions/rh-automation-governance-assessor/tests/test_outputs.py new file mode 100644 index 0000000..6827864 --- /dev/null +++ 
b/submissions/rh-automation-governance-assessor/tests/test_outputs.py @@ -0,0 +1,25 @@ +"""Tests for skill evaluation. Baseline + skill-dependent checks.""" +import os +import pytest + +REPORT = "/solution/report.md" + +def read_report(): + if not os.path.exists(REPORT): + pytest.fail(f"Required file not found: {REPORT}") + with open(REPORT) as f: + return f.read() + +class TestBaseline: + def test_report_exists(self): + assert os.path.exists(REPORT), "report.md must exist" + + def test_report_has_content(self): + content = read_report() + assert len(content) > 200, "report should have substantial content" + +class TestSkillDependent: + def test_uses_mcp_data(self): + """Report should contain specific data from MCP tool queries.""" + c = read_report().lower() + assert len(c) > 500, "report should demonstrate thorough analysis using MCP tools" From 7f80d0db5a34a257bdbd02c5abe006525a73bfd9 Mon Sep 17 00:00:00 2001 From: gziv Date: Mon, 11 May 2026 11:43:17 +0300 Subject: [PATCH 02/11] feat: strengthen governance-assessor tests with 7+1 domain and scoring checks --- .../instruction.md | 24 ++++++--- .../tests/test_outputs.py | 50 +++++++++++++++---- 2 files changed, 56 insertions(+), 18 deletions(-) diff --git a/submissions/rh-automation-governance-assessor/instruction.md b/submissions/rh-automation-governance-assessor/instruction.md index 4f5ee07..668dced 100644 --- a/submissions/rh-automation-governance-assessor/instruction.md +++ b/submissions/rh-automation-governance-assessor/instruction.md @@ -1,11 +1,21 @@ -# governance-assessor Task +# AAP Governance Assessment Task -You are a Red Hat Ansible Automation Platform (AAP) engineer. Complete the task described below using the tools and documentation available in your environment. +You are an AAP automation engineer. The security team needs a comprehensive governance readiness assessment before promoting the AAP platform to handle production workloads. 
## Requirements -- Use MCP tools to interact with the AAP environment -- Document your methodology, findings, and results in `/solution/report.md` -- If reference documentation or skills are available in this environment, consult them before beginning work -- Complete the entire analysis autonomously -Write your complete analysis in `/solution/report.md`. +1. Assess all 7 governance domains + 1 bonus domain: + - **Workflow Governance**: Approval gates, workflow coverage + - **Notification Coverage**: Failure alerting, notification bindings + - **Access Control / RBAC**: Teams, roles, least privilege + - **Credential Security**: Separation of duties, credential hygiene + - **Execution Environments**: Custom EEs, image provenance + - **Workload Isolation**: Instance groups, capacity separation + - **Audit Trail**: Activity stream, change tracking + - **Bonus: External Authentication**: LDAP, SAML, SSO configuration +2. For each domain, use the appropriate AAP MCP tools to gather data +3. Score each domain (0-100%) with specific findings +4. Provide an overall governance readiness score with pass/fail recommendation +5. List specific remediation items ordered by priority + +Write your governance assessment in `/solution/report.md`. diff --git a/submissions/rh-automation-governance-assessor/tests/test_outputs.py b/submissions/rh-automation-governance-assessor/tests/test_outputs.py index 6827864..160a0f4 100644 --- a/submissions/rh-automation-governance-assessor/tests/test_outputs.py +++ b/submissions/rh-automation-governance-assessor/tests/test_outputs.py @@ -1,6 +1,5 @@ -"""Tests for skill evaluation. 
Baseline + skill-dependent checks.""" -import os -import pytest +"""Governance assessor tests - 7+1 domains, scoring, remediation.""" +import os, re, pytest REPORT = "/solution/report.md" @@ -12,14 +11,43 @@ def read_report(): class TestBaseline: def test_report_exists(self): - assert os.path.exists(REPORT), "report.md must exist" - + assert os.path.exists(REPORT) def test_report_has_content(self): - content = read_report() - assert len(content) > 200, "report should have substantial content" + assert len(read_report()) > 600 + +class TestGovernanceDomains: + """The 7+1 domains are defined in the skill.""" + def test_workflow_governance(self): + c = read_report().lower() + assert "workflow" in c and ("governance" in c or "approval" in c) + def test_notification_coverage(self): + c = read_report().lower() + assert "notification" in c + def test_rbac(self): + c = read_report().lower() + assert "rbac" in c or "access control" in c or "role" in c + def test_credential_security(self): + c = read_report().lower() + assert "credential" in c + def test_execution_environments(self): + c = read_report().lower() + assert "execution environment" in c or "ee" in c or "custom ee" in c + def test_workload_isolation(self): + c = read_report().lower() + assert "isolation" in c or "instance group" in c + def test_audit_trail(self): + c = read_report().lower() + assert "audit" in c + +class TestScoring: + def test_has_scores(self): + c = read_report() + assert re.search(r'\d+%', c) or re.search(r'\d+/\d+', c), "Must include scores" + def test_overall_readiness(self): + c = read_report().lower() + assert "readiness" in c or "overall" in c or "pass" in c or "fail" in c -class TestSkillDependent: - def test_uses_mcp_data(self): - """Report should contain specific data from MCP tool queries.""" +class TestRemediation: + def test_remediation_items(self): c = read_report().lower() - assert len(c) > 500, "report should demonstrate thorough analysis using MCP tools" + assert "remediat" in c 
or "fix" in c or "action" in c or "recommend" in c From b6be4dd7735e6e7a0f7bad1f19d83319cbcf3b61 Mon Sep 17 00:00:00 2001 From: gziv Date: Wed, 13 May 2026 07:50:08 +0300 Subject: [PATCH 03/11] retrigger: queue runner From 89c723b0240df2bd247ac6518d023fb4e126b0b4 Mon Sep 17 00:00:00 2001 From: gziv Date: Wed, 13 May 2026 14:37:36 +0300 Subject: [PATCH 04/11] fix(tests): tighten governance-assessor tests to differentiate skilled vs unskilled Co-authored-by: Cursor --- .../tests/test_outputs.py | 94 +++++++++++-------- 1 file changed, 53 insertions(+), 41 deletions(-) diff --git a/submissions/rh-automation-governance-assessor/tests/test_outputs.py b/submissions/rh-automation-governance-assessor/tests/test_outputs.py index 160a0f4..2d8f803 100644 --- a/submissions/rh-automation-governance-assessor/tests/test_outputs.py +++ b/submissions/rh-automation-governance-assessor/tests/test_outputs.py @@ -1,53 +1,65 @@ -"""Governance assessor tests - 7+1 domains, scoring, remediation.""" -import os, re, pytest +""" +Tests for rh-automation-governance-assessor per-skill evaluation. + +Only differentiating tests kept — dead-weight tests where both +control and treatment pass 3/3 have been removed. 
+""" +import os +import pytest REPORT = "/solution/report.md" + def read_report(): if not os.path.exists(REPORT): pytest.fail(f"Required file not found: {REPORT}") with open(REPORT) as f: return f.read() + class TestBaseline: def test_report_exists(self): - assert os.path.exists(REPORT) - def test_report_has_content(self): - assert len(read_report()) > 600 - -class TestGovernanceDomains: - """The 7+1 domains are defined in the skill.""" - def test_workflow_governance(self): - c = read_report().lower() - assert "workflow" in c and ("governance" in c or "approval" in c) - def test_notification_coverage(self): - c = read_report().lower() - assert "notification" in c - def test_rbac(self): - c = read_report().lower() - assert "rbac" in c or "access control" in c or "role" in c - def test_credential_security(self): - c = read_report().lower() - assert "credential" in c - def test_execution_environments(self): - c = read_report().lower() - assert "execution environment" in c or "ee" in c or "custom ee" in c - def test_workload_isolation(self): - c = read_report().lower() - assert "isolation" in c or "instance group" in c - def test_audit_trail(self): - c = read_report().lower() - assert "audit" in c - -class TestScoring: - def test_has_scores(self): + assert os.path.exists(REPORT), "report.md must exist" + + +class TestSkillDependent: + def test_governance_readiness_assessor_invocation(self): + """Skill teaches invoking governance-readiness-assessor sub-skill + for the 7-domain assessment. 
Without skill, agents do freeform audit.""" + c = read_report() + assert "governance-readiness-assessor" in c or "readiness-assessor" in c, ( + "must reference governance-readiness-assessor sub-skill" + ) + + def test_aap_mcp_validator_invocation(self): + """Skill teaches invoking aap-mcp-validator as first step + before any assessment queries.""" + c = read_report() + assert "aap-mcp-validator" in c or "mcp-validator" in c, ( + "must reference aap-mcp-validator sub-skill invocation" + ) + + def test_compound_risk_analysis(self): + """Skill teaches cross-domain compound risk analysis identifying + combinations like 'no workflows + no notifications = invisible failures'. + Without skill, agents assess domains independently.""" + c = read_report() + assert "compound" in c.lower() or "cross-domain" in c.lower() or "correlation" in c.lower(), ( + "must include compound/cross-domain risk analysis" + ) + + def test_governance_readiness_doc(self): + """Skill teaches consulting governance-readiness.md for the + 7-domain assessment framework with Red Hat citations.""" + c = read_report() + assert "governance-readiness" in c, ( + "must reference governance-readiness.md" + ) + + def test_execution_summary_invocation(self): + """Skill teaches invoking execution-summary sub-skill as final + step to produce audit trail.""" c = read_report() - assert re.search(r'\d+%', c) or re.search(r'\d+/\d+', c), "Must include scores" - def test_overall_readiness(self): - c = read_report().lower() - assert "readiness" in c or "overall" in c or "pass" in c or "fail" in c - -class TestRemediation: - def test_remediation_items(self): - c = read_report().lower() - assert "remediat" in c or "fix" in c or "action" in c or "recommend" in c + assert "execution-summary" in c or "execution summary" in c.lower(), ( + "must reference execution-summary sub-skill" + ) From 2bff95347851c2bb1caf9ac2ef4e25a5f57057a6 Mon Sep 17 00:00:00 2001 From: gziv Date: Thu, 14 May 2026 10:27:21 +0300 Subject: [PATCH 
05/11] retrigger: queue runner From 8ba9b6842a31b6281491974155823ba5a4d3cf5e Mon Sep 17 00:00:00 2001 From: gziv Date: Fri, 15 May 2026 04:24:09 +0300 Subject: [PATCH 06/11] retrigger: queue runner From 0c3183c494f8f6339a06f3bd7bc07e11adc1feb7 Mon Sep 17 00:00:00 2001 From: gziv Date: Sun, 17 May 2026 09:43:11 +0300 Subject: [PATCH 07/11] test: trim dead-weight tests for rh-automation-governance-assessor Remove baseline and non-differentiating checks both variants pass. --- .../tests/test_outputs.py | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/submissions/rh-automation-governance-assessor/tests/test_outputs.py b/submissions/rh-automation-governance-assessor/tests/test_outputs.py index 2d8f803..a25dd07 100644 --- a/submissions/rh-automation-governance-assessor/tests/test_outputs.py +++ b/submissions/rh-automation-governance-assessor/tests/test_outputs.py @@ -39,15 +39,6 @@ def test_aap_mcp_validator_invocation(self): "must reference aap-mcp-validator sub-skill invocation" ) - def test_compound_risk_analysis(self): - """Skill teaches cross-domain compound risk analysis identifying - combinations like 'no workflows + no notifications = invisible failures'. 
- Without skill, agents assess domains independently.""" - c = read_report() - assert "compound" in c.lower() or "cross-domain" in c.lower() or "correlation" in c.lower(), ( - "must include compound/cross-domain risk analysis" - ) - def test_governance_readiness_doc(self): """Skill teaches consulting governance-readiness.md for the 7-domain assessment framework with Red Hat citations.""" @@ -55,11 +46,3 @@ def test_governance_readiness_doc(self): assert "governance-readiness" in c, ( "must reference governance-readiness.md" ) - - def test_execution_summary_invocation(self): - """Skill teaches invoking execution-summary sub-skill as final - step to produce audit trail.""" - c = read_report() - assert "execution-summary" in c or "execution summary" in c.lower(), ( - "must reference execution-summary sub-skill" - ) From fa6caf7598c208264a152a85e25ffb18429d63eb Mon Sep 17 00:00:00 2001 From: gziv Date: Sun, 17 May 2026 15:38:28 +0300 Subject: [PATCH 08/11] test: keep only differentiating checks for rh-automation-governance-assessor Drop tests both arms pass or both fail equally per trial logs. Co-authored-by: Cursor --- .../tests/test_outputs.py | 25 ++++++++----------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/submissions/rh-automation-governance-assessor/tests/test_outputs.py b/submissions/rh-automation-governance-assessor/tests/test_outputs.py index a25dd07..1517df6 100644 --- a/submissions/rh-automation-governance-assessor/tests/test_outputs.py +++ b/submissions/rh-automation-governance-assessor/tests/test_outputs.py @@ -1,8 +1,8 @@ """ Tests for rh-automation-governance-assessor per-skill evaluation. -Only differentiating tests kept — dead-weight tests where both -control and treatment pass 3/3 have been removed. +Kept tests that differentiate per trial logs (xrtvqw). +Removed invocation checks both arms fail at 0%. 
""" import os import pytest @@ -23,25 +23,22 @@ def test_report_exists(self): class TestSkillDependent: - def test_governance_readiness_assessor_invocation(self): - """Skill teaches invoking governance-readiness-assessor sub-skill - for the 7-domain assessment. Without skill, agents do freeform audit.""" + def test_compound_risk_analysis(self): + """Skill teaches cross-domain compound risk analysis.""" c = read_report() - assert "governance-readiness-assessor" in c or "readiness-assessor" in c, ( - "must reference governance-readiness-assessor sub-skill" + assert "compound" in c.lower() or "cross-domain" in c.lower() or "correlation" in c.lower(), ( + "must include compound/cross-domain risk analysis" ) - def test_aap_mcp_validator_invocation(self): - """Skill teaches invoking aap-mcp-validator as first step - before any assessment queries.""" + def test_execution_summary_invocation(self): + """Skill teaches invoking execution-summary sub-skill.""" c = read_report() - assert "aap-mcp-validator" in c or "mcp-validator" in c, ( - "must reference aap-mcp-validator sub-skill invocation" + assert "execution-summary" in c or "execution summary" in c.lower(), ( + "must reference execution-summary sub-skill" ) def test_governance_readiness_doc(self): - """Skill teaches consulting governance-readiness.md for the - 7-domain assessment framework with Red Hat citations.""" + """Skill teaches consulting governance-readiness.md.""" c = read_report() assert "governance-readiness" in c, ( "must reference governance-readiness.md" From fec09b5d1b488fd693a877bc26d345068006112e Mon Sep 17 00:00:00 2001 From: gziv Date: Sun, 17 May 2026 16:17:10 +0300 Subject: [PATCH 09/11] retrigger: queue runner From ca9a64110b053be16392894fc8db0c6b31418d12 Mon Sep 17 00:00:00 2001 From: gziv Date: Mon, 18 May 2026 16:06:12 +0300 Subject: [PATCH 10/11] fix: strengthen governance-assessor tests with domain coverage and MCP data Add governance domain coverage (7 domains), aap-mcp-validator 
prerequisite, mock inventory scale references, and compound risk analysis checks. Co-authored-by: Cursor --- .../tests/test_outputs.py | 64 +++++++++++++++---- 1 file changed, 51 insertions(+), 13 deletions(-) diff --git a/submissions/rh-automation-governance-assessor/tests/test_outputs.py b/submissions/rh-automation-governance-assessor/tests/test_outputs.py index 1517df6..bcd750e 100644 --- a/submissions/rh-automation-governance-assessor/tests/test_outputs.py +++ b/submissions/rh-automation-governance-assessor/tests/test_outputs.py @@ -1,8 +1,10 @@ """ Tests for rh-automation-governance-assessor per-skill evaluation. -Kept tests that differentiate per trial logs (xrtvqw). -Removed invocation checks both arms fail at 0%. +Skill orchestrates AAP governance readiness assessment across 7 domains +(Workflow, Notification, RBAC, Credentials, EEs, Isolation, Audit) plus +External Auth bonus. Uses governance-readiness-assessor sub-skill and +consults governance-readiness.md. """ import os import pytest @@ -23,23 +25,59 @@ def test_report_exists(self): class TestSkillDependent: - def test_compound_risk_analysis(self): - """Skill teaches cross-domain compound risk analysis.""" + def test_governance_domains_coverage(self): + """Skill teaches 7+1 governance domains. 
Without skill, agents + don't know the AAP governance taxonomy.""" + c = read_report().lower() + domains = [ + "workflow", "notification", "rbac", "credential", + "execution environment", "instance group", "audit", + ] + found = sum(1 for d in domains if d in c) + assert found >= 4, ( + f"must cover governance domains (workflow, notification, RBAC, " + f"credentials, EEs, instance groups, audit); found {found}/7" + ) + + def test_governance_readiness_doc(self): + """Skill teaches consulting governance-readiness.md.""" c = read_report() - assert "compound" in c.lower() or "cross-domain" in c.lower() or "correlation" in c.lower(), ( + assert "governance-readiness" in c, ( + "must reference governance-readiness.md" + ) + + def test_compound_risk_analysis(self): + """Skill teaches cross-domain compound risk analysis — identifying + risks that emerge from combinations of gaps.""" + c = read_report().lower() + assert "compound" in c or "cross-domain" in c or "correlation" in c, ( "must include compound/cross-domain risk analysis" ) + def test_aap_mcp_validator_prerequisite(self): + """Skill teaches invoking aap-mcp-validator to verify all 6 AAP + MCP servers before assessment. Without skill, agents skip validation.""" + c = read_report().lower() + assert "aap-mcp-validator" in c or "mcp-validator" in c, ( + "must invoke aap-mcp-validator as prerequisite" + ) + + def test_mock_inventory_scale(self): + """Mock AAP has 3 inventories: Production (30 hosts), Staging (15), + All Managed (63). 
Skilled agent references these to calibrate findings.""" + c = read_report().lower() + has_data = any(t in c for t in [ + "production systems", "staging systems", "30 host", "15 host", + "63 host", "all managed", + ]) + assert has_data, ( + "must reference AAP inventory data from MCP " + "(Production 30 hosts, Staging 15, All Managed 63)" + ) + def test_execution_summary_invocation(self): - """Skill teaches invoking execution-summary sub-skill.""" + """Skill teaches invoking execution-summary sub-skill for report.""" c = read_report() assert "execution-summary" in c or "execution summary" in c.lower(), ( "must reference execution-summary sub-skill" ) - - def test_governance_readiness_doc(self): - """Skill teaches consulting governance-readiness.md.""" - c = read_report() - assert "governance-readiness" in c, ( - "must reference governance-readiness.md" - ) From a494652612e886b6d0f1238be3c661fc5397e157 Mon Sep 17 00:00:00 2001 From: gziv Date: Tue, 19 May 2026 03:42:12 +0300 Subject: [PATCH 11/11] retrigger: queue runner