diff --git a/.agents/plugins/marketplace.json b/.agents/plugins/marketplace.json new file mode 100644 index 0000000..67c6ba1 --- /dev/null +++ b/.agents/plugins/marketplace.json @@ -0,0 +1,20 @@ +{ + "name": "openstack-agent-toolkit", + "interface": { + "displayName": "SAP Converged Cloud Agent Toolkit" + }, + "plugins": [ + { + "name": "sapcc", + "source": { + "source": "local", + "path": "./plugins/sapcc" + }, + "policy": { + "installation": "AVAILABLE", + "authentication": "ON_INSTALL" + }, + "category": "Cloud" + } + ] +} diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json new file mode 100644 index 0000000..30e9c9e --- /dev/null +++ b/.claude-plugin/marketplace.json @@ -0,0 +1,24 @@ +{ + "metadata": { + "description": "A plugin marketplace for SAP Converged Cloud agent skills.", + "version": "1.0.0" + }, + "name": "openstack-agent-toolkit", + "owner": { + "name": "SAP Converged Cloud" + }, + "plugins": [ + { + "category": "cloud", + "description": "Operate SAP Converged Cloud: compute, networking, storage, identity, quota, audit, metrics, registry, and endpoint services.", + "keywords": [ + "openstack", + "sapcc", + "converged-cloud" + ], + "name": "sapcc", + "source": "./plugins/sapcc", + "version": "1.0.0" + } + ] +} diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 0000000..c874d05 --- /dev/null +++ b/.editorconfig @@ -0,0 +1,29 @@ +# SPDX-FileCopyrightText: SAP SE or an SAP affiliate company +# SPDX-License-Identifier: Apache-2.0 + +root = true + +[*] +insert_final_newline = true +charset = utf-8 +trim_trailing_whitespace = true +indent_style = space +indent_size = 2 + +[*.md] +trim_trailing_whitespace = false + +[*.py] +indent_size = 4 + +[{Makefile,go.mod,go.sum,*.go}] +indent_style = tab +indent_size = unset + +[{LICENSE,LICENSES/*}] +charset = unset +end_of_line = unset +indent_size = unset +indent_style = unset +insert_final_newline = unset +trim_trailing_whitespace = unset diff --git a/.github/CODEOWNERS 
b/.github/CODEOWNERS new file mode 100644 index 0000000..00e37f4 --- /dev/null +++ b/.github/CODEOWNERS @@ -0,0 +1,5 @@ +# SPDX-FileCopyrightText: Copyright 2026 SAP SE or an SAP affiliate company +# +# SPDX-License-Identifier: Apache-2.0 + +* @notque diff --git a/.github/workflows/validate.yml b/.github/workflows/validate.yml new file mode 100644 index 0000000..49616ab --- /dev/null +++ b/.github/workflows/validate.yml @@ -0,0 +1,29 @@ +name: Validate + +on: + push: + branches: [main] + pull_request: + branches: [main] + workflow_dispatch: + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: ${{ github.event_name == 'pull_request' }} + +permissions: + contents: read + +jobs: + validate: + runs-on: ubuntu-latest + timeout-minutes: 5 + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-python@v5 + with: + python-version: "3.12" + + - name: Validate manifests and skills + run: python3 tools/validate.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..c31f8c1 --- /dev/null +++ b/.gitignore @@ -0,0 +1,19 @@ +# OS +.DS_Store +Thumbs.db + +# Editor +*.swp +*.swo +*~ +.idea/ +.vscode/ + +# Python +__pycache__/ +*.pyc +.pytest_cache/ + +# Build +dist/ +build/ diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 0000000..abcbdbe --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,92 @@ + + +# SAP Open Source Code of Conduct + +SAP adopts the [Contributor Covenant 2.1](https://www.contributor-covenant.org/version/2/1/code_of_conduct/) +across our open source projects to ensure a welcoming and open culture for everyone involved. 
+ +## Our Pledge + +We as members, contributors, and leaders pledge to make participation in our community a harassment-free experience for everyone, regardless of age, body size, visible or invisible disability, ethnicity, sex characteristics, gender identity and expression, level of experience, education, socio-economic status, nationality, personal appearance, race, caste, color, religion, or sexual identity and orientation. + +We pledge to act and interact in ways that contribute to an open, welcoming, diverse, inclusive, and healthy community. + +## Our Standards + +Examples of behavior that contributes to a positive environment for our community include: + +* Demonstrating empathy and kindness toward other people +* Being respectful of differing opinions, viewpoints, and experiences +* Giving and gracefully accepting constructive feedback +* Accepting responsibility and apologizing to those affected by our mistakes, and learning from the experience +* Focusing on what is best not just for us as individuals, but for the overall community + +Examples of unacceptable behavior include: + +* The use of sexualized language or imagery, and sexual attention or advances of any kind +* Trolling, insulting or derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or email address, without their explicit permission +* Other conduct which could reasonably be considered inappropriate in a professional setting + +## Enforcement Responsibilities + +Community leaders are responsible for clarifying and enforcing our standards of acceptable behavior and will take appropriate and fair corrective action in response to any behavior that they deem inappropriate, threatening, offensive, or harmful. 
+ +Community leaders have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, and will communicate reasons for moderation decisions when appropriate. + +## Scope + +This Code of Conduct applies within all community spaces, and also applies when an individual is officially representing the community in public spaces. Examples of representing our community include using an official e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. + +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be reported to the community leaders responsible for enforcement at [ospo@sap.com](mailto:ospo@sap.com) (SAP Open Source Program Office). All complaints will be reviewed and investigated promptly and fairly. + +All community leaders are obligated to respect the privacy and security of the reporter of any incident. + +## Enforcement Guidelines + +Community leaders will follow these Community Impact Guidelines in determining the consequences for any action they deem in violation of this Code of Conduct: + +### 1. Correction + +**Community Impact**: Use of inappropriate language or other behavior deemed unprofessional or unwelcome in the community. + +**Consequence**: A private, written warning from community leaders, providing clarity around the nature of the violation and an explanation of why the behavior was inappropriate. A public apology may be requested. + +### 2. Warning + +**Community Impact**: A violation through a single incident or series of actions. + +**Consequence**: A warning with consequences for continued behavior. No interaction with the people involved, including unsolicited interaction with those enforcing the Code of Conduct, for a specified period of time. 
This includes avoiding interactions in community spaces as well as external channels like social media. Violating these terms may lead to a temporary or permanent ban. + +### 3. Temporary Ban + +**Community Impact**: A serious violation of community standards, including sustained inappropriate behavior. + +**Consequence**: A temporary ban from any sort of interaction or public communication with the community for a specified period of time. No public or private interaction with the people involved, including unsolicited interaction with those enforcing the Code of Conduct, is allowed during this period. Violating these terms may lead to a permanent ban. + +### 4. Permanent Ban + +**Community Impact**: Demonstrating a pattern of violation of community standards, including sustained inappropriate behavior, harassment of an individual, or aggression toward or disparagement of classes of individuals. + +**Consequence**: A permanent ban from any sort of public interaction within the community. + +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 2.1, available at [https://www.contributor-covenant.org/version/2/1/code_of_conduct.html][v2.1]. + +Community Impact Guidelines were inspired by [Mozilla's code of conduct enforcement ladder][Mozilla CoC]. + +For answers to common questions about this code of conduct, see the FAQ at [https://www.contributor-covenant.org/faq][FAQ]. Translations are available at [https://www.contributor-covenant.org/translations][translations]. 
+ +[homepage]: https://www.contributor-covenant.org +[v2.1]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html +[Mozilla CoC]: https://github.com/mozilla/diversity +[FAQ]: https://www.contributor-covenant.org/faq +[translations]: https://www.contributor-covenant.org/translations diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..d645695 --- /dev/null +++ b/LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. 
+ + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/LICENSES/Apache-2.0.txt b/LICENSES/Apache-2.0.txt new file mode 100644 index 0000000..d645695 --- /dev/null +++ b/LICENSES/Apache-2.0.txt @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. 
+ + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." 
+ + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/README.md b/README.md index 86c11ec..76a3beb 100644 --- a/README.md +++ b/README.md @@ -1,52 +1,109 @@ -# OpenStack Agent Toolkit +# SAP Converged Cloud Agent Toolkit -Skills, knowledge, and prompts for AI agents working with OpenStack and SAP Converged Cloud. Designed for use with [openstack-mcp-server](https://github.com/notque/openstack-mcp-server). +Help AI coding agents operate SAP Converged Cloud infrastructure. + +The Agent Toolkit gives AI agents the skills, knowledge, and guardrails to work with SAP CC services effectively. It works with Claude Code, Codex, and any agent supporting the skills format. + +## Quick Start + +### Claude Code + +``` +/plugin marketplace add notque/openstack-agent-toolkit +/plugin install sapcc@openstack-agent-toolkit +``` + +### Manual Installation + +Copy skills to your agent's skills location: + +| Agent | Path | +|-------|------| +| Claude Code | `~/.claude/skills/` or `.claude/skills/` | +| Codex | `~/.codex/skills/` or `.agents/skills/` | ## Architecture ``` ┌─────────────────────────────────────────────────────────────────┐ -│ AI Agent (Claude, etc.) │ +│ AI Agent (Claude Code, Codex, etc.) 
│ ├─────────────────────────────────────────────────────────────────┤ │ Agent Toolkit (this repo) │ MCP Server │ -│ • Skills (workflows) │ • Tool implementations │ -│ • Knowledge (context) │ • Auth management │ -│ • Best practices │ • API proxying │ +│ • Skills (domain workflows) │ • Tool implementations │ +│ • Knowledge (SAP CC context) │ • Auth management │ +│ • Rules (guardrails) │ • API proxying │ +│ • Gotchas (mistake prevention) │ • Response sanitization │ └─────────────────────────────────────────────────────────────────┘ ``` -**MCP Server** = runtime that provides typed tools (API calls) -**Agent Toolkit** = intelligence layer that teaches LLMs how to use those tools effectively +**MCP Server** ([openstack-mcp-server](https://github.com/notque/openstack-mcp-server)) = runtime providing typed tools (28+ API operations) +**Agent Toolkit** (this repo) = intelligence layer teaching agents *when* and *how* to use those tools -## Skills +## What's Included -| Skill | Description | -|-------|-------------| -| [credential-setup](skills/credential-setup/) | Guided workflow for creating and storing OpenStack application credentials securely | +### Plugin + +| Plugin | Description | +|--------|-------------| +| [sapcc](plugins/sapcc/) | All SAP CC skills and MCP server configuration. Covers compute, networking, storage, identity, quota, audit, metrics, registry, and endpoint services. 
| + +### Skills -## Knowledge +| Skill | Service | Key Capability | +|-------|---------|----------------| +| [sapcc-compute](plugins/sapcc/skills/sapcc-compute/) | Nova | Server lifecycle, flavor selection, cross-service correlation | +| [sapcc-networking](plugins/sapcc/skills/sapcc-networking/) | Neutron | Network topology, security groups, connectivity debugging | +| [sapcc-storage](plugins/sapcc/skills/sapcc-storage/) | Cinder | Volume lifecycle, attachment states, performance tiers | +| [sapcc-identity](plugins/sapcc/skills/sapcc-identity/) | Keystone | Domain/project model, app credentials, service catalog | +| [sapcc-quota](plugins/sapcc/skills/sapcc-quota/) | Limes | Quota interpretation, capacity planning, usage tracking | +| [sapcc-audit](plugins/sapcc/skills/sapcc-audit/) | Hermes | CADF events, compliance queries, change investigation | +| [sapcc-metrics](plugins/sapcc/skills/sapcc-metrics/) | Maia | PromQL queries, metric discovery, monitoring | +| [sapcc-registry](plugins/sapcc/skills/sapcc-registry/) | Keppel | Container images, vulnerability status, federation | +| [sapcc-connectivity](plugins/sapcc/skills/sapcc-connectivity/) | Archer | Private endpoint services, service discovery | +| [credential-setup](plugins/sapcc/skills/credential-setup/) | Keystone | Guided auth setup with keychain storage | + +### Rules + +The [rules file](rules/sapcc-agent-rules.md) provides baseline agent behavior: +- Check quota before resource creation +- Use audit trail for debugging +- Never expose credentials to the LLM +- Load skills before guessing at SAP CC-specific behavior + +### Knowledge | Topic | Description | |-------|-------------| -| [SAP CC Services](knowledge/sapcc/services.md) | Reference guide to SAP Converged Cloud services, their APIs, and common operations | +| [SAP CC Services](knowledge/sapcc/services.md) | Service reference: APIs, tool prefixes, common operations by role | -## Usage with Claude Code +## How Skills Work -Skills from this toolkit can 
be loaded as context when working with OpenStack: +Skills use progressive disclosure: -```bash -# The MCP server provides the tools -# This toolkit provides the knowledge and workflows -``` +1. At startup, the agent reads only skill name + description (~50 tokens each) +2. When a task matches, the full skill loads (~200-400 lines of instructions) +3. Reference files load on-demand for deep-dive content +4. Skill context releases when the task completes + +10 skills installed = ~500 tokens at startup. Full context only when needed. ## Security Philosophy -- **Credentials never reach the LLM** — the MCP server holds secrets in process memory +- **Credentials never reach the LLM** — MCP server holds secrets in process memory - **Application credentials over passwords** — scoped, revocable, no password exposure -- **Keychain storage** — secrets retrieved via system keychain commands, never stored in config files +- **Keychain storage** — secrets retrieved via system commands, never in config files - **Defense in depth** — response sanitization catches accidental leakage +- **Destructive operations require confirmation** — skills enforce user consent + +## Validation + +```bash +python3 tools/validate.py +``` + +Validates all plugin manifests, skill frontmatter, and MCP configs. Runs in CI. 
## Related - [openstack-mcp-server](https://github.com/notque/openstack-mcp-server) — The Go MCP server this toolkit complements -- [AWS Agent Toolkit](https://github.com/aws/agent-toolkit-for-aws) — Similar pattern for AWS +- [AWS Agent Toolkit](https://github.com/aws/agent-toolkit-for-aws) — Similar pattern for AWS (reference architecture) diff --git a/docs/adr/001-toolkit-architecture.md b/docs/adr/001-toolkit-architecture.md new file mode 100644 index 0000000..cbf62be --- /dev/null +++ b/docs/adr/001-toolkit-architecture.md @@ -0,0 +1,233 @@ +# ADR-001: SAP Converged Cloud Agent Toolkit Architecture + +## Status + +**Proposed** — 2026-05-06 + +## Context + +We have a working [openstack-mcp-server](https://github.com/notque/openstack-mcp-server) that provides 28+ MCP tools across 9 SAP Converged Cloud services: + +| Service | Project | Tools | +|---------|---------|-------| +| Compute | Nova | list servers, get server, list flavors, server actions | +| Networking | Neutron | list networks, subnets, ports, security groups | +| Block Storage | Cinder | list volumes, get volume | +| Identity | Keystone | list projects, token info, app credential CRUD | +| Quota/Usage | Limes | project/domain/cluster quota | +| Audit | Hermes | list/get audit events, list attributes | +| Metrics | Maia | PromQL query, label values, metric names | +| Registry | Keppel | list accounts, repositories, manifests | +| Endpoint Service | Archer | list/get services and endpoints | + +The server handles authentication, API proxying, and response sanitization. + +What's missing is the **intelligence layer** — skills, knowledge, and guardrails that teach AI agents *how* to use those tools effectively. 
Without this layer, agents make common mistakes: +- Creating servers without checking quota first (Limes) +- Not correlating cross-service information (e.g., server → ports → security groups) +- Misunderstanding SAP CC-specific concepts (domains, CADF events, federated registries) +- Not following security best practices (keychain storage, app credentials over passwords) +- Debugging issues without checking the audit trail (Hermes) or metrics (Maia) + +AWS has released [agent-toolkit-for-aws](https://github.com/aws/agent-toolkit-for-aws) which provides a reference architecture for this pattern. We will build an equivalent purpose-built for SAP Converged Cloud. + +## Decision + +### Architecture: Single Plugin with Domain-Specific Skills + +Since this toolkit is exclusively for SAP Converged Cloud (not vanilla OpenStack), we use a **single plugin** containing all skills. Every SAP CC user needs the full service suite — there's no "pick and choose" scenario. + +``` +openstack-agent-toolkit/ +├── .claude-plugin/ +│ └── marketplace.json # Plugin registry +├── plugins/ +│ └── sapcc/ +│ ├── .claude-plugin/plugin.json +│ ├── .mcp.json # MCP server config +│ └── skills/ +│ ├── sapcc-compute/ # Nova operations + gotchas +│ ├── sapcc-networking/ # Neutron + cross-service correlation +│ ├── sapcc-storage/ # Cinder volumes + lifecycle +│ ├── sapcc-identity/ # Keystone, domains, app credentials +│ ├── sapcc-quota/ # Limes quota management + capacity planning +│ ├── sapcc-audit/ # Hermes CADF events + compliance +│ ├── sapcc-metrics/ # Maia PromQL + alerting patterns +│ ├── sapcc-registry/ # Keppel container images + federation +│ ├── sapcc-connectivity/ # Archer endpoint services +│ └── credential-setup/ # Auth workflow (existing, enhanced) +├── rules/ +│ └── sapcc-agent-rules.md # Baseline agent behavior rules +├── knowledge/ +│ ├── sapcc/ +│ │ ├── services.md # Service reference (existing) +│ │ ├── architecture.md # Regional architecture, domain model +│ │ └── 
troubleshooting-flows.md # Cross-service debugging patterns +│ └── openstack/ +│ └── api-conventions.md # Common OpenStack API patterns +├── tools/ +│ └── validate.py # CI validation (stdlib-only) +└── docs/ + └── adr/ # Architecture decision records +``` + +### Skill Inventory (10 Skills) + +| Skill | MCP Tools Used | Key Content | +|-------|---------------|-------------| +| `sapcc-compute` | `nova_*` | Flavor selection, server lifecycle, gotchas (status transitions, metadata) | +| `sapcc-networking` | `neutron_*` | Network topology, port debugging, security group rules, cross-ref with nova | +| `sapcc-storage` | `cinder_*` | Volume lifecycle, attachment states, performance tiers | +| `sapcc-identity` | `keystone_*` | Domain/project model, role assignments, app credentials, service catalog | +| `sapcc-quota` | `limes_*` | Capacity planning, quota interpretation, "am I running out?" workflows | +| `sapcc-audit` | `hermes_*` | CADF event format, compliance queries, "who changed what?" workflows | +| `sapcc-metrics` | `maia_*` | PromQL patterns for SAP CC, discovering metrics, building dashboards | +| `sapcc-registry` | `keppel_*` | Image lifecycle, vulnerability scanning, cross-region federation | +| `sapcc-connectivity` | `archer_*` | Private service access, endpoint provisioning, troubleshooting | +| `credential-setup` | `keystone_*` | Guided auth setup (migrated + enhanced from current) | + +### Skill Structure (from AWS reference pattern) + +Each skill follows this template: + +```markdown +--- +name: sapcc-compute +description: >- + Manage compute instances on SAP Converged Cloud. Covers server lifecycle, + flavor selection, status transitions, and cross-service correlation with + networking and storage. Use when creating, debugging, or managing VMs. + NOT for container workloads (use sapcc-registry) or bare metal. 
+version: 1 +metadata: + service: [nova] + task: [create, debug, manage] + persona: [developer, platform-engineer] +allowed-tools: [Read] +--- + +# SAP CC Compute + +## Service Overview +[Decision tables, quick-start commands] + +## Gotchas +[Numbered list of common agent mistakes — THE most important section] + +## Common Workflows +[Step-by-step procedures referencing MCP tools] + +## Troubleshooting +[Failure modes with diagnostic steps] + +## Security Considerations +[Mandatory section] +``` + +### Gotchas: The Core Innovation + +The AWS toolkit's most valuable pattern is the **Gotchas section** — numbered corrections for mistakes agents consistently make. For SAP CC, examples include: + +**sapcc-compute:** +1. Always check quota via `limes_get_project_quota` before attempting server creation +2. Server status `SHUTOFF` ≠ deleted — the instance still consumes quota +3. Use `neutron_list_ports` with `device_id` to find a server's network interfaces (not in nova response) + +**sapcc-quota:** +1. Limes quota values are in base units — RAM is MiB, not GB +2. `physical_usage` may exceed `usage` due to snapshots and replicas +3. Domain quota is a cap on the sum of project quotas, not a pool + +**sapcc-audit:** +1. Hermes events use CADF `target_type` format: `compute/server`, not `nova/server` +2. Time filters use `gte:` and `lte:` prefix syntax, not standard query params +3. 
Event `outcome` is `success`/`failure`/`pending`, not HTTP status codes + +### MCP Server Configuration + +The `.mcp.json` bundles the server binary config: + +```json +{ + "mcpServers": { + "sapcc": { + "command": "openstack-mcp-server", + "env": { + "OS_AUTH_URL": "${OS_AUTH_URL}", + "OS_APPLICATION_CREDENTIAL_ID": "${OS_APPLICATION_CREDENTIAL_ID}", + "OS_APPCRED_SECRET_CMD": "${OS_APPCRED_SECRET_CMD}", + "OS_REGION_NAME": "${OS_REGION_NAME}" + } + } + } +} +``` + +### Rules File (`sapcc-agent-rules.md`) + +```markdown +# SAP Converged Cloud Guidance + +- Use the SAP CC MCP Server for all OpenStack/SAP CC interactions. +- Before starting a task, check whether a relevant sapcc-* skill is available. + Load the skill and prefer its guidance over general knowledge. +- SAP CC uses a Domain → Project hierarchy. Always be aware of the current + project scope (check with keystone_token_info if uncertain). +- For any operation that creates or resizes resources, check quota first + via limes_get_project_quota. +- When debugging issues, check the audit trail (hermes_list_events) and + metrics (maia_query) before guessing. +- Credentials never reach the LLM — the MCP server holds secrets in process + memory. Never ask the user for passwords or tokens. +- When uncertain about SAP CC-specific behavior (Limes, Hermes, Maia, + Keppel, Archer), load the relevant skill rather than guessing. 
+``` + +### Validation (`tools/validate.py`) + +Stdlib-only Python script validates: +- Plugin manifest structure +- Skill frontmatter (name matches directory, kebab-case, description ≥ 20 chars) +- MCP config structure +- Cross-references (skills don't reference non-existent tools) + +## Consequences + +### Positive +- **Single install** — `sapcc` plugin gives users everything they need +- **Progressive disclosure** — Skills load on-demand (~50 tokens at startup each) +- **Gotchas prevent mistakes** — Most value comes from corrections, not tutorials +- **Cross-service skills** — Each skill documents how it relates to other services +- **Portable** — Works with Claude Code, Codex, and any agent supporting skills format +- **Testable** — `validate.py` catches structural issues before merge + +### Negative +- **Two repos** — MCP server and toolkit evolve separately (different cadences) +- **Sync risk** — New MCP tools need matching skill updates +- **Knowledge capture** — Gotchas require real-world agent usage to discover + +### Mitigations +- CI validation ensures skills only reference tools that exist in MCP server +- `version` field in plugin.json tracks minimum MCP server version +- Gotchas section is a living document — add entries as we discover agent mistakes +- The existing `credential-setup` skill proves the pattern works + +## Alternatives Considered + +### 1. Two plugins (openstack-core + sapcc-platform) +**Rejected** — This is exclusively for SAP CC. There's no use case where someone wants Nova skills without Limes or Hermes. One plugin, one install. + +### 2. Embed skills in the MCP server repo +**Rejected** — Couples Go tool implementation with prompt engineering. Different audiences, different change frequencies. + +### 3. Monolithic CLAUDE.md with all knowledge +**Rejected** — Bloats context window. Progressive disclosure is essential with 10 skills × ~500 lines each. + +### 4. 
Web-hosted skills with runtime discovery +**Rejected for now** — Adds infrastructure dependency. Local-first matches our team size. Can add later. + +## References + +- [AWS Agent Toolkit for AWS](https://github.com/aws/agent-toolkit-for-aws) — Reference implementation +- [openstack-mcp-server](https://github.com/notque/openstack-mcp-server) — Our MCP server (Go) +- [Claude Code Plugin Docs](https://docs.anthropic.com/en/docs/claude-code/plugins) — Plugin format spec diff --git a/docs/implementation-plan.md b/docs/implementation-plan.md new file mode 100644 index 0000000..f66c28d --- /dev/null +++ b/docs/implementation-plan.md @@ -0,0 +1,186 @@ +# Implementation Plan: SAP Converged Cloud Agent Toolkit + +## Goal + +Build a complete agent toolkit for SAP Converged Cloud that teaches AI agents how to effectively use the openstack-mcp-server's 28+ tools across 9 services, following the AWS agent-toolkit-for-aws plugin pattern. + +## Phases + +### Phase 1: Scaffolding & Infrastructure ✅ (partially done) + +**Deliverables:** +- [x] Repository structure (`openstack-agent-toolkit`) +- [x] Initial `knowledge/sapcc/services.md` +- [x] Initial `skills/credential-setup/SKILL.md` +- [ ] Plugin scaffolding (`.claude-plugin/`, `plugins/sapcc/`) +- [ ] Marketplace manifest +- [ ] MCP server config (`.mcp.json`) +- [ ] Rules file (`rules/sapcc-agent-rules.md`) +- [ ] Validation script (`tools/validate.py`) +- [ ] CI workflow (`.github/workflows/validate.yml`) + +**Effort:** 1 session + +--- + +### Phase 2: Core Service Skills (Nova, Neutron, Cinder, Keystone) + +These are the most-used services and have the most tool coverage in the MCP server. 
+ +#### 2a: `sapcc-compute` (Nova) +- Server lifecycle (create → active → stop → delete) +- Flavor selection guide (SAP CC flavor naming conventions) +- Gotchas: quota check first, status transitions, metadata size limits +- Cross-service: "find ports for server" pattern (nova + neutron) +- References: `flavor-families.md`, `server-troubleshooting.md` + +#### 2b: `sapcc-networking` (Neutron) +- Network topology understanding (networks → subnets → ports) +- Security group debugging ("why can't I reach my server?") +- Gotchas: default security group blocks everything, port vs floating IP +- Cross-service: correlate ports with servers, check security groups +- References: `security-group-patterns.md`, `network-debugging.md` + +#### 2c: `sapcc-storage` (Cinder) +- Volume lifecycle, attachment states +- Performance tiers (SAP CC volume types) +- Gotchas: volume stuck "in-use", can't delete attached volumes +- References: `volume-types.md` + +#### 2d: `sapcc-identity` (Keystone) +- Domain → Project hierarchy +- Application credentials (ties into existing credential-setup skill) +- Role model, service catalog interpretation +- Gotchas: scoping matters, app creds inherit current roles +- References: `domain-project-model.md` + +**Effort:** 2 sessions + +--- + +### Phase 3: Platform Service Skills (Limes, Hermes, Maia) + +These are SAP CC-specific and need the most domain knowledge baked in. + +#### 3a: `sapcc-quota` (Limes) +- Quota hierarchy: cluster → domain → project +- Interpreting quota reports (quota, usage, physical_usage, burst) +- "Am I running out?" workflow +- Gotchas: units are base (MiB not GB), physical > logical is normal +- References: `quota-services-mapping.md`, `capacity-planning.md` + +#### 3b: `sapcc-audit` (Hermes) +- CADF event format explanation +- "Who did what?" investigation workflow +- "What changed in the last hour?" 
workflow +- Gotchas: target_type uses slashes, time filter syntax, outcome values +- References: `cadf-event-format.md`, `common-queries.md` + +#### 3c: `sapcc-metrics` (Maia) +- PromQL patterns for SAP CC infrastructure +- Metric discovery workflow (names → labels → query) +- Common queries: VM CPU, network throughput, volume IOPS +- Gotchas: tenant-scoped (can only see your project), time format +- References: `promql-patterns.md`, `metric-catalog.md` + +**Effort:** 2 sessions + +--- + +### Phase 4: Specialized Service Skills (Keppel, Archer) + +#### 4a: `sapcc-registry` (Keppel) +- Account → Repository → Manifest hierarchy +- Image lifecycle, vulnerability status interpretation +- Cross-region federation (images replicated between regions) +- Gotchas: account ≠ project, manifest vs tag, vulnerability scan timing +- References: `image-lifecycle.md` + +#### 4b: `sapcc-connectivity` (Archer) +- Service vs Endpoint model (producer/consumer pattern) +- "Access a service privately" workflow +- Gotchas: endpoint requires matching availability zone, status transitions +- References: `endpoint-provisioning.md` + +**Effort:** 1 session + +--- + +### Phase 5: Cross-Cutting Concerns + +#### 5a: Enhance `credential-setup` +- Migrate from `skills/` to `plugins/sapcc/skills/` +- Add references for multi-region credential management +- Add rotation workflow + +#### 5b: Knowledge enhancement +- `knowledge/sapcc/architecture.md` — Regional model, domain hierarchy +- `knowledge/sapcc/troubleshooting-flows.md` — Cross-service debugging patterns +- `knowledge/openstack/api-conventions.md` — Pagination, error codes, microversions + +#### 5c: Integration testing +- Test each skill with real MCP server interactions +- Verify gotchas actually prevent common mistakes +- Validate progressive disclosure works (skills load only when needed) + +**Effort:** 1 session + +--- + +### Phase 6: Distribution & Documentation + +- [ ] README with install instructions (Claude Code, Codex, manual) +- [ 
] Update `openstack-mcp-server` README to reference this toolkit +- [ ] GitHub release with version tag +- [ ] Test `/plugin install` flow end-to-end + +**Effort:** 1 session + +--- + +## Priority Order + +If time is limited, build skills in this order (highest user impact first): + +1. **credential-setup** (already done) — Gate to everything else +2. **sapcc-quota** (Limes) — Most confusing service, highest gotcha density +3. **sapcc-compute** (Nova) — Most-used service +4. **sapcc-audit** (Hermes) — Unique to SAP CC, non-obvious API +5. **sapcc-networking** (Neutron) — Critical for debugging +6. **sapcc-metrics** (Maia) — PromQL expertise is rare +7. **sapcc-identity** (Keystone) — Domain model confusion +8. **sapcc-registry** (Keppel) — Growing usage, unique federation +9. **sapcc-storage** (Cinder) — Simpler API, fewer gotchas +10. **sapcc-connectivity** (Archer) — Niche but important + +## Key Design Decisions + +| Decision | Rationale | +|----------|-----------| +| Single plugin (`sapcc`) | All users need all services — no partial installs | +| Gotchas are numbered | Agents follow numbered lists reliably | +| Each skill ≤ 500 lines | Context budget — progressive disclosure for depth | +| References load on-demand | "Read X only if user needs Y" pattern | +| Cross-service references | Every skill says how it relates to others | +| Validation in CI | Catch broken frontmatter, dead references before merge | + +## Success Criteria + +- [ ] All 10 skills pass `tools/validate.py` +- [ ] Plugin installs via `/plugin install sapcc@openstack-agent-toolkit` +- [ ] Agent correctly checks quota before server creation (gotcha test) +- [ ] Agent uses Hermes for "who changed X?" 
questions (skill routing test) +- [ ] Agent correlates Nova servers with Neutron ports (cross-service test) +- [ ] No skill exceeds 500 lines (context budget test) + +## Timeline Estimate + +| Phase | Sessions | Calendar | +|-------|----------|----------| +| Phase 1: Scaffolding | 1 | Day 1 | +| Phase 2: Core services | 2 | Days 2-3 | +| Phase 3: Platform services | 2 | Days 4-5 | +| Phase 4: Specialized | 1 | Day 6 | +| Phase 5: Cross-cutting | 1 | Day 7 | +| Phase 6: Distribution | 1 | Day 8 | +| **Total** | **8 sessions** | **~2 weeks** | diff --git a/knowledge/sapcc/architecture.md b/knowledge/sapcc/architecture.md new file mode 100644 index 0000000..4d37b61 --- /dev/null +++ b/knowledge/sapcc/architecture.md @@ -0,0 +1,86 @@ +# SAP Converged Cloud Architecture + +## Regional Model + +Each SAP CC region is an independent OpenStack deployment: + +| Property | Scope | Implication | +|----------|-------|-------------| +| Keystone (identity) | Per-region | Separate credentials per region | +| Service catalog | Per-region | Available services may differ | +| Resources | Per-region | VMs, volumes, networks don't cross regions | +| Credentials | Per-region | App credentials valid only in creating region | +| Quota (Limes) | Per-region | Capacity tracked independently | + +## Region Naming + +Format: `<area>-<country>-<number>` + +| Region | Location | Notes | +|--------|----------|-------| +| `eu-de-1` | Germany | Primary EU region | +| `eu-de-2` | Germany | Secondary EU region | +| `eu-nl-1` | Netherlands | EU expansion | +| `na-us-1` | US East | North America | +| `na-us-2` | US West | North America | +| `na-us-3` | US Central | North America | +| `ap-jp-1` | Japan | Asia-Pacific | +| `ap-au-1` | Australia | Asia-Pacific | +| `qa-de-1` | Germany | QA/staging environment | + +## Domain → Project Hierarchy + +``` +Region (eu-de-1) +├── Domain: cc-demo +│ ├── Project: demo-platform +│ ├── Project: demo-app-team +│ └── Project: demo-monitoring +├── Domain: cc-production +│ ├── Project: 
prod-frontend +│ ├── Project: prod-backend +│ └── Project: prod-data +└── Domain: cc-network + ├── Project: network-mgmt + └── Project: network-shared +``` + +- **Domain** = organizational unit (team, department, environment) +- **Project** = resource container (where VMs, volumes, networks live) +- **Roles** = assigned at project level (member, admin, reader) + +## Service Architecture + +All services authenticate through Keystone and are scoped to the current project: + +``` +User → Keystone (auth) → Token (project-scoped) + ↓ + ┌────────────────┼────────────────┐ + │ │ │ + Nova Neutron Cinder + (compute) (network) (storage) + │ │ │ + └────────────────┼────────────────┘ + ↓ + SAP CC Platform Services + ┌────────────────┼────────────────┐ + │ │ │ │ │ + Limes Hermes Maia Keppel Archer + (quota) (audit) (metrics)(registry)(endpoints) +``` + +## Multi-Tenancy Model + +| Layer | Isolation | +|-------|-----------| +| Region | Complete (separate deployments) | +| Domain | Administrative (quota boundaries) | +| Project | Resource (VMs, networks, volumes) | + +Cross-project visibility: +- Hermes audit events: project-scoped (you see only your actions) +- Maia metrics: project-scoped (you see only your metrics) +- Limes quota: project/domain/cluster views (different detail levels) +- Archer services: services visible cross-project (endpoints are project-scoped) +- Keppel accounts: project-scoped (but federation shares images across regions) diff --git a/knowledge/sapcc/troubleshooting-flows.md b/knowledge/sapcc/troubleshooting-flows.md new file mode 100644 index 0000000..f323788 --- /dev/null +++ b/knowledge/sapcc/troubleshooting-flows.md @@ -0,0 +1,104 @@ +# Cross-Service Troubleshooting Flows + +## "Why can't I reach my server?" + +``` +Step 1: nova_get_server → Is status ACTIVE? + ├── No (SHUTOFF/ERROR) → Server isn't running. Start it or check ERROR cause. + └── Yes → Continue + +Step 2: neutron_list_ports (device_id=server_id) → Port status? 
+ ├── No ports → Server has no network interface. Problem is at creation level. + ├── DOWN → VM is off or port is not bound. + └── ACTIVE → Continue + +Step 3: Check port's security_groups → neutron_list_security_groups + ├── No rules allowing your traffic → Add ingress rule for your port/protocol + └── Rules exist → Continue + +Step 4: Check if IP is correct + └── Port fixed_ips[].ip_address matches what you're connecting to? + ├── No → Wrong IP. Use the one from the port. + └── Yes → Issue is outside OpenStack (DNS, routing, client firewall) +``` + +## "Who changed this resource?" + +``` +Step 1: Identify the resource + └── Get target_type + target_id + Examples: compute/server + UUID, network/port + UUID + +Step 2: hermes_list_events (target_type, target_id, sort=time:desc) + ├── Events found → Read initiator_name, action, outcome, time + └── No events → Widen time range or check target_type format (uses slashes) + +Step 3: For full detail → hermes_get_event (event_id) + └── Shows request/response attachments — what exactly was sent +``` + +## "Am I running out of resources?" + +``` +Step 1: keystone_token_info → Get project_id and domain_id + +Step 2: limes_get_project_quota (domain_id, project_id) + └── For each service/resource: + Calculate: usage / quota × 100 = utilization % + +Step 3: Alert thresholds + ├── > 90% → Critical: will hit limit soon + ├── > 80% → Warning: plan capacity increase + └── < 80% → Healthy + +Step 4: If physical_usage > usage → Normal (snapshots/replicas) + If burst_usage > 0 → Temporary over-quota (will need to reduce) +``` + +## "What's happening with my server?" 
+ +``` +Step 1: nova_get_server → Status, task_state, host_id + +Step 2: hermes_list_events (target_type=compute/server, target_id=UUID, sort=time:desc) + └── Recent actions and their outcomes + +Step 3: maia_query → Performance metrics + └── CPU: rate(vm_cpu_seconds_total[5m]) + └── Memory: vm_memory_usage_bytes + +Step 4: neutron_list_ports (device_id=server_id) → Network state + +Step 5: cinder_list_volumes (filter by server in attachments) → Storage state +``` + +## "Debug a failed operation" + +``` +Step 1: hermes_list_events (outcome=failure, time_gte=<timestamp>) + └── Find the failed event + +Step 2: hermes_get_event (event_id) + └── Read the response attachment for error details + +Step 3: Common failure reasons: + ├── 403 → Wrong project scope or insufficient roles + ├── 409 → Conflict (resource in wrong state, quota exceeded) + ├── 404 → Resource doesn't exist in this project + └── 500 → Backend error (retry or escalate) +``` + +## Cross-Service Correlation Table + +| Starting Point | Need to Know | Use | +|---------------|--------------|-----| +| Server UUID | Network interfaces | `neutron_list_ports(device_id=UUID)` | +| Server UUID | Attached volumes | `cinder_list_volumes` → filter attachments | +| Server UUID | What happened to it | `hermes_list_events(target_type=compute/server, target_id=UUID)` | +| Server UUID | Performance metrics | `maia_query` with instance label | +| Port UUID | Which server | Port's `device_id` field → `nova_get_server` | +| Network UUID | Subnets in it | `neutron_list_subnets(network_id=UUID)` | +| Network UUID | All ports | `neutron_list_ports(network_id=UUID)` | +| Volume UUID | Which server | Volume's `attachments[].server_id` | +| Any resource | Quota impact | `limes_get_project_quota(service=<service>)` | +| Any resource | Audit trail | `hermes_list_events(target_type=<type>, target_id=UUID)` | diff --git a/plugins/sapcc/.claude-plugin/plugin.json b/plugins/sapcc/.claude-plugin/plugin.json new file mode 100644 index 0000000..81239ef --- 
/dev/null +++ b/plugins/sapcc/.claude-plugin/plugin.json @@ -0,0 +1,16 @@ +{ + "author": { + "name": "SAP Converged Cloud" + }, + "description": "Operate SAP Converged Cloud: compute, networking, storage, identity, quota, audit, metrics, registry, and endpoint services.", + "homepage": "https://github.com/notque/openstack-agent-toolkit", + "keywords": [ + "openstack", + "sapcc", + "converged-cloud" + ], + "license": "Apache-2.0", + "name": "sapcc", + "repository": "https://github.com/notque/openstack-agent-toolkit", + "version": "1.0.0" +} diff --git a/plugins/sapcc/.codex-plugin/plugin.json b/plugins/sapcc/.codex-plugin/plugin.json new file mode 100644 index 0000000..81239ef --- /dev/null +++ b/plugins/sapcc/.codex-plugin/plugin.json @@ -0,0 +1,16 @@ +{ + "author": { + "name": "SAP Converged Cloud" + }, + "description": "Operate SAP Converged Cloud: compute, networking, storage, identity, quota, audit, metrics, registry, and endpoint services.", + "homepage": "https://github.com/notque/openstack-agent-toolkit", + "keywords": [ + "openstack", + "sapcc", + "converged-cloud" + ], + "license": "Apache-2.0", + "name": "sapcc", + "repository": "https://github.com/notque/openstack-agent-toolkit", + "version": "1.0.0" +} diff --git a/plugins/sapcc/.mcp.json b/plugins/sapcc/.mcp.json new file mode 100644 index 0000000..824146c --- /dev/null +++ b/plugins/sapcc/.mcp.json @@ -0,0 +1,13 @@ +{ + "mcpServers": { + "sapcc": { + "command": "openstack-mcp-server", + "env": { + "OS_AUTH_URL": "${OS_AUTH_URL}", + "OS_APPLICATION_CREDENTIAL_ID": "${OS_APPLICATION_CREDENTIAL_ID}", + "OS_APPCRED_SECRET_CMD": "${OS_APPCRED_SECRET_CMD}", + "OS_REGION_NAME": "${OS_REGION_NAME}" + } + } + } +} diff --git a/plugins/sapcc/skills/README.md b/plugins/sapcc/skills/README.md new file mode 100644 index 0000000..5243996 --- /dev/null +++ b/plugins/sapcc/skills/README.md @@ -0,0 +1,29 @@ +# Skills + +This directory contains all SAP CC agent skills, accessible through the `sapcc` plugin. 
+ +## How Skills Work + +Each skill is a directory containing a `SKILL.md` file with instructions, plus optional reference files. Skills use progressive disclosure: + +1. At startup, the agent reads only the skill name and description (~50 tokens per skill) +2. When a task matches a skill's description, the agent loads the full instructions +3. Reference files load on-demand as specific phases require them +4. Skill context releases when the task completes + +## Skill Format + +``` +skill-name/ +├── SKILL.md # Required: frontmatter + workflow + gotchas +└── references/ # Optional: deep-dive content loaded on demand + ├── topic-a.md + └── topic-b.md +``` + +The `SKILL.md` includes YAML frontmatter with `name` and `description`, followed by: +- MCP Tools table +- Gotchas (numbered agent mistake corrections) +- Common Workflows +- Troubleshooting +- Security Considerations diff --git a/plugins/sapcc/skills/credential-setup/SKILL.md b/plugins/sapcc/skills/credential-setup/SKILL.md new file mode 100644 index 0000000..d3f70f4 --- /dev/null +++ b/plugins/sapcc/skills/credential-setup/SKILL.md @@ -0,0 +1,179 @@ +--- +name: credential-setup +description: | + Create and securely store OpenStack application credentials for MCP server authentication. + Triggers: setup credentials, configure auth, application credential, MCP server setup, + first time setup, rotate credentials +version: 1.0.0 +allowed-tools: + - Read + - Write + - Bash + - mcp__sapcc__keystone_token_info + - mcp__sapcc__keystone_create_application_credential + - mcp__sapcc__keystone_list_application_credentials + - mcp__sapcc__keystone_delete_application_credential +--- + +# Credential Setup + +Create, store, and configure OpenStack application credentials for MCP server authentication. Handles first-time setup, rotation, and multi-project configurations. 
+ +## MCP Tools + +| Tool | Purpose | +|------|---------| +| `keystone_token_info` | Check current auth context (project, roles, expiry) | +| `keystone_create_application_credential` | Create app credential (secret shown ONCE) | +| `keystone_list_application_credentials` | List existing credentials for current user/project | +| `keystone_delete_application_credential` | Delete/revoke a credential by ID or name | + +## Gotchas + +These will save you from the most common failures: + +1. **Secret shown ONLY ONCE at creation.** The API returns the secret exactly once in the creation response. If you don't capture it immediately, it's gone forever. You must create a new credential. + +2. **Always list existing credentials first.** Call `keystone_list_application_credentials` before creating. Duplicates cause confusion during rotation and waste the 25-credential limit per user. + +3. **Roles are frozen at creation time.** The credential inherits whatever roles the user has at the moment of creation. If roles are added/removed later, the credential is unaffected. To pick up new roles, create a new credential. + +4. **Naming convention is load-bearing.** Use `mcp-server-{project}-{region}` (e.g., `mcp-server-cc-demo-eu-de-1`). The keychain lookup command embeds this name — inconsistent naming breaks secret retrieval. + +5. **Deletion is immediate revocation.** The instant you delete a credential, any process using it loses access. Always verify the replacement works BEFORE deleting the old one. + +6. **Set expiration to force rotation.** Recommended: 1 year for development, 90 days for production. Credentials without expiry become forgotten attack surface. + +7. **App credentials cannot create other app credentials.** This is an intentional anti-escalation design. The MCP server authenticated via app credential cannot mint new credentials — only a user-scoped token can. 
+ +## Workflow + +### Phase 1: Assess Current Auth + +Call `keystone_token_info` to determine: +- Current project scope (name, ID, domain) +- Active roles +- Auth method in use (password, token, or application_credential) +- Token expiry + +If already using app credentials, determine whether this is rotation or new setup. + +### Phase 2: Check Existing Credentials + +Call `keystone_list_application_credentials` and inspect results: +- Look for credentials matching `mcp-server-*` naming pattern +- Check expiration dates — expired ones can be cleaned up +- Identify if a credential already exists for this project+region + +If a valid credential exists and user wants fresh setup, proceed to rotation flow (create new first, then delete old in Phase 7). + +### Phase 3: Create Application Credential + +Call `keystone_create_application_credential` with: + +``` +name: mcp-server-{project_name}-{region} +description: "MCP server credential for {project_name} in {region}. Created {YYYY-MM-DD}." +expires_at: {calculated expiry} # ISO 8601 format +roles: [{minimal required roles}] # omit to inherit all current roles +``` + +**IMMEDIATELY capture the `id` and `secret` from the response.** The secret will not be retrievable again. 
+ +### Phase 4: Store Secret in System Keychain + +**macOS:** +```bash +security add-generic-password -a "mcp-server-{project}-{region}" -s "openstack-appcred" -w "{secret}" +``` + +**Linux (GNOME Keyring / libsecret):** +```bash +secret-tool store --label="OpenStack App Credential" service openstack-appcred account "mcp-server-{project}-{region}" +``` +(Prompts for the secret value via stdin) + +**Verify storage immediately:** +```bash +# macOS +security find-generic-password -a "mcp-server-{project}-{region}" -s "openstack-appcred" -w + +# Linux +secret-tool lookup service openstack-appcred account "mcp-server-{project}-{region}" +``` + +### Phase 5: Generate MCP Server Configuration + +Output the configuration block for Claude Code settings (`~/.claude/settings.json` or project `.claude/settings.json`): + +```json +{ + "mcpServers": { + "sapcc": { + "command": "openstack-mcp-server", + "env": { + "OS_AUTH_URL": "https://identity-3.{region}.cloud.sap/v3", + "OS_APPLICATION_CREDENTIAL_ID": "{id}", + "OS_APPCRED_SECRET_CMD": "security find-generic-password -a mcp-server-{project}-{region} -s openstack-appcred -w", + "OS_REGION_NAME": "{region}" + } + } + } +} +``` + +For Linux, replace `OS_APPCRED_SECRET_CMD` value with: +``` +secret-tool lookup service openstack-appcred account mcp-server-{project}-{region} +``` + +### Phase 6: Verify + +1. Restart Claude Code (MCP servers reload on restart) +2. Call `keystone_token_info` — confirm it returns valid auth context +3. Verify project scope and roles match expectations + +If verification fails, check: +- Credential ID matches (copy-paste errors) +- Secret retrieval command works standalone in terminal +- Auth URL region matches the credential's project region + +### Phase 7: Clean Up (Rotation Only) + +Only after Phase 6 succeeds: +1. Identify old credential ID from Phase 2 listing +2. Call `keystone_delete_application_credential` with old credential ID +3. 
Remove old keychain entry: + ```bash + # macOS + security delete-generic-password -a "old-credential-name" -s "openstack-appcred" + ``` + +## Best Practices + +| Practice | Rationale | +|----------|-----------| +| One credential per purpose | Revoke one without affecting others | +| Descriptive names (`mcp-server-{project}-{region}`) | Keychain lookup depends on exact name | +| Set expiration | Forces rotation, limits blast radius of leaked creds | +| Minimum roles | Don't grant admin for read-only MCP access | +| Keychain storage | Never store in env files, dotfiles, or git | +| Test before delete | Verify new cred works before revoking old | +| Document creation date in description | Know when rotation is overdue | + +## Troubleshooting + +| Symptom | Likely Cause | Fix | +|---------|-------------|-----| +| "Authentication required" after setup | Secret retrieval command fails | Run `OS_APPCRED_SECRET_CMD` manually in terminal; check keychain entry name matches exactly | +| "Credential not found" | Credential was deleted or expired | `keystone_list_application_credentials` to confirm; create new if missing | +| Wrong project scope in token | Credential was created under different project | Check `keystone_token_info` project field; recreate credential under correct project scope | +| "Unauthorized" with valid credential | Roles insufficient for requested operation | List credential roles vs. 
required roles; recreate with correct role set | +| MCP server won't start | Malformed config JSON or missing binary | Validate JSON syntax; confirm `openstack-mcp-server` is in PATH | +| Works in terminal but not in Claude Code | Environment differences (PATH, keychain access) | Ensure Claude Code's process has keychain access; use full path to `security`/`secret-tool` | + +## Routing + +| User need | Action | +|-----------|--------| +| Authentication methods comparison | Read [auth-methods.md](references/auth-methods.md) | diff --git a/plugins/sapcc/skills/credential-setup/references/auth-methods.md b/plugins/sapcc/skills/credential-setup/references/auth-methods.md new file mode 100644 index 0000000..2aa7db1 --- /dev/null +++ b/plugins/sapcc/skills/credential-setup/references/auth-methods.md @@ -0,0 +1,103 @@ +# Authentication Methods for OpenStack MCP Server + +Comparison of authentication methods available for the openstack-mcp-server, with recommendations for each use case. + +## Method Comparison + +| Method | Security | Rotation | MCP Suitability | Scope Control | +|--------|----------|----------|-----------------|---------------| +| Application Credential | High | Manual (with expiry) | **Recommended** | Per-project, frozen roles | +| Password | Low | Shared lifecycle | Not recommended | Full user access | +| Token | Medium | Auto-expires (hours) | Impractical | Inherits current scope | + +## Application Credential (Recommended) + +**How it works:** A project-scoped credential with its own ID and secret, tied to a specific user and project. Authenticates directly with Keystone without exposing the user's password. 
+ +**Pros:** +- Secret is independent of user password (password change doesn't break MCP) +- Scoped to single project (limits blast radius) +- Roles frozen at creation (predictable permissions) +- Can set expiration date +- Revocable without affecting other credentials +- Cannot create other app credentials (anti-escalation) + +**Cons:** +- Secret shown only once at creation +- Cannot exceed the creating user's roles +- Must recreate to pick up new role assignments +- 25-credential limit per user + +**MCP server config pattern:** +``` +OS_APPLICATION_CREDENTIAL_ID + OS_APPCRED_SECRET_CMD (keychain retrieval) +``` + +## Password Auth + +**How it works:** User's OpenStack password passed directly to the MCP server process, which exchanges it for tokens as needed. + +**Pros:** +- Simple setup (no credential creation step) +- Always has current roles +- No expiration management + +**Cons:** +- Password exposed to MCP server process +- Password rotation breaks all MCP instances +- Full user access (all projects, all roles) unless manually scoped +- Cannot revoke without changing password (affects everything) +- Violates principle of least privilege +- Password may be subject to external rotation policies (LDAP, SSO) + +**MCP server config pattern:** +``` +OS_USERNAME + OS_PASSWORD + OS_PROJECT_NAME + OS_USER_DOMAIN_NAME + OS_PROJECT_DOMAIN_NAME +``` + +## Token Auth + +**How it works:** A pre-obtained Keystone token passed to the MCP server. Token is already scoped and has limited lifetime (typically 1-4 hours). 
+ +**Pros:** +- No long-lived secrets stored +- Already scoped to project +- Revocable via Keystone + +**Cons:** +- Expires in hours — MCP server stops working until manually refreshed +- Requires external process to obtain and refresh tokens +- Impractical for always-on MCP server use +- Adds operational complexity with no security benefit over app credentials + +**MCP server config pattern:** +``` +OS_TOKEN (must be refreshed externally) +``` + +## Decision Matrix + +| Scenario | Recommended Method | Reason | +|----------|-------------------|--------| +| Developer local setup | Application Credential (1yr expiry) | Set-and-forget, secure, revocable | +| Production/shared CI | Application Credential (90d expiry) | Forced rotation, minimal permissions | +| Quick one-off testing | Token | No credential cleanup needed | +| Legacy migration | Password → App Credential | Migrate away from password ASAP | +| Multi-project access | Multiple App Credentials | One per project, independent lifecycle | + +## Migration Path: Password to Application Credential + +1. Authenticate with password (current state) +2. Create app credential scoped to needed project +3. Store secret in keychain +4. Update MCP config to use app credential +5. Verify MCP server works with new auth +6. Remove password from any config files +7. Confirm no other processes depend on stored password + +## Security Considerations + +- **Keychain storage is mandatory.** Secrets in environment variables, dotfiles, or config files are readable by any process with file access. +- **One credential per MCP instance.** Sharing credentials across instances means you cannot revoke one without breaking all. +- **Monitor credential age.** Credentials without expiration should be audited quarterly. +- **App credentials survive password changes.** This is a feature (MCP doesn't break) and a risk (compromised cred persists). Set expiration as mitigation. 
diff --git a/plugins/sapcc/skills/sapcc-audit/SKILL.md b/plugins/sapcc/skills/sapcc-audit/SKILL.md new file mode 100644 index 0000000..7f64b8f --- /dev/null +++ b/plugins/sapcc/skills/sapcc-audit/SKILL.md @@ -0,0 +1,200 @@ +--- +name: sapcc-audit +description: > + Audit trail investigation on SAP Converged Cloud using Hermes. + Triggers: audit, who changed, what happened, hermes, events, compliance, + CADF, activity log, trace action, who did, what was done, event history +version: 1.0.0 +metadata: + service: [hermes] + task: [investigate, compliance, trace, audit] + persona: [platform-engineer, security, developer] +--- + +# SAP CC Audit (Hermes) + +Hermes is SAP CC's centralized audit service. It records all API actions across all OpenStack services in CADF (Cloud Auditing Data Federation) format. Events are immutable — the audit trail cannot be modified or deleted by tenants. + +## MCP Tools + +| Tool | Purpose | Key Parameters | +|------|---------|----------------| +| `hermes_list_events` | Search/filter audit events | `target_type`, `target_id`, `initiator_name`, `action`, `outcome`, `time_gte`, `time_lte`, `limit`, `sort` | +| `hermes_get_event` | Full CADF event by UUID | `event_id` | +| `hermes_list_attributes` | Discover valid filter values | `attribute_name` (one of: target_type, action, outcome, observer_type, initiator_type) | + +## CADF Event Model + +Every audit event follows this structure: + +``` +initiator (who) → action (what) → target (to what) → outcome (result) +``` + +| Field | Description | Example | +|-------|-------------|---------| +| `initiator.name` | Username who performed the action | `I810033` | +| `action` | The operation performed | `update` | +| `target.type_uri` | Resource type in slash format | `compute/server` | +| `target.id` | UUID of the affected resource | `abc-123-def` | +| `outcome` | Result of the operation | `success` | +| `eventTime` | When it occurred (UTC) | `2024-03-15T14:22:01Z` | + +See 
`references/cadf-event-format.md` for the full event schema. + +## Gotchas + +### 1. target_type uses SLASH format — not service names + +Correct: `compute/server`, `network/port`, `identity/project`, `dns/zone` +Wrong: `nova/server`, `server`, `neutron/port`, `VM` + +The format is `<service-type>/<resource>`. Call `hermes_list_attributes` with `attribute_name=target_type` to discover valid values if unsure. + +### 2. Time filters use parameter-name SUFFIXES (`_gte` / `_lte`) + +The parameter name itself encodes the comparison: +- `time_gte="2024-01-01T00:00:00Z"` — events at or after this time +- `time_lte="2024-01-01T23:59:59Z"` — events at or before this time + +The value is a plain ISO 8601 timestamp. Do NOT embed operators in the value string. + +### 3. outcome values are words, NOT HTTP status codes + +Valid outcomes: `success`, `failure`, `pending` + +NOT: `200`, `404`, `500`, `created`, `error`. Use `hermes_list_attributes` with `attribute_name=outcome` to confirm. + +### 4. action values are present-tense verbs + +Valid: `create`, `update`, `delete`, `read`, `authenticate`, `start`, `stop` + +NOT past tense: `created`, `updated`, `deleted`. NOT nouns: `creation`, `deletion`. Call `hermes_list_attributes` with `attribute_name=action` to see all tracked actions. + +### 5. hermes_list_attributes is your discovery tool — call it first + +When unsure about valid filter values for target_type, action, or outcome, always call `hermes_list_attributes` before `hermes_list_events`. Avoids empty results from typos or wrong format. + +### 6. Default limit is 50 — increase for comprehensive audits + +If you need a complete picture (compliance reviews, full resource history), set `limit=200` or higher. Default 50 may miss critical events in active projects. + +### 7. sort uses "field:direction" format + +Format: `sort="time:desc"` or `sort="time:asc"` + +Default is newest first (`time:desc`). Use `time:asc` when building a chronological narrative of what happened. + +### 8. 
Events have ingestion delay + +Events appear seconds to minutes after the action occurs. If you just performed an action and see no event, wait 30-60 seconds and retry. Do not tell the user "no events exist" immediately after an action. + +### 9. initiator_name is the username, not UUID + +Filter by human-readable username (e.g., `I810033`, `technical_user_xyz`), not the user's Keystone UUID. This is the name that appears in Keystone token info. + +### 10. Full event detail includes request/response attachments + +`hermes_get_event` returns the complete CADF event including `attachments` — these contain the actual API request body and response. Essential for answering "what exactly changed?" (e.g., which field was updated, what value was set). + +## Common Workflows + +### "Who changed resource X?" + +``` +1. hermes_list_events(target_id="<resource-uuid>", sort="time:desc", limit=20) +2. Review initiator.name on each event → identifies who made changes +3. For detail on a specific change: hermes_get_event(event_id) → check attachments +``` + +### "What happened in the last hour?" + +``` +1. Calculate time_gte = current time minus 1 hour (ISO 8601 UTC) +2. hermes_list_events(time_gte="2024-03-15T13:00:00Z", limit=100) +3. Group by target_type for overview, or filter by action/outcome +``` + +### "What did user Y do?" + +``` +1. hermes_list_events(initiator_name="<username>", sort="time:desc", limit=50) +2. Optionally narrow with time range or target_type +3. Shows all actions taken by that user across all services +``` + +### "Show me all failures" + +``` +1. hermes_list_events(outcome="failure", sort="time:desc", limit=50) +2. Optionally narrow by time range or target_type +3. Each event shows what was attempted and on what resource +4. hermes_get_event for details on specific failures +``` + +### Compliance audit — full resource history + +``` +1. hermes_list_events(target_id="<resource-uuid>", sort="time:asc", limit=500) +2. This gives chronological lifecycle: create → updates → deletes +3. 
For each event of interest: hermes_get_event → full request/response +4. Build timeline: who did what, when, and the exact changes made +``` + +### Discovery — what's tracked? + +``` +1. hermes_list_attributes(attribute_name="target_type") → all audited resource types +2. hermes_list_attributes(attribute_name="action") → all tracked actions +3. hermes_list_attributes(attribute_name="outcome") → valid outcome values +4. Use results to construct precise queries +``` + +## Troubleshooting + +### No events found + +Most common causes (check in order): + +1. **Wrong target_type format** — Must be slash format: `compute/server` not `nova/server` or `server`. Call `hermes_list_attributes` to verify. +2. **Time range too narrow** — Expand `time_gte`/`time_lte` range. Events for old resources may be outside default window. +3. **Resource never audited** — Not all internal operations generate events. Read-only operations (list, get) may not be tracked for all services. +4. **Ingestion delay** — If the action just happened, wait 30-60 seconds. +5. **Wrong project scope** — Hermes returns events scoped to the authenticated project. Events in other projects are invisible. + +### Too many results + +1. Add `target_type` filter to narrow to specific service +2. Add time range (`time_gte`/`time_lte`) to bound the window +3. Add `action` filter if looking for specific operations (e.g., only `delete`) +4. Add `outcome` filter if only interested in failures + +### Event detail missing attachments + +Not all events include request/response attachments. Simple actions (delete, start, stop) may have minimal or no attachments. Update events typically include the changed fields. + +## Security + +Audit data is sensitive. 
It reveals: +- **Who** performed actions (usernames, technical accounts) +- **What** they did (including potentially destructive operations) +- **When** they were active (activity patterns) +- **Which resources** they accessed (infrastructure topology) + +Only query audit data scoped to the authenticated project. Do not expose audit data containing other users' actions or resource details without confirming the requester has legitimate need. The MCP server enforces project-scoped access, but be judicious in what you surface. + +## Cross-Service References + +| Need | Service | Tool | +|------|---------|------| +| Server details for compute/server events | Nova | `nova_get_server()` | +| Port details for network/port events | Neutron | `neutron_list_ports` | +| Volume details for volume events | Cinder | `cinder_get_volume()` | +| Who is the initiator (token context) | Keystone | `keystone_token_info` | +| Resource quota impact of actions | Limes | `limes_get_project_quota` | + +## Routing + +| User need | Action | +|-----------|--------| +| Understanding CADF event structure | Read [cadf-event-format.md](references/cadf-event-format.md) | diff --git a/plugins/sapcc/skills/sapcc-audit/references/cadf-event-format.md b/plugins/sapcc/skills/sapcc-audit/references/cadf-event-format.md new file mode 100644 index 0000000..a472dfc --- /dev/null +++ b/plugins/sapcc/skills/sapcc-audit/references/cadf-event-format.md @@ -0,0 +1,195 @@ +# CADF Event Format Reference + +CADF (Cloud Auditing Data Federation) is the DMTF standard used by Hermes to structure audit events. Every action in SAP Converged Cloud generates a CADF event. 
+ +## Event Structure + +```json +{ + "id": "event-uuid", + "eventType": "activity", + "eventTime": "2024-03-15T14:22:01.234Z", + "action": "update", + "outcome": "success", + "initiator": { + "id": "user-uuid", + "name": "I810033", + "typeURI": "service/security/account/user", + "domain_id": "domain-uuid", + "project_id": "project-uuid" + }, + "target": { + "id": "resource-uuid", + "typeURI": "compute/server", + "name": "my-server-01", + "project_id": "project-uuid" + }, + "observer": { + "id": "observer-uuid", + "typeURI": "service/compute", + "name": "nova" + }, + "attachments": [ + { + "name": "payload", + "typeURI": "mime:application/json", + "content": "{\"server\": {\"name\": \"new-name\"}}" + } + ], + "requestPath": "/v2.1/servers/resource-uuid" +} +``` + +## Field Definitions + +### Top-Level Fields + +| Field | Type | Description | +|-------|------|-------------| +| `id` | UUID | Unique event identifier. Use with `hermes_get_event`. | +| `eventType` | string | Always `activity` for API actions. | +| `eventTime` | ISO 8601 | UTC timestamp of when the action occurred. | +| `action` | string | The operation: `create`, `update`, `delete`, `read`, `authenticate`, etc. | +| `outcome` | string | `success`, `failure`, or `pending`. | +| `requestPath` | string | The API endpoint path that was called. | + +### Initiator (Who) + +| Field | Description | +|-------|-------------| +| `initiator.id` | Keystone user UUID | +| `initiator.name` | Human-readable username (this is what you filter on) | +| `initiator.typeURI` | Always `service/security/account/user` for human users | +| `initiator.domain_id` | Domain the user belongs to | +| `initiator.project_id` | Project scope of the action | + +### Target (What Was Acted On) + +| Field | Description | +|-------|-------------| +| `target.id` | UUID of the resource (server, port, volume, etc.) 
| +| `target.typeURI` | Resource type in slash format (see table below) | +| `target.name` | Human-readable resource name (if available) | +| `target.project_id` | Project that owns the resource | + +### Observer (Which Service Recorded It) + +| Field | Description | +|-------|-------------| +| `observer.id` | Service instance UUID | +| `observer.typeURI` | Service type (e.g., `service/compute`) | +| `observer.name` | Service name (e.g., `nova`, `neutron`) | + +### Attachments (Request/Response Details) + +| Field | Description | +|-------|-------------| +| `attachments[].name` | Typically `payload` or `request`/`response` | +| `attachments[].typeURI` | MIME type, usually `mime:application/json` | +| `attachments[].content` | JSON string of the request body or response | + +Attachments are the key to answering "what exactly changed?" — they contain the API request payload showing which fields were modified and to what values. + +## Common target_type Values + +### Compute (Nova) + +| target_type | Resource | +|-------------|----------| +| `compute/server` | Virtual machine instance | +| `compute/keypair` | SSH keypair | +| `compute/server-group` | Server anti-affinity group | +| `compute/flavor` | Instance type definition | + +### Networking (Neutron) + +| target_type | Resource | +|-------------|----------| +| `network/port` | Virtual network interface | +| `network/network` | Virtual network | +| `network/subnet` | IP subnet | +| `network/router` | Virtual router | +| `network/security-group` | Security group | +| `network/security-group-rule` | Individual firewall rule | +| `network/floatingip` | Floating IP address | + +### Identity (Keystone) + +| target_type | Resource | +|-------------|----------| +| `identity/project` | Project/tenant | +| `identity/user` | User account | +| `identity/role-assignment` | Role grant/revoke | +| `identity/application-credential` | App credential | +| `identity/OS-TRUST/trust` | Trust delegation | + +### Block Storage (Cinder) 
+ +| target_type | Resource | +|-------------|----------| +| `storage/volume` | Block volume | +| `storage/snapshot` | Volume snapshot | +| `storage/backup` | Volume backup | + +### DNS (Designate) + +| target_type | Resource | +|-------------|----------| +| `dns/zone` | DNS zone | +| `dns/recordset` | DNS record set | + +### Load Balancing (Octavia) + +| target_type | Resource | +|-------------|----------| +| `load-balancer/loadbalancer` | Load balancer | +| `load-balancer/listener` | LB listener | +| `load-balancer/pool` | LB backend pool | +| `load-balancer/member` | Pool member | + +### Object Storage (Swift) + +| target_type | Resource | +|-------------|----------| +| `object-store/container` | Swift container | +| `object-store/object` | Stored object | + +## Common action Values + +| Action | Meaning | +|--------|---------| +| `create` | New resource created | +| `update` | Existing resource modified | +| `delete` | Resource removed | +| `read` | Resource details retrieved (not always tracked) | +| `authenticate` | Login/token creation | +| `start` | Server/service started | +| `stop` | Server/service stopped | +| `reboot` | Server rebooted | +| `attach` | Volume/port attached | +| `detach` | Volume/port detached | +| `resize` | Server flavor changed | +| `migrate` | Server moved to different host | + +## Querying Tips + +**By resource lifecycle:** +``` +target_id=<resource-uuid>, sort=time:asc → full creation-to-deletion history +``` + +**By user activity:** +``` +initiator_name=<username>, time_gte=<start-time> → all actions by user in window +``` + +**By failure investigation:** +``` +outcome=failure, target_type=compute/server → all failed compute operations +``` + +**By security review:** +``` +action=authenticate, outcome=failure → failed login attempts +action=delete, target_type=identity/role-assignment → permission removals +``` diff --git a/plugins/sapcc/skills/sapcc-compute/SKILL.md b/plugins/sapcc/skills/sapcc-compute/SKILL.md new file mode 100644 index 0000000..2c6fe35 --- 
/dev/null +++ b/plugins/sapcc/skills/sapcc-compute/SKILL.md @@ -0,0 +1,165 @@ +--- +name: sapcc-compute +description: >- + Manage compute instances on SAP Converged Cloud. Covers server lifecycle, + flavor selection, status transitions, and cross-service correlation with + networking and storage. Use when: listing VMs, checking server status, + debugging instance issues, performing server actions (start/stop/reboot), + selecting flavors, or investigating why a server won't start. + NOT for container workloads (use sapcc-registry) or bare metal. +version: 1.0.0 +metadata: + service: [nova] + task: [list, inspect, debug, lifecycle] + persona: [developer, platform-engineer] +--- + +# SAP CC Compute (Nova) + +## MCP Tools + +| Tool | Purpose | +|------|---------| +| `nova_list_servers` | List instances. Filters: `status`, `name` (regex), `limit`. Returns ID, name, status, addresses. | +| `nova_get_server` | Full detail by UUID: addresses, flavor, image, host_id, metadata, created/updated timestamps. | +| `nova_list_flavors` | Available instance types with vCPUs, RAM (MiB), disk (GiB). Use for sizing decisions. | +| `nova_server_action` | Lifecycle actions: `start`, `stop`, `reboot` (type: SOFT/HARD), `pause`, `unpause`, `suspend`, `resume`. | + +## Gotchas + +1. **Check quota before creating.** Nova returns generic 403 or 409 when quota is exhausted — no helpful message. Always call `limes_get_project_quota` for `compute` resources (instances, cores, ram) before any create/resize operation. The error "Quota exceeded" is often not in the Nova response at all. + +2. **SHUTOFF still consumes quota.** A stopped server (`SHUTOFF` status) continues to consume `instances` and `cores` quota. Only `DELETED` servers release quota. Users who "stopped" servers expecting freed capacity will be confused — clarify this distinction. + +3. 
**Ports are not in the Nova response.** The `addresses` field on a server shows IP addresses grouped by network name, but does NOT include port UUIDs, MAC addresses, or security group assignments. To get port details, call `neutron_list_ports` with `device_id=<server-uuid>`. This is the only way to find which security groups apply to a server. + +4. **SAP CC flavor naming conventions.** Flavors follow the pattern `<family><generation>_<size>` (e.g., `m2_xlarge`). Families: `m` = general purpose, `r` = memory-optimized, `c` = compute-optimized. Generations increment (1, 2, 3...). See `references/flavor-families.md` for the full matrix. Do not guess flavor names — always call `nova_list_flavors` to confirm availability in the current region. + +5. **Server actions are asynchronous.** Calling `nova_server_action` returns 202 immediately. The server transitions through intermediate states (e.g., `REBOOT` → `ACTIVE`). You must poll with `nova_get_server` to confirm the action completed. Typical transitions take 5-30 seconds but can take minutes for large instances. + +6. **HARD reboot is destructive.** A `HARD` reboot is equivalent to pulling the power cord — in-flight I/O is lost, filesystems may corrupt. Always attempt `SOFT` reboot first (sends ACPI shutdown signal). Only escalate to `HARD` if the guest OS is unresponsive. Confirm with the user before issuing HARD reboot. + +7. **ERROR state requires admin intervention.** Servers in `ERROR` status cannot be recovered via `nova_server_action`. Common causes: host failure, scheduler error, failed live-migration. The user must contact their cloud admin or file a support ticket. Do not attempt repeated actions on ERROR servers. + +8. **Addresses field structure varies by network.** The `addresses` response is keyed by network name, with each entry containing `addr` (IP), `version` (4/6), and `OS-EXT-IPS:type` (`fixed` or `floating`). Multiple networks produce multiple keys. 
Do not assume a single-network structure. + +9. **Server metadata is not automatically populated.** Nova metadata is user-supplied key-value pairs. Do not expect metadata to contain project info, cost center, or ownership unless the user's automation sets it. The `host_id` is an opaque hash — it identifies co-location but is not a hostname. + +10. **Name filter is not exact match.** `nova_list_servers` with a `name` filter uses regex-style matching. Searching for `name=web` returns `web-1`, `web-prod`, `my-web-server`, etc. For exact matches, filter results client-side after retrieval. + +## Common Workflows + +### List and Inspect Servers + +``` +1. nova_list_servers (optionally filter by status or name) +2. For each server needing detail: nova_get_server with its UUID +3. For network info: neutron_list_ports with device_id=<server-uuid> +``` + +### Get Full Server + Network Picture + +``` +1. nova_get_server → note addresses (IPs) and server UUID +2. neutron_list_ports with device_id=<server-uuid> → port UUIDs, MAC, security groups +3. For each security group ID: neutron_list_security_groups for rules +``` + +This gives the complete picture: server → IPs → ports → security groups → rules. + +### Debug: Server Won't Start + +``` +1. nova_get_server → check current status + - SHUTOFF: try nova_server_action start + - ERROR: inform user, admin required + - BUILD: still provisioning, wait + - PAUSED/SUSPENDED: unpause/resume first +2. If start fails with 409: limes_get_project_quota → check compute quota +3. If quota ok: hermes_list_events with target_id=<server-uuid> → recent errors +4. If no audit clues: maia_query for host-level issues +``` + +### Perform Server Action Safely + +``` +1. 
nova_get_server → confirm current status allows the action + Valid transitions: + - start: SHUTOFF → ACTIVE + - stop: ACTIVE → SHUTOFF + - reboot (SOFT): ACTIVE → ACTIVE (via REBOOT) + - reboot (HARD): any running state → ACTIVE (destructive) + - pause: ACTIVE → PAUSED + - unpause: PAUSED → ACTIVE + - suspend: ACTIVE → SUSPENDED + - resume: SUSPENDED → ACTIVE +2. For destructive actions (stop, HARD reboot): confirm with user +3. nova_server_action with action and server_id +4. Poll nova_get_server until status reaches target (or timeout after 2 min) +``` + +### Select a Flavor + +``` +1. nova_list_flavors → get available flavors in region +2. Match requirements to flavor family: + - Balanced workload → m-series (general purpose) + - Database/cache → r-series (memory-optimized) + - Batch/CI → c-series (compute-optimized) +3. Check limes_get_project_quota to ensure cores/ram headroom +4. If flavor not found: may not be available in this region or project +``` + +## Troubleshooting + +### Stuck in BUILD + +Server has been in \`BUILD\` status for >10 minutes. + +- **Typical cause**: Scheduler couldn't place it (no host with capacity), or image download is slow. +- **Diagnostic**: \`hermes_list_events\` filtered to \`target.id=<server_uuid>\` — look for \`compute/server/create\` with outcome \`pending\` or \`failure\`. +- **Resolution**: If no events after 15 min, likely a scheduler issue — admin intervention required. Do not delete-and-retry without checking quota first. + +### ERROR After Resize + +Server entered ERROR after a resize or migrate operation. + +- **Typical cause**: Target host ran out of disk during resize, or live-migration timed out. +- **Diagnostic**: \`hermes_list_events\` for recent \`compute/server/resize\` or \`compute/server/migrate\` events. Check outcome field. +- **Resolution**: Admin must reset the server state. User cannot self-service from ERROR. + +### Can't Reach Server via Network + +Server is ACTIVE but unreachable. + +- **Diagnostic steps**: + 1. 
\`nova_get_server\` → confirm status is ACTIVE (not PAUSED/SUSPENDED) + 2. Check addresses: does it have a floating IP? Fixed IPs are only reachable from within the VPC/network. + 3. \`neutron_list_ports\` with \`device_id\` → check port \`status\` (should be \`ACTIVE\`, not \`DOWN\` or \`BUILD\`) + 4. Check security groups on the port → ensure ingress rules allow the traffic (SSH=22, ICMP, etc.) + 5. If port is DOWN: may be a binding failure — check \`hermes_list_events\` for port-related events + +## Security Considerations + +- **Confirm destructive actions**: Always ask user confirmation before \`stop\`, \`HARD reboot\`, or any action that interrupts service. State what will happen: "This will immediately power off the instance, dropping all connections." +- **Metadata visibility**: Server metadata is visible to anyone with \`compute:server:show\` permission in the project. Do not store secrets, credentials, or PII in metadata. +- **Host ID is semi-sensitive**: While opaque, \`host_id\` reveals co-location (same hash = same hypervisor). Avoid exposing it in shared contexts without need. +- **Audit trail**: All server actions generate Hermes events. Inform users that actions are logged with their credential identity. +- **Cross-project access**: Nova operations are scoped to the authenticated project. You cannot see or act on servers in other projects without re-scoping credentials. 
+ +## Cross-Service References + +| Need | Service | Tool | +|------|---------|------| +| Network interfaces for a server | Neutron | `neutron_list_ports(device_id=<server_uuid>)` | +| Attached volumes | Cinder | `cinder_list_volumes` → filter by attachments[].server_id | +| Quota before creating | Limes | `limes_get_project_quota(service=compute)` | +| Who modified this server | Hermes | `hermes_list_events(target_type=compute/server, target_id=<server_uuid>)` | +| CPU/memory metrics | Maia | `maia_query` with `vm_cpu_seconds_total`, `vm_memory_usage_bytes` | +| Security groups on ports | Neutron | `neutron_list_ports` → then `neutron_list_security_groups` | + +## Routing + +| User need | Action | +|-----------|--------| +| Flavor naming and selection | Read [flavor-families.md](references/flavor-families.md) | diff --git a/plugins/sapcc/skills/sapcc-compute/references/flavor-families.md b/plugins/sapcc/skills/sapcc-compute/references/flavor-families.md new file mode 100644 index 0000000..1416ab1 --- /dev/null +++ b/plugins/sapcc/skills/sapcc-compute/references/flavor-families.md @@ -0,0 +1,61 @@ +# SAP CC Flavor Families + +## Naming Convention + +``` +<family><generation>_<size> +``` + +Examples: `m2_xlarge`, `r1_large`, `c3_2xlarge` + +## Families + +| Prefix | Family | Optimized For | Use Case | +|--------|--------|---------------|----------| +| `m` | General Purpose | Balanced CPU:RAM ratio | Web servers, application servers, dev/test | +| `r` | Memory-Optimized | High RAM per vCPU | Databases, in-memory caches, SAP HANA sidecars | +| `c` | Compute-Optimized | High vCPU per RAM | Batch processing, CI/CD workers, HPC | + +## Generations + +Higher generation = newer hardware and potentially better price/performance. +Not all generations are available in all regions. 
+ +| Generation | Typical Hardware | Notes | +|------------|-----------------|-------| +| 1 | Legacy (may be deprecated) | Check availability before recommending | +| 2 | Current default | Safe choice for most workloads | +| 3 | Latest | Best performance, may have limited availability | + +## Sizes + +Sizes scale within a family/generation. Exact vCPU/RAM values vary by family — always confirm with `nova_list_flavors`. + +| Size | Relative Scale | +|------|---------------| +| `small` | Minimum viable | +| `medium` | 2x small | +| `large` | 2x medium | +| `xlarge` | 2x large | +| `2xlarge` | 2x xlarge | +| `4xlarge` | 2x 2xlarge | +| `8xlarge` | 2x 4xlarge (where available) | + +## Selection Guidance + +| Workload | Recommended Family | Why | +|----------|--------------------|-----| +| Generic web app | `m2_large` or `m2_xlarge` | Balanced, cost-effective | +| PostgreSQL / MySQL | `r2_large` or larger | Databases need RAM for buffer pools | +| Redis / Memcached | `r2_xlarge` or larger | In-memory stores are RAM-bound | +| CI runners | `c2_large` | Build jobs are CPU-bound, transient | +| Kubernetes nodes | `m2_2xlarge` or larger | Need headroom for pod scheduling | +| Batch ETL | `c2_xlarge` | CPU-bound, short-lived | + +## Important Notes + +- **Always verify with `nova_list_flavors`** — flavor availability varies by region and project. +- **Flavor IDs are UUIDs**, not the human-readable names. The API accepts both, but responses use UUIDs. +- **Some flavors are project-scoped** — your project may have access to custom flavors not visible globally. +- **Deprecated flavors** may still appear in listings but cannot be used for new instances. Check if existing servers use deprecated flavors before recommending resize. +- **RAM is in MiB** in flavor definitions. A flavor showing `ram: 8192` means 8 GiB. 
diff --git a/plugins/sapcc/skills/sapcc-connectivity/SKILL.md b/plugins/sapcc/skills/sapcc-connectivity/SKILL.md new file mode 100644 index 0000000..5a78a81 --- /dev/null +++ b/plugins/sapcc/skills/sapcc-connectivity/SKILL.md @@ -0,0 +1,154 @@ +--- +name: sapcc-connectivity +description: > + Archer endpoint service management for private network connectivity in SAP Converged Cloud. + Triggers: endpoint service, private link, archer, private connectivity, service endpoint, internal access, cross-project access +version: 1.0.0 +metadata: + service: [archer] + task: [discover, connect, debug] + persona: [developer, platform-engineer] +--- + +# SAP CC Private Connectivity (Archer) + +Archer is SAP CC's endpoint service for private network connectivity between projects. Similar to AWS PrivateLink. Not part of vanilla OpenStack. Enables consumers to access services published by other projects via a local IP address without traversing public networks. + +## MCP Tools + +| Tool | Purpose | Required Params | +|------|---------|-----------------| +| `archer_list_services` | List services available for endpoint creation | (optional: `status`) | +| `archer_list_endpoints` | List endpoints in current project | (optional: `service_id`, `status`) | +| `archer_get_service` | Get service detail by UUID | `service_id` | +| `archer_get_endpoint` | Get endpoint detail by UUID | `endpoint_id` | + +## Archer Model + +``` +Producer (other project) Consumer (your project) +┌─────────────────────┐ ┌─────────────────────┐ +│ Network Resource │ │ Endpoint │ +│ (DB, API, etc.) │◄──private──────│ (local IP in YOUR │ +│ │ connection │ network) │ +│ Published as │ │ │ +│ SERVICE │ │ Created by YOU │ +└─────────────────────┘ └─────────────────────┘ +``` + +**Producer/Consumer pattern:** +- **Service** = a network resource published by another project (the producer). You cannot create services with these tools — you consume them. +- **Endpoint** = your access point. 
Created in your project, provides a local IP address on your network that routes privately to the service. +- Traffic stays on the internal network fabric. No public IPs, no NAT, no firewall traversal. + +## Gotchas + +### 1. You create endpoints, not services + +Services are published by OTHER projects (producers). Your project is a consumer. You discover available services with `archer_list_services`, then create endpoints to connect to them. If you don't see a service you expect, the producer hasn't published it yet or hasn't made it available to your project. + +### 2. Endpoint status transitions matter + +``` +PENDING_APPROVAL → AVAILABLE (happy path) +PENDING_APPROVAL → REJECTED (producer denied access) +``` + +An endpoint is NOT usable until status is `AVAILABLE`. The local IP won't route traffic until the endpoint is fully active. + +### 3. Some services require producer approval + +Not all services auto-approve endpoints. The producer can configure their service to require manual approval of each consumer endpoint. If your endpoint is stuck in `PENDING_APPROVAL`, the producer must approve it — this is not something you can resolve yourself. + +### 4. The endpoint IP is YOUR access point + +The endpoint provides a local IP address in your network. Use that IP (or DNS pointing to it) to reach the remote service. Don't try to reach the service's original IP — the whole point is that you access it through your local endpoint IP. + +### 5. List services BEFORE creating endpoints + +Always call `archer_list_services` first to discover what's available. Services have UUIDs that you need for endpoint creation. Guessing service IDs will fail. + +### 6. Correlate endpoint to service for full picture + +An endpoint's `service_id` tells you which service it connects to. Use `archer_get_service` with that UUID to get the service details (what it is, who provides it, what network resource it exposes). + +### 7. 
Endpoints are project-scoped + +You only see your own project's endpoints with `archer_list_endpoints`. You cannot see other projects' endpoints. But `archer_list_services` shows services available to you regardless of which project published them. + +## Common Workflows + +### "What services can I connect to?" + +``` +1. archer_list_services() +2. Review available services — note name, description, service_id +3. archer_get_service(service_id) for details on a specific service +``` + +Filter by status to see only active services: `archer_list_services(status="AVAILABLE")` + +### "What are my current connections?" + +``` +1. archer_list_endpoints() +2. Review each endpoint: status, service_id, IP address +3. For any with status != AVAILABLE, investigate +``` + +### "Get full details about a connection" + +``` +1. archer_get_endpoint(endpoint_id) → endpoint details including service_id +2. archer_get_service(service_id) → what service it connects to +3. Correlate: you now know your local IP and what remote resource it reaches +``` + +### Troubleshooting connectivity + +``` +1. archer_list_endpoints(service_id=<service_id>) → find your endpoint +2. Check endpoint status: + - AVAILABLE → endpoint is fine, problem is elsewhere (DNS, security groups, application) + - PENDING_APPROVAL → not active yet, contact producer + - REJECTED → producer denied, contact them + - ERROR → platform issue, escalate +3. 
Verify you're connecting to the endpoint's local IP, not the service's original IP +``` + +## Troubleshooting + +### Endpoint stuck in PENDING_APPROVAL + +- The service requires manual approval from the producer project +- You cannot approve it yourself +- Action: contact the team that owns the service and ask them to approve your endpoint +- Check service details with `archer_get_service` to identify the producer + +### Endpoint status is REJECTED + +- The producer explicitly denied your endpoint request +- Possible reasons: wrong project, policy violation, service decommissioned +- Action: contact the service producer to understand why and resolve + +### Service not found in archer_list_services + +- The service may not exist yet (producer hasn't published it) +- The service may not be available to your project (producer restricts visibility) +- The service may have been deleted or is in a non-AVAILABLE status +- Try without status filter to see services in all states + +### Endpoint shows AVAILABLE but traffic doesn't flow + +- Verify you're using the endpoint's local IP, not the service's backend IP +- Check security groups on your port/network — they still apply to endpoint traffic +- Verify the service itself is healthy (the endpoint is just the tunnel — if the backend is down, traffic won't work) + +## Security + +Private connectivity via Archer reduces attack surface: +- Traffic never touches public networks — no exposure to internet-based threats +- Endpoints are project-scoped — only your project can use your endpoints +- No public IPs required — the service is reachable only via private endpoint IP +- Producer controls access — services can require approval before granting connectivity +- Principle of least exposure — only the specific service is reachable, not the entire producer network diff --git a/plugins/sapcc/skills/sapcc-identity/SKILL.md b/plugins/sapcc/skills/sapcc-identity/SKILL.md new file mode 100644 index 0000000..4d72ab2 --- /dev/null +++ 
b/plugins/sapcc/skills/sapcc-identity/SKILL.md @@ -0,0 +1,188 @@ +--- +name: sapcc-identity +description: >- + SAP Converged Cloud identity and access management via Keystone. Domain/project + hierarchy, authentication context, application credentials, role assignments, + and service catalog interpretation. Use when: project, domain, authentication, + roles, token, application credential, keystone, who am I, service catalog. +version: 1.0.0 +metadata: + service: [keystone] + task: [auth, debug, manage, discover] + persona: [developer, platform-engineer, security] +--- + +# SAP CC Identity (Keystone) + +## MCP Tools + +| Tool | Purpose | Key Parameters | +|------|---------|----------------| +| `keystone_token_info` | Current auth context: user, project, domain, roles, catalog | None | +| `keystone_list_projects` | List accessible projects | `domain_id`, `name` (optional filters) | +| `keystone_create_application_credential` | Create app credential (secret shown once) | `name`, `description`, `expires_at`, `roles` | +| `keystone_list_application_credentials` | List app creds for current user | None | +| `keystone_delete_application_credential` | Delete/revoke an app credential | `id` or `name` | + +## SAP CC Domain Model + +``` +Region (eu-de-1) +└── Domain (organizational unit, e.g. "cc-demo") + ├── Project A (resource container) + │ └── Role assignments (member, admin, network_admin...) + ├── Project B + └── Project C +``` + +**Key facts:** +- A domain is an organizational boundary (typically maps to a team or cost center) +- Each project belongs to exactly one domain +- Roles are assigned per-project (a user can be admin in Project A, member in Project B) +- Regions are fully independent — credentials, projects, and catalogs don't cross regions +- Domain names follow convention: `cc-<name>` (e.g., `cc-demo`, `cc-platform`) + +For deeper explanation, see `references/domain-project-model.md`. + +## Gotchas + +1. 
**token_info never exposes the actual token value** — only metadata (user, project, roles, catalog). This is security by design in the MCP server. Don't tell users you can show them their token. + +2. **App credential secret is shown ONLY at creation time** — if the user loses it, the only recovery is delete + recreate. Always instruct users to store the secret immediately (keychain). + +3. **App credentials inherit roles at creation time** — role changes to the user after creation do NOT affect existing app credentials. To pick up new roles: delete old credential, create new one. + +4. **Always list existing app credentials before creating** — avoids duplicates. Use `keystone_list_application_credentials` first, check if one with the intended name already exists. + +5. **App credential names are unique per user, not globally** — two different users can have an app credential named `mcp-server`. But the same user cannot have two with the same name. + +6. **Deleting an app credential immediately revokes access** — any service (including the MCP server itself) using that credential will fail on the next API call. Always create the replacement first. + +7. **domain_id is required for cross-domain project listing** — without it, `keystone_list_projects` only returns projects in your token's current domain scope. + +8. **Service catalog in token_info is region-specific** — it shows only services available in the current region. Different regions may have different service availability. + +9. **App credentials cannot create other app credentials** — unless created with `unrestricted: true` (which most deployments disallow). This prevents credential escalation chains. + +10. **Project scope determines what you see** — your token is scoped to one project. All API calls operate within that project's context. To work across projects, you need separate credentials per project. + +## Common Workflows + +### Check Current Auth Context + +"Who am I? What project? What can I do?" 
+ +``` +1. keystone_token_info + → Returns: user name, user domain, project name, project domain, + role assignments, service catalog, token expiry +2. Interpret roles to determine access level +3. Review service catalog to see available services +``` + +### Create Application Credential for MCP Server + +``` +1. keystone_list_application_credentials + → Check if one already exists with intended name +2. keystone_create_application_credential + name: "mcp-server-<project>-<region>" (e.g., mcp-server-cc-demo-qa-de-1) + description: "MCP server credential for <project> in <region>" + expires_at: "2027-05-06T00:00:00Z" (recommend 1 year dev, 90 days prod) +3. IMMEDIATELY store the secret in keychain: + macOS: security add-generic-password -a "<account>" -s "openstack-appcred" -w "<secret>" + Linux: secret-tool store --label="<label>" service openstack-appcred account "<account>" +4. Configure MCP server with credential ID + secret retrieval command +5. Verify with keystone_token_info after restart +``` + +### Rotate Credentials + +Order matters — create new BEFORE deleting old: + +``` +1. keystone_list_application_credentials → identify the old credential +2. keystone_create_application_credential → new credential with new name/suffix +3. Store new secret in keychain +4. Update MCP server config to use new credential +5. Restart MCP server +6. keystone_token_info → verify new credential works +7. keystone_delete_application_credential → remove old credential ONLY after verification +8. Remove old secret from keychain +``` + +### List Accessible Projects + +``` +1. keystone_list_projects + → Shows projects in current domain +2. For cross-domain: keystone_list_projects with domain_id filter + → Need to know the target domain_id (not name) +3. Cross-reference with keystone_token_info to see current project scope +``` + +### Discover Available Services + +``` +1. keystone_token_info → service_catalog section +2. 
Each catalog entry contains: + - type (e.g., "compute", "resources", "audit-data") + - name (e.g., "nova", "limes", "hermes") + - endpoints with region and URL +3. Use catalog to determine which MCP tools are usable in current region +``` + +## Troubleshooting + +### "Authentication failed" or 401 errors + +| Cause | Diagnostic | Fix | +|-------|-----------|-----| +| App credential deleted/expired | `keystone_list_application_credentials` — is it still there? | Create new credential | +| Wrong project scope | `keystone_token_info` — check project name | Update `OS_PROJECT_NAME` or recreate credential | +| User account disabled | Login to dashboard — is account active? | Contact domain admin | +| Wrong region | Check `OS_AUTH_URL` matches `OS_REGION_NAME` | Fix auth URL | + +### "Insufficient permissions" or 403 errors + +| Cause | Diagnostic | Fix | +|-------|-----------|-----| +| Missing role | `keystone_token_info` → check roles list | Request role from project admin | +| App credential has subset of roles | List app cred → check roles field | Delete and recreate with needed roles | +| Wrong project | `keystone_token_info` → check project | Switch to correct project | + +### "Project not found" + +- Verify domain scope — cross-domain listing requires `domain_id` +- Check spelling: project names are case-sensitive +- Confirm the project exists in this region (regions are independent) + +### "App credential name already exists" + +- Names are unique per user — list existing creds to find the conflict +- Delete the old one (if no longer needed) or choose a different name + +## Security + +| Principle | Implementation | +|-----------|---------------| +| Never expose tokens | MCP server sanitizes all responses — token values never reach the LLM | +| App creds over passwords | Scoped, revocable, no password exposure — always prefer | +| Keychain storage | Use `OS_APPCRED_SECRET_CMD` pattern — never plaintext in config files | +| Set expiration | Forces periodic 
rotation, limits blast radius of compromised creds | +| Minimum roles | Create app credentials with only the roles needed for the task | +| One credential per purpose | Easy to revoke without disrupting other services | +| Verify before deleting | Always confirm new credential works before revoking the old one | + +## Cross-Service References + +- **credential-setup skill** — detailed guided workflow for first-time credential creation +- **sapcc-quota** — check `limes_get_project_quota` to see what resources your project can use +- **sapcc-audit** — use `hermes_list_events` with `initiator_name` to see who did what in a project +- **Service catalog** — determines which other MCP tools (nova_, neutron_, limes_, etc.) are available + +## Routing + +| User need | Action | +|-----------|--------| +| Domain and project hierarchy details | Read [domain-project-model.md](references/domain-project-model.md) | diff --git a/plugins/sapcc/skills/sapcc-identity/references/domain-project-model.md b/plugins/sapcc/skills/sapcc-identity/references/domain-project-model.md new file mode 100644 index 0000000..5fcd3be --- /dev/null +++ b/plugins/sapcc/skills/sapcc-identity/references/domain-project-model.md @@ -0,0 +1,131 @@ +# SAP Converged Cloud Domain-Project Model + +## Hierarchy + +``` +SAP Converged Cloud +├── Region: eu-de-1 (independent OpenStack deployment) +│ ├── Domain: cc-demo +│ │ ├── Project: demo-app-dev +│ │ ├── Project: demo-app-staging +│ │ └── Project: demo-app-prod +│ ├── Domain: cc-platform +│ │ ├── Project: platform-monitoring +│ │ └── Project: platform-shared +│ └── Domain: cc-network +│ └── Project: network-infra +├── Region: eu-de-2 (completely separate) +│ ├── Domain: cc-demo (same name, different instance) +│ │ └── ... +│ └── ... +└── Region: na-us-1 + └── ... +``` + +## Concepts + +### Region + +An independent OpenStack deployment. 
Each region has: +- Its own Keystone (identity service) +- Its own service catalog +- Its own set of credentials +- No resource sharing across regions + +Region naming: `<area>-<country>-<number>` (e.g., `eu-de-1`, `qa-de-1`, `na-us-1`, `ap-jp-1`) + +### Domain + +An organizational unit within a region. Typically maps to: +- A team (e.g., `cc-platform`) +- A business unit (e.g., `cc-finance`) +- An environment grouping (e.g., `cc-demo`) + +Domain properties: +- Contains one or more projects +- Has domain-level quota (cap on sum of project quotas, via Limes) +- Users can be members of multiple domains +- Domain admins manage projects and role assignments within their domain + +Naming convention: `cc-<name>` (the `cc-` prefix is SAP CC convention, not enforced by Keystone) + +### Project + +The fundamental resource container. All OpenStack resources (servers, networks, volumes) belong to a project. + +Project properties: +- Belongs to exactly one domain +- Has its own quota allocation (from domain's quota pool, managed by Limes) +- Role assignments are per-project +- Isolated from other projects (network, compute, storage) + +### Roles + +Roles determine what actions a user can perform within a project scope. 
+ +Common SAP CC roles: +| Role | Typical Permissions | +|------|-------------------| +| `admin` | Full control over project resources | +| `member` | Create/manage own resources | +| `reader` | Read-only access to project resources | +| `network_admin` | Manage networks, subnets, security groups | +| `compute_admin` | Manage servers, flavors (project-scoped) | +| `audit_viewer` | Read audit events (Hermes) | + +Key behaviors: +- Roles are additive (a user can have multiple roles) +- Roles are project-scoped (admin in Project A does not mean admin in Project B) +- App credentials inherit the creating user's roles at creation time +- Domain admin is a separate concept from project admin + +## Authentication Scoping + +When you authenticate to SAP CC, your token is scoped to: +- One user +- One project (and by extension, one domain) +- A set of roles (for that project) + +This means: +- All API calls operate in the context of that one project +- You cannot "switch projects" without re-authenticating (or using a different credential) +- For multi-project workflows, you need one app credential per project + +## Relationship to MCP Server + +The MCP server authenticates with a single set of credentials, which means: +- It operates in one project at a time +- `keystone_token_info` shows the current scope +- `keystone_list_projects` can show other projects you have access to +- To work in a different project, you need a different MCP server instance (or reconfigure) + +## Common Patterns + +### Multi-Project Access + +``` +MCP Server Instance 1: OS_PROJECT_NAME=demo-app-dev +MCP Server Instance 2: OS_PROJECT_NAME=demo-app-prod +``` + +Or use one credential and know you're scoped to one project. + +### Cross-Domain Discovery + +To list projects in another domain: +``` +keystone_list_projects(domain_id="<domain-uuid>") +``` + +Note: requires the domain ID (UUID), not the domain name. The domain ID can be found in `keystone_token_info` for your own domain. 
+ +### Credential Per Environment + +Recommended naming: +``` +mcp-server-<project>-<region> +``` + +Examples: +- `mcp-server-demo-app-dev-eu-de-1` +- `mcp-server-platform-monitoring-na-us-1` diff --git a/plugins/sapcc/skills/sapcc-metrics/SKILL.md b/plugins/sapcc/skills/sapcc-metrics/SKILL.md new file mode 100644 index 0000000..2ab1674 --- /dev/null +++ b/plugins/sapcc/skills/sapcc-metrics/SKILL.md @@ -0,0 +1,182 @@ +--- +name: sapcc-metrics +description: > + Maia metrics querying for SAP Converged Cloud (Prometheus-as-a-Service). + Triggers: metrics, monitoring, prometheus, promql, CPU usage, memory usage, maia, dashboard, alert, performance +version: 1.0.0 +metadata: + service: [maia] + task: [query, monitor, debug] + persona: [developer, devops] +--- + +# SAP CC Metrics (Maia) + +Maia is SAP CC's multi-tenant Prometheus-as-a-Service. Same PromQL query language, but tenant-isolated — each project sees only its own metrics. Read-only via MCP. + +## MCP Tools + +| Tool | Purpose | Required Params | +|------|---------|-----------------| +| `maia_metric_names` | List all available metric names for current project | (none) | +| `maia_label_values` | Get values for a specific label | `label` (e.g., `__name__`, `instance`, `job`) | +| `maia_query` | Execute instant PromQL query | `query` (optional: `time`) | + +## Maia vs Regular Prometheus + +- **Same PromQL** — all standard functions and operators work (rate, avg, sum, topk, etc.) +- **Tenant-isolated** — you only see metrics from your authenticated project +- **Available metrics vary** — depends on what's instrumented in your project (not all projects have the same metrics) +- **Read-only** — no recording rules, no alert configuration, no write path via MCP +- **Instant queries only** — maia_query executes point-in-time queries, not range queries + +## Gotchas + +### 1. Results are scoped to current project — you CANNOT query other projects' metrics + +Maia enforces tenant isolation via the OpenStack token. 
There is no way to query cross-project metrics. If you need fleet-wide data, check Limes for capacity or use cluster-level monitoring. + +### 2. maia_query is INSTANT query only + +No range queries (`query_range`) via this tool. You get a single point in time. Use the `time` parameter to query a historical point, but you cannot get a time series of values in one call. For "show me the last hour" requests — you can only sample individual points. + +### 3. Always start with maia_metric_names + +Before writing PromQL, discover what's available. Different projects have different metrics. Never assume a metric exists — verify first. + +### 4. maia_label_values with label="__name__" equals maia_metric_names + +Both return the list of available metric names. Use `maia_label_values` when you need values for other labels (instance, job, device, etc.). + +### 5. time parameter defaults to NOW + +Optional. Accepts RFC3339 (`2024-03-15T10:00:00Z`) or Unix timestamp (`1710500400`). Omitting it gives current values. For "what was CPU at 3am?" — pass the specific timestamp. + +### 6. PromQL syntax errors return generic errors + +The API does not give helpful parse errors. If you get an error, validate your query syntax independently before blaming connectivity or permissions. Common mistakes: missing brackets, unbalanced quotes, typos in metric names. + +### 7. Metric names vary by project + +Not all projects have the same instrumentation. A compute-heavy project might have `node_cpu_seconds_total` while a Kubernetes project has `container_cpu_usage_seconds_total`. Always discover first. + +### 8. Large result sets may be truncated + +Queries that match many series (e.g., `{__name__=~".+"}`) can hit response size limits. Use specific label selectors to narrow: `{instance="specific-host"}`, `{job="specific-job"}`. + +### 9. No aggregation across projects + +If the user asks "how much CPU is our whole team using?" — that requires cluster-level access or Limes. 
Maia only shows the current project's metrics. Redirect to quota tools for cross-project views. + +## Common Workflows + +### Discover available metrics + +``` +1. maia_metric_names → see what's instrumented +2. Pick relevant metric name +3. maia_label_values(label="instance") → find dimensions +4. maia_label_values(label="job") → understand metric sources +``` + +### "Is my server healthy?" + +``` +1. maia_metric_names → find CPU/memory/disk metrics +2. maia_label_values(label="instance") → identify the server +3. maia_query(query='node_cpu_seconds_total{instance="<instance>", mode="idle"}') +4. maia_query(query='node_memory_MemAvailable_bytes{instance="<instance>"}') +5. maia_query(query='node_filesystem_avail_bytes{instance="<instance>", mountpoint="/"}') +``` + +### "What's the current load?" + +``` +1. maia_query(query='rate(node_cpu_seconds_total{mode!="idle"}[5m])') +2. maia_query(query='avg by (instance) (rate(node_cpu_seconds_total{mode!="idle"}[5m]))') +``` + +Note: rate() requires a range vector but returns an instant vector — works with maia_query. + +### Troubleshooting a specific issue + +``` +1. maia_metric_names → find relevant metrics (grep mentally) +2. maia_label_values(label="<label_name>") → explore dimensions +3. Write targeted PromQL with specific label selectors +4. 
Interpret the value in context (e.g., bytes → GiB, ratio → percentage) +``` + +## PromQL Quick Reference + +Common patterns for SAP CC infrastructure metrics: + +| Pattern | PromQL | Use Case | +|---------|--------|----------| +| Rate of change | `rate(metric[5m])` | Counter metrics (CPU, network bytes) | +| Average by instance | `avg by (instance) (metric)` | Reduce cardinality | +| Top N | `topk(5, metric)` | Find highest consumers | +| Threshold check | `metric > 0.9` | Alert-style filtering | +| CPU usage % | `100 * (1 - rate(node_cpu_seconds_total{mode="idle"}[5m]))` | Per-core idle inverse | +| Memory usage % | `100 * (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))` | Available/total ratio | +| Disk usage % | `100 * (1 - (node_filesystem_avail_bytes / node_filesystem_size_bytes))` | Free/total ratio | +| Sum by label | `sum by (job) (metric)` | Aggregate across instances | +| Filter by regex | `metric{label=~"pattern.*"}` | Flexible label matching | + +See `references/promql-patterns.md` for extended pattern library. 
+ +## Troubleshooting + +### No metrics found (maia_metric_names returns empty) + +- Project has no instrumentation configured +- Token might be scoped to wrong project — verify with `keystone_token_info` +- The project may be new with nothing deployed yet + +### maia_query returns empty result + +- Metric name typo — copy-paste from maia_metric_names output +- Label selector too restrictive — remove labels and broaden +- Metric exists but no data at queried time — try without `time` param (defaults to now) +- Counter metric with rate() on too-short window — try `[10m]` instead of `[1m]` + +### Query syntax error + +- Unbalanced brackets: `rate(metric[5m]` missing closing `)` +- Wrong bracket type: range vectors use `[5m]` not `(5m)` +- Unquoted label value: must be `{label="value"}` with quotes +- Invalid duration: use `s`, `m`, `h`, `d` — not `sec`, `min` +- Metric name with dots: wrap in `{__name__="metric.with.dots"}` + +### Unexpected values + +- Counters always increase — use `rate()` or `increase()` to get meaningful values +- Gauge vs counter confusion — check if values only go up (counter) or fluctuate (gauge) +- Unit mismatch — bytes vs bits, MiB vs MB, seconds vs milliseconds +- `NaN` or `+Inf` — division by zero in your query or absent denominator series + +## Security + +Metrics reveal operational state: CPU patterns, memory pressure, disk growth rates, network traffic volumes, and deployment schedules. This data exposes: + +- Capacity planning (how close to limits) +- Traffic patterns (peak times, quiet periods) +- Infrastructure topology (which instances, what roles) +- Potential vulnerabilities (overloaded systems, resource exhaustion) + +Treat metric data as internal/confidential. Only query what's needed for the task at hand. 
+ +## Cross-Service References + +| Need | Service | Tool | +|------|---------|------| +| Map instance label to server name | Nova | `nova_get_server()` | +| Correlate resource usage with quota | Limes | `limes_get_project_quota` | +| Investigate anomalies in audit trail | Hermes | `hermes_list_events` | +| Network metrics correlation | Neutron | `neutron_list_ports(device_id=)` | + +## Routing + +| User need | Action | +|-----------|--------| +| PromQL query patterns for SAP CC | Read [promql-patterns.md](references/promql-patterns.md) | diff --git a/plugins/sapcc/skills/sapcc-metrics/references/promql-patterns.md b/plugins/sapcc/skills/sapcc-metrics/references/promql-patterns.md new file mode 100644 index 0000000..61f146f --- /dev/null +++ b/plugins/sapcc/skills/sapcc-metrics/references/promql-patterns.md @@ -0,0 +1,247 @@ +# PromQL Patterns for SAP CC / OpenStack Infrastructure + +## CPU Metrics + +### CPU utilization per instance (percentage) +```promql +100 * (1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) +``` + +### CPU breakdown by mode +```promql +avg by (mode) (rate(node_cpu_seconds_total[5m])) +``` + +### High CPU alert (>90%) +```promql +100 * (1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) > 90 +``` + +### CPU steal time (noisy neighbor detection) +```promql +rate(node_cpu_seconds_total{mode="steal"}[5m]) > 0.05 +``` + +## Memory Metrics + +### Memory usage percentage +```promql +100 * (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) +``` + +### Available memory in GiB +```promql +node_memory_MemAvailable_bytes / 1024 / 1024 / 1024 +``` + +### Memory pressure (low available) +```promql +(node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) < 0.1 +``` + +### Swap usage (indicates memory pressure) +```promql +node_memory_SwapTotal_bytes - node_memory_SwapFree_bytes +``` + +## Disk Metrics + +### Disk usage percentage by mountpoint +```promql +100 * (1 - node_filesystem_avail_bytes / 
node_filesystem_size_bytes) +``` + +### Disk filling prediction (hours until full at current rate) +```promql +node_filesystem_avail_bytes / (deriv(node_filesystem_avail_bytes[1h]) * -1) / 3600 +``` + +### Disk I/O utilization +```promql +rate(node_disk_io_time_seconds_total[5m]) +``` + +### Disk read/write throughput (bytes/sec) +```promql +rate(node_disk_read_bytes_total[5m]) +rate(node_disk_written_bytes_total[5m]) +``` + +### IOPS +```promql +rate(node_disk_reads_completed_total[5m]) +rate(node_disk_writes_completed_total[5m]) +``` + +## Network Metrics + +### Network throughput (bytes/sec per interface) +```promql +rate(node_network_receive_bytes_total{device!="lo"}[5m]) +rate(node_network_transmit_bytes_total{device!="lo"}[5m]) +``` + +### Network errors +```promql +rate(node_network_receive_errs_total[5m]) > 0 +rate(node_network_transmit_errs_total[5m]) > 0 +``` + +### Packet drops +```promql +rate(node_network_receive_drop_total[5m]) > 0 +rate(node_network_transmit_drop_total[5m]) > 0 +``` + +### TCP connection states +```promql +node_netstat_Tcp_CurrEstab +``` + +## OpenStack Nova (Compute) Metrics + +### VM count by hypervisor +```promql +openstack_nova_running_vms +``` + +### vCPU allocation ratio +```promql +openstack_nova_vcpus_used / openstack_nova_vcpus_available +``` + +### Memory allocation +```promql +openstack_nova_memory_used_bytes / openstack_nova_memory_available_bytes +``` + +### Local disk usage +```promql +openstack_nova_local_storage_used_bytes / openstack_nova_local_storage_available_bytes +``` + +## OpenStack Neutron (Network) Metrics + +### Port count by network +```promql +openstack_neutron_ports{status="ACTIVE"} +``` + +### Floating IP usage +```promql +openstack_neutron_floating_ips{status="ACTIVE"} +``` + +## Container/Kubernetes Metrics (if project runs K8s) + +### Container CPU usage +```promql +rate(container_cpu_usage_seconds_total{container!=""}[5m]) +``` + +### Container memory usage +```promql 
+container_memory_working_set_bytes{container!=""} +``` + +### Pod restart count +```promql +increase(kube_pod_container_status_restarts_total[1h]) +``` + +### OOMKill events +```promql +increase(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[1h]) +``` + +## Aggregation Patterns + +### Top 5 by CPU usage +```promql +topk(5, 100 * (1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])))) +``` + +### Bottom 5 by free disk +```promql +bottomk(5, node_filesystem_avail_bytes{mountpoint="/"}) +``` + +### Count instances above threshold +```promql +count(node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes < 0.2) +``` + +### Average across all instances +```promql +avg(rate(node_cpu_seconds_total{mode!="idle"}[5m])) +``` + +### Sum total network traffic +```promql +sum(rate(node_network_receive_bytes_total{device!="lo"}[5m])) +``` + +## Time-Based Patterns + +### Compare to 1 hour ago (requires two separate instant queries) + +Query at current time: +```promql +avg(rate(node_cpu_seconds_total{mode!="idle"}[5m])) +``` +Then query with `time` parameter set to 1 hour ago for comparison. + +### Rate over different windows + +Short-term spike detection: +```promql +rate(metric[1m]) +``` + +Smoothed trend: +```promql +rate(metric[30m]) +``` + +## Label Manipulation + +### Group by job +```promql +sum by (job) (up) +``` + +### Filter by regex on instance name +```promql +node_cpu_seconds_total{instance=~"web-.*"} +``` + +### Exclude specific labels +```promql +node_network_receive_bytes_total{device!~"lo|veth.*|docker.*"} +``` + +## Useful Instant Checks + +### "Is everything up?" +```promql +up == 0 +``` + +### "Which targets are being scraped?" +```promql +up +``` + +### "How long since last scrape?" 
+```promql +time() - node_time_seconds +``` + +## Notes on SAP CC Context + +- Metric availability depends entirely on project instrumentation +- `node_*` metrics appear when node-exporter is deployed on instances +- `openstack_*` metrics appear when OpenStack exporters are configured +- `container_*` and `kube_*` metrics appear in Kubernetes-enabled projects +- Always run `maia_metric_names` first to confirm which metric families exist +- Metric retention varies — very old historical points may not be available diff --git a/plugins/sapcc/skills/sapcc-networking/SKILL.md b/plugins/sapcc/skills/sapcc-networking/SKILL.md new file mode 100644 index 0000000..e11c853 --- /dev/null +++ b/plugins/sapcc/skills/sapcc-networking/SKILL.md @@ -0,0 +1,196 @@ +--- +name: sapcc-networking +description: > + Neutron networking operations: network topology, port inspection, security group debugging, + connectivity troubleshooting. Triggers: network, subnet, port, security group, firewall, + connectivity, "can't reach", interface, IP address, CIDR. +version: 1.0.0 +metadata: + service: + - neutron +--- + +# SAP CC Networking (Neutron) + +Investigate and debug network topology, port state, and security group rules in SAP Converged Cloud. + +## MCP Tools + +| Tool | Purpose | Key Filters | +|------|---------|-------------| +| `neutron_list_networks` | List networks | `name`, `status` | +| `neutron_list_subnets` | List subnets (CIDR, gateway, DHCP) | `network_id` | +| `neutron_list_ports` | List ports (MAC, fixed IPs, device_owner) | `network_id`, `device_id`, `status` | +| `neutron_list_security_groups` | List security groups with rules | — | + +## Gotchas + +These are the most common mistakes. Follow them exactly. + +1. **Default security group blocks ALL inbound traffic.** Users assume "default" means permissive. It is not. The default group allows all egress but zero ingress. Always check security group rules when debugging connectivity. + +2. 
**`device_owner` tells you what owns a port.** Key values: + - `compute:nova` — VM network interface + - `network:dhcp` — DHCP agent port + - `network:router_interface` — Router attachment + - `network:floatingip` — Floating IP anchor + - Empty string — Unattached/orphaned port + +3. **To find a server's ports, use `neutron_list_ports` with `device_id=`.** Do not look for port information in Nova responses. Neutron is the source of truth for network interfaces. + +4. **Security group rules are additive (whitelist-only).** There is no explicit deny. If traffic is not allowed by any rule, it is denied. You cannot "block port 22" — you can only not allow it. + +5. **Ports in DOWN status may be detached OR the VM is shut off.** A DOWN port does not mean a problem — cross-reference with `nova_get_server` to check server power state. ACTIVE port + SHUTOFF server = normal (port state lags). + +6. **`network_id` is required to correlate subnets to networks.** Subnets do not carry the network name. To map the full topology: list networks first, then list subnets filtered by each `network_id`. + +7. **Fixed IPs in port response contain both `subnet_id` and `ip_address`.** To determine the CIDR range for a port's IP, use the `subnet_id` from the port's `fixed_ips` array to look up the subnet. + +8. **SAP CC networks are provider networks (not self-service).** Users cannot create or delete networks. Network infrastructure is managed by platform operators. Users can only create ports, security groups, and floating IP associations. + +9. **Security group is applied per-port, not per-server.** A server with multiple interfaces can have different security groups on each port. Always check the `security_groups` field on the specific port, not at the server level. + +10. **Remote group rules create circular dependencies.** A security group rule referencing another group (remote_group_id) means "allow traffic from any port that has that group applied." 
This is how you allow intra-project communication without specifying IPs. + +## Common Workflows + +### "Why can't I reach my server?" + +This is the most common networking question. Follow this sequence: + +``` +Step 1: Find the server's ports + → neutron_list_ports(device_id=) + → Note the port status, fixed_ips, and security_groups + +Step 2: Check security groups on those ports + → neutron_list_security_groups() + → Find the groups listed on the port + → Verify inbound rules allow the protocol/port/source you need + +Step 3: Verify subnet configuration + → Use subnet_id from port's fixed_ips + → neutron_list_subnets(network_id=) + → Check gateway_ip is set, DHCP is enabled if expected + +Step 4: Verify network status + → neutron_list_networks(name=) + → Confirm status is ACTIVE, admin_state_up is true +``` + +**Common findings:** +- Missing ingress rule (most common — see Gotcha #1) +- Wrong security group attached to port +- Port is DOWN because VM is SHUTOFF +- Subnet has no gateway (isolated network) + +### Find All Network Interfaces for a Server + +``` +neutron_list_ports(device_id=) +``` + +Each result is one NIC. For each port, extract: +- `mac_address` — correlates to OS-visible interface +- `fixed_ips[].ip_address` — assigned IPs +- `fixed_ips[].subnet_id` — which subnet it belongs to +- `network_id` — which network it connects to +- `security_groups` — applied firewall rules +- `status` — ACTIVE, DOWN, BUILD + +### Audit Security Group Rules for a Project + +``` +Step 1: List all security groups + → neutron_list_security_groups() + +Step 2: For each group, examine rules + → Look at security_group_rules array in response + → Flag overly permissive rules (see Security Considerations) + +Step 3: Cross-reference with ports + → neutron_list_ports() to see which groups are actually in use + → Groups not referenced by any port may be stale +``` + +### Map Network Topology + +Build the full picture: network → subnets → ports → servers. 
+ +``` +Step 1: List all networks + → neutron_list_networks() + +Step 2: For each network, list subnets + → neutron_list_subnets(network_id=) + → Record CIDR, gateway, allocation_pools + +Step 3: For each network, list ports + → neutron_list_ports(network_id=) + → Group by device_owner to separate VMs from infrastructure + +Step 4: Correlate ports to servers + → Ports with device_owner="compute:nova" have device_id=server_uuid + → Use nova_get_server() to get server name/status +``` + +## Troubleshooting + +### Port ACTIVE but No Connectivity + +1. Security group missing required ingress rule (check Gotcha #1) +2. Source IP not in allowed CIDR of the security group rule +3. Subnet has no gateway — network is intentionally isolated +4. MTU mismatch (rare, but check subnet MTU if large packets fail) +5. Anti-spoofing: traffic from IP not in port's `allowed_address_pairs` or `fixed_ips` is dropped + +### Security Group Not Taking Effect + +1. Wrong group attached — compare port's `security_groups` list to the group you edited +2. Rule direction wrong — `ingress` is traffic TO the port, `egress` is FROM the port +3. Ethertype mismatch — IPv4 rule won't match IPv6 traffic and vice versa +4. Protocol/port mismatch — rule says TCP but traffic is UDP, or port range excludes your port +5. Remote IP prefix too restrictive — source CIDR doesn't include the connecting client + +### IP Conflict / Duplicate Address + +1. Check if multiple ports have the same `fixed_ips[].ip_address` on the same subnet +2. Look for ports with `device_owner=""` (orphaned) holding the IP +3. 
Check `allowed_address_pairs` — another port may legitimately share the IP (VRRP/HA) + +## Security Considerations + +### Overly Permissive Rules (Flag These) + +| Pattern | Risk | Recommendation | +|---------|------|----------------| +| `remote_ip_prefix: 0.0.0.0/0` on SSH (22) | Internet-exposed SSH | Restrict to bastion/VPN CIDR | +| `remote_ip_prefix: 0.0.0.0/0` on all ports | Fully open | Almost never correct — audit immediately | +| `protocol: null` (all protocols allowed) | No protocol restriction | Specify TCP/UDP/ICMP explicitly | +| Egress `0.0.0.0/0` to all ports | Unrestricted outbound | Acceptable for most workloads, flag for PCI | + +### Before Modifying Security Groups + +1. Identify all ports using the group: `neutron_list_ports()` and filter by security_groups containing the group ID +2. Understand the blast radius — one group change affects all attached ports +3. Check if it's the "default" group — almost every port uses it +4. Prefer adding a new group over modifying a shared one + +## Cross-Service References + +| Need | Service | Tool | +|------|---------|------| +| Server name/status for a port's device_id | Nova | `nova_get_server` | +| Who modified a security group? 
| Hermes | `hermes_list_events(target_type=security_group)` | +| Network quota remaining | Limes | `limes_get_project_quota` (service: network) | +| DNS records for an IP | Designate | Not yet in MCP server | + +## Reference Files + +- `references/security-group-patterns.md` — Common SAP CC security group configurations and templates + +## Routing + +| User need | Action | +|-----------|--------| +| Common security group configurations | Read [security-group-patterns.md](references/security-group-patterns.md) | diff --git a/plugins/sapcc/skills/sapcc-networking/references/security-group-patterns.md b/plugins/sapcc/skills/sapcc-networking/references/security-group-patterns.md new file mode 100644 index 0000000..c008bea --- /dev/null +++ b/plugins/sapcc/skills/sapcc-networking/references/security-group-patterns.md @@ -0,0 +1,139 @@ +# Security Group Patterns for SAP CC + +Common security group configurations in SAP Converged Cloud environments. + +## Standard Patterns + +### Web Server (HTTP/HTTPS) + +``` +Ingress TCP 80 from 0.0.0.0/0 — HTTP +Ingress TCP 443 from 0.0.0.0/0 — HTTPS +Ingress TCP 22 from — SSH (restricted) +Egress * * to 0.0.0.0/0 — All outbound (default) +``` + +Use case: Public-facing web application. SSH restricted to jump host CIDR only. + +### Database Server (Internal Only) + +``` +Ingress TCP 5432 from — PostgreSQL +Ingress TCP 3306 from — MySQL +Ingress TCP 22 from — SSH (restricted) +Ingress ICMP from — Ping for monitoring +Egress * * to 0.0.0.0/0 — All outbound +``` + +Use case: Database accessible only from application tier. No public access. + +### Intra-Project Communication (Remote Group) + +``` +Ingress * * from remote_group_id= — All traffic within group +Ingress TCP 22 from — SSH (restricted) +Egress * * to 0.0.0.0/0 — All outbound +``` + +Use case: Cluster nodes (Kubernetes, Kafka, etc.) that need unrestricted communication with each other. The `remote_group_id` pointing to itself means "any port with this same group." 
+ +### Bastion / Jump Host + +``` +Ingress TCP 22 from — SSH from VPN only +Ingress ICMP from — Ping from VPN +Egress TCP 22 to — SSH to internal hosts +Egress ICMP to — Ping internal hosts +``` + +Use case: Single point of SSH entry. Tight egress limits prevent lateral movement if compromised. Note: restricted egress is unusual in SAP CC but recommended for bastions. + +### Monitoring Agent + +``` +Ingress TCP 9100 from — Node exporter +Ingress TCP 9090 from — Prometheus +Ingress TCP 22 from — SSH +Egress * * to 0.0.0.0/0 — All outbound +``` + +Use case: Servers running Prometheus exporters. Only the monitoring system can scrape metrics. + +### Load Balancer Backend + +``` +Ingress TCP 8080 from — App port from LB +Ingress TCP 8443 from — App TLS port from LB +Ingress TCP 22 from — SSH +Ingress TCP 9100 from — Metrics +Egress * * to 0.0.0.0/0 — All outbound +``` + +Use case: Application servers behind Octavia load balancer. Only accepts traffic from the LB subnet. + +## Anti-Patterns (Flag During Audits) + +### Fully Open (Critical Risk) + +``` +Ingress * * from 0.0.0.0/0 — ALL traffic from anywhere +``` + +This is almost never correct. It means any IP on the internet can reach any port. Immediate remediation required. + +### SSH from Anywhere (High Risk) + +``` +Ingress TCP 22 from 0.0.0.0/0 — SSH from internet +``` + +Common mistake. Should always be restricted to bastion/VPN CIDR. Brute-force attacks begin within minutes of exposure. + +### Overly Broad Port Range + +``` +Ingress TCP 1-65535 from — All TCP ports +``` + +Usually indicates the user didn't know which port they needed. Ask them to identify the specific port and narrow the rule. + +### Stale Rules (Moderate Risk) + +Security groups with rules referencing CIDRs of decommissioned networks or remote_group_ids that no longer exist. These don't cause immediate harm but indicate configuration drift. 
+ +## SAP CC Specifics + +### Default Security Group Behavior + +Every project starts with a "default" security group containing: +``` +Egress * * to 0.0.0.0/0 — Allow all outbound (IPv4) +Egress * * to ::/0 — Allow all outbound (IPv6) +Ingress * * from remote_group=default — Allow from same group (IPv4) +Ingress * * from remote_group=default — Allow from same group (IPv6) +``` + +This means: servers in the default group can talk to each other, but nothing external can reach them. Users must explicitly add ingress rules. + +### Provider Network Implications + +Since SAP CC uses provider networks: +- All subnets in a project may share the same physical network segment +- Security groups are the primary isolation mechanism between workloads +- There is no "network-level" isolation like VPCs — security groups do all the work +- This makes security group hygiene more critical than in self-service environments + +### Naming Conventions + +Recommended naming for security groups in SAP CC: +``` +<application>-<role>-<environment> +``` + +Examples: +- `myapp-web-prod` +- `myapp-db-prod` +- `myapp-bastion-all` +- `platform-monitoring-prod` + +Avoid generic names like "test", "temp", "allow-all" — they accumulate and become impossible to audit. diff --git a/plugins/sapcc/skills/sapcc-quota/SKILL.md b/plugins/sapcc/skills/sapcc-quota/SKILL.md new file mode 100644 index 0000000..b751e26 --- /dev/null +++ b/plugins/sapcc/skills/sapcc-quota/SKILL.md @@ -0,0 +1,228 @@ +--- +name: sapcc-quota +description: > + Limes quota and usage management for SAP Converged Cloud. + Triggers: quota, usage, capacity, resource limit, how much, running out, limes, burst +version: 1.0.0 +metadata: + service: [limes] + task: [capacity, plan, audit] + persona: [platform-engineer, developer] +--- + +# SAP CC Quota & Usage (Limes) + +Limes is SAP CC's central quota management service. Not part of vanilla OpenStack. It enforces hierarchical resource quotas across cluster, domain, and project levels. 
+ +## MCP Tools + +| Tool | Purpose | Required Params | +|------|---------|-----------------| +| `limes_get_project_quota` | Quota + usage for a single project | `domain_id`, `project_id` (optional: `service`, `resource`) | +| `limes_get_domain_quota` | Aggregated quota for all projects in a domain | `domain_id` (optional: `service`, `resource`) | +| `limes_get_cluster_quota` | Cluster-wide capacity and usage | (optional: `service`, `resource`) | + +## Quota Model + +``` +Cluster Capacity + └─ Domain Quota (sum of project quotas ≤ domain quota) + └─ Project Quota (what a project is allowed to consume) + └─ Usage (what's actually consumed) +``` + +**Constraints:** +- Project quota ≤ Domain quota ≤ Cluster capacity +- Each service (compute, network, object-store, etc.) has multiple resources +- Quota is pre-allocated — not on-demand. Unused quota still "counts" against the domain. + +## Gotchas + +This is the highest-gotcha-density skill. Read all of these before interpreting Limes data. + +### 1. RAM is in MiB, not GB + +RAM quota and usage are reported in **MiB**. A quota of `51200` = 50 GiB. A quota of `131072` = 128 GiB. Always convert: `value / 1024 = GiB`. + +### 2. physical_usage can exceed usage — this is NORMAL + +`physical_usage` includes backend overhead: snapshots, replicas, metadata, copy-on-write reserves. A volume with `usage=100 GiB` might have `physical_usage=150 GiB`. This is not a bug. + +### 3. quota=0 with usage=0 means DISABLED, not unlimited + +Zero quota = resource is not available to this project. It's explicitly disabled. If you see `quota: 0, usage: 0`, the project cannot use that resource at all. + +### 4. quota=-1 means unlimited + +Rare. Usually only at domain level for internal/platform domains. Means no hard cap enforced. Do not confuse with "no quota set." + +### 5. burst_usage is borrowed capacity + +Burst allows temporary over-quota usage. It's a loan from unused cluster capacity. Limes can reclaim it. 
If `burst_usage > 0`, the project is currently exceeding its base quota and relying on burst — this is fragile and should be resolved. + +### 6. domain_id is REQUIRED for project quota + +`limes_get_project_quota` needs both `domain_id` AND `project_id`. Get them from `keystone_token_info` first. The token response includes `project.domain.id` and `project.id`. + +### 7. project_id is REQUIRED — not project name + +Limes uses UUIDs, not names. Always resolve via `keystone_token_info` or `keystone_list_projects`. + +### 8. Service filter narrows the response + +`service=compute` only returns compute resources. Omit the filter to see ALL services. Don't filter unless you know exactly what you want — you'll miss relevant data. + +### 9. "Running out?" = usage/quota ratio + +There's no Limes alert. You calculate it: `usage / quota * 100`. Alert thresholds: >80% = warning, >90% = critical, 100% = exhausted (new resource creation blocked). + +### 10. Cluster quota ≠ what's available to you + +Cluster capacity shows total platform resources. Your project gets a fraction. Don't confuse "cluster has 10TB" with "I can use 10TB." + +### 11. Quota does not auto-adjust + +If a project needs more quota, someone must request a raise. This is often a manual approval process. Limes just enforces limits — it doesn't grow them. + +### 12. Volume types have separate quotas + +`volumes_vmware`, `volumes_ceph`, `capacity_vmware`, `capacity_ceph` — each volume backend has its own quota line. A project might have plenty of ceph quota but zero vmware quota. + +### 13. Quota exhaustion blocks creation, not operation + +Running resources continue to run even if quota is 100% used. But you cannot create NEW resources. Existing VMs keep running, existing volumes stay mounted. + +## Common Workflows + +### "Am I running out of resources?" + +``` +1. keystone_token_info → get domain_id, project_id +2. limes_get_project_quota(domain_id, project_id) +3. 
For each resource: calculate usage/quota percentage +4. Flag anything > 80% +``` + +### "Can I create a server with flavor X?" + +Check three resources simultaneously: +- `compute/cores` — flavor vCPUs ≤ (quota - usage) +- `compute/ram` — flavor RAM (in MiB!) ≤ (quota - usage) +- `compute/instances` — at least 1 instance slot available + +If any is insufficient, the server creation will fail with a quota error. + +### Full resource inventory for a project + +``` +1. keystone_token_info → domain_id, project_id +2. limes_get_project_quota(domain_id, project_id) [no service filter] +3. List all services and resources with quota, usage, percentage +``` + +### Domain capacity planning + +``` +1. limes_get_domain_quota(domain_id) +2. Compare projects_quota (sum allocated to projects) vs domain_quota +3. If projects_quota ≈ domain_quota → no room to grow any project +``` + +### Cluster-wide capacity check + +``` +1. limes_get_cluster_quota(service="compute") +2. Shows total cluster capacity, domains_quota, usage +3. 
capacity - domains_quota = unallocated headroom +``` + +## Interpreting the Response + +| Field | Meaning | +|-------|---------| +| `quota` | Maximum allowed for this resource in this project/domain | +| `usage` | Currently consumed (logical — what the user sees) | +| `physical_usage` | Actually consumed on backend (includes overhead, replicas, snapshots) | +| `burst_usage` | Amount currently over-quota via burst allowance | +| `backend_quota` | What's actually configured in the backend service (should match `quota`) | +| `projects_quota` | (Domain/cluster level) Sum of all child project quotas | +| `capacity` | (Cluster level) Total physical capacity of the platform | + +### Reading a typical response + +```json +{ + "service": "compute", + "resources": [ + { + "name": "cores", + "quota": 64, + "usage": 48, + "burst_usage": 0 + }, + { + "name": "ram", + "quota": 131072, + "usage": 98304, + "burst_usage": 0 + } + ] +} +``` + +Translation: Project has 64 cores quota, using 48 (75%). RAM quota is 128 GiB (131072 MiB), using 96 GiB (98304 MiB, 75%). + +## Troubleshooting + +### "Quota exhausted" but I can't see why + +1. Check `usage` vs `quota` — is it really at 100%? +2. Check if burst is active (`burst_usage > 0`) — burst may have been reclaimed +3. Check specific resource — error might say "compute" but actual limit is `instances`, not `cores` +4. Check volume type quotas separately — generic "storage" quota might be fine but specific backend is exhausted + +### physical_usage > usage + +Normal. Causes: +- Volume snapshots count toward physical but not logical usage +- Replication factor (3x for ceph) +- Copy-on-write overhead +- Deleted resources still being garbage collected + +Not a problem unless physical_usage is growing unboundedly. + +### Burst usage appearing unexpectedly + +1. Project exceeded its base quota at some point +2. Limes granted burst from available cluster headroom +3. This is temporary — Limes can reclaim burst at any time +4. 
Action: request a quota raise to cover actual usage, or reduce usage below base quota + +### backend_quota != quota + +Limes periodically syncs to backend services. Temporary mismatch during sync is normal. Persistent mismatch (>1 hour) indicates a Limes issue — escalate. + +## Security + +Quota data reveals project sizing, resource allocation strategy, and capacity headroom. Treat as internal/confidential: +- Project quota = how big the project is +- Domain quota = how big the business unit's allocation is +- Cluster capacity = total platform size + +Only query at the scope you have legitimate access to. The MCP server enforces token-based access, but be aware that quota data you return to users should be scope-appropriate. + +## Cross-Service References + +| Need | Service | Tool | +|------|---------|------| +| Actual compute instances consuming quota | Nova | `nova_list_servers` | +| Actual volumes consuming storage quota | Cinder | `cinder_list_volumes` | +| Actual ports/networks consuming network quota | Neutron | `neutron_list_ports`, `neutron_list_networks` | +| Who changed quota settings | Hermes | `hermes_list_events(target_type=resources/project)` | +| Current project/domain context | Keystone | `keystone_token_info` | + +## Routing + +| User need | Action | +|-----------|--------| +| Mapping services to quota resource names | Read [quota-services-mapping.md](references/quota-services-mapping.md) | diff --git a/plugins/sapcc/skills/sapcc-quota/references/quota-services-mapping.md b/plugins/sapcc/skills/sapcc-quota/references/quota-services-mapping.md new file mode 100644 index 0000000..224e5eb --- /dev/null +++ b/plugins/sapcc/skills/sapcc-quota/references/quota-services-mapping.md @@ -0,0 +1,101 @@ +# Limes Services and Resources Mapping + +Which Limes services map to which resources. Use this to understand what you're looking at in quota responses. 
+ +## compute + +| Resource | Unit | Notes | +|----------|------|-------| +| `cores` | count | vCPU cores across all instances | +| `ram` | MiB | **Not GB!** Divide by 1024 for GiB | +| `instances` | count | Total number of VMs | +| `server_groups` | count | Anti-affinity / affinity groups | +| `server_group_members` | count | Max members per server group | + +## network + +| Resource | Unit | Notes | +|----------|------|-------| +| `floating_ips` | count | Public IPv4 addresses | +| `networks` | count | Neutron networks | +| `subnets` | count | Subnets across all networks | +| `ports` | count | Virtual switch ports | +| `routers` | count | Neutron routers | +| `security_groups` | count | Firewall rule groups | +| `security_group_rules` | count | Individual firewall rules | +| `rbac_policies` | count | Cross-project sharing policies | + +## object-store + +| Resource | Unit | Notes | +|----------|------|-------| +| `capacity` | bytes | Total Swift storage capacity | + +## sharev2 (Manila - Shared File Systems) + +| Resource | Unit | Notes | +|----------|------|-------| +| `share_capacity` | GiB | Total shared filesystem capacity | +| `shares` | count | Number of file shares | +| `share_networks` | count | Share network objects | +| `share_snapshots` | count | Snapshots of shares | +| `snapshot_capacity` | GiB | Capacity used by snapshots | + +## volumev2 (Cinder - Block Storage) + +| Resource | Unit | Notes | +|----------|------|-------| +| `capacity` | GiB | Total block storage (all types combined) | +| `volumes` | count | Number of volumes (all types) | +| `snapshots` | count | Volume snapshots | +| `capacity_` | GiB | Per-backend: `capacity_vmware`, `capacity_ceph` | +| `volumes_` | count | Per-backend: `volumes_vmware`, `volumes_ceph` | +| `snapshots_` | count | Per-backend: `snapshots_vmware`, `snapshots_ceph` | + +**Volume type gotcha:** Generic `capacity` is the sum of all types. But each type has its own sub-quota. 
A project can have `capacity=500 GiB` total but only `capacity_vmware=200` and `capacity_ceph=300`. You must check the specific type. + +## dns (Designate) + +| Resource | Unit | Notes | +|----------|------|-------| +| `zones` | count | DNS zones managed | +| `recordsets` | count | DNS record sets across all zones | + +## loadbalancing (Octavia) + +| Resource | Unit | Notes | +|----------|------|-------| +| `loadbalancers` | count | Load balancer instances | +| `listeners` | count | LB listeners (frontends) | +| `pools` | count | LB backend pools | +| `pool_members` | count | Members across all pools | +| `healthmonitors` | count | Health check configurations | +| `l7policies` | count | Layer 7 routing policies | + +## keppel (Container Registry) + +| Resource | Unit | Notes | +|----------|------|-------| +| `images` | count | Container images stored | + +## Common Service Filters + +When using `service` parameter with Limes tools: + +``` +service=compute → cores, ram, instances +service=network → floating_ips, networks, ports, etc. +service=volumev2 → capacity, volumes, snapshots (all types) +service=object-store → capacity (Swift) +service=sharev2 → shares, share_capacity +service=dns → zones, recordsets +service=loadbalancing → loadbalancers, listeners, pools +service=keppel → images +``` + +When using `resource` parameter (must combine with `service`): + +``` +service=compute&resource=ram → just RAM quota/usage +service=volumev2&resource=capacity → just total volume capacity +``` diff --git a/plugins/sapcc/skills/sapcc-registry/SKILL.md b/plugins/sapcc/skills/sapcc-registry/SKILL.md new file mode 100644 index 0000000..6e5fff2 --- /dev/null +++ b/plugins/sapcc/skills/sapcc-registry/SKILL.md @@ -0,0 +1,160 @@ +--- +name: sapcc-registry +description: > + Keppel container registry management for SAP Converged Cloud. 
+ Triggers: container image, registry, docker image, keppel, repository, manifest, tag, vulnerability scan, image version +version: 1.0.0 +metadata: + service: [keppel] + task: [scan, inspect, manage] + persona: [developer, devops] +--- + +# SAP CC Container Registry (Keppel) + +Keppel is SAP CC's multi-tenant container image registry. Not vanilla OpenStack. It provides regionally federated, project-scoped image storage with integrated vulnerability scanning. + +## MCP Tools + +| Tool | Purpose | Required Params | +|------|---------|-----------------| +| `keppel_list_accounts` | List registry accounts (namespaces) | — | +| `keppel_list_repositories` | List image repos within an account | `account` | +| `keppel_list_manifests` | List manifests (image versions) in a repo | `account`, `repository` | + +## Keppel Hierarchy + +``` +Account (registry namespace) + └─ Repository (image name, e.g. "nginx") + └─ Manifest (specific image version) + ├─ Tags (human-readable: "latest", "v1.2.3") + └─ Digest (immutable: "sha256:abc123...") +``` + +**Key distinctions:** +- Account ≈ namespace for image organization. One OpenStack project can own multiple accounts. +- Repository = just the image name. NOT the full registry path (not `registry.example.com/account/repo`). +- Manifest = a specific image build. Identified by digest (immutable) or tag (mutable pointer). + +## Gotchas + +### 1. Account ≠ OpenStack Project + +A Keppel account is a registry namespace, not an OpenStack project. One project can have multiple accounts (e.g., `myproject-base`, `myproject-app`). The account name is an arbitrary string chosen at creation time. + +### 2. Untagged manifests still consume storage + +Manifests without tags are NOT automatically deleted. They accumulate from CI pipelines pushing new builds that overwrite existing tags. The old manifest loses its tag but persists. Garbage collection policies (if configured) handle cleanup, but this is not instant. + +### 3. 
vulnerability_status values + +Manifests report scan status as one of: `Clean`, `Low`, `Medium`, `High`, `Critical`, `Unknown`, `Unsupported`. `Unknown` = scan pending or incomplete. `Unsupported` = scanner cannot analyze this image format (e.g., Windows containers, scratch-based images with no OS packages). + +### 4. Repository name is just the image name + +When calling `keppel_list_manifests`, pass the bare image name: `nginx`, `my-app`, `tools/build-agent`. NOT the full registry URL. The account parameter already provides namespace context. + +### 5. Account name is required to list repositories + +You cannot list all repos across all accounts in one call. Flow is always: `keppel_list_accounts` → pick account → `keppel_list_repositories(account)`. No shortcut. + +### 6. Cross-region federation + +The same account name can exist in multiple regions. Images replicate automatically between regions (eventual consistency). A manifest pushed to `eu-de-1` will appear in `eu-nl-1` after replication completes. Replication is pull-on-demand — the remote region fetches layers when first requested. + +### 7. Digests are immutable, tags are not + +`sha256:abc123...` always points to the same image content. A tag like `latest` or `v1.0` can be moved to point to a different digest at any time. For reproducible deployments, pin to digest. Tags are convenience labels only. + +### 8. Size is compressed layer size + +The `size_bytes` field on manifests is the compressed (wire-transfer) size, not the extracted filesystem size. Actual disk usage after pulling can be 2-5x larger depending on compression ratio and layer contents. + +## Common Workflows + +### "What images do we have?" + +``` +1. keppel_list_accounts → enumerate namespaces +2. keppel_list_repositories(account) → list image repos per account +3. keppel_list_manifests(account, repository) → list versions per image +``` + +### "What version is deployed?" 
+ +Given a known image tag (from a deployment manifest or Helm chart): + +``` +1. keppel_list_manifests(account, repository) +2. Find manifest where tags contains the target tag (e.g., "v2.1.0") +3. Report: digest, push time, vulnerability_status +``` + +The digest confirms exactly which build is running regardless of tag moves. + +### "Are there vulnerable images?" + +``` +1. keppel_list_accounts → get all accounts +2. For each account: keppel_list_repositories(account) +3. For each repo: keppel_list_manifests(account, repo) +4. Filter manifests where vulnerability_status in ("High", "Critical") +5. Report: account/repo:tag, severity, digest +``` + +Note: only manifests with completed scans will show severity. `Unknown` status means the scan hasn't finished — don't treat as "clean." + +### "What's our storage usage?" + +``` +1. keppel_list_accounts → get all accounts +2. For each account: keppel_list_repositories(account) +3. For each repo: keppel_list_manifests(account, repo) +4. Sum size_bytes across all manifests +5. Report per-account and total (remember: compressed size, not extracted) +``` + +Layer deduplication means actual backend storage is less than the naive sum — shared layers between manifests are stored once. 
+ +## Troubleshooting + +### Empty account (no repositories listed) + +Causes: +- Account was just created, no images pushed yet +- Images were pushed to a different region and haven't replicated +- Permission issue — token may lack read access to this account +- All repositories were garbage collected (all manifests expired) + +### Missing tags (manifest exists but no tag) + +- Tag was overwritten by a newer push (CI pushed `latest` again) +- Tag was explicitly deleted via Keppel API +- The manifest is orphaned — it's just layers consuming storage now + +### vulnerability_status = "Unknown" persisting + +- Scan queue backlog — large images take time to scan +- Scanner service may be degraded — check region health +- Image was just pushed — allow 5-15 minutes for initial scan +- If persistent (>1 hour): scanner cannot reach the image layers (network issue) + +### Image pull fails but manifest shows in list + +- Replication incomplete — manifest metadata arrived but layers haven't replicated yet +- Storage backend issue — layers corrupted or missing from swift/S3 +- Manifest is a multi-arch index and local platform architecture isn't available + +## Security + +- **Image layers may contain secrets.** Embedded credentials, API keys, and tokens in image layers are permanently baked in. Rebuilding with secrets removed doesn't delete old manifests — the old layers still exist until garbage collected. +- **Vulnerability scanning is passive.** Keppel scans for known CVEs but does NOT block image pulls. A `Critical` vulnerability status is informational only — the image remains pullable. +- **Account access follows OpenStack RBAC.** Token scope determines which accounts are visible. Cross-project image sharing requires explicit account policies. +- **Digest exposure is safe.** Sharing a `sha256:...` digest does not grant pull access — authentication is still enforced. But digests do confirm whether two environments run the same build. 
+ +## Routing + +| User need | Action | +|-----------|--------| +| Image lifecycle and vulnerability workflow | Read [image-lifecycle.md](references/image-lifecycle.md) | diff --git a/plugins/sapcc/skills/sapcc-registry/references/image-lifecycle.md b/plugins/sapcc/skills/sapcc-registry/references/image-lifecycle.md new file mode 100644 index 0000000..e74b9aa --- /dev/null +++ b/plugins/sapcc/skills/sapcc-registry/references/image-lifecycle.md @@ -0,0 +1,109 @@ +# Keppel Image Lifecycle & Federation Model + +## Image Lifecycle + +### Push Flow + +``` +docker push registry.region.cloud.sap/account/repo:tag + 1. Client authenticates via OpenStack token + 2. Layers uploaded to account's backing storage (Swift/S3) + 3. Manifest created linking layers together + 4. Tag pointer set to new manifest digest + 5. Previous manifest (if tag existed) becomes untagged + 6. Vulnerability scanner queues new manifest for analysis +``` + +### Pull Flow + +``` +docker pull registry.region.cloud.sap/account/repo:tag + 1. Client authenticates via OpenStack token + 2. Tag resolved to manifest digest + 3. Manifest fetched → layer list retrieved + 4. Layers downloaded from backing storage + 5. 
If layer missing locally → triggers replication from primary region +``` + +### Manifest States + +| State | Meaning | Action | +|-------|---------|--------| +| Tagged + Clean | Active image, no vulnerabilities | Normal operation | +| Tagged + Critical | Active image, severe CVEs found | Rebuild with patched base image | +| Tagged + Unknown | Active image, scan incomplete | Wait 5-15 min, check again | +| Untagged + Clean | Orphaned but safe | Candidate for garbage collection | +| Untagged + Critical | Orphaned and vulnerable | Priority cleanup target | + +### Garbage Collection + +Keppel garbage collection is policy-driven per account: + +- **Untagged manifests**: Deleted after configurable retention period (default varies by account policy) +- **Tagged manifests**: Never auto-deleted while tag exists +- **Shared layers**: Only deleted when no manifest references them +- **Soft-delete window**: Deleted manifests may be recoverable briefly (implementation-dependent) + +GC does NOT run continuously. It's periodic. Expect latency between a manifest becoming untagged and actual storage reclamation. + +## Federation Model + +### Architecture + +``` +Region A (eu-de-1) Region B (eu-nl-1) Region C (ap-jp-1) +┌──────────────────┐ ┌──────────────────┐ ┌──────────────────┐ +│ Keppel Registry │ │ Keppel Registry │ │ Keppel Registry │ +│ │◄─────►│ │◄─────►│ │ +│ account: myapp │ │ account: myapp │ │ account: myapp │ +│ repo: frontend │ │ repo: frontend │ │ repo: frontend │ +└──────────────────┘ └──────────────────┘ └──────────────────┘ + │ │ │ + ▼ ▼ ▼ + Swift/S3 (local) Swift/S3 (local) Swift/S3 (local) +``` + +### Replication Behavior + +- **Pull-on-demand**: Layers are NOT eagerly replicated. When a client in Region B pulls an image that was pushed to Region A, Region B fetches the layers from Region A on first request. +- **Manifest sync**: Manifest metadata (tags, digests, vulnerability status) syncs between regions. 
You can list manifests in any region even if layers aren't local yet. +- **Consistency model**: Eventually consistent. Tag updates propagate within seconds to minutes. Layer availability depends on pull triggers. +- **Primary region**: The region where the image was originally pushed. It's the authoritative source for layers until replication completes. + +### Federation Implications + +| Scenario | Behavior | +|----------|----------| +| Push to eu-de-1, pull from eu-nl-1 | First pull is slower (cross-region layer fetch). Subsequent pulls are local-speed. | +| Tag update in eu-de-1 | Tag pointer updates in all regions within seconds. | +| Delete in eu-de-1 | Deletion replicates. Other regions lose access after sync. | +| Region outage (eu-de-1 down) | If layers already replicated to eu-nl-1, pulls succeed. If not replicated, pulls fail. | +| Same account name, different content | Not possible — federation ensures consistency. Same account = same content everywhere. | + +### Cross-Region Considerations + +- **First pull latency**: Budget extra time for first pull in a non-primary region. Layer download crosses region boundaries. +- **Disaster recovery**: Critical images should be pulled at least once in each region to ensure layers are locally cached. +- **Storage costs**: Replicated layers consume storage in each region they're pulled to. More regions pulling = more storage used. +- **Network costs**: Cross-region replication incurs network transfer charges. 
+ +## Retention Strategy Recommendations + +### For CI/CD pipelines + +- Tag releases with semver: `v1.2.3` (never delete) +- Use `latest` or branch tags for development (accept overwrites) +- Set GC policies to clean untagged manifests after 7-14 days + +### For production images + +- Pin deployments to digest, not tag +- Maintain at least N-2 tagged versions for rollback +- Monitor vulnerability_status on all tagged manifests +- Rebuild and re-push when base image CVEs are published + +### For multi-arch images + +- Push manifest lists (multi-arch indexes) that reference per-platform manifests +- Each platform manifest has its own vulnerability scan +- A manifest list shows as a single entry with aggregated vulnerability status diff --git a/plugins/sapcc/skills/sapcc-storage/SKILL.md b/plugins/sapcc/skills/sapcc-storage/SKILL.md new file mode 100644 index 0000000..ce163c9 --- /dev/null +++ b/plugins/sapcc/skills/sapcc-storage/SKILL.md @@ -0,0 +1,125 @@ +--- +name: sapcc-storage +description: > + Block storage operations via Cinder. Triggers: volume, disk, storage, + block storage, attachment, cinder. +version: 1.0.0 +metadata: + service: [cinder] + task: [manage, inspect, debug] + persona: [developer, platform-engineer] +--- + +# SAP CC Block Storage (Cinder) + +Manage Cinder volumes: list, inspect, understand attachment state, and troubleshoot failures. + +## MCP Tools + +| Tool | Purpose | Key Parameters | +|------|---------|----------------| +| `cinder_list_volumes` | List volumes with optional filters | `status`, `name` (returns: ID, name, status, size, volume_type, attachments) | +| `cinder_get_volume` | Full detail for a single volume | `volume_id` (UUID) | + +## Gotchas + +1. **Status "in-use" blocks deletion.** A volume with status `in-use` is attached to a server. You must detach it (via Nova) before deletion. Attempting to delete returns 400. + +2. 
**Status "available" = safe to operate.** Only volumes in `available` state can be deleted, extended, or retyped. This is the idle/detached state. + +3. **Size is GiB, not GB.** Cinder reports size in gibibytes (1 GiB = 1.074 GB). Limes quota is also in GiB. Do not confuse with decimal gigabytes when calculating capacity. + +4. **volume_type determines performance tier.** Each volume type (e.g., `vmware`, `vmware_hdd`) maps to a backend with different IOPS/throughput characteristics. Quota in Limes is tracked per volume_type, not just total capacity. + +5. **attachments array reveals server linkage.** Each entry contains `server_id` and `device` (e.g., `/dev/sdb`). A volume can have multiple attachments if multiattach is enabled (rare in SAP CC). + +6. **Status "error" = backend failure.** Usually indicates a storage backend issue (e.g., failed provisioning, snapshot failure). Check Hermes audit trail (`hermes_list_events` with `target.id=`) for the originating action and error detail. + +7. **Quota is per volume_type in Limes.** Use `limes_get_project_quota` and inspect `block-storage` service resources. You will see separate `capacity` and `volumes` quotas for each type. Running out of quota for one type does not mean all storage is exhausted. + +## Common Workflows + +### List Volumes and Attachment Status + +``` +cinder_list_volumes +``` + +Scan the response for: +- `status: in-use` — attached (check `attachments[].server_id`) +- `status: available` — detached, idle +- `status: error` — needs investigation + +### Find Volumes Attached to a Specific Server + +``` +cinder_list_volumes +``` + +Filter results client-side: iterate `attachments` array and match `server_id` against the target server UUID. There is no server-side filter for attachment target. + +### Check Available Storage Quota Before Creating + +``` +limes_get_project_quota (service: block-storage) +``` + +Compare `usage` vs `quota` for the target `volume_type`. 
Key resources: +- `capacity_` — total GiB allocated for that type +- `volumes_` — count of volumes of that type + +### Investigate Volume in Error State + +1. `cinder_get_volume` with the volume UUID — note the `status`, `migration_status`, and any `error` fields. +2. `hermes_list_events` filtered to `target.id=` — find the action that triggered the error (create, extend, snapshot, migrate). +3. Check if quota was exceeded at the time of the action. +4. If the volume was being created from a snapshot, verify the source snapshot still exists. + +## Troubleshooting + +### Volume Stuck in "in-use" After Server Deletion + +The volume's attachment record was not cleaned up. This happens when a server is force-deleted or the detach call fails mid-operation. + +**Diagnosis:** +1. `cinder_get_volume` — check `attachments[].server_id` +2. `nova_get_server` with that server_id — if 404, the server no longer exists +3. The attachment is orphaned + +**Resolution:** Requires admin intervention or a `os-force_detach` action (not available via MCP tools). Escalate to platform team with volume_id and orphaned server_id. + +### Volume in Error State + +**Diagnosis:** +1. `cinder_get_volume` — capture full status fields +2. `hermes_list_events` with `target.id=` and `outcome=failure` +3. Common causes: backend capacity exhausted, snapshot source deleted, network timeout to storage backend + +**Resolution:** If the volume was never successfully provisioned (size shows 0 or status is `error` from creation), it can be deleted. If it held data, escalate — the backend may recover. + +### Quota Exhausted + +**Diagnosis:** +1. `limes_get_project_quota` — check `block-storage` service +2. 
Identify which volume_type hit the limit (capacity or count) + +**Resolution:** +- Delete unused `available` volumes of that type +- Request quota increase via Limes (requires approval workflow) +- Consider using a different volume_type if another has available capacity + +## Security Considerations + +- **Volumes may contain sensitive data.** Always confirm with the user before deleting a volume. There is no soft-delete or recycle bin — deletion is permanent and unrecoverable. +- **Snapshot inheritance.** Deleting a volume does not delete its snapshots, but orphaned snapshots still consume quota and may contain sensitive data. +- **Cross-project visibility.** Volumes are project-scoped. You cannot see or operate on volumes in other projects without re-scoping credentials. +- **Audit trail.** All volume operations (create, delete, attach, detach, extend) are logged in Hermes. Use this to verify who performed destructive actions. + +## Cross-Service References + +| Need | Service | Tool | +|------|---------|------| +| Server a volume is attached to | Nova | `nova_get_server(<server_id>)` | +| Who created/deleted/modified a volume | Hermes | `hermes_list_events(target_type=volume, target_id=<volume_id>)` | +| Block storage quota remaining | Limes | `limes_get_project_quota(service=block-storage)` | +| Server's other volumes | Nova + Cinder | Get server → list volumes → filter by server_id | diff --git a/rules/sapcc-agent-rules.md b/rules/sapcc-agent-rules.md new file mode 100644 index 0000000..d90f927 --- /dev/null +++ b/rules/sapcc-agent-rules.md @@ -0,0 +1,17 @@ +# SAP Converged Cloud Guidance + +- Use the SAP CC MCP Server for all OpenStack/SAP CC interactions — it provides + authenticated API access with credential isolation (secrets never reach the LLM). +- Before starting a task, check whether a relevant sapcc-* skill is available. + Load the skill and prefer its guidance over general knowledge. +- SAP CC uses a Domain → Project hierarchy. 
Always be aware of the current + project scope (check with `keystone_token_info` if uncertain). +- For any operation that creates or resizes resources, check quota first + via `limes_get_project_quota`. +- When debugging issues, check the audit trail (`hermes_list_events`) and + metrics (`maia_query`) before guessing at root causes. +- When uncertain about SAP CC-specific behavior (Limes, Hermes, Maia, + Keppel, Archer), load the relevant skill rather than guessing. +- Prefer application credentials over passwords. Use keychain storage for secrets. +- SAP CC regions are independent deployments. Credentials and resources do not + cross region boundaries. Region naming: `<area>-<country>-<number>` (e.g., `eu-de-1`). diff --git a/skills/credential-setup/SKILL.md b/skills/credential-setup/SKILL.md deleted file mode 100644 index 354d562..0000000 --- a/skills/credential-setup/SKILL.md +++ /dev/null @@ -1,93 +0,0 @@ -# Credential Setup - -Guided workflow for creating and securely storing OpenStack application credentials for use with the MCP server. - -## What It Does - -1. Creates an application credential scoped to the user's current project and roles -2. Stores the secret securely in the system keychain (macOS Keychain / Linux secret-tool) -3. 
Outputs the exact MCP server configuration to use - -## When to Use - -- First-time MCP server setup (replaces password-based auth) -- Rotating credentials (create new, update config, delete old) -- Setting up credentials for a new project scope - -## Workflow - -### Phase 1: Assess Current Auth - -Check what authentication method is currently configured: -- If already using application credentials → offer rotation -- If using password → recommend migration to app credentials - -### Phase 2: Create Application Credential - -Use the `keystone_create_application_credential` MCP tool with: -- **name**: `mcp-server--` naming convention -- **description**: Include purpose, creation date, and scope -- **expires_at**: Recommend 1 year for development, 90 days for production -- **roles**: Use minimum required roles (avoid admin unless needed) - -### Phase 3: Store Secret Securely - -Save the secret to the system keychain: - -**macOS:** -```bash -security add-generic-password -a "" -s "openstack-appcred" -w "" -``` - -**Linux (GNOME Keyring):** -```bash -secret-tool store --label="OpenStack App Credential" service openstack-appcred account "" -``` - -### Phase 4: Update MCP Server Configuration - -Generate the Claude Code settings configuration: - -```json -{ - "mcpServers": { - "openstack": { - "command": "/path/to/openstack-mcp-server", - "env": { - "OS_AUTH_URL": "https://identity-3..cloud.sap/v3", - "OS_APPLICATION_CREDENTIAL_ID": "", - "OS_APPCRED_SECRET_CMD": "security find-generic-password -a -s openstack-appcred -w", - "OS_REGION_NAME": "" - } - } - } -} -``` - -### Phase 5: Verify - -1. Restart Claude Code (to reload MCP server) -2. Test with `keystone_token_info` to confirm authentication works -3. 
If rotating: delete the old credential with `keystone_delete_application_credential` - -## Best Practices - -| Practice | Rationale | -|----------|-----------| -| One credential per purpose | Easy to revoke without affecting other services | -| Descriptive names | `mcp-server-cc-demo-qa-de-1` not `my-cred` | -| Set expiration | Force periodic rotation, limit blast radius | -| Minimum roles | Don't use admin roles for read-only MCP server access | -| Keychain storage | Never store secrets in plain text files | -| Test before deleting old | Verify new credential works before revoking old one | - -## Naming Convention - -``` -mcp-server-- -``` - -Examples: -- `mcp-server-cc-demo-qa-de-1` -- `mcp-server-platform-eu-de-2` -- `mcp-server-network-na-us-1` diff --git a/skills/credential-setup/references/auth-methods.md b/skills/credential-setup/references/auth-methods.md deleted file mode 100644 index 1484f2c..0000000 --- a/skills/credential-setup/references/auth-methods.md +++ /dev/null @@ -1,53 +0,0 @@ -# OpenStack Authentication Methods - -## Comparison for MCP Server Use - -| Method | Security | Convenience | Recommended | -|--------|----------|-------------|-------------| -| Application Credential | High (scoped, revocable, no password) | High (set once, forget) | **Yes** | -| Password + OS_PW_CMD | Medium (password in keychain) | Medium (keychain access prompts) | Acceptable | -| Password in env/config | Low (plaintext in files) | High | **Never** | -| Token (manual) | Low (expires, not auto-refreshed) | Low | Never | - -## Application Credentials - -Application credentials are the recommended authentication method for MCP servers: - -- **Scoped**: Automatically scoped to the project they were created in -- **Revocable**: Delete without changing your password -- **Role-limited**: Can have a subset of your roles -- **Password-free**: Your main password is never stored anywhere -- **Expirable**: Set expiration for automatic rotation enforcement - -### Limitations - -- 
Cannot create other application credentials (unless `unrestricted: true`) -- Cannot change your password -- Scoped to exactly one project (create multiple for multi-project access) -- If your user account is disabled, all app credentials stop working - -## OS_PW_CMD Pattern - -For password-based auth, use the command pattern to avoid storing passwords: - -```bash -# macOS Keychain -OS_PW_CMD="security find-generic-password -a -s openstack -w" - -# Linux (pass) -OS_PW_CMD="pass show openstack/" - -# Linux (GNOME Keyring) -OS_PW_CMD="secret-tool lookup service openstack account " - -# 1Password CLI -OS_PW_CMD="op item get 'OpenStack' --fields password" -``` - -## Migration Path - -1. Start with password + OS_PW_CMD (quickest to set up) -2. Create an application credential using the MCP server itself -3. Switch configuration to use the app credential -4. Verify everything works -5. Done — your password is no longer needed in the MCP config diff --git a/skills/credential-setup/references/security-checklist.md b/skills/credential-setup/references/security-checklist.md deleted file mode 100644 index d359f48..0000000 --- a/skills/credential-setup/references/security-checklist.md +++ /dev/null @@ -1,41 +0,0 @@ -# Security Checklist for Application Credentials - -## Before Creating - -- [ ] Identify minimum required roles for the MCP server's use case -- [ ] Choose appropriate expiration (90 days production, 1 year development) -- [ ] Verify you're in the correct project scope -- [ ] Confirm system keychain is accessible - -## During Creation - -- [ ] Use descriptive name: `mcp-server--` -- [ ] Include description with purpose and date -- [ ] Specify roles explicitly (don't inherit all roles unless needed) -- [ ] Save the secret IMMEDIATELY (only shown once) - -## After Creation - -- [ ] Store secret in system keychain (never in plain text) -- [ ] Update MCP server config to use OS_APPLICATION_CREDENTIAL_ID -- [ ] Use OS_APPCRED_SECRET_CMD to retrieve from keychain -- [ ] 
Test authentication works with new credential -- [ ] Delete any old/unused credentials -- [ ] Verify old credential is revoked (test should fail) - -## Ongoing - -- [ ] Monitor expiration dates -- [ ] Rotate before expiry (create new → verify → delete old) -- [ ] Audit unused credentials periodically (`keystone_list_application_credentials`) -- [ ] Review roles — remove any no longer needed - -## Red Flags - -| Situation | Action | -|-----------|--------| -| Secret stored in config file | Migrate to keychain immediately | -| Credential with no expiration in production | Set expiration, schedule rotation | -| Credential with `unrestricted: true` | Delete and recreate without unrestricted | -| Multiple credentials with same roles | Consolidate to one per purpose | -| Credential still active after user leaves | Delete immediately | diff --git a/tools/validate.py b/tools/validate.py new file mode 100644 index 0000000..bf2131b --- /dev/null +++ b/tools/validate.py @@ -0,0 +1,198 @@ +#!/usr/bin/env python3 +"""Validate manifests, skill frontmatter, and MCP configs. + +Stdlib-only. Exit 0 on success, non-zero on failure. 
# Usage:
#     python3 tools/validate.py              # validate everything
#     python3 tools/validate.py --plugin X   # validate one plugin
from __future__ import annotations

import argparse
import json
import re
import sys
from pathlib import Path

REPO_ROOT = Path(__file__).resolve().parent.parent
KEBAB_RE = re.compile(r"^[a-z][a-z0-9]*(-[a-z0-9]+)*$")

# YAML block-scalar headers; the scalar's text lives on the indented lines that follow.
_BLOCK_SCALAR_MARKERS = frozenset({">", "|", ">-", "|-", ">+", "|+"})

# Every problem reported so far; main() exits non-zero when this is non-empty.
errors: list[str] = []


def error(msg: str) -> None:
    """Record a validation failure in ``errors`` and echo it to stderr."""
    errors.append(msg)
    print(f" ERROR: {msg}", file=sys.stderr)


def _rel(path: Path) -> Path:
    """Return *path* relative to the repo root, or unchanged if it lies outside it.

    Used only for messages, so a path outside the repo must never turn an
    error report into a ``ValueError``.
    """
    try:
        return path.relative_to(REPO_ROOT)
    except ValueError:
        return path


def _read_text(path: Path) -> str | None:
    """Read *path* as UTF-8; report unreadable/undecodable files and return None."""
    try:
        # Explicit encoding: the default depends on the locale of the machine.
        return path.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError) as e:
        error(f"Cannot read {_rel(path)}: {e}")
        return None


def validate_json(path: Path, required_keys: list[str]) -> dict | None:
    """Validate that a JSON file exists, parses to an object, and has required keys.

    Returns the parsed object (even when some required keys are missing, each
    of which is reported), or None when the file is missing, unreadable, not
    valid JSON, or its top-level value is not an object.
    """
    if not path.exists():
        error(f"Missing file: {_rel(path)}")
        return None
    text = _read_text(path)
    if text is None:
        return None
    try:
        data = json.loads(text)
    except json.JSONDecodeError as e:
        error(f"Invalid JSON in {_rel(path)}: {e}")
        return None
    # Callers use dict methods on the result, so anything else is a failure here.
    if not isinstance(data, dict):
        error(f"Top-level JSON value must be an object in {_rel(path)}")
        return None
    for key in required_keys:
        if key not in data:
            error(f"Missing key '{key}' in {_rel(path)}")
    return data


def _extract_frontmatter(text: str) -> str | None:
    """Return the raw text between the ``---`` delimiters, or None if unterminated.

    The caller has already checked that *text* starts with ``"---\\n"``.
    """
    # Search from index 3 (the newline that ends the opening delimiter) so an
    # empty frontmatter block, where the closing delimiter follows at once,
    # is still found.
    end = text.find("\n---\n", 3)
    if end == -1 and text.endswith("\n---"):
        # Closing delimiter is the last line of a file with no trailing newline.
        end = len(text) - 4
    if end == -1:
        return None
    return text[4:end]


def _parse_frontmatter(frontmatter: str) -> dict[str, str]:
    """Parse top-level ``key: value`` pairs with a minimal, stdlib-only reader.

    Indented lines are folded (space-joined) into the value of the preceding
    top-level key, which covers block scalars and wrapped values. Nested
    structure is not interpreted.
    """
    fm: dict[str, str] = {}
    current_key = None
    for line in frontmatter.splitlines():
        indented = line.startswith((" ", "\t"))
        if ":" in line and not indented:
            key, _, value = line.partition(":")
            value = value.strip().strip('"').strip("'")
            if value in _BLOCK_SCALAR_MARKERS:
                value = ""
            current_key = key.strip()
            fm[current_key] = value
        elif current_key and indented:
            fm[current_key] = (fm[current_key] + " " + line.strip()).strip()
    return fm


def validate_skill_frontmatter(skill_md: Path) -> None:
    """Validate that SKILL.md has frontmatter with a usable name and description.

    Checks: frontmatter present and terminated; ``name`` is kebab-case, at
    most 64 characters and equal to the containing directory name;
    ``description`` is at least 20 characters; the file is at most 700 lines
    (a warning is printed above 500).
    """
    text = _read_text(skill_md)
    if text is None:
        return
    if not text.startswith("---\n"):
        error(f"Missing YAML frontmatter in {_rel(skill_md)}")
        return

    frontmatter = _extract_frontmatter(text)
    if frontmatter is None:
        error(f"Unterminated frontmatter in {_rel(skill_md)}")
        return

    fm = _parse_frontmatter(frontmatter)
    name = fm.get("name")
    desc = fm.get("description")

    if not name:
        error(f"Missing 'name' in frontmatter: {_rel(skill_md)}")
    elif not KEBAB_RE.match(name):
        error(f"Name '{name}' is not kebab-case in {_rel(skill_md)}")
    elif len(name) > 64:
        error(f"Name exceeds 64 chars in {_rel(skill_md)}")
    else:
        expected_dir = skill_md.parent.name
        if name != expected_dir:
            error(f"Name '{name}' does not match directory '{expected_dir}' in {_rel(skill_md)}")

    if not desc:
        error(f"Missing 'description' in frontmatter: {_rel(skill_md)}")
    elif len(desc) < 20:
        error(f"Description too short (<20 chars) in {_rel(skill_md)}")

    # Check line count
    line_count = len(text.splitlines())
    if line_count > 700:
        error(f"SKILL.md exceeds 700 lines ({line_count}) in {_rel(skill_md)} — extract to references/")
    elif line_count > 500:
        print(f" WARN: SKILL.md is {line_count} lines in {_rel(skill_md)} — consider extracting to references/")


def validate_marketplace(path: Path) -> None:
    """Validate a marketplace manifest and check that local plugin sources exist.

    Only string sources are resolved against the repo root. Object-style
    sources are not path-checked because they are not plain repo-relative
    paths.
    """
    print(f"Validating marketplace: {_rel(path)}")
    data = validate_json(path, ["name", "plugins"])
    if data is None:
        return
    plugins = data.get("plugins", [])
    if not isinstance(plugins, list):
        error(f"'plugins' must be a list in {_rel(path)}")
        return
    for plugin in plugins:
        if not isinstance(plugin, dict) or "name" not in plugin:
            error(f"Plugin missing 'name' in {_rel(path)}")
            continue
        source = plugin.get("source", "")
        if isinstance(source, str) and source:
            resolved = (REPO_ROOT / source).resolve()
            if not resolved.is_dir():
                error(f"Plugin source '{source}' does not exist for '{plugin['name']}'")


def validate_plugin(plugin_dir: Path) -> None:
    """Validate a single plugin's manifests, MCP config, and skills."""
    name = plugin_dir.name
    print(f"Validating plugin: {name}")

    # Claude Code manifest
    validate_json(plugin_dir / ".claude-plugin" / "plugin.json", ["name"])

    # Codex manifest (optional, but validate if present)
    codex_json = plugin_dir / ".codex-plugin" / "plugin.json"
    if codex_json.exists():
        validate_json(codex_json, ["name"])

    # MCP config
    mcp_path = plugin_dir / ".mcp.json"
    if mcp_path.exists():
        data = validate_json(mcp_path, ["mcpServers"])
        if data:
            servers = data.get("mcpServers", {})
            if not isinstance(servers, dict):
                error(f"'mcpServers' must be an object in {_rel(mcp_path)}")
            else:
                for srv_name, srv in servers.items():
                    # A non-object entry cannot carry a command either.
                    if not isinstance(srv, dict) or "command" not in srv:
                        error(f"MCP server '{srv_name}' missing 'command'")

    # Skills in this plugin
    skills_dir = plugin_dir / "skills"
    if skills_dir.is_dir():
        for skill_dir in sorted(skills_dir.iterdir()):
            skill_md = skill_dir / "SKILL.md"
            if skill_dir.is_dir() and skill_md.exists():
                print(f" Validating skill: {skill_dir.name}")
                validate_skill_frontmatter(skill_md)


def validate_top_level_skills() -> None:
    """Validate every SKILL.md found anywhere under the top-level skills/ directory."""
    skills_dir = REPO_ROOT / "skills"
    if not skills_dir.is_dir():
        return
    for skill_md in sorted(skills_dir.rglob("SKILL.md")):
        skill_dir = skill_md.parent
        print(f"Validating skill: {_rel(skill_dir)}")
        validate_skill_frontmatter(skill_md)


def main() -> None:
    """Run the validators and exit 1 if any error was recorded.

    With ``--plugin X`` only ``plugins/X`` is validated; otherwise both
    marketplace manifests, all plugins, and top-level skills are validated.
    """
    parser = argparse.ArgumentParser(description="Validate repo manifests and skills")
    parser.add_argument("--plugin", help="Validate only this plugin")
    args = parser.parse_args()

    plugins_dir = REPO_ROOT / "plugins"

    if args.plugin:
        plugin_dir = plugins_dir / args.plugin
        if not plugin_dir.is_dir():
            print(f"Plugin not found: {args.plugin}", file=sys.stderr)
            sys.exit(1)
        validate_plugin(plugin_dir)
    else:
        # Marketplace manifest (Claude Code)
        validate_marketplace(REPO_ROOT / ".claude-plugin" / "marketplace.json")

        # Marketplace manifest (Agents/Codex)
        agents_marketplace = REPO_ROOT / ".agents" / "plugins" / "marketplace.json"
        if agents_marketplace.exists():
            print(f"Validating agents marketplace: {_rel(agents_marketplace)}")
            validate_json(agents_marketplace, ["name", "plugins"])

        # All plugins
        if plugins_dir.is_dir():
            for plugin_dir in sorted(plugins_dir.iterdir()):
                if plugin_dir.is_dir():
                    validate_plugin(plugin_dir)

        # Top-level skills
        validate_top_level_skills()

    if errors:
        print(f"\nValidation failed with {len(errors)} error(s).", file=sys.stderr)
        sys.exit(1)
    else:
        print("\nAll validations passed.")


if __name__ == "__main__":
    main()