From d1a57f7dd570342ae0fa9775a5aac8297da46f0e Mon Sep 17 00:00:00 2001 From: Nbarola Date: Wed, 6 Aug 2025 13:57:22 +0530 Subject: [PATCH 1/6] add codebundle for acr --- .../azure-acr-health/.test/Taskfile.yaml | 358 ++++++++++++++++++ .../.test/terraform/Taskfile.yaml | 69 ++++ .../azure-acr-health/.test/terraform/main.tf | 34 ++ .../.test/terraform/variables.tf | 20 + .../azure-acr-health/acr_reachability.sh | 117 ++++++ codebundles/azure-acr-health/runbook.robot | 113 ++++++ codebundles/azure-acr-health/sli.robot | 73 ++++ 7 files changed, 784 insertions(+) create mode 100644 codebundles/azure-acr-health/.test/Taskfile.yaml create mode 100644 codebundles/azure-acr-health/.test/terraform/Taskfile.yaml create mode 100644 codebundles/azure-acr-health/.test/terraform/main.tf create mode 100644 codebundles/azure-acr-health/.test/terraform/variables.tf create mode 100644 codebundles/azure-acr-health/acr_reachability.sh create mode 100644 codebundles/azure-acr-health/runbook.robot create mode 100644 codebundles/azure-acr-health/sli.robot diff --git a/codebundles/azure-acr-health/.test/Taskfile.yaml b/codebundles/azure-acr-health/.test/Taskfile.yaml new file mode 100644 index 000000000..e68a1fa43 --- /dev/null +++ b/codebundles/azure-acr-health/.test/Taskfile.yaml @@ -0,0 +1,358 @@ +version: "3" + +tasks: + default: + desc: "Run/refresh config" + cmds: + - task: check-unpushed-commits + - task: generate-rwl-config + - task: run-rwl-discovery + + clean: + desc: "Run cleanup tasks" + cmds: + - task: check-and-cleanup-terraform + - task: delete-slxs + - task: clean-rwl-discovery + + build-infra: + desc: "Build test infrastructure" + cmds: + - task: build-terraform-infra + + check-unpushed-commits: + desc: Check if outstanding commits or file updates need to be pushed before testing. + vars: + BASE_DIR: "../" + cmds: + - | + echo "Checking for uncommitted changes in $BASE_DIR and $BASE_DIR.runwhen, excluding '.test'..." 
+ UNCOMMITTED_FILES=$(git diff --name-only HEAD | grep -E "^${BASE_DIR}(\.runwhen|[^/]+)" | grep -v "/\.test/" || true) + if [ -n "$UNCOMMITTED_FILES" ]; then + echo "✗" + echo "Uncommitted changes found:" + echo "$UNCOMMITTED_FILES" + echo "Remember to commit & push changes before executing the 'run-rwl-discovery' task." + echo "------------" + exit 1 + else + echo "√" + echo "No uncommitted changes in specified directories." + echo "------------" + fi + - | + echo "Checking for unpushed commits in $BASE_DIR and $BASE_DIR.runwhen, excluding '.test'..." + git fetch origin + UNPUSHED_FILES=$(git diff --name-only origin/$(git rev-parse --abbrev-ref HEAD) HEAD | grep -E "^${BASE_DIR}(\.runwhen|[^/]+)" | grep -v "/\.test/" || true) + if [ -n "$UNPUSHED_FILES" ]; then + echo "✗" + echo "Unpushed commits found:" + echo "$UNPUSHED_FILES" + echo "Remember to push changes before executing the 'run-rwl-discovery' task." + echo "------------" + exit 1 + else + echo "√" + echo "No unpushed commits in specified directories." + echo "------------" + fi + silent: true + + generate-rwl-config: + desc: "Generate RunWhen Local configuration (workspaceInfo.yaml)" + env: + ARM_SUBSCRIPTION_ID: "{{.ARM_SUBSCRIPTION_ID}}" + AZ_TENANT_ID: "{{.AZ_TENANT_ID}}" + AZ_CLIENT_SECRET: "{{.AZ_CLIENT_SECRET}}" + AZ_CLIENT_ID: "{{.AZ_CLIENT_ID}}" + RW_WORKSPACE: '{{.RW_WORKSPACE | default "my-workspace"}}' + cmds: + - | + source terraform/tf.secret + repo_url=$(git config --get remote.origin.url) + branch_name=$(git rev-parse --abbrev-ref HEAD) + codebundle=$(basename "$(dirname "$PWD")") + AZURE_SUBSCRIPTION_ID=$ARM_SUBSCRIPTION_ID + pushd terraform > /dev/null + resource_group=$(terraform show -json terraform.tfstate | jq -r ' + .values.root_module.resources[] | + select(.type == "azurerm_resource_group") | .values.name') + popd > /dev/null + if [ -z "$resource_group" ]; then + echo "Error: Missing resource_group details. Ensure Terraform plan has been applied." 
+ exit 1 + fi + source terraform/tf.secret + cat <<EOF > workspaceInfo.yaml + workspaceName: "$RW_WORKSPACE" + workspaceOwnerEmail: authors@runwhen.com + defaultLocation: location-01-us-west1 + defaultLOD: detailed + cloudConfig: + azure: + subscriptionId: "$ARM_SUBSCRIPTION_ID" + tenantId: "$AZ_TENANT_ID" + clientId: "$AZ_CLIENT_ID" + clientSecret: "$AZ_CLIENT_SECRET" + resourceGroupLevelOfDetails: + $resource_group: detailed + codeCollections: + - repoURL: "$repo_url" + branch: "$branch_name" + codeBundles: ["$codebundle"] + EOF + silent: true + + run-rwl-discovery: + desc: "Run RunWhen Local Discovery on test infrastructure" + cmds: + - | + source terraform/tf.secret + CONTAINER_NAME="RunWhenLocal" + if docker ps -q --filter "name=$CONTAINER_NAME" | grep -q .; then + echo "Stopping and removing existing container $CONTAINER_NAME..." + docker stop $CONTAINER_NAME && docker rm $CONTAINER_NAME + elif docker ps -a -q --filter "name=$CONTAINER_NAME" | grep -q .; then + echo "Removing existing stopped container $CONTAINER_NAME..." + docker rm $CONTAINER_NAME + else + echo "No existing container named $CONTAINER_NAME found." + fi + echo "Cleaning up output directory..." + sudo rm -rf output || { echo "Failed to remove output directory"; exit 1; } + mkdir output && chmod 777 output || { echo "Failed to set permissions"; exit 1; } + echo "Starting new container $CONTAINER_NAME..." + docker run --name $CONTAINER_NAME -p 8081:8081 -v "$(pwd)":/shared -d ghcr.io/runwhen-contrib/runwhen-local:latest || { + echo "Failed to start container"; exit 1; + } + echo "Running workspace builder script in container..." + docker exec -w /workspace-builder $CONTAINER_NAME ./run.sh $1 --verbose || { + echo "Error executing script in container"; exit 1; + } + echo "Review generated config files under output/workspaces/" + silent: true + + validate-generation-rules: + desc: "Validate YAML files in .runwhen/generation-rules" + cmds: + - | + for cmd in curl yq ajv; do + if ! 
command -v $cmd &> /dev/null; then + echo "Error: $cmd is required but not installed." + exit 1 + fi + done + temp_dir=$(mktemp -d) + curl -s -o "$temp_dir/generation-rule-schema.json" https://raw.githubusercontent.com/runwhen-contrib/runwhen-local/refs/heads/main/src/generation-rule-schema.json + for yaml_file in ../.runwhen/generation-rules/*.yaml; do + echo "Validating $yaml_file" + json_file="$temp_dir/$(basename "${yaml_file%.*}.json")" + yq -o=json "$yaml_file" > "$json_file" + ajv validate -s "$temp_dir/generation-rule-schema.json" -d "$json_file" --spec=draft2020 --strict=false \ + && echo "$yaml_file is valid." || echo "$yaml_file is invalid." + done + rm -rf "$temp_dir" + silent: true + + check-rwp-config: + desc: Check if env vars are set for RunWhen Platform + cmds: + - | + source terraform/tf.secret + missing_vars=() + if [ -z "$RW_WORKSPACE" ]; then + missing_vars+=("RW_WORKSPACE") + fi + if [ -z "$RW_API_URL" ]; then + missing_vars+=("RW_API_URL") + fi + if [ -z "$RW_PAT" ]; then + missing_vars+=("RW_PAT") + fi + if [ ${#missing_vars[@]} -ne 0 ]; then + echo "The following required environment variables are missing: ${missing_vars[*]}" + exit 1 + fi + silent: true + + upload-slxs: + desc: "Upload SLX files to the appropriate URL" + env: + RW_WORKSPACE: "{{.RW_WORKSPACE}}" + RW_API_URL: "{{.RW_API}}" + RW_PAT: "{{.RW_PAT}}" + cmds: + - task: check-rwp-config + - | + source terraform/tf.secret + BASE_DIR="output/workspaces/${RW_WORKSPACE}/slxs" + if [ ! -d "$BASE_DIR" ]; then + echo "Directory $BASE_DIR does not exist. Upload aborted." 
+ exit 1 + fi + URL="https://${RW_API_URL}/api/v3/workspaces/${RW_WORKSPACE}/secrets" + PAYLOAD="{\"secrets\": {\"az_subscriptionId\": \"${ARM_SUBSCRIPTION_ID}\", \"az_clientId\": \"${AZ_CLIENT_ID}\", \"az_tenantId\": \"${AZ_TENANT_ID}\", \"az_clientSecret\": \"${AZ_CLIENT_SECRET}\"}}" + echo "Uploading secrets to $URL" + response_code=$(curl -X POST "$URL" \ + -H "Authorization: Bearer $RW_PAT" \ + -H "Content-Type: application/json" \ + -d "$PAYLOAD" \ + -w "%{http_code}" -o /dev/null -s) + if [[ "$response_code" == "200" || "$response_code" == "201" ]]; then + echo "Successfully uploaded secrets to $URL" + else + echo "Failed to upload secrets to $URL. Unexpected response code: $response_code" + fi + for dir in "$BASE_DIR"/*; do + if [ -d "$dir" ]; then + SLX_NAME=$(basename "$dir") + PAYLOAD=$(jq -n --arg commitMsg "Creating new SLX $SLX_NAME" '{ commitMsg: $commitMsg, files: {} }') + for file in slx.yaml runbook.yaml sli.yaml; do + if [ -f "$dir/$file" ]; then + CONTENT=$(cat "$dir/$file") + PAYLOAD=$(echo "$PAYLOAD" | jq --arg fileContent "$CONTENT" --arg fileName "$file" '.files[$fileName] = $fileContent') + fi + done + URL="https://${RW_API_URL}/api/v3/workspaces/${RW_WORKSPACE}/branches/main/slxs/${SLX_NAME}" + echo "Uploading SLX: $SLX_NAME to $URL" + response=$(curl -v -X POST "$URL" \ + -H "Authorization: Bearer $RW_PAT" \ + -H "Content-Type: application/json" \ + -d "$PAYLOAD" -w "%{http_code}" -o /dev/null -s 2>&1) + if [[ "$response" =~ 200|201 ]]; then + echo "Successfully uploaded SLX: $SLX_NAME to $URL" + else + echo "Failed to upload SLX: $SLX_NAME to $URL. Response:" + echo "$response" + fi + fi + done + silent: true + + delete-slxs: + desc: "Delete SLX objects from the appropriate URL" + env: + RW_WORKSPACE: '{{.RW_WORKSPACE | default "my-workspace"}}' + RW_API_URL: "{{.RW_API}}" + RW_PAT: "{{.RW_PAT}}" + cmds: + - task: check-rwp-config + - | + source terraform/tf.secret + BASE_DIR="output/workspaces/${RW_WORKSPACE}/slxs" + if [ ! 
-d "$BASE_DIR" ]; then + echo "Directory $BASE_DIR does not exist. Deletion aborted." + exit 1 + fi + for dir in "$BASE_DIR"/*; do + if [ -d "$dir" ]; then + SLX_NAME=$(basename "$dir") + URL="https://${RW_API_URL}/api/v3/workspaces/${RW_WORKSPACE}/branches/main/slxs/${SLX_NAME}" + echo "Deleting SLX: $SLX_NAME from $URL" + response=$(curl -v -X DELETE "$URL" \ + -H "Authorization: Bearer $RW_PAT" \ + -H "Content-Type: application/json" -w "%{http_code}" -o /dev/null -s 2>&1) + if [[ "$response" =~ 200|204 ]]; then + echo "Successfully deleted SLX: $SLX_NAME from $URL" + else + echo "Failed to delete SLX: $SLX_NAME from $URL. Response:" + echo "$response" + fi + fi + done + silent: true + + check-terraform-infra: + desc: "Check if Terraform has any deployed infrastructure in the terraform subdirectory" + cmds: + - | + source terraform/tf.secret + export TF_VAR_sp_principal_id=$(az ad sp show --id $AZ_CLIENT_ID --query id -o tsv) + export TF_VAR_subscription_id=$ARM_SUBSCRIPTION_ID + export TF_VAR_tenant_id=$AZ_TENANT_ID + if [ ! -d "terraform" ]; then + echo "Terraform directory not found." + exit 1 + fi + cd terraform + if [ ! -f "terraform.tfstate" ]; then + echo "No Terraform state file found in the terraform directory. No infrastructure is deployed." + exit 0 + fi + resources=$(terraform state list) + if [ -n "$resources" ]; then + echo "Deployed infrastructure detected." + echo "$resources" + exit 0 + else + echo "No deployed infrastructure found in Terraform state." + exit 0 + fi + silent: true + + build-terraform-infra: + desc: "Run terraform apply" + cmds: + - | + source terraform/tf.secret + export TF_VAR_sp_principal_id=$(az ad sp show --id $AZ_CLIENT_ID --query id -o tsv) + export TF_VAR_subscription_id=$ARM_SUBSCRIPTION_ID + export TF_VAR_tenant_id=$AZ_TENANT_ID + if [ -d "terraform" ]; then + cd terraform + else + echo "Terraform directory not found. Terraform apply aborted." 
+ exit 1 + fi + task format-and-init-terraform + echo "Starting Terraform Build of Terraform infrastructure..." + terraform apply -auto-approve || { + echo "Failed to build Terraform infrastructure." + exit 1 + } + echo "Terraform infrastructure build completed." + silent: true + + cleanup-terraform-infra: + desc: "Cleanup deployed Terraform infrastructure" + cmds: + - | + source terraform/tf.secret + export TF_VAR_sp_principal_id=$(az ad sp show --id $AZ_CLIENT_ID --query id -o tsv) + export TF_VAR_subscription_id=$ARM_SUBSCRIPTION_ID + export TF_VAR_tenant_id=$AZ_TENANT_ID + if [ -d "terraform" ]; then + cd terraform + else + echo "Terraform directory not found. Cleanup aborted." + exit 1 + fi + echo "Starting cleanup of Terraform infrastructure..." + terraform destroy -auto-approve || { + echo "Failed to clean up Terraform infrastructure." + exit 1 + } + echo "Terraform infrastructure cleanup completed." + silent: true + + check-and-cleanup-terraform: + desc: "Check and clean up deployed Terraform infrastructure if it exists" + cmds: + - | + infra_output=$(task check-terraform-infra | tee /dev/tty) + if echo "$infra_output" | grep -q "Deployed infrastructure detected"; then + echo "Infrastructure detected; proceeding with cleanup." + task cleanup-terraform-infra + else + echo "No deployed infrastructure found; no cleanup required." + fi + silent: true + + clean-rwl-discovery: + desc: "Check and clean up RunWhen Local discovery output" + cmds: + - | + sudo rm -rf output + rm -f workspaceInfo.yaml + silent: true + diff --git a/codebundles/azure-acr-health/.test/terraform/Taskfile.yaml b/codebundles/azure-acr-health/.test/terraform/Taskfile.yaml new file mode 100644 index 000000000..4222e7864 --- /dev/null +++ b/codebundles/azure-acr-health/.test/terraform/Taskfile.yaml @@ -0,0 +1,69 @@ +version: '3' + +env: + TERM: screen-256color + +tasks: + default: + cmds: + - task: test + + test: + desc: Run tests. 
+ cmds: + - task: test-terraform + + clean: + desc: Clean the environment. + cmds: + - task: clean-terraform + + clean-terraform: + desc: Clean the terraform environment (remove terraform directories and files) + cmds: + - find . -type d -name .terraform -exec rm -rf {} + + - find . -type f -name .terraform.lock.hcl -delete + + format-and-init-terraform: + desc: Run Terraform fmt and init + cmds: + - | + terraform fmt + terraform init + + test-terraform: + desc: Run tests for all terraform directories. + silent: true + env: + DIRECTORIES: + sh: find . -path '*/.terraform/*' -prune -o -name '*.tf' -type f -exec dirname {} \; | sort -u + cmds: + - | + BOLD=$(tput bold) + NORM=$(tput sgr0) + + CWD=$PWD + + for d in $DIRECTORIES; do + cd $d + echo "${BOLD}$PWD:${NORM}" + if ! terraform fmt -check=true -list=false -recursive=false; then + echo " ✗ terraform fmt" && exit 1 + else + echo " √ terraform fmt" + fi + + if ! terraform init -backend=false -input=false -get=true -no-color > /dev/null; then + echo " ✗ terraform init" && exit 1 + else + echo " √ terraform init" + fi + + if ! 
terraform validate > /dev/null; then + echo " ✗ terraform validate" && exit 1 + else + echo " √ terraform validate" + fi + + cd $CWD + done \ No newline at end of file diff --git a/codebundles/azure-acr-health/.test/terraform/main.tf b/codebundles/azure-acr-health/.test/terraform/main.tf new file mode 100644 index 000000000..21ac8c205 --- /dev/null +++ b/codebundles/azure-acr-health/.test/terraform/main.tf @@ -0,0 +1,34 @@ +provider "azurerm" { + features {} +} + +resource "azurerm_resource_group" "acr_rg" { + name = var.resource_group + location = var.location +} + +resource "azurerm_container_registry" "demo_acr" { + name = "uniqueacrname12345" # must be globally unique, change as needed + resource_group_name = azurerm_resource_group.acr_rg.name + location = azurerm_resource_group.acr_rg.location + sku = "Basic" + admin_enabled = true +} + +output "acr_name" { + value = azurerm_container_registry.demo_acr.name +} + +# output "acr_admin_username" { +# value = azurerm_container_registry.demo_acr.admin_username +# sensitive = true +# } + +# output "acr_admin_password" { +# value = azurerm_container_registry.demo_acr.admin_password +# sensitive = true +# } + +output "acr_login_server" { + value = azurerm_container_registry.demo_acr.login_server +} diff --git a/codebundles/azure-acr-health/.test/terraform/variables.tf b/codebundles/azure-acr-health/.test/terraform/variables.tf new file mode 100644 index 000000000..69791d69f --- /dev/null +++ b/codebundles/azure-acr-health/.test/terraform/variables.tf @@ -0,0 +1,20 @@ +variable "resource_group" { + description = "Name of the resource group" + type = string + default = "test-acr-rg" +} + +variable "location" { + description = "Azure region for resources" + type = string + default = "eastus" +} + +variable "tags" { + description = "Tags to apply to resources" + type = map(string) + default = { + environment = "test" + purpose = "acr-health-testing" + } +} \ No newline at end of file diff --git 
a/codebundles/azure-acr-health/acr_reachability.sh b/codebundles/azure-acr-health/acr_reachability.sh new file mode 100644 index 000000000..5a044879b --- /dev/null +++ b/codebundles/azure-acr-health/acr_reachability.sh @@ -0,0 +1,117 @@ +#!/bin/bash +# Check Azure Container Registry reachability and next steps + +SUBSCRIPTION_ID="${AZURE_SUBSCRIPTION_ID:-}" +RESOURCE_GROUP="${AZ_RESOURCE_GROUP:-}" +ACR_NAME="${ACR_NAME:-}" + +ISSUES_FILE="reachability_issues.json" +echo '[]' > "$ISSUES_FILE" + +add_issue() { + local title="$1" + local severity="$2" + local expected="$3" + local actual="$4" + local details="$5" + local next_steps="$6" + details=$(echo "$details" | sed 's/"/\\"/g' | sed ':a;N;$!ba;s/\n/\\n/g') + next_steps=$(echo "$next_steps" | sed 's/"/\\"/g' | sed ':a;N;$!ba;s/\n/\\n/g') + local issue="{\"title\":\"$title\",\"severity\":$severity,\"expected\":\"$expected\",\"actual\":\"$actual\",\"details\":\"$details\",\"next_steps\":\"$next_steps\"}" + jq ". += [${issue}]" "$ISSUES_FILE" > temp.json && mv temp.json "$ISSUES_FILE" +} + +if [ -z "$SUBSCRIPTION_ID" ] || [ -z "$RESOURCE_GROUP" ] || [ -z "$ACR_NAME" ]; then + missing_vars=() + [ -z "$SUBSCRIPTION_ID" ] && missing_vars+=("AZURE_SUBSCRIPTION_ID") + [ -z "$RESOURCE_GROUP" ] && missing_vars+=("AZ_RESOURCE_GROUP") + [ -z "$ACR_NAME" ] && missing_vars+=("ACR_NAME") + echo "Missing required environment variables: ${missing_vars[*]}" + echo '{"error": "Required environment variables not set"}' + exit 1 +fi + +if ! az account show --subscription "$SUBSCRIPTION_ID" >/dev/null 2>&1; then + add_issue \ + "Azure authentication failed for subscription $SUBSCRIPTION_ID" \ + 4 \ + "Azure CLI should authenticate successfully" \ + "Azure authentication failed for subscription $SUBSCRIPTION_ID" \ + "SUBSCRIPTION_ID: $SUBSCRIPTION_ID" \ + "Check Azure credentials and login with 'az login' or set the correct subscription." 
+ echo '{"error": "Azure authentication failed"}' + exit 1 +fi + +az account set --subscription "$SUBSCRIPTION_ID" + +acr_info=$(az acr show --name "$ACR_NAME" --resource-group "$RESOURCE_GROUP" -o json 2>az_acr_show_err.log) +if [ $? -ne 0 ] || [ -z "$acr_info" ]; then + # Check for permission error + if grep -q "AuthorizationFailed" az_acr_show_err.log; then + add_issue \ + "Insufficient permissions to access ACR '$ACR_NAME' (RG: '$RESOURCE_GROUP')" \ + 4 \ + "User/service principal should have 'AcrRegistryReader' or higher role on the registry" \ + "az acr show failed due to insufficient permissions" \ + "See az_acr_show_err.log for details" \ + "Assign 'AcrRegistryReader' or higher role to the user/service principal for the registry." + else + add_issue \ + "ACR '$ACR_NAME' (RG: '$RESOURCE_GROUP') is unreachable or not found (Subscription: $SUBSCRIPTION_ID)" \ + 4 \ + "ACR should be reachable and exist in the specified resource group and subscription" \ + "ACR '$ACR_NAME' is unreachable or not found" \ + "Tried: az acr show --name $ACR_NAME --resource-group $RESOURCE_GROUP --subscription $SUBSCRIPTION_ID" \ + "Check if the registry exists, is spelled correctly, and is accessible from your network." + fi + echo '{"status": "unreachable"}' + exit 0 +fi + +# Retrieve admin credentials +admin_creds=$(az acr credential show --name "$ACR_NAME" --resource-group "$RESOURCE_GROUP" -o json 2>az_acr_cred_err.log) +if [ $? -ne 0 ] || [ -z "$admin_creds" ]; then + if grep -q "AuthorizationFailed" az_acr_cred_err.log; then + add_issue \ + "Insufficient permissions to retrieve admin credentials for ACR '$ACR_NAME'" \ + 4 \ + "User/service principal should have 'AcrRegistryReader' or higher role on the registry" \ + "az acr credential show failed due to insufficient permissions" \ + "See az_acr_cred_err.log for details" \ + "Assign 'AcrRegistryReader' or higher role to the user/service principal for the registry." 
+ else + add_issue \ + "Failed to retrieve admin credentials for ACR '$ACR_NAME'" \ + 4 \ + "Should be able to retrieve admin credentials if admin is enabled" \ + "az acr credential show failed" \ + "Tried: az acr credential show --name $ACR_NAME --resource-group $RESOURCE_GROUP" \ + "Check if admin user is enabled and you have sufficient permissions." + fi + echo '{"status": "no_admin_creds"}' + exit 0 +fi + +login_server=$(echo "$acr_info" | jq -r '.loginServer') +admin_username=$(echo "$admin_creds" | jq -r '.username') +admin_password=$(echo "$admin_creds" | jq -r '.passwords[0].value') + +# Attempt docker login +if ! echo "$admin_password" | docker login "$login_server" -u "$admin_username" --password-stdin >docker_login.log 2>&1; then + add_issue \ + "Docker login to ACR '$ACR_NAME' failed" \ + 4 \ + "Should be able to login to the registry using admin credentials" \ + "docker login failed" \ + "See docker_login.log for details" \ + "Check if admin user is enabled, credentials are correct, and Docker is running." + echo '{"status": "docker_login_failed"}' + exit 0 +fi + +# If everything succeeded +rm -f az_acr_show_err.log az_acr_cred_err.log docker_login.log + +echo '{"status": "reachable"}' +echo '[]' > "$ISSUES_FILE" \ No newline at end of file diff --git a/codebundles/azure-acr-health/runbook.robot b/codebundles/azure-acr-health/runbook.robot new file mode 100644 index 000000000..ee4a2605c --- /dev/null +++ b/codebundles/azure-acr-health/runbook.robot @@ -0,0 +1,113 @@ +*** Settings *** +Documentation Runs diagnostic checks against Azure Container Registry (ACR) to monitor reachability, SKU, pull/push success ratio, and storage utilization. 
+Metadata Author Nbarola +Metadata Display Name Azure ACR Health Check +Metadata Supports Azure Container Registry ACR Health + +Library BuiltIn +Library RW.Core +Library RW.CLI +Library Azure +Library RW.platform +Library String +Library OperatingSystem +Library Collections + +Suite Setup Suite Initialization + +*** Tasks *** +Check ACR Reachability for Registry `${ACR_NAME}` + [Documentation] Checks if the ACR endpoint is reachable. + [Tags] access:read-only ACR Azure Reachability Health + ${reachability}= RW.CLI.Run Bash File + ... bash_file=acr_reachability.sh + ... env=${env} + ... timeout_seconds=120 + ... include_in_history=false + ${issues_list}= RW.CLI.Run Cli + ... cmd=cat reachability_issues.json + ${issues}= Evaluate json.loads(r'''${issues_list.stdout}''') json + IF len(@{issues}) > 0 + FOR ${issue} IN @{issues} + RW.Core.Add Issue + ... severity=${issue}["severity"] + ... title=${issue}["title"] + ... expected=${issue}["expected"] + ... actual=${issue}["actual"] + ... reproduce_hint=${issue}.get("reproduce_hint", "") + ... details=${issue}["details"] + ... next_steps=${issue}["next_steps"] + END + END + +Check ACR Usage SKU Metric for Registry `${ACR_NAME}` + [Documentation] Checks the SKU and usage limits for the ACR. + [Tags] access:read-only ACR Azure SKU Health + ${sku}= RW.CLI.Run Bash File + ... bash_file=acr_usage_sku.sh + ... env=${env} + ... timeout_seconds=120 + ... include_in_history=false + ${issues_list}= RW.CLI.Run Cli + ... cmd=cat usage_sku_issues.json + ${issues}= Evaluate json.loads(r'''${issues_list.stdout}''') json + IF len(@{issues}) > 0 + FOR ${issue} IN @{issues} + RW.Core.Add Issue + ... severity=${issue}["severity"] + ... title=${issue}["title"] + ... expected=${issue}["expected"] + ... actual=${issue}["actual"] + ... reproduce_hint=${issue}.get("reproduce_hint", "") + ... details=${issue}["details"] + ... 
next_steps=${issue}["next_steps"] + END + END + +Check ACR Pull/Push Success Ratio for Registry `${ACR_NAME}` + [Documentation] Checks the success rate of image pull and push operations. + [Tags] access:read-only ACR Azure PullPush Health + ${ratio}= RW.CLI.Run Bash File + ... bash_file=acr_pull_push_ratio.sh + ... env=${env} + ... timeout_seconds=180 + ... include_in_history=false + ${issues_list}= RW.CLI.Run Cli + ... cmd=cat pull_push_ratio_issues.json + ${issues}= Evaluate json.loads(r'''${issues_list.stdout}''') json + IF len(@{issues}) > 0 + FOR ${issue} IN @{issues} + RW.Core.Add Issue + ... severity=${issue}["severity"] + ... title=${issue}["title"] + ... expected=${issue}["expected"] + ... actual=${issue}["actual"] + ... reproduce_hint=${issue}.get("reproduce_hint", "") + ... details=${issue}["details"] + ... next_steps=${issue}["next_steps"] + END + END + +Check ACR Storage Utilization for Registry `${ACR_NAME}` + [Documentation] Checks the storage usage of the ACR. + [Tags] access:read-only ACR Azure Storage Health + ${storage}= RW.CLI.Run Bash File + ... bash_file=acr_storage_utilization.sh + ... env=${env} + ... timeout_seconds=120 + ... include_in_history=false + ${issues_list}= RW.CLI.Run Cli + ... cmd=cat storage_utilization_issues.json + ${issues}= Evaluate json.loads(r'''${issues_list.stdout}''') json + IF len(@{issues}) > 0 + FOR ${issue} IN @{issues} + RW.Core.Add Issue + ... severity=${issue}["severity"] + ... title=${issue}["title"] + ... expected=${issue}["expected"] + ... actual=${issue}["actual"] + ... reproduce_hint=${issue}.get("reproduce_hint", "") + ... details=${issue}["details"] + ... 
next_steps=${issue}["next_steps"] + END + END \ No newline at end of file diff --git a/codebundles/azure-acr-health/sli.robot b/codebundles/azure-acr-health/sli.robot new file mode 100644 index 000000000..749594022 --- /dev/null +++ b/codebundles/azure-acr-health/sli.robot @@ -0,0 +1,73 @@ +*** Settings *** +Documentation Calculates Azure ACR health by checking reachability, SKU, pull/push ratio, and storage utilization. +Metadata Author Nbarola +Metadata Display Name Azure ACR Health SLI +Metadata Supports Azure Container Registry ACR Health + +Library BuiltIn +Library RW.Core +Library RW.CLI +Library Azure +Library RW.platform +Library String +Library OperatingSystem +Library Collections + +Suite Setup Suite Initialization + +*** Tasks *** +Check ACR Reachability for Registry `${ACR_NAME}` + [Documentation] Checks if the ACR endpoint is reachable. + [Tags] ACR Azure Reachability Health + ${reachability}= RW.CLI.Run Bash File + ... bash_file=acr_reachability.sh + ... env=${env} + ... timeout_seconds=120 + ... include_in_history=false + ${issues_list}= RW.CLI.Run Cli + ... cmd=cat reachability_issues.json + ${issues}= Evaluate json.loads(r'''${issues_list.stdout}''') json + ${score}= Evaluate 0 if len(@{issues}) > 0 else 1 + Set Global Variable ${reachability_score} ${score} + +Check ACR Usage SKU Metric for Registry `${ACR_NAME}` + [Documentation] Checks the SKU and usage limits for the ACR. + [Tags] ACR Azure SKU Health + ${sku}= RW.CLI.Run Bash File + ... bash_file=acr_usage_sku.sh + ... env=${env} + ... timeout_seconds=120 + ... include_in_history=false + ${issues_list}= RW.CLI.Run Cli + ... cmd=cat usage_sku_issues.json + ${issues}= Evaluate json.loads(r'''${issues_list.stdout}''') json + ${score}= Evaluate 0 if len(@{issues}) > 0 else 1 + Set Global Variable ${sku_score} ${score} + +Check ACR Pull/Push Success Ratio for Registry `${ACR_NAME}` + [Documentation] Checks the success rate of image pull and push operations. 
+ [Tags] ACR Azure PullPush Health + ${ratio}= RW.CLI.Run Bash File + ... bash_file=acr_pull_push_ratio.sh + ... env=${env} + ... timeout_seconds=180 + ... include_in_history=false + ${issues_list}= RW.CLI.Run Cli + ... cmd=cat pull_push_ratio_issues.json + ${issues}= Evaluate json.loads(r'''${issues_list.stdout}''') json + ${score}= Evaluate 0 if len(@{issues}) > 0 else 1 + Set Global Variable ${pull_push_score} ${score} + +Check ACR Storage Utilization for Registry `${ACR_NAME}` + [Documentation] Checks the storage usage of the ACR. + [Tags] ACR Azure Storage Health + ${storage}= RW.CLI.Run Bash File + ... bash_file=acr_storage_utilization.sh + ... env=${env} + ... timeout_seconds=120 + ... include_in_history=false + ${issues_list}= RW.CLI.Run Cli + ... cmd=cat storage_utilization_issues.json + ${issues}= Evaluate json.loads(r'''${issues_list.stdout}''') json + ${score}= Evaluate 0 if len(@{issues}) > 0 else 1 + Set Global Variable ${storage_score} ${score} \ No newline at end of file From 1b904e012a5a2a1d0902b64e29d13e909506a4f9 Mon Sep 17 00:00:00 2001 From: Nbarola Date: Wed, 6 Aug 2025 17:13:03 +0530 Subject: [PATCH 2/6] add acr_authentication script --- .../azure-acr-health/acr_authentication.sh | 118 ++++++++++++++++++ .../azure-acr-health/acr_reachability.sh | 110 ++-------------- codebundles/azure-acr-health/runbook.robot | 104 ++++++++++----- 3 files changed, 204 insertions(+), 128 deletions(-) create mode 100644 codebundles/azure-acr-health/acr_authentication.sh diff --git a/codebundles/azure-acr-health/acr_authentication.sh b/codebundles/azure-acr-health/acr_authentication.sh new file mode 100644 index 000000000..1d0829bf8 --- /dev/null +++ b/codebundles/azure-acr-health/acr_authentication.sh @@ -0,0 +1,118 @@ +#!/bin/bash +# Check Azure Container Registry authentication (admin credential lookup and docker login) and record next steps + +SUBSCRIPTION_ID="${AZURE_SUBSCRIPTION_ID:-}" +RESOURCE_GROUP="${AZ_RESOURCE_GROUP:-}" +ACR_NAME="${ACR_NAME:-}" +ACR_PASSWORD="${ACR_PASSWORD:-}" + 
+ISSUES_FILE="reachability_issues.json" +echo '[]' > "$ISSUES_FILE" + +add_issue() { + local title="$1" + local severity="$2" + local expected="$3" + local actual="$4" + local details="$5" + local next_steps="$6" + local tmp_file + tmp_file=$(mktemp) || return 1 + jq --arg title "$title" --argjson severity "$severity" --arg expected "$expected" --arg actual "$actual" --arg details "$details" --arg next_steps "$next_steps" \ + '. += [{title: $title, severity: $severity, expected: $expected, actual: $actual, details: $details, next_steps: $next_steps}]' "$ISSUES_FILE" > "$tmp_file" && mv "$tmp_file" "$ISSUES_FILE" +} + +if [ -z "$SUBSCRIPTION_ID" ] || [ -z "$RESOURCE_GROUP" ] || [ -z "$ACR_NAME" ]; then + missing_vars=() + [ -z "$SUBSCRIPTION_ID" ] && missing_vars+=("AZURE_SUBSCRIPTION_ID") + [ -z "$RESOURCE_GROUP" ] && missing_vars+=("AZ_RESOURCE_GROUP") + [ -z "$ACR_NAME" ] && missing_vars+=("ACR_NAME") + echo "Missing required environment variables: ${missing_vars[*]}" + echo '{"error": "Required environment variables not set"}' + exit 1 +fi + +if ! az account show --subscription "$SUBSCRIPTION_ID" >/dev/null 2>&1; then + add_issue \ + "Azure authentication failed for subscription $SUBSCRIPTION_ID" \ + 4 \ + "Azure CLI should authenticate successfully" \ + "Azure authentication failed for subscription $SUBSCRIPTION_ID" \ + "SUBSCRIPTION_ID: $SUBSCRIPTION_ID" \ + "Check Azure credentials and login with 'az login' or set the correct subscription." + echo '{"error": "Azure authentication failed"}' + exit 1 +fi + +az account set --subscription "$SUBSCRIPTION_ID" + +acr_info=$(az acr show --name "$ACR_NAME" --resource-group "$RESOURCE_GROUP" -o json 2>az_acr_show_err.log) +if [ $? 
-ne 0 ] || [ -z "$acr_info" ]; then + # Check for permission error + if grep -q "AuthorizationFailed" az_acr_show_err.log; then + add_issue \ + "Insufficient permissions to access ACR '$ACR_NAME' (RG: '$RESOURCE_GROUP')" \ + 4 \ + "User/service principal should have 'AcrRegistryReader' or higher role on the registry" \ + "az acr show failed due to insufficient permissions" \ + "See az_acr_show_err.log for details" \ + "Assign 'AcrRegistryReader' or higher role to the user/service principal for the registry." + else + add_issue \ + "ACR '$ACR_NAME' (RG: '$RESOURCE_GROUP') is unreachable or not found (Subscription: $SUBSCRIPTION_ID)" \ + 4 \ + "ACR should be reachable and exist in the specified resource group and subscription" \ + "ACR '$ACR_NAME' is unreachable or not found" \ + "Tried: az acr show --name $ACR_NAME --resource-group $RESOURCE_GROUP --subscription $SUBSCRIPTION_ID" \ + "Check if the registry exists, is spelled correctly, and is accessible from your network." + fi + echo '{"status": "unreachable"}' + exit 0 +fi + +# Retrieve admin credentials +admin_creds=$(az acr credential show --name "$ACR_NAME" --resource-group "$RESOURCE_GROUP" -o json 2>az_acr_cred_err.log) +if [ $? -ne 0 ] || [ -z "$admin_creds" ]; then + if grep -q "AuthorizationFailed" az_acr_cred_err.log; then + add_issue \ + "Insufficient permissions to retrieve admin credentials for ACR '$ACR_NAME'" \ + 4 \ + "User/service principal should have 'AcrRegistryReader' or higher role on the registry" \ + "az acr credential show failed due to insufficient permissions" \ + "See az_acr_cred_err.log for details" \ + "Assign 'AcrRegistryReader' or higher role to the user/service principal for the registry." 
+ else + add_issue \ + "Failed to retrieve admin credentials for ACR '$ACR_NAME'" \ + 4 \ + "Should be able to retrieve admin credentials if admin is enabled" \ + "az acr credential show failed" \ + "Tried: az acr credential show --name $ACR_NAME --resource-group $RESOURCE_GROUP" \ + "Check if admin user is enabled and you have sufficient permissions." + fi + echo '{"status": "no_admin_creds"}' + exit 0 +fi + +login_server=$(echo "$acr_info" | jq -r '.loginServer') +admin_username=$(echo "$admin_creds" | jq -r '.username') +# admin_password=$(echo "$admin_creds" | jq -r '.passwords[0].value') + +# Attempt docker login +if ! echo "$ACR_PASSWORD" | docker login "$login_server" -u "$admin_username" --password-stdin >docker_login.log 2>&1; then + add_issue \ + "Docker login to ACR '$ACR_NAME' failed" \ + 4 \ + "Should be able to login to the registry using admin credentials" \ + "docker login failed" \ + "See docker_login.log for details" \ + "Check if admin user is enabled, credentials are correct, and Docker is running." + echo '{"status": "docker_login_failed"}' + exit 0 +fi + +# If everything succeeded +rm -f az_acr_show_err.log az_acr_cred_err.log docker_login.log + +echo '{"status": "reachable"}' +echo '[]' > "$ISSUES_FILE" \ No newline at end of file diff --git a/codebundles/azure-acr-health/acr_reachability.sh b/codebundles/azure-acr-health/acr_reachability.sh index 5a044879b..813043481 100644 --- a/codebundles/azure-acr-health/acr_reachability.sh +++ b/codebundles/azure-acr-health/acr_reachability.sh @@ -1,11 +1,7 @@ #!/bin/bash -# Check Azure Container Registry reachability and next steps -SUBSCRIPTION_ID="${AZURE_SUBSCRIPTION_ID:-}" -RESOURCE_GROUP="${AZ_RESOURCE_GROUP:-}" -ACR_NAME="${ACR_NAME:-}" - -ISSUES_FILE="reachability_issues.json" +REGISTRY_NAME=${ACR_NAME:-} +ISSUES_FILE="dns_tls_issues.json" echo '[]' > "$ISSUES_FILE" add_issue() { @@ -21,97 +17,17 @@ add_issue() { jq ". 
+= [${issue}]" "$ISSUES_FILE" > temp.json && mv temp.json "$ISSUES_FILE" } -if [ -z "$SUBSCRIPTION_ID" ] || [ -z "$RESOURCE_GROUP" ] || [ -z "$ACR_NAME" ]; then - missing_vars=() - [ -z "$SUBSCRIPTION_ID" ] && missing_vars+=("AZURE_SUBSCRIPTION_ID") - [ -z "$RESOURCE_GROUP" ] && missing_vars+=("AZ_RESOURCE_GROUP") - [ -z "$ACR_NAME" ] && missing_vars+=("ACR_NAME") - echo "Missing required environment variables: ${missing_vars[*]}" - echo '{"error": "Required environment variables not set"}' - exit 1 -fi - -if ! az account show --subscription "$SUBSCRIPTION_ID" >/dev/null 2>&1; then - add_issue \ - "Azure authentication failed for subscription $SUBSCRIPTION_ID" \ - 4 \ - "Azure CLI should authenticate successfully" \ - "Azure authentication failed for subscription $SUBSCRIPTION_ID" \ - "SUBSCRIPTION_ID: $SUBSCRIPTION_ID" \ - "Check Azure credentials and login with 'az login' or set the correct subscription." - echo '{"error": "Azure authentication failed"}' - exit 1 -fi - -az account set --subscription "$SUBSCRIPTION_ID" - -acr_info=$(az acr show --name "$ACR_NAME" --resource-group "$RESOURCE_GROUP" -o json 2>az_acr_show_err.log) -if [ $? -ne 0 ] || [ -z "$acr_info" ]; then - # Check for permission error - if grep -q "AuthorizationFailed" az_acr_show_err.log; then - add_issue \ - "Insufficient permissions to access ACR '$ACR_NAME' (RG: '$RESOURCE_GROUP')" \ - 4 \ - "User/service principal should have 'AcrRegistryReader' or higher role on the registry" \ - "az acr show failed due to insufficient permissions" \ - "See az_acr_show_err.log for details" \ - "Assign 'AcrRegistryReader' or higher role to the user/service principal for the registry." 
- else - add_issue \ - "ACR '$ACR_NAME' (RG: '$RESOURCE_GROUP') is unreachable or not found (Subscription: $SUBSCRIPTION_ID)" \ - 4 \ - "ACR should be reachable and exist in the specified resource group and subscription" \ - "ACR '$ACR_NAME' is unreachable or not found" \ - "Tried: az acr show --name $ACR_NAME --resource-group $RESOURCE_GROUP --subscription $SUBSCRIPTION_ID" \ - "Check if the registry exists, is spelled correctly, and is accessible from your network." - fi - echo '{"status": "unreachable"}' - exit 0 +# DNS check +nslookup $REGISTRY_NAME.azurecr.io > /dev/null 2>&1 +if [ $? -ne 0 ]; then + add_issue "DNS Lookup failed" 4 "DNS should resolve" "Failed to resolve registry DNS" "Check network/DNS settings" fi -# Retrieve admin credentials -admin_creds=$(az acr credential show --name "$ACR_NAME" --resource-group "$RESOURCE_GROUP" -o json 2>az_acr_cred_err.log) -if [ $? -ne 0 ] || [ -z "$admin_creds" ]; then - if grep -q "AuthorizationFailed" az_acr_cred_err.log; then - add_issue \ - "Insufficient permissions to retrieve admin credentials for ACR '$ACR_NAME'" \ - 4 \ - "User/service principal should have 'AcrRegistryReader' or higher role on the registry" \ - "az acr credential show failed due to insufficient permissions" \ - "See az_acr_cred_err.log for details" \ - "Assign 'AcrRegistryReader' or higher role to the user/service principal for the registry." - else - add_issue \ - "Failed to retrieve admin credentials for ACR '$ACR_NAME'" \ - 4 \ - "Should be able to retrieve admin credentials if admin is enabled" \ - "az acr credential show failed" \ - "Tried: az acr credential show --name $ACR_NAME --resource-group $RESOURCE_GROUP" \ - "Check if admin user is enabled and you have sufficient permissions." 
- fi - echo '{"status": "no_admin_creds"}' - exit 0 +# TLS check +openssl s_client -connect $REGISTRY_NAME.azurecr.io:443 -servername $REGISTRY_NAME.azurecr.io < /dev/null > tls_log.txt 2>&1 +if grep -q "Verify return code: 0 (ok)" tls_log.txt; then + echo "TLS handshake success" +else + add_issue "TLS handshake failed" 4 "TLS handshake should succeed" "Failed handshake or cert issue" "Check firewall and trust chains" fi - -login_server=$(echo "$acr_info" | jq -r '.loginServer') -admin_username=$(echo "$admin_creds" | jq -r '.username') -admin_password=$(echo "$admin_creds" | jq -r '.passwords[0].value') - -# Attempt docker login -if ! echo "$admin_password" | docker login "$login_server" -u "$admin_username" --password-stdin >docker_login.log 2>&1; then - add_issue \ - "Docker login to ACR '$ACR_NAME' failed" \ - 4 \ - "Should be able to login to the registry using admin credentials" \ - "docker login failed" \ - "See docker_login.log for details" \ - "Check if admin user is enabled, credentials are correct, and Docker is running." - echo '{"status": "docker_login_failed"}' - exit 0 -fi - -# If everything succeeded -rm -f az_acr_show_err.log az_acr_cred_err.log docker_login.log - -echo '{"status": "reachable"}' -echo '[]' > "$ISSUES_FILE" \ No newline at end of file +rm -f tls_log.txt diff --git a/codebundles/azure-acr-health/runbook.robot b/codebundles/azure-acr-health/runbook.robot index ee4a2605c..945058ae9 100644 --- a/codebundles/azure-acr-health/runbook.robot +++ b/codebundles/azure-acr-health/runbook.robot @@ -1,8 +1,8 @@ *** Settings *** -Documentation Runs diagnostic checks against Azure Container Registry (ACR) to monitor reachability, SKU, pull/push success ratio, and storage utilization. +Documentation Runs diagnostic checks against Azure Container Registry (ACR), including DNS/TLS, authentication, SKU/usage, storage, pull/push, geo-replication, repository events, and retention health. 
Metadata Author Nbarola Metadata Display Name Azure ACR Health Check -Metadata Supports Azure Container Registry ACR Health +Metadata Supports Azure Container Registry ACR Health Push Pull Storage Library BuiltIn Library RW.Core @@ -15,17 +15,18 @@ Library Collections Suite Setup Suite Initialization + *** Tasks *** -Check ACR Reachability for Registry `${ACR_NAME}` - [Documentation] Checks if the ACR endpoint is reachable. - [Tags] access:read-only ACR Azure Reachability Health - ${reachability}= RW.CLI.Run Bash File - ... bash_file=acr_reachability.sh +Check DNS & TLS Reachability for Registry `${ACR_NAME}` + [Documentation] Verifies DNS resolution and HTTPS/TLS for ACR endpoint. + [Tags] access:read-only ACR Azure DNS TLS Connectivity Health + ${dns_tls}= RW.CLI.Run Bash File + ... bash_file=acr_dns_tls_reachability.sh ... env=${env} - ... timeout_seconds=120 + ... timeout_seconds=60 ... include_in_history=false ${issues_list}= RW.CLI.Run Cli - ... cmd=cat reachability_issues.json + ... cmd=cat dns_tls_issues.json ${issues}= Evaluate json.loads(r'''${issues_list.stdout}''') json IF len(@{issues}) > 0 FOR ${issue} IN @{issues} @@ -40,16 +41,16 @@ Check ACR Reachability for Registry `${ACR_NAME}` END END -Check ACR Usage SKU Metric for Registry `${ACR_NAME}` - [Documentation] Checks the SKU and usage limits for the ACR. - [Tags] access:read-only ACR Azure SKU Health - ${sku}= RW.CLI.Run Bash File - ... bash_file=acr_usage_sku.sh +Check ACR Login & Authentication for Registry `${ACR_NAME}` + [Documentation] Attempts az acr login and docker login using intended workload identity. + [Tags] access:read-only ACR Azure Login Auth Connectivity Health + ${login}= RW.CLI.Run Bash File + ... bash_file=acr_login_check.sh ... env=${env} - ... timeout_seconds=120 + ... timeout_seconds=90 ... include_in_history=false ${issues_list}= RW.CLI.Run Cli - ... cmd=cat usage_sku_issues.json + ... 
cmd=cat acr_login_issues.json ${issues}= Evaluate json.loads(r'''${issues_list.stdout}''') json IF len(@{issues}) > 0 FOR ${issue} IN @{issues} @@ -64,16 +65,16 @@ Check ACR Usage SKU Metric for Registry `${ACR_NAME}` END END -Check ACR Pull/Push Success Ratio for Registry `${ACR_NAME}` - [Documentation] Checks the success rate of image pull and push operations. - [Tags] access:read-only ACR Azure PullPush Health - ${ratio}= RW.CLI.Run Bash File - ... bash_file=acr_pull_push_ratio.sh +Check ACR Storage Usage for Registry `${ACR_NAME}` + [Documentation] Checks storage used vs quota using az acr show-usage. + [Tags] access:read-only ACR Azure Storage Health + ${storage}= RW.CLI.Run Bash File + ... bash_file=acr_storage_usage.sh ... env=${env} - ... timeout_seconds=180 + ... timeout_seconds=60 ... include_in_history=false ${issues_list}= RW.CLI.Run Cli - ... cmd=cat pull_push_ratio_issues.json + ... cmd=cat storage_usage_issues.json ${issues}= Evaluate json.loads(r'''${issues_list.stdout}''') json IF len(@{issues}) > 0 FOR ${issue} IN @{issues} @@ -88,16 +89,17 @@ Check ACR Pull/Push Success Ratio for Registry `${ACR_NAME}` END END -Check ACR Storage Utilization for Registry `${ACR_NAME}` - [Documentation] Checks the storage usage of the ACR. - [Tags] access:read-only ACR Azure Storage Health - ${storage}= RW.CLI.Run Bash File - ... bash_file=acr_storage_utilization.sh + +Check ACR Repository Event Failures for Registry `${ACR_NAME}` + [Documentation] Queries Log Analytics for recent failed pushes/pulls and repo errors. + [Tags] access:read-only ACR Azure Events Health + ${repo_events}= RW.CLI.Run Bash File + ... bash_file=acr_repository_events.sh ... env=${env} - ... timeout_seconds=120 + ... timeout_seconds=90 ... include_in_history=false ${issues_list}= RW.CLI.Run Cli - ... cmd=cat storage_utilization_issues.json + ... 
cmd=cat repository_events_issues.json ${issues}= Evaluate json.loads(r'''${issues_list.stdout}''') json IF len(@{issues}) > 0 FOR ${issue} IN @{issues} @@ -110,4 +112,44 @@ Check ACR Storage Utilization for Registry `${ACR_NAME}` ... details=${issue}["details"] ... next_steps=${issue}["next_steps"] END - END \ No newline at end of file + END + + +*** Keywords *** +Suite Initialization + ${AZ_RESOURCE_GROUP}= RW.Core.Import User Variable AZ_RESOURCE_GROUP + ... type=string + ... description=The resource group containing the ACR. + ... pattern=\w* + ${ACR_NAME}= RW.Core.Import User Variable ACR_NAME + ... type=string + ... description=Azure Container Registry Name. + ... pattern=^[a-zA-Z0-9]*$ + ${ACR_PASSWORD}= RW.Core.Import Secret acr_admin_password + ... type=string + ... description=Azure Container Registry password (admin or SP credential). + ... pattern=.* + ${AZURE_RESOURCE_SUBSCRIPTION_ID}= RW.Core.Import User Variable AZURE_SUBSCRIPTION_ID + ... type=string + ... description=The Azure Subscription ID. + ... pattern=\w* + ${AZURE_SUBSCRIPTION_NAME}= RW.Core.Import User Variable AZURE_SUBSCRIPTION_NAME + ... type=string + ... description=The Azure Subscription Name. + ... pattern=\w* + ${LOG_WORKSPACE_ID}= RW.Core.Import User Variable LOG_WORKSPACE_ID + ... type=string + ... description=Log Analytics Workspace ID for querying diagnostic events. + ... pattern=\w* + Set Suite Variable ${ACR_NAME} ${ACR_NAME} + Set Suite Variable ${ACR_PASSWORD} ${ACR_PASSWORD} + Set Suite Variable ${AZ_RESOURCE_GROUP} ${AZ_RESOURCE_GROUP} + Set Suite Variable ${AZURE_SUBSCRIPTION_ID} ${AZURE_RESOURCE_SUBSCRIPTION_ID} + Set Suite Variable ${AZURE_SUBSCRIPTION_NAME} ${AZURE_SUBSCRIPTION_NAME} + Set Suite Variable ${LOG_WORKSPACE_ID} ${LOG_WORKSPACE_ID} + Set Suite Variable + ... ${env} + ... 
{"ACR_NAME": "${ACR_NAME}", "${ACR_PASSWORD}": "${ACR_PASSWORD}", "AZ_RESOURCE_GROUP": "${AZ_RESOURCE_GROUP}", "AZURE_SUBSCRIPTION_ID": "${AZURE_RESOURCE_SUBSCRIPTION_ID}", "AZURE_SUBSCRIPTION_NAME": "${AZURE_SUBSCRIPTION_NAME}", "LOG_WORKSPACE_ID": "${LOG_WORKSPACE_ID}"} + RW.CLI.Run Cli + ... cmd=az account set --subscription ${AZURE_RESOURCE_SUBSCRIPTION_ID} + ... include_in_history=false From bf6009376117bf1394fe85e3ef20cae1fe5b8476 Mon Sep 17 00:00:00 2001 From: Nbarola Date: Thu, 7 Aug 2025 16:09:51 +0530 Subject: [PATCH 3/6] add acr_authentication and acr_events bash scriipts --- .../azure-acr-health/.test/Taskfile.yaml | 22 ++++-- codebundles/azure-acr-health/README.md | 49 ++++++++++++ .../azure-acr-health/acr_authentication.sh | 15 ++-- codebundles/azure-acr-health/acr_events.sh | 32 ++++++++ codebundles/azure-acr-health/runbook.robot | 75 ++++++++++--------- 5 files changed, 143 insertions(+), 50 deletions(-) create mode 100644 codebundles/azure-acr-health/README.md create mode 100644 codebundles/azure-acr-health/acr_events.sh diff --git a/codebundles/azure-acr-health/.test/Taskfile.yaml b/codebundles/azure-acr-health/.test/Taskfile.yaml index e68a1fa43..86e13be94 100644 --- a/codebundles/azure-acr-health/.test/Taskfile.yaml +++ b/codebundles/azure-acr-health/.test/Taskfile.yaml @@ -73,13 +73,22 @@ tasks: branch_name=$(git rev-parse --abbrev-ref HEAD) codebundle=$(basename "$(dirname "$PWD")") AZURE_SUBSCRIPTION_ID=$ARM_SUBSCRIPTION_ID - pushd terraform > /dev/null - resource_group=$(terraform show -json terraform.tfstate | jq -r ' - .values.root_module.resources[] | - select(.type == "azurerm_resource_group") | .values.name') - popd > /dev/null + + # Check if AZ_RESOURCE_GROUP is set, otherwise get from Terraform state + if [ -z "${AZ_RESOURCE_GROUP}" ]; then + # Fetch individual cluster details from Terraform state + pushd terraform > /dev/null + resource_group=$(terraform show -json terraform.tfstate | jq -r ' + .values.root_module.resources[] | + 
select(.type == "azurerm_resource_group") | .values.name') + popd > /dev/null + else + resource_group="${AZ_RESOURCE_GROUP}" + fi + + # Check if resource group is still empty after all checks if [ -z "$resource_group" ]; then - echo "Error: Missing resource_group details. Ensure Terraform plan has been applied." + echo "Error: Missing resource_group details. Either set AZ_RESOURCE_GROUP environment variable or ensure Terraform plan has been applied." exit 1 fi source terraform/tf.secret @@ -355,4 +364,3 @@ tasks: sudo rm -rf output rm workspaceInfo.yaml silent: true - diff --git a/codebundles/azure-acr-health/README.md b/codebundles/azure-acr-health/README.md new file mode 100644 index 000000000..3bcbb70d0 --- /dev/null +++ b/codebundles/azure-acr-health/README.md @@ -0,0 +1,49 @@ +# Azure Container Registry (ACR) Health Bundle + +This bundle provides comprehensive health checks for Azure Container Registries (ACR), including reachability, usage SKU metrics, pull/push success ratio, and storage utilization. It uses Robot Framework tasks and Bash scripts to collect, parse, and score ACR health. + +## Included Health Checks + +- **Registry Reachability**: Verifies that the ACR endpoint is reachable and responsive. +- **Usage SKU Metric**: Checks the current SKU and usage limits for the registry. +- **Pull/Push Success Ratio**: Analyzes the success rate of image pull and push operations. +- **Storage Utilization**: Checks the storage usage against quota/thresholds. + +## Main Tasks + +- `Check ACR Reachability` +- `Check ACR Usage SKU Metric` +- `Check ACR Pull/Push Success Ratio` +- `Check ACR Storage Utilization` +- `Score ACR Health Metrics` +- `Generate Comprehensive ACR Health Score` + +## How It Works + +1. **Bash scripts** (e.g., `acr_reachability.sh`, `acr_usage_sku.sh`, etc.) collect raw data from Azure Container Registry. +2. **Robot Framework tasks** run these scripts, parse the output, and (for SLI) calculate a health score. +3. 
**Next steps scripts** (e.g., `next_steps_reachability.sh`) analyze the parsed output and generate JSON issues or recommendations. +4. **SLI tasks** aggregate the results and push a health score metric. + +## Usage + +- Configure your environment variables (registry name, resource group, subscription, thresholds, etc.). +- Run the desired Robot Framework task (e.g., from `runbook.robot` or `sli.robot`). +- Review the output and health scores. + +### Example + +To check a specific registry: + +``` +export ACR_NAME="myregistry" +robot runbook.robot +``` + +## Directory Structure + +- `runbook.robot` - Main runbook for health checks and issue creation. +- `sli.robot` - SLI/score-only version for health scoring. +- `acr_reachability.sh`, `acr_usage_sku.sh`, `acr_pull_push_ratio.sh`, `acr_storage_utilization.sh` - Data collection scripts. +- `next_steps_reachability.sh`, `next_steps_usage_sku.sh`, `next_steps_pull_push_ratio.sh`, `next_steps_storage_utilization.sh` - Next steps/issue analysis scripts. +- `.test/` - Example and test cases. \ No newline at end of file diff --git a/codebundles/azure-acr-health/acr_authentication.sh b/codebundles/azure-acr-health/acr_authentication.sh index 1d0829bf8..f5f31f85b 100644 --- a/codebundles/azure-acr-health/acr_authentication.sh +++ b/codebundles/azure-acr-health/acr_authentication.sh @@ -1,12 +1,11 @@ #!/bin/bash -# Check Azure Container Registry reachability and next steps - +set -x SUBSCRIPTION_ID="${AZURE_SUBSCRIPTION_ID:-}" RESOURCE_GROUP="${AZ_RESOURCE_GROUP:-}" ACR_NAME="${ACR_NAME:-}" ACR_PASSWORD="${ACR_PASSWORD:-}" -ISSUES_FILE="reachability_issues.json" +ISSUES_FILE="login_issues.json" echo '[]' > "$ISSUES_FILE" add_issue() { @@ -52,7 +51,7 @@ if [ $? 
-ne 0 ] || [ -z "$acr_info" ]; then if grep -q "AuthorizationFailed" az_acr_show_err.log; then add_issue \ "Insufficient permissions to access ACR '$ACR_NAME' (RG: '$RESOURCE_GROUP')" \ - 4 \ + 3 \ "User/service principal should have 'AcrRegistryReader' or higher role on the registry" \ "az acr show failed due to insufficient permissions" \ "See az_acr_show_err.log for details" \ @@ -60,7 +59,7 @@ if [ $? -ne 0 ] || [ -z "$acr_info" ]; then else add_issue \ "ACR '$ACR_NAME' (RG: '$RESOURCE_GROUP') is unreachable or not found (Subscription: $SUBSCRIPTION_ID)" \ - 4 \ + 3 \ "ACR should be reachable and exist in the specified resource group and subscription" \ "ACR '$ACR_NAME' is unreachable or not found" \ "Tried: az acr show --name $ACR_NAME --resource-group $RESOURCE_GROUP --subscription $SUBSCRIPTION_ID" \ @@ -76,7 +75,7 @@ if [ $? -ne 0 ] || [ -z "$admin_creds" ]; then if grep -q "AuthorizationFailed" az_acr_cred_err.log; then add_issue \ "Insufficient permissions to retrieve admin credentials for ACR '$ACR_NAME'" \ - 4 \ + 3 \ "User/service principal should have 'AcrRegistryReader' or higher role on the registry" \ "az acr credential show failed due to insufficient permissions" \ "See az_acr_cred_err.log for details" \ @@ -84,7 +83,7 @@ if [ $? -ne 0 ] || [ -z "$admin_creds" ]; then else add_issue \ "Failed to retrieve admin credentials for ACR '$ACR_NAME'" \ - 4 \ + 3 \ "Should be able to retrieve admin credentials if admin is enabled" \ "az acr credential show failed" \ "Tried: az acr credential show --name $ACR_NAME --resource-group $RESOURCE_GROUP" \ @@ -102,7 +101,7 @@ admin_username=$(echo "$admin_creds" | jq -r '.username') if ! 
echo "$ACR_PASSWORD" | docker login "$login_server" -u "$admin_username" --password-stdin >docker_login.log 2>&1; then add_issue \ "Docker login to ACR '$ACR_NAME' failed" \ - 4 \ + 2 \ "Should be able to login to the registry using admin credentials" \ "docker login failed" \ "See docker_login.log for details" \ diff --git a/codebundles/azure-acr-health/acr_events.sh b/codebundles/azure-acr-health/acr_events.sh new file mode 100644 index 000000000..13e65831d --- /dev/null +++ b/codebundles/azure-acr-health/acr_events.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +RESOURCE_GROUP=${AZ_RESOURCE_GROUP:-} +ACR_NAME=${ACR_NAME:-} +LOG_WORKSPACE_ID=${LOG_WORKSPACE_ID:-} + +ISSUES_FILE="repository_events_issues.json" +echo '[]' > "$ISSUES_FILE" + +add_issue() { + local title="$1" + local severity="$2" + local expected="$3" + local actual="$4" + local details="$5" + local next_steps="$6" + details=$(echo "$details" | sed 's/"/\\"/g' | sed ':a;N;$!ba;s/\n/\\n/g') + next_steps=$(echo "$next_steps" | sed 's/"/\\"/g' | sed ':a;N;$!ba;s/\n/\\n/g') + local issue="{\"title\":\"$title\",\"severity\":$severity,\"expected\":\"$expected\",\"actual\":\"$actual\",\"details\":\"$details\",\"next_steps\":\"$next_steps\"}" + jq ". += [${issue}]" "$ISSUES_FILE" > temp.json && mv temp.json "$ISSUES_FILE" +} + +if [ -z "$LOG_WORKSPACE_ID" ]; then + add_issue "Log workspace ID missing" 4 "Log Analytics workspace ID should be set" "LOG_WORKSPACE_ID is not set" "No query run" "Provide LOG_WORKSPACE_ID to query repository events" + exit 1 +fi + +query_result=$(az monitor log-analytics query --workspace "$LOG_WORKSPACE_ID" --query "ContainerRegistryRepositoryEvents | where ResultType != 0 | summarize count() by ResultType, bin(TimeGenerated, 5m) | top 5 by count_") + +if [ $? 
-ne 0 ]; then + add_issue "Failed to query repository events" 4 "Should be able to query repository events" "Command failed" "See CLI errors" "Check permissions and workspace ID" +fi diff --git a/codebundles/azure-acr-health/runbook.robot b/codebundles/azure-acr-health/runbook.robot index 945058ae9..78e8b87ae 100644 --- a/codebundles/azure-acr-health/runbook.robot +++ b/codebundles/azure-acr-health/runbook.robot @@ -7,7 +7,6 @@ Metadata Supports Azure Container Registry ACR Health Library BuiltIn Library RW.Core Library RW.CLI -Library Azure Library RW.platform Library String Library OperatingSystem @@ -21,7 +20,7 @@ Check DNS & TLS Reachability for Registry `${ACR_NAME}` [Documentation] Verifies DNS resolution and HTTPS/TLS for ACR endpoint. [Tags] access:read-only ACR Azure DNS TLS Connectivity Health ${dns_tls}= RW.CLI.Run Bash File - ... bash_file=acr_dns_tls_reachability.sh + ... bash_file=acr_reachability.sh ... env=${env} ... timeout_seconds=60 ... include_in_history=false @@ -31,13 +30,13 @@ Check DNS & TLS Reachability for Registry `${ACR_NAME}` IF len(@{issues}) > 0 FOR ${issue} IN @{issues} RW.Core.Add Issue - ... severity=${issue}["severity"] - ... title=${issue}["title"] - ... expected=${issue}["expected"] - ... actual=${issue}["actual"] - ... reproduce_hint=${issue}.get("reproduce_hint", "") - ... details=${issue}["details"] - ... next_steps=${issue}["next_steps"] + ... severity=${issue["severity"]} + ... title=${issue["title"]} + ... expected=${issue["expected"]} + ... actual=${issue["actual"]} + ... reproduce_hint=${issue.get("reproduce_hint", "")} + ... details=${issue["details"]} + ... next_steps=${issue["next_steps"]} END END @@ -45,23 +44,24 @@ Check ACR Login & Authentication for Registry `${ACR_NAME}` [Documentation] Attempts az acr login and docker login using intended workload identity. [Tags] access:read-only ACR Azure Login Auth Connectivity Health ${login}= RW.CLI.Run Bash File - ... bash_file=acr_login_check.sh + ... 
bash_file=acr_authentication.sh ... env=${env} + ... secret__ACR_PASSWORD=${ACR_PASSWORD} ... timeout_seconds=90 ... include_in_history=false ${issues_list}= RW.CLI.Run Cli - ... cmd=cat acr_login_issues.json + ... cmd=cat login_issues.json ${issues}= Evaluate json.loads(r'''${issues_list.stdout}''') json IF len(@{issues}) > 0 FOR ${issue} IN @{issues} RW.Core.Add Issue - ... severity=${issue}["severity"] - ... title=${issue}["title"] - ... expected=${issue}["expected"] - ... actual=${issue}["actual"] - ... reproduce_hint=${issue}.get("reproduce_hint", "") - ... details=${issue}["details"] - ... next_steps=${issue}["next_steps"] + ... severity=${issue["severity"]} + ... title=${issue["title"]} + ... expected=${issue["expected"]} + ... actual=${issue["actual"]} + ... reproduce_hint=${issue.get("reproduce_hint", "")} + ... details=${issue["details"]} + ... next_steps=${issue["next_steps"]} END END @@ -79,13 +79,13 @@ Check ACR Storage Usage for Registry `${ACR_NAME}` IF len(@{issues}) > 0 FOR ${issue} IN @{issues} RW.Core.Add Issue - ... severity=${issue}["severity"] - ... title=${issue}["title"] - ... expected=${issue}["expected"] - ... actual=${issue}["actual"] - ... reproduce_hint=${issue}.get("reproduce_hint", "") - ... details=${issue}["details"] - ... next_steps=${issue}["next_steps"] + ... severity=${issue["severity"]} + ... title=${issue["title"]} + ... expected=${issue["expected"]} + ... actual=${issue["actual"]} + ... reproduce_hint=${issue.get("reproduce_hint", "")} + ... details=${issue["details"]} + ... next_steps=${issue["next_steps"]} END END @@ -94,7 +94,7 @@ Check ACR Repository Event Failures for Registry `${ACR_NAME}` [Documentation] Queries Log Analytics for recent failed pushes/pulls and repo errors. [Tags] access:read-only ACR Azure Events Health ${repo_events}= RW.CLI.Run Bash File - ... bash_file=acr_repository_events.sh + ... bash_file=acr_events.sh ... env=${env} ... timeout_seconds=90 ... 
include_in_history=false @@ -104,13 +104,13 @@ Check ACR Repository Event Failures for Registry `${ACR_NAME}` IF len(@{issues}) > 0 FOR ${issue} IN @{issues} RW.Core.Add Issue - ... severity=${issue}["severity"] - ... title=${issue}["title"] - ... expected=${issue}["expected"] - ... actual=${issue}["actual"] - ... reproduce_hint=${issue}.get("reproduce_hint", "") - ... details=${issue}["details"] - ... next_steps=${issue}["next_steps"] + ... severity=${issue["severity"]} + ... title=${issue["title"]} + ... expected=${issue["expected"]} + ... actual=${issue["actual"]} + ... reproduce_hint=${issue.get("reproduce_hint", "")} + ... details=${issue["details"]} + ... next_steps=${issue["next_steps"]} END END @@ -125,7 +125,7 @@ Suite Initialization ... type=string ... description=Azure Container Registry Name. ... pattern=^[a-zA-Z0-9]*$ - ${ACR_PASSWORD}= RW.Core.Import Secret acr_admin_password + ${ACR_PASSWORD}= RW.Core.Import Secret ACR_PASSWORD ... type=string ... description=Azure Container Registry password (admin or SP credential). ... pattern=.* @@ -141,6 +141,11 @@ Suite Initialization ... type=string ... description=Log Analytics Workspace ID for querying diagnostic events. ... pattern=\w* + ${USAGE_THRESHOLD}= RW.Core.Import User Variable USAGE_THRESHOLD + ... type=string + ... description=Threshold for acr usage + ... pattern=\d* + ... default=80 Set Suite Variable ${ACR_NAME} ${ACR_NAME} Set Suite Variable ${ACR_PASSWORD} ${ACR_PASSWORD} Set Suite Variable ${AZ_RESOURCE_GROUP} ${AZ_RESOURCE_GROUP} @@ -149,7 +154,7 @@ Suite Initialization Set Suite Variable ${LOG_WORKSPACE_ID} ${LOG_WORKSPACE_ID} Set Suite Variable ... ${env} - ... {"ACR_NAME": "${ACR_NAME}", "${ACR_PASSWORD}": "${ACR_PASSWORD}", "AZ_RESOURCE_GROUP": "${AZ_RESOURCE_GROUP}", "AZURE_SUBSCRIPTION_ID": "${AZURE_RESOURCE_SUBSCRIPTION_ID}", "AZURE_SUBSCRIPTION_NAME": "${AZURE_SUBSCRIPTION_NAME}", "LOG_WORKSPACE_ID": "${LOG_WORKSPACE_ID}"} + ... 
{"ACR_NAME": "${ACR_NAME}", "AZ_RESOURCE_GROUP": "${AZ_RESOURCE_GROUP}", "AZURE_SUBSCRIPTION_ID": "${AZURE_RESOURCE_SUBSCRIPTION_ID}", "AZURE_SUBSCRIPTION_NAME": "${AZURE_SUBSCRIPTION_NAME}", "LOG_WORKSPACE_ID": "${LOG_WORKSPACE_ID}", "USAGE_THRESHOLD": "${USAGE_THRESHOLD}"} RW.CLI.Run Cli ... cmd=az account set --subscription ${AZURE_RESOURCE_SUBSCRIPTION_ID} ... include_in_history=false From 4783c90a7c37ef4f8fa700a31e7e36ae7852c0b3 Mon Sep 17 00:00:00 2001 From: Nbarola Date: Thu, 7 Aug 2025 16:10:22 +0530 Subject: [PATCH 4/6] add acr generation rule --- .../generation-rules/azure-acr-health.yaml | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 codebundles/azure-acr-health/.runwhen/generation-rules/azure-acr-health.yaml diff --git a/codebundles/azure-acr-health/.runwhen/generation-rules/azure-acr-health.yaml b/codebundles/azure-acr-health/.runwhen/generation-rules/azure-acr-health.yaml new file mode 100644 index 000000000..55bd8a600 --- /dev/null +++ b/codebundles/azure-acr-health/.runwhen/generation-rules/azure-acr-health.yaml @@ -0,0 +1,23 @@ +apiVersion: runwhen.com/v1 +kind: GenerationRules +spec: + platform: azure + generationRules: + - resourceTypes: + - azure_containerregistry_registries + matchRules: + - type: pattern + pattern: ".+" + properties: [name] + mode: substring + slxs: + - baseName: azure-acr-health + qualifiers: ["resource"] + baseTemplateName: azure-acr-health + levelOfDetail: basic + outputItems: + - type: slx + - type: sli + - type: runbook + templateName: azure-acr-health-taskset.yaml + - type: workflow \ No newline at end of file From c8a8e34407eafd8342dcba3bc5a9599dafc466fb Mon Sep 17 00:00:00 2001 From: Nbarola Date: Thu, 7 Aug 2025 16:10:42 +0530 Subject: [PATCH 5/6] add acr template yaml files --- .../templates/azure-acr-health-sli.yaml | 53 +++++++++++++++++++ .../templates/azure-acr-health-slx.yaml | 27 ++++++++++ .../templates/azure-acr-health-taskset.yaml | 35 ++++++++++++ 3 files changed, 115 
insertions(+)
 create mode 100644 codebundles/azure-acr-health/.runwhen/templates/azure-acr-health-sli.yaml
 create mode 100644 codebundles/azure-acr-health/.runwhen/templates/azure-acr-health-slx.yaml
 create mode 100644 codebundles/azure-acr-health/.runwhen/templates/azure-acr-health-taskset.yaml

diff --git a/codebundles/azure-acr-health/.runwhen/templates/azure-acr-health-sli.yaml b/codebundles/azure-acr-health/.runwhen/templates/azure-acr-health-sli.yaml
new file mode 100644
index 000000000..8954c4d95
--- /dev/null
+++ b/codebundles/azure-acr-health/.runwhen/templates/azure-acr-health-sli.yaml
@@ -0,0 +1,60 @@
+# ServiceLevelIndicator template for the azure-acr-health codebundle.
+# Jinja2 placeholders are substituted when the template is rendered.
+apiVersion: runwhen.com/v1
+kind: ServiceLevelIndicator
+metadata:
+  name: {{slx_name}}
+  labels:
+    {% include "common-labels.yaml" %}
+  annotations:
+    {% include "common-annotations.yaml" %}
+spec:
+  displayUnitsLong: OK
+  displayUnitsShort: ok
+  locations:
+    - {{default_location}}
+  description: Monitors the health of Azure Container Registry (ACR) by checking reachability, SKU metrics, pull/push success ratios, and storage utilization.
+  codeBundle:
+    # Use the default repository and the main branch unless repo_url / ref are set.
+    {% if repo_url %}
+    repoUrl: {{repo_url}}
+    {% else %}
+    repoUrl: https://github.com/runwhen-contrib/azure-c7n-codecollection.git
+    {% endif %}
+    {% if ref %}
+    ref: {{ref}}
+    {% else %}
+    ref: main
+    {% endif %}
+    pathToRobot: codebundles/azure-acr-health/sli.robot
+  intervalStrategy: intermezzo
+  intervalSeconds: 600
+  configProvided:
+    - name: AZURE_RESOURCE_GROUP
+      value: "{{resource_group.name}}"
+    - name: AZURE_SUBSCRIPTION_ID
+      value: "{{ subscription_id }}"
+  secretsProvided:
+  # With wb_version set, the azure-auth.yaml include is rendered here (skipped
+  # when that file is missing); otherwise a placeholder entry is emitted.
+  {% if wb_version %}
+    {% include "azure-auth.yaml" ignore missing %}
+  {% else %}
+    - name: azure_credentials
+      workspaceKey: AUTH DETAILS NOT FOUND
+  {% endif %}
+  alerts:
+    # warning and ticket compare the SLI value with "< 1" over 20m and 40m;
+    # page compares it with "== 0".
+    warning:
+      operator: '<'
+      threshold: '1'
+      for: '20m'
+    ticket:
+      operator: '<'
+      threshold: '1'
+      for: '40m'
+    page:
+      operator: '=='
+      threshold: '0'
+      for: ''
diff --git a/codebundles/azure-acr-health/.runwhen/templates/azure-acr-health-slx.yaml b/codebundles/azure-acr-health/.runwhen/templates/azure-acr-health-slx.yaml
new file mode 100644
index 000000000..d57bc8d4f
--- /dev/null
+++ b/codebundles/azure-acr-health/.runwhen/templates/azure-acr-health-slx.yaml
@@ -0,0 +1,29 @@
+# ServiceLevelX template for the azure-acr-health codebundle.
+# Jinja2 placeholders are substituted when the template is rendered.
+apiVersion: runwhen.com/v1
+kind: ServiceLevelX
+metadata:
+  name: {{slx_name}}
+  labels:
+    {% include "common-labels.yaml" %}
+  annotations:
+    {% include "common-annotations.yaml" %}
+spec:
+  imageURL: https://placeholder.svg
+  alias: {{match_resource.resource.name}} Azure ACR Health
+  asMeasuredBy: Composite health score of resources & activities.
+  configProvided:
+    - name: SLX_PLACEHOLDER
+      value: SLX_PLACEHOLDER
+  owners:
+    - {{workspace.owner_email}}
+  statement: Monitors the health of Azure Container Registry (ACR) by checking reachability, SKU metrics, pull/push success ratios, and storage utilization.
+  additionalContext:
+    {% include "azure-hierarchy.yaml" ignore missing %}
+    qualified_name: "{{ match_resource.qualified_name }}"
+  tags:
+    {% include "azure-tags.yaml" ignore missing %}
+    - name: service
+      value: acr
+    - name: access
+      value: read-only
diff --git a/codebundles/azure-acr-health/.runwhen/templates/azure-acr-health-taskset.yaml b/codebundles/azure-acr-health/.runwhen/templates/azure-acr-health-taskset.yaml
new file mode 100644
index 000000000..e72293123
--- /dev/null
+++ b/codebundles/azure-acr-health/.runwhen/templates/azure-acr-health-taskset.yaml
@@ -0,0 +1,40 @@
+# Runbook template for the azure-acr-health codebundle.
+# Jinja2 placeholders are substituted when the template is rendered.
+apiVersion: runwhen.com/v1
+kind: Runbook
+metadata:
+  name: {{slx_name}}
+  labels:
+    {% include "common-labels.yaml" %}
+  annotations:
+    {% include "common-annotations.yaml" %}
+spec:
+  location: {{default_location}}
+  description: Monitors the health of Azure Container Registry (ACR) by checking reachability, SKU metrics, pull/push success ratios, and storage utilization.
+  codeBundle:
+    # Use the default repository and the main branch unless repo_url / ref are set.
+    {% if repo_url %}
+    repoUrl: {{repo_url}}
+    {% else %}
+    repoUrl: https://github.com/runwhen-contrib/azure-c7n-codecollection.git
+    {% endif %}
+    {% if ref %}
+    ref: {{ref}}
+    {% else %}
+    ref: main
+    {% endif %}
+    pathToRobot: codebundles/azure-acr-health/runbook.robot
+  configProvided:
+    - name: AZURE_RESOURCE_GROUP
+      value: "{{resource_group.name}}"
+    - name: AZURE_SUBSCRIPTION_ID
+      value: "{{ subscription_id }}"
+  secretsProvided:
+  # With wb_version set, the azure-auth.yaml include is rendered here (skipped
+  # when that file is missing); otherwise a placeholder entry is emitted.
+  {% if wb_version %}
+    {% include "azure-auth.yaml" ignore missing %}
+  {% else %}
+    - name: azure_credentials
+      workspaceKey: AUTH DETAILS NOT FOUND
+  {% endif %}
From 0fffbf52cd589f29da04501bb253ff4c3e190b6a Mon Sep 17 00:00:00 2001
From: Nbarola
Date: Thu, 7 Aug 2025 16:29:17 +0530
Subject: [PATCH 6/6] add acr storage usage bash script

---
 .../azure-acr-health/acr_storage_usage.sh     | 35 +++++++++++++++++++
 1 file changed, 35 insertions(+)
 create mode 100644 codebundles/azure-acr-health/acr_storage_usage.sh

diff
--git a/codebundles/azure-acr-health/acr_storage_usage.sh b/codebundles/azure-acr-health/acr_storage_usage.sh
new file mode 100644
index 000000000..81fdf9532
--- /dev/null
+++ b/codebundles/azure-acr-health/acr_storage_usage.sh
@@ -0,0 +1,112 @@
+#!/bin/bash
+#
+# Checks storage consumption of an Azure Container Registry (ACR).
+#
+# Runs `az acr show-usage` for the registry and records findings as a JSON
+# array in storage_usage_issues.json, which is reset to [] on every run. An
+# issue is added when the usage data cannot be fetched or parsed, or when
+# storage usage exceeds USAGE_THRESHOLD percent of the quota.
+#
+# Environment variables:
+#   ACR_NAME               Name of the registry to inspect (required).
+#   AZURE_SUBSCRIPTION_ID  Subscription passed to az when non-empty.
+#   AZ_RESOURCE_GROUP      Resource group, reported in diagnostics only;
+#                          AZURE_RESOURCE_GROUP is accepted as a fallback.
+#   USAGE_THRESHOLD        Alert threshold in percent (default: 80).
+
+SUBSCRIPTION_ID=${AZURE_SUBSCRIPTION_ID:-}
+RESOURCE_GROUP=${AZ_RESOURCE_GROUP:-${AZURE_RESOURCE_GROUP:-}}
+ACR_NAME=${ACR_NAME:-}
+USAGE_THRESHOLD=${USAGE_THRESHOLD:-80}
+
+ISSUES_FILE="storage_usage_issues.json"
+echo '[]' > "$ISSUES_FILE"
+
+# Non-negative number in plain, decimal or exponent notation.
+number_re='^[0-9]+([.][0-9]+)?([eE][+-]?[0-9]+)?$'
+
+# Appends one issue object to $ISSUES_FILE.
+# Arguments: title, severity (number), expected, actual, details, next_steps.
+# Values reach jq as arguments so that jq performs the JSON escaping of
+# quotes, backslashes and newlines.
+add_issue() {
+    local title="$1"
+    local severity="$2"
+    local expected="$3"
+    local actual="$4"
+    local details="$5"
+    local next_steps="$6"
+    jq --arg title "$title" \
+        --argjson severity "$severity" \
+        --arg expected "$expected" \
+        --arg actual "$actual" \
+        --arg details "$details" \
+        --arg next_steps "$next_steps" \
+        '. += [{title: $title, severity: $severity, expected: $expected,
+                actual: $actual, details: $details, next_steps: $next_steps}]' \
+        "$ISSUES_FILE" > temp.json && mv temp.json "$ISSUES_FILE"
+}
+
+# A non-numeric threshold would make the comparison further down unreliable.
+if ! [[ "$USAGE_THRESHOLD" =~ $number_re ]]; then
+    echo "Invalid USAGE_THRESHOLD '$USAGE_THRESHOLD', using 80" >&2
+    USAGE_THRESHOLD=80
+fi
+
+if [ -z "$ACR_NAME" ]; then
+    add_issue "ACR name not provided" 4 \
+        "ACR_NAME should be set to the registry name" \
+        "ACR_NAME is empty" \
+        "The storage usage check needs a registry name to query" \
+        "Set the ACR_NAME environment variable and re-run"
+    exit 0
+fi
+
+# Pass --subscription only when one was supplied instead of handing az an
+# empty value.
+az_args=(--name "$ACR_NAME" -o json)
+if [ -n "$SUBSCRIPTION_ID" ]; then
+    az_args+=(--subscription "$SUBSCRIPTION_ID")
+fi
+
+if ! usage=$(az acr show-usage "${az_args[@]}" 2>usage_err.log); then
+    add_issue "Failed to get storage usage" 4 \
+        "Registry storage usage info should be retrievable" \
+        "Command failed" \
+        "az acr show-usage failed for '$ACR_NAME' (resource group: '${RESOURCE_GROUP:-unknown}'): $(cat usage_err.log)" \
+        "Check if registry and subscription exist and you have access"
+    exit 0
+fi
+
+# Pick the storage entry. Two response shapes are accepted: a string `name`
+# ("Size") with `limit`, or an object `name.value` ("StorageUsed") with
+# `limitValue`.
+storage=$(jq -c '
+    [.value[]?
+     | select((.name | if type == "object" then .value else . end)
+              | . == "Size" or . == "StorageUsed")
+    ][0] // empty' <<< "$usage" 2>/dev/null)
+used=$(jq -r '.currentValue // empty' <<< "$storage" 2>/dev/null)
+quota=$(jq -r '.limit // .limitValue // empty' <<< "$storage" 2>/dev/null)
+unit=$(jq -r '.unit // "Bytes"' <<< "$storage" 2>/dev/null)
+
+# Guard against missing figures and a zero quota before dividing.
+if ! [[ "$used" =~ $number_re && "$quota" =~ $number_re ]] ||
+    awk -v q="$quota" 'BEGIN { exit !(q + 0 == 0) }'; then
+    add_issue "Unable to evaluate storage usage" 4 \
+        "Storage usage and quota should be reported for the registry" \
+        "Storage figures missing or invalid (used='$used', quota='$quota')" \
+        "$usage" \
+        "Check the az CLI version and the output of 'az acr show-usage --name $ACR_NAME'"
+    exit 0
+fi
+
+# Multiply before dividing so the percentage keeps its decimals.
+percent=$(awk -v u="$used" -v q="$quota" 'BEGIN { printf "%.2f", u * 100 / q }')
+if awk -v p="$percent" -v t="$USAGE_THRESHOLD" 'BEGIN { exit !(p > t) }'; then
+    add_issue "High storage usage" 3 \
+        "Usage below ${USAGE_THRESHOLD}%" \
+        "Usage at ${percent}%" \
+        "Registry '$ACR_NAME' uses $used of $quota $unit (${percent}%)" \
+        "Consider cleaning images or increase quota"
+fi