Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -57,3 +57,7 @@ build
**bashfile-**
**db.sqlite3**
**generated_id_rsa
**vm_uptime_stdout**
**vm_patch_stdout**
**vm_mem_stdout**
**vm_disk_stdout**
1 change: 0 additions & 1 deletion VERSION

This file was deleted.

Empty file modified codebundles/azure-vm-os-health/.cursorrules
100644 → 100755
Empty file.
Empty file modified codebundles/azure-vm-os-health/.cursorrules-azure
100644 → 100755
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
249 changes: 217 additions & 32 deletions codebundles/azure-vm-os-health/.test/terraform/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -97,36 +97,6 @@ resource "azurerm_linux_virtual_machine" "test_vm" {
tags = var.tags
}

# # Create a data disk
# resource "azurerm_managed_disk" "test_data_disk" {
# name = "test-data-disk"
# location = azurerm_resource_group.test_rg.location
# resource_group_name = azurerm_resource_group.test_rg.name
# storage_account_type = "Standard_LRS"
# create_option = "Empty"
# disk_size_gb = 50
# tags = var.tags
# }

# # Attach the data disk to the VM
# resource "azurerm_virtual_machine_data_disk_attachment" "test_disk_attachment" {
# managed_disk_id = azurerm_managed_disk.test_data_disk.id
# virtual_machine_id = azurerm_linux_virtual_machine.test_vm.id
# lun = 0
# caching = "ReadWrite"
# }

# SSH key pair for the test VMs: a 4096-bit RSA key generated by Terraform.
resource "tls_private_key" "vm_key" {
  rsa_bits  = 4096
  algorithm = "RSA"
}

# Write the generated private key (PEM) next to this module with
# owner-only read/write permissions.
resource "local_file" "private_key" {
  filename        = "${path.module}/generated_id_rsa"
  file_permission = "0600"
  content         = tls_private_key.vm_key.private_key_pem
}

# Create a second VM with high disk usage for testing
resource "azurerm_linux_virtual_machine" "high_usage_vm" {
name = "high-usage-vm"
Expand Down Expand Up @@ -180,15 +150,230 @@ resource "azurerm_network_interface" "high_usage_nic" {
}
}

# Generate the Windows administrator password at apply time so that no
# credential is committed to version control. The per-class minimums make the
# result satisfy Azure's complexity rules for Windows VM passwords.
resource "random_password" "windows_admin" {
  length           = 24
  min_lower        = 1
  min_upper        = 1
  min_numeric      = 1
  min_special      = 1
  override_special = "!@#%*-_"
}

# Create a Windows VM to test OS filtering
resource "azurerm_windows_virtual_machine" "windows_test_vm" {
  name                = "windows-test-vm"
  resource_group_name = azurerm_resource_group.test_rg.name
  location            = azurerm_resource_group.test_rg.location
  size                = "Standard_B1s"
  admin_username      = "adminuser"
  # Was a literal password in source; now generated and kept only in state.
  admin_password = random_password.windows_admin.result
  network_interface_ids = [
    azurerm_network_interface.windows_nic.id,
  ]

  os_disk {
    caching              = "ReadWrite"
    storage_account_type = "Standard_LRS"
    # disk_size_gb is intentionally omitted so the disk takes the image's own
    # size. azurerm requires an explicit size to be at least the image size,
    # and the Windows Server 2019 Datacenter image is larger than the 30 GB
    # that was previously requested here.
  }

  source_image_reference {
    publisher = "MicrosoftWindowsServer"
    offer     = "WindowsServer"
    sku       = "2019-Datacenter"
    version   = "latest"
  }

  tags = var.tags
}

# NIC for the Windows test VM, attached to the shared test subnet with a
# dynamically allocated private IP.
resource "azurerm_network_interface" "windows_nic" {
  name                = "windows-nic"
  resource_group_name = azurerm_resource_group.test_rg.name
  location            = azurerm_resource_group.test_rg.location

  ip_configuration {
    name                          = "internal"
    private_ip_address_allocation = "Dynamic"
    subnet_id                     = azurerm_subnet.test_subnet.id
  }
}

# Linux VM intended to exercise the handling of non-running VMs.
# NOTE(review): nothing in this resource stops or deallocates the VM, so it is
# presumably running right after apply — confirm a separate step powers it off.
resource "azurerm_linux_virtual_machine" "stopped_vm" {
  name                  = "stopped-vm"
  location              = azurerm_resource_group.test_rg.location
  resource_group_name   = azurerm_resource_group.test_rg.name
  size                  = "Standard_B1s"
  network_interface_ids = [azurerm_network_interface.stopped_nic.id]
  admin_username        = "adminuser"
  tags                  = var.tags

  # Ubuntu 18.04 LTS marketplace image.
  source_image_reference {
    publisher = "Canonical"
    offer     = "UbuntuServer"
    sku       = "18.04-LTS"
    version   = "latest"
  }

  os_disk {
    storage_account_type = "Standard_LRS"
    caching              = "ReadWrite"
    disk_size_gb         = 30
  }

  # Key-based login with the Terraform-generated key pair.
  admin_ssh_key {
    username   = "adminuser"
    public_key = tls_private_key.vm_key.public_key_openssh
  }
}

# NIC for the "stopped-vm" instance, attached to the shared test subnet with a
# dynamically allocated private IP.
resource "azurerm_network_interface" "stopped_nic" {
  name                = "stopped-nic"
  resource_group_name = azurerm_resource_group.test_rg.name
  location            = azurerm_resource_group.test_rg.location

  ip_configuration {
    name                          = "internal"
    private_ip_address_allocation = "Dynamic"
    subnet_id                     = azurerm_subnet.test_subnet.id
  }
}

# Linux VM whose name ("web-server-01") is meant to be matched by the
# include patterns used in the tests.
resource "azurerm_linux_virtual_machine" "web_server_vm" {
  name                  = "web-server-01"
  location              = azurerm_resource_group.test_rg.location
  resource_group_name   = azurerm_resource_group.test_rg.name
  size                  = "Standard_B1s"
  network_interface_ids = [azurerm_network_interface.web_nic.id]
  admin_username        = "adminuser"
  tags                  = var.tags

  # Ubuntu 18.04 LTS marketplace image.
  source_image_reference {
    publisher = "Canonical"
    offer     = "UbuntuServer"
    sku       = "18.04-LTS"
    version   = "latest"
  }

  os_disk {
    storage_account_type = "Standard_LRS"
    caching              = "ReadWrite"
    disk_size_gb         = 30
  }

  # Key-based login with the Terraform-generated key pair.
  admin_ssh_key {
    username   = "adminuser"
    public_key = tls_private_key.vm_key.public_key_openssh
  }
}

# NIC for the web server VM, attached to the shared test subnet with a
# dynamically allocated private IP.
resource "azurerm_network_interface" "web_nic" {
  name                = "web-nic"
  resource_group_name = azurerm_resource_group.test_rg.name
  location            = azurerm_resource_group.test_rg.location

  ip_configuration {
    name                          = "internal"
    private_ip_address_allocation = "Dynamic"
    subnet_id                     = azurerm_subnet.test_subnet.id
  }
}

# Linux VM whose name ("test-excluded-vm") is meant to be filtered out by the
# omit patterns used in the tests.
resource "azurerm_linux_virtual_machine" "test_excluded_vm" {
  name                  = "test-excluded-vm"
  location              = azurerm_resource_group.test_rg.location
  resource_group_name   = azurerm_resource_group.test_rg.name
  size                  = "Standard_B1s"
  network_interface_ids = [azurerm_network_interface.excluded_nic.id]
  admin_username        = "adminuser"
  tags                  = var.tags

  # Ubuntu 18.04 LTS marketplace image.
  source_image_reference {
    publisher = "Canonical"
    offer     = "UbuntuServer"
    sku       = "18.04-LTS"
    version   = "latest"
  }

  os_disk {
    storage_account_type = "Standard_LRS"
    caching              = "ReadWrite"
    disk_size_gb         = 30
  }

  # Key-based login with the Terraform-generated key pair.
  admin_ssh_key {
    username   = "adminuser"
    public_key = tls_private_key.vm_key.public_key_openssh
  }
}

# NIC for the excluded test VM, attached to the shared test subnet with a
# dynamically allocated private IP.
resource "azurerm_network_interface" "excluded_nic" {
  name                = "excluded-nic"
  resource_group_name = azurerm_resource_group.test_rg.name
  location            = azurerm_resource_group.test_rg.location

  ip_configuration {
    name                          = "internal"
    private_ip_address_allocation = "Dynamic"
    subnet_id                     = azurerm_subnet.test_subnet.id
  }
}

# SSH key pair shared by the Linux test VMs: a 4096-bit RSA key generated by
# Terraform.
resource "tls_private_key" "vm_key" {
  rsa_bits  = 4096
  algorithm = "RSA"
}

# Write the generated private key (PEM) next to this module with
# owner-only read/write permissions.
resource "local_file" "private_key" {
  filename        = "${path.module}/generated_id_rsa"
  file_permission = "0600"
  content         = tls_private_key.vm_key.private_key_pem
}

# Expose the name of the test resource group.
output "resource_group_name" {
  value = azurerm_resource_group.test_rg.name
}

# Output the names of every test VM (Linux and Windows) as a flat list.
output "vm_names" {
  value = [
    azurerm_linux_virtual_machine.test_vm.name,
    azurerm_linux_virtual_machine.high_usage_vm.name,
    azurerm_windows_virtual_machine.windows_test_vm.name,
    azurerm_linux_virtual_machine.stopped_vm.name,
    azurerm_linux_virtual_machine.web_server_vm.name,
    azurerm_linux_virtual_machine.test_excluded_vm.name
  ]
}

# VM names grouped for the tests: every VM, the Linux subset, and the Windows
# subset. List order is kept exactly as declared.
output "vm_details" {
  value = {
    all_vms = [
      azurerm_linux_virtual_machine.test_vm.name,
      azurerm_linux_virtual_machine.high_usage_vm.name,
      azurerm_windows_virtual_machine.windows_test_vm.name,
      azurerm_linux_virtual_machine.stopped_vm.name,
      azurerm_linux_virtual_machine.web_server_vm.name,
      azurerm_linux_virtual_machine.test_excluded_vm.name,
    ]

    linux_vms = [
      azurerm_linux_virtual_machine.test_vm.name,
      azurerm_linux_virtual_machine.high_usage_vm.name,
      azurerm_linux_virtual_machine.stopped_vm.name,
      azurerm_linux_virtual_machine.web_server_vm.name,
      azurerm_linux_virtual_machine.test_excluded_vm.name,
    ]

    windows_vms = [
      azurerm_windows_virtual_machine.windows_test_vm.name,
    ]
  }
}
57 changes: 54 additions & 3 deletions codebundles/azure-vm-os-health/README.md
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -28,23 +28,58 @@ This bundle provides comprehensive health checks for Azure Virtual Machines, inc
3. **Next steps scripts** (e.g., `next_steps_disk_utilization.sh`) analyze the parsed output and generate JSON issues or recommendations.
4. **SLI tasks** aggregate the results and push a health score metric.

## Key Features

### OS Filtering
- **Linux-only**: Scripts automatically filter out Windows VMs and only process Linux machines
- **OS Detection**: Uses Azure VM metadata to determine OS type before attempting commands

### Robust Error Handling
- **Graceful Failures**: Individual VM connection failures don't stop the entire script
- **Issue Creation**: Failed connections create structured issues for tracking
- **Detailed Logging**: Clear error messages for troubleshooting

### Configurable Timeouts
- **VM Status Timeout**: `VM_STATUS_TIMEOUT` (default: 10s) - Time to check VM power state
- **Command Timeout**: `COMMAND_TIMEOUT` (default: 45-60s) - Time for run-command execution
- **Overall Timeout**: `TIMEOUT_SECONDS` (default: 30s) - General script timeout

## Usage

- Configure your environment variables (resource group, subscription, thresholds, etc.).
- Optionally set `VM_INCLUDE_LIST` and/or `VM_OMIT_LIST` to control which VMs are checked:
- `VM_INCLUDE_LIST`: Comma-separated shell-style wildcards (e.g., `web-*,db-*`). Only VMs matching any pattern are included.
- `VM_OMIT_LIST`: Comma-separated shell-style wildcards. Any VM matching a pattern is excluded.
- If both are empty, all Linux VMs in the resource group are checked.
- Run the desired Robot Framework task (e.g., from `runbook.robot` or `sli.robot`).
- Review the output and health scores.

### Environment Variables

```bash
# Required
AZURE_SUBSCRIPTION_ID="your-subscription-id"
AZ_RESOURCE_GROUP="your-resource-group"

# Optional - VM filtering
VM_INCLUDE_LIST="web-*,db-*" # Only check VMs matching patterns
VM_OMIT_LIST="*-test" # Skip VMs matching patterns

# Optional - Performance tuning
MAX_PARALLEL_JOBS=5 # Number of concurrent VM checks
VM_STATUS_TIMEOUT=10 # Seconds to check VM power state
COMMAND_TIMEOUT=45 # Seconds for run-command execution
TIMEOUT_SECONDS=30 # General script timeout
```

### Example

To check only VMs starting with `web-` or `db-`, but omit any ending with `-test`:

```bash
export VM_INCLUDE_LIST="web-*,db-*"
export VM_OMIT_LIST="*-test"
export COMMAND_TIMEOUT=60 # Longer timeout for patch checks
robot runbook.robot
```

Expand All @@ -56,8 +91,24 @@ robot runbook.robot
- `next_steps_disk_utilization.sh`, `next_steps_memory_check.sh`, `next_steps_uptime.sh`, `next_steps_patch_time.sh` - Next steps/issue analysis scripts.
- `.test/` - Example and test cases (see below for Terraform usage).

## Error Handling

The scripts handle various failure scenarios gracefully:

- **Connection Failures**: When a VM can't be reached, an issue is created and the script continues
- **Authentication Issues**: Clear error messages for Azure CLI authentication problems
- **VM Power State**: Non-running VMs are skipped with appropriate status codes
- **Command Timeouts**: Long-running commands are terminated with configurable timeouts
- **Invalid Responses**: Malformed Azure responses are handled with error reporting

### Issue Types

- `ConnectionError`: Failed to connect to VM or get status
- `VMNotRunning`: VM is not in running state
- `CommandTimeout`: Run-command execution timed out
- `InvalidResponse`: Unexpected response format from Azure

## How to Use the Terraform Code

1. Prepare your secrets file (tf.secret)
Create a file named tf.secret in your Terraform directory with the following structure:
Expand Down
Empty file modified codebundles/azure-vm-os-health/next_steps_memory_check.sh
100644 → 100755
Empty file.
Empty file modified codebundles/azure-vm-os-health/next_steps_patch_time.sh
100644 → 100755
Empty file.
Empty file modified codebundles/azure-vm-os-health/next_steps_uptime.sh
100644 → 100755
Empty file.
Loading