Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Generation rules for the gcp-dms-migration-health CodeBundle (GCP platform).
apiVersion: runwhen.com/v1
kind: GenerationRules
spec:
  platform: gcp
  generationRules:
    - resourceTypes:
        - gcp_dms_migration_jobs
      matchRules:
        # The regex ".+" matches any non-empty value; it is evaluated against
        # the properties listed below.
        - type: pattern
          pattern: ".+"
          properties:
            - "project_id"
            - "dms_location"
          mode: substring
      slxs:
        - baseName: gcp-dms-migration-health
          qualifiers:
            - "project"
            - "dms_location"
          baseTemplateName: gcp-dms-migration-health
          levelOfDetail: basic
          # Items requested for the SLX: the SLX itself, an SLI, and a runbook.
          # Only the runbook names its template file explicitly.
          outputItems:
            - type: slx
            - type: sli
            - type: runbook
              templateName: gcp-dms-migration-health-taskset.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# ServiceLevelIndicator template for the gcp-dms-migration-health CodeBundle.
# Rendered with Jinja before being parsed as YAML.
apiVersion: runwhen.com/v1
kind: ServiceLevelIndicator
metadata:
  name: {{ slx_name }}
  labels:
    {% include "common-labels.yaml" %}
  annotations:
    {% include "common-annotations.yaml" %}
spec:
  displayUnitsLong: Health Score
  displayUnitsShort: score
  locations:
    - {{ default_location }}
  description: >-
    Measures DMS migration health from job list state, recent operations,
    and CDC replication lag (0 unhealthy to 1 healthy).
  codeBundle:
    # Use the supplied repository and ref when present; otherwise fall back to
    # the public rw-cli-codecollection repository on the main branch.
    {% if repo_url %}
    repoUrl: {{ repo_url }}
    {% else %}
    repoUrl: https://github.com/runwhen-contrib/rw-cli-codecollection.git
    {% endif %}
    {% if ref %}
    ref: {{ ref }}
    {% else %}
    ref: main
    {% endif %}
    pathToRobot: codebundles/gcp-dms-migration-health/sli.robot
  intervalStrategy: intermezzo
  intervalSeconds: 180
  configProvided:
    - name: GCP_PROJECT_ID
      value: "{{ match_resource.resource.project_id }}"
    # Location precedence: the matched resource's dms_location, then
    # custom.gcp_dms_location, then us-central1.
    - name: GCP_DMS_LOCATION
      value: "{{ match_resource.resource.dms_location | default(custom.gcp_dms_location | default('us-central1')) }}"
    - name: REPLICATION_LAG_SEC_THRESHOLD
      value: "{{ custom.replication_lag_sec_threshold | default('300') }}"
  secretsProvided:
  # With wb_version set, credentials come from the shared gcp-auth.yaml include
  # (skipped silently if that include is missing); otherwise a placeholder
  # workspace key is emitted.
  {% if wb_version %}
    {% include "gcp-auth.yaml" ignore missing %}
  {% else %}
    - name: gcp_credentials
      workspaceKey: AUTH DETAILS NOT FOUND
  {% endif %}
  alertConfig:
    tasks:
      persona: eager-edgar
      sessionTTL: 10m
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# ServiceLevelX template for the gcp-dms-migration-health CodeBundle.
# Rendered with Jinja before being parsed as YAML.
apiVersion: runwhen.com/v1
kind: ServiceLevelX
metadata:
  name: {{ slx_name }}
  labels:
    {% include "common-labels.yaml" %}
  annotations:
    {% include "common-annotations.yaml" %}
spec:
  imageURL: https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/gcp/cloud_sql/cloud_sql.svg
  alias: GCP DMS Migration Health for Project {{ match_resource.resource.project_id }}
  asMeasuredBy: >-
    Aggregate of DMS job health, recent operations, and CDC replication lag
    for the configured project and DMS region.
  configProvided:
    - name: SLX_PLACEHOLDER
      value: SLX_PLACEHOLDER
  owners:
    - {{ workspace.owner_email }}
  statement: >-
    DMS migration jobs should progress without failures, operations should
    complete cleanly, and CDC lag should stay within thresholds before cutover.
  additionalContext:
    # The hierarchy include is optional; qualified_name is always emitted.
    {% include "gcp-hierarchy.yaml" ignore missing %}
    qualified_name: "{{ match_resource.qualified_name }}"
  tags:
    # The tag include is optional; the static tags below are always emitted.
    {% include "gcp-tags.yaml" ignore missing %}
    - name: cloud
      value: gcp
    - name: service
      value: database_migration_service
    - name: scope
      value: project
    - name: access
      value: read-only
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# Runbook (taskset) template for the gcp-dms-migration-health CodeBundle.
# Rendered with Jinja before being parsed as YAML.
apiVersion: runwhen.com/v1
kind: Runbook
metadata:
  name: {{ slx_name }}
  labels:
    {% include "common-labels.yaml" %}
  annotations:
    {% include "common-annotations.yaml" %}
spec:
  location: {{ default_location }}
  description: >-
    Monitors Google Cloud Database Migration Service jobs for failures,
    stuck states, operation errors, and CDC replication lag.
  codeBundle:
    # Use the supplied repository and ref when present; otherwise fall back to
    # the public rw-cli-codecollection repository on the main branch.
    {% if repo_url %}
    repoUrl: {{ repo_url }}
    {% else %}
    repoUrl: https://github.com/runwhen-contrib/rw-cli-codecollection.git
    {% endif %}
    {% if ref %}
    ref: {{ ref }}
    {% else %}
    ref: main
    {% endif %}
    pathToRobot: codebundles/gcp-dms-migration-health/runbook.robot
  intervalStrategy: intermezzo
  intervalSeconds: 300
  configProvided:
    - name: GCP_PROJECT_ID
      value: "{{ match_resource.resource.project_id }}"
    # Location precedence: the matched resource's dms_location, then
    # custom.gcp_dms_location, then us-central1.
    - name: GCP_DMS_LOCATION
      value: "{{ match_resource.resource.dms_location | default(custom.gcp_dms_location | default('us-central1')) }}"
    - name: DMS_JOB_NAMES
      value: "{{ custom.dms_job_names | default('All') }}"
    - name: REPLICATION_LAG_SEC_THRESHOLD
      value: "{{ custom.replication_lag_sec_threshold | default('300') }}"
    - name: REPLICATION_LAG_BYTES_THRESHOLD
      value: "{{ custom.replication_lag_bytes_threshold | default('0') }}"
    - name: DMS_STUCK_MINUTES
      value: "{{ custom.dms_stuck_minutes | default('120') }}"
    - name: DMS_OPERATION_STUCK_MINUTES
      value: "{{ custom.dms_operation_stuck_minutes | default('45') }}"
    - name: DMS_OPERATION_LIMIT
      value: "{{ custom.dms_operation_limit | default('50') }}"
    - name: DMS_LOG_LOOKBACK
      value: "{{ custom.dms_log_lookback | default('1h') }}"
  secretsProvided:
  # Uses the same auth wiring as this bundle's SLI template: with wb_version
  # set, credentials come from the shared gcp-auth.yaml include. Otherwise the
  # previously supported custom.gcp_ops_suite_sa key is used, with an explicit
  # placeholder instead of an empty workspaceKey when that variable is unset.
  {% if wb_version %}
    {% include "gcp-auth.yaml" ignore missing %}
  {% else %}
    - name: gcp_credentials
      workspaceKey: {{ custom.gcp_ops_suite_sa | default('AUTH DETAILS NOT FOUND') }}
  {% endif %}
24 changes: 24 additions & 0 deletions codebundles/gcp-dms-migration-health/.test/Taskfile.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Taskfile for the gcp-dms-migration-health CodeBundle test directory.
version: "3"

tasks:
  # Running `task` with no arguments delegates to validate-structure.
  default:
    desc: 'Validate CodeBundle structure'
    cmds:
      - task: validate-structure

  validate-structure:
    desc: 'Verify runbook, SLI, and RunWhen templates exist'
    dir: .
    silent: false
    cmds:
      - bash validate-bundle-structure.sh

  # The two tasks below only print a message.
  clean:
    desc: 'No cloud resources to tear down for this bundle'
    cmds:
      - 'echo "No-op: GCP DMS bundle uses client-side validation only."'

  build-infra:
    desc: 'No Terraform for this bundle; use a real GCP project for integration tests'
    cmds:
      - 'echo "Skipped: provision DMS test jobs manually in GCP if needed."'
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#!/usr/bin/env bash
# Validates required CodeBundle files for gcp-dms-migration-health (CI / local).
#
# Every required path is checked relative to the bundle root (the parent of the
# directory holding this script). Each missing file is reported on stderr so a
# failing run shows exactly what is absent, and all files are checked before
# exiting rather than stopping at the first miss.
#
# Exit status: 0 when all files exist, 1 when at least one is missing.
set -euo pipefail

ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"

# Paths are relative to ROOT.
required_files=(
  "runbook.robot"
  "sli.robot"
  ".runwhen/generation-rules/gcp-dms-migration-health.yaml"
  ".runwhen/templates/gcp-dms-migration-health-slx.yaml"
  ".runwhen/templates/gcp-dms-migration-health-taskset.yaml"
  ".runwhen/templates/gcp-dms-migration-health-sli.yaml"
)

missing=0
for rel_path in "${required_files[@]}"; do
  if [ ! -f "${ROOT}/${rel_path}" ]; then
    echo "MISSING: ${ROOT}/${rel_path}" >&2
    missing=$((missing + 1))
  fi
done

if [ "${missing}" -gt 0 ]; then
  echo "gcp-dms-migration-health bundle structure INVALID: ${missing} required file(s) missing" >&2
  exit 1
fi

echo "gcp-dms-migration-health bundle structure OK"
58 changes: 58 additions & 0 deletions codebundles/gcp-dms-migration-health/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
# GCP Database Migration Service (DMS) Health

This CodeBundle monitors Google Cloud Database Migration Service (DMS) migration jobs for failed or stuck states, surfaces recent asynchronous operation failures, and evaluates CDC replication lag using Cloud Monitoring metrics (`migration_job/max_replica_sec_lag` and optionally `migration_job/max_replica_bytes_lag` on resource type `datamigration.googleapis.com/MigrationJob`). It helps confirm migrations are progressing and cutover-ready.

## Overview

- **Migration jobs**: Lists jobs with `gcloud database-migration migration-jobs list` using `--region` set from `GCP_DMS_LOCATION` (see [gcloud reference](https://cloud.google.com/sdk/gcloud/reference/database-migration/)). Flags terminal failures, long-lived transitional states, paused/cancelled jobs, and RUNNING jobs that remain outside CDC beyond a time threshold when continuous replication is expected.
- **Operations**: Lists recent DMS operations and raises issues on operation-level errors and long-running incomplete operations.
- **Replication lag**: For jobs in CDC phase, reads Cloud Monitoring time series and compares lag in seconds to `REPLICATION_LAG_SEC_THRESHOLD` and, when `REPLICATION_LAG_BYTES_THRESHOLD` is non-zero, lag in bytes to that threshold. Google documents that samples can appear in Monitoring up to about **180 seconds** after the observation window.
- **Describe**: Runs `gcloud database-migration migration-jobs describe` for jobs you name explicitly or that prior tasks flagged.
- **Logs**: Optionally correlates Cloud Logging entries for `datamigration.googleapis.com` when unhealthy jobs were flagged.

## Configuration

### Required Variables

- `GCP_PROJECT_ID`: GCP project ID that contains the DMS migration jobs.
- `GCP_DMS_LOCATION`: DMS location ID passed to `gcloud database-migration ... --region` (for example `us-central1`).

### Optional Variables

- `DMS_JOB_NAMES`: Comma-separated migration job IDs to scope listing and describe logic, or `All` for every job in the region (default: `All`).
- `REPLICATION_LAG_SEC_THRESHOLD`: Seconds; alert when `max_replica_sec_lag` exceeds this value during CDC (default: `300`).
- `REPLICATION_LAG_BYTES_THRESHOLD`: Bytes; set to `0` to disable byte-lag issues (default: `0`).
- `DMS_STUCK_MINUTES`: Minutes a job may remain in a transitional state (or RUNNING outside CDC) before a stuck-job issue is raised (default: `120`).
- `DMS_OPERATION_STUCK_MINUTES`: Minutes an incomplete operation may run before it is treated as stuck (default: `45`).
- `DMS_OPERATION_LIMIT`: Maximum operations returned by `gcloud database-migration operations list` (default: `50`).
- `DMS_LOG_LOOKBACK`: Freshness window for optional error log correlation (for example `1h` or `30m`) (default: `1h`).

### Secrets

- `gcp_credentials`: Service account JSON key. Typical roles include `roles/datamigration.viewer`, `roles/monitoring.viewer`, and `roles/logging.viewer` for list/describe, time series, and log read access.

## Tasks Overview

### List DMS Migration Jobs and Flag Unhealthy States for `${GCP_PROJECT_ID}`

Builds a summary table, writes structured issues for failed/cancelled/paused jobs, stuck transitional states, and delayed progression to CDC, and records flagged job IDs for follow-on tasks.

### List Recent DMS Operations and Flag Failures for `${GCP_PROJECT_ID}`

Surfaces failed operations and operations that stay incomplete beyond `DMS_OPERATION_STUCK_MINUTES`, and appends related job IDs to the shared flag list when identifiers appear in operation metadata.

### Report DMS Replication Lag from Cloud Monitoring for `${GCP_PROJECT_ID}`

Evaluates lag alerting only for jobs in the CDC phase. Skips lag evaluation when no jobs are in CDC (for example during full dump), which is expected per Google guidance.

### Summarize DMS Migration Job Details for Flagged Jobs in `${GCP_PROJECT_ID}`

Describes targets from `DMS_JOB_NAMES` when not `All`, otherwise describes jobs accumulated in the flag file from earlier tasks.

### Optional Error Log Correlation for DMS in `${GCP_PROJECT_ID}`

Runs a bounded Cloud Logging query when the flag file is non-empty; otherwise no-ops.

## SLI

`sli.robot` publishes a 0–1 score as the mean of binary dimensions: healthy job list (no FAILED/CANCELLED), operations without errors, and replication lag under threshold when CDC jobs exist.
82 changes: 82 additions & 0 deletions codebundles/gcp-dms-migration-health/describe-migration-jobs.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
#!/usr/bin/env bash
set -euo pipefail
set -x
# -----------------------------------------------------------------------------
# Describes selected migration jobs for diagnostics. Targets come from DMS_JOB_NAMES
# or dms_flagged_jobs.txt (when DMS_JOB_NAMES is All).
# Env (required): GCP_PROJECT_ID, GCP_DMS_LOCATION, GOOGLE_APPLICATION_CREDENTIALS
# Env (optional): DMS_JOB_NAMES (comma-separated job IDs, default: All)
# Output: describe_migration_jobs_issues.json, human summary on stdout
# -----------------------------------------------------------------------------

: "${GCP_PROJECT_ID:?Must set GCP_PROJECT_ID}"
: "${GCP_DMS_LOCATION:?Must set GCP_DMS_LOCATION}"
# Fail with a clear message instead of a bare "unbound variable" under `set -u`.
: "${GOOGLE_APPLICATION_CREDENTIALS:?Must set GOOGLE_APPLICATION_CREDENTIALS}"

OUTPUT_FILE="describe_migration_jobs_issues.json"
FLAG_FILE="dms_flagged_jobs.txt"
DMS_JOB_NAMES="${DMS_JOB_NAMES:-All}"

issues_json='[]'

# Capture gcloud stderr in a private temp file rather than a fixed ./err.log,
# so other scripts using the same working directory cannot clobber it, and
# remove it on every exit path.
ERR_LOG="$(mktemp)"
trap 'rm -f "${ERR_LOG}"' EXIT

# Prints $1 with leading and trailing whitespace removed.
trim() {
  local value="$1"
  value="${value#"${value%%[![:space:]]*}"}"
  value="${value%"${value##*[![:space:]]}"}"
  printf '%s' "$value"
}

# Appends one issue object to the global issues_json array.
# Args: title, details, severity (numeric string), next_steps
add_issue() {
  issues_json=$(echo "$issues_json" | jq \
    --arg title "$1" \
    --arg details "$2" \
    --arg severity "$3" \
    --arg next_steps "$4" \
    '. += [{"title": $title, "details": $details, "severity": ($severity | tonumber), "next_steps": $next_steps}]')
}

gcloud auth activate-service-account --key-file="${GOOGLE_APPLICATION_CREDENTIALS}"

declare -a TARGETS=()

if [ "${DMS_JOB_NAMES}" != "All" ]; then
  # Explicit comma-separated list: trim each entry and drop empty ones.
  IFS=',' read -ra parts <<<"${DMS_JOB_NAMES}"
  for p in "${parts[@]}"; do
    p="$(trim "$p")"
    if [ -n "$p" ]; then
      TARGETS+=("$p")
    fi
  done
elif [ -f "$FLAG_FILE" ]; then
  # "All": describe only the de-duplicated job IDs found in the flag file.
  while IFS= read -r line; do
    line="$(trim "$line")"
    if [ -n "$line" ]; then
      TARGETS+=("$line")
    fi
  done < <(sort -u "$FLAG_FILE")
fi

if [ ${#TARGETS[@]} -eq 0 ]; then
  echo "No migration job IDs to describe (set DMS_JOB_NAMES or run prior health tasks to populate ${FLAG_FILE})."
  echo '[]' >"$OUTPUT_FILE"
  exit 0
fi

summary="=== DMS migration job describe (${GCP_DMS_LOCATION}) ==="$'\n'

for jid in "${TARGETS[@]}"; do
  if ! desc=$(gcloud database-migration migration-jobs describe "${jid}" \
    --project="${GCP_PROJECT_ID}" \
    --region="${GCP_DMS_LOCATION}" \
    --format=json 2>"${ERR_LOG}"); then
    # A describe failure becomes an issue; the remaining jobs are still processed.
    err_msg=$(cat "${ERR_LOG}" || true)
    add_issue \
      "Cannot describe DMS migration job \`${jid}\`" \
      "describe failed: ${err_msg}" \
      "3" \
      "Verify job ID, region, and IAM datamigration.migrationJobs.get."
    continue
  fi

  summary+=$'---\n'"${jid}"$'\n'
  # Fall back to the raw describe output if jq cannot parse it.
  summary+=$(echo "$desc" | jq -r '"state: \(.state // "n/a") phase: \(.phase // "n/a")"' 2>/dev/null || echo "$desc")$'\n'

  # `.error // empty` yields nothing when the job carries no error block.
  err_block=$(echo "$desc" | jq -c '.error // empty' 2>/dev/null || echo "{}")
  if [ "$err_block" != "{}" ] && [ -n "$err_block" ] && [ "$err_block" != "null" ]; then
    add_issue \
      "DMS migration job \`${jid}\` describe shows an error block" \
      "$(echo "$desc" | jq -c .)" \
      "4" \
      "Resolve the reported error: check connectivity, credentials, and engine-specific prerequisites."
  fi
done

echo "$issues_json" >"$OUTPUT_FILE"
echo "${summary}"
echo "Wrote ${OUTPUT_FILE}"
68 changes: 68 additions & 0 deletions codebundles/gcp-dms-migration-health/fetch-dms-error-logs.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
#!/usr/bin/env bash
set -euo pipefail
set -x
# -----------------------------------------------------------------------------
# Bounded Cloud Logging query for DMS / datamigration when unhealthy jobs exist.
# No-op when dms_flagged_jobs.txt is empty.
# Env (required): GCP_PROJECT_ID, GCP_DMS_LOCATION, GOOGLE_APPLICATION_CREDENTIALS
# Env (optional): DMS_LOG_LOOKBACK (default: 1h)
# Output: fetch_dms_error_logs_issues.json (usually empty; issues if critical errors found)
# -----------------------------------------------------------------------------

: "${GCP_PROJECT_ID:?Must set GCP_PROJECT_ID}"
: "${GCP_DMS_LOCATION:?Must set GCP_DMS_LOCATION}"
# Fail with a clear message instead of a bare "unbound variable" under `set -u`.
: "${GOOGLE_APPLICATION_CREDENTIALS:?Must set GOOGLE_APPLICATION_CREDENTIALS}"

OUTPUT_FILE="fetch_dms_error_logs_issues.json"
FLAG_FILE="dms_flagged_jobs.txt"
DMS_LOG_LOOKBACK="${DMS_LOG_LOOKBACK:-1h}"

# Maximum entries requested from Cloud Logging, and maximum rows printed in the
# stdout report. Both are echoed in the report header so it stays accurate.
LOG_QUERY_LIMIT=50
LOG_TABLE_ROWS=20

issues_json='[]'

# Capture gcloud stderr in a private temp file rather than a fixed ./err.log,
# so other scripts using the same working directory cannot clobber it, and
# remove it on every exit path.
ERR_LOG="$(mktemp)"
trap 'rm -f "${ERR_LOG}"' EXIT

gcloud auth activate-service-account --key-file="${GOOGLE_APPLICATION_CREDENTIALS}"

if [ ! -s "$FLAG_FILE" ]; then
  echo "No flagged DMS jobs; skipping error log correlation (healthy or not yet evaluated)."
  echo '[]' >"$OUTPUT_FILE"
  exit 0
fi

# Broad DMS-related errors in project (read-only)
filter='(protoPayload.serviceName="datamigration.googleapis.com" OR resource.type="datamigration.googleapis.com/MigrationJob") AND severity>=ERROR'

if ! logs_out=$(gcloud logging read "${filter}" \
  --project="${GCP_PROJECT_ID}" \
  --freshness="${DMS_LOG_LOOKBACK}" \
  --limit="${LOG_QUERY_LIMIT}" \
  --format=json 2>"${ERR_LOG}"); then
  # A query failure is reported as an issue; the script still exits 0.
  err_msg=$(cat "${ERR_LOG}" || true)
  issues_json=$(echo "$issues_json" | jq \
    --arg title "Cannot query Cloud Logging for DMS errors" \
    --arg details "gcloud logging read failed: ${err_msg}" \
    --arg severity "2" \
    --arg next_steps "Grant logging.logEntries.list (roles/logging.viewer) and retry." \
    '. += [{"title": $title, "details": $details, "severity": ($severity | tonumber), "next_steps": $next_steps}]')
  echo "$issues_json" >"$OUTPUT_FILE"
  exit 0
fi

count=$(echo "$logs_out" | jq 'length' 2>/dev/null || echo "0")
# Treat empty or unparsable output as zero entries.
if ! [[ "${count}" =~ ^[0-9]+$ ]]; then
  count=0
fi

if [ "${count}" -gt 0 ]; then
  issues_json=$(echo "$issues_json" | jq \
    --arg title "Recent DMS-related ERROR logs found in \`${GCP_PROJECT_ID}\`" \
    --arg details "Count=${count} (lookback ${DMS_LOG_LOOKBACK}). Sample entries: $(echo "$logs_out" | jq -c '.[0:3]')" \
    --arg severity "2" \
    --arg next_steps "Triage entries below; correlate with flagged migration jobs and operations." \
    '. += [{"title": $title, "details": $details, "severity": ($severity | tonumber), "next_steps": $next_steps}]')
fi

echo "$issues_json" >"$OUTPUT_FILE"

# Render the report from the entries already fetched instead of issuing a
# second query with a different limit, so the rows always agree with the count
# recorded in the issue above.
echo "=== DMS-related error logs (freshness ${DMS_LOG_LOOKBACK}, query limit ${LOG_QUERY_LIMIT}, showing up to ${LOG_TABLE_ROWS} of ${count} fetched) ==="
if [ "${count}" -gt 0 ]; then
  printf 'TIMESTAMP\tSEVERITY\tLOG_NAME\n'
  echo "$logs_out" | jq -r --argjson rows "${LOG_TABLE_ROWS}" \
    '.[0:$rows][] | [(.timestamp // "n/a"), (.severity // "n/a"), (.logName // "n/a")] | @tsv' || true
else
  echo "(no matching log entries)"
fi

echo "Wrote ${OUTPUT_FILE}"
Loading